Optimize common vector shuffle patterns for ARM32.

Use VDUP for replicating a single element across a vector.
Use VZIP for interleaving the lower halves of two vectors.
Use VMOV Dd, Dm sequences for rearranging the halves of quadword vectors.
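
For example (illustrative only; actual register assignment varies), shuffle
index patterns such as the following now lower to a single instruction or a
short VMOV sequence instead of a scalarized insert/extract loop:

  v4i32 <0, 0, 0, 0>  ->  vdup.32  (replicate element 0)
  v4i32 <0, 4, 1, 5>  ->  vzip     (interleave the lower halves)
  v4f32 <0, 1, 4, 5>  ->  vmovlh   (low half of each source)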

Bug b/67106219

Change-Id: I0de1457454c1db6d467bf870288b7af7cb59ac09
Reviewed-on: https://chromium-review.googlesource.com/695004
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
Reviewed-on: https://swiftshader-review.googlesource.com/12968
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/third_party/subzero/src/IceAssemblerARM32.cpp b/third_party/subzero/src/IceAssemblerARM32.cpp
index 502668c..4b1fcb9 100644
--- a/third_party/subzero/src/IceAssemblerARM32.cpp
+++ b/third_party/subzero/src/IceAssemblerARM32.cpp
@@ -3418,6 +3418,97 @@
   emitSIMDBase(VpaddOpcode, Dd, Dd, Dd + 1, UseQRegs, IsFloatTy);
 }
 
+void AssemblerARM32::vdup(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
+                          IValueT Idx) {
+  // VDUP (scalar) - ARMv7-A/R section A8.6.302, encoding A1:
+  //   VDUP<c>.<size> <Qd>, <Dm[x]>
+  //
+  // 111100111D11iiiidddd11000QM0mmmm where Ddddd=<Qd>, Mmmmm=<Dm>, and
+  // iiii=imm4 encodes <size> and [x].
+  constexpr const char *Vdup = "vdup";
+
+  const IValueT VdupOpcode = B25 | B24 | B23 | B21 | B20 | B11 | B10;
+
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vdup));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vdup));
+
+  constexpr bool UseQRegs = true;
+  constexpr bool IsFloatTy = false;
+
+  IValueT Imm4 = 0;
+  bool Lower = true;
+  switch (ElmtTy) {
+  case IceType_i8:
+    assert(Idx < 16);
+    Lower = Idx < 8;
+    Imm4 = 1 | ((Idx & 0x7) << 1);
+    break;
+  case IceType_i16:
+    assert(Idx < 8);
+    Lower = Idx < 4;
+    Imm4 = 2 | ((Idx & 0x3) << 2);
+    break;
+  case IceType_i32:
+  case IceType_f32:
+    assert(Idx < 4);
+    Lower = Idx < 2;
+    Imm4 = 4 | ((Idx & 0x1) << 3);
+    break;
+  default:
+    assert(false && "vdup only supports 8, 16, and 32-bit elements");
+    break;
+  }
+
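+  // Lanes in the upper half of <Qn> live in the odd D register (Dn + 1);
+  // Imm4 encodes the element size together with the lane index within that
+  // D register.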
+  emitSIMDBase(VdupOpcode, Dd, Imm4, Dn + (Lower ? 0 : 1), UseQRegs, IsFloatTy);
+}
+
+void AssemblerARM32::vzip(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
+                          const Operand *OpQm) {
+  // Pseudo-instruction which interleaves the elements of the lower halves of
+  // two quadword registers.
+
+  // Vzip - ARMv7-A/R section A8.6.410, encoding A1:
+  //   VZIP<c>.<size> <Dd>, <Dm>
+  //
+  // 111100111D11ss10dddd00011QM0mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and
+  // ss=<size>
+  assert(ElmtTy != IceType_i64 && "vzip on i64 vector not allowed");
+
+  constexpr const char *Vzip = "vzip";
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vzip));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vzip));
+  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vzip));
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloatTy = false;
+
+  // VMOV Dd, Dm
+  // 111100100D10mmmmdddd0001MQM1mmmm
+  constexpr IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+  // Copy lower half of second source to upper half of destination.
+  emitSIMDBase(VmovOpcode, Dd + 1, Dm, Dm, UseQRegs, IsFloatTy);
+
+  // Copy lower half of first source to lower half of destination.
+  if (Dd != Dn)
+    emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloatTy);
+
+  constexpr IValueT ElmtShift = 18;
+  const IValueT ElmtSize = encodeElmtType(ElmtTy);
+  assert(Utils::IsUint(2, ElmtSize));
+
+  if (ElmtTy != IceType_i32 && ElmtTy != IceType_f32) {
+    constexpr IValueT VzipOpcode = B25 | B24 | B23 | B21 | B20 | B17 | B8 | B7;
+    // Zip the lower and upper half of destination.
+    emitSIMDBase(VzipOpcode | (ElmtSize << ElmtShift), Dd, 0, Dd + 1, UseQRegs,
+                 IsFloatTy);
+  } else {
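+    // With 32-bit elements each D register holds only two lanes, so the
+    // interleave reduces to a 2x2 transpose; VZIP.32 on doubleword registers
+    // is not a valid encoding, so emit VTRN.32, which performs the same
+    // operation here.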
+    constexpr IValueT VtrnOpcode = B25 | B24 | B23 | B21 | B20 | B17 | B7;
+    emitSIMDBase(VtrnOpcode | (ElmtSize << ElmtShift), Dd, 0, Dd + 1, UseQRegs,
+                 IsFloatTy);
+  }
+}
+
 void AssemblerARM32::vmulqf(const Operand *OpQd, const Operand *OpQn,
                             const Operand *OpQm) {
   // VMUL (floating-point) - ARM section A8.8.351, encoding A1:
@@ -3448,6 +3539,110 @@
                mapQRegToDReg(Qm), UseQRegs, IsFloat);
 }
 
+void AssemblerARM32::vmovlq(const Operand *OpQd, const Operand *OpQn,
+                            const Operand *OpQm) {
+  // Pseudo-instruction to copy the first source operand and insert the lower
+  // half of the second operand into the lower half of the destination.
+
+  // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+  //   VMOV<c> <Dd>, <Dm>
+  //
+  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and Q=0.
+
+  constexpr const char *Vmov = "vmov";
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloat = false;
+
+  const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+  if (Dd != Dm)
+    emitSIMDBase(VmovOpcode, Dd, Dm, Dm, UseQRegs, IsFloat);
+  if (Dd + 1 != Dn + 1)
+    emitSIMDBase(VmovOpcode, Dd + 1, Dn + 1, Dn + 1, UseQRegs, IsFloat);
+}
+
+void AssemblerARM32::vmovhq(const Operand *OpQd, const Operand *OpQn,
+                            const Operand *OpQm) {
+  // Pseudo-instruction to copy the first source operand and insert the high
+  // half of the second operand into the high half of the destination.
+
+  // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+  //   VMOV<c> <Dd>, <Dm>
+  //
+  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and Q=0.
+
+  constexpr const char *Vmov = "vmov";
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloat = false;
+
+  const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+  if (Dd != Dn)
+    emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloat);
+  if (Dd + 1 != Dm + 1)
+    emitSIMDBase(VmovOpcode, Dd + 1, Dm + 1, Dm + 1, UseQRegs, IsFloat);
+}
+
+void AssemblerARM32::vmovhlq(const Operand *OpQd, const Operand *OpQn,
+                             const Operand *OpQm) {
+  // Pseudo-instruction to copy the first source operand and insert the high
+  // half of the second operand into the lower half of the destination.
+
+  // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+  //   VMOV<c> <Dd>, <Dm>
+  //
+  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and Q=0.
+
+  constexpr const char *Vmov = "vmov";
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloat = false;
+
+  const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+  if (Dd != Dm + 1)
+    emitSIMDBase(VmovOpcode, Dd, Dm + 1, Dm + 1, UseQRegs, IsFloat);
+  if (Dd + 1 != Dn + 1)
+    emitSIMDBase(VmovOpcode, Dd + 1, Dn + 1, Dn + 1, UseQRegs, IsFloat);
+}
+
+void AssemblerARM32::vmovlhq(const Operand *OpQd, const Operand *OpQn,
+                             const Operand *OpQm) {
+  // Pseudo-instruction to copy the first source operand and insert the lower
+  // half of the second operand into the high half of the destination.
+
+  // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+  //   VMOV<c> <Dd>, <Dm>
+  //
+  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and Q=0.
+
+  constexpr const char *Vmov = "vmov";
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloat = false;
+
+  const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+  if (Dd + 1 != Dm)
+    emitSIMDBase(VmovOpcode, Dd + 1, Dm, Dm, UseQRegs, IsFloat);
+  if (Dd != Dn)
+    emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloat);
+}
+
 void AssemblerARM32::vnegqs(Type ElmtTy, const Operand *OpQd,
                             const Operand *OpQm) {
   // VNEG - ARM section A8.8.355, encoding A1:
diff --git a/third_party/subzero/src/IceAssemblerARM32.h b/third_party/subzero/src/IceAssemblerARM32.h
index 1f80043..43c3f56 100644
--- a/third_party/subzero/src/IceAssemblerARM32.h
+++ b/third_party/subzero/src/IceAssemblerARM32.h
@@ -546,6 +546,13 @@
   void vmlap(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
              const Operand *OpQm);
 
+  // Vector element replication.
+  void vdup(Type ElmtTy, const Operand *OpQd, const Operand *OpQn, IValueT Idx);
+
+  // Interleave the lower halves of two vectors.
+  void vzip(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
+            const Operand *OpQm);
+
   // Float vector multiply.
   void vmulqf(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
 
@@ -554,6 +561,11 @@
 
   void vmvnq(const Operand *OpQd, const Operand *OpQm);
 
+  void vmovlq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
+  void vmovhq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
+  void vmovhlq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
+  void vmovlhq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
+
   void vnegqs(const Operand *OpQd, const Operand *OpQm);
 
   void vnegqs(Type ElmtTy, const Operand *OpQd, const Operand *OpQm);
diff --git a/third_party/subzero/src/IceInst.h b/third_party/subzero/src/IceInst.h
index 889ead5..187c16d 100644
--- a/third_party/subzero/src/IceInst.h
+++ b/third_party/subzero/src/IceInst.h
@@ -997,35 +997,45 @@
     return Indexes[Pos];
   }
 
-  inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3,
-                         int32_t i4, int32_t i5, int32_t i6, int32_t i7) const {
+  int32_t getIndexValue(SizeT Pos) const { return getIndex(Pos)->getValue(); }
+
+  bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3) const {
+    static constexpr SizeT ExpectedNumElements = 4;
+    assert(ExpectedNumElements == getNumIndexes());
+    (void)ExpectedNumElements;
+
+    return getIndexValue(0) == i0 && getIndexValue(1) == i1 &&
+           getIndexValue(2) == i2 && getIndexValue(3) == i3;
+  }
+
+  bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4,
+                  int32_t i5, int32_t i6, int32_t i7) const {
     static constexpr SizeT ExpectedNumElements = 8;
     assert(ExpectedNumElements == getNumIndexes());
     (void)ExpectedNumElements;
 
-    return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 &&
-           getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 &&
-           getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 &&
-           getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7;
+    return getIndexValue(0) == i0 && getIndexValue(1) == i1 &&
+           getIndexValue(2) == i2 && getIndexValue(3) == i3 &&
+           getIndexValue(4) == i4 && getIndexValue(5) == i5 &&
+           getIndexValue(6) == i6 && getIndexValue(7) == i7;
   }
 
-  inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3,
-                         int32_t i4, int32_t i5, int32_t i6, int32_t i7,
-                         int32_t i8, int32_t i9, int32_t i10, int32_t i11,
-                         int32_t i12, int32_t i13, int32_t i14,
-                         int32_t i15) const {
+  bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4,
+                  int32_t i5, int32_t i6, int32_t i7, int32_t i8, int32_t i9,
+                  int32_t i10, int32_t i11, int32_t i12, int32_t i13,
+                  int32_t i14, int32_t i15) const {
     static constexpr SizeT ExpectedNumElements = 16;
     assert(ExpectedNumElements == getNumIndexes());
     (void)ExpectedNumElements;
 
-    return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 &&
-           getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 &&
-           getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 &&
-           getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7 &&
-           getIndex(8)->getValue() == i8 && getIndex(9)->getValue() == i9 &&
-           getIndex(10)->getValue() == i10 && getIndex(11)->getValue() == i11 &&
-           getIndex(12)->getValue() == i12 && getIndex(13)->getValue() == i13 &&
-           getIndex(14)->getValue() == i14 && getIndex(15)->getValue() == i15;
+    return getIndexValue(0) == i0 && getIndexValue(1) == i1 &&
+           getIndexValue(2) == i2 && getIndexValue(3) == i3 &&
+           getIndexValue(4) == i4 && getIndexValue(5) == i5 &&
+           getIndexValue(6) == i6 && getIndexValue(7) == i7 &&
+           getIndexValue(8) == i8 && getIndexValue(9) == i9 &&
+           getIndexValue(10) == i10 && getIndexValue(11) == i11 &&
+           getIndexValue(12) == i12 && getIndexValue(13) == i13 &&
+           getIndexValue(14) == i14 && getIndexValue(15) == i15;
   }
 
   bool isMemoryWrite() const override { return false; }
diff --git a/third_party/subzero/src/IceInstARM32.cpp b/third_party/subzero/src/IceInstARM32.cpp
index 2f12b85..646730f 100644
--- a/third_party/subzero/src/IceInstARM32.cpp
+++ b/third_party/subzero/src/IceInstARM32.cpp
@@ -903,6 +903,82 @@
   }
 }
 
+template <> void InstARM32Vmovl::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Variable *Dest = getDest();
+  switch (Dest->getType()) {
+  default:
+    llvm::report_fatal_error("Vmovlq not defined on type " +
+                             typeStdString(Dest->getType()));
+  case IceType_v4i1:
+  case IceType_v8i1:
+  case IceType_v16i1:
+  case IceType_v16i8:
+  case IceType_v8i16:
+  case IceType_v4i32:
+  case IceType_v4f32: {
+    Asm->vmovlq(Dest, getSrc(0), getSrc(1));
+  } break;
+  }
+}
+
+template <> void InstARM32Vmovh::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Variable *Dest = getDest();
+  switch (Dest->getType()) {
+  default:
+    llvm::report_fatal_error("Vmovhq not defined on type " +
+                             typeStdString(Dest->getType()));
+  case IceType_v4i1:
+  case IceType_v8i1:
+  case IceType_v16i1:
+  case IceType_v16i8:
+  case IceType_v8i16:
+  case IceType_v4i32:
+  case IceType_v4f32: {
+    Asm->vmovhq(Dest, getSrc(0), getSrc(1));
+  } break;
+  }
+}
+
+template <> void InstARM32Vmovhl::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Variable *Dest = getDest();
+  switch (Dest->getType()) {
+  default:
+    llvm::report_fatal_error("Vmovhlq not defined on type " +
+                             typeStdString(Dest->getType()));
+  case IceType_v4i1:
+  case IceType_v8i1:
+  case IceType_v16i1:
+  case IceType_v16i8:
+  case IceType_v8i16:
+  case IceType_v4i32:
+  case IceType_v4f32: {
+    Asm->vmovhlq(Dest, getSrc(0), getSrc(1));
+  } break;
+  }
+}
+
+template <> void InstARM32Vmovlh::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Variable *Dest = getDest();
+  switch (Dest->getType()) {
+  default:
+    llvm::report_fatal_error("Vmovlhq not defined on type " +
+                             typeStdString(Dest->getType()));
+  case IceType_v4i1:
+  case IceType_v8i1:
+  case IceType_v16i1:
+  case IceType_v16i8:
+  case IceType_v8i16:
+  case IceType_v4i32:
+  case IceType_v4f32: {
+    Asm->vmovlhq(Dest, getSrc(0), getSrc(1));
+  } break;
+  }
+}
+
 template <> void InstARM32Vneg::emitIAS(const Cfg *Func) const {
   auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
   const Variable *Dest = getDest();
@@ -1168,6 +1244,15 @@
   assert(!Asm->needsTextFixup());
 }
 
+template <> void InstARM32Vzip::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Operand *Src0 = getSrc(0);
+  const Operand *Src1 = getSrc(1);
+  Type DestTy = Dest->getType();
+  Asm->vzip(typeElementType(DestTy), Dest, Src0, Src1);
+  assert(!Asm->needsTextFixup());
+}
+
 template <> void InstARM32Vmul::emitIAS(const Cfg *Func) const {
   auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
   const Variable *Dest = getDest();
@@ -1425,6 +1510,12 @@
   this->Size = Size;
 }
 
+InstARM32Vdup::InstARM32Vdup(Cfg *Func, Variable *Dest, Variable *Src,
+                             IValueT Idx)
+    : InstARM32Pred(Func, InstARM32::Vdup, 1, Dest, CondARM32::AL), Idx(Idx) {
+  addSource(Src);
+}
+
 InstARM32Trap::InstARM32Trap(Cfg *Func)
     : InstARM32(Func, InstARM32::Trap, 0, nullptr) {}
 
@@ -1775,6 +1866,10 @@
 template <> const char *InstARM32Vmls::Opcode = "vmls";
 template <> const char *InstARM32Vmul::Opcode = "vmul";
 template <> const char *InstARM32Vmvn::Opcode = "vmvn";
+template <> const char *InstARM32Vmovl::Opcode = "vmovl";
+template <> const char *InstARM32Vmovh::Opcode = "vmovh";
+template <> const char *InstARM32Vmovhl::Opcode = "vmovhl";
+template <> const char *InstARM32Vmovlh::Opcode = "vmovlh";
 template <> const char *InstARM32Vorr::Opcode = "vorr";
 template <> const char *InstARM32UnaryopFP<InstARM32::Vneg>::Opcode = "vneg";
 template <> const char *InstARM32ThreeAddrFP<InstARM32::Vshl>::Opcode = "vshl";
@@ -1790,6 +1885,7 @@
 const char *InstARM32ThreeAddrFP<InstARM32::Vmulh>::Opcode = "vmulh";
 template <>
 const char *InstARM32ThreeAddrFP<InstARM32::Vmlap>::Opcode = "vmlap";
+template <> const char *InstARM32ThreeAddrFP<InstARM32::Vzip>::Opcode = "vzip";
 // Four-addr ops
 template <> const char *InstARM32Mla::Opcode = "mla";
 template <> const char *InstARM32Mls::Opcode = "mls";
@@ -2805,6 +2901,43 @@
   getSrc(0)->dump(Func);
 }
 
+void InstARM32Vdup::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  Type DestTy = getDest()->getType();
+  const char *Opcode = "vdup";
+  Str << "\t" << Opcode;
+  Str << getPredicate() << "." << getWidthString(DestTy)
+      << getVecElmtBitsize(DestTy);
+  Str << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+  Str << ", " << Idx;
+}
+
+void InstARM32Vdup::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 1);
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Operand *Dest = getDest();
+  const Operand *Src = getSrc(0);
+  Type DestTy = Dest->getType();
+  Asm->vdup(typeElementType(DestTy), Dest, Src, Idx);
+}
+
+void InstARM32Vdup::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpDest(Func);
+  Str << " = ";
+  dumpOpcodePred(Str, "vdup", getDest()->getType());
+  Str << " ";
+  dumpSources(Func);
+  Str << ", " << Idx;
+}
+
 void InstARM32Trap::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
@@ -3386,6 +3519,7 @@
 template class InstARM32LoadBase<InstARM32::Ldrex>;
 template class InstARM32LoadBase<InstARM32::Vldr1d>;
 template class InstARM32LoadBase<InstARM32::Vldr1q>;
+template class InstARM32ThreeAddrFP<InstARM32::Vzip>;
 template class InstARM32TwoAddrGPR<InstARM32::Movt>;
 
 template class InstARM32UnaryopGPR<InstARM32::Movw, false>;
diff --git a/third_party/subzero/src/IceInstARM32.h b/third_party/subzero/src/IceInstARM32.h
index 593d96d..e31aabc 100644
--- a/third_party/subzero/src/IceInstARM32.h
+++ b/third_party/subzero/src/IceInstARM32.h
@@ -434,12 +434,17 @@
     Vcmp,
     Vcvt,
     Vdiv,
+    Vdup,
     Veor,
     Vldr1d,
     Vldr1q,
     Vmla,
     Vmlap,
     Vmls,
+    Vmovl,
+    Vmovh,
+    Vmovhl,
+    Vmovlh,
     Vmrs,
     Vmul,
     Vmulh,
@@ -453,7 +458,8 @@
     Vshr,
     Vsqrt,
     Vstr1,
-    Vsub
+    Vsub,
+    Vzip
   };
 
   static constexpr size_t InstSize = sizeof(uint32_t);
@@ -1020,6 +1026,10 @@
 using InstARM32Veor = InstARM32ThreeAddrFP<InstARM32::Veor>;
 using InstARM32Vmla = InstARM32FourAddrFP<InstARM32::Vmla>;
 using InstARM32Vmls = InstARM32FourAddrFP<InstARM32::Vmls>;
+using InstARM32Vmovl = InstARM32ThreeAddrFP<InstARM32::Vmovl>;
+using InstARM32Vmovh = InstARM32ThreeAddrFP<InstARM32::Vmovh>;
+using InstARM32Vmovhl = InstARM32ThreeAddrFP<InstARM32::Vmovhl>;
+using InstARM32Vmovlh = InstARM32ThreeAddrFP<InstARM32::Vmovlh>;
 using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>;
 using InstARM32Vmvn = InstARM32UnaryopFP<InstARM32::Vmvn>;
 using InstARM32Vneg = InstARM32UnaryopSignAwareFP<InstARM32::Vneg>;
@@ -1036,6 +1046,7 @@
 using InstARM32Ldrex = InstARM32LoadBase<InstARM32::Ldrex>;
 using InstARM32Vldr1d = InstARM32LoadBase<InstARM32::Vldr1d>;
 using InstARM32Vldr1q = InstARM32LoadBase<InstARM32::Vldr1q>;
+using InstARM32Vzip = InstARM32ThreeAddrFP<InstARM32::Vzip>;
 /// MovT leaves the bottom bits alone so dest is also a source. This helps
 /// indicate that a previous MovW setting dest is not dead code.
 using InstARM32Movt = InstARM32TwoAddrGPR<InstARM32::Movt>;
@@ -1374,6 +1385,30 @@
   SizeT Size;
 };
 
+/// Vector element duplication/replication instruction.
+class InstARM32Vdup final : public InstARM32Pred {
+  InstARM32Vdup() = delete;
+  InstARM32Vdup(const InstARM32Vdup &) = delete;
+  InstARM32Vdup &operator=(const InstARM32Vdup &) = delete;
+
+public:
+  /// Value must be a register.
+  static InstARM32Vdup *create(Cfg *Func, Variable *Dest, Variable *Src,
+                               IValueT Idx) {
+    return new (Func->allocate<InstARM32Vdup>())
+        InstARM32Vdup(Func, Dest, Src, Idx);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) { return isClassof(Instr, Vdup); }
+
+private:
+  InstARM32Vdup(Cfg *Func, Variable *Dest, Variable *Src, IValueT Idx);
+
+  const IValueT Idx;
+};
+
 class InstARM32Trap : public InstARM32 {
   InstARM32Trap() = delete;
   InstARM32Trap(const InstARM32Trap &) = delete;
diff --git a/third_party/subzero/src/IceTargetLoweringARM32.cpp b/third_party/subzero/src/IceTargetLoweringARM32.cpp
index 9856f7a..d820bca 100644
--- a/third_party/subzero/src/IceTargetLoweringARM32.cpp
+++ b/third_party/subzero/src/IceTargetLoweringARM32.cpp
@@ -5357,7 +5357,7 @@
       Func->setError("Unexpected size for LoadSubVector");
       return;
     }
-    _mov(Dest, T); // FIXME: necessary?
+    _mov(Dest, T);
     return;
   }
   case Intrinsics::StoreSubVector: {
@@ -5975,8 +5975,121 @@
   const Type DestTy = Dest->getType();
 
   auto *T = makeReg(DestTy);
+  auto *Src0 = Instr->getSrc(0);
+  auto *Src1 = Instr->getSrc(1);
+  const SizeT NumElements = typeNumElements(DestTy);
+  const Type ElementType = typeElementType(DestTy);
+
+  bool Replicate = true;
+  for (SizeT I = 1; Replicate && I < Instr->getNumIndexes(); ++I) {
+    if (Instr->getIndexValue(I) != Instr->getIndexValue(0)) {
+      Replicate = false;
+    }
+  }
+
+  if (Replicate) {
+    Variable *Src0Var = legalizeToReg(Src0);
+    _vdup(T, Src0Var, Instr->getIndexValue(0));
+    _mov(Dest, T);
+    return;
+  }
 
   switch (DestTy) {
+  case IceType_v8i1:
+  case IceType_v8i16: {
+    static constexpr SizeT ExpectedNumElements = 8;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
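+    // The first pattern interleaves the low half of Src0 with itself; the
+    // second interleaves the low halves of Src0 and Src1.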
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vzip(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vzip(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+
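+    // Even-numbered elements of Src0, repeated in both halves of the result.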
+    if (Instr->indexesAre(0, 2, 4, 6, 0, 2, 4, 6)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vqmovn2(T, Src0R, Src0R, false, false);
+      _mov(Dest, T);
+      return;
+    }
+  } break;
+  case IceType_v16i1:
+  case IceType_v16i8: {
+    static constexpr SizeT ExpectedNumElements = 16;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vzip(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
+                          23)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vzip(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+  } break;
+  case IceType_v4i1:
+  case IceType_v4i32:
+  case IceType_v4f32: {
+    static constexpr SizeT ExpectedNumElements = 4;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vzip(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 4, 1, 5)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vzip(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+
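+    // Low half of Src0 followed by the low half of Src1.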
+    if (Instr->indexesAre(0, 1, 4, 5)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vmovlh(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+
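+    // High half of Src0 replicated into both halves of the result.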
+    if (Instr->indexesAre(2, 3, 2, 3)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vmovhl(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
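+    // High half of Src0 followed by the high half of Src1; note the operands
+    // to vmovhl are swapped.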
+    if (Instr->indexesAre(2, 3, 6, 7)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vmovhl(T, Src1R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+  } break;
   default:
     break;
     // TODO(jpp): figure out how to properly lower this without scalarization.
@@ -5984,10 +6097,6 @@
 
   // Unoptimized shuffle. Perform a series of inserts and extracts.
   Context.insert<InstFakeDef>(T);
-  auto *Src0 = Instr->getSrc(0);
-  auto *Src1 = Instr->getSrc(1);
-  const SizeT NumElements = typeNumElements(DestTy);
-  const Type ElementType = typeElementType(DestTy);
   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
     auto *Index = Instr->getIndex(I);
     const SizeT Elem = Index->getValue();
diff --git a/third_party/subzero/src/IceTargetLoweringARM32.h b/third_party/subzero/src/IceTargetLoweringARM32.h
index a82337a..a629627 100644
--- a/third_party/subzero/src/IceTargetLoweringARM32.h
+++ b/third_party/subzero/src/IceTargetLoweringARM32.h
@@ -885,6 +885,9 @@
              CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert<InstARM32Vcmp>(Src0, FpZero, Pred);
   }
+  void _vdup(Variable *Dest, Variable *Src, int Idx) {
+    Context.insert<InstARM32Vdup>(Dest, Src, Idx);
+  }
   void _veor(Variable *Dest, Variable *Src0, Variable *Src1) {
     Context.insert<InstARM32Veor>(Dest, Src0, Src1);
   }
@@ -908,6 +911,18 @@
   void _vmls(Variable *Dest, Variable *Src0, Variable *Src1) {
     Context.insert<InstARM32Vmls>(Dest, Src0, Src1);
   }
+  void _vmovl(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert<InstARM32Vmovl>(Dest, Src0, Src1);
+  }
+  void _vmovh(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert<InstARM32Vmovh>(Dest, Src0, Src1);
+  }
+  void _vmovhl(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert<InstARM32Vmovhl>(Dest, Src0, Src1);
+  }
+  void _vmovlh(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert<InstARM32Vmovlh>(Dest, Src0, Src1);
+  }
   void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
     Context.insert<InstARM32Vmul>(Dest, Src0, Src1);
   }
@@ -966,6 +981,9 @@
   void _vsub(Variable *Dest, Variable *Src0, Variable *Src1) {
     Context.insert<InstARM32Vsub>(Dest, Src0, Src1);
   }
+  void _vzip(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert<InstARM32Vzip>(Dest, Src0, Src1);
+  }
 
   // Iterates over the CFG and determines the maximum outgoing stack arguments
   // bytes. This information is later used during addProlog() to pre-allocate
diff --git a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
index c5eac33..f2fd83e 100644
--- a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
+++ b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
@@ -6304,22 +6304,22 @@
       break;
     }
 
-    const SizeT Index0 = Instr->getIndex(0)->getValue();
-    const SizeT Index1 = Instr->getIndex(1)->getValue();
-    const SizeT Index2 = Instr->getIndex(2)->getValue();
-    const SizeT Index3 = Instr->getIndex(3)->getValue();
-    const SizeT Index4 = Instr->getIndex(4)->getValue();
-    const SizeT Index5 = Instr->getIndex(5)->getValue();
-    const SizeT Index6 = Instr->getIndex(6)->getValue();
-    const SizeT Index7 = Instr->getIndex(7)->getValue();
-    const SizeT Index8 = Instr->getIndex(8)->getValue();
-    const SizeT Index9 = Instr->getIndex(9)->getValue();
-    const SizeT Index10 = Instr->getIndex(10)->getValue();
-    const SizeT Index11 = Instr->getIndex(11)->getValue();
-    const SizeT Index12 = Instr->getIndex(12)->getValue();
-    const SizeT Index13 = Instr->getIndex(13)->getValue();
-    const SizeT Index14 = Instr->getIndex(14)->getValue();
-    const SizeT Index15 = Instr->getIndex(15)->getValue();
+    const SizeT Index0 = Instr->getIndexValue(0);
+    const SizeT Index1 = Instr->getIndexValue(1);
+    const SizeT Index2 = Instr->getIndexValue(2);
+    const SizeT Index3 = Instr->getIndexValue(3);
+    const SizeT Index4 = Instr->getIndexValue(4);
+    const SizeT Index5 = Instr->getIndexValue(5);
+    const SizeT Index6 = Instr->getIndexValue(6);
+    const SizeT Index7 = Instr->getIndexValue(7);
+    const SizeT Index8 = Instr->getIndexValue(8);
+    const SizeT Index9 = Instr->getIndexValue(9);
+    const SizeT Index10 = Instr->getIndexValue(10);
+    const SizeT Index11 = Instr->getIndexValue(11);
+    const SizeT Index12 = Instr->getIndexValue(12);
+    const SizeT Index13 = Instr->getIndexValue(13);
+    const SizeT Index14 = Instr->getIndexValue(14);
+    const SizeT Index15 = Instr->getIndexValue(15);
 
     lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
                                    Index3, Index4, Index5, Index6, Index7,
@@ -6376,14 +6376,14 @@
       break;
     }
 
-    const SizeT Index0 = Instr->getIndex(0)->getValue();
-    const SizeT Index1 = Instr->getIndex(1)->getValue();
-    const SizeT Index2 = Instr->getIndex(2)->getValue();
-    const SizeT Index3 = Instr->getIndex(3)->getValue();
-    const SizeT Index4 = Instr->getIndex(4)->getValue();
-    const SizeT Index5 = Instr->getIndex(5)->getValue();
-    const SizeT Index6 = Instr->getIndex(6)->getValue();
-    const SizeT Index7 = Instr->getIndex(7)->getValue();
+    const SizeT Index0 = Instr->getIndexValue(0);
+    const SizeT Index1 = Instr->getIndexValue(1);
+    const SizeT Index2 = Instr->getIndexValue(2);
+    const SizeT Index3 = Instr->getIndexValue(3);
+    const SizeT Index4 = Instr->getIndexValue(4);
+    const SizeT Index5 = Instr->getIndexValue(5);
+    const SizeT Index6 = Instr->getIndexValue(6);
+    const SizeT Index7 = Instr->getIndexValue(7);
 
 #define TO_BYTE_INDEX(I) ((I) << 1)
     lowerShuffleVector_UsingPshufb(
@@ -6403,10 +6403,10 @@
   case IceType_v4f32: {
     static constexpr SizeT ExpectedNumElements = 4;
     assert(ExpectedNumElements == Instr->getNumIndexes());
-    const SizeT Index0 = Instr->getIndex(0)->getValue();
-    const SizeT Index1 = Instr->getIndex(1)->getValue();
-    const SizeT Index2 = Instr->getIndex(2)->getValue();
-    const SizeT Index3 = Instr->getIndex(3)->getValue();
+    const SizeT Index0 = Instr->getIndexValue(0);
+    const SizeT Index1 = Instr->getIndexValue(1);
+    const SizeT Index2 = Instr->getIndexValue(2);
+    const SizeT Index3 = Instr->getIndexValue(3);
     Variable *T = nullptr;
     switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
 #define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
@@ -6611,8 +6611,7 @@
           InstExtractElement::create(Func, ExtElmt, Src0, Index));
     } else {
       lowerExtractElement(InstExtractElement::create(
-          Func, ExtElmt, Src1,
-          Ctx->getConstantInt32(Index->getValue() - NumElements)));
+          Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
     }
     auto *NewT = makeReg(DestTy);
     lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,