Optimize common vector shuffle patterns for ARM32.
Use VDUP for replicating a single element.
Use VZIP for interleaving vectors.
Use VMOV Dd, Dm for rearranging quadword vectors.
Bug b/67106219
Change-Id: I0de1457454c1db6d467bf870288b7af7cb59ac09
Reviewed-on: https://chromium-review.googlesource.com/695004
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
Reviewed-on: https://swiftshader-review.googlesource.com/12968
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/third_party/subzero/src/IceAssemblerARM32.cpp b/third_party/subzero/src/IceAssemblerARM32.cpp
index 502668c..4b1fcb9 100644
--- a/third_party/subzero/src/IceAssemblerARM32.cpp
+++ b/third_party/subzero/src/IceAssemblerARM32.cpp
@@ -3418,6 +3418,97 @@
emitSIMDBase(VpaddOpcode, Dd, Dd, Dd + 1, UseQRegs, IsFloatTy);
}
+void AssemblerARM32::vdup(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
+ IValueT Idx) {
+ // VDUP (scalar) - ARMv7-A/R section A8.6.302, encoding A1:
+ // VDUP<c>.<size> <Qd>, <Dm[x]>
+ //
+ // 111100111D11iiiiddd011000QM0mmmm where Dddd=<Qd>, Mmmmm=<Dm>, and
+ // iiii=imm4 encodes <size> and [x].
+ constexpr const char *Vdup = "vdup";
+
+ const IValueT VdupOpcode = B25 | B24 | B23 | B21 | B20 | B11 | B10;
+
+ // A Q register maps onto a pair of D registers; the rest of the function
+ // works with D-register numbers.
+ const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vdup));
+ const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vdup));
+
+ constexpr bool UseQRegs = true;
+ constexpr bool IsFloatTy = false;
+
+ // imm4 packs both the element size (the position of its lowest set bit)
+ // and the element index within one D register. Lower selects which D
+ // register of the source Q register actually contains element Idx.
+ IValueT Imm4 = 0;
+ bool Lower = true;
+ switch (ElmtTy) {
+ case IceType_i8:
+ assert(Idx < 16);
+ Lower = Idx < 8;
+ Imm4 = 1 | ((Idx & 0x7) << 1);
+ break;
+ case IceType_i16:
+ assert(Idx < 8);
+ Lower = Idx < 4;
+ Imm4 = 2 | ((Idx & 0x3) << 2);
+ break;
+ case IceType_i32:
+ case IceType_f32:
+ assert(Idx < 4);
+ Lower = Idx < 2;
+ Imm4 = 4 | ((Idx & 0x1) << 3);
+ break;
+ default:
+ assert(false && "vdup only supports 8, 16, and 32-bit elements");
+ break;
+ }
+
+ // Imm4 is passed in the slot emitSIMDBase normally uses for the Vn
+ // register field, matching the iiii bits of the encoding above.
+ emitSIMDBase(VdupOpcode, Dd, Imm4, Dn + (Lower ? 0 : 1), UseQRegs, IsFloatTy);
+}
+
+void AssemblerARM32::vzip(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
+ const Operand *OpQm) {
+ // Pseudo-instruction which interleaves the elements of the lower halves of
+ // two quadword registers.
+
+ // Vzip - ARMv7-A/R section A8.6.410, encoding A1:
+ // VZIP<c>.<size> <Dd>, <Dm>
+ //
+ // 111100111D11ss10dddd00011QM0mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and
+ // ss=<size>
+ assert(ElmtTy != IceType_i64 && "vzip on i64 vector not allowed");
+
+ constexpr const char *Vzip = "vzip";
+ const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vzip));
+ const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vzip));
+ const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vzip));
+
+ constexpr bool UseQRegs = false;
+ constexpr bool IsFloatTy = false;
+
+ // VMOV Dd, Dm
+ // 111100100D10mmmmdddd0001MQM1mmmm
+ constexpr IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+ // Copy lower half of second source to upper half of destination.
+ emitSIMDBase(VmovOpcode, Dd + 1, Dm, Dm, UseQRegs, IsFloatTy);
+
+ // Copy lower half of first source to lower half of destination.
+ // Skipped when Dd == Dn: the data is already in place.
+ if (Dd != Dn)
+ emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloatTy);
+
+ constexpr IValueT ElmtShift = 18;
+ const IValueT ElmtSize = encodeElmtType(ElmtTy);
+ assert(Utils::IsUint(2, ElmtSize));
+
+ if (ElmtTy != IceType_i32 && ElmtTy != IceType_f32) {
+ constexpr IValueT VzipOpcode = B25 | B24 | B23 | B21 | B20 | B17 | B8 | B7;
+ // Zip the lower and upper half of destination.
+ emitSIMDBase(VzipOpcode | (ElmtSize << ElmtShift), Dd, 0, Dd + 1, UseQRegs,
+ IsFloatTy);
+ } else {
+ // NOTE(review): for 32-bit elements a doubleword VTRN is emitted instead
+ // of VZIP -- presumably because VZIP.32 Dd, Dm is architecturally the
+ // same operation as VTRN.32 Dd, Dm; confirm against the ARM ARM.
+ constexpr IValueT VtrnOpcode = B25 | B24 | B23 | B21 | B20 | B17 | B7;
+ emitSIMDBase(VtrnOpcode | (ElmtSize << ElmtShift), Dd, 0, Dd + 1, UseQRegs,
+ IsFloatTy);
+ }
+}
+
void AssemblerARM32::vmulqf(const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm) {
// VMUL (floating-point) - ARM section A8.8.351, encoding A1:
@@ -3448,6 +3539,110 @@
mapQRegToDReg(Qm), UseQRegs, IsFloat);
}
+void AssemblerARM32::vmovlq(const Operand *OpQd, const Operand *OpQn,
+ const Operand *OpQm) {
+ // Pseudo-instruction to copy the first source operand and insert the lower
+ // half of the second operand into the lower half of the destination.
+
+ // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+ // VMOV<c> <Dd>, <Dm>
+ //
+ // 111100111D110000ddd001011QM0mmm0 where Dddd=Qd, Mmmm=Qm, and Q=0.
+
+ constexpr const char *Vmov = "vmov";
+ const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+ const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+ const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+ constexpr bool UseQRegs = false;
+ constexpr bool IsFloat = false;
+
+ const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+ // Low half of dest <- low half of Qm; high half of dest <- high half of
+ // Qn. Each VMOV is skipped when source and destination D registers
+ // already coincide (self-move).
+ if (Dd != Dm)
+ emitSIMDBase(VmovOpcode, Dd, Dm, Dm, UseQRegs, IsFloat);
+ if (Dd + 1 != Dn + 1)
+ emitSIMDBase(VmovOpcode, Dd + 1, Dn + 1, Dn + 1, UseQRegs, IsFloat);
+}
+
+void AssemblerARM32::vmovhq(const Operand *OpQd, const Operand *OpQn,
+ const Operand *OpQm) {
+ // Pseudo-instruction to copy the first source operand and insert the high
+ // half of the second operand into the high half of the destination.
+
+ // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+ // VMOV<c> <Dd>, <Dm>
+ //
+ // 111100111D110000ddd001011QM0mmm0 where Dddd=Qd, Mmmm=Qm, and Q=0.
+
+ constexpr const char *Vmov = "vmov";
+ const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+ const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+ const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+ constexpr bool UseQRegs = false;
+ constexpr bool IsFloat = false;
+
+ const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+ // Low half of dest <- low half of Qn; high half of dest <- high half of
+ // Qm. Self-moves are elided.
+ if (Dd != Dn)
+ emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloat);
+ if (Dd + 1 != Dm + 1)
+ emitSIMDBase(VmovOpcode, Dd + 1, Dm + 1, Dm + 1, UseQRegs, IsFloat);
+}
+
+void AssemblerARM32::vmovhlq(const Operand *OpQd, const Operand *OpQn,
+ const Operand *OpQm) {
+ // Pseudo-instruction to copy the first source operand and insert the high
+ // half of the second operand into the lower half of the destination.
+
+ // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+ // VMOV<c> <Dd>, <Dm>
+ //
+ // 111100111D110000ddd001011QM0mmm0 where Dddd=Qd, Mmmm=Qm, and Q=0.
+
+ constexpr const char *Vmov = "vmov";
+ const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+ const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+ const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+ constexpr bool UseQRegs = false;
+ constexpr bool IsFloat = false;
+
+ const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+ // Low half of dest <- high half of Qm; high half of dest <- high half of
+ // Qn. Self-moves are elided.
+ if (Dd != Dm + 1)
+ emitSIMDBase(VmovOpcode, Dd, Dm + 1, Dm + 1, UseQRegs, IsFloat);
+ if (Dd + 1 != Dn + 1)
+ emitSIMDBase(VmovOpcode, Dd + 1, Dn + 1, Dn + 1, UseQRegs, IsFloat);
+}
+
+void AssemblerARM32::vmovlhq(const Operand *OpQd, const Operand *OpQn,
+ const Operand *OpQm) {
+ // Pseudo-instruction to copy the first source operand and insert the lower
+ // half of the second operand into the high half of the destination.
+
+ // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+ // VMOV<c> <Dd>, <Dm>
+ //
+ // 111100111D110000ddd001011QM0mmm0 where Dddd=Qd, Mmmm=Qm, and Q=0.
+
+ constexpr const char *Vmov = "vmov";
+ const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+ const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+ const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+ constexpr bool UseQRegs = false;
+ constexpr bool IsFloat = false;
+
+ const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+ // High half of dest <- low half of Qm; low half of dest <- low half of
+ // Qn. Self-moves are elided.
+ if (Dd + 1 != Dm)
+ emitSIMDBase(VmovOpcode, Dd + 1, Dm, Dm, UseQRegs, IsFloat);
+ if (Dd != Dn)
+ emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloat);
+}
+
void AssemblerARM32::vnegqs(Type ElmtTy, const Operand *OpQd,
const Operand *OpQm) {
// VNEG - ARM section A8.8.355, encoding A1:
diff --git a/third_party/subzero/src/IceAssemblerARM32.h b/third_party/subzero/src/IceAssemblerARM32.h
index 1f80043..43c3f56 100644
--- a/third_party/subzero/src/IceAssemblerARM32.h
+++ b/third_party/subzero/src/IceAssemblerARM32.h
@@ -546,6 +546,13 @@
void vmlap(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm);
+ // Vector element replication.
+ void vdup(Type ElmtTy, const Operand *OpQd, const Operand *OpQn, IValueT Idx);
+
+ // Vector interleave lower halves.
+ void vzip(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
+ const Operand *OpQm);
+
// Float vector multiply.
void vmulqf(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
@@ -554,6 +561,11 @@
void vmvnq(const Operand *OpQd, const Operand *OpQm);
+ // Pseudo-instructions combining the halves of two quadword registers:
+ // vmovlq/vmovhq keep one source's half in place, vmovhlq/vmovlhq cross
+ // the halves. See the .cpp implementations for the exact半 placement.
+ void vmovlq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
+ void vmovhq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
+ void vmovhlq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
+ void vmovlhq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
+
void vnegqs(const Operand *OpQd, const Operand *OpQm);
void vnegqs(Type ElmtTy, const Operand *OpQd, const Operand *OpQm);
diff --git a/third_party/subzero/src/IceInst.h b/third_party/subzero/src/IceInst.h
index 889ead5..187c16d 100644
--- a/third_party/subzero/src/IceInst.h
+++ b/third_party/subzero/src/IceInst.h
@@ -997,35 +997,45 @@
return Indexes[Pos];
}
- inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3,
- int32_t i4, int32_t i5, int32_t i6, int32_t i7) const {
+ // Convenience accessor: the constant value of the shuffle index at Pos.
+ int32_t getIndexValue(SizeT Pos) const { return getIndex(Pos)->getValue(); }
+
+ // Returns true iff this 4-element shuffle's index list is exactly
+ // <i0, i1, i2, i3>. Asserting (not checking) the element count.
+ bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3) const {
+ static constexpr SizeT ExpectedNumElements = 4;
+ assert(ExpectedNumElements == getNumIndexes());
+ (void)ExpectedNumElements;
+
+ return getIndexValue(0) == i0 && getIndexValue(1) == i1 &&
+ getIndexValue(2) == i2 && getIndexValue(3) == i3;
+ }
+
+ // Returns true iff this 8-element shuffle's index list is exactly
+ // <i0, ..., i7>. Asserting (not checking) the element count.
+ bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4,
+ int32_t i5, int32_t i6, int32_t i7) const {
+ static constexpr SizeT ExpectedNumElements = 8;
+ assert(ExpectedNumElements == getNumIndexes());
+ (void)ExpectedNumElements;
+ return getIndexValue(0) == i0 && getIndexValue(1) == i1 &&
+ getIndexValue(2) == i2 && getIndexValue(3) == i3 &&
+ getIndexValue(4) == i4 && getIndexValue(5) == i5 &&
+ getIndexValue(6) == i6 && getIndexValue(7) == i7;
+ }
- inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3,
- int32_t i4, int32_t i5, int32_t i6, int32_t i7,
- int32_t i8, int32_t i9, int32_t i10, int32_t i11,
- int32_t i12, int32_t i13, int32_t i14,
- int32_t i15) const {
+ // Returns true iff this 16-element shuffle's index list is exactly
+ // <i0, ..., i15>. Asserting (not checking) the element count.
+ bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4,
+ int32_t i5, int32_t i6, int32_t i7, int32_t i8, int32_t i9,
+ int32_t i10, int32_t i11, int32_t i12, int32_t i13,
+ int32_t i14, int32_t i15) const {
+ static constexpr SizeT ExpectedNumElements = 16;
+ assert(ExpectedNumElements == getNumIndexes());
+ (void)ExpectedNumElements;
+ return getIndexValue(0) == i0 && getIndexValue(1) == i1 &&
+ getIndexValue(2) == i2 && getIndexValue(3) == i3 &&
+ getIndexValue(4) == i4 && getIndexValue(5) == i5 &&
+ getIndexValue(6) == i6 && getIndexValue(7) == i7 &&
+ getIndexValue(8) == i8 && getIndexValue(9) == i9 &&
+ getIndexValue(10) == i10 && getIndexValue(11) == i11 &&
+ getIndexValue(12) == i12 && getIndexValue(13) == i13 &&
+ getIndexValue(14) == i14 && getIndexValue(15) == i15;
+ }
bool isMemoryWrite() const override { return false; }
diff --git a/third_party/subzero/src/IceInstARM32.cpp b/third_party/subzero/src/IceInstARM32.cpp
index 2f12b85..646730f 100644
--- a/third_party/subzero/src/IceInstARM32.cpp
+++ b/third_party/subzero/src/IceInstARM32.cpp
@@ -903,6 +903,82 @@
}
}
+// Integrated-assembler emission for the vmovl pseudo-op: dispatches on the
+// destination vector type and rejects anything that is not a 128-bit vector
+// of 8/16/32-bit elements.
+template <> void InstARM32Vmovl::emitIAS(const Cfg *Func) const {
+ auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+ const Variable *Dest = getDest();
+ switch (Dest->getType()) {
+ default:
+ llvm::report_fatal_error("Vmovlq not defined on type " +
+ typeStdString(Dest->getType()));
+ case IceType_v4i1:
+ case IceType_v8i1:
+ case IceType_v16i1:
+ case IceType_v16i8:
+ case IceType_v8i16:
+ case IceType_v4i32:
+ case IceType_v4f32: {
+ Asm->vmovlq(Dest, getSrc(0), getSrc(1));
+ } break;
+ }
+}
+
+// Integrated-assembler emission for the vmovh pseudo-op; same type dispatch
+// as Vmovl above.
+template <> void InstARM32Vmovh::emitIAS(const Cfg *Func) const {
+ auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+ const Variable *Dest = getDest();
+ switch (Dest->getType()) {
+ default:
+ llvm::report_fatal_error("Vmovhq not defined on type " +
+ typeStdString(Dest->getType()));
+ case IceType_v4i1:
+ case IceType_v8i1:
+ case IceType_v16i1:
+ case IceType_v16i8:
+ case IceType_v8i16:
+ case IceType_v4i32:
+ case IceType_v4f32: {
+ Asm->vmovhq(Dest, getSrc(0), getSrc(1));
+ } break;
+ }
+}
+
+// Integrated-assembler emission for the vmovhl pseudo-op; same type dispatch
+// as Vmovl above.
+template <> void InstARM32Vmovhl::emitIAS(const Cfg *Func) const {
+ auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+ const Variable *Dest = getDest();
+ switch (Dest->getType()) {
+ default:
+ llvm::report_fatal_error("Vmovhlq not defined on type " +
+ typeStdString(Dest->getType()));
+ case IceType_v4i1:
+ case IceType_v8i1:
+ case IceType_v16i1:
+ case IceType_v16i8:
+ case IceType_v8i16:
+ case IceType_v4i32:
+ case IceType_v4f32: {
+ Asm->vmovhlq(Dest, getSrc(0), getSrc(1));
+ } break;
+ }
+}
+
+// Integrated-assembler emission for the vmovlh pseudo-op; same type dispatch
+// as Vmovl above.
+template <> void InstARM32Vmovlh::emitIAS(const Cfg *Func) const {
+ auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+ const Variable *Dest = getDest();
+ switch (Dest->getType()) {
+ default:
+ llvm::report_fatal_error("Vmovlhq not defined on type " +
+ typeStdString(Dest->getType()));
+ case IceType_v4i1:
+ case IceType_v8i1:
+ case IceType_v16i1:
+ case IceType_v16i8:
+ case IceType_v8i16:
+ case IceType_v4i32:
+ case IceType_v4f32: {
+ Asm->vmovlhq(Dest, getSrc(0), getSrc(1));
+ } break;
+ }
+}
+
template <> void InstARM32Vneg::emitIAS(const Cfg *Func) const {
auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
const Variable *Dest = getDest();
@@ -1168,6 +1244,15 @@
assert(!Asm->needsTextFixup());
}
+// Integrated-assembler emission for vzip: the element type of the
+// destination vector selects the <size> suffix used by the assembler.
+template <> void InstARM32Vzip::emitIAS(const Cfg *Func) const {
+ auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+ const Operand *Src0 = getSrc(0);
+ const Operand *Src1 = getSrc(1);
+ Type DestTy = Dest->getType();
+ Asm->vzip(typeElementType(DestTy), Dest, Src0, Src1);
+ assert(!Asm->needsTextFixup());
+}
+
template <> void InstARM32Vmul::emitIAS(const Cfg *Func) const {
auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
const Variable *Dest = getDest();
@@ -1425,6 +1510,12 @@
this->Size = Size;
}
+// Vdup carries exactly one register source; the replicated element's index
+// is kept out-of-band in the Idx member rather than as an operand.
+InstARM32Vdup::InstARM32Vdup(Cfg *Func, Variable *Dest, Variable *Src,
+ IValueT Idx)
+ : InstARM32Pred(Func, InstARM32::Vdup, 1, Dest, CondARM32::AL), Idx(Idx) {
+ addSource(Src);
+}
+
InstARM32Trap::InstARM32Trap(Cfg *Func)
: InstARM32(Func, InstARM32::Trap, 0, nullptr) {}
@@ -1775,6 +1866,10 @@
template <> const char *InstARM32Vmls::Opcode = "vmls";
template <> const char *InstARM32Vmul::Opcode = "vmul";
template <> const char *InstARM32Vmvn::Opcode = "vmvn";
+template <> const char *InstARM32Vmovl::Opcode = "vmovl";
+template <> const char *InstARM32Vmovh::Opcode = "vmovh";
+template <> const char *InstARM32Vmovhl::Opcode = "vmovhl";
+template <> const char *InstARM32Vmovlh::Opcode = "vmovlh";
template <> const char *InstARM32Vorr::Opcode = "vorr";
template <> const char *InstARM32UnaryopFP<InstARM32::Vneg>::Opcode = "vneg";
template <> const char *InstARM32ThreeAddrFP<InstARM32::Vshl>::Opcode = "vshl";
@@ -1790,6 +1885,7 @@
const char *InstARM32ThreeAddrFP<InstARM32::Vmulh>::Opcode = "vmulh";
template <>
const char *InstARM32ThreeAddrFP<InstARM32::Vmlap>::Opcode = "vmlap";
+template <> const char *InstARM32ThreeAddrFP<InstARM32::Vzip>::Opcode = "vzip";
// Four-addr ops
template <> const char *InstARM32Mla::Opcode = "mla";
template <> const char *InstARM32Mls::Opcode = "mls";
@@ -2805,6 +2901,43 @@
getSrc(0)->dump(Func);
}
+// Textual emission of vdup: "vdup.<size> <dest>, <src>, <idx>".
+void InstARM32Vdup::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  // The constructor adds exactly one source (the register whose element is
+  // replicated); the element index lives in Idx. The previous code asserted
+  // getSrcSize() == 2 and printed getSrc(1), which is out of range and
+  // inconsistent with emitIAS() below.
+  assert(getSrcSize() == 1);
+  Type Ty = getSrc(0)->getType();
+  const char *Opcode = "vdup";
+  Str << "\t" << Opcode;
+  Str << getPredicate() << "." << getWidthString(Ty) << getVecElmtBitsize(Ty);
+  Str << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+  Str << ", " << Idx;
+}
+
+// Integrated-assembler emission of vdup: element type comes from the
+// destination vector; Idx selects the replicated lane.
+// NOTE(review): unlike the sibling emitIAS functions there is no
+// assert(!Asm->needsTextFixup()) here -- confirm whether that is intended.
+void InstARM32Vdup::emitIAS(const Cfg *Func) const {
+ assert(getSrcSize() == 1);
+ auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+ const Operand *Dest = getDest();
+ const Operand *Src = getSrc(0);
+ Type DestTy = Dest->getType();
+ Asm->vdup(typeElementType(DestTy), Dest, Src, Idx);
+}
+
+// Debug dump: "<dest> = vdup.<ty> <sources>, <idx>".
+void InstARM32Vdup::dump(const Cfg *Func) const {
+ if (!BuildDefs::dump())
+ return;
+ Ostream &Str = Func->getContext()->getStrDump();
+ dumpDest(Func);
+ Str << " = ";
+ dumpOpcodePred(Str, "vdup", getDest()->getType());
+ Str << " ";
+ dumpSources(Func);
+ Str << ", " << Idx;
+}
+
void InstARM32Trap::emit(const Cfg *Func) const {
if (!BuildDefs::dump())
return;
@@ -3386,6 +3519,7 @@
template class InstARM32LoadBase<InstARM32::Ldrex>;
template class InstARM32LoadBase<InstARM32::Vldr1d>;
template class InstARM32LoadBase<InstARM32::Vldr1q>;
+template class InstARM32ThreeAddrFP<InstARM32::Vzip>;
template class InstARM32TwoAddrGPR<InstARM32::Movt>;
template class InstARM32UnaryopGPR<InstARM32::Movw, false>;
diff --git a/third_party/subzero/src/IceInstARM32.h b/third_party/subzero/src/IceInstARM32.h
index 593d96d..e31aabc 100644
--- a/third_party/subzero/src/IceInstARM32.h
+++ b/third_party/subzero/src/IceInstARM32.h
@@ -434,12 +434,17 @@
Vcmp,
Vcvt,
Vdiv,
+ Vdup,
Veor,
Vldr1d,
Vldr1q,
Vmla,
Vmlap,
Vmls,
+ Vmovl,
+ Vmovh,
+ Vmovhl,
+ Vmovlh,
Vmrs,
Vmul,
Vmulh,
@@ -453,7 +458,8 @@
Vshr,
Vsqrt,
Vstr1,
- Vsub
+ Vsub,
+ Vzip
};
static constexpr size_t InstSize = sizeof(uint32_t);
@@ -1020,6 +1026,10 @@
using InstARM32Veor = InstARM32ThreeAddrFP<InstARM32::Veor>;
using InstARM32Vmla = InstARM32FourAddrFP<InstARM32::Vmla>;
using InstARM32Vmls = InstARM32FourAddrFP<InstARM32::Vmls>;
+using InstARM32Vmovl = InstARM32ThreeAddrFP<InstARM32::Vmovl>;
+using InstARM32Vmovh = InstARM32ThreeAddrFP<InstARM32::Vmovh>;
+using InstARM32Vmovhl = InstARM32ThreeAddrFP<InstARM32::Vmovhl>;
+using InstARM32Vmovlh = InstARM32ThreeAddrFP<InstARM32::Vmovlh>;
using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>;
using InstARM32Vmvn = InstARM32UnaryopFP<InstARM32::Vmvn>;
using InstARM32Vneg = InstARM32UnaryopSignAwareFP<InstARM32::Vneg>;
@@ -1036,6 +1046,7 @@
using InstARM32Ldrex = InstARM32LoadBase<InstARM32::Ldrex>;
using InstARM32Vldr1d = InstARM32LoadBase<InstARM32::Vldr1d>;
using InstARM32Vldr1q = InstARM32LoadBase<InstARM32::Vldr1q>;
+using InstARM32Vzip = InstARM32ThreeAddrFP<InstARM32::Vzip>;
/// MovT leaves the bottom bits alone so dest is also a source. This helps
/// indicate that a previous MovW setting dest is not dead code.
using InstARM32Movt = InstARM32TwoAddrGPR<InstARM32::Movt>;
@@ -1374,6 +1385,30 @@
SizeT Size;
};
+/// Vector element duplication/replication instruction. Replicates element
+/// Idx of the source register across every lane of the destination.
+class InstARM32Vdup final : public InstARM32Pred {
+ InstARM32Vdup() = delete;
+ InstARM32Vdup(const InstARM32Vdup &) = delete;
+ InstARM32Vdup &operator=(const InstARM32Vdup &) = delete;
+
+public:
+ /// Value must be a register.
+ static InstARM32Vdup *create(Cfg *Func, Variable *Dest, Variable *Src,
+ IValueT Idx) {
+ return new (Func->allocate<InstARM32Vdup>())
+ InstARM32Vdup(Func, Dest, Src, Idx);
+ }
+ void emit(const Cfg *Func) const override;
+ void emitIAS(const Cfg *Func) const override;
+ void dump(const Cfg *Func) const override;
+ static bool classof(const Inst *Instr) { return isClassof(Instr, Vdup); }
+
+private:
+ InstARM32Vdup(Cfg *Func, Variable *Dest, Variable *Src, IValueT Idx);
+
+ // Index of the source element being replicated (not an operand).
+ const IValueT Idx;
+};
+
class InstARM32Trap : public InstARM32 {
InstARM32Trap() = delete;
InstARM32Trap(const InstARM32Trap &) = delete;
diff --git a/third_party/subzero/src/IceTargetLoweringARM32.cpp b/third_party/subzero/src/IceTargetLoweringARM32.cpp
index 9856f7a..d820bca 100644
--- a/third_party/subzero/src/IceTargetLoweringARM32.cpp
+++ b/third_party/subzero/src/IceTargetLoweringARM32.cpp
@@ -5357,7 +5357,7 @@
Func->setError("Unexpected size for LoadSubVector");
return;
}
- _mov(Dest, T); // FIXME: necessary?
+ _mov(Dest, T);
return;
}
case Intrinsics::StoreSubVector: {
@@ -5975,8 +5975,121 @@
const Type DestTy = Dest->getType();
auto *T = makeReg(DestTy);
+ auto *Src0 = Instr->getSrc(0);
+ auto *Src1 = Instr->getSrc(1);
+ const SizeT NumElements = typeNumElements(DestTy);
+ const Type ElementType = typeElementType(DestTy);
+
+ bool Replicate = true;
+ for (SizeT I = 1; Replicate && I < Instr->getNumIndexes(); ++I) {
+ if (Instr->getIndexValue(I) != Instr->getIndexValue(0)) {
+ Replicate = false;
+ }
+ }
+
+ if (Replicate) {
+ Variable *Src0Var = legalizeToReg(Src0);
+ _vdup(T, Src0Var, Instr->getIndexValue(0));
+ _mov(Dest, T);
+ return;
+ }
switch (DestTy) {
+ case IceType_v8i1:
+ case IceType_v8i16: {
+ static constexpr SizeT ExpectedNumElements = 8;
+ assert(ExpectedNumElements == Instr->getNumIndexes());
+ (void)ExpectedNumElements;
+
+ if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ _vzip(T, Src0R, Src0R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ Variable *Src1R = legalizeToReg(Src1);
+ _vzip(T, Src0R, Src1R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(0, 2, 4, 6, 0, 2, 4, 6)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ _vqmovn2(T, Src0R, Src0R, false, false);
+ _mov(Dest, T);
+ return;
+ }
+ } break;
+ case IceType_v16i1:
+ case IceType_v16i8: {
+ static constexpr SizeT ExpectedNumElements = 16;
+ assert(ExpectedNumElements == Instr->getNumIndexes());
+ (void)ExpectedNumElements;
+
+ if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ _vzip(T, Src0R, Src0R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
+ 23)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ Variable *Src1R = legalizeToReg(Src1);
+ _vzip(T, Src0R, Src1R);
+ _mov(Dest, T);
+ return;
+ }
+ } break;
+ case IceType_v4i1:
+ case IceType_v4i32:
+ case IceType_v4f32: {
+ static constexpr SizeT ExpectedNumElements = 4;
+ assert(ExpectedNumElements == Instr->getNumIndexes());
+ (void)ExpectedNumElements;
+
+ if (Instr->indexesAre(0, 0, 1, 1)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ _vzip(T, Src0R, Src0R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(0, 4, 1, 5)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ Variable *Src1R = legalizeToReg(Src1);
+ _vzip(T, Src0R, Src1R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(0, 1, 4, 5)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ Variable *Src1R = legalizeToReg(Src1);
+ _vmovlh(T, Src0R, Src1R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(2, 3, 2, 3)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ _vmovhl(T, Src0R, Src0R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(2, 3, 6, 7)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ Variable *Src1R = legalizeToReg(Src1);
+ _vmovhl(T, Src1R, Src0R);
+ _mov(Dest, T);
+ return;
+ }
+ } break;
default:
break;
// TODO(jpp): figure out how to properly lower this without scalarization.
@@ -5984,10 +6097,6 @@
// Unoptimized shuffle. Perform a series of inserts and extracts.
Context.insert<InstFakeDef>(T);
- auto *Src0 = Instr->getSrc(0);
- auto *Src1 = Instr->getSrc(1);
- const SizeT NumElements = typeNumElements(DestTy);
- const Type ElementType = typeElementType(DestTy);
for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
auto *Index = Instr->getIndex(I);
const SizeT Elem = Index->getValue();
diff --git a/third_party/subzero/src/IceTargetLoweringARM32.h b/third_party/subzero/src/IceTargetLoweringARM32.h
index a82337a..a629627 100644
--- a/third_party/subzero/src/IceTargetLoweringARM32.h
+++ b/third_party/subzero/src/IceTargetLoweringARM32.h
@@ -885,6 +885,9 @@
CondARM32::Cond Pred = CondARM32::AL) {
Context.insert<InstARM32Vcmp>(Src0, FpZero, Pred);
}
+ // Inserts a vdup: replicate element Idx of Src across all lanes of Dest.
+ void _vdup(Variable *Dest, Variable *Src, int Idx) {
+ Context.insert<InstARM32Vdup>(Dest, Src, Idx);
+ }
void _veor(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Veor>(Dest, Src0, Src1);
}
@@ -908,6 +911,18 @@
void _vmls(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Vmls>(Dest, Src0, Src1);
}
+ // Lowering helpers for the half-register vmov pseudo-ops (see
+ // AssemblerARM32::vmov{l,h,hl,lh}q for the half placement each performs).
+ void _vmovl(Variable *Dest, Variable *Src0, Variable *Src1) {
+ Context.insert<InstARM32Vmovl>(Dest, Src0, Src1);
+ }
+ void _vmovh(Variable *Dest, Variable *Src0, Variable *Src1) {
+ Context.insert<InstARM32Vmovh>(Dest, Src0, Src1);
+ }
+ void _vmovhl(Variable *Dest, Variable *Src0, Variable *Src1) {
+ Context.insert<InstARM32Vmovhl>(Dest, Src0, Src1);
+ }
+ void _vmovlh(Variable *Dest, Variable *Src0, Variable *Src1) {
+ Context.insert<InstARM32Vmovlh>(Dest, Src0, Src1);
+ }
void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Vmul>(Dest, Src0, Src1);
}
@@ -966,6 +981,9 @@
void _vsub(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Vsub>(Dest, Src0, Src1);
}
+ // Inserts a vzip: interleave the lower halves of Src0 and Src1 into Dest.
+ void _vzip(Variable *Dest, Variable *Src0, Variable *Src1) {
+ Context.insert<InstARM32Vzip>(Dest, Src0, Src1);
+ }
// Iterates over the CFG and determines the maximum outgoing stack arguments
// bytes. This information is later used during addProlog() to pre-allocate
diff --git a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
index c5eac33..f2fd83e 100644
--- a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
+++ b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
@@ -6304,22 +6304,22 @@
break;
}
- const SizeT Index0 = Instr->getIndex(0)->getValue();
- const SizeT Index1 = Instr->getIndex(1)->getValue();
- const SizeT Index2 = Instr->getIndex(2)->getValue();
- const SizeT Index3 = Instr->getIndex(3)->getValue();
- const SizeT Index4 = Instr->getIndex(4)->getValue();
- const SizeT Index5 = Instr->getIndex(5)->getValue();
- const SizeT Index6 = Instr->getIndex(6)->getValue();
- const SizeT Index7 = Instr->getIndex(7)->getValue();
- const SizeT Index8 = Instr->getIndex(8)->getValue();
- const SizeT Index9 = Instr->getIndex(9)->getValue();
- const SizeT Index10 = Instr->getIndex(10)->getValue();
- const SizeT Index11 = Instr->getIndex(11)->getValue();
- const SizeT Index12 = Instr->getIndex(12)->getValue();
- const SizeT Index13 = Instr->getIndex(13)->getValue();
- const SizeT Index14 = Instr->getIndex(14)->getValue();
- const SizeT Index15 = Instr->getIndex(15)->getValue();
+ const SizeT Index0 = Instr->getIndexValue(0);
+ const SizeT Index1 = Instr->getIndexValue(1);
+ const SizeT Index2 = Instr->getIndexValue(2);
+ const SizeT Index3 = Instr->getIndexValue(3);
+ const SizeT Index4 = Instr->getIndexValue(4);
+ const SizeT Index5 = Instr->getIndexValue(5);
+ const SizeT Index6 = Instr->getIndexValue(6);
+ const SizeT Index7 = Instr->getIndexValue(7);
+ const SizeT Index8 = Instr->getIndexValue(8);
+ const SizeT Index9 = Instr->getIndexValue(9);
+ const SizeT Index10 = Instr->getIndexValue(10);
+ const SizeT Index11 = Instr->getIndexValue(11);
+ const SizeT Index12 = Instr->getIndexValue(12);
+ const SizeT Index13 = Instr->getIndexValue(13);
+ const SizeT Index14 = Instr->getIndexValue(14);
+ const SizeT Index15 = Instr->getIndexValue(15);
lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
Index3, Index4, Index5, Index6, Index7,
@@ -6376,14 +6376,14 @@
break;
}
- const SizeT Index0 = Instr->getIndex(0)->getValue();
- const SizeT Index1 = Instr->getIndex(1)->getValue();
- const SizeT Index2 = Instr->getIndex(2)->getValue();
- const SizeT Index3 = Instr->getIndex(3)->getValue();
- const SizeT Index4 = Instr->getIndex(4)->getValue();
- const SizeT Index5 = Instr->getIndex(5)->getValue();
- const SizeT Index6 = Instr->getIndex(6)->getValue();
- const SizeT Index7 = Instr->getIndex(7)->getValue();
+ const SizeT Index0 = Instr->getIndexValue(0);
+ const SizeT Index1 = Instr->getIndexValue(1);
+ const SizeT Index2 = Instr->getIndexValue(2);
+ const SizeT Index3 = Instr->getIndexValue(3);
+ const SizeT Index4 = Instr->getIndexValue(4);
+ const SizeT Index5 = Instr->getIndexValue(5);
+ const SizeT Index6 = Instr->getIndexValue(6);
+ const SizeT Index7 = Instr->getIndexValue(7);
#define TO_BYTE_INDEX(I) ((I) << 1)
lowerShuffleVector_UsingPshufb(
@@ -6403,10 +6403,10 @@
case IceType_v4f32: {
static constexpr SizeT ExpectedNumElements = 4;
assert(ExpectedNumElements == Instr->getNumIndexes());
- const SizeT Index0 = Instr->getIndex(0)->getValue();
- const SizeT Index1 = Instr->getIndex(1)->getValue();
- const SizeT Index2 = Instr->getIndex(2)->getValue();
- const SizeT Index3 = Instr->getIndex(3)->getValue();
+ const SizeT Index0 = Instr->getIndexValue(0);
+ const SizeT Index1 = Instr->getIndexValue(1);
+ const SizeT Index2 = Instr->getIndexValue(2);
+ const SizeT Index3 = Instr->getIndexValue(3);
Variable *T = nullptr;
switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
#define CASE_SRCS_IN(S0, S1, S2, S3) \
@@ -6611,8 +6611,7 @@
InstExtractElement::create(Func, ExtElmt, Src0, Index));
} else {
lowerExtractElement(InstExtractElement::create(
- Func, ExtElmt, Src1,
- Ctx->getConstantInt32(Index->getValue() - NumElements)));
+ Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
}
auto *NewT = makeReg(DestTy);
lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,