ARM32 Vector lowering - scalarize select
With this change, we pass the select crosstest.
Since this would have introduced a third, three-source-operand version of
scalarizeInstruction (alongside the existing binary and unary forms), I
decided to generalize it using variadic templates.
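The generalized scalarizeInstruction takes the instruction-building callable
first, followed by however many source operands the instruction needs. A
sketch of the new call shapes (mirroring the call sites in the patch below;
operand names like Cond/TrueOp/FalseOp are illustrative):

    // Unary, as used for vector casts:
    scalarizeInstruction(Dest,
                         [this, CastKind](Variable *Dest, Variable *Src) {
                           return Context.insert<InstCast>(CastKind, Dest, Src);
                         },
                         Src0);

    // Ternary, as used for vector select:
    scalarizeInstruction(Dest,
                         [this](Variable *Dest, Variable *Src0, Variable *Src1,
                                Variable *Src2) {
                           return Context.insert<InstSelect>(Dest, Src0, Src1,
                                                             Src2);
                         },
                         Cond, TrueOp, FalseOp);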
BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/1683243003 .
diff --git a/Makefile.standalone b/Makefile.standalone
index 43b1e80..ab947f4 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -448,7 +448,8 @@
-i x8664,sandbox,sse4.1,Om1 \
-i arm32,neon \
-e arm32,neon,test_vector_ops \
- -e arm32,neon,test_select
+ -e arm32,nonsfi \
+ -e arm32,neon,test_vector_ops
PNACL_BIN_PATH=$(PNACL_BIN_PATH) \
$(LLVM_SRC_PATH)/utils/lit/lit.py -sv $(CHECK_XTEST_TESTS)
endif
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index c696ba6..f0cf542 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -313,6 +313,7 @@
}
void TargetLowering::genTargetHelperCalls() {
+ Utils::BoolFlagSaver _(GeneratingTargetHelpers, true);
for (CfgNode *Node : Func->getNodes()) {
Context.init(Node);
while (!Context.atEnd()) {
@@ -711,10 +712,9 @@
Variable *Dest, Operand *Src0,
Operand *Src1) {
scalarizeInstruction(
- Dest, Src0, Src1,
- [this, Kind](Variable *Dest, Variable *Src0, Variable *Src1) {
+ Dest, [this, Kind](Variable *Dest, Operand *Src0, Operand *Src1) {
return Context.insert<InstArithmetic>(Kind, Dest, Src0, Src1);
- });
+ }, Src0, Src1);
}
void TargetLowering::emitWithoutPrefix(const ConstantRelocatable *C,
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 3477f70..36e945f 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -325,6 +325,10 @@
// locking/unlocking) to prevent nested bundles.
bool AutoBundling = false;
+ /// This indicates whether we are in the genTargetHelperCalls phase, and
+ /// therefore can do things like scalarization.
+ bool GeneratingTargetHelpers = false;
+
// _bundle_lock(), and _bundle_unlock(), were made private to force subtargets
// to use the AutoBundle helper.
void
@@ -469,39 +473,42 @@
/// Generalizes scalarizeArithmetic to support other instruction types.
///
- /// MakeInstruction is a function-like object with signature
+ /// insertScalarInstruction is a function-like object with signature
- /// (Variable *Dest, Variable *Src0, Variable *Src1) -> Instr *.
+ /// (Variable *Dest, Variable *Src0, ..., Variable *SrcN) -> Inst *.
- template <typename F>
- void scalarizeInstruction(Variable *Dest, Operand *Src0, Operand *Src1,
- F &&MakeInstruction) {
+ template <typename... Operands,
+ typename F = std::function<Inst *(Variable *, Operands *...)>>
+ void scalarizeInstruction(Variable *Dest, F insertScalarInstruction,
+ Operands *... Srcs) {
+ assert(GeneratingTargetHelpers &&
+ "scalarizeInstruction called during incorrect phase");
const Type DestTy = Dest->getType();
assert(isVectorType(DestTy));
const Type DestElementTy = typeElementType(DestTy);
const SizeT NumElements = typeNumElements(DestTy);
- const Type Src0ElementTy = typeElementType(Src0->getType());
- const Type Src1ElementTy = typeElementType(Src1->getType());
-
- assert(NumElements == typeNumElements(Src0->getType()));
- assert(NumElements == typeNumElements(Src1->getType()));
Variable *T = Func->makeVariable(DestTy);
Context.insert<InstFakeDef>(T);
- for (SizeT I = 0; I < NumElements; ++I) {
- Constant *Index = Ctx->getConstantInt32(I);
- // Extract the next two inputs.
- Variable *Op0 = Func->makeVariable(Src0ElementTy);
- Context.insert<InstExtractElement>(Op0, Src0, Index);
- Variable *Op1 = Func->makeVariable(Src1ElementTy);
- Context.insert<InstExtractElement>(Op1, Src1, Index);
+ for (SizeT I = 0; I < NumElements; ++I) {
+ auto *Index = Ctx->getConstantInt32(I);
+
+ auto makeExtractThunk = [this, Index, NumElements](Operand *Src) {
+ return [this, Index, NumElements, Src]() {
+ assert(typeNumElements(Src->getType()) == NumElements);
+
+ const auto ElementTy = typeElementType(Src->getType());
+ auto *Op = Func->makeVariable(ElementTy);
+ Context.insert<InstExtractElement>(Op, Src, Index);
+ return Op;
+ };
+ };
// Perform the operation as a scalar operation.
- Variable *Res = Func->makeVariable(DestElementTy);
- auto Arith = MakeInstruction(Res, Op0, Op1);
- // We might have created an operation that needed a helper call.
+ auto *Res = Func->makeVariable(DestElementTy);
+ auto *Arith = applyToThunkedArgs(insertScalarInstruction, Res,
+ makeExtractThunk(Srcs)...);
genTargetHelperCallFor(Arith);
- // Insert the result into position.
Variable *DestT = Func->makeVariable(DestTy);
Context.insert<InstInsertElement>(DestT, T, Res, Index);
T = DestT;
@@ -509,38 +516,38 @@
Context.insert<InstAssign>(Dest, T);
}
- template <typename F>
- void scalarizeUnaryInstruction(Variable *Dest, Operand *Src0,
- F &&MakeInstruction) {
- const Type DestTy = Dest->getType();
- assert(isVectorType(DestTy));
- const Type DestElementTy = typeElementType(DestTy);
- const SizeT NumElements = typeNumElements(DestTy);
- const Type Src0ElementTy = typeElementType(Src0->getType());
+ // applyToThunkedArgs is used by scalarizeInstruction. Ideally, we would just
+ // call insertScalarInstruction(Res, Srcs...), but C++ does not specify the
+ // evaluation order of function arguments, which would make the emitted
+ // instruction sequence unpredictable. Instead, we wrap each of the Srcs in a
+ // thunk, and these applyToThunkedArgs overloads invoke the thunks in a
+ // well-defined order so the output stays deterministic.
+ Inst *applyToThunkedArgs(
+ std::function<Inst *(Variable *, Variable *)> insertScalarInstruction,
+ Variable *Res, std::function<Variable *()> thunk0) {
+ auto *Src0 = thunk0();
+ return insertScalarInstruction(Res, Src0);
+ }
- assert(NumElements == typeNumElements(Src0->getType()));
+ Inst *
+ applyToThunkedArgs(std::function<Inst *(Variable *, Variable *, Variable *)>
+ insertScalarInstruction,
+ Variable *Res, std::function<Variable *()> thunk0,
+ std::function<Variable *()> thunk1) {
+ auto *Src0 = thunk0();
+ auto *Src1 = thunk1();
+ return insertScalarInstruction(Res, Src0, Src1);
+ }
- Variable *T = Func->makeVariable(DestTy);
- Context.insert<InstFakeDef>(T);
- for (SizeT I = 0; I < NumElements; ++I) {
- Constant *Index = Ctx->getConstantInt32(I);
-
- // Extract the next two inputs.
- Variable *Op0 = Func->makeVariable(Src0ElementTy);
- Context.insert<InstExtractElement>(Op0, Src0, Index);
-
- // Perform the operation as a scalar operation.
- Variable *Res = Func->makeVariable(DestElementTy);
- auto Arith = MakeInstruction(Res, Op0);
- // We might have created an operation that needed a helper call.
- genTargetHelperCallFor(Arith);
-
- // Insert the result into position.
- Variable *DestT = Func->makeVariable(DestTy);
- Context.insert<InstInsertElement>(DestT, T, Res, Index);
- T = DestT;
- }
- Context.insert<InstAssign>(Dest, T);
+ Inst *applyToThunkedArgs(
+ std::function<Inst *(Variable *, Variable *, Variable *, Variable *)>
+ insertScalarInstruction,
+ Variable *Res, std::function<Variable *()> thunk0,
+ std::function<Variable *()> thunk1, std::function<Variable *()> thunk2) {
+ auto *Src0 = thunk0();
+ auto *Src1 = thunk1();
+ auto *Src2 = thunk2();
+ return insertScalarInstruction(Res, Src0, Src1, Src2);
}
/// SandboxType enumerates all possible sandboxing strategies that
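For context on the evaluation-order comment above, here is a standalone
sketch (plain C++, not Subzero code) of the hazard the thunks guard against:

    #include <cstdio>

    // Stands in for an "extract" that has a side effect, namely emitting an
    // instruction (modeled here as a printf).
    static int extractLane(const char *Name) {
      std::printf("extract %s\n", Name);
      return 0;
    }

    static void scalarOp(int A, int B) { std::printf("op %d %d\n", A, B); }

    int main() {
      // The two extractLane calls may run in either order: C++ leaves the
      // evaluation order of function arguments unspecified, so the emitted
      // "instructions" are not deterministic across compilers.
      scalarOp(extractLane("Src0"), extractLane("Src1"));

      // Thunking pins the order down: build the thunks first, then invoke
      // them one at a time, as the applyToThunkedArgs overloads do.
      auto Thunk0 = [] { return extractLane("Src0"); };
      auto Thunk1 = [] { return extractLane("Src1"); };
      const int A = Thunk0();
      const int B = Thunk1();
      scalarOp(A, B);
      return 0;
    }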
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index b4a9b67..1eaec8b 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -592,10 +592,10 @@
const InstCast::OpKind CastKind = CastInstr->getCastKind();
if (isVectorType(DestTy)) {
- scalarizeUnaryInstruction(
- Dest, Src0, [this, CastKind](Variable *Dest, Variable *Src) {
+ scalarizeInstruction(
+ Dest, [this, CastKind](Variable *Dest, Variable *Src) {
return Context.insert<InstCast>(CastKind, Dest, Src);
- });
+ }, Src0);
CastInstr->setDeleted();
return;
}
@@ -753,10 +753,11 @@
auto *CmpInstr = llvm::cast<InstIcmp>(Instr);
const auto Condition = CmpInstr->getCondition();
scalarizeInstruction(
- Dest, CmpInstr->getSrc(0), CmpInstr->getSrc(1),
+ Dest,
[this, Condition](Variable *Dest, Variable *Src0, Variable *Src1) {
return Context.insert<InstIcmp>(Condition, Dest, Src0, Src1);
- });
+ },
+ CmpInstr->getSrc(0), CmpInstr->getSrc(1));
CmpInstr->setDeleted();
}
return;
@@ -768,14 +769,33 @@
auto *CmpInstr = llvm::cast<InstFcmp>(Instr);
const auto Condition = CmpInstr->getCondition();
scalarizeInstruction(
- Dest, CmpInstr->getSrc(0), CmpInstr->getSrc(1),
+ Dest,
[this, Condition](Variable *Dest, Variable *Src0, Variable *Src1) {
return Context.insert<InstFcmp>(Condition, Dest, Src0, Src1);
- });
+ },
+ CmpInstr->getSrc(0), CmpInstr->getSrc(1));
CmpInstr->setDeleted();
}
return;
}
+ case Inst::Select: {
+ Variable *Dest = Instr->getDest();
+ const auto DestTy = Dest->getType();
+ if (isVectorType(DestTy)) {
+ auto *SelectInstr = llvm::cast<InstSelect>(Instr);
+ scalarizeInstruction(Dest,
+ [this](Variable *Dest, Variable *Src0,
+ Variable *Src1, Variable *Src2) {
+ return Context.insert<InstSelect>(Dest, Src0, Src1,
+ Src2);
+ },
+ llvm::cast<Variable>(SelectInstr->getSrc(0)),
+ llvm::cast<Variable>(SelectInstr->getSrc(1)),
+ llvm::cast<Variable>(SelectInstr->getSrc(2)));
+ SelectInstr->setDeleted();
+ }
+ return;
+ }
}
}
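Conceptually, the new Select case turns each vector lane into an
extract / scalar-select / insert triple. The per-lane semantics in plain C++
(a sketch of what the expansion computes, not the code Subzero emits):

    #include <array>

    // Per-lane semantics of a scalarized <4 x i32> select.
    std::array<int, 4> select4i32(const std::array<bool, 4> &S,
                                  const std::array<int, 4> &A,
                                  const std::array<int, 4> &B) {
      std::array<int, 4> Res;
      for (int I = 0; I < 4; ++I)
        Res[I] = S[I] ? A[I] : B[I]; // extract lanes, select, insert result
      return Res;
    }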
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index cdece9c..967eabe 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -87,21 +87,7 @@
namespace X86NAMESPACE {
-/// A helper class to ease the settings of RandomizationPoolingPause to disable
-/// constant blinding or pooling for some translation phases.
-class BoolFlagSaver {
- BoolFlagSaver() = delete;
- BoolFlagSaver(const BoolFlagSaver &) = delete;
- BoolFlagSaver &operator=(const BoolFlagSaver &) = delete;
-
-public:
- BoolFlagSaver(bool &F, bool NewValue) : OldValue(F), Flag(F) { F = NewValue; }
- ~BoolFlagSaver() { Flag = OldValue; }
-
-private:
- const bool OldValue;
- bool &Flag;
-};
+using Utils::BoolFlagSaver;
template <typename Traits> class BoolFoldingEntry {
BoolFoldingEntry(const BoolFoldingEntry &) = delete;
diff --git a/src/IceUtils.h b/src/IceUtils.h
index f9b03bc..83b3fe9 100644
--- a/src/IceUtils.h
+++ b/src/IceUtils.h
@@ -123,6 +123,25 @@
return Val == 0 && !std::signbit(Val);
}
+/// An RAII class to ensure that a boolean flag is restored to its previous
+/// value upon scope exit.
+///
+/// Used in places like RandomizationPoolingPause and when generating target
+/// helper calls.
+class BoolFlagSaver {
+ BoolFlagSaver() = delete;
+ BoolFlagSaver(const BoolFlagSaver &) = delete;
+ BoolFlagSaver &operator=(const BoolFlagSaver &) = delete;
+
+public:
+ BoolFlagSaver(bool &F, bool NewValue) : OldValue(F), Flag(F) { F = NewValue; }
+ ~BoolFlagSaver() { Flag = OldValue; }
+
+private:
+ const bool OldValue;
+ bool &Flag;
+};
+
} // end of namespace Utils
} // end of namespace Ice
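A minimal usage sketch of BoolFlagSaver (the flag and function names are
stand-ins for the real TargetLowering members; this mirrors the
genTargetHelperCalls change earlier in the patch):

    static bool GeneratingTargetHelpers = false;

    void runHelperPhase() {
      // The flag is set to true for the duration of this scope and restored
      // to its previous value on every exit path, including early returns.
      Ice::Utils::BoolFlagSaver _(GeneratingTargetHelpers, true);
      // ... work that is only legal during the helper-call phase ...
    }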
diff --git a/tests_lit/assembler/arm32/select-vec.ll b/tests_lit/assembler/arm32/select-vec.ll
new file mode 100644
index 0000000..0edbcf4
--- /dev/null
+++ b/tests_lit/assembler/arm32/select-vec.ll
@@ -0,0 +1,263 @@
+; Test that we handle select on vectors.
+
+; TODO(eholk): This test will need to be updated once select is no longer
+; scalarized.
+
+; REQUIRES: allow_dump
+
+; Compile using standalone assembler.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 \
+; RUN: | FileCheck %s --check-prefix=ASM
+
+define internal <4 x float> @select4float(<4 x i1> %s, <4 x float> %a,
+ <4 x float> %b) {
+; ASM-LABEL:select4float:
+; DIS-LABEL:00000000 <select4float>:
+
+entry:
+ %res = select <4 x i1> %s, <4 x float> %a, <4 x float> %b
+
+; ASM: # q3 = def.pseudo
+; ASM-NEXT: vmov.s8 r0, d0[0]
+; ASM-NEXT: vmov.f32 s16, s4
+; ASM-NEXT: vmov.f32 s17, s8
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: vmovne.f32 s17, s16
+; ASM-NEXT: vmov.f32 s12, s17
+; ASM-NEXT: vmov.s8 r0, d0[1]
+; ASM-NEXT: vmov.f32 s16, s5
+; ASM-NEXT: vmov.f32 s17, s9
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: vmovne.f32 s17, s16
+; ASM-NEXT: vmov.f32 s13, s17
+; ASM-NEXT: vmov.s8 r0, d1[0]
+; ASM-NEXT: vmov.f32 s16, s6
+; ASM-NEXT: vmov.f32 s17, s10
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: vmovne.f32 s17, s16
+; ASM-NEXT: vmov.f32 s14, s17
+; ASM-NEXT: vmov.s8 r0, d1[1]
+; ASM-NEXT: vmov.f32 s4, s7
+; ASM-NEXT: vmov.f32 s8, s11
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: vmovne.f32 s8, s4
+; ASM-NEXT: vmov.f32 s15, s8
+; ASM-NEXT: vmov.f32 q0, q3
+; ASM-NEXT: vpop {s16, s17}
+; ASM-NEXT: # s16 = def.pseudo
+; ASM-NEXT: # s17 = def.pseudo
+; ASM-NEXT: bx lr
+
+ ret <4 x float> %res
+}
+
+define internal <4 x i32> @select4i32(<4 x i1> %s, <4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:select4i32:
+; DIS-LABEL:00000000 <select4i32>:
+
+entry:
+ %res = select <4 x i1> %s, <4 x i32> %a, <4 x i32> %b
+
+; ASM: # q3 = def.pseudo
+; ASM-NEXT: vmov.s8 r0, d0[0]
+; ASM-NEXT: vmov.32 r1, d2[0]
+; ASM-NEXT: vmov.32 r2, d4[0]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.32 d6[0], r2
+; ASM-NEXT: vmov.s8 r0, d0[1]
+; ASM-NEXT: vmov.32 r1, d2[1]
+; ASM-NEXT: vmov.32 r2, d4[1]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.32 d6[1], r2
+; ASM-NEXT: vmov.s8 r0, d1[0]
+; ASM-NEXT: vmov.32 r1, d3[0]
+; ASM-NEXT: vmov.32 r2, d5[0]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.32 d7[0], r2
+; ASM-NEXT: vmov.s8 r0, d1[1]
+; ASM-NEXT: vmov.32 r1, d3[1]
+; ASM-NEXT: vmov.32 r2, d5[1]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.32 d7[1], r2
+; ASM-NEXT: vmov.i32 q0, q3
+; ASM-NEXT: bx lr
+
+ ret <4 x i32> %res
+}
+
+define internal <8 x i16> @select8i16(<8 x i1> %s, <8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:select8i16:
+; DIS-LABEL:00000000 <select8i16>:
+
+entry:
+ %res = select <8 x i1> %s, <8 x i16> %a, <8 x i16> %b
+
+; ASM: # q3 = def.pseudo
+; ASM-NEXT: vmov.s8 r0, d0[0]
+; ASM-NEXT: vmov.s16 r1, d2[0]
+; ASM-NEXT: vmov.s16 r2, d4[0]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.16 d6[0], r2
+; ASM-NEXT: vmov.s8 r0, d0[1]
+; ASM-NEXT: vmov.s16 r1, d2[1]
+; ASM-NEXT: vmov.s16 r2, d4[1]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.16 d6[1], r2
+; ASM-NEXT: vmov.s8 r0, d0[2]
+; ASM-NEXT: vmov.s16 r1, d2[2]
+; ASM-NEXT: vmov.s16 r2, d4[2]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.16 d6[2], r2
+; ASM-NEXT: vmov.s8 r0, d0[3]
+; ASM-NEXT: vmov.s16 r1, d2[3]
+; ASM-NEXT: vmov.s16 r2, d4[3]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.16 d6[3], r2
+; ASM-NEXT: vmov.s8 r0, d1[0]
+; ASM-NEXT: vmov.s16 r1, d3[0]
+; ASM-NEXT: vmov.s16 r2, d5[0]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.16 d7[0], r2
+; ASM-NEXT: vmov.s8 r0, d1[1]
+; ASM-NEXT: vmov.s16 r1, d3[1]
+; ASM-NEXT: vmov.s16 r2, d5[1]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.16 d7[1], r2
+; ASM-NEXT: vmov.s8 r0, d1[2]
+; ASM-NEXT: vmov.s16 r1, d3[2]
+; ASM-NEXT: vmov.s16 r2, d5[2]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.16 d7[2], r2
+; ASM-NEXT: vmov.s8 r0, d1[3]
+; ASM-NEXT: vmov.s16 r1, d3[3]
+; ASM-NEXT: vmov.s16 r2, d5[3]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.16 d7[3], r2
+; ASM-NEXT: vmov.i16 q0, q3
+; ASM-NEXT: bx lr
+
+ ret <8 x i16> %res
+}
+
+define internal <16 x i8> @select16i8(<16 x i1> %s, <16 x i8> %a,
+ <16 x i8> %b) {
+; ASM-LABEL:select16i8:
+; DIS-LABEL:00000000 <select16i8>:
+
+entry:
+ %res = select <16 x i1> %s, <16 x i8> %a, <16 x i8> %b
+
+; ASM: # q3 = def.pseudo
+; ASM-NEXT: vmov.s8 r0, d0[0]
+; ASM-NEXT: vmov.s8 r1, d2[0]
+; ASM-NEXT: vmov.s8 r2, d4[0]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d6[0], r2
+; ASM-NEXT: vmov.s8 r0, d0[1]
+; ASM-NEXT: vmov.s8 r1, d2[1]
+; ASM-NEXT: vmov.s8 r2, d4[1]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d6[1], r2
+; ASM-NEXT: vmov.s8 r0, d0[2]
+; ASM-NEXT: vmov.s8 r1, d2[2]
+; ASM-NEXT: vmov.s8 r2, d4[2]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d6[2], r2
+; ASM-NEXT: vmov.s8 r0, d0[3]
+; ASM-NEXT: vmov.s8 r1, d2[3]
+; ASM-NEXT: vmov.s8 r2, d4[3]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d6[3], r2
+; ASM-NEXT: vmov.s8 r0, d0[4]
+; ASM-NEXT: vmov.s8 r1, d2[4]
+; ASM-NEXT: vmov.s8 r2, d4[4]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d6[4], r2
+; ASM-NEXT: vmov.s8 r0, d0[5]
+; ASM-NEXT: vmov.s8 r1, d2[5]
+; ASM-NEXT: vmov.s8 r2, d4[5]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d6[5], r2
+; ASM-NEXT: vmov.s8 r0, d0[6]
+; ASM-NEXT: vmov.s8 r1, d2[6]
+; ASM-NEXT: vmov.s8 r2, d4[6]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d6[6], r2
+; ASM-NEXT: vmov.s8 r0, d0[7]
+; ASM-NEXT: vmov.s8 r1, d2[7]
+; ASM-NEXT: vmov.s8 r2, d4[7]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d6[7], r2
+; ASM-NEXT: vmov.s8 r0, d1[0]
+; ASM-NEXT: vmov.s8 r1, d3[0]
+; ASM-NEXT: vmov.s8 r2, d5[0]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d7[0], r2
+; ASM-NEXT: vmov.s8 r0, d1[1]
+; ASM-NEXT: vmov.s8 r1, d3[1]
+; ASM-NEXT: vmov.s8 r2, d5[1]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d7[1], r2
+; ASM-NEXT: vmov.s8 r0, d1[2]
+; ASM-NEXT: vmov.s8 r1, d3[2]
+; ASM-NEXT: vmov.s8 r2, d5[2]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d7[2], r2
+; ASM-NEXT: vmov.s8 r0, d1[3]
+; ASM-NEXT: vmov.s8 r1, d3[3]
+; ASM-NEXT: vmov.s8 r2, d5[3]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d7[3], r2
+; ASM-NEXT: vmov.s8 r0, d1[4]
+; ASM-NEXT: vmov.s8 r1, d3[4]
+; ASM-NEXT: vmov.s8 r2, d5[4]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d7[4], r2
+; ASM-NEXT: vmov.s8 r0, d1[5]
+; ASM-NEXT: vmov.s8 r1, d3[5]
+; ASM-NEXT: vmov.s8 r2, d5[5]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d7[5], r2
+; ASM-NEXT: vmov.s8 r0, d1[6]
+; ASM-NEXT: vmov.s8 r1, d3[6]
+; ASM-NEXT: vmov.s8 r2, d5[6]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d7[6], r2
+; ASM-NEXT: vmov.s8 r0, d1[7]
+; ASM-NEXT: vmov.s8 r1, d3[7]
+; ASM-NEXT: vmov.s8 r2, d5[7]
+; ASM-NEXT: tst r0, #1
+; ASM-NEXT: movne r2, r1
+; ASM-NEXT: vmov.8 d7[7], r2
+; ASM-NEXT: vmov.i8 q0, q3
+; ASM-NEXT: bx lr
+
+ ret <16 x i8> %res
+}