ARM32 Vector lowering - scalarize select

With this change, ARM32 now passes the select crosstest.

Since lowering select would have introduced a three-source-operand version of
scalarizeInstruction, I generalized it into a single variadic template instead,
replacing the separate unary and binary versions.
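
For illustration only (a standalone sketch with made-up names; the real code
is the variadic scalarizeInstruction plus the applyToThunkedArgs overloads in
IceTargetLowering.h), the evaluation-order trick reduces to this: C++ may
evaluate the arguments of a direct call f(t0(), t1()) in either order, so each
argument is wrapped in a thunk and a helper invokes the thunks left to right
before calling through:

  #include <functional>
  #include <iostream>

  // Invoke the thunks in a fixed order, then call f on the results.
  static int applyInOrder(std::function<int(int, int)> f,
                          std::function<int()> t0, std::function<int()> t1) {
    int a = t0(); // always evaluated first
    int b = t1(); // always evaluated second
    return f(a, b);
  }

  int main() {
    auto makeThunk = [](int v) {
      return [v]() {
        std::cout << "evaluating " << v << "\n"; // deterministic side effect
        return v;
      };
    };
    // Prints "evaluating 1", then "evaluating 2", then "3".
    std::cout << applyInOrder([](int a, int b) { return a + b; },
                              makeThunk(1), makeThunk(2))
              << "\n";
  }

In the patch, the thunks emit InstExtractElement instructions, so pinning
their order keeps the generated instruction stream deterministic.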

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1683243003 .
diff --git a/Makefile.standalone b/Makefile.standalone
index 43b1e80..ab947f4 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -448,7 +448,8 @@
           -i x8664,sandbox,sse4.1,Om1 \
           -i arm32,neon \
           -e arm32,neon,test_vector_ops \
-          -e arm32,neon,test_select
+          -e arm32,nonsfi \
+          -e arm32,neon,test_vector_ops
 	PNACL_BIN_PATH=$(PNACL_BIN_PATH) \
 	$(LLVM_SRC_PATH)/utils/lit/lit.py -sv $(CHECK_XTEST_TESTS)
 endif
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index c696ba6..f0cf542 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -313,6 +313,7 @@
 }
 
 void TargetLowering::genTargetHelperCalls() {
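+  // RAII guard: GeneratingTargetHelpers stays true for the duration of this
+  // pass and is restored on return (see the assert in scalarizeInstruction).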
+  Utils::BoolFlagSaver _(GeneratingTargetHelpers, true);
   for (CfgNode *Node : Func->getNodes()) {
     Context.init(Node);
     while (!Context.atEnd()) {
@@ -711,10 +712,9 @@
                                          Variable *Dest, Operand *Src0,
                                          Operand *Src1) {
   scalarizeInstruction(
-      Dest, Src0, Src1,
-      [this, Kind](Variable *Dest, Variable *Src0, Variable *Src1) {
+      Dest, [this, Kind](Variable *Dest, Operand *Src0, Operand *Src1) {
         return Context.insert<InstArithmetic>(Kind, Dest, Src0, Src1);
-      });
+      }, Src0, Src1);
 }
 
 void TargetLowering::emitWithoutPrefix(const ConstantRelocatable *C,
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 3477f70..36e945f 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -325,6 +325,10 @@
   // locking/unlocking) to prevent nested bundles.
   bool AutoBundling = false;
 
+  /// Indicates whether we are in the genTargetHelperCalls phase, during which
+  /// transformations such as scalarization are legal.
+  bool GeneratingTargetHelpers = false;
+
   // _bundle_lock(), and _bundle_unlock(), were made private to force subtargets
   // to use the AutoBundle helper.
   void
@@ -469,39 +473,42 @@
 
   /// Generalizes scalarizeArithmetic to support other instruction types.
   ///
-  /// MakeInstruction is a function-like object with signature
-  /// (Variable *Dest, Variable *Src0, Variable *Src1) -> Instr *.
+  /// insertScalarInstruction is a function-like object with signature
+  /// (Variable *Dest, Variable *Srcs...) -> Inst *.
-  template <typename F>
-  void scalarizeInstruction(Variable *Dest, Operand *Src0, Operand *Src1,
-                            F &&MakeInstruction) {
+  template <typename... Operands,
+            typename F = std::function<Inst *(Variable *, Operands *...)>>
+  void scalarizeInstruction(Variable *Dest, F insertScalarInstruction,
+                            Operands *... Srcs) {
+    assert(GeneratingTargetHelpers &&
+           "scalarizeInstruction called during incorrect phase");
     const Type DestTy = Dest->getType();
     assert(isVectorType(DestTy));
     const Type DestElementTy = typeElementType(DestTy);
     const SizeT NumElements = typeNumElements(DestTy);
-    const Type Src0ElementTy = typeElementType(Src0->getType());
-    const Type Src1ElementTy = typeElementType(Src1->getType());
-
-    assert(NumElements == typeNumElements(Src0->getType()));
-    assert(NumElements == typeNumElements(Src1->getType()));
 
     Variable *T = Func->makeVariable(DestTy);
     Context.insert<InstFakeDef>(T);
-    for (SizeT I = 0; I < NumElements; ++I) {
-      Constant *Index = Ctx->getConstantInt32(I);
 
-      // Extract the next two inputs.
-      Variable *Op0 = Func->makeVariable(Src0ElementTy);
-      Context.insert<InstExtractElement>(Op0, Src0, Index);
-      Variable *Op1 = Func->makeVariable(Src1ElementTy);
-      Context.insert<InstExtractElement>(Op1, Src1, Index);
+    for (SizeT I = 0; I < NumElements; ++I) {
+      auto *Index = Ctx->getConstantInt32(I);
+
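+      // Wrap each extractelement in a thunk so that applyToThunkedArgs can
+      // force a well-defined, left-to-right emission order for the extracts.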
+      auto makeExtractThunk = [this, Index, NumElements](Operand *Src) {
+        return [this, Index, NumElements, Src]() {
+          assert(typeNumElements(Src->getType()) == NumElements);
+
+          const auto ElementTy = typeElementType(Src->getType());
+          auto *Op = Func->makeVariable(ElementTy);
+          Context.insert<InstExtractElement>(Op, Src, Index);
+          return Op;
+        };
+      };
 
       // Perform the operation as a scalar operation.
-      Variable *Res = Func->makeVariable(DestElementTy);
-      auto Arith = MakeInstruction(Res, Op0, Op1);
-      // We might have created an operation that needed a helper call.
+      auto *Res = Func->makeVariable(DestElementTy);
+      auto *Arith = applyToThunkedArgs(insertScalarInstruction, Res,
+                                       makeExtractThunk(Srcs)...);
       genTargetHelperCallFor(Arith);
 
-      // Insert the result into position.
       Variable *DestT = Func->makeVariable(DestTy);
       Context.insert<InstInsertElement>(DestT, T, Res, Index);
       T = DestT;
@@ -509,38 +516,38 @@
     Context.insert<InstAssign>(Dest, T);
   }
 
-  template <typename F>
-  void scalarizeUnaryInstruction(Variable *Dest, Operand *Src0,
-                                 F &&MakeInstruction) {
-    const Type DestTy = Dest->getType();
-    assert(isVectorType(DestTy));
-    const Type DestElementTy = typeElementType(DestTy);
-    const SizeT NumElements = typeNumElements(DestTy);
-    const Type Src0ElementTy = typeElementType(Src0->getType());
+  // applyToThunkedArgs is used by scalarizeInstruction. Ideally, we would just
+  // call insertScalarInstruction(Res, Srcs...), but C++ does not specify the
+  // evaluation order of function arguments, which would make the order of the
+  // emitted instructions unpredictable. Instead, each of the Srcs is wrapped
+  // in a thunk, and these applyToThunkedArgs overloads invoke the thunks in a
+  // well-defined order, so the generated code remains deterministic.
+  Inst *applyToThunkedArgs(
+      std::function<Inst *(Variable *, Variable *)> insertScalarInstruction,
+      Variable *Res, std::function<Variable *()> thunk0) {
+    auto *Src0 = thunk0();
+    return insertScalarInstruction(Res, Src0);
+  }
 
-    assert(NumElements == typeNumElements(Src0->getType()));
+  Inst *
+  applyToThunkedArgs(std::function<Inst *(Variable *, Variable *, Variable *)>
+                         insertScalarInstruction,
+                     Variable *Res, std::function<Variable *()> thunk0,
+                     std::function<Variable *()> thunk1) {
+    auto *Src0 = thunk0();
+    auto *Src1 = thunk1();
+    return insertScalarInstruction(Res, Src0, Src1);
+  }
 
-    Variable *T = Func->makeVariable(DestTy);
-    Context.insert<InstFakeDef>(T);
-    for (SizeT I = 0; I < NumElements; ++I) {
-      Constant *Index = Ctx->getConstantInt32(I);
-
-      // Extract the next two inputs.
-      Variable *Op0 = Func->makeVariable(Src0ElementTy);
-      Context.insert<InstExtractElement>(Op0, Src0, Index);
-
-      // Perform the operation as a scalar operation.
-      Variable *Res = Func->makeVariable(DestElementTy);
-      auto Arith = MakeInstruction(Res, Op0);
-      // We might have created an operation that needed a helper call.
-      genTargetHelperCallFor(Arith);
-
-      // Insert the result into position.
-      Variable *DestT = Func->makeVariable(DestTy);
-      Context.insert<InstInsertElement>(DestT, T, Res, Index);
-      T = DestT;
-    }
-    Context.insert<InstAssign>(Dest, T);
+  Inst *applyToThunkedArgs(
+      std::function<Inst *(Variable *, Variable *, Variable *, Variable *)>
+          insertScalarInstruction,
+      Variable *Res, std::function<Variable *()> thunk0,
+      std::function<Variable *()> thunk1, std::function<Variable *()> thunk2) {
+    auto *Src0 = thunk0();
+    auto *Src1 = thunk1();
+    auto *Src2 = thunk2();
+    return insertScalarInstruction(Res, Src0, Src1, Src2);
   }
 
   /// SandboxType enumerates all possible sandboxing strategies that
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index b4a9b67..1eaec8b 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -592,10 +592,10 @@
     const InstCast::OpKind CastKind = CastInstr->getCastKind();
 
     if (isVectorType(DestTy)) {
-      scalarizeUnaryInstruction(
-          Dest, Src0, [this, CastKind](Variable *Dest, Variable *Src) {
+      scalarizeInstruction(
+          Dest, [this, CastKind](Variable *Dest, Variable *Src) {
             return Context.insert<InstCast>(CastKind, Dest, Src);
-          });
+          }, Src0);
       CastInstr->setDeleted();
       return;
     }
@@ -753,10 +753,11 @@
       auto *CmpInstr = llvm::cast<InstIcmp>(Instr);
       const auto Condition = CmpInstr->getCondition();
       scalarizeInstruction(
-          Dest, CmpInstr->getSrc(0), CmpInstr->getSrc(1),
+          Dest,
           [this, Condition](Variable *Dest, Variable *Src0, Variable *Src1) {
             return Context.insert<InstIcmp>(Condition, Dest, Src0, Src1);
-          });
+          },
+          CmpInstr->getSrc(0), CmpInstr->getSrc(1));
       CmpInstr->setDeleted();
     }
     return;
@@ -768,14 +769,33 @@
       auto *CmpInstr = llvm::cast<InstFcmp>(Instr);
       const auto Condition = CmpInstr->getCondition();
       scalarizeInstruction(
-          Dest, CmpInstr->getSrc(0), CmpInstr->getSrc(1),
+          Dest,
           [this, Condition](Variable *Dest, Variable *Src0, Variable *Src1) {
             return Context.insert<InstFcmp>(Condition, Dest, Src0, Src1);
-          });
+          },
+          CmpInstr->getSrc(0), CmpInstr->getSrc(1));
       CmpInstr->setDeleted();
     }
     return;
   }
+  case Inst::Select: {
+    Variable *Dest = Instr->getDest();
+    const auto DestTy = Dest->getType();
+    if (isVectorType(DestTy)) {
+      auto *SelectInstr = llvm::cast<InstSelect>(Instr);
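+      // Scalarize: for each lane, extract the i1 condition and both source
+      // values, emit a scalar select, and reassemble the destination vector.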
+      scalarizeInstruction(Dest,
+                           [this](Variable *Dest, Variable *Src0,
+                                  Variable *Src1, Variable *Src2) {
+                             return Context.insert<InstSelect>(Dest, Src0, Src1,
+                                                               Src2);
+                           },
+                           llvm::cast<Variable>(SelectInstr->getSrc(0)),
+                           llvm::cast<Variable>(SelectInstr->getSrc(1)),
+                           llvm::cast<Variable>(SelectInstr->getSrc(2)));
+      SelectInstr->setDeleted();
+    }
+    return;
+  }
   }
 }
 
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index cdece9c..967eabe 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -87,21 +87,7 @@
 
 namespace X86NAMESPACE {
 
-/// A helper class to ease the settings of RandomizationPoolingPause to disable
-/// constant blinding or pooling for some translation phases.
-class BoolFlagSaver {
-  BoolFlagSaver() = delete;
-  BoolFlagSaver(const BoolFlagSaver &) = delete;
-  BoolFlagSaver &operator=(const BoolFlagSaver &) = delete;
-
-public:
-  BoolFlagSaver(bool &F, bool NewValue) : OldValue(F), Flag(F) { F = NewValue; }
-  ~BoolFlagSaver() { Flag = OldValue; }
-
-private:
-  const bool OldValue;
-  bool &Flag;
-};
+using Utils::BoolFlagSaver;
 
 template <typename Traits> class BoolFoldingEntry {
   BoolFoldingEntry(const BoolFoldingEntry &) = delete;
diff --git a/src/IceUtils.h b/src/IceUtils.h
index f9b03bc..83b3fe9 100644
--- a/src/IceUtils.h
+++ b/src/IceUtils.h
@@ -123,6 +123,25 @@
   return Val == 0 && !std::signbit(Val);
 }
 
+/// An RAII class to ensure that a boolean flag is restored to its previous
+/// value upon scope exit.
+///
+/// Used in places like RandomizationPoolingPause and generating target helper
+/// calls.
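+///
+/// Typical use (SomeFlag is a placeholder):
+///   Utils::BoolFlagSaver _(SomeFlag, true); // SomeFlag restored at scope end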
+class BoolFlagSaver {
+  BoolFlagSaver() = delete;
+  BoolFlagSaver(const BoolFlagSaver &) = delete;
+  BoolFlagSaver &operator=(const BoolFlagSaver &) = delete;
+
+public:
+  BoolFlagSaver(bool &F, bool NewValue) : OldValue(F), Flag(F) { F = NewValue; }
+  ~BoolFlagSaver() { Flag = OldValue; }
+
+private:
+  const bool OldValue;
+  bool &Flag;
+};
+
 } // end of namespace Utils
 } // end of namespace Ice
 
diff --git a/tests_lit/assembler/arm32/select-vec.ll b/tests_lit/assembler/arm32/select-vec.ll
new file mode 100644
index 0000000..0edbcf4
--- /dev/null
+++ b/tests_lit/assembler/arm32/select-vec.ll
@@ -0,0 +1,263 @@
+; Test that we handle select on vectors.
+
+; TODO(eholk): This test will need to be updated once vector select is no
+; longer scalarized.
+
+; REQUIRES: allow_dump
+
+; Compile using standalone assembler.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 \
+; RUN:   | FileCheck %s --check-prefix=ASM
+
+define internal <4 x float> @select4float(<4 x i1> %s, <4 x float> %a,
+                                          <4 x float> %b) {
+; ASM-LABEL:select4float:
+; DIS-LABEL:00000000 <select4float>:
+
+entry:
+  %res = select <4 x i1> %s, <4 x float> %a, <4 x float> %b
+
+; ASM:	# q3 = def.pseudo
+; ASM-NEXT:	vmov.s8	r0, d0[0]
+; ASM-NEXT:	vmov.f32	s16, s4
+; ASM-NEXT:	vmov.f32	s17, s8
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	vmovne.f32	s17, s16
+; ASM-NEXT:	vmov.f32	s12, s17
+; ASM-NEXT:	vmov.s8	r0, d0[1]
+; ASM-NEXT:	vmov.f32	s16, s5
+; ASM-NEXT:	vmov.f32	s17, s9
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	vmovne.f32	s17, s16
+; ASM-NEXT:	vmov.f32	s13, s17
+; ASM-NEXT:	vmov.s8	r0, d1[0]
+; ASM-NEXT:	vmov.f32	s16, s6
+; ASM-NEXT:	vmov.f32	s17, s10
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	vmovne.f32	s17, s16
+; ASM-NEXT:	vmov.f32	s14, s17
+; ASM-NEXT:	vmov.s8	r0, d1[1]
+; ASM-NEXT:	vmov.f32	s4, s7
+; ASM-NEXT:	vmov.f32	s8, s11
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	vmovne.f32	s8, s4
+; ASM-NEXT:	vmov.f32	s15, s8
+; ASM-NEXT:	vmov.f32	q0, q3
+; ASM-NEXT:	vpop	{s16, s17}
+; ASM-NEXT:	# s16 = def.pseudo
+; ASM-NEXT:	# s17 = def.pseudo
+; ASM-NEXT:	bx	lr
+
+  ret <4 x float> %res
+}
+
+define internal <4 x i32> @select4i32(<4 x i1> %s, <4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:select4i32:
+; DIS-LABEL:00000000 <select4i32>:
+
+entry:
+  %res = select <4 x i1> %s, <4 x i32> %a, <4 x i32> %b
+
+; ASM:	# q3 = def.pseudo
+; ASM-NEXT:	vmov.s8	r0, d0[0]
+; ASM-NEXT:	vmov.32	r1, d2[0]
+; ASM-NEXT:	vmov.32	r2, d4[0]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.32	d6[0], r2
+; ASM-NEXT:	vmov.s8	r0, d0[1]
+; ASM-NEXT:	vmov.32	r1, d2[1]
+; ASM-NEXT:	vmov.32	r2, d4[1]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.32	d6[1], r2
+; ASM-NEXT:	vmov.s8	r0, d1[0]
+; ASM-NEXT:	vmov.32	r1, d3[0]
+; ASM-NEXT:	vmov.32	r2, d5[0]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.32	d7[0], r2
+; ASM-NEXT:	vmov.s8	r0, d1[1]
+; ASM-NEXT:	vmov.32	r1, d3[1]
+; ASM-NEXT:	vmov.32	r2, d5[1]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.32	d7[1], r2
+; ASM-NEXT:	vmov.i32	q0, q3
+; ASM-NEXT:	bx	lr
+
+  ret <4 x i32> %res
+}
+
+define internal <8 x i16> @select8i16(<8 x i1> %s, <8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:select8i16:
+; DIS-LABEL:00000000 <select8i16>:
+
+entry:
+  %res = select <8 x i1> %s, <8 x i16> %a, <8 x i16> %b
+
+; ASM:	# q3 = def.pseudo
+; ASM-NEXT:	vmov.s8	r0, d0[0]
+; ASM-NEXT:	vmov.s16	r1, d2[0]
+; ASM-NEXT:	vmov.s16	r2, d4[0]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.16	d6[0], r2
+; ASM-NEXT:	vmov.s8	r0, d0[1]
+; ASM-NEXT:	vmov.s16	r1, d2[1]
+; ASM-NEXT:	vmov.s16	r2, d4[1]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.16	d6[1], r2
+; ASM-NEXT:	vmov.s8	r0, d0[2]
+; ASM-NEXT:	vmov.s16	r1, d2[2]
+; ASM-NEXT:	vmov.s16	r2, d4[2]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.16	d6[2], r2
+; ASM-NEXT:	vmov.s8	r0, d0[3]
+; ASM-NEXT:	vmov.s16	r1, d2[3]
+; ASM-NEXT:	vmov.s16	r2, d4[3]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.16	d6[3], r2
+; ASM-NEXT:	vmov.s8	r0, d1[0]
+; ASM-NEXT:	vmov.s16	r1, d3[0]
+; ASM-NEXT:	vmov.s16	r2, d5[0]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.16	d7[0], r2
+; ASM-NEXT:	vmov.s8	r0, d1[1]
+; ASM-NEXT:	vmov.s16	r1, d3[1]
+; ASM-NEXT:	vmov.s16	r2, d5[1]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.16	d7[1], r2
+; ASM-NEXT:	vmov.s8	r0, d1[2]
+; ASM-NEXT:	vmov.s16	r1, d3[2]
+; ASM-NEXT:	vmov.s16	r2, d5[2]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.16	d7[2], r2
+; ASM-NEXT:	vmov.s8	r0, d1[3]
+; ASM-NEXT:	vmov.s16	r1, d3[3]
+; ASM-NEXT:	vmov.s16	r2, d5[3]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.16	d7[3], r2
+; ASM-NEXT:	vmov.i16	q0, q3
+; ASM-NEXT:	bx	lr
+
+  ret <8 x i16> %res
+}
+
+define internal <16 x i8> @select16i8(<16 x i1> %s, <16 x i8> %a,
+                                      <16 x i8> %b) {
+; ASM-LABEL:select16i8:
+; DIS-LABEL:00000000 <select16i8>:
+
+entry:
+  %res = select <16 x i1> %s, <16 x i8> %a, <16 x i8> %b
+
+; ASM:	# q3 = def.pseudo
+; ASM-NEXT:	vmov.s8	r0, d0[0]
+; ASM-NEXT:	vmov.s8	r1, d2[0]
+; ASM-NEXT:	vmov.s8	r2, d4[0]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d6[0], r2
+; ASM-NEXT:	vmov.s8	r0, d0[1]
+; ASM-NEXT:	vmov.s8	r1, d2[1]
+; ASM-NEXT:	vmov.s8	r2, d4[1]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d6[1], r2
+; ASM-NEXT:	vmov.s8	r0, d0[2]
+; ASM-NEXT:	vmov.s8	r1, d2[2]
+; ASM-NEXT:	vmov.s8	r2, d4[2]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d6[2], r2
+; ASM-NEXT:	vmov.s8	r0, d0[3]
+; ASM-NEXT:	vmov.s8	r1, d2[3]
+; ASM-NEXT:	vmov.s8	r2, d4[3]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d6[3], r2
+; ASM-NEXT:	vmov.s8	r0, d0[4]
+; ASM-NEXT:	vmov.s8	r1, d2[4]
+; ASM-NEXT:	vmov.s8	r2, d4[4]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d6[4], r2
+; ASM-NEXT:	vmov.s8	r0, d0[5]
+; ASM-NEXT:	vmov.s8	r1, d2[5]
+; ASM-NEXT:	vmov.s8	r2, d4[5]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d6[5], r2
+; ASM-NEXT:	vmov.s8	r0, d0[6]
+; ASM-NEXT:	vmov.s8	r1, d2[6]
+; ASM-NEXT:	vmov.s8	r2, d4[6]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d6[6], r2
+; ASM-NEXT:	vmov.s8	r0, d0[7]
+; ASM-NEXT:	vmov.s8	r1, d2[7]
+; ASM-NEXT:	vmov.s8	r2, d4[7]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d6[7], r2
+; ASM-NEXT:	vmov.s8	r0, d1[0]
+; ASM-NEXT:	vmov.s8	r1, d3[0]
+; ASM-NEXT:	vmov.s8	r2, d5[0]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d7[0], r2
+; ASM-NEXT:	vmov.s8	r0, d1[1]
+; ASM-NEXT:	vmov.s8	r1, d3[1]
+; ASM-NEXT:	vmov.s8	r2, d5[1]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d7[1], r2
+; ASM-NEXT:	vmov.s8	r0, d1[2]
+; ASM-NEXT:	vmov.s8	r1, d3[2]
+; ASM-NEXT:	vmov.s8	r2, d5[2]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d7[2], r2
+; ASM-NEXT:	vmov.s8	r0, d1[3]
+; ASM-NEXT:	vmov.s8	r1, d3[3]
+; ASM-NEXT:	vmov.s8	r2, d5[3]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d7[3], r2
+; ASM-NEXT:	vmov.s8	r0, d1[4]
+; ASM-NEXT:	vmov.s8	r1, d3[4]
+; ASM-NEXT:	vmov.s8	r2, d5[4]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d7[4], r2
+; ASM-NEXT:	vmov.s8	r0, d1[5]
+; ASM-NEXT:	vmov.s8	r1, d3[5]
+; ASM-NEXT:	vmov.s8	r2, d5[5]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d7[5], r2
+; ASM-NEXT:	vmov.s8	r0, d1[6]
+; ASM-NEXT:	vmov.s8	r1, d3[6]
+; ASM-NEXT:	vmov.s8	r2, d5[6]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d7[6], r2
+; ASM-NEXT:	vmov.s8	r0, d1[7]
+; ASM-NEXT:	vmov.s8	r1, d3[7]
+; ASM-NEXT:	vmov.s8	r2, d5[7]
+; ASM-NEXT:	tst	r0, #1
+; ASM-NEXT:	movne	r2, r1
+; ASM-NEXT:	vmov.8	d7[7], r2
+; ASM-NEXT:	vmov.i8	q0, q3
+; ASM-NEXT:	bx	lr
+
+  ret <16 x i8> %res
+}