ARM32 vector division lowering.

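Enables vector division (fdiv, udiv, sdiv) on ARM32 by scalarization. To make
this possible, scalarizeArithmetic moves from the x86 target into the shared
TargetLowering base class. Also implements makeVectorOfZeros for ARM32 and
fixes the "v8ii" typo in the v8i1 entry of IceTypes.def.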

Also removes an assert, as suggested by Karl in a previous CL:
https://codereview.chromium.org/1646033002/diff/1/src/IceInstARM32.cpp#newcode717

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1681003002 .
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index b1d052c..600658c 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -804,7 +804,6 @@
     Asm->vmuld(Dest, getSrc(0), getSrc(1), CondARM32::AL);
     break;
   }
-  assert(!Asm->needsTextFixup());
 }
 
 InstARM32Call::InstARM32Call(Cfg *Func, Variable *Dest, Operand *CallTarget)
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index d53fbab..75886d6 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -700,6 +700,39 @@
          Ctx->getFlags().getForceMemIntrinOpt();
 }
 
+void TargetLowering::scalarizeArithmetic(InstArithmetic::OpKind Kind,
+                                         Variable *Dest, Operand *Src0,
+                                         Operand *Src1) {
+  const Type VecTy = Dest->getType();
+  assert(isVectorType(VecTy));
+  const Type LaneTy = typeElementType(VecTy);
+  const SizeT NumLanes = typeNumElements(VecTy);
+
+  Operand *Partial = Ctx->getConstantUndef(VecTy);
+  for (SizeT Lane = 0; Lane < NumLanes; ++Lane) {
+    Constant *LaneIdx = Ctx->getConstantInt32(Lane);
+
+    // Pull this lane out of each source vector.
+    Variable *Lhs = Func->makeVariable(LaneTy);
+    Context.insert<InstExtractElement>(Lhs, Src0, LaneIdx);
+    Variable *Rhs = Func->makeVariable(LaneTy);
+    Context.insert<InstExtractElement>(Rhs, Src1, LaneIdx);
+
+    // Apply Kind to the two scalars; the scalar form may itself need to be
+    // turned into a helper call, so give the target a chance to do that.
+    Variable *LaneRes = Func->makeVariable(LaneTy);
+    genTargetHelperCallFor(
+        Context.insert<InstArithmetic>(Kind, LaneRes, Lhs, Rhs));
+
+    // Insert the scalar result at this lane, yielding a new vector variable.
+    Variable *Updated = Func->makeVariable(VecTy);
+    Context.insert<InstInsertElement>(Updated, Partial, LaneRes, LaneIdx);
+    Partial = Updated;
+  }
+
+  Context.insert<InstAssign>(Dest, Partial);
+}
+
 void TargetLowering::emitWithoutPrefix(const ConstantRelocatable *C,
                                        const char *Suffix) const {
   if (!BuildDefs::dump())
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 1a0f0b7..55597ca 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -464,6 +464,12 @@
 
   bool shouldOptimizeMemIntrins();
 
+  /// Lowers a vector arithmetic operation one element at a time: each pair of
+  /// elements is extracted from Src0 and Src1, combined with a scalar
+  /// instruction of the given Kind, and inserted into the result for Dest.
+  void scalarizeArithmetic(InstArithmetic::OpKind Kind, Variable *Dest,
+                           Operand *Src0, Operand *Src1);
+
   /// SandboxType enumerates all possible sandboxing strategies that
   enum SandboxType {
     ST_None,
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 9fa218b..1dde797 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -430,6 +430,18 @@
     const Type DestTy = Dest->getType();
     const InstArithmetic::OpKind Op =
         llvm::cast<InstArithmetic>(Instr)->getOp();
+    if (isVectorType(DestTy)) {
+      switch (Op) {
+      default:
+        break;
+      case InstArithmetic::Fdiv:
+      case InstArithmetic::Udiv:
+      case InstArithmetic::Sdiv:
+        scalarizeArithmetic(Op, Dest, Instr->getSrc(0), Instr->getSrc(1));
+        Instr->setDeleted();
+        return;
+      }
+    }
     switch (DestTy) {
     default:
       return;
@@ -2015,7 +2027,8 @@
   Variable *SrcLoReg = legalizeToReg(SrcLo);
   switch (Ty) {
   default:
-    llvm::report_fatal_error("Unexpected type");
+    llvm::report_fatal_error("Unexpected type in div0Check: " +
+                             typeIceString(Ty));
   case IceType_i8:
   case IceType_i16: {
     Operand *ShAmtImm = shAmtImm(32 - getScalarIntBitWidth(Ty));
@@ -5508,7 +5521,8 @@
 Variable *TargetARM32::makeVectorOfZeros(Type Ty, int32_t RegNum) {
   Variable *Reg = makeReg(Ty, RegNum);
   Context.insert<InstFakeDef>(Reg);
-  UnimplementedError(Func->getContext()->getFlags());
+  assert(isVectorType(Ty));
+  _veor(Reg, Reg, Reg);
   return Reg;
 }
 
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index bda5077..ee2e5b0 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -343,9 +343,6 @@
 
   void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
 
-  void scalarizeArithmetic(InstArithmetic::OpKind K, Variable *Dest,
-                           Operand *Src0, Operand *Src1);
-
   void emitGetIP(CfgNode *Node) {
     dispatchToConcrete(&Traits::ConcreteTarget::emitGetIP, std::move(Node));
   }
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 8d838d4..6e338e6 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -6061,41 +6061,6 @@
   _br(DefaultTarget);
 }
 
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::scalarizeArithmetic(InstArithmetic::OpKind Kind,
-                                                    Variable *Dest,
-                                                    Operand *Src0,
-                                                    Operand *Src1) {
-  assert(isVectorType(Dest->getType()));
-  Type Ty = Dest->getType();
-  Type ElementTy = typeElementType(Ty);
-  SizeT NumElements = typeNumElements(Ty);
-
-  Operand *T = Ctx->getConstantUndef(Ty);
-  for (SizeT I = 0; I < NumElements; ++I) {
-    Constant *Index = Ctx->getConstantInt32(I);
-
-    // Extract the next two inputs.
-    Variable *Op0 = Func->makeVariable(ElementTy);
-    Context.insert<InstExtractElement>(Op0, Src0, Index);
-    Variable *Op1 = Func->makeVariable(ElementTy);
-    Context.insert<InstExtractElement>(Op1, Src1, Index);
-
-    // Perform the arithmetic as a scalar operation.
-    Variable *Res = Func->makeVariable(ElementTy);
-    auto *Arith = Context.insert<InstArithmetic>(Kind, Res, Op0, Op1);
-    // We might have created an operation that needed a helper call.
-    genTargetHelperCallFor(Arith);
-
-    // Insert the result into position.
-    Variable *DestT = Func->makeVariable(Ty);
-    Context.insert<InstInsertElement>(DestT, T, Res, Index);
-    T = DestT;
-  }
-
-  Context.insert<InstAssign>(Dest, T);
-}
-
 /// The following pattern occurs often in lowered C and C++ code:
 ///
 ///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
diff --git a/src/IceTypes.def b/src/IceTypes.def
index a1a2552..c5f3386 100644
--- a/src/IceTypes.def
+++ b/src/IceTypes.def
@@ -45,7 +45,7 @@
   X(f32,    2,  4,  1, f32,  "float",       "f32")                             \
   X(f64,    3,  8,  1, f64,  "double",      "f64")                             \
   X(v4i1,   4,  1,  4, i1,   "<4 x i1>",    "v4i1")                            \
-  X(v8i1,   4,  1,  8, i1,   "<8 x i1>",    "v8ii")                            \
+  X(v8i1,   4,  1,  8, i1,   "<8 x i1>",    "v8i1")                            \
   X(v16i1,  4,  1, 16, i1,   "<16 x i1>",   "v16i1")                           \
   X(v16i8,  4,  1, 16, i8,   "<16 x i8>",   "v16i8")                           \
   X(v8i16,  4,  2,  8, i16,  "<8 x i16>",   "v8i16")                           \
diff --git a/tests_lit/assembler/arm32/udiv-vec.ll b/tests_lit/assembler/arm32/udiv-vec.ll
new file mode 100644
index 0000000..6c7c3a1
--- /dev/null
+++ b/tests_lit/assembler/arm32/udiv-vec.ll
@@ -0,0 +1,246 @@
+; Show that we know how to translate vector division instructions.
+
+; REQUIRES: allow_dump
+
+; Compile using standalone assembler.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 -mattr=hwdiv-arm \
+; RUN:   | FileCheck %s --check-prefix=ASM
+
+; Show bytes in assembled standalone code.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -O2 -mattr=hwdiv-arm \
+; RUN:   | FileCheck %s --check-prefix=DIS
+
+; Compile using integrated assembler.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --args -O2 -mattr=hwdiv-arm \
+; RUN:   | FileCheck %s --check-prefix=IASM
+
+; Show bytes in assembled integrated code.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -O2 -mattr=hwdiv-arm \
+; RUN:   | FileCheck %s --check-prefix=DIS
+
+define internal <4 x float> @testVdivFloat4(<4 x float> %v1, <4 x float> %v2) {
+; ASM-LABEL: testVdivFloat4:
+; DIS-LABEL: 00000000 <testVdivFloat4>:
+; IASM-LABEL: testVdivFloat4:
+
+entry:
+  %res = fdiv <4 x float> %v1, %v2
+
+; TODO(eholk): this code could be a lot better. Fix the code generator
+; and update the test. Same for the rest of the tests.
+
+; ASM:     vdiv.f32        s8, s8, s9
+; ASM:     vdiv.f32        s8, s8, s9
+; ASM:     vdiv.f32        s8, s8, s9
+; ASM:     vdiv.f32        s0, s0, s4
+
+; DIS:   8:	ee844a24
+; DIS:  1c:	ee844a24
+; DIS:  2c:	ee844a24
+; DIS:  3c:	ee800a02
+
+; IASM-NOT:     vdiv
+
+  ret <4 x float> %res
+}
+
+define internal <4 x i32> @testVdiv4i32(<4 x i32> %v1, <4 x i32> %v2) {
+; ASM-LABEL: testVdiv4i32:
+; DIS-LABEL: 00000050 <testVdiv4i32>:
+; IASM-LABEL: testVdiv4i32:
+
+entry:
+  %res = udiv <4 x i32> %v1, %v2
+
+; ASM:     udiv r0, r0, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     udiv r0, r0, r1
+
+; DIS:  64:	e730f110
+; DIS:  84:	e730f110
+; DIS:  a0:	e730f110
+; DIS:  bc:	e730f110
+
+; IASM-NOT:     udiv
+
+  ret <4 x i32> %res
+}
+
+define internal <8 x i16> @testVdiv8i16(<8 x i16> %v1, <8 x i16> %v2) {
+; ASM-LABEL: testVdiv8i16:
+; DIS-LABEL: 000000d0 <testVdiv8i16>:
+; IASM-LABEL: testVdiv8i16:
+
+entry:
+  %res = udiv <8 x i16> %v1, %v2
+
+; ASM:     uxth            r0, r0
+; ASM:     uxth            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxth            r0, r0
+; ASM:     uxth            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxth            r0, r0
+; ASM:     uxth            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxth            r0, r0
+; ASM:     uxth            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxth            r0, r0
+; ASM:     uxth            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxth            r0, r0
+; ASM:     uxth            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxth            r0, r0
+; ASM:     uxth            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxth            r0, r0
+; ASM:     uxth            r1, r1
+; ASM:     udiv r0, r0, r1
+
+; DIS:  e4:	e6ff0070
+; DIS:  e8:	e6ff1071
+; DIS:  ec:	e730f110
+; DIS: 10c:	e6ff0070
+; DIS: 110:	e6ff1071
+; DIS: 114:	e730f110
+; DIS: 130:	e6ff0070
+; DIS: 134:	e6ff1071
+; DIS: 138:	e730f110
+; DIS: 154:	e6ff0070
+; DIS: 158:	e6ff1071
+; DIS: 15c:	e730f110
+; DIS: 178:	e6ff0070
+; DIS: 17c:	e6ff1071
+; DIS: 180:	e730f110
+; DIS: 19c:	e6ff0070
+; DIS: 1a0:	e6ff1071
+; DIS: 1a4:	e730f110
+; DIS: 1c0:	e6ff0070
+; DIS: 1c4:	e6ff1071
+; DIS: 1c8:	e730f110
+; DIS: 1e4:	e6ff0070
+; DIS: 1e8:	e6ff1071
+; DIS: 1ec:	e730f110
+
+; IASM-NOT:     uxth
+; IASM-NOT:     udiv
+
+  ret <8 x i16> %res
+}
+
+define internal <16 x i8> @testVdiv16i8(<16 x i8> %v1, <16 x i8> %v2) {
+; ASM-LABEL: testVdiv16i8:
+; DIS-LABEL: 00000200 <testVdiv16i8>:
+; IASM-LABEL: testVdiv16i8:
+
+entry:
+  %res = udiv <16 x i8> %v1, %v2
+
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+; ASM:     uxtb            r0, r0
+; ASM:     uxtb            r1, r1
+; ASM:     udiv r0, r0, r1
+
+; DIS: 214:	e6ef0070
+; DIS: 218:	e6ef1071
+; DIS: 21c:	e730f110
+; DIS: 23c:	e6ef0070
+; DIS: 240:	e6ef1071
+; DIS: 244:	e730f110
+; DIS: 260:	e6ef0070
+; DIS: 264:	e6ef1071
+; DIS: 268:	e730f110
+; DIS: 284:	e6ef0070
+; DIS: 288:	e6ef1071
+; DIS: 28c:	e730f110
+; DIS: 2a8:	e6ef0070
+; DIS: 2ac:	e6ef1071
+; DIS: 2b0:	e730f110
+; DIS: 2cc:	e6ef0070
+; DIS: 2d0:	e6ef1071
+; DIS: 2d4:	e730f110
+; DIS: 2f0:	e6ef0070
+; DIS: 2f4:	e6ef1071
+; DIS: 2f8:	e730f110
+; DIS: 314:	e6ef0070
+; DIS: 318:	e6ef1071
+; DIS: 31c:	e730f110
+; DIS: 338:	e6ef0070
+; DIS: 33c:	e6ef1071
+; DIS: 340:	e730f110
+; DIS: 35c:	e6ef0070
+; DIS: 360:	e6ef1071
+; DIS: 364:	e730f110
+; DIS: 380:	e6ef0070
+; DIS: 384:	e6ef1071
+; DIS: 388:	e730f110
+; DIS: 3a4:	e6ef0070
+; DIS: 3a8:	e6ef1071
+; DIS: 3ac:	e730f110
+; DIS: 3c8:	e6ef0070
+; DIS: 3cc:	e6ef1071
+; DIS: 3d0:	e730f110
+; DIS: 3ec:	e6ef0070
+; DIS: 3f0:	e6ef1071
+; DIS: 3f4:	e730f110
+; DIS: 410:	e6ef0070
+; DIS: 414:	e6ef1071
+; DIS: 418:	e730f110
+; DIS: 434:	e6ef0070
+; DIS: 438:	e6ef1071
+; DIS: 43c:	e730f110
+
+; IASM-NOT:     uxtb
+; IASM-NOT:     udiv
+
+  ret <16 x i8> %res
+}