Revert "Revert^2 "Implement Dot Product Vectorization for x86""

This reverts commit 7cf5607f472020711e36eedbbfebb25b40d3f90e.

Bug: 144947842

Reason for revert: Seems to have broken android.jvmti.cts.JvmtiHostTest1936#testJvmti.

Change-Id: Ied6ff6ddf1cb2e3e76adcaa0fda5e36af254b7c5
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 1390af2..68aef77 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -1201,38 +1201,11 @@
 }
 
 void LocationsBuilderX86::VisitVecDotProd(HVecDotProd* instruction) {
-  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
-  locations->SetInAt(0, Location::RequiresFpuRegister());
-  locations->SetInAt(1, Location::RequiresFpuRegister());
-  locations->SetInAt(2, Location::RequiresFpuRegister());
-  locations->SetOut(Location::SameAsFirstInput());
-  locations->AddTemp(Location::RequiresFpuRegister());
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
 void InstructionCodeGeneratorX86::VisitVecDotProd(HVecDotProd* instruction) {
-  bool cpu_has_avx = CpuHasAvxFeatureFlag();
-  LocationSummary* locations = instruction->GetLocations();
-  XmmRegister acc = locations->InAt(0).AsFpuRegister<XmmRegister>();
-  XmmRegister left = locations->InAt(1).AsFpuRegister<XmmRegister>();
-  XmmRegister right = locations->InAt(2).AsFpuRegister<XmmRegister>();
-  switch (instruction->GetPackedType()) {
-    case DataType::Type::kInt32: {
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
-      if (!cpu_has_avx) {
-        __ movaps(tmp, right);
-        __ pmaddwd(tmp, left);
-        __ paddd(acc, tmp);
-      } else {
-        __ vpmaddwd(tmp, left, right);
-        __ vpaddd(acc, acc, tmp);
-      }
-      break;
-    }
-    default:
-      LOG(FATAL) << "Unsupported SIMD Type" << instruction->GetPackedType();
-      UNREACHABLE();
-  }
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
 // Helper to set up locations for vector memory operations.
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 7fac44d..19dfd1d 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1174,38 +1174,11 @@
 }
 
 void LocationsBuilderX86_64::VisitVecDotProd(HVecDotProd* instruction) {
-  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
-  locations->SetInAt(0, Location::RequiresFpuRegister());
-  locations->SetInAt(1, Location::RequiresFpuRegister());
-  locations->SetInAt(2, Location::RequiresFpuRegister());
-  locations->SetOut(Location::SameAsFirstInput());
-  locations->AddTemp(Location::RequiresFpuRegister());
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
 void InstructionCodeGeneratorX86_64::VisitVecDotProd(HVecDotProd* instruction) {
-  bool cpu_has_avx = CpuHasAvxFeatureFlag();
-  LocationSummary* locations = instruction->GetLocations();
-  XmmRegister acc = locations->InAt(0).AsFpuRegister<XmmRegister>();
-  XmmRegister left = locations->InAt(1).AsFpuRegister<XmmRegister>();
-  XmmRegister right = locations->InAt(2).AsFpuRegister<XmmRegister>();
-  switch (instruction->GetPackedType()) {
-    case DataType::Type::kInt32: {
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
-      if (!cpu_has_avx) {
-        __ movaps(tmp, right);
-        __ pmaddwd(tmp, left);
-        __ paddd(acc, tmp);
-      } else {
-        __ vpmaddwd(tmp, left, right);
-        __ vpaddd(acc, acc, tmp);
-      }
-      break;
-    }
-    default:
-      LOG(FATAL) << "Unsupported SIMD Type" << instruction->GetPackedType();
-      UNREACHABLE();
-  }
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
 // Helper to set up locations for vector memory operations.
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 567a41e..9c4e9d2 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -1623,19 +1623,13 @@
                              kNoDotProd;
             return TrySetVectorLength(16);
           case DataType::Type::kUint16:
-            *restrictions |= kNoDiv |
-                             kNoAbs |
-                             kNoSignedHAdd |
-                             kNoUnroundedHAdd |
-                             kNoSAD |
-                             kNoDotProd;
-            return TrySetVectorLength(8);
           case DataType::Type::kInt16:
             *restrictions |= kNoDiv |
                              kNoAbs |
                              kNoSignedHAdd |
                              kNoUnroundedHAdd |
-                             kNoSAD;
+                             kNoSAD|
+                             kNoDotProd;
             return TrySetVectorLength(8);
           case DataType::Type::kInt32:
             *restrictions |= kNoDiv | kNoSAD;
@@ -2172,7 +2166,7 @@
                                               bool generate_code,
                                               DataType::Type reduction_type,
                                               uint64_t restrictions) {
-  if (!instruction->IsAdd() || reduction_type != DataType::Type::kInt32) {
+  if (!instruction->IsAdd() || (reduction_type != DataType::Type::kInt32)) {
     return false;
   }
 
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 55f7691..166aec8 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -2268,20 +2268,6 @@
 }
 
 
-void X86Assembler::vpmaddwd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
-  DCHECK(CpuHasAVXorAVX2FeatureFlag());
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  uint8_t ByteZero = 0x00, ByteOne = 0x00;
-  ByteZero = EmitVexPrefixByteZero(/* is_twobyte_form=*/ true);
-  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
-  ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
-  EmitUint8(ByteZero);
-  EmitUint8(ByteOne);
-  EmitUint8(0xF5);
-  EmitXmmRegisterOperand(dst, src2);
-}
-
-
 void X86Assembler::phaddw(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 27fde26..1b6941c 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -577,7 +577,6 @@
   void pavgw(XmmRegister dst, XmmRegister src);
   void psadbw(XmmRegister dst, XmmRegister src);
   void pmaddwd(XmmRegister dst, XmmRegister src);
-  void vpmaddwd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
   void phaddw(XmmRegister dst, XmmRegister src);
   void phaddd(XmmRegister dst, XmmRegister src);
   void haddps(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 9253730..12d9646 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -965,11 +965,6 @@
   DriverStr(RepeatFF(&x86::X86Assembler::pmaddwd, "pmaddwd %{reg2}, %{reg1}"), "pmaddwd");
 }
 
-TEST_F(AssemblerX86AVXTest, VPMAddWD) {
-  DriverStr(
-      RepeatFFF(&x86::X86Assembler::vpmaddwd, "vpmaddwd %{reg3}, %{reg2}, %{reg1}"), "vpmaddwd");
-}
-
 TEST_F(AssemblerX86Test, PHAddW) {
   DriverStr(RepeatFF(&x86::X86Assembler::phaddw, "phaddw %{reg2}, %{reg1}"), "phaddw");
 }
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 2c5dd9e..64246aa 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -71,7 +71,6 @@
   return false;
 }
 
-
 void X86_64Assembler::call(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex32(reg);
@@ -759,6 +758,7 @@
   EmitOperand(src.LowBits(), Operand(dst));
 }
 
+
 void X86_64Assembler::addss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
@@ -768,6 +768,7 @@
   EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
+
 void X86_64Assembler::addss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
@@ -2632,6 +2633,7 @@
   EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
+
 void X86_64Assembler::pxor(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
@@ -3143,35 +3145,6 @@
   EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
-void X86_64Assembler::vpmaddwd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
-  DCHECK(CpuHasAVXorAVX2FeatureFlag());
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  bool is_twobyte_form = false;
-  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
-  if (!src2.NeedsRex()) {
-    is_twobyte_form = true;
-  }
-  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
-  X86_64ManagedRegister vvvv_reg =
-      X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
-  if (is_twobyte_form) {
-    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
-  } else {
-    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
-                                   /*X=*/ false,
-                                   src2.NeedsRex(),
-                                   SET_VEX_M_0F);
-    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
-  }
-  EmitUint8(ByteZero);
-  EmitUint8(ByteOne);
-  if (!is_twobyte_form) {
-    EmitUint8(ByteTwo);
-  }
-  EmitUint8(0xF5);
-  EmitXmmRegisterOperand(dst.LowBits(), src2);
-}
-
 void X86_64Assembler::phaddw(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 70072d9..15f3ab9 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -615,7 +615,6 @@
   void pavgw(XmmRegister dst, XmmRegister src);
   void psadbw(XmmRegister dst, XmmRegister src);
   void pmaddwd(XmmRegister dst, XmmRegister src);
-  void vpmaddwd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
   void phaddw(XmmRegister dst, XmmRegister src);
   void phaddd(XmmRegister dst, XmmRegister src);
   void haddps(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 3921c4a..e3b8390 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1740,11 +1740,6 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmaddwd, "pmaddwd %{reg2}, %{reg1}"), "pmadwd");
 }
 
-TEST_F(AssemblerX86_64AVXTest, VPmaddwd) {
-  DriverStr(RepeatFFF(&x86_64::X86_64Assembler::vpmaddwd,
-                      "vpmaddwd %{reg3}, %{reg2}, %{reg1}"), "vpmaddwd");
-}
-
 TEST_F(AssemblerX86_64Test, Phaddw) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::phaddw, "phaddw %{reg2}, %{reg1}"), "phaddw");
 }