Revert "Emit vector mulitply and accumulate instructions for x86."

This reverts commit 61908880e6565acfadbafe93fa64de000014f1a6.

Reason for revert: The fused multiply-add/subtract instructions do not round the intermediate multiplication result before the addition, so the generated code does not follow Java floating-point rounding rules, which require each multiply and add to be rounded separately.

Change-Id: Ic0ef08691bef266c9f8d91973e596e09ff3307c6
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 5880876..086ae07 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -1125,59 +1125,13 @@
   }
 }
 
-void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr);
-  switch (instr->GetPackedType()) {
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
-      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
-      locations->SetOut(Location::SameAsFirstInput());
-  break;
-  default:
-    // VecMultiplyAccumulate is supported only for single and
-    // double precision floating points. Hence integral types
-    // are still not converted.
-    LOG(FATAL) << "Unsupported SIMD Type";
-  }
+void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
 }
 
-void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = instr->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
-  XmmRegister accumulator = locations->InAt(
-      HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>();
-  XmmRegister mul_left = locations->InAt(
-      HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>();
-  XmmRegister mul_right = locations->InAt(
-      HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>();
-  switch (instr->GetPackedType()) {
-    case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd)
-        __ vfmadd231ps(accumulator, mul_left, mul_right);
-      else
-        __ vfmsub231ps(accumulator, mul_left, mul_right);
-    break;
-    case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd)
-        __ vfmadd231pd(accumulator, mul_left, mul_right);
-      else
-        __ vfmsub231pd(accumulator, mul_left, mul_right);
-      break;
-  default:
-
-    // VecMultiplyAccumulate is supported only for single and
-    // double precision floating points. Hence integral types
-    // are still not converted.
-    LOG(FATAL) << "Unsupported SIMD Type";
-  }
+void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  // TODO: pmaddwd?
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
 void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 4795e86..4d31ab6 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1098,61 +1098,13 @@
   }
 }
 
-void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr);
-  switch (instr->GetPackedType()) {
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
-      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
-      locations->SetOut(Location::SameAsFirstInput());
-      break;
-    default:
-      // VecMultiplyAccumulate is supported only for single and
-      // double precision floating points. Hence integral types
-      // are still not converted.
-      LOG(FATAL) << "Unsupported SIMD type";
-  }
+void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
 }
 
-
-void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = instr->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
-  XmmRegister accumulator = locations->InAt(
-      HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>();
-  XmmRegister mul_left = locations->InAt(
-      HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>();
-  XmmRegister mul_right = locations->InAt(
-      HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>();
-
-  switch (instr->GetPackedType()) {
-    case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd)
-         __ vfmadd231ps(accumulator, mul_left, mul_right);
-      else
-        __ vfmsub231ps(accumulator, mul_left, mul_right);
-    break;
-    case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd)
-        __ vfmadd231pd(accumulator, mul_left, mul_right);
-      else
-        __ vfmsub231pd(accumulator, mul_left, mul_right);
-    break;
-  default:
-
-    // VecMultiplyAccumulate is supported only for single and
-    // double precision floating points. Hence integral types
-    // are still not converted.
-    LOG(FATAL) << "Unsupported SIMD Type";
-  }
+void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  // TODO: pmaddwd?
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
 void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
diff --git a/compiler/optimizing/instruction_simplifier_x86.cc b/compiler/optimizing/instruction_simplifier_x86.cc
deleted file mode 100644
index b3f67d6..0000000
--- a/compiler/optimizing/instruction_simplifier_x86.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "instruction_simplifier_x86.h"
-#include "arch/x86/instruction_set_features_x86.h"
-#include "mirror/array-inl.h"
-#include "code_generator.h"
-
-
-namespace art {
-
-namespace x86 {
-
-class InstructionSimplifierX86Visitor : public HGraphVisitor {
- public:
-  InstructionSimplifierX86Visitor(HGraph* graph,
-                                  CodeGeneratorX86 *codegen,
-                                  OptimizingCompilerStats* stats)
-      : HGraphVisitor(graph), codegen_(codegen), stats_(stats) {}
-
- private:
-  void RecordSimplification() {
-    MaybeRecordStat(stats_, MethodCompilationStat::kInstructionSimplificationsArch);
-  }
-
-  bool HasCpuFeatureFlag() {
-     return (codegen_->GetInstructionSetFeatures().HasAVX2());
-  }
-
-  /**
-   * This simplifier uses a special-purpose BB visitor.
-   * (1) No need to visit Phi nodes.
-   * (2) Since statements can be removed in a "forward" fashion,
-   *     the visitor should test if each statement is still there.
-   */
-  void VisitBasicBlock(HBasicBlock* block) OVERRIDE {
-    // TODO: fragile iteration, provide more robust iterators?
-    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
-      HInstruction* instruction = it.Current();
-      if (instruction->IsInBlock()) {
-        instruction->Accept(this);
-      }
-    }
-  }
-
-  bool TryGenerateVecMultiplyAccumulate(HVecMul* mul);
-  void VisitVecMul(HVecMul* instruction) OVERRIDE;
-
-  CodeGeneratorX86* codegen_;
-  OptimizingCompilerStats* stats_;
-};
-
-/* generic expressions for FMA
-a = (b * c) + a
-a = (b * c) – a
-*/
-bool InstructionSimplifierX86Visitor::TryGenerateVecMultiplyAccumulate(HVecMul* mul) {
-  if (!(mul->GetPackedType() == DataType::Type::kFloat32 ||
-        mul->GetPackedType() == DataType::Type::kFloat64)) {
-     return false;
-  }
-  ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator();
-  if (mul->HasOnlyOneNonEnvironmentUse()) {
-    HInstruction* use = mul->GetUses().front().GetUser();
-    if (use->IsVecAdd() || use->IsVecSub()) {
-      // Replace code looking like
-      //    VECMUL tmp, x, y
-      //    VECADD dst, acc, tmp or VECADD dst, tmp, acc
-      //      or
-      //    VECSUB dst, tmp, acc
-      // with
-      //    VECMULACC dst, acc, x, y
-
-      // Note that we do not want to (unconditionally) perform the merge when the
-      // multiplication has multiple uses and it can be merged in all of them.
-      // Multiple uses could happen on the same control-flow path, and we would
-      // then increase the amount of work. In the future we could try to evaluate
-      // whether all uses are on different control-flow paths (using dominance and
-      // reverse-dominance information) and only perform the merge when they are.
-      HInstruction* accumulator = nullptr;
-      HVecBinaryOperation* binop = use->AsVecBinaryOperation();
-      HInstruction* binop_left = binop->GetLeft();
-      HInstruction* binop_right = binop->GetRight();
-      DCHECK_NE(binop_left, binop_right);
-      if (use->IsVecSub()) {
-        if (binop_left == mul) {
-          accumulator = binop_right;
-         }
-      } else {
-        // VecAdd
-        if (binop_right == mul) {
-          accumulator = binop_left;
-        } else {
-          DCHECK_EQ(binop_left, mul);
-          accumulator = binop_right;
-        }
-      }
-      HInstruction::InstructionKind kind =
-        use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub;
-
-      if (accumulator != nullptr) {
-        HVecMultiplyAccumulate* mulacc =
-          new (allocator) HVecMultiplyAccumulate(allocator,
-                                                 kind,
-                                                 accumulator,
-                                                 mul->GetLeft(),
-                                                 mul->GetRight(),
-                                                 binop->GetPackedType(),
-                                                 binop->GetVectorLength(),
-                                                 binop->GetDexPc());
-        binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc);
-        DCHECK(!mul->HasUses());
-        mul->GetBlock()->RemoveInstruction(mul);
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-void InstructionSimplifierX86Visitor::VisitVecMul(HVecMul* instruction) {
-  if (HasCpuFeatureFlag()) {
-    if (TryGenerateVecMultiplyAccumulate(instruction)) {
-      RecordSimplification();
-    }
-  }
-}
-
-bool InstructionSimplifierX86::Run() {
-  InstructionSimplifierX86Visitor visitor(graph_, codegen_, stats_);
-  visitor.VisitReversePostOrder();
-  return true;
-}
-
-}  // namespace x86
-}  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_x86.h b/compiler/optimizing/instruction_simplifier_x86.h
deleted file mode 100644
index 1fb199f..0000000
--- a/compiler/optimizing/instruction_simplifier_x86.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
-#define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
-
-#include "nodes.h"
-#include "optimization.h"
-#include "code_generator_x86.h"
-
-namespace art {
-namespace x86 {
-
-class InstructionSimplifierX86 : public HOptimization {
- public:
-  InstructionSimplifierX86(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats)
-      : HOptimization(graph, kInstructionSimplifierX86PassName, stats),
-        codegen_(down_cast<CodeGeneratorX86*>(codegen)) {}
-
-  static constexpr const char* kInstructionSimplifierX86PassName = "instruction_simplifier_x86";
-
-  bool Run() OVERRIDE;
-
- private:
-  CodeGeneratorX86* codegen_;
-};
-
-}  // namespace x86
-}  // namespace art
-
-#endif  // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index b4f9993..c5e9a8d 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -958,10 +958,6 @@
     SetRawInputAt(2, mul_right);
   }
 
-  static constexpr int kInputAccumulatorIndex = 0;
-  static constexpr int kInputMulLeftIndex = 1;
-  static constexpr int kInputMulRightIndex = 2;
-
   bool CanBeMoved() const OVERRIDE { return true; }
 
   bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
diff --git a/compiler/optimizing/optimization.cc b/compiler/optimizing/optimization.cc
index 3ad2c6b..a38bd24 100644
--- a/compiler/optimizing/optimization.cc
+++ b/compiler/optimizing/optimization.cc
@@ -28,7 +28,6 @@
 #endif
 #ifdef ART_ENABLE_CODEGEN_x86
 #include "pc_relative_fixups_x86.h"
-#include "instruction_simplifier_x86.h"
 #endif
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
 #include "x86_memory_gen.h"
@@ -122,8 +121,6 @@
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
     case OptimizationPass::kX86MemoryOperandGeneration:
       return x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName;
-    case OptimizationPass::kInstructionSimplifierX86:
-      return x86::InstructionSimplifierX86::kInstructionSimplifierX86PassName;
 #endif
     case OptimizationPass::kNone:
       LOG(FATAL) << "kNone does not represent an actual pass";
@@ -166,7 +163,6 @@
 #ifdef ART_ENABLE_CODEGEN_x86
   X(OptimizationPass::kPcRelativeFixupsX86);
   X(OptimizationPass::kX86MemoryOperandGeneration);
-  X(OptimizationPass::kInstructionSimplifierX86);
 #endif
   LOG(FATAL) << "Cannot find optimization " << pass_name;
   UNREACHABLE();
@@ -327,10 +323,6 @@
         DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name";
         opt = new (allocator) x86::X86MemoryOperandGeneration(graph, codegen, stats);
         break;
-      case OptimizationPass::kInstructionSimplifierX86:
-        DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name";
-        opt = new (allocator) x86::InstructionSimplifierX86(graph, codegen, stats);
-        break;
 #endif
       case OptimizationPass::kNone:
         LOG(FATAL) << "kNone does not represent an actual pass";
diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h
index a9fafa0..88b283c 100644
--- a/compiler/optimizing/optimization.h
+++ b/compiler/optimizing/optimization.h
@@ -101,7 +101,6 @@
 #endif
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
   kX86MemoryOperandGeneration,
-  kInstructionSimplifierX86,
 #endif
   kNone,
   kLast = kNone
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index bb33ba3..84863e4 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -530,8 +530,7 @@
         OptDef(OptimizationPass::kSideEffectsAnalysis),
         OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"),
         OptDef(OptimizationPass::kPcRelativeFixupsX86),
-        OptDef(OptimizationPass::kX86MemoryOperandGeneration),
-        OptDef(OptimizationPass::kInstructionSimplifierX86)
+        OptDef(OptimizationPass::kX86MemoryOperandGeneration)
       };
       return RunOptimizations(graph,
                               codegen,
@@ -546,8 +545,7 @@
       OptimizationDef x86_64_optimizations[] = {
         OptDef(OptimizationPass::kSideEffectsAnalysis),
         OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"),
-        OptDef(OptimizationPass::kX86MemoryOperandGeneration),
-        OptDef(OptimizationPass::kInstructionSimplifierX86)
+        OptDef(OptimizationPass::kX86MemoryOperandGeneration)
       };
       return RunOptimizations(graph,
                               codegen,