[x86] extend machine combiner reassociation optimization to SSE scalar adds

Extend the reassociation optimization of http://reviews.llvm.org/rL240361 (D10460)
to SSE scalar single-precision FP adds in addition to the AVX scalar single-precision
FP adds it already handled.

With the 'switch' in place, we can trivially add other opcodes and test cases in
future patches.
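
For context, here is a minimal sketch of the kind of switch-based predicate this
refers to in llvm/lib/Target/X86/X86InstrInfo.cpp. The function name below is
illustrative rather than copied from the patch; the opcode enumerators
(X86::ADDSSrr, X86::VADDSSrr) come from the TableGen-generated X86GenInstrInfo.inc:

  // Returns true if the machine combiner may reassociate instructions with
  // this opcode. Extending the optimization to another opcode is then a
  // one-line change: add a further 'case'.
  static bool isReassociationCandidateOpcode(unsigned Opcode) {
    switch (Opcode) {
    case X86::ADDSSrr:  // SSE scalar single-precision FP add
    case X86::VADDSSrr: // AVX scalar single-precision FP add
      return true;
    default:
      return false;
    }
  }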

Differential Revision: http://reviews.llvm.org/D10975

llvm-svn: 241515
diff --git a/llvm/test/CodeGen/X86/machine-combiner.ll b/llvm/test/CodeGen/X86/machine-combiner.ll
index d4cd59f..545decb 100644
--- a/llvm/test/CodeGen/X86/machine-combiner.ll
+++ b/llvm/test/CodeGen/X86/machine-combiner.ll
@@ -1,15 +1,23 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
 
 ; Verify that the first two adds are independent regardless of how the inputs are
 ; commuted. The destination registers are used as source registers for the third add.
 
 define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds1:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fadd float %x0, %x1
   %t1 = fadd float %t0, %x2
   %t2 = fadd float %t1, %x3
@@ -17,12 +25,19 @@
 }
 
 define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds2:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fadd float %x0, %x1
   %t1 = fadd float %x2, %t0
   %t2 = fadd float %t1, %x3
@@ -30,12 +45,19 @@
 }
 
 define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds3:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fadd float %x0, %x1
   %t1 = fadd float %t0, %x2
   %t2 = fadd float %x3, %t1
@@ -43,12 +65,19 @@
 }
 
 define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds4:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fadd float %x0, %x1
   %t1 = fadd float %x2, %t0
   %t2 = fadd float %x3, %t1
@@ -59,16 +88,27 @@
 ; produced because that would cost more compile time.
 
 define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
-; CHECK-LABEL: reassociate_adds5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm5, %xmm4, %xmm1
-; CHECK-NEXT:    vaddss %xmm6, %xmm1, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm7, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds5:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    addss %xmm5, %xmm4
+; SSE-NEXT:    addss %xmm6, %xmm4
+; SSE-NEXT:    addss %xmm4, %xmm0
+; SSE-NEXT:    addss %xmm7, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm5, %xmm4, %xmm1
+; AVX-NEXT:    vaddss %xmm6, %xmm1, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm7, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fadd float %x0, %x1
   %t1 = fadd float %t0, %x2
   %t2 = fadd float %t1, %x3
@@ -83,14 +123,21 @@
 ; Also, we should reassociate such that the result of the high latency division
 ; is used by the final 'add' rather than reassociating the %x3 operand with the
 ; division. The latter reassociation would not improve anything.
- 
+
 define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vdivss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds6:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fdiv float %x0, %x1
   %t1 = fadd float %x2, %t0
   %t2 = fadd float %x3, %t1