[X86] Memory folding for commutative instructions (updated)

This patch improves support for commutative instructions in the x86 memory folding implementation by attempting to fold a commuted version of the instruction if the original folding fails. If that folding fails as well, the instruction is 're-commuted' back to its original operand order before returning.

Updated version of r219584 (reverted in r219595) - the commutation attempt now explicitly ensures that neither of the commuted source operands are tied to the destination operand / register, which was the source of all the regressions that occurred with the original patch attempt.

Added additional regression test case provided by Joerg Sonnenberger.

Differential Revision: http://reviews.llvm.org/D5818

llvm-svn: 220239
diff --git a/llvm/test/CodeGen/X86/avx1-stack-reload-folding.ll b/llvm/test/CodeGen/X86/avx1-stack-reload-folding.ll
new file mode 100644
index 0000000..5da1b4c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx1-stack-reload-folding.ll
@@ -0,0 +1,16 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s

+

+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

+target triple = "x86_64-unknown-unknown"

+

+; Function Attrs: nounwind readonly uwtable

+define <32 x double> @_Z14vstack_foldDv32_dS_(<32 x double> %a, <32 x double> %b) #0 {

+  %1 = fadd <32 x double> %a, %b

+  %2 = fsub <32 x double> %a, %b

+  %3 = fmul <32 x double> %1, %2

+  ret <32 x double> %3

+

+  ;CHECK-NOT:  vmovapd {{.*#+}} 32-byte Reload

+  ;CHECK:       vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload

+  ;CHECK-NOT:  vmovapd {{.*#+}} 32-byte Reload

+}