Teach two-address lowering how to unfold a load to open up commuting
opportunities. For example, this lets it emit:

   movq (%rax), %rcx
   addq %rdx, %rcx

instead of this:

   movq %rdx, %rcx
   addq (%rax), %rcx

when %rdx has subsequent uses. It's the same number of instructions,
and usually the same encoding size on x86, but it appears faster,
and in general it may allow better scheduling for the load.
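
As a rough illustration (not taken from this commit, and using the IR
syntax of the era), the pattern that benefits is a load feeding a
commutable two-address instruction whose other operand stays live
afterwards, e.g.:

   ; Hypothetical example: %x is live past the add, so folding the load
   ; into the add would force an extra copy of %x into the result register.
   ; Unfolding lets the load define the result register directly.
   define i64 @f(i64* %p, i64 %x) {
   entry:
     %v = load i64* %p
     %s = add i64 %v, %x
     %t = mul i64 %s, %x          ; subsequent use of %x
     ret i64 %t
   }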


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@106493 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/X86/stack-align.ll b/test/CodeGen/X86/stack-align.ll
index 271ad1a..8ca0b12 100644
--- a/test/CodeGen/X86/stack-align.ll
+++ b/test/CodeGen/X86/stack-align.ll
@@ -9,14 +9,15 @@
 
 define void @test({ double, double }* byval  %z, double* %P) {
 entry:
-	%tmp = getelementptr { double, double }* %z, i32 0, i32 0		; <double*> [#uses=1]
-	%tmp1 = load double* %tmp, align 8		; <double> [#uses=1]
-	%tmp2 = tail call double @fabs( double %tmp1 )		; <double> [#uses=1]
-    ; CHECK: andpd{{.*}}4(%esp), %xmm
 	%tmp3 = load double* @G, align 16		; <double> [#uses=1]
 	%tmp4 = tail call double @fabs( double %tmp3 )		; <double> [#uses=1]
+        volatile store double %tmp4, double* %P
+	%tmp = getelementptr { double, double }* %z, i32 0, i32 0		; <double*> [#uses=1]
+	%tmp1 = volatile load double* %tmp, align 8		; <double> [#uses=1]
+	%tmp2 = tail call double @fabs( double %tmp1 )		; <double> [#uses=1]
+    ; CHECK: andpd{{.*}}4(%esp), %xmm
 	%tmp6 = fadd double %tmp4, %tmp2		; <double> [#uses=1]
-	store double %tmp6, double* %P, align 8
+	volatile store double %tmp6, double* %P, align 8
 	ret void
 }