In the pre-RA scheduler, maintain cmp+br proximity.

This is done by pushing physical register definitions close to their
use, which happens to handle flag definitions if they're not glued to
the branch. This seems to be generally a good thing though, so I
didn't need to add a target hook yet.

The primary motivation is to generate code closer to what people
expect and to avoid missing opportunities for macro-op
fusion. As a side benefit, we get several 2-5% gains on x86
benchmarks. There is one regression:
SingleSource/Benchmarks/Shootout/lists slows down by 10%. But this is
an independent scheduler bug that will be tracked separately.
See rdar://problem/9283108.

Incidentally, pre-RA scheduling is only half the solution. Fixing the
later passes is tracked by:
<rdar://problem/8932804> [pre-RA-sched] on x86, attempt to schedule CMP/TEST adjacent with condition jump

Fixes:
<rdar://problem/9262453> Scheduler unnecessary break of cmp/jump fusion


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@129508 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
new file mode 100644
index 0000000..07b1971
--- /dev/null
+++ b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=yonah | FileCheck %s
+; Reduced from JavaScriptCore
+
+%"class.JSC::CodeLocationCall" = type { [8 x i8] }
+%"class.JSC::JSGlobalData" = type { [4 x i8] }
+%"class.JSC::FunctionPtr" = type { i8* }
+%"class.JSC::Structure" = type { [4 x i8] }
+%"class.JSC::UString" = type { i8* }
+%"class.JSC::JSString" = type { [16 x i8], i32, %"class.JSC::UString", i32 }
+
+declare hidden fastcc void @_ZN3JSCL23returnToThrowTrampolineEPNS_12JSGlobalDataENS_16ReturnAddressPtrERS2_(%"class.JSC::JSGlobalData"* nocapture, i8*, %"class.JSC::FunctionPtr"* nocapture) nounwind noinline ssp
+
+; Avoid hoisting the test above loads or copies
+; CHECK: %entry
+; CHECK: cmpq
+; CHECK-NOT: mov
+; CHECK: jb
+define i32 @cti_op_eq(i8** nocapture %args) nounwind ssp {
+entry:
+  %0 = load i8** null, align 8
+  %tmp13 = bitcast i8* %0 to %"class.JSC::CodeLocationCall"*
+  %tobool.i.i.i = icmp ugt i8* undef, inttoptr (i64 281474976710655 to i8*)
+  %or.cond.i = and i1 %tobool.i.i.i, undef
+  br i1 %or.cond.i, label %if.then.i, label %if.end.i
+
+if.then.i:                                        ; preds = %entry
+  br i1 undef, label %if.then.i.i.i, label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+
+if.then.i.i.i:                                    ; preds = %if.then.i
+  %conv.i.i.i.i = trunc i64 undef to i32
+  br label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+
+if.end.i:                                         ; preds = %entry
+  br i1 undef, label %land.rhs.i121.i, label %_ZNK3JSC7JSValue8isStringEv.exit122.i
+
+land.rhs.i121.i:                                  ; preds = %if.end.i
+  %tmp.i.i117.i = load %"class.JSC::Structure"** undef, align 8
+  br label %_ZNK3JSC7JSValue8isStringEv.exit122.i
+
+_ZNK3JSC7JSValue8isStringEv.exit122.i:            ; preds = %land.rhs.i121.i, %if.end.i
+  %brmerge.i = or i1 undef, false
+  %or.cond = or i1 false, %brmerge.i
+  br i1 %or.cond, label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit, label %if.then.i92.i
+
+if.then.i92.i:                                    ; preds = %_ZNK3JSC7JSValue8isStringEv.exit122.i
+  tail call void @_ZNK3JSC8JSString11resolveRopeEPNS_9ExecStateE(%"class.JSC::JSString"* undef, %"class.JSC::CodeLocationCall"* %tmp13) nounwind
+  unreachable
+
+_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit: ; preds = %_ZNK3JSC7JSValue8isStringEv.exit122.i, %if.then.i.i.i, %if.then.i
+
+  %1 = load i8** undef, align 8
+  br i1 undef, label %do.end39, label %do.body27
+
+do.body27:                                        ; preds = %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+  %tmp30 = bitcast i8* %1 to %"class.JSC::JSGlobalData"*
+  %2 = getelementptr inbounds i8** %args, i64 -1
+  %3 = bitcast i8** %2 to %"class.JSC::FunctionPtr"*
+  tail call fastcc void @_ZN3JSCL23returnToThrowTrampolineEPNS_12JSGlobalDataENS_16ReturnAddressPtrERS2_(%"class.JSC::JSGlobalData"* %tmp30, i8* undef, %"class.JSC::FunctionPtr"* %3)
+  unreachable
+
+do.end39:                                         ; preds = %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+  ret i32 undef
+}
+
+declare void @_ZNK3JSC8JSString11resolveRopeEPNS_9ExecStateE(%"class.JSC::JSString"*, %"class.JSC::CodeLocationCall"*)
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
index d33cc3a..938023f 100644
--- a/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: llc -march=x86-64 < %s | FileCheck %s
 
 ; CHECK: decq
diff --git a/test/CodeGen/X86/pr2659.ll b/test/CodeGen/X86/pr2659.ll
index 54d043d..ef0f9ea 100644
--- a/test/CodeGen/X86/pr2659.ll
+++ b/test/CodeGen/X86/pr2659.ll
@@ -18,7 +18,8 @@
 ; CHECK: movl $1
 ; CHECK-NOT: xorl
 ; CHECK-NOT: movl
-; CHECK-NEXT: je
+; CHECK-NOT: LBB
+; CHECK: je
 
 ifthen:         ; preds = %entry
   ret i32 0
diff --git a/test/CodeGen/X86/tail-opts.ll b/test/CodeGen/X86/tail-opts.ll
index 424bd21..77710ad 100644
--- a/test/CodeGen/X86/tail-opts.ll
+++ b/test/CodeGen/X86/tail-opts.ll
@@ -109,15 +109,15 @@
 
 ; CHECK: dont_merge_oddly:
 ; CHECK-NOT:   ret
-; CHECK:        ucomiss %xmm1, %xmm2
+; CHECK:        ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   jbe .LBB2_3
-; CHECK-NEXT:   ucomiss %xmm0, %xmm1
+; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   ja .LBB2_4
 ; CHECK-NEXT: .LBB2_2:
 ; CHECK-NEXT:   movb $1, %al
 ; CHECK-NEXT:   ret
 ; CHECK-NEXT: .LBB2_3:
-; CHECK-NEXT:   ucomiss %xmm0, %xmm2
+; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   jbe .LBB2_2
 ; CHECK-NEXT: .LBB2_4:
 ; CHECK-NEXT:   xorb %al, %al
diff --git a/test/CodeGen/X86/test-nofold.ll b/test/CodeGen/X86/test-nofold.ll
index f1063dc..97db1b3 100644
--- a/test/CodeGen/X86/test-nofold.ll
+++ b/test/CodeGen/X86/test-nofold.ll
@@ -2,10 +2,10 @@
 ; rdar://5752025
 
 ; We want:
-;      CHECK: movl	4(%esp), %ecx
-; CHECK-NEXT: andl	$15, %ecx
-; CHECK-NEXT: movl	$42, %eax
-; CHECK-NEXT: cmovel	%ecx, %eax
+;      CHECK: movl	$42, %ecx
+; CHECK-NEXT: movl	4(%esp), %eax
+; CHECK-NEXT: andl	$15, %eax
+; CHECK-NEXT: cmovnel	%ecx, %eax
 ; CHECK-NEXT: ret
 ;
 ; We don't want: