AMDGPU/SI: Enable lanemask tracking in misched

Summary:
This results in higher register usage, but should make it easier for
the compiler to hide latency.

This pass is a prerequisite for some more scheduler improvements, and I
think the increase register usage with this patch is acceptable, because
when combined with the scheduler improvements, the total register usage
will decrease.

shader-db stats:

2382 shaders in 478 tests
Totals:
SGPRS: 48672 -> 49088 (0.85 %)
VGPRS: 34148 -> 34847 (2.05 %)
Code Size: 1285816 -> 1289128 (0.26 %) bytes
LDS: 28 -> 28 (0.00 %) blocks
Scratch: 492544 -> 573440 (16.42 %) bytes per wave
Max Waves: 6856 -> 6846 (-0.15 %)
Wait states: 0 -> 0 (0.00 %)

Depends on D18451

Reviewers: nhaehnle, arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18452

llvm-svn: 264876
diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index d39d090..d445d8e 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -191,8 +191,8 @@
 }
 
 ; FUNC-LABEL: {{^}}s_test_umax_ugt_imm_v2i32:
-; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
-; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
+; SI-DAG: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
+; SI-DAG: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
 define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
   %cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23>
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23>
@@ -205,8 +205,8 @@
 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
 ; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
-; SI-NEXT: buffer_store_dword [[VMAX]]
+; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
+; SI: buffer_store_dword [[VMAX]]
 define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
   %a.ext = zext i16 %a to i32
   %b.ext = zext i16 %b to i32
@@ -223,8 +223,8 @@
 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
 ; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
-; SI-NEXT: buffer_store_dword [[VMAX]]
+; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
+; SI: buffer_store_dword [[VMAX]]
 define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
   %a.ext = sext i16 %a to i32
   %b.ext = sext i16 %b to i32