AMDGPU/SI: Enable lanemask tracking in misched

Summary:
This results in higher register usage, but should make it easier for
the compiler to hide latency.

This pass is a prerequisite for some more scheduler improvements, and I
think the increase register usage with this patch is acceptable, because
when combined with the scheduler improvements, the total register usage
will decrease.

shader-db stats:

2382 shaders in 478 tests
Totals:
SGPRS: 48672 -> 49088 (0.85 %)
VGPRS: 34148 -> 34847 (2.05 %)
Code Size: 1285816 -> 1289128 (0.26 %) bytes
LDS: 28 -> 28 (0.00 %) blocks
Scratch: 492544 -> 573440 (16.42 %) bytes per wave
Max Waves: 6856 -> 6846 (-0.15 %)
Wait states: 0 -> 0 (0.00 %)

Depends on D18451

Reviewers: nhaehnle, arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18452

llvm-svn: 264876
diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index e92a4fd..be0670d 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -28,10 +28,10 @@
 }
 
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
 ; GCN: buffer_store_dword [[RESULT]]
@@ -42,13 +42,13 @@
 }
 
 ; GCN-LABEL: {{^}}test_use_s_v_s:
-; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
-
 ; GCN: buffer_load_dword [[VA0:v[0-9]+]]
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_load_dword [[VA1:v[0-9]+]]
 
+; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+
 ; GCN-NOT: v_mov_b32
 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; GCN-NOT: v_mov_b32
@@ -68,10 +68,10 @@
 }
 
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
@@ -82,10 +82,10 @@
 }
 
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]