AMDGPU/SI: Implement a custom MachineSchedStrategy

Summary:
GCNSchedStrategy re-uses most of GenericScheduler; it just uses a
different method to compute the excess and critical register
pressure limits.
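
For reference, the overall shape is roughly the following (a minimal
sketch; the class and member names are illustrative, not the exact
upstream implementation):

    #include "llvm/CodeGen/MachineScheduler.h"

    using namespace llvm;

    namespace {
    // Inherit the full GenericScheduler heuristics and override only the
    // initialization hook, where SGPR/VGPR excess and critical pressure
    // limits could be derived from the subtarget's occupancy model
    // instead of the generic per-pressure-set limits.
    class GCNLikeStrategy final : public GenericScheduler {
    public:
      explicit GCNLikeStrategy(const MachineSchedContext *C)
          : GenericScheduler(C) {}

      void initialize(ScheduleDAGMI *DAG) override {
        GenericScheduler::initialize(DAG);
        // Hypothetical: compute target-specific pressure limits here.
      }
    };
    } // end anonymous namespace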

It's not enabled by default; to enable it, pass -misched=gcn to llc.
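
The flag works through the machine scheduler registry. A hedged sketch
of the wiring (the real registration lives in the AMDGPU target; the
factory function name here is made up):

    #include <memory>

    static ScheduleDAGInstrs *createGCNLikeSched(MachineSchedContext *C) {
      // std::make_unique; LLVM of this era used llvm::make_unique.
      return new ScheduleDAGMILive(C, std::make_unique<GCNLikeStrategy>(C));
    }

    static MachineSchedRegistry
        GCNSchedRegistry("gcn", "Run the GCN scheduling strategy",
                         createGCNLikeSched);

With that registered, something like
llc -march=amdgcn -misched=gcn input.ll selects it.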

Shader DB stats:

32464 shaders in 17874 tests
Totals:
SGPRS: 1542846 -> 1643125 (6.50 %)
VGPRS: 1005595 -> 904653 (-10.04 %)
Spilled SGPRs: 29929 -> 27745 (-7.30 %)
Spilled VGPRs: 334 -> 352 (5.39 %)
Scratch VGPRs: 1612 -> 1624 (0.74 %) dwords per thread
Code Size: 36688188 -> 37034900 (0.95 %) bytes
LDS: 1913 -> 1913 (0.00 %) blocks
Max Waves: 254101 -> 265125 (4.34 %)
Wait states: 0 -> 0 (0.00 %)

Totals from affected shaders:
SGPRS: 1338220 -> 1438499 (7.49 %)
VGPRS: 886221 -> 785279 (-11.39 %)
Spilled SGPRs: 29869 -> 27685 (-7.31 %)
Spilled VGPRs: 334 -> 352 (5.39 %)
Scratch VGPRs: 1612 -> 1624 (0.74 %) dwords per thread
Code Size: 34315716 -> 34662428 (1.01 %) bytes
LDS: 1551 -> 1551 (0.00 %) blocks
Max Waves: 188127 -> 199151 (5.86 %)
Wait states: 0 -> 0 (0.00 %)

Reviewers: arsenm, mareko, nhaehnle, MatzeB, atrick

Subscribers: arsenm, kzhuravl, llvm-commits

Differential Revision: https://reviews.llvm.org/D23688

llvm-svn: 279995
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index 0046bc9..eb0bf65 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -258,10 +258,10 @@
 }
 
 ; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64:
-; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
-; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
-; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
 ; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
+; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}}
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]]
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI0]]
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO1]]
@@ -284,10 +284,9 @@
 ; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
 ; SI-NOT: and
 ; SI: v_and_b32_e32 v[[RESLO0:[0-9]+]], 63, v[[LO0]]
-; SI-NOT: and
-; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
 ; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
 ; SI-NOT: and
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
 define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load volatile i64, i64 addrspace(1)* %aptr
@@ -486,8 +485,8 @@
 ; low 32-bits, which is not a valid 64-bit inline immediate.
 
 ; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64:
-; SI: s_load_dword s
 ; SI: s_load_dwordx2
+; SI: s_load_dword s
 ; SI-NOT: and
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
 ; SI-NOT: and
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index d0976b7..ee18032 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -155,8 +155,8 @@
 }
 
 ; FUNC-LABEL: {{^}}s_ctpop_i65:
-; GCN: s_and_b32
 ; GCN: s_bcnt1_i32_b64 [[REG0:s[0-9]+]],
+; GCN: s_and_b32
 ; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
 ; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
 ; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index dab3c10..c028d10 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -8,7 +8,7 @@
 
 ; SI-LABEL: {{^}}offset_order:
 
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
+; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
 
diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index f6cdbb4..7d351d5 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -13,8 +13,10 @@
 ; CI: v_ceil_f64_e32
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01
-; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]]
+; FIXME: We should be using s_addk_i32 here, but the reg allocation hints
+;        are not always followed.
+; SI-DAG: s_add_i32 [[SEXP0:s[0-9]+]], [[SEXP]], 0xfffffc01
+; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP0]]
 ; SI-DAG: s_not_b64
 ; SI-DAG: s_and_b64
 ; SI-DAG: cmp_gt_i32
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index 19deefe..9126a47 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -548,7 +548,7 @@
 
 ; FUNC-LABEL: {{^}}test_f64_interp:
 ; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
-; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
+; SI: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
 define void @test_f64_interp(double addrspace(1)* %out,
                              double addrspace(1)* %in1,
                              double addrspace(1)* %in2,
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
index 9bbfe1e..4dae566 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -6,8 +6,8 @@
 ; SI-LABEL: {{^}}test_fmax3_f64:
 ; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
 ; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
-; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
 ; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
+; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
 ; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index bf230b2..6d729f4 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -22,16 +22,16 @@
 
 ; XXX: Could do v_or_b32 directly
 ; CHECK-LABEL: {{^}}extract_w_offset_salu_use_vector:
-; CHECK-DAG: s_or_b32
-; CHECK-DAG: s_or_b32
-; CHECK-DAG: s_or_b32
-; CHECK-DAG: s_or_b32
-; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; CHECK: s_mov_b32 m0
-; CHECK-NEXT: v_movrels_b32_e32
+; CHECK-DAG: s_or_b32
+; CHECK-DAG: s_or_b32
+; CHECK-DAG: s_or_b32
+; CHECK-DAG: s_or_b32
+; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: v_movrels_b32_e32
 define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
 entry:
   %idx = add i32 %in, 1
@@ -242,13 +242,13 @@
 ; FIXME: Why is vector copied in between?
 
 ; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
-; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
 ; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
+; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
 ; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
 ; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
 
 ; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
-; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK: s_waitcnt vmcnt(0)
 
 ; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
 ; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
@@ -303,8 +303,10 @@
 ; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
 ; CHECK-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
 
-; CHECK-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
 ; CHECK-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
+; CHECK: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
 
 ; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
 ; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
@@ -324,7 +326,7 @@
 ; CHECK: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
 ; CHECK: s_mov_b32 m0, [[READLANE]]
 ; CHECK: s_and_saveexec_b64 vcc, vcc
-; CHECK-NEXT: v_movreld_b32_e32 [[VEC_ELT1]], 63
+; CHECK-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
 ; CHECK-NEXT: s_xor_b64 exec, exec, vcc
 ; CHECK: s_cbranch_execnz [[LOOP1]]
 
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index b1405c7..e38ee47 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -344,7 +344,7 @@
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
 
-; GCN: s_mov_b32 m0, [[SCALEDIDX]]
+; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, 0
 
 ; Increment to next element folded into base register, but FileCheck
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 397e172..31ff7d9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -343,8 +343,8 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32:
 ; GCN-DAG: s_load_dwordx16
 ; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
+; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
 
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 5e1171a..86f5d4e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -360,7 +360,7 @@
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA-DAG: buffer_load_dwordx4
 
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll
index 3d6d7fa..8e1b003 100644
--- a/llvm/test/CodeGen/AMDGPU/missing-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll
@@ -6,12 +6,12 @@
 ; resulting in losing the store to gptr
 
 ; FUNC-LABEL: {{^}}missing_store_reduced:
+; SI: s_load_dwordx2
 ; SI: ds_read_b64
 ; SI-DAG: buffer_store_dword
 ; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
 ; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
-; SI: s_load_dword
-; SI: s_nop 2
+; SI: s_nop 3
 ; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
 ; SI: buffer_store_dword
 ; SI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
index 36f1257..85dfbe6 100644
--- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -10,10 +10,10 @@
 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
 
 ; GCN-NOT: v_mov_b32
-; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
-; GCN-NOT: v_mov_b32
 ; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
 ; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
+; GCN-NOT: v_mov_b32
 
 ; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
 ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index 9eb76eb..58d4117 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -103,9 +103,8 @@
 ; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_multi_use_f32:
 ; GCN: s_load_dword [[SRC:s[0-9]+]]
 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -|[[SRC]]|
-; GCN: buffer_store_dword [[RCP]]
-
 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]|
+; GCN: buffer_store_dword [[RCP]]
 ; GCN: buffer_store_dword [[MUL]]
 define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 {
   %src.fabs = call float @llvm.fabs.f32(float %src)
diff --git a/llvm/test/CodeGen/AMDGPU/ret.ll b/llvm/test/CodeGen/AMDGPU/ret.ll
index 915c438..0408413 100644
--- a/llvm/test/CodeGen/AMDGPU/ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret.ll
@@ -18,12 +18,13 @@
 }
 
 ; GCN-LABEL: {{^}}vgpr_literal:
-; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0
-; GCN: s_waitcnt expcnt(0)
+; GCN: v_mov_b32_e32 v4, v0
+; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4
 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
 ; GCN-DAG: v_mov_b32_e32 v3, -1.0
+; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
 define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
@@ -229,13 +230,14 @@
 
 
 ; GCN-LABEL: {{^}}structure_literal:
-; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0
-; GCN: s_waitcnt expcnt(0)
+; GCN: v_mov_b32_e32 v3, v0
+; GCN: exp 15, 0, 1, 1, 1, v3, v3, v3, v3
 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
 ; GCN-DAG: s_mov_b32 s0, 2
 ; GCN-DAG: s_mov_b32 s1, 3
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
+; GCN: s_waitcnt expcnt(0)
 define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
   ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 87362dc..5344834 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -134,8 +134,8 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
index 52f3cce..fd73ff8 100644
--- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -170,14 +170,12 @@
 ; CI.
 
 ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
+; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
 ; GCN-NOHSA-NOT: v_add
 ; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
 ; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
-; GCN-NOHSA-NOT: v_add
 ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
 
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index faf8d8a..d5c1dd9 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -93,13 +93,13 @@
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
 
-; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
-; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
 ; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}}
 
 ; SI: v_cndmask_b32_e32
+; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
 ; SI: v_cndmask_b32_e32
 ; SI: buffer_store_dwordx2
 define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index c5dbfd9..b85714e 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -341,11 +341,12 @@
 
 ; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]]
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
-; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO]]{{\]}}
-; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}}
 define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index f6ed41d..8523f1c 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -70,9 +70,9 @@
 }
 
 ; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
-; CI-DAG: buffer_store_dword
 ; CI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
 ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI: buffer_store_dword
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
 ; CI: buffer_store_dword
@@ -136,9 +136,9 @@
 }
 
 ; FUNC-LABEL: @reorder_global_load_local_store_global_load
-; CI: buffer_load_dword
-; CI: buffer_load_dword
 ; CI: ds_write_b32
+; CI: buffer_load_dword
+; CI: buffer_load_dword
 ; CI: buffer_store_dword
 define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
@@ -181,11 +181,11 @@
 
 ; FUNC-LABEL: @reorder_global_offsets
 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: s_endpgm
 define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index dbd07fe..db4c1c8 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -35,11 +35,11 @@
 ; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
 ; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
-; SI: s_addc_u32
-; SI: v_mov_b32_e32
 ; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
-; SI: v_mov_b32_e32
+; SI: s_addc_u32
 ; SI: buffer_store_dword v[[LO_VREG]],
+; SI: v_mov_b32_e32
+; SI: v_mov_b32_e32
 define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
   %aa = add i64 %a, 234 ; Prevent shrinking store.
   %b = shl i64 %aa, 2
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index 268f3c7..078df53 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -38,16 +38,16 @@
 ; SI: v_cndmask_b32_e64
 ; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]]
 ; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]]
 ; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], vcc, {{[vs][0-9]+}}, [[Num_S_Remainder]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
-; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]]
-; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]]
 ; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]],
+; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]],
-; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]],
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 7a72a4c..87fcfba 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -42,15 +42,18 @@
 }
 
 ; GCN-LABEL: {{^}}test_use_s_v_s:
-; GCN: buffer_load_dword [[VA0:v[0-9]+]]
-; GCN: buffer_load_dword [[VA1:v[0-9]+]]
 ; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; SI: buffer_load_dword [[VA0:v[0-9]+]]
+; SI: buffer_load_dword [[VA1:v[0-9]+]]
 
 ; GCN-NOT: v_mov_b32
 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; GCN-NOT: v_mov_b32
 
+; VI: buffer_load_dword [[VA0:v[0-9]+]]
+; VI: buffer_load_dword [[VA1:v[0-9]+]]
+
 ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VA0]], [[SA]], [[VB]]
 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]]
 ; GCN: buffer_store_dword [[RESULT0]]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 7d97777..52fa0be 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -12,16 +12,16 @@
 
 ; GCN-LABEL: {{^}}main:
 
-; GCN-DAG: s_mov_b32 s13, s12
-; GCN-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GCN-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GCN-DAG: s_mov_b32 s18, -1
-; SI-DAG: s_mov_b32 s19, 0xe8f000
-; VI-DAG: s_mov_b32 s19, 0xe80000
+; GCN-DAG: s_mov_b32 s11, s12
+; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s14, -1
+; SI-DAG: s_mov_b32 s15, 0xe8f000
+; VI-DAG: s_mov_b32 s15, 0xe80000
 
-; s13 is offset system SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Reload
+; s11 is offset system SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload
 
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024