RegScavenging: Add scavengeRegisterBackwards()

Re-apply r276044/r279124/r305516. Fixed a problem where we would refuse
to place spills as the very first instruciton of a basic block and thus
artifically increase pressure (test in
test/CodeGen/PowerPC/scavenging.mir:spill_at_begin)

This is a variant of scavengeRegister() that works for
enterBasicBlockEnd()/backward(). The benefit of the backward mode is
that it is not affected by incomplete kill flags.

This patch also changes
PrologEpilogInserter::doScavengeFrameVirtualRegs() to use the register
scavenger in backwards mode.

Differential Revision: http://reviews.llvm.org/D21885

llvm-svn: 305625
diff --git a/llvm/test/CodeGen/AArch64/reg-scavenge-frame.mir b/llvm/test/CodeGen/AArch64/reg-scavenge-frame.mir
index 3300bb1..f79e75e 100644
--- a/llvm/test/CodeGen/AArch64/reg-scavenge-frame.mir
+++ b/llvm/test/CodeGen/AArch64/reg-scavenge-frame.mir
@@ -45,8 +45,42 @@
     %fp = COPY %xzr
     %lr = COPY %xzr
     ST1Fourv1d killed %d16_d17_d18_d19, %stack.0 :: (store 32 into %stack.0, align 8)
-# CHECK:  STRXui killed %[[SCAVREG:x[0-9]+|fp|lr]], %sp, [[SPOFFSET:[0-9]+]] :: (store 8 into %stack.1)
-# CHECK-NEXT:  %[[SCAVREG]] = ADDXri %sp, {{[0-9]+}}, 0
-# CHECK-NEXT:  ST1Fourv1d killed %d16_d17_d18_d19, killed %[[SCAVREG]] :: (store 32 into %stack.0, align 8)
-# CHECK-NEXT:  %[[SCAVREG]] = LDRXui %sp, [[SPOFFSET]] :: (load 8 from %stack.1)
+    ; CHECK:  STRXui killed %[[SCAVREG:x[0-9]+|fp|lr]], %sp, [[SPOFFSET:[0-9]+]] :: (store 8 into %stack.1)
+    ; CHECK-NEXT:  %[[SCAVREG]] = ADDXri %sp, {{[0-9]+}}, 0
+    ; CHECK-NEXT:  ST1Fourv1d killed %d16_d17_d18_d19, killed %[[SCAVREG]] :: (store 32 into %stack.0, align 8)
+    ; CHECK-NEXT:  %[[SCAVREG]] = LDRXui %sp, [[SPOFFSET]] :: (load 8 from %stack.1)
+
+    HINT 0, implicit %x0
+    HINT 0, implicit %x1
+    HINT 0, implicit %x2
+    HINT 0, implicit %x3
+    HINT 0, implicit %x4
+    HINT 0, implicit %x5
+    HINT 0, implicit %x6
+    HINT 0, implicit %x7
+    HINT 0, implicit %x8
+    HINT 0, implicit %x9
+    HINT 0, implicit %x10
+    HINT 0, implicit %x11
+    HINT 0, implicit %x12
+    HINT 0, implicit %x13
+    HINT 0, implicit %x14
+    HINT 0, implicit %x15
+    HINT 0, implicit %x16
+    HINT 0, implicit %x17
+    HINT 0, implicit %x18
+    HINT 0, implicit %x19
+    HINT 0, implicit %x20
+    HINT 0, implicit %x21
+    HINT 0, implicit %x22
+    HINT 0, implicit %x23
+    HINT 0, implicit %x24
+    HINT 0, implicit %x25
+    HINT 0, implicit %x26
+    HINT 0, implicit %x27
+    HINT 0, implicit %x28
+    HINT 0, implicit %fp
+    HINT 0, implicit %lr
+
+    RET_ReallyLR
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index ac2f7b4..822ea80 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -39,44 +39,49 @@
 ; features when the number of registers is frozen), this ends up using
 ; more than expected.
 
-; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
-; TOSGPR: SGPRBlocks: 1
-; TOSGPR: NumSGPRsForWavesPerEU: 16
+; XALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
+; XTOSGPR: SGPRBlocks: 1
+; XTOSGPR: NumSGPRsForWavesPerEU: 16
 
-; TOSMEM: s_mov_b64 s[10:11], s[2:3]
-; TOSMEM: s_mov_b64 s[8:9], s[0:1]
-; TOSMEM: s_mov_b32 s7, s13
+; XTOSMEM: s_mov_b64 s[10:11], s[2:3]
+; XTOSMEM: s_mov_b64 s[8:9], s[0:1]
+; XTOSMEM: s_mov_b32 s7, s13
 
-; TOSMEM: SGPRBlocks: 1
-; TOSMEM: NumSGPRsForWavesPerEU: 16
-define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
-                                        i32 addrspace(1)* %out2,
-                                        i32 addrspace(1)* %out3,
-                                        i32 addrspace(1)* %out4,
-                                        i32 %one, i32 %two, i32 %three, i32 %four) #2 {
-  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
-  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
-  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
-  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
-  store volatile i32 0, i32* undef
-  br label %stores
-
-stores:
-  store volatile i32 %x.0, i32 addrspace(1)* undef
-  store volatile i32 %x.0, i32 addrspace(1)* undef
-  store volatile i32 %x.0, i32 addrspace(1)* undef
-  store volatile i64 %x.3, i64 addrspace(1)* undef
-  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
-  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
-
-  store i32 %one, i32 addrspace(1)* %out1
-  store i32 %two, i32 addrspace(1)* %out2
-  store i32 %three, i32 addrspace(1)* %out3
-  store i32 %four, i32 addrspace(1)* %out4
-  ret void
-}
+; XTOSMEM: SGPRBlocks: 1
+; XTOSMEM: NumSGPRsForWavesPerEU: 16
+;
+; This test case is disabled: When calculating the spillslot addresses AMDGPU
+; creates an extra vreg to save/restore m0 which in a point of maximum register
+; pressure would trigger an endless loop; the compiler aborts earlier with
+; "Incomplete scavenging after 2nd pass" in practice.
+;define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+;                                        i32 addrspace(1)* %out2,
+;                                        i32 addrspace(1)* %out3,
+;                                        i32 addrspace(1)* %out4,
+;                                        i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+;  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+;  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+;  store volatile i32 0, i32* undef
+;  br label %stores
+;
+;stores:
+;  store volatile i32 %x.0, i32 addrspace(1)* undef
+;  store volatile i32 %x.0, i32 addrspace(1)* undef
+;  store volatile i32 %x.0, i32 addrspace(1)* undef
+;  store volatile i64 %x.3, i64 addrspace(1)* undef
+;  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+;  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
+;
+;  store i32 %one, i32 addrspace(1)* %out1
+;  store i32 %two, i32 addrspace(1)* %out2
+;  store i32 %three, i32 addrspace(1)* %out3
+;  store i32 %four, i32 addrspace(1)* %out4
+;  ret void
+;}
 
 ; The following test is commented out for now; http://llvm.org/PR31230
 ; XALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll b/llvm/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
index 0796c24..0ffc922 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
@@ -12,8 +12,8 @@
 ; CHECK:      DebugProps:
 ; CHECK:        DebuggerABIVersion:                [ 1, 0 ]
 ; CHECK:        ReservedNumVGPRs:                  4
-; GFX700:       ReservedFirstVGPR:                 11
-; GFX800:       ReservedFirstVGPR:                 11
+; GFX700:       ReservedFirstVGPR:                 8
+; GFX800:       ReservedFirstVGPR:                 8
 ; GFX9:         ReservedFirstVGPR:                 14
 ; CHECK:        PrivateSegmentBufferSGPR:          0
 ; CHECK:        WavefrontPrivateSegmentOffsetSGPR: 11
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index d67988b..eab73b9 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -22,9 +22,9 @@
 
 ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN: s_sub_u32 s6, s5, s4
-; GCN-NEXT: s_lshr_b32 s6, s6, 6
-; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s6, 4
+; GCN: s_sub_u32 vcc_hi, s5, s4
+; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
+; GCN-NEXT: v_add_i32_e64 v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, vcc_hi, 4
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
@@ -71,8 +71,8 @@
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
 ; GCN: s_waitcnt
-; GCN-NEXT: s_sub_u32 s6, s5, s4
-; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6
+; GCN-NEXT: s_sub_u32 vcc_hi, s5, s4
+; GCN-NEXT: v_lshr_b32_e64 v0, vcc_hi, 6
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
@@ -99,8 +99,8 @@
 }
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
-; GCN: s_sub_u32 s8, s5, s4
-; GCN: v_lshr_b32_e64 v1, s8, 6
+; GCN: s_sub_u32 vcc_hi, s5, s4
+; GCN: v_lshr_b32_e64 v1, vcc_hi, 6
 ; GCN: s_and_saveexec_b64
 
 ; GCN: v_add_i32_e32 v0, vcc, 4, v1
diff --git a/llvm/test/CodeGen/ARM/alloca-align.ll b/llvm/test/CodeGen/ARM/alloca-align.ll
index 3bba156..6186d13 100644
--- a/llvm/test/CodeGen/ARM/alloca-align.ll
+++ b/llvm/test/CodeGen/ARM/alloca-align.ll
@@ -12,7 +12,7 @@
 ; And a base pointer getting used.
 ; CHECK: mov r6, sp
 ; Which is passed to the call
-; CHECK: add [[REG:r[0-9]+]], r6, #19456
+; CHECK: add [[REG:r[0-9]+|lr]], r6, #19456
 ; CHECK: add r0, [[REG]], #536
 ; CHECK: bl bar
 define void @foo([20000 x i8]* %addr) {
diff --git a/llvm/test/CodeGen/ARM/execute-only-big-stack-frame.ll b/llvm/test/CodeGen/ARM/execute-only-big-stack-frame.ll
index fb498a8..0fe67f9 100644
--- a/llvm/test/CodeGen/ARM/execute-only-big-stack-frame.ll
+++ b/llvm/test/CodeGen/ARM/execute-only-big-stack-frame.ll
@@ -10,10 +10,10 @@
 ; CHECK-SUBW-ADDW-NOT:   ldr {{r[0-9]+}}, .{{.*}}
 ; CHECK-SUBW-ADDW:       sub.w sp, sp, #65536
 ; CHECK-SUBW-ADDW-NOT:   ldr {{r[0-9]+}}, .{{.*}}
-; CHECK-SUBW-ADDW:       add.w [[REG1:r[0-9]+]], sp, #255
+; CHECK-SUBW-ADDW:       add.w [[REG1:r[0-9]+|lr]], sp, #255
 ; CHECK-SUBW-ADDW:       add.w {{r[0-9]+}}, [[REG1]], #65280
 ; CHECK-SUBW-ADDW-NOT:   ldr {{r[0-9]+}}, .{{.*}}
-; CHECK-SUBW-ADDW:       add.w lr, sp, #61440
+; CHECK-SUBW-ADDW:       add.w [[REGX:r[0-9]+|lr]], sp, #61440
 ; CHECK-SUBW-ADDW-NOT:   ldr {{r[0-9]+}}, .{{.*}}
 ; CHECK-SUBW-ADDW:       add.w sp, sp, #65536
 
diff --git a/llvm/test/CodeGen/ARM/fpoffset_overflow.mir b/llvm/test/CodeGen/ARM/fpoffset_overflow.mir
index 9c6cd93..4f3524b 100644
--- a/llvm/test/CodeGen/ARM/fpoffset_overflow.mir
+++ b/llvm/test/CodeGen/ARM/fpoffset_overflow.mir
@@ -3,10 +3,10 @@
 # This should trigger an emergency spill in the register scavenger because the
 # frame offset into the large argument is too large.
 # CHECK-LABEL: name: func0
-# CHECK: t2STRi12 killed %r7, %sp, 0, 14, _ :: (store 4 into %stack.0)
-# CHECK: %r7 = t2ADDri killed %sp, 4096, 14, _, _
-# CHECK: %r11 = t2LDRi12 killed %r7, 36, 14, _ :: (load 4)
-# CHECK: %r7 = t2LDRi12 %sp, 0, 14, _ :: (load 4 from %stack.0)
+# CHECK: t2STRi12 killed [[SPILLED:%r[0-9]+]], %sp, 0, 14, _ :: (store 4 into %stack.0)
+# CHECK: [[SPILLED]] = t2ADDri killed %sp, 4096, 14, _, _
+# CHECK: %sp = t2LDRi12 killed [[SPILLED]], 40, 14, _ :: (load 4)
+# CHECK: [[SPILLED]] = t2LDRi12 %sp, 0, 14, _ :: (load 4 from %stack.0)
 name: func0
 tracksRegLiveness: true
 fixedStack:
@@ -23,6 +23,7 @@
     %r4 = IMPLICIT_DEF
     %r5 = IMPLICIT_DEF
     %r6 = IMPLICIT_DEF
+    %r7 = IMPLICIT_DEF
     %r8 = IMPLICIT_DEF
     %r9 = IMPLICIT_DEF
     %r10 = IMPLICIT_DEF
@@ -30,7 +31,7 @@
     %r12 = IMPLICIT_DEF
     %lr = IMPLICIT_DEF
 
-    %r11 = t2LDRi12 %fixed-stack.0, 0, 14, _ :: (load 4)
+    %sp = t2LDRi12 %fixed-stack.0, 0, 14, _ :: (load 4)
 
     KILL %r0
     KILL %r1
@@ -39,6 +40,7 @@
     KILL %r4
     KILL %r5
     KILL %r6
+    KILL %r7
     KILL %r8
     KILL %r9
     KILL %r10
diff --git a/llvm/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll b/llvm/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
index a08b681..625abc1 100644
--- a/llvm/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
+++ b/llvm/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
@@ -1,34 +1,62 @@
-; Check that register scavenging spill slot is close to $fp.
 ; RUN: llc -march=mipsel -O0 -relocation-model=pic < %s | FileCheck %s
+; Check that register scavenging spill slot is close to $fp.
+target triple="mipsel--"
 
-; CHECK: sw ${{.*}}, 8($sp)
-; CHECK: lw ${{.*}}, 8($sp)
+@var = external global i32
+@ptrvar = external global i8*
 
-define i32 @main(i32 signext %argc, i8** %argv) #0 {
-entry:
-  %retval = alloca i32, align 4
-  %argc.addr = alloca i32, align 4
-  %argv.addr = alloca i8**, align 4
-  %v0 = alloca <16 x i8>, align 16
-  %.compoundliteral = alloca <16 x i8>, align 16
-  %v1 = alloca <16 x i8>, align 16
-  %.compoundliteral1 = alloca <16 x i8>, align 16
-  %unused_variable = alloca [16384 x i32], align 4
-  %result = alloca <16 x i8>, align 16
-  store i32 0, i32* %retval
-  store i32 %argc, i32* %argc.addr, align 4
-  store i8** %argv, i8*** %argv.addr, align 4
-  store <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, <16 x i8>* %.compoundliteral
-  %0 = load <16 x i8>, <16 x i8>* %.compoundliteral
-  store <16 x i8> %0, <16 x i8>* %v0, align 16
-  store <16 x i8> zeroinitializer, <16 x i8>* %.compoundliteral1
-  %1 = load <16 x i8>, <16 x i8>* %.compoundliteral1
-  store <16 x i8> %1, <16 x i8>* %v1, align 16
-  %2 = load <16 x i8>, <16 x i8>* %v0, align 16
-  %3 = load <16 x i8>, <16 x i8>* %v1, align 16
-  %mul = mul <16 x i8> %2, %3
-  store <16 x i8> %mul, <16 x i8>* %result, align 16
-  ret i32 0
+; CHECK-LABEL: func:
+define void @func() {
+  %space = alloca i32, align 4
+  %stackspace = alloca[16384 x i32], align 4
+
+  ; ensure stackspace is not optimized out
+  %stackspace_casted = bitcast [16384 x i32]* %stackspace to i8*
+  store volatile i8* %stackspace_casted, i8** @ptrvar
+
+  ; Load values to increase register pressure.
+  %v0 = load volatile i32, i32* @var
+  %v1 = load volatile i32, i32* @var
+  %v2 = load volatile i32, i32* @var
+  %v3 = load volatile i32, i32* @var
+  %v4 = load volatile i32, i32* @var
+  %v5 = load volatile i32, i32* @var
+  %v6 = load volatile i32, i32* @var
+  %v7 = load volatile i32, i32* @var
+  %v8 = load volatile i32, i32* @var
+  %v9 = load volatile i32, i32* @var
+  %v10 = load volatile i32, i32* @var
+  %v11 = load volatile i32, i32* @var
+  %v12 = load volatile i32, i32* @var
+  %v13 = load volatile i32, i32* @var
+  %v14 = load volatile i32, i32* @var
+  %v15 = load volatile i32, i32* @var
+  %v16 = load volatile i32, i32* @var
+
+  ; Computing a stack-relative values needs an additional register.
+  ; We should get an emergency spill/reload for this.
+  ; CHECK: sw ${{.*}}, 0($sp)
+  ; CHECK: lw ${{.*}}, 0($sp)
+  store volatile i32 %v0, i32* %space
+
+  ; store values so they are used.
+  store volatile i32 %v0, i32* @var
+  store volatile i32 %v1, i32* @var
+  store volatile i32 %v2, i32* @var
+  store volatile i32 %v3, i32* @var
+  store volatile i32 %v4, i32* @var
+  store volatile i32 %v5, i32* @var
+  store volatile i32 %v6, i32* @var
+  store volatile i32 %v7, i32* @var
+  store volatile i32 %v8, i32* @var
+  store volatile i32 %v9, i32* @var
+  store volatile i32 %v10, i32* @var
+  store volatile i32 %v11, i32* @var
+  store volatile i32 %v12, i32* @var
+  store volatile i32 %v13, i32* @var
+  store volatile i32 %v14, i32* @var
+  store volatile i32 %v15, i32* @var
+  store volatile i32 %v16, i32* @var
+
+  ret void
 }
-
-attributes #0 = { noinline "no-frame-pointer-elim"="true" }
diff --git a/llvm/test/CodeGen/PowerPC/dyn-alloca-aligned.ll b/llvm/test/CodeGen/PowerPC/dyn-alloca-aligned.ll
index 0de2e22..e0f2847 100644
--- a/llvm/test/CodeGen/PowerPC/dyn-alloca-aligned.ll
+++ b/llvm/test/CodeGen/PowerPC/dyn-alloca-aligned.ll
@@ -25,8 +25,8 @@
 
 ; CHECK-DAG: li [[REG1:[0-9]+]], -128
 ; CHECK-DAG: neg [[REG2:[0-9]+]],
-; CHECK: and [[REG1]], [[REG2]], [[REG1]]
-; CHECK: stdux {{[0-9]+}}, 1, [[REG1]]
+; CHECK: and [[REG3:[0-9]+]], [[REG2]], [[REG1]]
+; CHECK: stdux {{[0-9]+}}, 1, [[REG3]]
 
 ; CHECK: blr
 
diff --git a/llvm/test/CodeGen/PowerPC/scavenging.mir b/llvm/test/CodeGen/PowerPC/scavenging.mir
index 8b5c262..a72aaa4 100644
--- a/llvm/test/CodeGen/PowerPC/scavenging.mir
+++ b/llvm/test/CodeGen/PowerPC/scavenging.mir
@@ -6,7 +6,7 @@
 body: |
   bb.0:
     ; CHECK: [[REG0:%r[0-9]+]] = LI 42
-    ; CHECK-NEXT: NOP implicit [[REG0]]
+    ; CHECK-NEXT: NOP implicit killed [[REG0]]
     %0 : gprc = LI 42
     NOP implicit %0
 
@@ -14,7 +14,7 @@
     ; CHECK-NEXT: NOP
     ; CHECK-NEXT: NOP implicit [[REG1]]
     ; CHECK-NEXT: NOP
-    ; CHECK-NEXT: NOP implicit [[REG1]]
+    ; CHECK-NEXT: NOP implicit killed [[REG1]]
     %1 : gprc = LI 42
     NOP
     NOP implicit %1
@@ -48,8 +48,8 @@
     ; CHECK-NOT: %x30 = LI 42
     ; CHECK: [[REG3:%r[0-9]+]] = LI 42
     ; CHECK-NEXT: %x5 = IMPLICIT_DEF
-    ; CHECK-NEXT: NOP implicit [[REG2]]
-    ; CHECK-NEXT: NOP implicit [[REG3]]
+    ; CHECK-NEXT: NOP implicit killed [[REG2]]
+    ; CHECK-NEXT: NOP implicit killed [[REG3]]
     %3 : gprc = LI 42
     %x5 = IMPLICIT_DEF
     NOP implicit %2
@@ -110,7 +110,7 @@
 
     ; CHECK: STD killed [[SPILLEDREG:%x[0-9]+]]
     ; CHECK: [[SPILLEDREG]] = LI8 42
-    ; CHECK: NOP implicit [[SPILLEDREG]]
+    ; CHECK: NOP implicit killed [[SPILLEDREG]]
     ; CHECK: [[SPILLEDREG]] = LD
     %0 : g8rc = LI8 42
     NOP implicit %0
@@ -147,3 +147,60 @@
     NOP implicit %x29
     NOP implicit %x30
 ...
+---
+# Check for bug where we would refuse to spill before the first instruction in a
+# block.
+# CHECK-LABEL: name: spill_at_begin
+# CHECK: bb.0:
+# CHECK: liveins:
+# CHECK: STD killed [[REG:%x[0-9]+]]{{.*}}(store 8 into %stack.{{[0-9]+}})
+# CHECK: [[REG]] = LIS8 0
+# CHECK: [[REG]] = ORI8 killed [[REG]], 48
+# CHECK: NOP implicit killed [[REG]]
+# CHEKC: [[REG]] = LD{{.*}}(load 8 from %stack.{{[0-9]+}})
+name: spill_at_begin
+tracksRegLiveness: true
+stack:
+  # variable-sized object should be a reason to reserve an emergency spillslot
+  # in the RegScavenger
+  - { id: 0, type: variable-sized, offset: -32, alignment: 1 }
+body: |
+  bb.0:
+    liveins: %x0, %x1, %x2, %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11, %x12, %x13, %x14, %x15, %x16, %x17, %x18, %x19, %x20, %x21, %x22, %x23, %x24, %x25, %x26, %x27, %x28, %x29, %x30, %x31
+    %0 : g8rc = LIS8 0
+    %1 : g8rc = ORI8 %0, 48
+    NOP implicit %1
+
+    NOP implicit %x0
+    NOP implicit %x1
+    NOP implicit %x2
+    NOP implicit %x3
+    NOP implicit %x4
+    NOP implicit %x5
+    NOP implicit %x6
+    NOP implicit %x7
+    NOP implicit %x8
+    NOP implicit %x9
+    NOP implicit %x10
+    NOP implicit %x11
+    NOP implicit %x12
+    NOP implicit %x13
+    NOP implicit %x14
+    NOP implicit %x15
+    NOP implicit %x16
+    NOP implicit %x17
+    NOP implicit %x18
+    NOP implicit %x19
+    NOP implicit %x20
+    NOP implicit %x21
+    NOP implicit %x22
+    NOP implicit %x23
+    NOP implicit %x24
+    NOP implicit %x25
+    NOP implicit %x26
+    NOP implicit %x27
+    NOP implicit %x28
+    NOP implicit %x29
+    NOP implicit %x30
+    NOP implicit %x31
+...
diff --git a/llvm/test/CodeGen/Thumb/large-stack.ll b/llvm/test/CodeGen/Thumb/large-stack.ll
index 938dadc..b0152dd 100644
--- a/llvm/test/CodeGen/Thumb/large-stack.ll
+++ b/llvm/test/CodeGen/Thumb/large-stack.ll
@@ -69,10 +69,10 @@
 ; CHECK-LABEL: test3:
 ; CHECK: ldr [[TEMP:r[0-7]]],
 ; CHECK: add sp, [[TEMP]]
-; CHECK: ldr [[TEMP]],
-; CHECK: add [[TEMP]], sp
-; CHECK: ldr [[TEMP:r[0-7]]],
-; CHECK: add sp, [[TEMP]]
+; CHECK: ldr [[TEMP2:r[0-7]]],
+; CHECK: add [[TEMP2]], sp
+; CHECK: ldr [[TEMP3:r[0-7]]],
+; CHECK: add sp, [[TEMP3]]
     %retval = alloca i32, align 4
     %tmp = alloca i32, align 4
     %a = alloca [805306369 x i8], align 16
@@ -85,8 +85,8 @@
 ; CHECK-LABEL: test3_nofpelim:
 ; CHECK: ldr [[TEMP:r[0-7]]],
 ; CHECK: add sp, [[TEMP]]
-; CHECK: ldr [[TEMP]],
-; CHECK: add [[TEMP]], sp
+; CHECK: ldr [[TEMP2:r[0-7]]],
+; CHECK: add [[TEMP2]], sp
 ; CHECK: subs r4, r7,
 ; CHECK: mov sp, r4
     %retval = alloca i32, align 4
diff --git a/llvm/test/CodeGen/X86/scavenger.mir b/llvm/test/CodeGen/X86/scavenger.mir
index 8d97aeb..5e964f8 100644
--- a/llvm/test/CodeGen/X86/scavenger.mir
+++ b/llvm/test/CodeGen/X86/scavenger.mir
@@ -5,6 +5,8 @@
 tracksRegLiveness: true
 body: |
   bb.0:
+    ; CHECK: [[REG0:%e[a-z]+]] = MOV32ri 42
+    ; CHECK: %ebp = COPY killed [[REG0]]
     %0 : gr32 = MOV32ri 42
     %ebp = COPY %0
 ...
@@ -16,7 +18,7 @@
   bb.0:
     ; CHECK-NOT: %eax = MOV32ri 42
     ; CHECK: [[REG0:%e[a-z]+]] = MOV32ri 42
-    ; CHECK: %ebp = COPY [[REG0]]
+    ; CHECK: %ebp = COPY killed [[REG0]]
     %eax = MOV32ri 13
     %0 : gr32 = MOV32ri 42
     %ebp = COPY %0
@@ -30,25 +32,18 @@
 
     NOOP implicit %ebp
 
-    ; CHECK: NOOP implicit [[REG2]]
-    ; CHECK: NOOP implicit [[REG1]]
+    ; CHECK: NOOP implicit killed [[REG2]]
+    ; CHECK: NOOP implicit killed [[REG1]]
     NOOP implicit %2
     NOOP implicit %1
     RETQ %eax
 ...
 ---
-# Defs without uses are currently broken
-#name: func3
-#tracksRegLiveness: true
-#body: |
-#  bb.0:
-#    dead %0 : gr32 = MOV32ri 42
-...
----
-# Uses without defs are currently broken (and honestly not that useful).
-#name: func3
-#tracksRegLiveness: true
-#body: |
-#  bb.0:
-#    NOOP undef implicit %0 : gr32
+# CHECK-LABEL: name: func3
+name: func3
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK dead {{%e[a-z]+}} = MOV32ri 42
+    dead %0 : gr32 = MOV32ri 42
 ...