[MachineCopyPropagation] Extend pass to do COPY source forwarding

Summary:
This change extends MachineCopyPropagation to do COPY source forwarding
and adds an additional run of the pass to the default pass pipeline just
after register allocation.

This version of this patch uses the newly added
MachineOperand::isRenamable bit to avoid forwarding registers is such a
way as to violate constraints that aren't captured in the
Machine IR (e.g. ABI or ISA constraints).

This change is a continuation of the work started in D30751.

Reviewers: qcolombet, javed.absar, MatzeB, jonpa, tstellar

Subscribers: tpr, mgorny, mcrosier, nhaehnle, nemanjai, jyknight, hfinkel, arsenm, inouehrs, eraman, sdardis, guyblank, fedor.sergeev, aheejin, dschuff, jfb, myatsina, llvm-commits

Differential Revision: https://reviews.llvm.org/D41835

llvm-svn: 323991
diff --git a/llvm/test/CodeGen/AMDGPU/ret.ll b/llvm/test/CodeGen/AMDGPU/ret.ll
index 831c71d..2d673a9 100644
--- a/llvm/test/CodeGen/AMDGPU/ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret.ll
@@ -2,10 +2,10 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}vgpr:
-; GCN: v_mov_b32_e32 v1, v0
-; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
-; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm
+; GCN-DAG: v_mov_b32_e32 v1, v0
+; GCN-DAG: exp mrt0 v0, v0, v0, v0 done vm
 ; GCN: s_waitcnt expcnt(0)
+; GCN: v_add_f32_e32 v0, 1.0, v0
 ; GCN-NOT: s_endpgm
 define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
@@ -204,13 +204,13 @@
 }
 
 ; GCN-LABEL: {{^}}both:
-; GCN: v_mov_b32_e32 v1, v0
-; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm
-; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
-; GCN-DAG: s_add_i32 s0, s3, 2
+; GCN-DAG: exp mrt0 v0, v0, v0, v0 done vm
+; GCN-DAG: v_mov_b32_e32 v1, v0
 ; GCN-DAG: s_mov_b32 s1, s2
-; GCN: s_mov_b32 s2, s3
 ; GCN: s_waitcnt expcnt(0)
+; GCN: v_add_f32_e32 v0, 1.0, v0
+; GCN-DAG: s_add_i32 s0, s3, 2
+; GCN-DAG: s_mov_b32 s2, s3
 ; GCN-NOT: s_endpgm
 define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb: