AMDGPU: Fix folding immediate into readfirstlane through reg_sequence
The def instruction for the vreg may not match, because it may be
folding through a reg_sequence. The assert was overly conservative and
not necessary. It's not actually important if DefMI really defined the
register, because the fold that will be done cares about the def of
the value that will be folded.
For some reason copies aren't making it through the reg_sequence,
although they should.
llvm-svn: 363876
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 4a1fc13..3f56688 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -436,9 +436,11 @@
unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
+ MachineRegisterInfo::use_iterator Next;
for (MachineRegisterInfo::use_iterator
RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
- RSUse != RSE; ++RSUse) {
+ RSUse != RSE; RSUse = Next) {
+ Next = std::next(RSUse);
MachineInstr *RSUseMI = RSUse->getParent();
if (RSUse->getSubReg() != RegSeqDstSubReg)
@@ -523,6 +525,9 @@
return;
UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
+
+ // FIXME: ChangeToImmediate should clear subreg
+ UseMI->getOperand(1).setSubReg(0);
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
return;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 56935b3..5831abb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6079,7 +6079,6 @@
const MachineInstr &DefMI,
const MachineInstr *UseMI) {
assert(MRI.isSSA() && "Must be run on SSA");
- assert(DefMI.definesRegister(VReg) && "wrong def instruction");
auto *TRI = MRI.getTargetRegisterInfo();
auto *DefBB = DefMI.getParent();
diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
index 040bcbc..e90c855 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -279,12 +279,24 @@
ret float %r2
}
+; CHECK-LABEL: {{^}}vgpr_arg_src:
+; CHECK: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0
+; CHECK: s_mov_b32 s[[ZERO:[0-9]+]]
+; CHECK: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[READLANE]]:[[ZERO]]{{\]}}
+define amdgpu_vs float @vgpr_arg_src(<4 x i32> addrspace(6)* %arg) {
+main_body:
+ %tmp9 = load <4 x i32>, <4 x i32> addrspace(6)* %arg
+ %tmp10 = call nsz float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp9, i32 undef, i32 0, i32 0, i32 0) #1
+ ret float %tmp10
+}
+
; Function Attrs: nounwind readnone speculatable
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6
; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #7
+declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #7
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
index 55b7a61..3c68686 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
@@ -248,3 +248,126 @@
%1:sreg_32_xm0 = S_MOV_B32 12
%2:sreg_32_xm0 = V_READLANE_B32 %0, %1, implicit $exec
...
+
+# Constant for subreg0
+# GCN-LABEL: name: fold-imm-readfirstlane-regsequence0{{$}}
+
+# GCN: %0:vgpr_32 = COPY $vgpr0
+# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1
+# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec
+# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0
+---
+name: fold-imm-readfirstlane-regsequence0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1
+ %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec
+ %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec
+...
+
+# Constant for subreg1
+# GCN-LABEL: name: fold-imm-readfirstlane-regsequence1{{$}}
+# GCN: %0:vgpr_32 = COPY $vgpr0
+# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, killed %0, %subreg.sub1
+# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0
+# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec
+
+---
+name: fold-imm-readfirstlane-regsequence1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %2:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, killed %0:vgpr_32, %subreg.sub1
+ %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec
+ %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec
+...
+
+# Different constant regs for each subreg
+# GCN-LABEL: name: fold-imm-readfirstlane-regsequence2{{$}}
+# GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1
+# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0
+# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 1
+---
+name: fold-imm-readfirstlane-regsequence2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1
+ %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec
+ %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec
+...
+
+# Same constant reg for each subreg, so there are multiple constant uses
+# GCN-LABEL: name: fold-imm-readfirstlane-regsequence3{{$}}
+# GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1
+# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0
+# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0
+---
+name: fold-imm-readfirstlane-regsequence3
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1
+ %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec
+ %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec
+...
+
+# FIXME: This should fold
+# GCN-LABEL: name: fold-copy-readfirstlane-regsequence0{{$}}
+# GCN: %0:vgpr_32 = COPY $sgpr10
+# GCN-NEXT: %1:vgpr_32 = COPY $sgpr11
+# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1
+# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec
+# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec
+---
+name: fold-copy-readfirstlane-regsequence0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr10, $sgpr11
+ %0:vgpr_32 = COPY $sgpr10
+ %1:vgpr_32 = COPY $sgpr11
+ %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1
+ %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec
+ %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec
+...
+
+# GCN-LABEL: name: fold-copy-readfirstlane-regsequence1{{$}}
+# GCN: %0:sreg_32_xm0 = COPY $sgpr10
+# GCN-NEXT: %1:sreg_32_xm0 = COPY $sgpr11
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %3:vgpr_32 = COPY %1
+# GCN-NEXT: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, killed %3, %subreg.sub1
+# GCN-NEXT: %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec
+# GCN-NEXT: %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec
+---
+name: fold-copy-readfirstlane-regsequence1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr10, $sgpr11
+ %0:sreg_32_xm0 = COPY $sgpr10
+ %1:sreg_32_xm0 = COPY $sgpr11
+ %2:vgpr_32 = COPY %0
+ %3:vgpr_32 = COPY %1
+ %4:vreg_64 = REG_SEQUENCE %2:vgpr_32, %subreg.sub0, killed %3:vgpr_32, %subreg.sub1
+ %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0:vreg_64, implicit $exec
+ %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1:vreg_64, implicit $exec
+...