; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
;
; This test checks that the lds input queue will be empty at the end of
; the ALU clause.

; CHECK-LABEL: {{^}}lds_input_queue:
; CHECK: LDS_READ_RET * OQAP
; CHECK-NOT: ALU clause
; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP

; Two-element i32 array declared in address space 3 (the local data share).
; Both kernels in this file load from it to produce an LDS read.
@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4

; Kernel under test: loads one i32 from @local_mem at %index, executes a group
; barrier, loads one i32 from global memory at %in, and stores the sum of the
; two loaded values to %out.
;
; The FileCheck directives for this function require that, after the
; LDS_READ_RET, the MOV that reads OQAP is emitted before any further
; "ALU clause" marker appears in the output.
define amdgpu_kernel void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
entry:
  ; Address of element %index of @local_mem (address space 3).
  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
  %1 = load i32, i32 addrspace(3)* %0
  call void @llvm.r600.group.barrier()

  ; This will start a new clause for the vertex fetch
  %2 = load i32, i32 addrspace(1)* %in
  %3 = add i32 %1, %2
  store i32 %3, i32 addrspace(1)* %out
  ret void
}

; Declaration of the R600 group barrier intrinsic called by @lds_input_queue.
declare void @llvm.r600.group.barrier() nounwind convergent

; The machine scheduler does not do proper alias analysis and assumes that
; loads from global values (Note that a global value is different than a
; value from global memory. A global value is a value that is declared
; outside of a function; it can reside in any address space) alias with
; all other loads.
;
; This is a problem for scheduling the reads from the local data share (lds).
; These reads are implemented using two instructions. The first copies the
; data from lds into the lds output queue, and the second moves the data from
; the output queue into a register. These two instructions don't have to be
; scheduled one after the other, but they do need to be scheduled in the same
; clause. The aliasing problem mentioned above causes problems when there is a
; load from global memory which immediately follows a load from a global value
; that has been declared in the local memory space:
;
;  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
;  %1 = load i32, i32 addrspace(3)* %0
;  %2 = load i32, i32 addrspace(1)* %in
;
; The instruction selection phase will generate ISA that looks like this:
;  %oqap = LDS_READ_RET
;  %0 = MOV %oqap
;  %1 = VTX_READ_32
;  %2 = ADD_INT %1, %0
;
; The bottom scheduler will schedule the two ALU instructions first:
;
; UNSCHEDULED:
;  %oqap = LDS_READ_RET
;  %1 = VTX_READ_32
;
; SCHEDULED:
;
;  %0 = MOV %oqap
;  %2 = ADD_INT %1, %0
;
; The lack of proper alias analysis causes the scheduler to treat the local
; memory read (LDS_READ_RET) and the global memory read (VTX_READ_32) as
; having a chain dependency, so the global memory read will always be
; scheduled first. This will give us a final program which looks like this:
;
; Alu clause:
;  %oqap = LDS_READ_RET
; VTX clause:
;  %1 = VTX_READ_32
; Alu clause:
;  %0 = MOV %oqap
;  %2 = ADD_INT %1, %0
;
; This is an illegal program because the oqap def and use now occur in
; different ALU clauses.
;
; This test checks this scenario and makes sure it doesn't result in an
; illegal program. For now, we have fixed this issue by merging the
; LDS_READ_RET and MOV together during instruction selection and then
; expanding them after scheduling. Once the scheduler has better alias
; analysis, we should be able to keep these instructions separate before
; scheduling.
;
; CHECK-LABEL: {{^}}local_global_alias:
; CHECK: LDS_READ_RET
; CHECK-NOT: ALU clause
; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
; Kernel under test: a load from the first element of @local_mem (address
; space 3) immediately followed by a load from global memory at %in, with no
; barrier in between. The sum of the two loaded values is stored to %out.
define amdgpu_kernel void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
  ; Address of @local_mem[0].
  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
  %1 = load i32, i32 addrspace(3)* %0
  ; Global-memory load directly after the local-memory load.
  %2 = load i32, i32 addrspace(1)* %in
  %3 = add i32 %2, %1
  store i32 %3, i32 addrspace(1)* %out
  ret void
}