Matt Arsenault | 4c519d3 | 2016-07-18 18:34:59 +0000 | [diff] [blame] | 1 | ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 2 | ; |
| 3 | ; This test checks that the lds input queue is empty at the end of |
| 4 | ; the ALU clause. |
| 5 | |
Tom Stellard | 79243d9 | 2014-10-01 17:15:17 +0000 | [diff] [blame] | 6 | ; CHECK-LABEL: {{^}}lds_input_queue: |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 7 | ; CHECK: LDS_READ_RET * OQAP |
| 8 | ; CHECK-NOT: ALU clause |
| 9 | ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP |
| 10 | |
Matt Arsenault | cc8d3b8 | 2014-11-13 19:56:13 +0000 | [diff] [blame] | 11 | @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4 ; addrspace(3) array read by both test kernels in this file |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 12 | |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 13 | define amdgpu_kernel void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 14 | entry: |
David Blaikie | 79e6c74 | 2015-02-27 19:29:02 +0000 | [diff] [blame] | 15 | %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index |
David Blaikie | a79ac14 | 2015-02-27 21:17:42 +0000 | [diff] [blame] | 16 | %1 = load i32, i32 addrspace(3)* %0 ; local (addrspace(3)) read of @local_mem[%index] |
Matt Arsenault | 4c519d3 | 2016-07-18 18:34:59 +0000 | [diff] [blame] | 17 | call void @llvm.r600.group.barrier() ; barrier sits between the local read and the global read |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 18 | |
| 19 | ; This global (addrspace(1)) load is a vertex fetch, which will start a new clause |
David Blaikie | a79ac14 | 2015-02-27 21:17:42 +0000 | [diff] [blame] | 20 | %2 = load i32, i32 addrspace(1)* %in |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 21 | %3 = add i32 %1, %2 |
| 22 | store i32 %3, i32 addrspace(1)* %out |
| 23 | ret void |
| 24 | } |
| 25 | |
Matt Arsenault | 4c519d3 | 2016-07-18 18:34:59 +0000 | [diff] [blame] | 26 | declare void @llvm.r600.group.barrier() nounwind convergent ; called by @lds_input_queue |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 27 | |
| 28 | ; The machine scheduler does not do proper alias analysis and assumes that |
| 29 | ; loads from global values (Note that a global value is different than a |
| 30 | ; value from global memory. A global value is a value that is declared |
| 31 | ; outside of a function, it can reside in any address space) alias with |
| 32 | ; all other loads. |
| 33 | ; |
| 34 | ; This is a problem for scheduling the reads from the local data share (lds). |
| 35 | ; These reads are implemented using two instructions. The first copies the |
| 36 | ; data from lds into the lds output queue, and the second moves the data from |
| 37 | ; the output queue into main memory. These two instructions don't have to be |
| 38 | ; scheduled one after the other, but they do need to be scheduled in the same |
| 39 | ; clause. The aliasing problem mentioned above causes problems when there is a |
| 40 | ; load from global memory which immediately follows a load from a global value that |
| 41 | ; has been declared in the local memory space: |
| 42 | ; |
David Blaikie | 79e6c74 | 2015-02-27 19:29:02 +0000 | [diff] [blame] | 43 | ; %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index |
David Blaikie | a79ac14 | 2015-02-27 21:17:42 +0000 | [diff] [blame] | 44 | ; %1 = load i32, i32 addrspace(3)* %0 |
| 45 | ; %2 = load i32, i32 addrspace(1)* %in |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 46 | ; |
| 47 | ; The instruction selection phase will generate ISA that looks like this: |
Francis Visoiu Mistrih | 9d7bb0c | 2017-11-28 17:15:09 +0000 | [diff] [blame] | 48 | ; %oqap = LDS_READ_RET |
Francis Visoiu Mistrih | 93ef145 | 2017-11-30 12:12:19 +0000 | [diff] [blame] | 49 | ; %0 = MOV %oqap |
| 50 | ; %1 = VTX_READ_32 |
| 51 | ; %2 = ADD_INT %1, %0 |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 52 | ; |
| 53 | ; The bottom scheduler will schedule the two ALU instructions first: |
| 54 | ; |
| 55 | ; UNSCHEDULED: |
Francis Visoiu Mistrih | 9d7bb0c | 2017-11-28 17:15:09 +0000 | [diff] [blame] | 56 | ; %oqap = LDS_READ_RET |
Francis Visoiu Mistrih | 93ef145 | 2017-11-30 12:12:19 +0000 | [diff] [blame] | 57 | ; %1 = VTX_READ_32 |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 58 | ; |
| 59 | ; SCHEDULED: |
| 60 | ; |
Francis Visoiu Mistrih | 93ef145 | 2017-11-30 12:12:19 +0000 | [diff] [blame] | 61 | ; %0 = MOV %oqap |
| 62 | ; %2 = ADD_INT %1, %0 |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 63 | ; |
| 64 | ; The lack of proper alias analysis makes the local memory read (LDS_READ_RET) |
| 65 | ; and the global memory read (VTX_READ_32) appear to have a chain dependency, so |
| 66 | ; the global memory read will always be scheduled first. This will give us a |
| 67 | ; final program which looks like this: |
| 68 | ; |
| 69 | ; Alu clause: |
Francis Visoiu Mistrih | 9d7bb0c | 2017-11-28 17:15:09 +0000 | [diff] [blame] | 70 | ; %oqap = LDS_READ_RET |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 71 | ; VTX clause: |
Francis Visoiu Mistrih | 93ef145 | 2017-11-30 12:12:19 +0000 | [diff] [blame] | 72 | ; %1 = VTX_READ_32 |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 73 | ; Alu clause: |
Francis Visoiu Mistrih | 93ef145 | 2017-11-30 12:12:19 +0000 | [diff] [blame] | 74 | ; %0 = MOV %oqap |
| 75 | ; %2 = ADD_INT %1, %0 |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 76 | ; |
Francis Visoiu Mistrih | 9d7bb0c | 2017-11-28 17:15:09 +0000 | [diff] [blame] | 77 | ; This is an illegal program because the oqap def and use now occur in |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 78 | ; different ALU clauses. |
| 79 | ; |
| 80 | ; This test checks this scenario and makes sure it doesn't result in an |
| 81 | ; illegal program. For now, we have fixed this issue by merging the |
| 82 | ; LDS_READ_RET and MOV together during instruction selection and then |
| 83 | ; expanding them after scheduling. Once the scheduler has better alias |
| 84 | ; analysis, we should be able to keep these instructions separate before |
| 85 | ; scheduling. |
| 86 | ; |
Tom Stellard | 79243d9 | 2014-10-01 17:15:17 +0000 | [diff] [blame] | 87 | ; CHECK-LABEL: {{^}}local_global_alias: |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 88 | ; CHECK: LDS_READ_RET |
| 89 | ; CHECK-NOT: ALU clause |
Benjamin Kramer | c10563d | 2014-01-11 21:06:00 +0000 | [diff] [blame] | 90 | ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 91 | define amdgpu_kernel void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 92 | entry: |
David Blaikie | 79e6c74 | 2015-02-27 19:29:02 +0000 | [diff] [blame] | 93 | %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0 |
David Blaikie | a79ac14 | 2015-02-27 21:17:42 +0000 | [diff] [blame] | 94 | %1 = load i32, i32 addrspace(3)* %0 ; load from the global value @local_mem in addrspace(3) |
| 95 | %2 = load i32, i32 addrspace(1)* %in ; addrspace(1) load immediately following the local load |
Tom Stellard | 8f9fc20 | 2013-11-15 00:12:45 +0000 | [diff] [blame] | 96 | %3 = add i32 %2, %1 |
| 97 | store i32 %3, i32 addrspace(1)* %out |
| 98 | ret void |
| 99 | } |