| Marek Olsak | 79c0587 | 2016-11-25 17:37:09 +0000 | [diff] [blame] | 1 | ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s | 
|  | 2 | ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s | 
| Konstantin Zhuravlyov | 1d65026 | 2016-09-06 20:22:28 +0000 | [diff] [blame] | 3 |  | 
| Marek Olsak | 79c0587 | 2016-11-25 17:37:09 +0000 | [diff] [blame] | 4 | ; If spilling to smem, additional registers are used for the resource | 
|  | 5 | ; descriptor. | 
|  | 6 |  | 
| Valery Pykhtin | 75d1de9 | 2017-01-26 10:51:47 +0000 | [diff] [blame] | 7 | ; ALL-LABEL: {{^}}max_9_sgprs: | 
| Matt Arsenault | 08906a3 | 2016-10-28 19:43:31 +0000 | [diff] [blame] | 8 |  | 
| Marek Olsak | 79c0587 | 2016-11-25 17:37:09 +0000 | [diff] [blame] | 9 | ; ALL: SGPRBlocks: 1 | 
| Valery Pykhtin | 75d1de9 | 2017-01-26 10:51:47 +0000 | [diff] [blame] | 10 | ; ALL: NumSGPRsForWavesPerEU: 9 | 
|  |  | ; Kernel taking five global (addrspace(1)) i32 pointers and five i32 values; | 
|  |  | ; each value is stored through the corresponding pointer. | 
| Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 11 | define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1, | 
| Marek Olsak | 79c0587 | 2016-11-25 17:37:09 +0000 | [diff] [blame] | 12 |  | 
| Konstantin Zhuravlyov | 1d65026 | 2016-09-06 20:22:28 +0000 | [diff] [blame] | 13 | i32 addrspace(1)* %out2, | 
|  | 14 | i32 addrspace(1)* %out3, | 
|  | 15 | i32 addrspace(1)* %out4, | 
| Stanislav Mekhanoshin | 582a523 | 2017-02-15 17:19:50 +0000 | [diff] [blame] | 16 | i32 addrspace(1)* %out5, | 
|  | 17 | i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 { | 
| Konstantin Zhuravlyov | 1d65026 | 2016-09-06 20:22:28 +0000 | [diff] [blame] | 18 | store i32 %one, i32 addrspace(1)* %out1 | 
|  | 19 | store i32 %two, i32 addrspace(1)* %out2 | 
|  | 20 | store i32 %three, i32 addrspace(1)* %out3 | 
|  | 21 | store i32 %four, i32 addrspace(1)* %out4 | 
| Stanislav Mekhanoshin | 582a523 | 2017-02-15 17:19:50 +0000 | [diff] [blame] | 22 | store i32 %five, i32 addrspace(1)* %out5 | 
| Konstantin Zhuravlyov | 1d65026 | 2016-09-06 20:22:28 +0000 | [diff] [blame] | 23 | ret void | 
|  | 24 | } | 
| Matt Arsenault | 08906a3 | 2016-10-28 19:43:31 +0000 | [diff] [blame] | 25 |  | 
|  | 26 | ; private resource: 4 | 
|  | 27 | ; scratch wave offset: 1 | 
|  | 28 | ; workgroup ids: 3 | 
|  | 29 | ; dispatch id: 2 | 
|  | 30 | ; queue ptr: 2 | 
|  | 31 | ; flat scratch init: 2 | 
|  | 32 | ; --------------------- | 
|  | 33 | ; total: 14 | 
|  | 34 |  | 
| Marek Olsak | 693e9be | 2016-12-09 19:49:48 +0000 | [diff] [blame] | 35 | ; + reserved vcc = 16 | 
| Matt Arsenault | 08906a3 | 2016-10-28 19:43:31 +0000 | [diff] [blame] | 36 |  | 
|  | 37 | ; Because we cannot re-use the last few input registers as the special | 
|  | 38 | ; registers (vcc etc.), nor decide to drop the unused features once the | 
|  | 39 | ; number of registers is frozen, this ends up using more SGPRs than | 
|  | 40 | ; expected. | 
|  | 41 |  | 
| Matthias Braun | 537d039 | 2017-06-17 02:08:18 +0000 | [diff] [blame] | 42 | ; XALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs: | 
|  | 43 | ; XTOSGPR: SGPRBlocks: 1 | 
|  | 44 | ; XTOSGPR: NumSGPRsForWavesPerEU: 16 | 
| Matt Arsenault | 08906a3 | 2016-10-28 19:43:31 +0000 | [diff] [blame] | 45 |  | 
| Matthias Braun | 537d039 | 2017-06-17 02:08:18 +0000 | [diff] [blame] | 46 | ; XTOSMEM: s_mov_b64 s[10:11], s[2:3] | 
|  | 47 | ; XTOSMEM: s_mov_b64 s[8:9], s[0:1] | 
|  | 48 | ; XTOSMEM: s_mov_b32 s7, s13 | 
| Matt Arsenault | 08906a3 | 2016-10-28 19:43:31 +0000 | [diff] [blame] | 49 |  | 
| Matthias Braun | 537d039 | 2017-06-17 02:08:18 +0000 | [diff] [blame] | 50 | ; XTOSMEM: SGPRBlocks: 1 | 
|  | 51 | ; XTOSMEM: NumSGPRsForWavesPerEU: 16 | 
|  | 52 | ; | 
|  | 53 | ; This test case is disabled: when calculating the spill slot addresses, AMDGPU | 
|  | 54 | ; creates an extra vreg to save/restore m0, which at a point of maximum register | 
|  | 55 | ; pressure would trigger an endless loop; in practice the compiler aborts earlier | 
|  | 56 | ; with "Incomplete scavenging after 2nd pass". | 
|  | 57 | ;define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1, | 
|  | 58 | ;                                        i32 addrspace(1)* %out2, | 
|  | 59 | ;                                        i32 addrspace(1)* %out3, | 
|  | 60 | ;                                        i32 addrspace(1)* %out4, | 
|  | 61 | ;                                        i32 %one, i32 %two, i32 %three, i32 %four) #2 { | 
|  | 62 | ;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x() | 
|  | 63 | ;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y() | 
|  | 64 | ;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z() | 
|  | 65 | ;  %x.3 = call i64 @llvm.amdgcn.dispatch.id() | 
|  | 66 | ;  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() | 
|  | 67 | ;  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() | 
|  | 68 | ;  store volatile i32 0, i32* undef | 
|  | 69 | ;  br label %stores | 
|  | 70 | ; | 
|  | 71 | ;stores: | 
|  | 72 | ;  store volatile i32 %x.0, i32 addrspace(1)* undef | 
|  | 73 | ;  store volatile i32 %x.0, i32 addrspace(1)* undef | 
|  | 74 | ;  store volatile i32 %x.0, i32 addrspace(1)* undef | 
|  | 75 | ;  store volatile i64 %x.3, i64 addrspace(1)* undef | 
|  | 76 | ;  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef | 
|  | 77 | ;  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef | 
|  | 78 | ; | 
|  | 79 | ;  store i32 %one, i32 addrspace(1)* %out1 | 
|  | 80 | ;  store i32 %two, i32 addrspace(1)* %out2 | 
|  | 81 | ;  store i32 %three, i32 addrspace(1)* %out3 | 
|  | 82 | ;  store i32 %four, i32 addrspace(1)* %out4 | 
|  | 83 | ;  ret void | 
|  | 84 | ;} | 
| Matt Arsenault | 08906a3 | 2016-10-28 19:43:31 +0000 | [diff] [blame] | 85 |  | 
| Matthias Braun | 709a4cc | 2016-12-01 22:39:51 +0000 | [diff] [blame] | 86 | ; The following test is commented out for now; http://llvm.org/PR31230 | 
|  | 87 | ; XALL-LABEL: max_12_sgprs_12_input_sgprs{{$}} | 
| Matt Arsenault | 08906a3 | 2016-10-28 19:43:31 +0000 | [diff] [blame] | 88 | ; ; Make sure copies for the input buffer are not clobbered. This requires | 
|  | 89 | ; ; swapping the order in which the registers are copied, compared with what | 
|  | 90 | ; ; normally happens. | 
|  | 91 |  | 
| Matthias Braun | 709a4cc | 2016-12-01 22:39:51 +0000 | [diff] [blame] | 92 | ; XTOSMEM: s_mov_b32 s5, s11 | 
|  | 93 | ; XTOSMEM: s_add_u32 m0, s5, | 
|  | 94 | ; XTOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0 | 
| Matt Arsenault | 08906a3 | 2016-10-28 19:43:31 +0000 | [diff] [blame] | 95 |  | 
| Matthias Braun | 709a4cc | 2016-12-01 22:39:51 +0000 | [diff] [blame] | 96 | ; XALL: SGPRBlocks: 2 | 
|  | 97 | ; XALL: NumSGPRsForWavesPerEU: 18 | 
| Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 98 | ;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1, | 
| Matthias Braun | 709a4cc | 2016-12-01 22:39:51 +0000 | [diff] [blame] | 99 | ;                                        i32 addrspace(1)* %out2, | 
|  | 100 | ;                                        i32 addrspace(1)* %out3, | 
|  | 101 | ;                                        i32 addrspace(1)* %out4, | 
|  | 102 | ;                                        i32 %one, i32 %two, i32 %three, i32 %four) #2 { | 
|  | 103 | ;  store volatile i32 0, i32* undef | 
|  | 104 | ;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x() | 
|  | 105 | ;  store volatile i32 %x.0, i32 addrspace(1)* undef | 
|  | 106 | ;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y() | 
|  | 107 | ;  store volatile i32 %x.0, i32 addrspace(1)* undef | 
|  | 108 | ;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z() | 
|  | 109 | ;  store volatile i32 %x.0, i32 addrspace(1)* undef | 
|  | 110 | ;  %x.3 = call i64 @llvm.amdgcn.dispatch.id() | 
|  | 111 | ;  store volatile i64 %x.3, i64 addrspace(1)* undef | 
|  | 112 | ;  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() | 
|  | 113 | ;  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef | 
|  | 114 | ; | 
|  | 115 | ;  store i32 %one, i32 addrspace(1)* %out1 | 
|  | 116 | ;  store i32 %two, i32 addrspace(1)* %out2 | 
|  | 117 | ;  store i32 %three, i32 addrspace(1)* %out3 | 
|  | 118 | ;  store i32 %four, i32 addrspace(1)* %out4 | 
|  | 119 | ;  ret void | 
|  | 120 | ;} | 
| Matt Arsenault | 08906a3 | 2016-10-28 19:43:31 +0000 | [diff] [blame] | 121 |  | 
|  |  | ; Intrinsic declarations. In the visible part of this file they are referenced | 
|  |  | ; only from the commented-out test bodies. | 
|  | 122 | declare i32 @llvm.amdgcn.workgroup.id.x() #1 | 
|  | 123 | declare i32 @llvm.amdgcn.workgroup.id.y() #1 | 
|  | 124 | declare i32 @llvm.amdgcn.workgroup.id.z() #1 | 
|  | 125 | declare i64 @llvm.amdgcn.dispatch.id() #1 | 
|  | 126 | declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1 | 
|  | 127 | declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1 | 
|  | 128 |  | 
|  |  | ; Attribute groups: #0 is attached to max_9_sgprs, #1 to the intrinsic | 
|  |  | ; declarations, and #2 to the commented-out kernels. | 
|  |  | ; NOTE(review): #3 is not referenced in the visible part of this file - confirm | 
|  |  | ; whether it is still needed. | 
|  | 129 | attributes #0 = { nounwind "amdgpu-num-sgpr"="14" } | 
|  | 130 | attributes #1 = { nounwind readnone } | 
|  | 131 | attributes #2 = { nounwind "amdgpu-num-sgpr"="12" } | 
|  | 132 | attributes #3 = { nounwind "amdgpu-num-sgpr"="11" } | 