Marek Olšák | 7209703 | 2014-01-22 18:50:36 +0100 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2013 Advanced Micro Devices, Inc. |
| 3 | * |
| 4 | * Permission is hereby granted, free of charge, to any person obtaining a |
| 5 | * copy of this software and associated documentation files (the "Software"), |
| 6 | * to deal in the Software without restriction, including without limitation |
| 7 | * on the rights to use, copy, modify, merge, publish, distribute, sub |
| 8 | * license, and/or sell copies of the Software, and to permit persons to whom |
| 9 | * the Software is furnished to do so, subject to the following conditions: |
| 10 | * |
| 11 | * The above copyright notice and this permission notice (including the next |
| 12 | * paragraph) shall be included in all copies or substantial portions of the |
| 13 | * Software. |
| 14 | * |
| 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL |
| 18 | * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, |
| 19 | * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
| 20 | * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
| 21 | * USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 22 | * |
| 23 | */ |
| 24 | |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 25 | #include "util/u_memory.h" |
| 26 | |
Marek Olšák | a81c3e0 | 2013-08-14 01:04:39 +0200 | [diff] [blame] | 27 | #include "../radeon/r600_cs.h" |
Andreas Hartmetz | 786af2f | 2014-01-04 18:44:33 +0100 | [diff] [blame] | 28 | #include "si_pipe.h" |
| 29 | #include "si_shader.h" |
Marek Olšák | 7209703 | 2014-01-22 18:50:36 +0100 | [diff] [blame] | 30 | #include "sid.h" |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 31 | |
| 32 | #include "radeon_llvm_util.h" |
| 33 | |
/* Number of slots in si_pipe_compute::global_buffers. */
#define MAX_GLOBAL_BUFFERS 20

/*
 * Number of user SGPRs programmed for a compute dispatch (used for
 * COMPUTE_PGM_RSRC2.USER_SGPR and the SGPR count in COMPUTE_PGM_RSRC1).
 * The value depends on the LLVM version the driver is built against.
 */
#if HAVE_LLVM >= 0x0305
#define NUM_USER_SGPRS 4
#else
#define NUM_USER_SGPRS 2
#endif
Tom Stellard | 67e5c9a | 2013-05-17 17:02:25 -0400 | [diff] [blame] | 40 | |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 41 | struct si_pipe_compute { |
Andreas Hartmetz | 238aeab | 2014-01-11 15:47:07 +0100 | [diff] [blame] | 42 | struct si_context *ctx; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 43 | |
| 44 | unsigned local_size; |
| 45 | unsigned private_size; |
| 46 | unsigned input_size; |
Tom Stellard | d2472ce | 2013-05-21 17:02:33 -0400 | [diff] [blame] | 47 | unsigned num_kernels; |
| 48 | struct si_pipe_shader *kernels; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 49 | unsigned num_user_sgprs; |
| 50 | |
Tom Stellard | 67e5c9a | 2013-05-17 17:02:25 -0400 | [diff] [blame] | 51 | struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS]; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 52 | |
Aaron Watry | 8c9a920 | 2013-12-12 16:34:09 -0600 | [diff] [blame] | 53 | LLVMContextRef llvm_ctx; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 54 | }; |
| 55 | |
Andreas Hartmetz | 3160aa4 | 2014-01-07 02:53:26 +0100 | [diff] [blame] | 56 | static void *si_create_compute_state( |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 57 | struct pipe_context *ctx, |
| 58 | const struct pipe_compute_state *cso) |
| 59 | { |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 60 | struct si_context *sctx = (struct si_context *)ctx; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 61 | struct si_pipe_compute *program = |
| 62 | CALLOC_STRUCT(si_pipe_compute); |
| 63 | const struct pipe_llvm_program_header *header; |
| 64 | const unsigned char *code; |
Tom Stellard | d2472ce | 2013-05-21 17:02:33 -0400 | [diff] [blame] | 65 | unsigned i; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 66 | |
Aaron Watry | 8c9a920 | 2013-12-12 16:34:09 -0600 | [diff] [blame] | 67 | program->llvm_ctx = LLVMContextCreate(); |
| 68 | |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 69 | header = cso->prog; |
| 70 | code = cso->prog + sizeof(struct pipe_llvm_program_header); |
| 71 | |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 72 | program->ctx = sctx; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 73 | program->local_size = cso->req_local_mem; |
| 74 | program->private_size = cso->req_private_mem; |
| 75 | program->input_size = cso->req_input_mem; |
| 76 | |
Aaron Watry | 8c9a920 | 2013-12-12 16:34:09 -0600 | [diff] [blame] | 77 | program->num_kernels = radeon_llvm_get_num_kernels(program->llvm_ctx, code, |
Tom Stellard | d2472ce | 2013-05-21 17:02:33 -0400 | [diff] [blame] | 78 | header->num_bytes); |
| 79 | program->kernels = CALLOC(sizeof(struct si_pipe_shader), |
| 80 | program->num_kernels); |
| 81 | for (i = 0; i < program->num_kernels; i++) { |
Aaron Watry | 8c9a920 | 2013-12-12 16:34:09 -0600 | [diff] [blame] | 82 | LLVMModuleRef mod = radeon_llvm_get_kernel_module(program->llvm_ctx, i, |
| 83 | code, header->num_bytes); |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 84 | si_compile_llvm(sctx, &program->kernels[i], mod); |
Aaron Watry | 4c6ac9e | 2013-11-06 16:49:23 -0600 | [diff] [blame] | 85 | LLVMDisposeModule(mod); |
Tom Stellard | d2472ce | 2013-05-21 17:02:33 -0400 | [diff] [blame] | 86 | } |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 87 | |
| 88 | return program; |
| 89 | } |
| 90 | |
Andreas Hartmetz | 3160aa4 | 2014-01-07 02:53:26 +0100 | [diff] [blame] | 91 | static void si_bind_compute_state(struct pipe_context *ctx, void *state) |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 92 | { |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 93 | struct si_context *sctx = (struct si_context*)ctx; |
| 94 | sctx->cs_shader_state.program = (struct si_pipe_compute*)state; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 95 | } |
| 96 | |
Andreas Hartmetz | 3160aa4 | 2014-01-07 02:53:26 +0100 | [diff] [blame] | 97 | static void si_set_global_binding( |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 98 | struct pipe_context *ctx, unsigned first, unsigned n, |
| 99 | struct pipe_resource **resources, |
| 100 | uint32_t **handles) |
| 101 | { |
| 102 | unsigned i; |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 103 | struct si_context *sctx = (struct si_context*)ctx; |
| 104 | struct si_pipe_compute *program = sctx->cs_shader_state.program; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 105 | |
| 106 | if (!resources) { |
Tom Stellard | 67e5c9a | 2013-05-17 17:02:25 -0400 | [diff] [blame] | 107 | for (i = first; i < first + n; i++) { |
| 108 | program->global_buffers[i] = NULL; |
| 109 | } |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 110 | return; |
| 111 | } |
| 112 | |
| 113 | for (i = first; i < first + n; i++) { |
Tom Stellard | 67e5c9a | 2013-05-17 17:02:25 -0400 | [diff] [blame] | 114 | uint64_t va; |
Tom Stellard | 945d87f | 2014-02-13 14:46:25 -0800 | [diff] [blame] | 115 | uint32_t offset; |
Tom Stellard | 67e5c9a | 2013-05-17 17:02:25 -0400 | [diff] [blame] | 116 | program->global_buffers[i] = resources[i]; |
| 117 | va = r600_resource_va(ctx->screen, resources[i]); |
Tom Stellard | 945d87f | 2014-02-13 14:46:25 -0800 | [diff] [blame] | 118 | offset = util_le32_to_cpu(*handles[i]); |
| 119 | va += offset; |
| 120 | va = util_cpu_to_le64(va); |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 121 | memcpy(handles[i], &va, sizeof(va)); |
| 122 | } |
| 123 | } |
| 124 | |
Tom Stellard | b0f7803 | 2014-07-18 14:45:18 -0400 | [diff] [blame^] | 125 | /** |
| 126 | * This function computes the value for R_00B860_COMPUTE_TMPRING_SIZE.WAVES |
| 127 | * /p block_layout is the number of threads in each work group. |
| 128 | * /p grid layout is the number of work groups. |
| 129 | */ |
| 130 | static unsigned compute_num_waves_for_scratch( |
| 131 | const struct radeon_info *info, |
| 132 | const uint *block_layout, |
| 133 | const uint *grid_layout) |
| 134 | { |
| 135 | unsigned num_sh = MAX2(info->max_sh_per_se, 1); |
| 136 | unsigned num_se = MAX2(info->max_se, 1); |
| 137 | unsigned num_blocks = 1; |
| 138 | unsigned threads_per_block = 1; |
| 139 | unsigned waves_per_block; |
| 140 | unsigned waves_per_sh; |
| 141 | unsigned waves; |
| 142 | unsigned scratch_waves; |
| 143 | unsigned i; |
| 144 | |
| 145 | for (i = 0; i < 3; i++) { |
| 146 | threads_per_block *= block_layout[i]; |
| 147 | num_blocks *= grid_layout[i]; |
| 148 | } |
| 149 | |
| 150 | waves_per_block = align(threads_per_block, 64) / 64; |
| 151 | waves = waves_per_block * num_blocks; |
| 152 | waves_per_sh = align(waves, num_sh * num_se) / (num_sh * num_se); |
| 153 | scratch_waves = waves_per_sh * num_sh * num_se; |
| 154 | |
| 155 | if (waves_per_block > waves_per_sh) { |
| 156 | scratch_waves = waves_per_block * num_sh * num_se; |
| 157 | } |
| 158 | |
| 159 | return scratch_waves; |
| 160 | } |
| 161 | |
Andreas Hartmetz | 3160aa4 | 2014-01-07 02:53:26 +0100 | [diff] [blame] | 162 | static void si_launch_grid( |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 163 | struct pipe_context *ctx, |
| 164 | const uint *block_layout, const uint *grid_layout, |
| 165 | uint32_t pc, const void *input) |
| 166 | { |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 167 | struct si_context *sctx = (struct si_context*)ctx; |
| 168 | struct si_pipe_compute *program = sctx->cs_shader_state.program; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 169 | struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); |
Marek Olšák | a81c3e0 | 2013-08-14 01:04:39 +0200 | [diff] [blame] | 170 | struct r600_resource *kernel_args_buffer = NULL; |
Tom Stellard | 124e1f9 | 2013-05-13 22:13:53 -0400 | [diff] [blame] | 171 | unsigned kernel_args_size; |
| 172 | unsigned num_work_size_bytes = 36; |
| 173 | uint32_t kernel_args_offset = 0; |
| 174 | uint32_t *kernel_args; |
| 175 | uint64_t kernel_args_va; |
Tom Stellard | b0f7803 | 2014-07-18 14:45:18 -0400 | [diff] [blame^] | 176 | uint64_t scratch_buffer_va = 0; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 177 | uint64_t shader_va; |
Tom Stellard | 6cc5334 | 2014-07-18 14:40:50 -0400 | [diff] [blame] | 178 | unsigned arg_user_sgpr_count = NUM_USER_SGPRS; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 179 | unsigned i; |
Tom Stellard | d2472ce | 2013-05-21 17:02:33 -0400 | [diff] [blame] | 180 | struct si_pipe_shader *shader = &program->kernels[pc]; |
Tom Stellard | 1bdb993 | 2013-08-22 11:22:58 -0400 | [diff] [blame] | 181 | unsigned lds_blocks; |
Tom Stellard | b0f7803 | 2014-07-18 14:45:18 -0400 | [diff] [blame^] | 182 | unsigned num_waves_for_scratch; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 183 | |
| 184 | pm4->compute_pkt = true; |
| 185 | si_cmd_context_control(pm4); |
| 186 | |
| 187 | si_pm4_cmd_begin(pm4, PKT3_EVENT_WRITE); |
| 188 | si_pm4_cmd_add(pm4, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH) | |
| 189 | EVENT_INDEX(0x7) | |
| 190 | EVENT_WRITE_INV_L2); |
| 191 | si_pm4_cmd_end(pm4, false); |
| 192 | |
| 193 | si_pm4_inval_texture_cache(pm4); |
| 194 | si_pm4_inval_shader_cache(pm4); |
| 195 | si_cmd_surface_sync(pm4, pm4->cp_coher_cntl); |
| 196 | |
Tom Stellard | 124e1f9 | 2013-05-13 22:13:53 -0400 | [diff] [blame] | 197 | /* Upload the kernel arguments */ |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 198 | |
Tom Stellard | 124e1f9 | 2013-05-13 22:13:53 -0400 | [diff] [blame] | 199 | /* The extra num_work_size_bytes are for work group / work item size information */ |
Tom Stellard | b0f7803 | 2014-07-18 14:45:18 -0400 | [diff] [blame^] | 200 | kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For scratch va */; |
| 201 | |
Tom Stellard | 124e1f9 | 2013-05-13 22:13:53 -0400 | [diff] [blame] | 202 | kernel_args = MALLOC(kernel_args_size); |
| 203 | for (i = 0; i < 3; i++) { |
| 204 | kernel_args[i] = grid_layout[i]; |
| 205 | kernel_args[i + 3] = grid_layout[i] * block_layout[i]; |
| 206 | kernel_args[i + 6] = block_layout[i]; |
| 207 | } |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 208 | |
Tom Stellard | b0f7803 | 2014-07-18 14:45:18 -0400 | [diff] [blame^] | 209 | num_waves_for_scratch = compute_num_waves_for_scratch( |
| 210 | &sctx->screen->b.info, block_layout, grid_layout); |
| 211 | |
Tom Stellard | 124e1f9 | 2013-05-13 22:13:53 -0400 | [diff] [blame] | 212 | memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size); |
| 213 | |
Tom Stellard | b0f7803 | 2014-07-18 14:45:18 -0400 | [diff] [blame^] | 214 | if (shader->scratch_bytes_per_wave > 0) { |
| 215 | unsigned scratch_bytes = shader->scratch_bytes_per_wave * |
| 216 | num_waves_for_scratch; |
| 217 | |
| 218 | COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; " |
| 219 | "Total Scratch: %u bytes\n", num_waves_for_scratch, |
| 220 | shader->scratch_bytes_per_wave, scratch_bytes); |
| 221 | if (!shader->scratch_bo) { |
| 222 | shader->scratch_bo = (struct r600_resource*) |
| 223 | si_resource_create_custom(sctx->b.b.screen, |
| 224 | PIPE_USAGE_DEFAULT, scratch_bytes); |
| 225 | } |
| 226 | scratch_buffer_va = r600_resource_va(ctx->screen, |
| 227 | (struct pipe_resource*)shader->scratch_bo); |
| 228 | si_pm4_add_bo(pm4, shader->scratch_bo, |
| 229 | RADEON_USAGE_READWRITE, |
| 230 | RADEON_PRIO_SHADER_RESOURCE_RW); |
| 231 | |
| 232 | } |
| 233 | |
Tom Stellard | 245e861 | 2014-07-18 12:25:29 -0400 | [diff] [blame] | 234 | for (i = 0; i < (kernel_args_size / 4); i++) { |
| 235 | COMPUTE_DBG(sctx->screen, "input %u : %u\n", i, |
| 236 | kernel_args[i]); |
| 237 | } |
| 238 | |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 239 | si_upload_const_buffer(sctx, &kernel_args_buffer, (uint8_t*)kernel_args, |
Tom Stellard | 124e1f9 | 2013-05-13 22:13:53 -0400 | [diff] [blame] | 240 | kernel_args_size, &kernel_args_offset); |
| 241 | kernel_args_va = r600_resource_va(ctx->screen, |
| 242 | (struct pipe_resource*)kernel_args_buffer); |
| 243 | kernel_args_va += kernel_args_offset; |
| 244 | |
Marek Olšák | bee2b96 | 2014-02-20 15:39:35 +0100 | [diff] [blame] | 245 | si_pm4_add_bo(pm4, kernel_args_buffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); |
Tom Stellard | 124e1f9 | 2013-05-13 22:13:53 -0400 | [diff] [blame] | 246 | |
| 247 | si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0, kernel_args_va); |
| 248 | si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 4, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) | S_008F04_STRIDE(0)); |
Tom Stellard | b0f7803 | 2014-07-18 14:45:18 -0400 | [diff] [blame^] | 249 | si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 8, scratch_buffer_va); |
| 250 | si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 12, |
| 251 | S_008F04_BASE_ADDRESS_HI(scratch_buffer_va >> 32) |
| 252 | | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64)); |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 253 | |
| 254 | si_pm4_set_reg(pm4, R_00B810_COMPUTE_START_X, 0); |
| 255 | si_pm4_set_reg(pm4, R_00B814_COMPUTE_START_Y, 0); |
| 256 | si_pm4_set_reg(pm4, R_00B818_COMPUTE_START_Z, 0); |
| 257 | |
| 258 | si_pm4_set_reg(pm4, R_00B81C_COMPUTE_NUM_THREAD_X, |
| 259 | S_00B81C_NUM_THREAD_FULL(block_layout[0])); |
| 260 | si_pm4_set_reg(pm4, R_00B820_COMPUTE_NUM_THREAD_Y, |
| 261 | S_00B820_NUM_THREAD_FULL(block_layout[1])); |
| 262 | si_pm4_set_reg(pm4, R_00B824_COMPUTE_NUM_THREAD_Z, |
| 263 | S_00B824_NUM_THREAD_FULL(block_layout[2])); |
| 264 | |
Tom Stellard | 67e5c9a | 2013-05-17 17:02:25 -0400 | [diff] [blame] | 265 | /* Global buffers */ |
| 266 | for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) { |
Marek Olšák | a81c3e0 | 2013-08-14 01:04:39 +0200 | [diff] [blame] | 267 | struct r600_resource *buffer = |
| 268 | (struct r600_resource*)program->global_buffers[i]; |
Tom Stellard | 67e5c9a | 2013-05-17 17:02:25 -0400 | [diff] [blame] | 269 | if (!buffer) { |
| 270 | continue; |
| 271 | } |
Marek Olšák | bee2b96 | 2014-02-20 15:39:35 +0100 | [diff] [blame] | 272 | si_pm4_add_bo(pm4, buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW); |
Tom Stellard | 67e5c9a | 2013-05-17 17:02:25 -0400 | [diff] [blame] | 273 | } |
| 274 | |
Tom Stellard | a859131 | 2013-10-16 13:43:08 -0400 | [diff] [blame] | 275 | /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID |
| 276 | * and is now per pipe, so it should be handled in the |
| 277 | * kernel if we want to use something other than the default value, |
| 278 | * which is now 0x22f. |
| 279 | */ |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 280 | if (sctx->b.chip_class <= SI) { |
Tom Stellard | a859131 | 2013-10-16 13:43:08 -0400 | [diff] [blame] | 281 | /* XXX: This should be: |
| 282 | * (number of compute units) * 4 * (waves per simd) - 1 */ |
| 283 | |
| 284 | si_pm4_set_reg(pm4, R_00B82C_COMPUTE_MAX_WAVE_ID, |
| 285 | 0x190 /* Default value */); |
| 286 | } |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 287 | |
Tom Stellard | d2472ce | 2013-05-21 17:02:33 -0400 | [diff] [blame] | 288 | shader_va = r600_resource_va(ctx->screen, (void *)shader->bo); |
Marek Olšák | bee2b96 | 2014-02-20 15:39:35 +0100 | [diff] [blame] | 289 | si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 290 | si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, (shader_va >> 8) & 0xffffffff); |
| 291 | si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40); |
| 292 | |
| 293 | si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, |
| 294 | /* We always use at least 3 VGPRS, these come from |
| 295 | * TIDIG_COMP_CNT. |
| 296 | * XXX: The compiler should account for this. |
| 297 | */ |
Tom Stellard | d2472ce | 2013-05-21 17:02:33 -0400 | [diff] [blame] | 298 | S_00B848_VGPRS((MAX2(3, shader->num_vgprs) - 1) / 4) |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 299 | /* We always use at least 4 + arg_user_sgpr_count. The 4 extra |
| 300 | * sgprs are from TGID_X_EN, TGID_Y_EN, TGID_Z_EN, TG_SIZE_EN |
| 301 | * XXX: The compiler should account for this. |
| 302 | */ |
| 303 | | S_00B848_SGPRS(((MAX2(4 + arg_user_sgpr_count, |
Tom Stellard | d2472ce | 2013-05-21 17:02:33 -0400 | [diff] [blame] | 304 | shader->num_sgprs)) - 1) / 8)) |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 305 | ; |
| 306 | |
Tom Stellard | 1bdb993 | 2013-08-22 11:22:58 -0400 | [diff] [blame] | 307 | lds_blocks = shader->lds_size; |
| 308 | /* XXX: We are over allocating LDS. For SI, the shader reports LDS in |
| 309 | * blocks of 256 bytes, so if there are 4 bytes lds allocated in |
| 310 | * the shader and 4 bytes allocated by the state tracker, then |
| 311 | * we will set LDS_SIZE to 512 bytes rather than 256. |
| 312 | */ |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 313 | if (sctx->b.chip_class <= SI) { |
Tom Stellard | 1bdb993 | 2013-08-22 11:22:58 -0400 | [diff] [blame] | 314 | lds_blocks += align(program->local_size, 256) >> 8; |
| 315 | } else { |
| 316 | lds_blocks += align(program->local_size, 512) >> 9; |
| 317 | } |
| 318 | |
| 319 | assert(lds_blocks <= 0xFF); |
| 320 | |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 321 | si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, |
Tom Stellard | b0f7803 | 2014-07-18 14:45:18 -0400 | [diff] [blame^] | 322 | S_00B84C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0) |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 323 | | S_00B84C_USER_SGPR(arg_user_sgpr_count) |
| 324 | | S_00B84C_TGID_X_EN(1) |
| 325 | | S_00B84C_TGID_Y_EN(1) |
| 326 | | S_00B84C_TGID_Z_EN(1) |
| 327 | | S_00B84C_TG_SIZE_EN(1) |
| 328 | | S_00B84C_TIDIG_COMP_CNT(2) |
Tom Stellard | 1bdb993 | 2013-08-22 11:22:58 -0400 | [diff] [blame] | 329 | | S_00B84C_LDS_SIZE(lds_blocks) |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 330 | | S_00B84C_EXCP_EN(0)) |
| 331 | ; |
| 332 | si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0); |
| 333 | |
| 334 | si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, |
| 335 | S_00B858_SH0_CU_EN(0xffff /* Default value */) |
| 336 | | S_00B858_SH1_CU_EN(0xffff /* Default value */)) |
| 337 | ; |
| 338 | |
| 339 | si_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, |
| 340 | S_00B85C_SH0_CU_EN(0xffff /* Default value */) |
| 341 | | S_00B85C_SH1_CU_EN(0xffff /* Default value */)) |
| 342 | ; |
| 343 | |
Tom Stellard | b0f7803 | 2014-07-18 14:45:18 -0400 | [diff] [blame^] | 344 | si_pm4_set_reg(pm4, R_00B860_COMPUTE_TMPRING_SIZE, |
| 345 | /* The maximum value for WAVES is 32 * num CU. |
| 346 | * If you program this value incorrectly, the GPU will hang if |
| 347 | * COMPUTE_PGM_RSRC2.SCRATCH_EN is enabled. |
| 348 | */ |
| 349 | S_00B860_WAVES(num_waves_for_scratch) |
| 350 | | S_00B860_WAVESIZE(shader->scratch_bytes_per_wave >> 10)) |
| 351 | ; |
| 352 | |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 353 | si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT); |
| 354 | si_pm4_cmd_add(pm4, grid_layout[0]); /* Thread groups DIM_X */ |
| 355 | si_pm4_cmd_add(pm4, grid_layout[1]); /* Thread groups DIM_Y */ |
| 356 | si_pm4_cmd_add(pm4, grid_layout[2]); /* Thread gropus DIM_Z */ |
| 357 | si_pm4_cmd_add(pm4, 1); /* DISPATCH_INITIATOR */ |
| 358 | si_pm4_cmd_end(pm4, false); |
| 359 | |
| 360 | si_pm4_cmd_begin(pm4, PKT3_EVENT_WRITE); |
| 361 | si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(0x4))); |
| 362 | si_pm4_cmd_end(pm4, false); |
| 363 | |
| 364 | si_pm4_inval_texture_cache(pm4); |
| 365 | si_pm4_inval_shader_cache(pm4); |
| 366 | si_cmd_surface_sync(pm4, pm4->cp_coher_cntl); |
| 367 | |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 368 | si_pm4_emit(sctx, pm4); |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 369 | |
| 370 | #if 0 |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 371 | fprintf(stderr, "cdw: %i\n", sctx->cs->cdw); |
| 372 | for (i = 0; i < sctx->cs->cdw; i++) { |
| 373 | fprintf(stderr, "%4i : 0x%08X\n", i, sctx->cs->buf[i]); |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 374 | } |
| 375 | #endif |
| 376 | |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 377 | FREE(pm4); |
Vinson Lee | f12e551 | 2013-06-25 21:37:07 -0700 | [diff] [blame] | 378 | FREE(kernel_args); |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 379 | } |
| 380 | |
| 381 | |
Aaron Watry | 35dad4a | 2013-11-06 16:49:22 -0600 | [diff] [blame] | 382 | static void si_delete_compute_state(struct pipe_context *ctx, void* state){ |
| 383 | struct si_pipe_compute *program = (struct si_pipe_compute *)state; |
| 384 | |
| 385 | if (!state) { |
| 386 | return; |
| 387 | } |
| 388 | |
| 389 | if (program->kernels) { |
Aaron Watry | ec1ada7 | 2014-03-12 13:26:10 -0500 | [diff] [blame] | 390 | for (int i = 0; i < program->num_kernels; i++){ |
| 391 | if (program->kernels[i].bo){ |
| 392 | si_pipe_shader_destroy(ctx, &program->kernels[i]); |
| 393 | } |
| 394 | } |
| 395 | |
Aaron Watry | 35dad4a | 2013-11-06 16:49:22 -0600 | [diff] [blame] | 396 | FREE(program->kernels); |
| 397 | } |
| 398 | |
Aaron Watry | 8c9a920 | 2013-12-12 16:34:09 -0600 | [diff] [blame] | 399 | if (program->llvm_ctx){ |
| 400 | LLVMContextDispose(program->llvm_ctx); |
| 401 | } |
| 402 | |
Aaron Watry | 35dad4a | 2013-11-06 16:49:22 -0600 | [diff] [blame] | 403 | //And then free the program itself. |
| 404 | FREE(program); |
| 405 | } |
| 406 | |
/**
 * set_compute_resources hook.  Intentionally empty: all arguments are
 * ignored and nothing is bound.
 */
static void si_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
}
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 410 | |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 411 | void si_init_compute_functions(struct si_context *sctx) |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 412 | { |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 413 | sctx->b.b.create_compute_state = si_create_compute_state; |
| 414 | sctx->b.b.delete_compute_state = si_delete_compute_state; |
| 415 | sctx->b.b.bind_compute_state = si_bind_compute_state; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 416 | /* ctx->context.create_sampler_view = evergreen_compute_create_sampler_view; */ |
Andreas Hartmetz | 8662e66 | 2014-01-11 16:00:50 +0100 | [diff] [blame] | 417 | sctx->b.b.set_compute_resources = si_set_compute_resources; |
| 418 | sctx->b.b.set_global_binding = si_set_global_binding; |
| 419 | sctx->b.b.launch_grid = si_launch_grid; |
Tom Stellard | 302f53d | 2012-10-25 13:50:10 -0400 | [diff] [blame] | 420 | } |