Allan MacKinnon | c110e79 | 2018-06-21 09:09:56 -0700 | [diff] [blame^] | 1 | /* |
| 2 | * Copyright 2017 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can |
| 5 | * be found in the LICENSE file. |
| 6 | * |
| 7 | */ |
| 8 | |
| 9 | // |
| 10 | // |
| 11 | // |
| 12 | |
| 13 | #include "block.h" |
| 14 | #include "path.h" |
| 15 | #include "common.h" |
| 16 | #include "atomic_cl.h" |
| 17 | #include "raster_builder_cl_12.h" |
| 18 | #include "kernel_cl_12.h" |
| 19 | |
| 20 | // |
| 21 | // |
| 22 | // |
| 23 | |
// lane-index mask -- assumes SKC_FILLS_EXPAND_SUBGROUP_SIZE is a power of 2
#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)

// how many load elements make up one device block / subblock
#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK    (SKC_DEVICE_BLOCK_WORDS    / SKC_FILLS_EXPAND_ELEM_WORDS)
#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)

// elements each lane handles when a full subgroup loads one block
#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD   (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)

//
//
//

// per-lane register "rows" needed to cover a block -- selects the
// SKC_EXPAND_N() iterator chosen by the #if ladder below
#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
| 36 | |
| 37 | // |
| 38 | // |
| 39 | // |
| 40 | |
// Bind the generic SKC_EXPAND_N() X-macro iterator matching
// SKC_FILLS_EXPAND_X, and record the last iteration index I.  The
// last index is used to pull a block's final element, which holds the
// tagged id of the next node in the path's block chain.
#if   ( SKC_FILLS_EXPAND_X == 1 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()        SKC_EXPAND_1()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST   0

#elif ( SKC_FILLS_EXPAND_X == 2 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()        SKC_EXPAND_2()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST   1

#elif ( SKC_FILLS_EXPAND_X == 4 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()        SKC_EXPAND_4()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST   3

#elif ( SKC_FILLS_EXPAND_X == 8 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()        SKC_EXPAND_8()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST   7

#elif ( SKC_FILLS_EXPAND_X == 16)
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()        SKC_EXPAND_16()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST   15

#else
#error "MISSING SKC_FILLS_EXPAND_X"
#endif
| 64 | |
| 65 | // |
| 66 | // Fill and rasterize cmds only differ in their first word semantics |
| 67 | // |
| 68 | |
// A fill cmd and a rasterize cmd share the same layout except for the
// semantics of their first word, so a single register-resident value
// can be reinterpreted in place: loaded as a fill cmd, emitted as a
// rasterize cmd once its first word is rewritten.
union skc_cmd_expand
{
  union skc_cmd_fill      fill;      // input view  -- loaded from cmds_in
  union skc_cmd_rasterize rasterize; // output view -- stored to cmds_out
};
| 74 | |
| 75 | // |
| 76 | // |
| 77 | // |
| 78 | |
// One 32-bit element of a path block -- viewed either as a raw word
// (tagged block ids, header counts) or as a float (geometry data).
union skc_path_elem
{
  skc_uint  u32;
  skc_float f32;
};
| 84 | |
| 85 | // |
| 86 | // COMPILE-TIME AND RUN-TIME MACROS |
| 87 | // |
| 88 | |
//
// COMPILE-TIME AND RUN-TIME MACROS
//

// True when element offset X lands in register-row I's lane range.
// The whole expansion is parenthesized so the macro composes safely
// with surrounding operators (e.g. `!` or `||`) at the call site --
// the original unparenthesized `A && B` expansion was a precedence
// hazard.
#define SKC_ELEM_IN_RANGE(X,I)                                          \
  ((skc_bool)SKC_GTE_MACRO(X,(I  ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \
   (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE))

// True when element offset X lies at-or-beyond the rows after row I.
// Parenthesized: this macro is negated at its call site, so a bare
// nested-macro expansion must not leak precedence.
#define SKC_ELEM_GTE(X,I)                                               \
  (SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE))

//
// FIXME -- slate these for replacement
//

// broadcast register row I's lane holding element offset S
#define SKC_BROADCAST(E,S,I)                                            \
  (sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE))

// broadcast the last lane of register row I (two-level expansion so I
// itself may be a macro)
#define SKC_BROADCAST_LAST_HELPER(E,I)                                  \
  (sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1))

#define SKC_BROADCAST_LAST(E,I)                                         \
  SKC_BROADCAST_LAST_HELPER(E,I)
| 108 | |
| 109 | // |
| 110 | // |
| 111 | // |
| 112 | |
| 113 | void |
| 114 | skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out, |
| 115 | skc_uint * const out_idx, |
| 116 | union skc_cmd_expand * const cmd, |
| 117 | union skc_path_elem const e, |
| 118 | skc_uint const e_idx) |
| 119 | { |
| 120 | // |
| 121 | // FIXME -- we can append a large number of nodeword indices to a |
| 122 | // local SMEM queue and flush when full. It may or may not be a |
| 123 | // performance win on some architectures. |
| 124 | // |
| 125 | skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT; |
| 126 | skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0); |
| 127 | |
| 128 | cmd->rasterize.nodeword = e_idx; |
| 129 | |
| 130 | if (is_elem) { |
| 131 | cmds_out[*out_idx + offset] = cmd->rasterize; |
| 132 | } |
| 133 | |
| 134 | *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1); |
| 135 | } |
| 136 | |
| 137 | // |
| 138 | // |
| 139 | // |
| 140 | |
//
// FILLS EXPAND
//
// One subgroup expands one fill cmd into a stream of rasterize cmds:
// it walks the fill's path header block and the chain of node blocks
// that follow it, and for every valid tagged subblock id it appends a
// rasterize cmd whose first word is the id's nodeword index.  Output
// slots are acquired with a single global atomic add per path.
//
__kernel
SKC_FILLS_EXPAND_KERNEL_ATTRIBS
void
skc_kernel_fills_expand(__global union skc_path_elem     const * const blocks,
                        __global skc_uint             volatile * const atomics,
                        __global skc_block_id_t          const * const map,
                        __global union skc_cmd_fill      const * const cmds_in,
                        __global union skc_cmd_rasterize       * const cmds_out)
{
  //
  // Need to harmonize the way we determine a subgroup's id. In this
  // kernel it's not as important because no local memory is being
  // used. Although the device/mask calc to determine subgroup and
  // lanes is still proper, we might want to make it clearer that
  // we're working with subgroups by using the subgroup API.
  //
  // every subgroup/simd that will work on the block loads the same command
  //
#if (__OPENCL_VERSION__ < 200)
  skc_uint const cmd_stride = get_num_sub_groups();
#else
  skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
  // one fill cmd per subgroup across the entire launch
  skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id();

  // load fill command -- we reuse y component
  union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] };

  // get the path header block from the map
  skc_block_id_t id = map[cmd.fill.path];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("expand[%u] = %u\n",cmd_idx,id);
#endif

  //
  // blindly load all of the head elements into registers
  // (h0..hN -- one element per lane per register row)
  //
  skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // pick out count.nodes and count.prims from the header
  // (broadcast from whichever lane/row holds each header offset)
  //
  skc_uint count_nodes, count_prims;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) {                \
    count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I);        \
  }                                                                     \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) {                \
    count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I);        \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // debug of path head
  //
#if 0
  skc_uint count_blocks;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) {               \
    count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I);      \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  if (get_sub_group_local_id() == 0)
    printf("path header = { %5u, %5u, %5u }\n",
           count_blocks,count_nodes,count_prims);
#endif

  //
  // acquire slots in the expanded cmd extent
  //
  // decrement prim_idx by 1 so we can use inclusive warp scan later
  //
  skc_uint out_idx = 0;

  // lane 0 performs the single atomic reservation for all count_prims cmds
  if (get_sub_group_local_id() == 0) {
    out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
      (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
  }

  // share the reserved base index with the whole subgroup
  out_idx = sub_group_broadcast(out_idx,0);

  //
  // process ids trailing the path header
  //
  // rows entirely past the header's id section are skipped; within the
  // row that straddles SKC_PATH_HEAD_OFFSET_IDS, lanes still covering
  // header words overwrite their element with INVALID so the append
  // helper ignores them
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                      \
    if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                \
      if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
        h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID;                         \
      }                                                                 \
    }                                                                   \
    skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I,                    \
                        head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // we're done if it was just the header
  //
  if (count_nodes == 0)
    return;

  //
  // otherwise, process the nodes
  //

  //
  // get id of next node -- stored in the head block's last element
  //
  id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));

  //
  // the following blocks are nodes
  //
  while (true)
    {
      // get index of each element
      skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();

      //
      // blindly load all of the node elements into registers
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];

      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

      //
      // append all valid ids
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I,                  \
                          node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);

      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

      // any more nodes?
      if (--count_nodes == 0)
        return;

      //
      // get id of next node -- stored in this node's last element
      //
      id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
    }
}
| 306 | |
| 307 | // |
| 308 | // |
| 309 | // |