/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "block.h"
#include "path.h"
#include "common.h"
#include "atomic_cl.h"
#include "raster_builder_cl_12.h"
#include "kernel_cl_12.h"

//
//
//

#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)

#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK    (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)

#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD   (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
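
//
// Example of the geometry above (illustrative values, not from this
// header): with SKC_DEVICE_BLOCK_WORDS = 256, SKC_FILLS_EXPAND_ELEM_WORDS
// = 1 and SKC_FILLS_EXPAND_SUBGROUP_SIZE = 16, a block holds 256 elements
// and each lane of the subgroup is responsible for 256 / 16 = 16 of them.
//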

//
//
//

#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)

//
//
//

#if   ( SKC_FILLS_EXPAND_X == 1 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()      SKC_EXPAND_1()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0

#elif ( SKC_FILLS_EXPAND_X == 2 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()      SKC_EXPAND_2()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1

#elif ( SKC_FILLS_EXPAND_X == 4 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()      SKC_EXPAND_4()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3

#elif ( SKC_FILLS_EXPAND_X == 8 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()      SKC_EXPAND_8()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7

#elif ( SKC_FILLS_EXPAND_X == 16 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()      SKC_EXPAND_16()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15

#else
#error "MISSING SKC_FILLS_EXPAND_X"
#endif
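
//
// A sketch of what the SKC_EXPAND_N() macros are assumed to do here:
// SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() stamps out the currently defined
// SKC_EXPAND_X(I,...) once per register slot I -- e.g. SKC_EXPAND_4()
// expands to SKC_EXPAND_X(0,...) through SKC_EXPAND_X(3,...) -- and
// ..._I_LAST names the final slot.
//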

//
// Fill and rasterize cmds only differ in their first word semantics
//

union skc_cmd_expand
{
  union skc_cmd_fill      fill;
  union skc_cmd_rasterize rasterize;
};
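
//
// Note: the kernel below loads a fill cmd and overwrites only its first
// word (cmd.rasterize.nodeword) before appending, so the remaining words
// pass through to the rasterize cmd unchanged.
//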

//
//
//

union skc_path_elem
{
  skc_uint  u32;
  skc_float f32;
};

//
// COMPILE-TIME AND RUN-TIME MACROS
//

#define SKC_ELEM_IN_RANGE(X,I)                                         \
  (skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) &&  \
  (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)

#define SKC_ELEM_GTE(X,I)                               \
  SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)

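//
// Example (illustrative): with SKC_FILLS_EXPAND_SUBGROUP_SIZE = 8,
// SKC_ELEM_IN_RANGE(X,1) is true when 8 <= X < 16, i.e. header word X
// lives in register slot 1. Both tests fold to constants at compile
// time because X and I are literals.
//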

//
// FIXME -- slate these for replacement
//

#define SKC_BROADCAST(E,S,I)                                            \
  sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE)

#define SKC_BROADCAST_LAST_HELPER(E,I)                          \
  sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)

#define SKC_BROADCAST_LAST(E,I)                 \
  SKC_BROADCAST_LAST_HELPER(E,I)
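
//
// Example (illustrative): with SKC_FILLS_EXPAND_SUBGROUP_SIZE = 8,
// header word S = 10 lives in register slot I = 1 at lane 10 - 8 = 2,
// so SKC_BROADCAST(E,10,1) broadcasts E1.u32 from lane 2 to all lanes.
//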

//
//
//

void
skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
                    skc_uint                         * const out_idx,
                    union skc_cmd_expand             * const cmd,
                    union skc_path_elem                const e,
                    skc_uint                           const e_idx)
{
  //
  // FIXME -- we can append a large number of nodeword indices to a
  // local SMEM queue and flush when full. It may or may not be a
  // performance win on some architectures.
  //
  skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;
  skc_uint const offset  = sub_group_scan_inclusive_add(is_elem ? 1 : 0);

  cmd->rasterize.nodeword = e_idx;

  if (is_elem) {
    cmds_out[*out_idx + offset] = cmd->rasterize;
  }

  *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);
}
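
//
// Worked example of the compaction above (illustrative): for an 8-lane
// subgroup with is_elem = { 1,1,0,1,0,0,1,1 }, the inclusive scan gives
// offset = { 1,2,2,3,3,3,4,5 }, so the five valid lanes write to
// cmds_out[*out_idx + 1] .. cmds_out[*out_idx + 5] and *out_idx then
// advances by 5, the offset broadcast from the last lane.
//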

//
//
//

__kernel
SKC_FILLS_EXPAND_KERNEL_ATTRIBS
void
skc_kernel_fills_expand(__global union skc_path_elem     const * const blocks,
                        __global skc_uint       volatile       * const atomics,
                        __global skc_block_id_t          const * const map,
                        __global union skc_cmd_fill      const * const cmds_in,
                        __global union skc_cmd_rasterize       * const cmds_out)
{
  //
  // We need to harmonize the way we determine a subgroup's id. In this
  // kernel it's less important because no local memory is being used.
  // The device/mask calculation that determines the subgroup and lane
  // is still correct, but using the subgroup API directly would make
  // it clearer that we're working with subgroups.
  //
  // Every subgroup/SIMD that will work on the block loads the same command.
  //
#if (__OPENCL_VERSION__ < 200)
  skc_uint const cmd_stride = get_num_sub_groups();
#else
  skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
  skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id();
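
  // Example (illustrative): with cmd_stride = 8 subgroups per workgroup,
  // subgroup 3 of workgroup 5 loads fill command 5 * 8 + 3 = 43.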

  // load fill command -- we reuse y component
  union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] };

  // get the path header block from the map
  skc_block_id_t id = map[cmd.fill.path];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("expand[%u] = %u\n",cmd_idx,id);
#endif

  //
  // blindly load all of the head elements into registers
  //
  skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // pick out count.nodes and count.prims from the header
  //
  skc_uint count_nodes, count_prims;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                   \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) {          \
    count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I);  \
  }                                                               \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) {          \
    count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I);  \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // debug of path head
  //
#if 0
  skc_uint count_blocks;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                     \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) {           \
    count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I);  \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  if (get_sub_group_local_id() == 0)
    printf("path header = { %5u, %5u, %5u }\n",
           count_blocks,count_nodes,count_prims);
#endif

  //
  // acquire slots in the expanded cmd extent
  //
  // decrement the acquired base index by 1 so we can use the inclusive
  // subgroup scan in skc_cmds_out_append()
  //
  skc_uint out_idx = 0;

  if (get_sub_group_local_id() == 0) {
    out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
      (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
  }

  out_idx = sub_group_broadcast(out_idx,0);
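
  // Example (illustrative): if the atomic add returns 100, out_idx is
  // broadcast as 99; the first valid element in the subgroup then has
  // an inclusive-scan offset of 1 and lands at cmds_out[100].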

  //
  // process ids trailing the path header
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                      \
    if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                \
      if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
        h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID;                         \
      }                                                                 \
    }                                                                   \
    skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I,                    \
                        head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
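
  // Register slots that end before SKC_PATH_HEAD_OFFSET_IDS are skipped
  // outright; in the slot straddling the boundary, the header words below
  // the offset were overwritten with SKC_TAGGED_BLOCK_ID_INVALID above,
  // so the append treats them as non-elements and emits nothing for them.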

  //
  // we're done if it was just the header
  //
  if (count_nodes == 0)
    return;

  //
  // otherwise, process the nodes
  //

  //
  // get id of next node
  //
  id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));

  //
  // the following blocks are nodes
  //
  while (true)
    {
      // get index of each element
      skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();

      //
      // blindly load all of the node elements into registers
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];

      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

      //
      // append all valid ids
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I,                      \
                      node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);

      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

      // any more nodes?
      if (--count_nodes == 0)
        return;

      //
      // get id of next node
      //
      id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
    }
}

//
//
//