/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

13#include "path.h"
14#include "block_pool_cl.h"
15#include "path_builder_cl_12.h"
16#include "kernel_cl_12.h"

//
//
//

#if 0

//
// SIMD AVX2
//

#define SKC_PATHS_COPY_WORDS_PER_ELEM    8
#define SKC_PATHS_COPY_SUBGROUP_SIZE     1
#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES

typedef skc_uint8 skc_paths_copy_elem;
typedef skc_uint8 skc_pb_idx_v;

#define SKC_PATHS_COPY_ELEM_EXPAND()     SKC_EXPAND_8()

#define SKC_IS_NOT_PATH_HEAD(sg,I)       ((sg) + I >= SKC_PATH_HEAD_WORDS)

#endif
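
//
// For contrast, a hypothetical GPU subgroup configuration is sketched
// below. The names and values are illustrative only -- the real,
// device-specific definitions are supplied by "kernel_cl_12.h". Note
// that the live code in this file consumes SKC_PATHS_COPY_ELEM_WORDS
// and SKC_PATHS_COPY_KERNEL_ATTRIBS rather than the names used in the
// disabled SIMD block above.
//

#if 0

//
// SIMD8 GPU (illustrative values only)
//

#define SKC_PATHS_COPY_ELEM_WORDS        1
#define SKC_PATHS_COPY_SUBGROUP_SIZE     8
#define SKC_PATHS_COPY_KERNEL_ATTRIBS    __attribute__((intel_reqd_sub_group_size(8)))

typedef skc_uint skc_paths_copy_elem;
typedef skc_uint skc_pb_idx_v;

#define SKC_PATHS_COPY_ELEM_EXPAND()     SKC_EXPAND_1()

// one plausible form of the head-word test for a scalar-elem subgroup
#define SKC_IS_NOT_PATH_HEAD(sg,I)       (skc_sub_group_local_id() + (sg) >= SKC_PATH_HEAD_WORDS)

#endif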

//
//
//

#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
#define SKC_PATHS_COPY_ELEMS_PER_BLOCK    (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_THREAD   (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)

// FIXME -- use SUBGROUP terminology everywhere
#define SKC_PATHS_COPY_SUBGROUP_WORDS     (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)

//
//
//

#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \
  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))

#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \
  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))

// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
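
//
// Worked example (all values invented for illustration; the real ones
// come from the device config): with SKC_DEVICE_BLOCK_WORDS = 256,
// SKC_DEVICE_SUBBLOCK_WORDS = 16, SKC_PATHS_COPY_ELEM_WORDS = 1 and
// SKC_PATHS_COPY_SUBGROUP_SIZE = 16:
//
//   ELEMS_PER_BLOCK    = 256 / 1  = 256
//   ELEMS_PER_SUBBLOCK = 16  / 1  = 16
//   ELEMS_PER_THREAD   = 256 / 16 = 16
//   SUBGROUP_WORDS     = 16  * 1  = 16
//
// so a 16-lane subgroup makes 16 strided passes to cover one block,
// and a hypothetical 7-word path header would give
//
//   ELEMS_BEFORE_HEADER    = 16 * ((7 / 1) / 16)      = 0
//   ELEMS_INCLUDING_HEADER = 16 * ((7 + 16 - 1) / 16) = 16
//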

//
//
//

//
// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
//

#define SKC_CMD_PATHS_COPY_ONE_BITS  (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)

#define SKC_CMD_PATHS_COPY_ONE_MASK  SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_ONE       (1u << SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_GET_TAG(ti)          SKC_TAGGED_BLOCK_ID_GET_TAG(ti)

#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti)      ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
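
//
// Worked example (bit widths assumed for illustration only): if
// SKC_TAGGED_BLOCK_ID_BITS_TAG were 5 and SKC_DEVICE_SUBBLOCK_WORDS_LOG2
// were 3, then ONE_BITS = 8 and ONE = 0x100. For a command word
// ti = 0x00001234:
//
//   GET_TAG(ti)     -> the low tag bits (via SKC_TAGGED_BLOCK_ID_GET_TAG)
//   GET_ROLLING(ti) -> 0x1234 >> 8 = 0x12, the host-side rolling count
//
// UPDATE_ROLLING(ti,b) keeps the low ONE_BITS of ti and ORs in the
// device block id b shifted above the tag bits, i.e. it swaps the
// rolling counter for a real block id while preserving the tag.
//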

//
//
//

skc_uint
skc_sub_group_local_id()
{
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
  return get_sub_group_local_id();
#else
  return 0;
#endif
}

//
// convert an atomic read counter offset to a block id
//

skc_block_id_t
skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
                 skc_uint                  const       bp_idx_mask,
                 skc_uint                  const       bp_reads,
                 skc_uint                  const       bp_off)
{
  skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;

  return bp_ids[bp_idx];
}
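
//
// Example (values invented for illustration): if the block pool ring
// holds 4096 ids then bp_idx_mask is 0xFFF, and with bp_reads = 4094
// an offset of 3 wraps cleanly:
//
//   (4094 + 3) & 0xFFF = 1
//
// i.e. the pow2 mask implements the ring modulo without a divide.
//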

//
//
//

void
skc_copy_segs(__global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx)
{
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
    }

#if 0
  //
  // NOTE THIS IS PRINTING 8 ROWS
  //
  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
         (skc_uint)get_global_id(0),pb_elems_idx,
         as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
         (skc_uint)get_global_id(0),pb_elems_idx,
         as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
#endif
}
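
//
// Access pattern sketch (assuming, for illustration, a subgroup of
// width W): the caller passes bp_elems_idx/pb_elems_idx already offset
// by the lane id, so lane t touches elems t, t+W, t+2W, ... and the
// subgroup as a whole walks the block with unit-stride, coalesced
// transactions.
//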

//
//
//

void
skc_copy_node(__global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_block_id_t      const * const bp_ids,
              skc_uint                             const bp_reads,
              skc_uint                             const bp_idx_mask,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx,
              skc_uint                             const pb_rolling)
{
  //
  // remap block id tags from the host-side rolling counter to a
  // device-side block pool id
  //
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      // load block_id_tag words
      skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii];

      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // only convert if original elem is not invalid

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
        skc_block_id_t const b = bp_ids[bp_idx C];              \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
      }

      // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem back
      (bp_elems+bp_elems_idx)[ii] = elem;
    }
}
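
//
// Remap sketch (all numbers invented for illustration, with ONE_BITS
// assumed to be 8): given pb_rolling = 0x0500 and a loaded tagged id
// elem = 0x0734, then
//
//   elem - pb_rolling              = 0x0234
//   GET_ROLLING(elem - pb_rolling) = 0x02    -- third block of this copy
//   bp_idx = (bp_reads + 0x02) & bp_idx_mask -- ring slot
//   b      = bp_ids[bp_idx]                  -- real block pool id
//
// UPDATE_ROLLING(elem,b) then splices b in above the tag bits while
// leaving the tag untouched before the node word is stored back.
//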

//
//
//

void
skc_host_map_update(__global skc_uint * const host_map,
                    skc_uint            const block,
                    skc_paths_copy_elem const elem)
{
  //
  // write first elem to map -- FIXME -- this is a little nasty
  // because it relies on the host handle always being the first
  // word in the path header.
  //
  // OTOH, this is not unreasonable.  The alternative is to have a
  // separate kernel initializing the map.
  //
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
  if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
#endif
    {
#if SKC_PATHS_COPY_ELEM_WORDS == 1
      host_map[elem] = block;
#if 0
      printf("[%u] = %u\n",elem,block);
#endif
#else
      host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
#endif
    }
}
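
//
// Net effect (a reading of the code above, not a documented contract):
// host_map[host_handle] = device block id of the path head, so later
// stages can translate a host-side path handle straight into its head
// block without a separate map-initialization pass.
//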

//
//
//

void
skc_copy_head(__global skc_uint                  * const host_map,
              skc_uint                             const block,
              __global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_block_id_t      const * const bp_ids,
              skc_uint                             const bp_reads,
              skc_uint                             const bp_idx_mask,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx,
              skc_uint                             const pb_rolling)
{
  //
  // if there are more path header words than there are
  // threads-per-block then we can just copy the initial header words
  //
#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];

      (bp_elems+bp_elems_idx)[ii] = elem;

      if (ii == 0) {
        skc_host_map_update(host_map,block,elem);
      }
    }
#endif

  //
  // this is similar to copy node but the first H words of the path
  // header are not modified and simply copied
  //
  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];

#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
      if (ii == 0) {
        skc_host_map_update(host_map,block,elem);
      }
#endif
      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      // FIXME -- MIX MIX MIX MIX / SELECT

      // only convert if original elem is not invalid
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
        skc_block_id_t const b = bp_ids[bp_idx C];                      \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);           \
      }

      // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem back
      (bp_elems+bp_elems_idx)[ii] = elem;
    }

  //
  // the remaining words are treated like a node
  //
  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      // load block_id_tag words
      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];

      // calculate ahead of time
      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      // only convert if original elem is not invalid
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
        skc_block_id_t const b = bp_ids[bp_idx C];              \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
      }

      // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem
      (bp_elems+bp_elems_idx)[ii] = elem;
    }
}
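
//
// Head copy, summarized (restating the code above): a head block is
// processed in three phases over the elem index ii --
//
//   [ 0, BEFORE_HEADER )                    : plain copy (pure header words)
//   [ BEFORE_HEADER, INCLUDING_HEADER )     : copy, remapping only lanes past
//                                             the header via SKC_IS_NOT_PATH_HEAD
//   [ INCLUDING_HEADER, ELEMS_PER_BLOCK )   : remap like an ordinary node
//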

//
// FIXME -- pack some of these constant integer args in a vec or struct
//

__kernel
SKC_PATHS_COPY_KERNEL_ATTRIBS
void
skc_kernel_paths_copy
(__global skc_uint                        * const host_map,

 __global skc_block_id_t            const * const bp_ids,
 __global skc_paths_copy_elem             * const bp_elems,
 skc_uint                                   const bp_idx_mask,  // pow2 modulo mask for block pool ring

 __global skc_uint                  const * const bp_alloc,     // block pool ring base
 skc_uint                                   const bp_alloc_idx, // which subbuf

 __global union skc_tagged_block_id const * const pb_cmds,
 __global skc_paths_copy_elem       const * const pb_elems,

 skc_uint                                   const pb_size,      // # of commands/blocks in buffer
 skc_uint                                   const pb_rolling,   // shifted rolling counter base

 skc_uint                                   const pb_prev_from,
 skc_uint                                   const pb_prev_span,
 skc_uint                                   const pb_curr_from)
{
  //
  // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
  //
  // - HEAD
  // - NODE
  // - SEGS
  //
  // THESE ARE SUBGROUP ORIENTED KERNELS
  //
  // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
  //

  //
  // It's likely that peak bandwidth is achievable with a single
  // workgroup.
  //
  // So let's keep the grids modestly sized and, for simplicity and
  // portability, assume that a single workgroup can perform all
  // steps in the copy.
  //
  // Launch as large a workgroup as possible.
  //
  // 1. ATOMICALLY ALLOCATE BLOCKS FROM POOL
  // 2. CONVERT COMMANDS INTO BLOCK OFFSETS
  // 3. FOR EACH COMMAND:
  //    - HEAD: SAVE HEAD ID TO MAP. CONVERT AND COPY H INDICES.
  //    - NODE: CONVERT AND COPY B INDICES
  //    - SEGS: BULK COPY
  //
  // B : number of words in block                     -- always pow2
  // W : intelligently/arbitrarily chosen factor of B -- always pow2
  //

  //
  // There are several approaches to processing the commands:
  //
  // 1. B threads are responsible for one block. All threads broadcast
  //    load a single command word. Workgroup size must be a factor of
  //    B.
  //
  // 2. W threads process an entire block. W will typically be the
  //    device's subgroup/warp/wave width. W threads broadcast load a
  //    single command word.
  //
  // 3. W threads process W blocks. W threads load W command words and
  //    process W blocks.
  //
  // Clearly (1) has low I/O intensity but will achieve high
  // parallelism by activating the most possible threads. The downside
  // of this kind of approach is that the kernel will occupy even a
  // large GPU with low intensity work and reduce opportunities for
  // concurrent kernel execution (of other kernels).
  //
  // See Vasily Volkov's CUDA presentation describing these tradeoffs.
  //
  // Note that there are many other approaches. For example, similar
  // to (1) but each thread loads a pow2 vector of block data.
  //

  // load the copied atomic read "base" from gmem
  skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
  // will always be less than 2^32
  skc_uint const gid      = get_global_id(0);
  // every subgroup/simd that will work on the block loads the same command
  skc_uint const sg_idx   = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
  // path builder data can be spread across two spans
  skc_uint       pb_idx   = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);

  // no need to make this branchless
  if (pb_idx >= pb_size)
    pb_idx -= pb_size;

  // broadcast load the command
  union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx];

  // what do we want to do with this block?
  skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);

  // compute offset from rolling base to get index into block pool ring allocation
  skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);

  // convert the pb_cmd's offset counter to a block id
  skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);

#if 0
  if (get_sub_group_local_id() == 0) {
    printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
    printf("< %8u >\n",block);
  }
#endif

  // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
  skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;

  // calculate bp_elems (to) / pb_elems (from)
  skc_uint const bp_elems_idx = block  * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
  skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK    + tid;

  if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
    {
#if 0
      if (tid == 0)
        printf("%3u, segs\n",bp_off);
#endif
      skc_copy_segs(bp_elems,
                    bp_elems_idx,
                    pb_elems,
                    pb_elems_idx);
    }
  else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
    {
#if 0
      if (tid == 0)
        printf("%3u, NODE\n",bp_off);
#endif
      skc_copy_node(bp_elems, // to
                    bp_elems_idx,
                    bp_ids,
                    bp_reads,
                    bp_idx_mask,
                    pb_elems, // from
                    pb_elems_idx,
                    pb_rolling);
    }
  else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD )
    {
#if 0
      if (tid == 0)
        printf("%3u, HEAD\n",bp_off);
#endif
      skc_copy_head(host_map,
                    block,
                    bp_elems, // to
                    bp_elems_idx,
                    bp_ids,
                    bp_reads,
                    bp_idx_mask,
                    pb_elems, // from
                    pb_elems_idx,
                    pb_rolling);
    }
}

//
//
//

__kernel
SKC_PATHS_ALLOC_KERNEL_ATTRIBS
void
skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
                       __global skc_uint          * const bp_alloc,
                       skc_uint                     const bp_alloc_idx,
                       skc_uint                     const pb_cmd_count)
{
  //
  // allocate blocks in block pool
  //
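  //
  // Reservation sketch (numbers invented for illustration): with
  // pb_cmd_count = 3 and a current READS counter of 100, the atomic_add
  // below returns 100 and bumps the counter to 103. Ring slots
  // bp_ids[(100 + 0..2) & bp_idx_mask] now belong to this copy, and the
  // base value 100 is published in bp_alloc[bp_alloc_idx] for
  // skc_kernel_paths_copy to read back as bp_reads.
  //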
  skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);

  // store in slot
  bp_alloc[bp_alloc_idx] = reads;

#if 0
  printf("pc: %8u + %u\n",reads,pb_cmd_count);
#endif
}

//
//
//