/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "tile.h"
#include "common.h"
#include "raster.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"

//
//
//

#define SKC_PLACE_SUBGROUP_MASK  (SKC_PLACE_SUBGROUP_SIZE - 1)
#define SKC_PLACE_SUBGROUP_LAST  (SKC_PLACE_SUBGROUP_SIZE - 1)

//
//
//

#define SKC_PLACE_SMEM_COUNT_TTSK  SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SMEM_COUNT_TTPK  SKC_RASTER_NODE_MAX_TTPK

//
//
//

#define SKC_PLACE_X  (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)

//
//
//

#if   ( SKC_PLACE_X == 1 )
#define SKC_PLACE_EXPAND()       SKC_EXPAND_1()
#define SKC_PLACE_EXPAND_I_LAST  0

#elif ( SKC_PLACE_X == 2 )
#define SKC_PLACE_EXPAND()       SKC_EXPAND_2()
#define SKC_PLACE_EXPAND_I_LAST  1

#elif ( SKC_PLACE_X == 4 )
#define SKC_PLACE_EXPAND()       SKC_EXPAND_4()
#define SKC_PLACE_EXPAND_I_LAST  3

#elif ( SKC_PLACE_X == 8 )
#define SKC_PLACE_EXPAND()       SKC_EXPAND_8()
#define SKC_PLACE_EXPAND_I_LAST  7

#elif ( SKC_PLACE_X == 16 )
#define SKC_PLACE_EXPAND()       SKC_EXPAND_16()
#define SKC_PLACE_EXPAND_I_LAST  15
#endif

//
// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
// COALESCED WRITES.  LO FIRST, FOLLOWED BY HI.
//
// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
// KERNELS USE DIFFERENT SUBGROUP SIZES.
//
// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
//
// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32).  VULKAN MAY
// ONLY SUPPORT A SUBGROUP SIZE OF 16.
//

#if   ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )

#define SKC_PLACE_STRIDE_H(L)     (L)
#define SKC_PLACE_STRIDE_V_LO(I)  (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)  (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)

#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_RATIO - 1)
#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I)  ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))

#define SKC_PLACE_STRIDE_H(L)     (L)
#define SKC_PLACE_STRIDE_V_LO(I)  (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)  (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)

#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO       (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK  (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask

#define SKC_PLACE_STRIDE_H(L)     (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
#define SKC_PLACE_STRIDE_V_LO(I)  (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)  (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)

#endif
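
//
// A worked example of the mapping above, assuming the
// SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE case with a
// 16-lane prefix subgroup and an 8-lane place subgroup (ratio 2).
// The prefix kernel emits dwords in the order:
//
//   lo[0..15] hi[0..15] lo[16..31] hi[16..31] ...
//
// and the strides map place row I / lane L onto that layout:
//
//   row 0 : lo at dwords  0.. 7 / hi at dwords 16..23  (keys  0.. 7)
//   row 1 : lo at dwords  8..15 / hi at dwords 24..31  (keys  8..15)
//   row 2 : lo at dwords 32..39 / hi at dwords 48..55  (keys 16..23)
//   row 3 : lo at dwords 40..47 / hi at dwords 56..63  (keys 24..31)
//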

//
// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
//

#define SKC_PLACE_IS_ALL_HEADER_ROW(i)  (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)

#define SKC_PLACE_IS_NOT_HEADER_ROW(i)  ( (i)    * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)

#define SKC_PLACE_IS_TRAILING_ROW(i)    (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)

#define SKC_PLACE_IS_HEADER_ROW_KEY(i,k)  ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))


//
// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX
//
#define SKC_PLACE_HEADER_LESS_THAN(i,k)  ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
#define SKC_PLACE_NODE_LESS_THAN(i,k)    ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k))

//
// TTSK v2:
//
//  0                                         63
//  | TTSB ID | PREFIX |   SPAN  |  X  |  Y  |
//  +---------+--------+---------+-----+-----+
//  |    27   | 1 (=0) | 12 (=0) |  12 |  12 |
//
//
// TTPK v2:
//
//  0                                      63
//  | TTPB ID | PREFIX | SPAN |  X  |  Y  |
//  +---------+--------+------+-----+-----+
//  |    27   | 1 (=1) |  12  |  12 |  12 |
//
//

//
// TTCK (32-BIT COMPARE) v1:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   18  |  7  |  7  |
//
//
// TTCK (32-BIT COMPARE) v2:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   15  |  9  |  8  |
//
//
// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          27          |    1   |    1   |   18  |  9  |  8  |
//

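//
// As a sketch of how the place kernel consumes these keys: each TTSK
// key is rewritten into exactly one TTCK key at tile
// (cmd.tx + X, cmd.ty + Y), while a TTPK key spanning, say, three
// tiles at (X,Y) is expanded into three TTCK keys covering (X,Y),
// (X+1,Y) and (X+2,Y) -- one per tile column the prefix block
// spans -- all carrying the place command's layer id.
//
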
union skc_subgroup_smem
{
  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE

  struct {
    struct {
      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } lo;

    struct {
      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } hi;

    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
  };

};

//
// scatter scan max
//
static
skc_int_v_t
skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
                     skc_int_v_t                                const iss,
                     skc_int_v_t                                const ess)
{
  //
  // prefix sums determine which lanes we're going to work on next
  //
  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
  skc_int_v_t  const scratch_idx      = max(ess,0);

  //
  // SIMT
  //

  //
  // zero the volatile smem scratchpad using vector syntax
  //
  smem->scratch[get_sub_group_local_id()] = ( 0 );

  //
  // store source lane at starting lane
  //
  if (is_scratch_store) {
    smem->scratch[scratch_idx] = get_sub_group_local_id();
  }

  //
  // propagate lanes to right using max scan
  //
  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);

  return source;
}
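
//
// A minimal worked example, assuming an 8-lane subgroup and four pk
// keys spanning { 3, 2, 1, 2 } tiles (lanes 4-7 idle):
//
//   iss     = { 3, 5, 6, 8, 8, 8, 8, 8 }
//   ess     = { 0, 3, 5, 6, 8, 8, 8, 8 }
//   scratch = { 0, 0, 0, 1, 0, 2, 3, 0 }   <-- source lanes scattered to ess
//   source  = { 0, 0, 0, 1, 1, 2, 3, 3 }   <-- after the inclusive max scan
//
// so output slots 0-2 expand pk key 0, slots 3-4 expand key 1, slot 5
// expands key 2 and slots 6-7 expand key 3, with the caller computing
// dx = lane - ess[source].
//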

//
//
//

static
skc_bool
skc_xk_clip(union skc_tile_clip const * const tile_clip,
            skc_ttxk_t                * const xk)
{
  //
  // clip the sk and pk keys
  //
  // if fully clipped then return false
  //
  // alternatively -- we can expand all these keys in place
  //
  // alternatively -- keep sk and pk keys segregated because sk keys
  // represent the vast majority of keys and are easier to process.
  // don't mess with the fastpath!
  //
  return false;
}

//
//
//

static
skc_ttck_t
skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
             union skc_cmd_place             const    * const cmd,
             skc_uint                                    const sk_idx)
{
  skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
  skc_uint const hi = smem->hi.sk[sk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}

static
skc_ttck_t
skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
             union skc_cmd_place             const    * const cmd,
             skc_uint                                    const pk_idx,
             skc_uint                                    const dx)
{
  skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
  skc_uint const hi = smem->hi.pk[pk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty      + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}

//
//
//

static
void
skc_ttsk_flush(__global SKC_ATOMIC_UINT        volatile * const place_atomics,
               __global skc_ttck_t                      * const ck_extent,
               __local union skc_subgroup_smem volatile * const smem,
               union skc_cmd_place             const    * const cmd,
               skc_uint                                   const sk)
{
  //
  // Pretty sure you can never ever have an sk count equal to 0
  //
  skc_uint ck_base = 0;

  // first lane performs the ck extent allocation with an atomic increment
  if (get_sub_group_local_id() == 0) {
    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
  }

  // broadcast base to all lanes
  ck_base = sub_group_broadcast(ck_base,0);

  // convert sk keys to ck keys
  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
    {
      ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
    }
}
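
//
// For example, assuming an 8-lane subgroup and sk == 20, lane 0
// reserves 20 slots in the ck extent and the loop above converts the
// keys in three passes: 0-7, 8-15 and 16-19.
//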

//
//
//

static
skc_int
skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
                  skc_uint                                   const idx)
{
  skc_uint const lo = smem->lo.pk[idx];
  skc_uint const hi = smem->hi.pk[idx];

  skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
  skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;

  return (span_lo | span_hi) + 1;
}
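
//
// The 12-bit span straddles the two 32-bit words of the key: its low
// bits occupy the top of the lo word and the remaining bits the
// bottom of the hi word.  Given the +1 above, the field is stored
// biased by one -- for example, a key covering three tiles stores the
// value 2 in its span field.
//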

//
//
//

static
void
skc_ttpk_flush(__global SKC_ATOMIC_UINT        volatile * const place_atomics,
               __global skc_ttck_t                      * const ck_extent,
               __local union skc_subgroup_smem volatile * const smem,
               union skc_cmd_place             const    * const cmd,
               skc_uint                                   const pk)
{
  // bail out if pk queue is empty
  if (pk == 0)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("%u\n",pk);
#endif

  //
  // FIXME -- this nested loop iterates over the queue processing a
  // subgroup of 64-bit keys at a time.  This is probably not the most
  // efficient approach so investigate how to store and iterate over a
  // wider than subgroup (node-sized) queue of keys.
  //

  // round up so we work with full subgroups
  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
  skc_uint       ii    = 0;

  // nested loop that expands all ttpk keys
#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
#endif
    {
      skc_uint idx  = ii + get_sub_group_local_id();
      skc_int  span = 0;

      // how many tiles does this ttpk span?
      if (idx < pk)
        span = skc_ttpk_get_span(smem,idx);

      // we need inclusive, exclusive and total
      skc_int iss = sub_group_scan_inclusive_add(span);
      skc_int ess = iss - span;
      skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);

      // printf("%u : %u\n",span,iss);
      // continue;

      // atomically allocate space for the pk keys
      skc_uint ck_base = 0;

      // first lane performs the ck extent allocation with an atomic increment
      if (get_sub_group_local_id() == 0) {
        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
      }

      // broadcast atomically allocated extent base to all lanes
      skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();

      //
      // FIXME -- this loop would probably be faster if the ttpk keys
      // were held in registers and accessed with shuffles instead of
      // SMEM loads
      //

      //
      // loop until there are no more expanded pk keys
      //
      while (true)
        {
          skc_int const source = skc_scatter_scan_max(smem,iss,ess);
          skc_int const dx     = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);

          // store valid ck keys to gmem
          if (get_sub_group_local_id() < rem) {
            ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
          }

          // decrement remainder
          rem -= SKC_PLACE_SUBGROUP_SIZE;

          if (rem <= 0)
            break;

          // increment/decrement indices
          ck_idx += SKC_PLACE_SUBGROUP_SIZE;
          iss    -= SKC_PLACE_SUBGROUP_SIZE;
          ess    -= SKC_PLACE_SUBGROUP_SIZE;
        }
    }
}

//
//
//

static
skc_uint
skc_ballot(skc_uint * const xk, skc_uint const is_xk)
{
#if 0
  //
  // FIXME -- when available, this should use the idiom:
  //
  //   ballot() + lane_mask_less_than_or_equal + popcount()
  //
  // Supported by:
  //
  //   - Vulkan 1.1 / SPIR-V 1.3
  //   - CUDA
  //   - AVX2 (SSE*?)
  //
#else
  //
  // otherwise, emulate with an inclusive scan (yuk)
  //
  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);

  skc_uint const xk_idx = *xk + prefix - is_xk;

  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);

#if 0
  printf("< %3u >\n",xk_idx);
#endif

  return xk_idx;
#endif
}
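
//
// For example, with an 8-lane subgroup, *xk == 4 on entry and
// is_xk = { 1, 0, 1, 1, 0, 0, 1, 0 }:
//
//   prefix = { 1, 1, 2, 3, 3, 3, 4, 4 }
//   xk_idx = { 4, -, 5, 6, -, -, 7, - }   <-- only used where is_xk != 0
//   *xk   += 4
//
// which matches what the ballot()/popcount() idiom noted above would
// produce.
//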

//
//
//
__kernel
SKC_PLACE_KERNEL_ATTRIBS
void
skc_kernel_place(__global skc_bp_elem_t             * const bp_elems,
                 __global SKC_ATOMIC_UINT  volatile * const place_atomics,
                 __global skc_ttck_t                * const ck_extent,
                 __global union skc_cmd_place const * const cmds,
                 __global skc_block_id_t            * const map,
                 skc_uint4                            const clip,
                 skc_uint                             const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  __local union skc_subgroup_smem volatile         smem[1];
#else
  __local union skc_subgroup_smem volatile         smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
  __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // This is a subgroup-centric kernel
  //
  // Which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform, but the alternative cmd_idx
  // calculation used when there are multiple subgroups per workgroup
  // is not recognized as uniform and drives register spillage
  // elsewhere.
  //
  // Test the raster's translated bounds against the composition's
  // tile clip
  //
  // There are 3 cases:
  //
  //   - the raster is completely clipped -> return
  //   - the raster is partially clipped  -> all keys must be clipped
  //   - the raster is not clipped        -> no keys are tested
  //
  //
  // There are at least 4 implementations of place and we want to
  // special-case them as much as possible so that, at the least, the
  // fastpath remains fast.
  //
  //   - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
  //
  //   - implement CLIPPED + NO TRANSLATION path
  //
  //   - implement NO CLIP + TRANSLATION path
  //
  //   - implement CLIPPED + TRANSLATION path
  //
  //
  // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
  // 12:12:8 integer where:
  //
  //   12: ttsk
  //   12: ttpk
  //    8: /dev/null -- clipped or invalid key
  //
  // Three kinds of nodes in a raster's list:
  //
  //   - the head node
  //   - an internal node
  //   - the final node
  //

#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  skc_uint const cmd_idx = get_group_id(0);
#else
  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  // load command
  union skc_cmd_place const cmd = cmds[cmd_idx];

  // get the raster header from the raster host id -- scalar
  skc_block_id_t id = map[cmd.raster_h];

  //
  // load all of the head block ttxk keys into registers
  //
  // FIXME -- this pattern lends itself to using the higher
  // performance Intel GEN block load instructions
  //
  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
  union skc_raster_node_elem const h##I = { \
    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \
               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \
  };

  SKC_PLACE_EXPAND();

  //
  // load raster header counts -- we only need the "nodes" and "keys"
  // words but the keys we loaded are doublewords.
  //
  // FIXME -- this can be made portable with compile-time macro expansion
  //
  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS

  //
  //
  //
#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
  printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
         nodes,keys, \
         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
         h##I.u32v2.hi,h##I.u32v2.lo, \
         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

  SKC_PLACE_EXPAND();
#endif

  //
#if 0
  if (get_sub_group_local_id() == 0) {
    printf("place: %u / %u / %u\n",head_id,nodes,keys);
  }
#endif

  {
    //
    // classify every key in the header
    //
    // keys: 0 is not a key / 1 is a key
    // skpk: 0 is sk / 1 is pk
    //
    skc_uint bits_keys = 0;
    skc_uint bits_skpk = 0;

    //
    // calculate bits_keys
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
      if (idx < keys) { \
        bits_keys |= (1u << I); \
      } \
      if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
        if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \
          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
            bits_keys &= ~(1u << I); \
          } \
        } \
      } \
    }

    SKC_PLACE_EXPAND();

    //
    // blindly calculate bits_skpk
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
      bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

    //
    // next pointer is last element of last row.  save it now because
    // this might be recognized as a subgroup-uniform/scalar.
    //
    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

    //
    // append SK keys first
    //
    skc_uint const bits_sk = bits_keys & ~bits_skpk;
    skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
      skc_uint is_sk  = (bits_sk >> I) & 1; \
      skc_uint sk_idx = skc_ballot(&sk,is_sk); \
      if (is_sk) { \
        smem->lo.sk[sk_idx] = h##I.xk.lo; \
        smem->hi.sk[sk_idx] = h##I.xk.hi; \
      } \
    }

    SKC_PLACE_EXPAND();

    //
    // append PK keys next
    //
    skc_uint const bits_pk = bits_keys & bits_skpk;
    skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
      skc_uint is_pk  = (bits_pk >> I) & 1; \
      skc_uint pk_idx = skc_ballot(&pk,is_pk); \
      if (is_pk) { \
        smem->lo.pk[pk_idx] = h##I.xk.lo; \
        smem->hi.pk[pk_idx] = h##I.xk.hi; \
      } \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2u * %2u\n",sk,pk);
#endif
    //
    // flush the keys
    //
    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
  }

  //
  // we're done if there was only a head node
  //
  if (nodes == 0)
    return;

  //
  // decrement keys
  //
  keys -= SKC_RASTER_HEAD_COUNT_KEYS;

  //
  // otherwise, append keys in trailing nodes to smem
  //
  while (true)
    {
      //
      // load all of the node block ttxk keys into registers
      //
      // FIXME -- this pattern lends itself to using the higher
      // performance Intel GEN block load instructions
      //
      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
      union skc_raster_node_elem const n##I = { \
        .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \
                   bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \
      };

      SKC_PLACE_EXPAND();

#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
      printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
             nodes,keys, \
             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
             n##I.u32v2.hi,n##I.u32v2.lo, \
             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

      SKC_PLACE_EXPAND();
#endif

      //
      // classify every key in the node
      //
      // keys: 0 is not a key / 1 is a key
      // skpk: 0 is sk / 1 is pk
      //
      skc_uint bits_keys = 0;
      skc_uint bits_skpk = 0;

      //
      // calculate bits_keys
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
        skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
        if (idx < keys) { \
          bits_keys |= (1u << I); \
        } \
        if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
          if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \
            if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
              bits_keys &= ~(1u << I); \
            } \
          } \
        } \
      }

      SKC_PLACE_EXPAND();

      //
      // blindly calculate bits_skpk
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
        bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
      }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

      //
      // next pointer is last element of last row.  save it now because
      // this might be recognized as a subgroup-uniform/scalar.
      //
      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

      //
      // append SK keys first
      //
      skc_uint const bits_sk = bits_keys & ~bits_skpk;
      skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
        skc_uint is_sk  = (bits_sk >> I) & 1; \
        skc_uint sk_idx = skc_ballot(&sk,is_sk); \
        if (is_sk) { \
          smem->lo.sk[sk_idx] = n##I.xk.lo; \
          smem->hi.sk[sk_idx] = n##I.xk.hi; \
        } \
      }

      SKC_PLACE_EXPAND();

      //
      // append PK keys next
      //
      skc_uint const bits_pk = bits_keys & bits_skpk;
      skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
        skc_uint is_pk  = (bits_pk >> I) & 1; \
        skc_uint pk_idx = skc_ballot(&pk,is_pk); \
        if (is_pk) { \
          smem->lo.pk[pk_idx] = n##I.xk.lo; \
          smem->hi.pk[pk_idx] = n##I.xk.hi; \
        } \
      }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2u * %2u\n",sk,pk);
#endif
      //
      // flush the sk and pk keys accumulated from this node to the
      // ck extent
      //
      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);

      //
      // if this was the last node then we're done
      //
      if (--nodes == 0)
        return;

      //
      // otherwise decrement keys
      //
      keys -= SKC_RASTER_NODE_COUNT_KEYS;
    }
}

//
//
//