/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "tile.h"
#include "common.h"
#include "raster.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"

//
//
//

#define SKC_PLACE_SUBGROUP_MASK  (SKC_PLACE_SUBGROUP_SIZE - 1)
#define SKC_PLACE_SUBGROUP_LAST  (SKC_PLACE_SUBGROUP_SIZE - 1)

//
//
//

#define SKC_PLACE_SMEM_COUNT_TTSK  SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SMEM_COUNT_TTPK  SKC_RASTER_NODE_MAX_TTPK

//
//
//

#define SKC_PLACE_X  (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)

//
//
//

#if   ( SKC_PLACE_X == 1 )
#define SKC_PLACE_EXPAND()       SKC_EXPAND_1()
#define SKC_PLACE_EXPAND_I_LAST  0

#elif ( SKC_PLACE_X == 2 )
#define SKC_PLACE_EXPAND()       SKC_EXPAND_2()
#define SKC_PLACE_EXPAND_I_LAST  1

#elif ( SKC_PLACE_X == 4 )
#define SKC_PLACE_EXPAND()       SKC_EXPAND_4()
#define SKC_PLACE_EXPAND_I_LAST  3

#elif ( SKC_PLACE_X == 8 )
#define SKC_PLACE_EXPAND()       SKC_EXPAND_8()
#define SKC_PLACE_EXPAND_I_LAST  7

#elif ( SKC_PLACE_X == 16 )
#define SKC_PLACE_EXPAND()       SKC_EXPAND_16()
#define SKC_PLACE_EXPAND_I_LAST  15
#endif

//
// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
// COALESCED WRITES.  LO FIRST, FOLLOWED BY HI.
//
// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
// KERNELS USE DIFFERENT SUBGROUP SIZES.
//
// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
//
// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32).  VULKAN MAY
// ONLY SUPPORT A SUBGROUP SIZE OF 16.
//

#if   ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )

#define SKC_PLACE_STRIDE_H(L)     (L)
#define SKC_PLACE_STRIDE_V_LO(I)  (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)  (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)

#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_RATIO - 1)
#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I)  ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))

#define SKC_PLACE_STRIDE_H(L)     (L)
#define SKC_PLACE_STRIDE_V_LO(I)  (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)  (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)

#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO       (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK  (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask

#define SKC_PLACE_STRIDE_H(L)     (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
#define SKC_PLACE_STRIDE_V_LO(I)  (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)  (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)

#endif
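
//
// A worked example of the mapping above, assuming the
// SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE case with a
// 16-lane prefix subgroup and an 8-lane place subgroup (ratio 2).
// The prefix kernel emits dwords in the order:
//
//   lo[0..15] hi[0..15] lo[16..31] hi[16..31] ...
//
// and the strides map place row I / lane L onto that layout:
//
//   row 0 : lo at dwords  0.. 7 / hi at dwords 16..23  (keys  0.. 7)
//   row 1 : lo at dwords  8..15 / hi at dwords 24..31  (keys  8..15)
//   row 2 : lo at dwords 32..39 / hi at dwords 48..55  (keys 16..23)
//   row 3 : lo at dwords 40..47 / hi at dwords 56..63  (keys 24..31)
//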

//
// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
//

#define SKC_PLACE_IS_ALL_HEADER_ROW(i)  (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)

#define SKC_PLACE_IS_NOT_HEADER_ROW(i)  ( (i)    * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)

#define SKC_PLACE_IS_TRAILING_ROW(i)    (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)

#define SKC_PLACE_IS_HEADER_ROW_KEY(i,k)  ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))


//
// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX
//
#define SKC_PLACE_HEADER_LESS_THAN(i,k)  ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
#define SKC_PLACE_NODE_LESS_THAN(i,k)    ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k))

//
// TTSK v2:
//
//  0                                         63
//  | TTSB ID | PREFIX |   SPAN  |  X  |  Y  |
//  +---------+--------+---------+-----+-----+
//  |    27   | 1 (=0) | 12 (=0) |  12 |  12 |
//
//
// TTPK v2:
//
//  0                                      63
//  | TTPB ID | PREFIX | SPAN |  X  |  Y  |
//  +---------+--------+------+-----+-----+
//  |    27   | 1 (=1) |  12  |  12 |  12 |
//
//

//
// TTCK (32-BIT COMPARE) v1:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   18  |  7  |  7  |
//
//
// TTCK (32-BIT COMPARE) v2:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   15  |  9  |  8  |
//
//
// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          27          |    1   |    1   |   18  |  9  |  8  |
//

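//
// As a sketch of how the place kernel consumes these keys: each TTSK
// key is rewritten into exactly one TTCK key at tile
// (cmd.tx + X, cmd.ty + Y), while a TTPK key spanning, say, three
// tiles at (X,Y) is expanded into three TTCK keys covering (X,Y),
// (X+1,Y) and (X+2,Y) -- one per tile column the prefix block
// spans -- all carrying the place command's layer id.
//
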
union skc_subgroup_smem
{
  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE

  struct {
    struct {
      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } lo;

    struct {
      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } hi;

    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
  };

};

//
// scatter scan max
//
static
skc_int_v_t
skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
                     skc_int_v_t                                const iss,
                     skc_int_v_t                                const ess)
{
  //
  // prefix sums determine which lanes we're going to work on next
  //
  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
  skc_int_v_t  const scratch_idx      = max(ess,0);

  //
  // SIMT
  //

  //
  // zero the volatile smem scratchpad using vector syntax
  //
  smem->scratch[get_sub_group_local_id()] = ( 0 );

  //
  // store source lane at starting lane
  //
  if (is_scratch_store) {
    smem->scratch[scratch_idx] = get_sub_group_local_id();
  }

  //
  // propagate lanes to right using max scan
  //
  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);

  return source;
}
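
//
// A minimal worked example, assuming an 8-lane subgroup and four pk
// keys spanning { 3, 2, 1, 2 } tiles (lanes 4-7 idle):
//
//   iss     = { 3, 5, 6, 8, 8, 8, 8, 8 }
//   ess     = { 0, 3, 5, 6, 8, 8, 8, 8 }
//   scratch = { 0, 0, 0, 1, 0, 2, 3, 0 }   <-- source lanes scattered to ess
//   source  = { 0, 0, 0, 1, 1, 2, 3, 3 }   <-- after the inclusive max scan
//
// so output slots 0-2 expand pk key 0, slots 3-4 expand key 1, slot 5
// expands key 2 and slots 6-7 expand key 3, with the caller computing
// dx = lane - ess[source].
//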

//
//
//

static
skc_bool
skc_xk_clip(union skc_tile_clip const * const tile_clip,
            skc_ttxk_t                * const xk)
{
  //
  // clip the sk and pk keys
  //
  // if fully clipped then return false
  //
  // alternatively -- we can expand all these keys in place
  //
  // alternatively -- keep sk and pk keys segregated because sk keys
  // represent the vast majority of keys and are easier to process.
  // don't mess with the fastpath!
  //
  return false;
}

//
//
//

static
skc_ttck_t
skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
             union skc_cmd_place             const    * const cmd,
             skc_uint                                    const sk_idx)
{
  skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
  skc_uint const hi = smem->hi.sk[sk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}

static
skc_ttck_t
skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
             union skc_cmd_place             const    * const cmd,
             skc_uint                                    const pk_idx,
             skc_uint                                    const dx)
{
  skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
  skc_uint const hi = smem->hi.pk[pk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty      + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}

//
//
//

static
void
skc_ttsk_flush(__global SKC_ATOMIC_UINT        volatile * const place_atomics,
               __global skc_ttck_t                      * const ck_extent,
               __local union skc_subgroup_smem volatile * const smem,
               union skc_cmd_place             const    * const cmd,
               skc_uint                                   const sk)
{
  //
  // Pretty sure you can never ever have an sk count equal to 0
  //
  skc_uint ck_base = 0;

  // first lane performs the ck extent allocation with an atomic increment
  if (get_sub_group_local_id() == 0) {
    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
  }

  // broadcast base to all lanes
  ck_base = sub_group_broadcast(ck_base,0);

  // convert sk keys to ck keys
  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
    {
      ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
    }
}
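
//
// For example, assuming an 8-lane subgroup and sk == 20, lane 0
// reserves 20 slots in the ck extent and the loop above converts the
// keys in three passes: 0-7, 8-15 and 16-19.
//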

//
//
//

static
skc_int
skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
                  skc_uint                                   const idx)
{
  skc_uint const lo = smem->lo.pk[idx];
  skc_uint const hi = smem->hi.pk[idx];

  skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
  skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;

  return (span_lo | span_hi) + 1;
}
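
//
// The 12-bit span straddles the two 32-bit words of the key: its low
// bits occupy the top of the lo word and the remaining bits the
// bottom of the hi word.  Given the +1 above, the field is stored
// biased by one -- for example, a key covering three tiles stores the
// value 2 in its span field.
//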

//
//
//

static
void
skc_ttpk_flush(__global SKC_ATOMIC_UINT        volatile * const place_atomics,
               __global skc_ttck_t                      * const ck_extent,
               __local union skc_subgroup_smem volatile * const smem,
               union skc_cmd_place             const    * const cmd,
               skc_uint                                   const pk)
{
  // bail out if pk queue is empty
  if (pk == 0)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("%u\n",pk);
#endif

  //
  // FIXME -- this nested loop iterates over the queue processing a
  // subgroup of 64-bit keys at a time.  This is probably not the most
  // efficient approach so investigate how to store and iterate over a
  // wider than subgroup (node-sized) queue of keys.
  //

  // round up so we work with full subgroups
  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
  skc_uint       ii    = 0;

  // nested loop that expands all ttpk keys
#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
#endif
    {
      skc_uint idx  = ii + get_sub_group_local_id();
      skc_int  span = 0;

      // how many tiles does this ttpk span?
      if (idx < pk)
        span = skc_ttpk_get_span(smem,idx);

      // we need inclusive, exclusive and total
      skc_int iss = sub_group_scan_inclusive_add(span);
      skc_int ess = iss - span;
      skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);

      // printf("%u : %u\n",span,iss);
      // continue;

      // atomically allocate space for the pk keys
      skc_uint ck_base = 0;

      // first lane performs the ck extent allocation with an atomic increment
      if (get_sub_group_local_id() == 0) {
        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
      }

      // broadcast atomically allocated extent base to all lanes
      skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();

      //
      // FIXME -- this loop would probably be faster if the ttpk keys
      // were held in registers and accessed with shuffles instead of
      // SMEM loads
      //

      //
      // loop until there are no more expanded pk keys
      //
      while (true)
        {
          skc_int const source = skc_scatter_scan_max(smem,iss,ess);
          skc_int const dx     = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);

          // store valid ck keys to gmem
          if (get_sub_group_local_id() < rem) {
            ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
          }

          // decrement remainder
          rem -= SKC_PLACE_SUBGROUP_SIZE;

          if (rem <= 0)
            break;

          // increment/decrement indices
          ck_idx += SKC_PLACE_SUBGROUP_SIZE;
          iss    -= SKC_PLACE_SUBGROUP_SIZE;
          ess    -= SKC_PLACE_SUBGROUP_SIZE;
        }
    }
}

//
//
//

static
skc_uint
skc_ballot(skc_uint * const xk, skc_uint const is_xk)
{
#if 0
  //
  // FIXME -- when available, this should use the idiom:
  //
  //   ballot() + lane_mask_less_than_or_equal + popcount()
  //
  // Supported by:
  //
  //   - Vulkan 1.1 / SPIR-V 1.3
  //   - CUDA
  //   - AVX2 (SSE*?)
  //
#else
  //
  // otherwise, emulate with an inclusive scan (yuk)
  //
  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);

  skc_uint const xk_idx = *xk + prefix - is_xk;

  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);

#if 0
  printf("< %3u >\n",xk_idx);
#endif

  return xk_idx;
#endif
}
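
//
// For example, with an 8-lane subgroup, *xk == 4 on entry and
// is_xk = { 1, 0, 1, 1, 0, 0, 1, 0 }:
//
//   prefix = { 1, 1, 2, 3, 3, 3, 4, 4 }
//   xk_idx = { 4, -, 5, 6, -, -, 7, - }   <-- only used where is_xk != 0
//   *xk   += 4
//
// which matches what the ballot()/popcount() idiom noted above would
// produce.
//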

//
//
//
__kernel
SKC_PLACE_KERNEL_ATTRIBS
void
skc_kernel_place(__global skc_bp_elem_t             * const bp_elems,
                 __global SKC_ATOMIC_UINT  volatile * const place_atomics,
                 __global skc_ttck_t                * const ck_extent,
                 __global union skc_cmd_place const * const cmds,
                 __global skc_block_id_t            * const map,
                 skc_uint4                            const clip,
                 skc_uint                             const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  __local union skc_subgroup_smem volatile         smem[1];
#else
  __local union skc_subgroup_smem volatile         smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
  __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // This is a subgroup-centric kernel
  //
  // Which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform, but the alternative cmd_idx
  // calculation used when there are multiple subgroups per workgroup
  // is not recognized as uniform and drives register spillage
  // elsewhere.
  //
  // Test the raster's translated bounds against the composition's
  // tile clip
  //
  // There are 3 cases:
  //
  //   - the raster is completely clipped -> return
  //   - the raster is partially clipped  -> all keys must be clipped
  //   - the raster is not clipped        -> no keys are tested
  //
  //
  // There are at least 4 implementations of place and we want to
  // special-case them as much as possible so that, at the least, the
  // fastpath remains fast.
  //
  //   - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
  //
  //   - implement CLIPPED + NO TRANSLATION path
  //
  //   - implement NO CLIP + TRANSLATION path
  //
  //   - implement CLIPPED + TRANSLATION path
  //
  //
  // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
  // 12:12:8 integer where:
  //
  //   12: ttsk
  //   12: ttpk
  //    8: /dev/null -- clipped or invalid key
  //
  // Three kinds of nodes in a raster's list:
  //
  //   - the head node
  //   - an internal node
  //   - the final node
  //

#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  skc_uint const cmd_idx = get_group_id(0);
#else
  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  // load command
  union skc_cmd_place const cmd = cmds[cmd_idx];

  // get the raster header from the raster host id -- scalar
  skc_block_id_t id = map[cmd.raster_h];

  //
  // load all of the head block ttxk keys into registers
  //
  // FIXME -- this pattern lends itself to using the higher
  // performance Intel GEN block load instructions
  //
  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
  union skc_raster_node_elem const h##I = { \
    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \
               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \
  };

  SKC_PLACE_EXPAND();

  //
  // load raster header counts -- we only need the "nodes" and "keys"
  // words but the keys we loaded are doublewords.
  //
  // FIXME -- this can be made portable with compile-time macro expansion
  //
  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS

  //
  //
  //
#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
  printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
         nodes,keys, \
         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
         h##I.u32v2.hi,h##I.u32v2.lo, \
         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

  SKC_PLACE_EXPAND();
#endif

  //
#if 0
  if (get_sub_group_local_id() == 0) {
    printf("place: %u / %u / %u\n",head_id,nodes,keys);
  }
#endif

  {
    //
    // classify every key in the header
    //
    // keys: 0 is not a key / 1 is a key
    // skpk: 0 is sk / 1 is pk
    //
    skc_uint bits_keys = 0;
    skc_uint bits_skpk = 0;

    //
    // calculate bits_keys
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
      if (idx < keys) { \
        bits_keys |= (1u << I); \
      } \
      if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
        if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \
          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
            bits_keys &= ~(1u << I); \
          } \
        } \
      } \
    }

    SKC_PLACE_EXPAND();

    //
    // blindly calculate bits_skpk
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
      bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

    //
    // next pointer is last element of last row.  save it now because
    // this might be recognized as a subgroup-uniform/scalar.
    //
    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

    //
    // append SK keys first
    //
    skc_uint const bits_sk = bits_keys & ~bits_skpk;
    skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
      skc_uint is_sk  = (bits_sk >> I) & 1; \
      skc_uint sk_idx = skc_ballot(&sk,is_sk); \
      if (is_sk) { \
        smem->lo.sk[sk_idx] = h##I.xk.lo; \
        smem->hi.sk[sk_idx] = h##I.xk.hi; \
      } \
    }

    SKC_PLACE_EXPAND();

    //
    // append PK keys next
    //
    skc_uint const bits_pk = bits_keys & bits_skpk;
    skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
      skc_uint is_pk  = (bits_pk >> I) & 1; \
      skc_uint pk_idx = skc_ballot(&pk,is_pk); \
      if (is_pk) { \
        smem->lo.pk[pk_idx] = h##I.xk.lo; \
        smem->hi.pk[pk_idx] = h##I.xk.hi; \
      } \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2u * %2u\n",sk,pk);
#endif
    //
    // flush the keys
    //
    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
  }

  //
  // we're done if there was only a head node
  //
  if (nodes == 0)
    return;

  //
  // decrement keys
  //
  keys -= SKC_RASTER_HEAD_COUNT_KEYS;

  //
  // otherwise, append keys in trailing nodes to smem
  //
  while (true)
    {
      //
      // load all of the node block ttxk keys into registers
      //
      // FIXME -- this pattern lends itself to using the higher
      // performance Intel GEN block load instructions
      //
      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
      union skc_raster_node_elem const n##I = { \
        .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \
                   bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \
      };

      SKC_PLACE_EXPAND();

#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
      printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
             nodes,keys, \
             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
             n##I.u32v2.hi,n##I.u32v2.lo, \
             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

      SKC_PLACE_EXPAND();
#endif

      //
      // classify every key in the node
      //
      // keys: 0 is not a key / 1 is a key
      // skpk: 0 is sk / 1 is pk
      //
      skc_uint bits_keys = 0;
      skc_uint bits_skpk = 0;

      //
      // calculate bits_keys
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
        skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
        if (idx < keys) { \
          bits_keys |= (1u << I); \
        } \
        if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
          if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \
            if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
              bits_keys &= ~(1u << I); \
            } \
          } \
        } \
      }

      SKC_PLACE_EXPAND();

      //
      // blindly calculate bits_skpk
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
        bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
      }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

      //
      // next pointer is last element of last row.  save it now because
      // this might be recognized as a subgroup-uniform/scalar.
      //
      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

      //
      // append SK keys first
      //
      skc_uint const bits_sk = bits_keys & ~bits_skpk;
      skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
        skc_uint is_sk  = (bits_sk >> I) & 1; \
        skc_uint sk_idx = skc_ballot(&sk,is_sk); \
        if (is_sk) { \
          smem->lo.sk[sk_idx] = n##I.xk.lo; \
          smem->hi.sk[sk_idx] = n##I.xk.hi; \
        } \
      }

      SKC_PLACE_EXPAND();

      //
      // append PK keys next
      //
      skc_uint const bits_pk = bits_keys & bits_skpk;
      skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
        skc_uint is_pk  = (bits_pk >> I) & 1; \
        skc_uint pk_idx = skc_ballot(&pk,is_pk); \
        if (is_pk) { \
          smem->lo.pk[pk_idx] = n##I.xk.lo; \
          smem->hi.pk[pk_idx] = n##I.xk.hi; \
        } \
      }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2u * %2u\n",sk,pk);
#endif
      //
      // flush the sk and pk keys accumulated from this node to the
      // ck extent
      //
      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);

      //
      // if this was the last node then we're done
      //
      if (--nodes == 0)
        return;

      //
      // otherwise decrement keys
      //
      keys -= SKC_RASTER_NODE_COUNT_KEYS;
    }
}

//
//
//