/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

13#include "path.h"
14#include "block_pool_cl.h"
15#include "path_builder_cl_12.h"
16#include "kernel_cl_12.h"

//
//
//

#if 0

//
// SIMD AVX2
//

#define SKC_PATHS_COPY_WORDS_PER_ELEM    8
#define SKC_PATHS_COPY_SUBGROUP_SIZE     1
#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES

typedef skc_uint8 skc_paths_copy_elem;
typedef skc_uint8 skc_pb_idx_v;

#define SKC_PATHS_COPY_ELEM_EXPAND()     SKC_EXPAND_8()

#define SKC_IS_NOT_PATH_HEAD(sg,I)       ((sg) + I >= SKC_PATH_HEAD_WORDS)

#endif
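
//
// For contrast, a hypothetical GPU subgroup configuration is sketched
// below. The names and values are illustrative only -- the real,
// device-specific definitions are supplied by "kernel_cl_12.h". Note
// that the live code in this file consumes SKC_PATHS_COPY_ELEM_WORDS
// and SKC_PATHS_COPY_KERNEL_ATTRIBS rather than the names used in the
// disabled SIMD block above.
//

#if 0

//
// SIMD8 GPU (illustrative values only)
//

#define SKC_PATHS_COPY_ELEM_WORDS        1
#define SKC_PATHS_COPY_SUBGROUP_SIZE     8
#define SKC_PATHS_COPY_KERNEL_ATTRIBS    __attribute__((intel_reqd_sub_group_size(8)))

typedef skc_uint skc_paths_copy_elem;
typedef skc_uint skc_pb_idx_v;

#define SKC_PATHS_COPY_ELEM_EXPAND()     SKC_EXPAND_1()

// one plausible form of the head-word test for a scalar-elem subgroup
#define SKC_IS_NOT_PATH_HEAD(sg,I)       (skc_sub_group_local_id() + (sg) >= SKC_PATH_HEAD_WORDS)

#endif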

//
//
//

#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
#define SKC_PATHS_COPY_ELEMS_PER_BLOCK    (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_THREAD   (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)

// FIXME -- use SUBGROUP terminology everywhere
#define SKC_PATHS_COPY_SUBGROUP_WORDS     (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)

//
//
//

#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \
  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))

#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \
  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))

// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
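
//
// Worked example (all values invented for illustration; the real ones
// come from the device config): with SKC_DEVICE_BLOCK_WORDS = 256,
// SKC_DEVICE_SUBBLOCK_WORDS = 16, SKC_PATHS_COPY_ELEM_WORDS = 1 and
// SKC_PATHS_COPY_SUBGROUP_SIZE = 16:
//
//   ELEMS_PER_BLOCK    = 256 / 1  = 256
//   ELEMS_PER_SUBBLOCK = 16  / 1  = 16
//   ELEMS_PER_THREAD   = 256 / 16 = 16
//   SUBGROUP_WORDS     = 16  * 1  = 16
//
// so a 16-lane subgroup makes 16 strided passes to cover one block,
// and a hypothetical 7-word path header would give
//
//   ELEMS_BEFORE_HEADER    = 16 * ((7 / 1) / 16)      = 0
//   ELEMS_INCLUDING_HEADER = 16 * ((7 + 16 - 1) / 16) = 16
//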

//
//
//

//
// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
//

#define SKC_CMD_PATHS_COPY_ONE_BITS  (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)

#define SKC_CMD_PATHS_COPY_ONE_MASK  SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_ONE       (1u << SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_GET_TAG(ti)          SKC_TAGGED_BLOCK_ID_GET_TAG(ti)

#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti)      ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
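
//
// Worked example (bit widths assumed for illustration only): if
// SKC_TAGGED_BLOCK_ID_BITS_TAG were 5 and SKC_DEVICE_SUBBLOCK_WORDS_LOG2
// were 3, then ONE_BITS = 8 and ONE = 0x100. For a command word
// ti = 0x00001234:
//
//   GET_TAG(ti)     -> the low tag bits (via SKC_TAGGED_BLOCK_ID_GET_TAG)
//   GET_ROLLING(ti) -> 0x1234 >> 8 = 0x12, the host-side rolling count
//
// UPDATE_ROLLING(ti,b) keeps the low ONE_BITS of ti and ORs in the
// device block id b shifted above the tag bits, i.e. it swaps the
// rolling counter for a real block id while preserving the tag.
//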

//
//
//

skc_uint
skc_sub_group_local_id()
{
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
  return get_sub_group_local_id();
#else
  return 0;
#endif
}

//
// convert an atomic read counter offset to a block id
//

skc_block_id_t
skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
                 skc_uint                  const       bp_idx_mask,
                 skc_uint                  const       bp_reads,
                 skc_uint                  const       bp_off)
{
  skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;

  return bp_ids[bp_idx];
}
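
//
// Example (values invented for illustration): if the block pool ring
// holds 4096 ids then bp_idx_mask is 0xFFF, and with bp_reads = 4094
// an offset of 3 wraps cleanly:
//
//   (4094 + 3) & 0xFFF = 1
//
// i.e. the pow2 mask implements the ring modulo without a divide.
//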

//
//
//

void
skc_copy_segs(__global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx)
{
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
    }

#if 0
  //
  // NOTE THIS IS PRINTING 8 ROWS
  //
  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
         (skc_uint)get_global_id(0),pb_elems_idx,
         as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
         (skc_uint)get_global_id(0),pb_elems_idx,
         as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
#endif
}
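
//
// Access pattern sketch (assuming, for illustration, a subgroup of
// width W): the caller passes bp_elems_idx/pb_elems_idx already offset
// by the lane id, so lane t touches elems t, t+W, t+2W, ... and the
// subgroup as a whole walks the block with unit-stride, coalesced
// transactions.
//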

//
//
//

void
skc_copy_node(__global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_block_id_t      const * const bp_ids,
              skc_uint                             const bp_reads,
              skc_uint                             const bp_idx_mask,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx,
              skc_uint                             const pb_rolling)
{
  //
  // remap block id tags from the host-side rolling counter to a
  // device-side block pool id
  //
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      // load block_id_tag words
      skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii];

      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // only convert if original elem is not invalid

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
        skc_block_id_t const b = bp_ids[bp_idx C];              \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
      }

      // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem back
      (bp_elems+bp_elems_idx)[ii] = elem;
    }
}
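
//
// Remap sketch (all numbers invented for illustration, with ONE_BITS
// assumed to be 8): given pb_rolling = 0x0500 and a loaded tagged id
// elem = 0x0734, then
//
//   elem - pb_rolling              = 0x0234
//   GET_ROLLING(elem - pb_rolling) = 0x02    -- third block of this copy
//   bp_idx = (bp_reads + 0x02) & bp_idx_mask -- ring slot
//   b      = bp_ids[bp_idx]                  -- real block pool id
//
// UPDATE_ROLLING(elem,b) then splices b in above the tag bits while
// leaving the tag untouched before the node word is stored back.
//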

//
//
//

void
skc_host_map_update(__global skc_uint * const host_map,
                    skc_uint            const block,
                    skc_paths_copy_elem const elem)
{
  //
  // write first elem to map -- FIXME -- this is a little nasty
  // because it relies on the host handle always being the first
  // word in the path header.
  //
  // OTOH, this is not unreasonable.  The alternative is to have a
  // separate kernel initializing the map.
  //
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
  if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
#endif
    {
#if SKC_PATHS_COPY_ELEM_WORDS == 1
      host_map[elem] = block;
#if 0
      printf("[%u] = %u\n",elem,block);
#endif
#else
      host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
#endif
    }
}
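
//
// Net effect (a reading of the code above, not a documented contract):
// host_map[host_handle] = device block id of the path head, so later
// stages can translate a host-side path handle straight into its head
// block without a separate map-initialization pass.
//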

//
//
//

void
skc_copy_head(__global skc_uint                  * const host_map,
              skc_uint                             const block,
              __global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_block_id_t      const * const bp_ids,
              skc_uint                             const bp_reads,
              skc_uint                             const bp_idx_mask,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx,
              skc_uint                             const pb_rolling)
{
  //
  // if there are more path header words than there are
  // threads-per-block then we can just copy the initial header words
  //
#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];

      (bp_elems+bp_elems_idx)[ii] = elem;

      if (ii == 0) {
        skc_host_map_update(host_map,block,elem);
      }
    }
#endif

  //
  // this is similar to copy node but the first H words of the path
  // header are not modified and simply copied
  //
  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];

#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
      if (ii == 0) {
        skc_host_map_update(host_map,block,elem);
      }
#endif
      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      // FIXME -- MIX MIX MIX MIX / SELECT

      // only convert if original elem is not invalid
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
        skc_block_id_t const b = bp_ids[bp_idx C];                      \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);           \
      }

      // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem back
      (bp_elems+bp_elems_idx)[ii] = elem;
    }

  //
  // the remaining words are treated like a node
  //
  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      // load block_id_tag words
      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];

      // calculate ahead of time
      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      // only convert if original elem is not invalid
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
        skc_block_id_t const b = bp_ids[bp_idx C];              \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
      }

      // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem
      (bp_elems+bp_elems_idx)[ii] = elem;
    }
}
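
//
// Head copy, summarized (restating the code above): a head block is
// processed in three phases over the elem index ii --
//
//   [ 0, BEFORE_HEADER )                    : plain copy (pure header words)
//   [ BEFORE_HEADER, INCLUDING_HEADER )     : copy, remapping only lanes past
//                                             the header via SKC_IS_NOT_PATH_HEAD
//   [ INCLUDING_HEADER, ELEMS_PER_BLOCK )   : remap like an ordinary node
//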

//
// FIXME -- pack some of these constant integer args in a vec or struct
//

__kernel
SKC_PATHS_COPY_KERNEL_ATTRIBS
void
skc_kernel_paths_copy
(__global skc_uint                        * const host_map,

 __global skc_block_id_t            const * const bp_ids,
 __global skc_paths_copy_elem             * const bp_elems,
 skc_uint                                   const bp_idx_mask,  // pow2 modulo mask for block pool ring

 __global skc_uint                  const * const bp_alloc,     // block pool ring base
 skc_uint                                   const bp_alloc_idx, // which subbuf

 __global union skc_tagged_block_id const * const pb_cmds,
 __global skc_paths_copy_elem       const * const pb_elems,

 skc_uint                                   const pb_size,      // # of commands/blocks in buffer
 skc_uint                                   const pb_rolling,   // shifted rolling counter base

 skc_uint                                   const pb_prev_from,
 skc_uint                                   const pb_prev_span,
 skc_uint                                   const pb_curr_from)
{
  //
  // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
  //
  // - HEAD
  // - NODE
  // - SEGS
  //
  // THESE ARE SUBGROUP ORIENTED KERNELS
  //
  // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
  //

  //
  // It's likely that peak bandwidth is achievable with a single
  // workgroup.
  //
  // So let's keep the grids modestly sized and, for simplicity and
  // portability, assume that a single workgroup can perform all
  // steps in the copy.
  //
  // Launch as large a workgroup as possible.
  //
  // 1. ATOMICALLY ALLOCATE BLOCKS FROM POOL
  // 2. CONVERT COMMANDS INTO BLOCK OFFSETS
  // 3. FOR EACH COMMAND:
  //    - HEAD: SAVE HEAD ID TO MAP. CONVERT AND COPY H INDICES.
  //    - NODE: CONVERT AND COPY B INDICES
  //    - SEGS: BULK COPY
  //
  // B : number of words in block                     -- always pow2
  // W : intelligently/arbitrarily chosen factor of B -- always pow2
  //

  //
  // There are several approaches to processing the commands:
  //
  // 1. B threads are responsible for one block. All threads broadcast
  //    load a single command word. Workgroup size must be a factor of
  //    B.
  //
  // 2. W threads process an entire block. W will typically be the
  //    device's subgroup/warp/wave width. W threads broadcast load a
  //    single command word.
  //
  // 3. W threads process W blocks. W threads load W command words and
  //    process W blocks.
  //
  // Clearly (1) has low I/O intensity but will achieve high
  // parallelism by activating the most possible threads. The downside
  // of this kind of approach is that the kernel will occupy even a
  // large GPU with low intensity work and reduce opportunities for
  // concurrent kernel execution (of other kernels).
  //
  // See Vasily Volkov's CUDA presentation describing these tradeoffs.
  //
  // Note that there are many other approaches. For example, similar
  // to (1) but each thread loads a pow2 vector of block data.
  //

  // load the copied atomic read "base" from gmem
  skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
  // will always be less than 2^32
  skc_uint const gid      = get_global_id(0);
  // every subgroup/simd that will work on the block loads the same command
  skc_uint const sg_idx   = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
  // path builder data can be spread across two spans
  skc_uint       pb_idx   = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);

  // no need to make this branchless
  if (pb_idx >= pb_size)
    pb_idx -= pb_size;

  // broadcast load the command
  union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx];

  // what do we want to do with this block?
  skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);

  // compute offset from rolling base to get index into block pool ring allocation
  skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);

  // convert the pb_cmd's offset counter to a block id
  skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);

#if 0
  if (get_sub_group_local_id() == 0) {
    printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
    printf("< %8u >\n",block);
  }
#endif

  // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
  skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;

  // calculate bp_elems (to) / pb_elems (from)
  skc_uint const bp_elems_idx = block  * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
  skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK    + tid;

  if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
    {
#if 0
      if (tid == 0)
        printf("%3u, segs\n",bp_off);
#endif
      skc_copy_segs(bp_elems,
                    bp_elems_idx,
                    pb_elems,
                    pb_elems_idx);
    }
  else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
    {
#if 0
      if (tid == 0)
        printf("%3u, NODE\n",bp_off);
#endif
      skc_copy_node(bp_elems, // to
                    bp_elems_idx,
                    bp_ids,
                    bp_reads,
                    bp_idx_mask,
                    pb_elems, // from
                    pb_elems_idx,
                    pb_rolling);
    }
  else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD )
    {
#if 0
      if (tid == 0)
        printf("%3u, HEAD\n",bp_off);
#endif
      skc_copy_head(host_map,
                    block,
                    bp_elems, // to
                    bp_elems_idx,
                    bp_ids,
                    bp_reads,
                    bp_idx_mask,
                    pb_elems, // from
                    pb_elems_idx,
                    pb_rolling);
    }
}

//
//
//

__kernel
SKC_PATHS_ALLOC_KERNEL_ATTRIBS
void
skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
                       __global skc_uint          * const bp_alloc,
                       skc_uint                     const bp_alloc_idx,
                       skc_uint                     const pb_cmd_count)
{
  //
  // allocate blocks in block pool
  //
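  //
  // Reservation sketch (numbers invented for illustration): with
  // pb_cmd_count = 3 and a current READS counter of 100, the atomic_add
  // below returns 100 and bumps the counter to 103. Ring slots
  // bp_ids[(100 + 0..2) & bp_idx_mask] now belong to this copy, and the
  // base value 100 is published in bp_alloc[bp_alloc_idx] for
  // skc_kernel_paths_copy to read back as bp_reads.
  //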
  skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);

  // store in slot
  bp_alloc[bp_alloc_idx] = reads;

#if 0
  printf("pc: %8u + %u\n",reads,pb_cmd_count);
#endif
}

//
//
//