| /* |
| * Copyright 2017 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can |
| * be found in the LICENSE file. |
| * |
| */ |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <assert.h> |
| |
| #include "common/cl/assert_cl.h" |
| |
| #include "tile.h" |
| #include "raster.h" |
| #include "macros.h" |
| |
| #include "config_cl.h" |
| #include "runtime_cl_12.h" |
| |
| #include "kernel_cl_12.h" |
| #include "device_cl_12.h" |
| |
| #include "hs/cl/hs_cl_launcher.h" |
| #include "hs/cl/gen9/hs_cl.h" |
| |
| // |
| // |
| // |
| |
//
// Select the format of the embedded kernel programs.
//
// Exactly one of these must be non-zero: the conditional blocks that
// follow test them in the order SPIRV, BINARY, SRC and silently use
// the first non-zero one.
//
#define SKC_KERNEL_SPIRV  0
#define SKC_KERNEL_BINARY 1
#define SKC_KERNEL_SRC    0

#if ((SKC_KERNEL_SPIRV != 0) + (SKC_KERNEL_BINARY != 0) + (SKC_KERNEL_SRC != 0)) != 1
#error "exactly one of SKC_KERNEL_SPIRV, SKC_KERNEL_BINARY or SKC_KERNEL_SRC must be enabled"
#endif
| |
| // |
| // |
| // |
| |
| #if SKC_KERNEL_SPIRV |
| |
| #include "inl/block_pool_init.pre.spv.inl" |
| #include "inl/paths_copy.pre.spv.inl" |
| #include "inl/fills_expand.pre.spv.inl" |
| #include "inl/rasterize.pre.spv.inl" |
| #include "inl/segment_ttrk.pre.spv.inl" |
| #include "inl/rasters_alloc.pre.spv.inl" |
| #include "inl/prefix.pre.spv.inl" |
| #include "inl/place.pre.spv.inl" |
| #include "inl/segment_ttck.pre.spv.inl" |
| #include "inl/render.pre.spv.inl" |
| #include "inl/paths_reclaim.pre.spv.inl" |
| #include "inl/rasters_reclaim.pre.spv.inl" |
| |
| #elif SKC_KERNEL_BINARY |
| |
| #include "inl/block_pool_init.pre.bin.inl" |
| #include "inl/paths_copy.pre.bin.inl" |
| #include "inl/fills_expand.pre.bin.inl" |
| #include "inl/rasterize.pre.bin.inl" |
| #include "inl/segment_ttrk.pre.bin.inl" |
| #include "inl/rasters_alloc.pre.bin.inl" |
| #include "inl/prefix.pre.bin.inl" |
| #include "inl/place.pre.bin.inl" |
| #include "inl/segment_ttck.pre.bin.inl" |
| #include "inl/render.pre.bin.inl" |
| #include "inl/paths_reclaim.pre.bin.inl" |
| #include "inl/rasters_reclaim.pre.bin.inl" |
| |
| #elif SKC_KERNEL_SRC |
| |
| #include "inl/block_pool_init.pre.src.inl" |
| #include "inl/paths_copy.pre.src.inl" |
| #include "inl/fills_expand.pre.src.inl" |
| #include "inl/rasterize.pre.src.inl" |
| #include "inl/segment_ttrk.pre.src.inl" |
| #include "inl/rasters_alloc.pre.src.inl" |
| #include "inl/prefix.pre.src.inl" |
| #include "inl/place.pre.src.inl" |
| #include "inl/segment_ttck.pre.src.inl" |
| #include "inl/render.pre.src.inl" |
| #include "inl/paths_reclaim.pre.src.inl" |
| #include "inl/rasters_reclaim.pre.src.inl" |
| |
| #endif |
| |
| // |
| // FIXME -- THE CONFIG INITIALIZATION IS ONLY HERE TEMPORARILY |
| // |
| // FIXME -- move these to log2 values where appropriate |
| // |
| |
| static |
| struct skc_config const config = |
| { |
| .suballocator = { |
| .host = { |
| .size = 1024 * 1024, // words |
| .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t))) |
| }, |
| .device = { |
| .size = 128 * 1024 * 1024, |
| .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t))) |
| } |
| }, |
| |
| .scheduler = { |
| .size = 4096 // 128 // FIXME -- this is just for testing -- way too big -- schedulees should bring their own state |
| }, |
| |
| .subblock = { |
| .words = SKC_DEVICE_SUBBLOCK_WORDS, // words per subblock -- pow2 |
| .bytes = SKC_DEVICE_SUBBLOCK_WORDS * sizeof(skc_uint) // bytes per subblock -- pow2 |
| }, |
| |
| .block = { |
| .words = SKC_DEVICE_BLOCK_WORDS, // words per block -- pow2 |
| .bytes = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint), // bytes per block -- pow2 |
| .subblocks = SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS // subblocks per block -- block.bytes >= subblock.bytes |
| }, |
| |
| .block_pool = { |
| .pool_size = 524288, // blocks in pool -- 128 MB |
| .ring_pow2 = 524288, // blocks in pool rounded up pow2 |
| .ring_mask = 524288 - 1 |
| }, |
| |
| .cq_pool = { |
| #ifndef NDEBUG |
| .cq_props = CL_QUEUE_PROFILING_ENABLE, |
| #else |
| .cq_props = 0, |
| #endif |
| .size = 8 |
| }, |
| |
| .handle_pool = { |
| .size = 262144, // large fraction of block pool size (for now, 1:2) |
| .width = SKC_RECLAIM_ARRAY_SIZE, |
| .recs = 256 // too many? too few? |
| }, |
| |
| .tile = { |
| .width = SKC_TILE_WIDTH, // tile width in pixels |
| .height = SKC_TILE_HEIGHT, // tile height in pixels |
| .ratio = SKC_TILE_HEIGHT / SKC_TILE_WIDTH // subblocks per TTPB |
| }, |
| |
| .paths_copy = { |
| |
| .buffer = { |
| .count = 16 // # of subbufs in buffer |
| }, |
| |
| .subbuf = { |
| .count = 1024 // # of blocks/commands in subbuf |
| }, |
| |
| .block = { |
| .subbuf = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024, // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN |
| .buffer = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024 * 16 // block.bytes * subbuf.blocks * subbuf.count |
| }, |
| |
| .command = { |
| .subbuf = sizeof(skc_uint) * 1024, // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN |
| .buffer = sizeof(skc_uint) * 1024 * 16 // sizeof(skc_uint) * subbuf.blocks * subbuf.count |
| }, |
| |
| // skc_uint paths_lowat; |
| }, |
| |
| .raster_cohort = { |
| .path_ids = { |
| .elem_count = 8192, |
| .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER |
| }, |
| |
| .transforms = { |
| .elem_count = 8192, |
| .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER |
| }, |
| |
| .clips = { |
| .elem_count = 8192, |
| .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER |
| }, |
| |
| .fill = { |
| .elem_count = 8192, |
| .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER |
| }, |
| |
| .raster_ids = { |
| .elem_count = 8192, |
| .snap_count = (1<<SKC_TTRK_HI_BITS_COHORT) // 256 |
| }, |
| |
| .expand = { |
| .cmds = 1024*128, |
| }, |
| |
| .rasterize = { |
| .keys = 1024*1024 |
| } |
| }, |
| |
| .composition = { |
| .cmds = { |
| .elem_count = 1024*16, |
| .snap_count = 1024 |
| }, |
| .raster_ids = { |
| .elem_count = 1024*1024 |
| }, |
| .keys = { |
| .elem_count = 1024*1024, |
| } |
| }, |
| }; |
| |
| // |
| // |
| // |
| |
//
// Build option strings handed to clBuildProgram() through each
// program's `options` field.
//

// optimized build -- one flag per line, each followed by a single space
static char const cl_build_options_optimized[] =
  "-cl-std=CL1.2 "
  "-cl-single-precision-constant "
  "-cl-denorms-are-zero "
  "-cl-mad-enable "
  "-cl-no-signed-zeros "
  "-cl-fast-relaxed-math "
  "-cl-kernel-arg-info ";

// debug build -- a "-s <path to kernel sources>" option may be appended by hand
static char const cl_build_options_debug[] =
  "-cl-std=CL1.2 "
  "-cl-kernel-arg-info "
  "-g";

// pick one of the option strings above:
//
//   #define SKC_BUILD_OPTIONS cl_build_options_debug
//
#define SKC_BUILD_OPTIONS cl_build_options_optimized
| |
| // |
| // |
| // |
| |
//
// One embedded program: the data needed to create and build it.
//
struct skc_program_source {
  char const * name;    // program name -- printed while building
  char const * options; // build options passed to clBuildProgram()
  char const * src;     // program contents (IL, binary or source text)
  size_t const srclen;  // size of `src` as reported by sizeof()
};
| |
| // |
| // THIS IS A RELATIVELY COMPACT WAY OF DECLARING EACH PROGRAM SOURCE |
| // AND ITS BUILD OPTIONS |
| // |
| |
| union skc_program_sources |
| { |
| struct { |
| struct skc_program_source block_pool_init; |
| struct skc_program_source paths_copy; |
| struct skc_program_source fills_expand; |
| struct skc_program_source rasterize; |
| struct skc_program_source segment_ttrk; |
| struct skc_program_source rasters_alloc; |
| struct skc_program_source prefix; |
| struct skc_program_source place; |
| struct skc_program_source segment_ttck; |
| struct skc_program_source render; |
| struct skc_program_source paths_reclaim; |
| struct skc_program_source rasters_reclaim; |
| }; |
| struct skc_program_source sources[]; |
| }; |
| |
| typedef size_t * (*skc_grid_shaper)(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const global_work_size, |
| size_t * const local_work_size); |
| struct skc_program_kernel |
| { |
| char const * name; |
| skc_grid_shaper shaper; |
| skc_device_kernel_id id; |
| }; |
| |
| union skc_program_kernels |
| { |
| struct { |
| struct skc_program_kernel block_pool_init[2]; |
| struct skc_program_kernel paths_copy [2]; |
| struct skc_program_kernel fills_expand [1]; |
| struct skc_program_kernel rasterize [6]; |
| struct skc_program_kernel segment_ttrk [1]; |
| struct skc_program_kernel rasters_alloc [1]; |
| struct skc_program_kernel prefix [1]; |
| struct skc_program_kernel place [1]; |
| struct skc_program_kernel segment_ttck [1]; |
| struct skc_program_kernel render [1]; |
| struct skc_program_kernel paths_reclaim [1]; |
| struct skc_program_kernel rasters_reclaim[1]; |
| }; |
| struct skc_program_kernel kernels[]; |
| }; |
| |
| // |
| // |
| // |
| |
| #if SKC_KERNEL_SPIRV // PROGRAM IS SPIR-V |
| #define SKC_KERNEL_SUFFIX(n) n ## _pre_spv |
| #elif SKC_KERNEL_BINARY // PROGRAM IS BINARY |
| #define SKC_KERNEL_SUFFIX(n) n ## _pre_ir |
| #elif SKC_KERNEL_SRC // PROGRAM IS SOURCE CODE |
| #define SKC_KERNEL_SUFFIX(n) n ## _pre_cl |
| #else |
| #error "SKC_KERNEL_???" |
| #endif |
| |
| // |
| // |
| // |
| |
//
// Initializer helpers for the program and kernel tables.
//
// SKC_PROGRAM_SOURCE_EXPAND -- designated initializer for one program
// entry: { name, options, contents, sizeof(contents) }
//
#define SKC_PROGRAM_SOURCE_EXPAND(prog,blob,opts)       \
  .prog = { SKC_STRINGIFY(prog), opts, blob, sizeof(blob) }

// SKC_PROGRAM_SOURCE -- as above, with the contents symbol derived from the program name
#define SKC_PROGRAM_SOURCE(prog,opts)                   \
  SKC_PROGRAM_SOURCE_EXPAND(prog,SKC_KERNEL_SUFFIX(prog),opts)

// SKC_PROGRAM_KERNEL -- the first two initializers of a kernel entry:
// the "skc_kernel_"-prefixed name string and the matching shaper function
#define SKC_PROGRAM_KERNEL(kern)                        \
  "skc_kernel_" SKC_STRINGIFY(kern), SKC_CONCAT(skc_device_shaper_,kern)
| |
| // |
| // |
| // |
| |
| static |
| size_t * |
| skc_device_shaper_block_pool_init_ids(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| work_dim [0] = 1; |
| work_global[0] = work_size; |
| |
| return NULL; // let runtime figure out local work size |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_block_pool_init_atomics(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| work_dim [0] = 1; |
| work_global[0] = 2; |
| |
| return NULL; // let runtime figure out local work size |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_paths_alloc(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| work_dim [0] = 1; |
| work_global[0] = 1; |
| |
| return NULL; // let runtime figure out local work size |
| } |
| |
| |
| static |
| size_t * |
| skc_device_shaper_paths_copy(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| work_dim [0] = 1; |
| work_global[0] = SKC_PATHS_COPY_SUBGROUP_SIZE * work_size; |
| #if 0 |
| work_local [0] = SKC_PATHS_COPY_SUBGROUP_SIZE; |
| |
| return work_local; |
| #else |
| return NULL; // let runtime figure out local work size |
| #endif |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_fills_expand(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| work_dim [0] = 1; |
| work_global[0] = SKC_FILLS_EXPAND_SUBGROUP_SIZE * work_size; |
| work_local [0] = SKC_FILLS_EXPAND_SUBGROUP_SIZE; |
| |
| return work_local; |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_rasterize(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| work_dim [0] = 1; |
| work_global[0] = SKC_RASTERIZE_SUBGROUP_SIZE * work_size; |
| work_local [0] = SKC_RASTERIZE_SUBGROUP_SIZE; |
| |
| return work_local; |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_rasterize_all(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local); |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_rasterize_lines(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local); |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_rasterize_quads(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local); |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_rasterize_cubics(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local); |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_rasterize_rat_quads(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local); |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_rasterize_rat_cubics(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local); |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_rasters_alloc(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| // round up to whole groups |
| size_t gs = SKC_ROUND_UP(work_size,SKC_RASTERS_ALLOC_GROUP_SIZE); |
| |
| work_dim [0] = 1; |
| work_global[0] = gs; |
| work_local [0] = SKC_RASTERS_ALLOC_GROUP_SIZE; |
| |
| return work_local; |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_segment_ttrk(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| // work_size is number of keys -- round up to a whole slab |
| size_t keys_ru = SKC_ROUND_UP(work_size,HS_LANES_PER_WARP*HS_KEYS_PER_LANE); |
| |
| work_dim [0] = 1; |
| work_global[0] = keys_ru / HS_KEYS_PER_LANE; |
| work_local [0] = HS_LANES_PER_WARP; // or just return NULL |
| |
| return work_local; |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_segment_ttck(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| // work_size is number of keys -- round up to a whole slab |
| size_t keys_ru = SKC_ROUND_UP(work_size,HS_LANES_PER_WARP*HS_KEYS_PER_LANE); |
| |
| work_dim [0] = 1; |
| work_global[0] = keys_ru / HS_KEYS_PER_LANE; |
| work_local [0] = HS_LANES_PER_WARP; // or just return NULL |
| |
| return work_local; |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_prefix(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| work_dim [0] = 1; |
| work_global[0] = SKC_PREFIX_SUBGROUP_SIZE * work_size; |
| work_local [0] = SKC_PREFIX_SUBGROUP_SIZE; |
| |
| return work_local; |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_place(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| work_dim [0] = 1; |
| work_global[0] = SKC_PLACE_SUBGROUP_SIZE * work_size; |
| work_local [0] = SKC_PLACE_SUBGROUP_SIZE; |
| |
| return work_local; |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_render(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| work_dim [0] = 1; |
| work_global[0] = SKC_RENDER_SUBGROUP_SIZE * work_size; |
| work_local [0] = SKC_RENDER_SUBGROUP_SIZE; |
| |
| return work_local; |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_paths_reclaim(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| assert(work_size == SKC_RECLAIM_ARRAY_SIZE); |
| |
| work_dim [0] = 1; |
| work_global[0] = SKC_RECLAIM_ARRAY_SIZE * SKC_PATHS_RECLAIM_SUBGROUP_SIZE; |
| |
| return NULL; // let runtime figure out local work size |
| } |
| |
| static |
| size_t * |
| skc_device_shaper_rasters_reclaim(size_t const work_size, |
| cl_uint * const work_dim, |
| size_t * const work_global, |
| size_t * const work_local) |
| { |
| assert(work_size == SKC_RECLAIM_ARRAY_SIZE); |
| |
| work_dim [0] = 1; |
| work_global[0] = SKC_RECLAIM_ARRAY_SIZE * SKC_PATHS_RECLAIM_SUBGROUP_SIZE; |
| |
| return NULL; // let runtime figure out local work size |
| } |
| |
| // |
| // |
| // |
| |
| static union skc_program_sources const program_sources = { |
| SKC_PROGRAM_SOURCE(block_pool_init,SKC_BUILD_OPTIONS), |
| SKC_PROGRAM_SOURCE(paths_copy, SKC_BUILD_OPTIONS), |
| SKC_PROGRAM_SOURCE(fills_expand, SKC_BUILD_OPTIONS), |
| SKC_PROGRAM_SOURCE(rasterize, SKC_BUILD_OPTIONS), |
| SKC_PROGRAM_SOURCE(segment_ttrk, SKC_BUILD_OPTIONS), |
| SKC_PROGRAM_SOURCE(rasters_alloc, SKC_BUILD_OPTIONS), |
| SKC_PROGRAM_SOURCE(prefix, SKC_BUILD_OPTIONS), |
| SKC_PROGRAM_SOURCE(place, SKC_BUILD_OPTIONS), |
| SKC_PROGRAM_SOURCE(segment_ttck, SKC_BUILD_OPTIONS), |
| SKC_PROGRAM_SOURCE(render, SKC_BUILD_OPTIONS), |
| SKC_PROGRAM_SOURCE(paths_reclaim, SKC_BUILD_OPTIONS), |
| SKC_PROGRAM_SOURCE(rasters_reclaim,SKC_BUILD_OPTIONS) |
| }; |
| |
| static union skc_program_kernels const program_kernels = { |
| |
| .block_pool_init = { { SKC_PROGRAM_KERNEL(block_pool_init_ids), SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS }, |
| { SKC_PROGRAM_KERNEL(block_pool_init_atomics), SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS } }, |
| |
| .paths_copy = { { SKC_PROGRAM_KERNEL(paths_alloc), SKC_DEVICE_KERNEL_ID_PATHS_ALLOC }, |
| { SKC_PROGRAM_KERNEL(paths_copy) , SKC_DEVICE_KERNEL_ID_PATHS_COPY } }, |
| |
| .fills_expand = { { SKC_PROGRAM_KERNEL(fills_expand), SKC_DEVICE_KERNEL_ID_FILLS_EXPAND } }, |
| |
| .rasterize = { { SKC_PROGRAM_KERNEL(rasterize_all), SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL }, |
| { SKC_PROGRAM_KERNEL(rasterize_lines), SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES }, |
| { SKC_PROGRAM_KERNEL(rasterize_quads), SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS }, |
| { SKC_PROGRAM_KERNEL(rasterize_cubics), SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS }, |
| { SKC_PROGRAM_KERNEL(rasterize_rat_quads), SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_QUADS }, |
| { SKC_PROGRAM_KERNEL(rasterize_rat_cubics), SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_CUBICS } }, |
| |
| .segment_ttrk = { { SKC_PROGRAM_KERNEL(segment_ttrk), SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK } }, |
| |
| .rasters_alloc = { { SKC_PROGRAM_KERNEL(rasters_alloc), SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC } }, |
| |
| .prefix = { { SKC_PROGRAM_KERNEL(prefix), SKC_DEVICE_KERNEL_ID_PREFIX } }, |
| |
| .place = { { SKC_PROGRAM_KERNEL(place), SKC_DEVICE_KERNEL_ID_PLACE } }, |
| |
| .segment_ttck = { { SKC_PROGRAM_KERNEL(segment_ttck) , SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK } }, |
| |
| .render = { { SKC_PROGRAM_KERNEL(render), SKC_DEVICE_KERNEL_ID_RENDER } }, |
| |
| .paths_reclaim = { { SKC_PROGRAM_KERNEL(paths_reclaim), SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM } }, |
| |
| .rasters_reclaim = { { SKC_PROGRAM_KERNEL(rasters_reclaim), SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM } } |
| }; |
| |
| // |
| // |
| // |
| |
| struct skc_device |
| { |
| // |
| // FIXME -- an OpenCL 2.1+ device would clone these kernels in a |
| // multithreaded system. |
| // |
| // Not having the ability to clone kernels (yet set their sticky |
| // args) was an oversight in previous versions of OpenCL. |
| // |
| // For now, we can probably get away with just a single kernel |
| // instance as long as args are set and the kernel is launched |
| // before having its arguments stomped on. |
| // |
| cl_kernel kernels [SKC_DEVICE_KERNEL_ID_COUNT]; |
| size_t reqd_szs[SKC_DEVICE_KERNEL_ID_COUNT][3]; |
| }; |
| |
| // |
| // CREATE KERNELS |
| // |
| |
| static |
| void |
| skc_device_create_kernels(struct skc_runtime * const runtime, |
| struct skc_program_kernel const * const kernels, |
| skc_uint const kernel_count, |
| cl_program program) |
| { |
| for (skc_uint ii=0; ii<kernel_count; ii++) |
| { |
| cl_int cl_err; |
| |
| char const * name = kernels[ii].name; |
| skc_uint const id = kernels[ii].id; |
| |
| fprintf(stderr,"\t\"%s\"\n",name); |
| |
| // create the kernel |
| runtime->device->kernels[id] = clCreateKernel(program,name,&cl_err); cl_ok(cl_err); |
| |
| // |
| // release program now |
| // |
| // FIXME -- if/when we multithread then we need to clone kernels |
| // (>=2.1) or keep programs around (<=2.0) |
| // |
| |
| // get workgroup size |
| cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], |
| runtime->cl.device_id, |
| CL_KERNEL_COMPILE_WORK_GROUP_SIZE, |
| sizeof(runtime->device->reqd_szs[0]), |
| runtime->device->reqd_szs[id], |
| NULL)); |
| |
| // |
| // GEN9+ PROBING |
| // |
| #define SKC_TARGET_GEN9 |
| #ifdef SKC_TARGET_GEN9 |
| |
| #define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 |
| #define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 |
| #define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A |
| |
| cl_ulong spill_mem_size; |
| |
| cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], |
| runtime->cl.device_id, |
| CL_KERNEL_SPILL_MEM_SIZE_INTEL, |
| sizeof(spill_mem_size), |
| &spill_mem_size, |
| NULL)); |
| |
| fprintf(stderr,"\t\tspill mem size: %lu bytes\n", |
| (unsigned long)spill_mem_size); |
| |
| cl_ulong local_mem_size; |
| |
| cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], |
| runtime->cl.device_id, |
| CL_KERNEL_LOCAL_MEM_SIZE, |
| sizeof(local_mem_size), |
| &local_mem_size, |
| NULL)); |
| |
| fprintf(stderr,"\t\tlocal mem size: %lu bytes\n", |
| (unsigned long)local_mem_size); |
| #endif |
| } |
| } |
| |
| static |
| void |
| skc_device_build_program(struct skc_runtime * const runtime, |
| struct skc_program_source const * const source, |
| struct skc_program_kernel const * const kernels, |
| skc_uint const kernel_count) |
| { |
| cl_program program; |
| |
| fprintf(stderr,"%-20s: ",source->name); |
| |
| cl_int cl_err; |
| |
| #if SKC_KERNEL_SPIRV // PROGRAM IS SPIR-V |
| |
| fprintf(stderr,"Creating (SPIR-V) ... "); |
| |
| program = clCreateProgramWithIL(runtime->cl.context, |
| source->src, |
| source->srclen, |
| &cl_err); |
| |
| #elif SKC_KERNEL_BINARY // PROGRAM IS BINARY |
| |
| fprintf(stderr,"Creating (Binary) ... "); |
| |
| cl_int status; |
| program = clCreateProgramWithBinary(runtime->cl.context, |
| 1, |
| &runtime->cl.device_id, |
| &source->srclen, |
| (unsigned char const *[]){ source->src }, |
| &status, |
| &cl_err); |
| |
| #elif SKC_KERNEL_SRC // PROGRAM IS SOURCE CODE |
| |
| fprintf(stderr,"Creating (Source) ... "); |
| |
| program = clCreateProgramWithSource(runtime->cl.context, |
| 1, |
| (char const *[]){ source->src }, |
| &source->srclen, |
| &cl_err); |
| #else |
| |
| #error "SKC_KERNEL_???" |
| |
| #endif |
| |
| cl_ok(cl_err); |
| |
| fprintf(stderr,"Building ... "); |
| |
| // build the program |
| cl(BuildProgram(program, |
| 1, |
| &runtime->cl.device_id, |
| source->options, // build options are ignored by binary |
| NULL, |
| NULL)); |
| |
| fprintf(stderr,"Done\n"); |
| |
| // build the kernels |
| skc_device_create_kernels(runtime,kernels,kernel_count,program); |
| |
| // we're done with program for now |
| // can always recover it from a kernel instance |
| cl(ReleaseProgram(program)); |
| } |
| |
| // |
| // RELEASE KERNELS |
| // |
| |
| static |
| void |
| skc_device_release_kernels(struct skc_device * const device) |
| { |
| for (skc_int ii=0; ii<SKC_COUNT_OF(device->kernels); ii++) |
| cl(ReleaseKernel(device->kernels[ii])); |
| } |
| |
| |
| |
| cl_kernel |
| skc_device_acquire_kernel(struct skc_device * const device, |
| skc_device_kernel_id const type) |
| { |
| cl_kernel kernel = device->kernels[type]; |
| |
| cl(RetainKernel(kernel)); |
| |
| return kernel; |
| } |
| |
| |
| void |
| skc_device_release_kernel(struct skc_device * const device, |
| cl_kernel kernel) |
| { |
| cl(ReleaseKernel(kernel)); |
| } |
| |
| // |
| // INITIALIZE KERNEL ARGS |
| // |
| // FIXME |
| // |
| // pre-assign any kernel arguments that are never going to change -- |
| // for example, the block pool |
| // |
| |
| // |
| // |
| // |
| |
| #define SKC_DEVICE_BUILD_PROGRAM(p) \ |
| skc_device_build_program(runtime,&program_sources.p,program_kernels.p,SKC_COUNT_OF(program_kernels.p)) |
| |
| |
| void |
| skc_device_create(struct skc_runtime * const runtime) |
| { |
| struct skc_device * const device = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*device)); |
| |
| // hang device off of runtime |
| runtime->device = device; |
| |
| // hang config off of runtime |
| runtime->config = &config; |
| |
| // create kernels |
| SKC_DEVICE_BUILD_PROGRAM(block_pool_init); |
| SKC_DEVICE_BUILD_PROGRAM(paths_copy); |
| SKC_DEVICE_BUILD_PROGRAM(fills_expand); |
| SKC_DEVICE_BUILD_PROGRAM(rasterize); |
| SKC_DEVICE_BUILD_PROGRAM(segment_ttrk); |
| SKC_DEVICE_BUILD_PROGRAM(rasters_alloc); |
| SKC_DEVICE_BUILD_PROGRAM(prefix); |
| SKC_DEVICE_BUILD_PROGRAM(place); |
| SKC_DEVICE_BUILD_PROGRAM(segment_ttck); |
| SKC_DEVICE_BUILD_PROGRAM(render); |
| SKC_DEVICE_BUILD_PROGRAM(paths_reclaim); |
| SKC_DEVICE_BUILD_PROGRAM(rasters_reclaim); |
| |
| // create HotSort instance -- FIXME -- how this occurs needs to be cleaned up |
| hs_create(runtime->cl.context,runtime->cl.device_id,NULL); |
| } |
| |
| void |
| skc_device_dispose(struct skc_runtime * const runtime) |
| { |
| // |
| // FIXME -- dispose of programs, kernels, etc. |
| // |
| |
| skc_runtime_host_perm_free(runtime,runtime->device); |
| } |
| |
| // |
| // FIXME -- just pass the device type |
| // |
| |
| void |
| skc_device_enqueue_kernel(struct skc_device * const device, |
| skc_device_kernel_id const type, |
| cl_command_queue cq, |
| cl_kernel kernel, |
| size_t const work_size, |
| cl_uint num_events_in_wait_list, |
| cl_event const * const event_wait_list, |
| cl_event * const event) |
| { |
| if (work_size == 0) |
| return; |
| |
| cl_uint work_dim [1]; |
| size_t work_global[3]; |
| size_t work_local [3]; |
| |
| size_t * work_local_ptr = program_kernels.kernels[type].shaper(work_size, |
| work_dim, |
| work_global, |
| work_local); |
| cl(EnqueueNDRangeKernel(cq, |
| kernel,// device->kernels[type], |
| work_dim[0], |
| NULL, |
| work_global, |
| work_local_ptr, |
| num_events_in_wait_list, |
| event_wait_list, |
| event)); |
| } |
| |
| // |
| // |
| // |