| // |
| // Copyright 2012 Francisco Jerez |
| // |
| // Permission is hereby granted, free of charge, to any person obtaining a |
| // copy of this software and associated documentation files (the "Software"), |
| // to deal in the Software without restriction, including without limitation |
| // the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| // and/or sell copies of the Software, and to permit persons to whom the |
| // Software is furnished to do so, subject to the following conditions: |
| // |
| // The above copyright notice and this permission notice shall be included in |
| // all copies or substantial portions of the Software. |
| // |
| // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR |
| // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
| // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| // OTHER DEALINGS IN THE SOFTWARE. |
| // |
| |
| #include "core/kernel.hpp" |
| #include "core/resource.hpp" |
| #include "util/factor.hpp" |
| #include "util/u_math.h" |
| #include "pipe/p_context.h" |
| |
| using namespace clover; |
| |
| kernel::kernel(clover::program &prog, const std::string &name, |
| const std::vector<module::argument> &margs) : |
| program(prog), _name(name), exec(*this), |
| program_ref(prog._kernel_ref_counter) { |
| for (auto &marg : margs) { |
| if (marg.semantic == module::argument::general) |
| _args.emplace_back(argument::create(marg)); |
| } |
| for (auto &dev : prog.devices()) { |
| auto &m = prog.build(dev).binary; |
| auto msym = find(name_equals(name), m.syms); |
| const auto f = id_type_equals(msym.section, module::section::data_constant); |
| if (!any_of(f, m.secs)) |
| continue; |
| |
| auto mconst = find(f, m.secs); |
| auto rb = std::make_unique<root_buffer>(prog.context(), |
| CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, |
| mconst.size, mconst.data.data()); |
| _constant_buffers.emplace(&dev, std::move(rb)); |
| } |
| } |
| |
| template<typename V> |
| static inline std::vector<uint> |
| pad_vector(command_queue &q, const V &v, uint x) { |
| std::vector<uint> w { v.begin(), v.end() }; |
| w.resize(q.device().max_block_size().size(), x); |
| return w; |
| } |
| |
| void |
| kernel::launch(command_queue &q, |
| const std::vector<size_t> &grid_offset, |
| const std::vector<size_t> &grid_size, |
| const std::vector<size_t> &block_size) { |
| const auto m = program().build(q.device()).binary; |
| const auto reduced_grid_size = |
| map(divides(), grid_size, block_size); |
| void *st = exec.bind(&q, grid_offset); |
| struct pipe_grid_info info = {}; |
| |
| // The handles are created during exec_context::bind(), so we need make |
| // sure to call exec_context::bind() before retrieving them. |
| std::vector<uint32_t *> g_handles = map([&](size_t h) { |
| return (uint32_t *)&exec.input[h]; |
| }, exec.g_handles); |
| |
| q.pipe->bind_compute_state(q.pipe, st); |
| q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, |
| 0, exec.samplers.size(), |
| exec.samplers.data()); |
| |
| q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0, |
| exec.sviews.size(), exec.sviews.data()); |
| q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0, |
| exec.iviews.size(), exec.iviews.data()); |
| q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), |
| exec.resources.data()); |
| q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), |
| exec.g_buffers.data(), g_handles.data()); |
| |
| // Fill information for the launch_grid() call. |
| info.work_dim = grid_size.size(); |
| copy(pad_vector(q, block_size, 1), info.block); |
| copy(pad_vector(q, reduced_grid_size, 1), info.grid); |
| info.pc = find(name_equals(_name), m.syms).offset; |
| info.input = exec.input.data(); |
| |
| q.pipe->launch_grid(q.pipe, &info); |
| |
| q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL); |
| q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL); |
| q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0, |
| exec.iviews.size(), NULL); |
| q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0, |
| exec.sviews.size(), NULL); |
| q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0, |
| exec.samplers.size(), NULL); |
| |
| q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER); |
| exec.unbind(); |
| } |
| |
| size_t |
| kernel::mem_local() const { |
| size_t sz = 0; |
| |
| for (auto &arg : args()) { |
| if (dynamic_cast<local_argument *>(&arg)) |
| sz += arg.storage(); |
| } |
| |
| return sz; |
| } |
| |
| size_t |
| kernel::mem_private() const { |
| return 0; |
| } |
| |
| const std::string & |
| kernel::name() const { |
| return _name; |
| } |
| |
| std::vector<size_t> |
| kernel::optimal_block_size(const command_queue &q, |
| const std::vector<size_t> &grid_size) const { |
| return factor::find_grid_optimal_factor<size_t>( |
| q.device().max_threads_per_block(), q.device().max_block_size(), |
| grid_size); |
| } |
| |
| std::vector<size_t> |
| kernel::required_block_size() const { |
| return find(name_equals(_name), program().symbols()).reqd_work_group_size; |
| } |
| |
| kernel::argument_range |
| kernel::args() { |
| return map(derefs(), _args); |
| } |
| |
| kernel::const_argument_range |
| kernel::args() const { |
| return map(derefs(), _args); |
| } |
| |
| std::vector<clover::module::arg_info> |
| kernel::args_infos() { |
| std::vector<clover::module::arg_info> infos; |
| for (auto &marg: find(name_equals(_name), program().symbols()).args) |
| if (marg.semantic == clover::module::argument::general) |
| infos.emplace_back(marg.info); |
| |
| return infos; |
| } |
| |
| const module & |
| kernel::module(const command_queue &q) const { |
| return program().build(q.device()).binary; |
| } |
| |
| kernel::exec_context::exec_context(kernel &kern) : |
| kern(kern), q(NULL), mem_local(0), st(NULL), cs() { |
| } |
| |
| kernel::exec_context::~exec_context() { |
| if (st) |
| q->pipe->delete_compute_state(q->pipe, st); |
| } |
| |
| void * |
| kernel::exec_context::bind(intrusive_ptr<command_queue> _q, |
| const std::vector<size_t> &grid_offset) { |
| std::swap(q, _q); |
| |
| // Bind kernel arguments. |
| auto &m = kern.program().build(q->device()).binary; |
| auto msym = find(name_equals(kern.name()), m.syms); |
| auto margs = msym.args; |
| auto msec = find(id_type_equals(msym.section, module::section::text_executable), m.secs); |
| auto explicit_arg = kern._args.begin(); |
| |
| for (auto &marg : margs) { |
| switch (marg.semantic) { |
| case module::argument::general: |
| (*(explicit_arg++))->bind(*this, marg); |
| break; |
| |
| case module::argument::grid_dimension: { |
| const cl_uint dimension = grid_offset.size(); |
| auto arg = argument::create(marg); |
| |
| arg->set(sizeof(dimension), &dimension); |
| arg->bind(*this, marg); |
| break; |
| } |
| case module::argument::grid_offset: { |
| for (cl_uint x : pad_vector(*q, grid_offset, 0)) { |
| auto arg = argument::create(marg); |
| |
| arg->set(sizeof(x), &x); |
| arg->bind(*this, marg); |
| } |
| break; |
| } |
| case module::argument::image_size: { |
| auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get(); |
| std::vector<cl_uint> image_size{ |
| static_cast<cl_uint>(img->width()), |
| static_cast<cl_uint>(img->height()), |
| static_cast<cl_uint>(img->depth())}; |
| for (auto x : image_size) { |
| auto arg = argument::create(marg); |
| |
| arg->set(sizeof(x), &x); |
| arg->bind(*this, marg); |
| } |
| break; |
| } |
| case module::argument::image_format: { |
| auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get(); |
| cl_image_format fmt = img->format(); |
| std::vector<cl_uint> image_format{ |
| static_cast<cl_uint>(fmt.image_channel_data_type), |
| static_cast<cl_uint>(fmt.image_channel_order)}; |
| for (auto x : image_format) { |
| auto arg = argument::create(marg); |
| |
| arg->set(sizeof(x), &x); |
| arg->bind(*this, marg); |
| } |
| break; |
| } |
| case module::argument::constant_buffer: { |
| auto arg = argument::create(marg); |
| cl_mem buf = kern._constant_buffers.at(&q->device()).get(); |
| arg->set(q->device().address_bits() / 8, &buf); |
| arg->bind(*this, marg); |
| break; |
| } |
| } |
| } |
| |
| // Create a new compute state if anything changed. |
| if (!st || q != _q || |
| cs.req_local_mem != mem_local || |
| cs.req_input_mem != input.size()) { |
| if (st) |
| _q->pipe->delete_compute_state(_q->pipe, st); |
| |
| cs.ir_type = q->device().ir_format(); |
| cs.prog = &(msec.data[0]); |
| cs.req_local_mem = mem_local; |
| cs.req_input_mem = input.size(); |
| st = q->pipe->create_compute_state(q->pipe, &cs); |
| if (!st) { |
| unbind(); // Cleanup |
| throw error(CL_OUT_OF_RESOURCES); |
| } |
| } |
| |
| return st; |
| } |
| |
| void |
| kernel::exec_context::unbind() { |
| for (auto &arg : kern.args()) |
| arg.unbind(*this); |
| |
| input.clear(); |
| samplers.clear(); |
| sviews.clear(); |
| iviews.clear(); |
| resources.clear(); |
| g_buffers.clear(); |
| g_handles.clear(); |
| mem_local = 0; |
| } |
| |
| namespace { |
///
/// Return a copy of the object representation of \a x as a buffer of
/// sizeof(x) bytes.
///
template<typename T>
std::vector<uint8_t>
bytes(const T& x) {
   const uint8_t *const first = reinterpret_cast<const uint8_t *>(&x);
   const uint8_t *const last = first + sizeof(x);

   return std::vector<uint8_t>(first, last);
}
| |
| /// |
| /// Transform buffer \a v from the native byte order into the byte |
| /// order specified by \a e. |
| /// |
| template<typename T> |
| void |
| byteswap(T &v, pipe_endian e) { |
| if (PIPE_ENDIAN_NATIVE != e) |
| std::reverse(v.begin(), v.end()); |
| } |
| |
///
/// Resize buffer \a v to the size returned by util_align_npot() for its
/// current size and alignment \a n.
///
template<typename T>
void
align(T &v, size_t n) {
   const auto aligned_size = util_align_npot(v.size(), n);

   v.resize(aligned_size);
}
| |
| bool |
| msb(const std::vector<uint8_t> &s) { |
| if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE) |
| return s.back() & 0x80; |
| else |
| return s.front() & 0x80; |
| } |
| |
| /// |
| /// Resize buffer \a v to size \a n using sign or zero extension |
| /// according to \a ext. |
| /// |
| template<typename T> |
| void |
| extend(T &v, enum module::argument::ext_type ext, size_t n) { |
| const size_t m = std::min(v.size(), n); |
| const bool sign_ext = (ext == module::argument::sign_ext); |
| const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0); |
| T w(n, fill); |
| |
| if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE) |
| std::copy_n(v.begin(), m, w.begin()); |
| else |
| std::copy_n(v.end() - m, m, w.end() - m); |
| |
| std::swap(v, w); |
| } |
| |
///
/// Append the contents of buffer \a w to the end of buffer \a v.
///
template<typename T>
void
insert(T &v, const T &w) {
   const auto first = std::begin(w);
   const auto last = std::end(w);

   v.insert(std::end(v), first, last);
}
| |
///
/// Grow buffer \a v by \a n elements and return the index at which the
/// newly added elements start, i.e. the size of \a v before the call.
///
template<typename T>
size_t
allocate(T &v, size_t n) {
   const size_t start = v.size();
   const size_t grown = start + n;

   v.resize(grown);
   return start;
}
| } |
| |
| std::unique_ptr<kernel::argument> |
| kernel::argument::create(const module::argument &marg) { |
| switch (marg.type) { |
| case module::argument::scalar: |
| return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size)); |
| |
| case module::argument::global: |
| return std::unique_ptr<kernel::argument>(new global_argument); |
| |
| case module::argument::local: |
| return std::unique_ptr<kernel::argument>(new local_argument); |
| |
| case module::argument::constant: |
| return std::unique_ptr<kernel::argument>(new constant_argument); |
| |
| case module::argument::image2d_rd: |
| case module::argument::image3d_rd: |
| return std::unique_ptr<kernel::argument>(new image_rd_argument); |
| |
| case module::argument::image2d_wr: |
| case module::argument::image3d_wr: |
| return std::unique_ptr<kernel::argument>(new image_wr_argument); |
| |
| case module::argument::sampler: |
| return std::unique_ptr<kernel::argument>(new sampler_argument); |
| |
| } |
| throw error(CL_INVALID_KERNEL_DEFINITION); |
| } |
| |
| kernel::argument::argument() : _set(false) { |
| } |
| |
| bool |
| kernel::argument::set() const { |
| return _set; |
| } |
| |
| size_t |
| kernel::argument::storage() const { |
| return 0; |
| } |
| |
| kernel::scalar_argument::scalar_argument(size_t size) : size(size) { |
| } |
| |
| void |
| kernel::scalar_argument::set(size_t size, const void *value) { |
| if (!value) |
| throw error(CL_INVALID_ARG_VALUE); |
| |
| if (size != this->size) |
| throw error(CL_INVALID_ARG_SIZE); |
| |
| v = { (uint8_t *)value, (uint8_t *)value + size }; |
| _set = true; |
| } |
| |
| void |
| kernel::scalar_argument::bind(exec_context &ctx, |
| const module::argument &marg) { |
| auto w = v; |
| |
| extend(w, marg.ext_type, marg.target_size); |
| byteswap(w, ctx.q->device().endianness()); |
| align(ctx.input, marg.target_align); |
| insert(ctx.input, w); |
| } |
| |
| void |
| kernel::scalar_argument::unbind(exec_context &ctx) { |
| } |
| |
| void |
| kernel::global_argument::set(size_t size, const void *value) { |
| if (size != sizeof(cl_mem)) |
| throw error(CL_INVALID_ARG_SIZE); |
| |
| buf = pobj<buffer>(value ? *(cl_mem *)value : NULL); |
| svm = nullptr; |
| _set = true; |
| } |
| |
| void |
| kernel::global_argument::set_svm(const void *value) { |
| svm = value; |
| buf = nullptr; |
| _set = true; |
| } |
| |
| void |
| kernel::global_argument::bind(exec_context &ctx, |
| const module::argument &marg) { |
| align(ctx.input, marg.target_align); |
| |
| if (buf) { |
| const resource &r = buf->resource_in(*ctx.q); |
| ctx.g_handles.push_back(ctx.input.size()); |
| ctx.g_buffers.push_back(r.pipe); |
| |
| // How to handle multi-demensional offsets? |
| // We don't need to. Buffer offsets are always |
| // one-dimensional. |
| auto v = bytes(r.offset[0]); |
| extend(v, marg.ext_type, marg.target_size); |
| byteswap(v, ctx.q->device().endianness()); |
| insert(ctx.input, v); |
| } else if (svm) { |
| auto v = bytes(svm); |
| extend(v, marg.ext_type, marg.target_size); |
| byteswap(v, ctx.q->device().endianness()); |
| insert(ctx.input, v); |
| } else { |
| // Null pointer. |
| allocate(ctx.input, marg.target_size); |
| } |
| } |
| |
| void |
| kernel::global_argument::unbind(exec_context &ctx) { |
| } |
| |
| size_t |
| kernel::local_argument::storage() const { |
| return _storage; |
| } |
| |
| void |
| kernel::local_argument::set(size_t size, const void *value) { |
| if (value) |
| throw error(CL_INVALID_ARG_VALUE); |
| |
| if (!size) |
| throw error(CL_INVALID_ARG_SIZE); |
| |
| _storage = size; |
| _set = true; |
| } |
| |
| void |
| kernel::local_argument::bind(exec_context &ctx, |
| const module::argument &marg) { |
| auto v = bytes(ctx.mem_local); |
| |
| extend(v, module::argument::zero_ext, marg.target_size); |
| byteswap(v, ctx.q->device().endianness()); |
| align(ctx.input, marg.target_align); |
| insert(ctx.input, v); |
| |
| ctx.mem_local += _storage; |
| } |
| |
| void |
| kernel::local_argument::unbind(exec_context &ctx) { |
| } |
| |
| void |
| kernel::constant_argument::set(size_t size, const void *value) { |
| if (size != sizeof(cl_mem)) |
| throw error(CL_INVALID_ARG_SIZE); |
| |
| buf = pobj<buffer>(value ? *(cl_mem *)value : NULL); |
| _set = true; |
| } |
| |
| void |
| kernel::constant_argument::bind(exec_context &ctx, |
| const module::argument &marg) { |
| align(ctx.input, marg.target_align); |
| |
| if (buf) { |
| resource &r = buf->resource_in(*ctx.q); |
| auto v = bytes(ctx.resources.size() << 24 | r.offset[0]); |
| |
| extend(v, module::argument::zero_ext, marg.target_size); |
| byteswap(v, ctx.q->device().endianness()); |
| insert(ctx.input, v); |
| |
| st = r.bind_surface(*ctx.q, false); |
| ctx.resources.push_back(st); |
| } else { |
| // Null pointer. |
| allocate(ctx.input, marg.target_size); |
| } |
| } |
| |
| void |
| kernel::constant_argument::unbind(exec_context &ctx) { |
| if (buf) |
| buf->resource_in(*ctx.q).unbind_surface(*ctx.q, st); |
| } |
| |
| void |
| kernel::image_rd_argument::set(size_t size, const void *value) { |
| if (!value) |
| throw error(CL_INVALID_ARG_VALUE); |
| |
| if (size != sizeof(cl_mem)) |
| throw error(CL_INVALID_ARG_SIZE); |
| |
| img = &obj<image>(*(cl_mem *)value); |
| _set = true; |
| } |
| |
| void |
| kernel::image_rd_argument::bind(exec_context &ctx, |
| const module::argument &marg) { |
| auto v = bytes(ctx.sviews.size()); |
| |
| extend(v, module::argument::zero_ext, marg.target_size); |
| byteswap(v, ctx.q->device().endianness()); |
| align(ctx.input, marg.target_align); |
| insert(ctx.input, v); |
| |
| st = img->resource_in(*ctx.q).bind_sampler_view(*ctx.q); |
| ctx.sviews.push_back(st); |
| } |
| |
| void |
| kernel::image_rd_argument::unbind(exec_context &ctx) { |
| img->resource_in(*ctx.q).unbind_sampler_view(*ctx.q, st); |
| } |
| |
| void |
| kernel::image_wr_argument::set(size_t size, const void *value) { |
| if (!value) |
| throw error(CL_INVALID_ARG_VALUE); |
| |
| if (size != sizeof(cl_mem)) |
| throw error(CL_INVALID_ARG_SIZE); |
| |
| img = &obj<image>(*(cl_mem *)value); |
| _set = true; |
| } |
| |
| void |
| kernel::image_wr_argument::bind(exec_context &ctx, |
| const module::argument &marg) { |
| auto v = bytes(ctx.iviews.size()); |
| |
| extend(v, module::argument::zero_ext, marg.target_size); |
| byteswap(v, ctx.q->device().endianness()); |
| align(ctx.input, marg.target_align); |
| insert(ctx.input, v); |
| ctx.iviews.push_back(img->resource_in(*ctx.q).create_image_view(*ctx.q)); |
| } |
| |
| void |
| kernel::image_wr_argument::unbind(exec_context &ctx) { |
| } |
| |
| void |
| kernel::sampler_argument::set(size_t size, const void *value) { |
| if (!value) |
| throw error(CL_INVALID_SAMPLER); |
| |
| if (size != sizeof(cl_sampler)) |
| throw error(CL_INVALID_ARG_SIZE); |
| |
| s = &obj(*(cl_sampler *)value); |
| _set = true; |
| } |
| |
| void |
| kernel::sampler_argument::bind(exec_context &ctx, |
| const module::argument &marg) { |
| st = s->bind(*ctx.q); |
| ctx.samplers.push_back(st); |
| } |
| |
| void |
| kernel::sampler_argument::unbind(exec_context &ctx) { |
| s->unbind(*ctx.q, st); |
| } |