src/gallium/frontends/clover/core/kernel.cpp - platform/external/mesa3d - Gitiles

 //
 // Copyright 2012 Francisco Jerez
 //
 // Permission is hereby granted, free of charge, to any person obtaining a
 // copy of this software and associated documentation files (the "Software"),
 // to deal in the Software without restriction, including without limitation
 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the
 // Software is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.
 //

 #include "core/kernel.hpp"
 #include "core/resource.hpp"
 #include "util/factor.hpp"
 #include "util/u_math.h"
 #include "pipe/p_context.h"

 using namespace clover;

 ///
 /// Create the kernel \a name of program \a prog.
 ///
 /// One explicit argument object is created for every entry of \a margs
 /// whose semantic is "general".  In addition, for each device of \a prog
 /// whose binary contains a data_constant section with the id referenced
 /// by the kernel symbol, a read-only root_buffer holding a copy of that
 /// section is stored in _constant_buffers.
 ///
 kernel::kernel(clover::program &prog, const std::string &name,
                const std::vector<module::argument> &margs) :
    program(prog), _name(name), exec(*this),
    program_ref(prog._kernel_ref_counter) {
    for (auto &marg : margs) {
       if (marg.semantic != module::argument::general)
          continue;

       _args.emplace_back(argument::create(marg));
    }

    for (auto &dev : prog.devices()) {
       auto &binary = prog.build(dev).binary;
       auto sym = find(name_equals(name), binary.syms);
       const auto is_const_section =
          id_type_equals(sym.section, module::section::data_constant);

       if (any_of(is_const_section, binary.secs)) {
          // Deliberately a copy: its (non-const) data pointer is handed to
          // the buffer constructor below.
          auto sec = find(is_const_section, binary.secs);

          _constant_buffers.emplace(
             &dev,
             std::make_unique<root_buffer>(prog.context(),
                                           CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
                                           sec.size, sec.data.data()));
       }
    }
 }

 ///
 /// Copy \a v into a vector of uint and resize it to the number of
 /// entries in the max_block_size() of the device of \a q, filling any
 /// newly added entries with \a x.
 ///
 template<typename V>
 static inline std::vector<uint>
 pad_vector(command_queue &q, const V &v, uint x) {
    std::vector<uint> padded(v.begin(), v.end());
    const auto dims = q.device().max_block_size().size();

    padded.resize(dims, x);
    return padded;
 }

 ///
 /// Bind the execution context to queue \a q, launch the kernel and
 /// release all bindings again.
 ///
 /// \a grid_offset is forwarded to exec_context::bind().  \a grid_size is
 /// combined with \a block_size through divides() to obtain the grid
 /// passed to the driver; \a block_size is passed as the block
 /// dimensions.  Both are padded with 1 by pad_vector().
 ///
 void
 kernel::launch(command_queue &q,
                const std::vector<size_t> &grid_offset,
                const std::vector<size_t> &grid_size,
                const std::vector<size_t> &block_size) {
    // Bind by reference: the module is only read here, and copying it
    // would duplicate every section of the binary on each launch.
    const auto &m = program().build(q.device()).binary;
    const auto reduced_grid_size =
       map(divides(), grid_size, block_size);
    void *st = exec.bind(&q, grid_offset);
    struct pipe_grid_info info = {};

    // The handles are created during exec_context::bind(), so we need make
    // sure to call exec_context::bind() before retrieving them.
    std::vector<uint32_t *> g_handles = map([&](size_t h) {
          return (uint32_t *)&exec.input[h];
       }, exec.g_handles);

    q.pipe->bind_compute_state(q.pipe, st);
    q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                                0, exec.samplers.size(),
                                exec.samplers.data());

    q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                              exec.sviews.size(), exec.sviews.data());
    q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
                              exec.iviews.size(), exec.iviews.data());
    q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                  exec.resources.data());
    q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                               exec.g_buffers.data(), g_handles.data());

    // Fill information for the launch_grid() call.
    info.work_dim = grid_size.size();
    copy(pad_vector(q, block_size, 1), info.block);
    copy(pad_vector(q, reduced_grid_size, 1), info.grid);
    info.pc = find(name_equals(_name), m.syms).offset;
    info.input = exec.input.data();

    q.pipe->launch_grid(q.pipe, &info);

    // Undo the bindings in the reverse order they were established.
    q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
    q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
    q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
                              exec.iviews.size(), NULL);
    q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                              exec.sviews.size(), NULL);
    q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                                exec.samplers.size(), NULL);

    q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
    exec.unbind();
 }

 size_t
 kernel::mem_local() const {
    size_t sz = 0;

    for (auto &arg : args()) {
       if (dynamic_cast<local_argument *>(&arg))
          sz += arg.storage();
    }

    return sz;
 }

 ///
 /// Private memory size of the kernel; this implementation always
 /// reports zero.
 ///
 size_t
 kernel::mem_private() const {
    return 0;
 }

 ///
 /// Name this kernel was constructed with.
 ///
 const std::string &
 kernel::name() const {
    return _name;
 }

 ///
 /// Ask factor::find_grid_optimal_factor() for a block size for
 /// \a grid_size, given the max_threads_per_block() and
 /// max_block_size() limits of the device of \a q.
 ///
 std::vector<size_t>
 kernel::optimal_block_size(const command_queue &q,
                            const std::vector<size_t> &grid_size) const {
    auto &&dev = q.device();

    return factor::find_grid_optimal_factor<size_t>(
       dev.max_threads_per_block(), dev.max_block_size(), grid_size);
 }

 ///
 /// reqd_work_group_size recorded in the program symbol that carries
 /// this kernel's name.
 ///
 std::vector<size_t>
 kernel::required_block_size() const {
    auto &&sym = find(name_equals(_name), program().symbols());

    return sym.reqd_work_group_size;
 }

 ///
 /// Range over the explicit arguments, obtained by mapping derefs()
 /// over _args.
 ///
 kernel::argument_range
 kernel::args() {
    return map(derefs(), _args);
 }

 ///
 /// Const overload: range over the explicit arguments, obtained by
 /// mapping derefs() over _args.
 ///
 kernel::const_argument_range
 kernel::args() const {
    return map(derefs(), _args);
 }

 ///
 /// Collect the arg_info of every argument with "general" semantic from
 /// the program symbol that carries this kernel's name.
 ///
 std::vector<clover::module::arg_info>
 kernel::args_infos() {
    auto &&sym = find(name_equals(_name), program().symbols());
    std::vector<clover::module::arg_info> infos;

    for (auto &marg : sym.args) {
       if (marg.semantic != clover::module::argument::general)
          continue;

       infos.emplace_back(marg.info);
    }

    return infos;
 }

 ///
 /// Binary module of the program build for the device of \a q.
 ///
 const module &
 kernel::module(const command_queue &q) const {
    auto &&dev_build = program().build(q.device());

    return dev_build.binary;
 }

 ///
 /// Create an execution context for \a kern with no queue bound, no
 /// local memory accounted for and no compute state created yet.
 ///
 kernel::exec_context::exec_context(kernel &kern) :
    kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
 }

 ///
 /// Delete the compute state, if one was created, through the pipe
 /// context of the currently stored queue.
 ///
 kernel::exec_context::~exec_context() {
    if (!st)
       return;

    q->pipe->delete_compute_state(q->pipe, st);
 }

 ///
 /// Bind all kernel arguments for execution on queue \a _q and return
 /// the compute state to use, creating a new one if required.
 ///
 /// Explicit arguments are taken in order from kern._args; implicit ones
 /// (grid dimension, grid offset, image size/format, constant buffer)
 /// are created on the fly from the module argument description.
 ///
 /// Throws error(CL_OUT_OF_RESOURCES) if the driver fails to create the
 /// compute state; the bound arguments are released first in that case.
 ///
 void *
 kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
                            const std::vector<size_t> &grid_offset) {
    // After the swap, q is the queue being bound and _q holds whichever
    // queue was stored before (null on first use, see the constructor).
    std::swap(q, _q);

    // Bind kernel arguments.
    auto &m = kern.program().build(q->device()).binary;
    // NOTE(review): msym, margs and msec are declared with plain auto, so
    // they look like local copies -- confirm whether that is intended.
    auto msym = find(name_equals(kern.name()), m.syms);
    auto margs = msym.args;
    auto msec = find(id_type_equals(msym.section, module::section::text_executable), m.secs);
    auto explicit_arg = kern._args.begin();

    for (auto &marg : margs) {
       switch (marg.semantic) {
       case module::argument::general:
          // Consume the next user-provided argument.
          (*(explicit_arg++))->bind(*this, marg);
          break;

       case module::argument::grid_dimension: {
          // The dimension count is taken from the size of grid_offset.
          const cl_uint dimension = grid_offset.size();
          auto arg = argument::create(marg);

          arg->set(sizeof(dimension), &dimension);
          arg->bind(*this, marg);
          break;
       }
       case module::argument::grid_offset: {
          // One argument per component, padded with zeros by pad_vector().
          for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
             auto arg = argument::create(marg);

             arg->set(sizeof(x), &x);
             arg->bind(*this, marg);
          }
          break;
       }
       case module::argument::image_size: {
          // Refers to the explicit argument bound most recently, which
          // must be an image_argument (dynamic_cast throws otherwise).
          auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
          std::vector<cl_uint> image_size{
                static_cast<cl_uint>(img->width()),
                static_cast<cl_uint>(img->height()),
                static_cast<cl_uint>(img->depth())};
          for (auto x : image_size) {
             auto arg = argument::create(marg);

             arg->set(sizeof(x), &x);
             arg->bind(*this, marg);
          }
          break;
       }
       case module::argument::image_format: {
          // Same convention as image_size: describes the preceding
          // explicit image argument.
          auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
          cl_image_format fmt = img->format();
          std::vector<cl_uint> image_format{
                static_cast<cl_uint>(fmt.image_channel_data_type),
                static_cast<cl_uint>(fmt.image_channel_order)};
          for (auto x : image_format) {
             auto arg = argument::create(marg);

             arg->set(sizeof(x), &x);
             arg->bind(*this, marg);
          }
          break;
       }
       case module::argument::constant_buffer: {
          // Uses the per-device buffer created in the kernel constructor;
          // at() throws if none exists for this device.
          auto arg = argument::create(marg);
          cl_mem buf = kern._constant_buffers.at(&q->device()).get();
          arg->set(q->device().address_bits() / 8, &buf);
          arg->bind(*this, marg);
          break;
       }
       }
    }

    // Create a new compute state if anything changed.
    if (!st || q != _q ||
        cs.req_local_mem != mem_local ||
        cs.req_input_mem != input.size()) {
       // The old state is deleted through the previously stored queue.
       if (st)
          _q->pipe->delete_compute_state(_q->pipe, st);

       cs.ir_type = q->device().ir_format();
       // NOTE(review): this points into the local msec, so cs.prog is
       // only valid until this function returns -- presumably the driver
       // does not keep the pointer; confirm.
       cs.prog = &(msec.data[0]);
       cs.req_local_mem = mem_local;
       cs.req_input_mem = input.size();
       st = q->pipe->create_compute_state(q->pipe, &cs);
       if (!st) {
          unbind(); // Cleanup
          throw error(CL_OUT_OF_RESOURCES);
       }
    }

    return st;
 }

 ///
 /// Release everything set up by bind(): every explicit argument is
 /// unbound, then all per-launch containers are emptied and the local
 /// memory counter is reset.
 ///
 void
 kernel::exec_context::unbind() {
    // Arguments go first; some of them need the queue and their saved
    // state to release what they bound.
    for (auto &arg : kern.args()) {
       arg.unbind(*this);
    }

    mem_local = 0;
    g_handles.clear();
    g_buffers.clear();
    resources.clear();
    iviews.clear();
    sviews.clear();
    samplers.clear();
    input.clear();
 }

 namespace {
    ///
    /// Return the object representation of \a x as a byte vector of
    /// sizeof(x) elements, in native byte order.
    ///
    template<typename T>
    std::vector<uint8_t>
    bytes(const T& x) {
       const uint8_t *first = reinterpret_cast<const uint8_t *>(&x);

       return std::vector<uint8_t>(first, first + sizeof(x));
    }

    ///
    /// Convert buffer \a v from the native byte order to byte order
    /// \a e.  The buffer is reversed in place when the two differ and
    /// left untouched otherwise.
    ///
    template<typename T>
    void
    byteswap(T &v, pipe_endian e) {
       if (e == PIPE_ENDIAN_NATIVE)
          return;

       std::reverse(v.begin(), v.end());
    }

    ///
    /// Resize buffer \a v to util_align_npot(v.size(), n), i.e. pad it
    /// up to the next multiple of \a n.
    ///
    template<typename T>
    void
    align(T &v, size_t n) {
       const auto padded_size = util_align_npot(v.size(), n);

       v.resize(padded_size);
    }

    ///
    /// Return whether the most significant bit of the native-endian
    /// value stored in \a s is set.  The most significant byte is the
    /// last one on little-endian hosts and the first one otherwise.
    ///
    /// An empty buffer has no sign bit and yields false.
    ///
    bool
    msb(const std::vector<uint8_t> &s) {
       // back()/front() on an empty vector is undefined behaviour.
       if (s.empty())
          return false;

       if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
          return s.back() & 0x80;
       else
          return s.front() & 0x80;
    }

    ///
    /// Resize buffer \a v to \a n bytes.  New bytes are filled with
    /// 0xff when \a ext requests sign extension and the value is
    /// negative, and with zero otherwise.  When \a n is smaller than
    /// the current size the low-order bytes are kept.
    ///
    template<typename T>
    void
    extend(T &v, enum module::argument::ext_type ext, size_t n) {
       // msb() is only consulted when sign extension was requested.
       const bool negative = (ext == module::argument::sign_ext) && msb(v);
       const uint8_t fill = negative ? 0xff : 0x00;
       const size_t kept = std::min(v.size(), n);
       T resized(n, fill);

       if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE) {
          // Low-order bytes come first.
          std::copy_n(v.begin(), kept, resized.begin());
       } else {
          // Low-order bytes come last.
          std::copy_n(v.end() - kept, kept, resized.end() - kept);
       }

       std::swap(v, resized);
    }

    ///
    /// Append the contents of buffer \a w to the end of buffer \a v.
    ///
    template<typename T>
    void
    insert(T &v, const T &w) {
       const auto tail = v.end();

       v.insert(tail, w.cbegin(), w.cend());
    }

    ///
    /// Grow buffer \a v by \a n elements and return the index at which
    /// the newly added elements start (the previous size of \a v).
    ///
    template<typename T>
    size_t
    allocate(T &v, size_t n) {
       const size_t first_new = v.size();

       v.resize(first_new + n);

       return first_new;
    }
 }

 ///
 /// Create the argument object matching the type of module argument
 /// \a marg.  Throws error(CL_INVALID_KERNEL_DEFINITION) when the type
 /// is not one of the handled ones.
 ///
 std::unique_ptr<kernel::argument>
 kernel::argument::create(const module::argument &marg) {
    std::unique_ptr<kernel::argument> arg;

    switch (marg.type) {
    case module::argument::scalar:
       arg.reset(new scalar_argument(marg.size));
       break;

    case module::argument::global:
       arg.reset(new global_argument);
       break;

    case module::argument::local:
       arg.reset(new local_argument);
       break;

    case module::argument::constant:
       arg.reset(new constant_argument);
       break;

    case module::argument::image2d_rd:
    case module::argument::image3d_rd:
       arg.reset(new image_rd_argument);
       break;

    case module::argument::image2d_wr:
    case module::argument::image3d_wr:
       arg.reset(new image_wr_argument);
       break;

    case module::argument::sampler:
       arg.reset(new sampler_argument);
       break;
    }

    // No case matched: the module describes an argument type we do not
    // know how to handle.
    if (!arg)
       throw error(CL_INVALID_KERNEL_DEFINITION);

    return arg;
 }

 ///
 /// A freshly created argument is marked as not set.
 ///
 kernel::argument::argument() : _set(false) {
 }

 ///
 /// Whether a value has been assigned to this argument.
 ///
 bool
 kernel::argument::set() const {
    return _set;
 }

 ///
 /// Storage reported by a generic argument: zero.  local_argument
 /// provides its own definition.
 ///
 size_t
 kernel::argument::storage() const {
    return 0;
 }

 ///
 /// Create a scalar argument expecting values of exactly \a size bytes
 /// (enforced in set()).
 ///
 kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
 }

 ///
 /// Store a copy of the \a size bytes pointed to by \a value.
 ///
 /// Throws error(CL_INVALID_ARG_VALUE) if \a value is null and
 /// error(CL_INVALID_ARG_SIZE) if \a size differs from the size this
 /// argument was created with.
 ///
 void
 kernel::scalar_argument::set(size_t size, const void *value) {
    if (value == nullptr)
       throw error(CL_INVALID_ARG_VALUE);

    if (this->size != size)
       throw error(CL_INVALID_ARG_SIZE);

    uint8_t *first = (uint8_t *)value;

    v = { first, first + size };
    _set = true;
 }

 ///
 /// Append the stored value to the kernel input buffer of \a ctx,
 /// extended to marg.target_size according to marg.ext_type, converted
 /// to the device byte order and placed at marg.target_align alignment.
 ///
 void
 kernel::scalar_argument::bind(exec_context &ctx,
                               const module::argument &marg) {
    // Work on a copy so the stored value can be bound again later.
    auto value = v;

    align(ctx.input, marg.target_align);

    extend(value, marg.ext_type, marg.target_size);
    byteswap(value, ctx.q->device().endianness());
    insert(ctx.input, value);
 }

 ///
 /// Nothing to release for a scalar argument.
 ///
 void
 kernel::scalar_argument::unbind(exec_context &ctx) {
 }

 ///
 /// Assign a buffer object.  \a value points to a cl_mem handle, or is
 /// null to denote a null pointer argument.  Any SVM pointer set earlier
 /// is discarded.
 ///
 /// Throws error(CL_INVALID_ARG_SIZE) unless \a size equals
 /// sizeof(cl_mem).
 ///
 void
 kernel::global_argument::set(size_t size, const void *value) {
    if (sizeof(cl_mem) != size)
       throw error(CL_INVALID_ARG_SIZE);

    svm = nullptr;
    buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
    _set = true;
 }

 ///
 /// Assign the raw pointer \a value as an SVM argument.  Any buffer
 /// object set earlier is discarded.
 ///
 void
 kernel::global_argument::set_svm(const void *value) {
    buf = nullptr;
    svm = value;
    _set = true;
 }

 ///
 /// Append this argument to the kernel input buffer of \a ctx.
 ///
 /// For a buffer object, the pipe resource is added to the global
 /// bindings, the input position of the argument is recorded as its
 /// handle, and the buffer offset is written as the initial value.  For
 /// an SVM pointer the pointer value itself is written.  Otherwise
 /// marg.target_size zero bytes are appended.
 ///
 void
 kernel::global_argument::bind(exec_context &ctx,
                               const module::argument &marg) {
    align(ctx.input, marg.target_align);

    if (!buf && !svm) {
       // Null pointer.
       allocate(ctx.input, marg.target_size);
       return;
    }

    std::vector<uint8_t> value;

    if (buf) {
       const resource &r = buf->resource_in(*ctx.q);

       // The handle is the position this argument is about to occupy.
       ctx.g_handles.push_back(ctx.input.size());
       ctx.g_buffers.push_back(r.pipe);

       // How to handle multi-dimensional offsets?
       // We don't need to.  Buffer offsets are always
       // one-dimensional.
       value = bytes(r.offset[0]);
    } else {
       value = bytes(svm);
    }

    extend(value, marg.ext_type, marg.target_size);
    byteswap(value, ctx.q->device().endianness());
    insert(ctx.input, value);
 }

 ///
 /// Nothing to release here; the global bindings are reset by
 /// kernel::launch() and cleared by exec_context::unbind().
 ///
 void
 kernel::global_argument::unbind(exec_context &ctx) {
 }

 ///
 /// Size in bytes recorded by the last call to set().
 ///
 size_t
 kernel::local_argument::storage() const {
    return _storage;
 }

 ///
 /// Record the amount of local memory requested for this argument.
 ///
 /// Throws error(CL_INVALID_ARG_VALUE) if \a value is non-null and
 /// error(CL_INVALID_ARG_SIZE) if \a size is zero.
 ///
 void
 kernel::local_argument::set(size_t size, const void *value) {
    // A local argument carries a size only, never a value.
    if (value != nullptr)
       throw error(CL_INVALID_ARG_VALUE);

    if (size == 0)
       throw error(CL_INVALID_ARG_SIZE);

    _set = true;
    _storage = size;
 }

 ///
 /// Append the current local memory offset of \a ctx to the kernel
 /// input buffer (zero-extended to marg.target_size), then advance
 /// that offset by the storage requested for this argument.
 ///
 void
 kernel::local_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
    align(ctx.input, marg.target_align);

    // The value passed to the kernel is the offset before this
    // argument's storage is accounted for.
    auto offset = bytes(ctx.mem_local);

    extend(offset, module::argument::zero_ext, marg.target_size);
    byteswap(offset, ctx.q->device().endianness());
    insert(ctx.input, offset);

    ctx.mem_local += _storage;
 }

 ///
 /// Nothing to release here; exec_context::unbind() resets the local
 /// memory counter.
 ///
 void
 kernel::local_argument::unbind(exec_context &ctx) {
 }

 ///
 /// Assign a buffer object.  \a value points to a cl_mem handle, or is
 /// null to denote a null pointer argument.
 ///
 /// Throws error(CL_INVALID_ARG_SIZE) unless \a size equals
 /// sizeof(cl_mem).
 ///
 void
 kernel::constant_argument::set(size_t size, const void *value) {
    if (sizeof(cl_mem) != size)
       throw error(CL_INVALID_ARG_SIZE);

    _set = true;
    buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
 }

 ///
 /// Append this argument to the kernel input buffer of \a ctx.
 ///
 /// For a buffer object the value written combines the index the
 /// surface will have in ctx.resources (shifted left by 24 bits) with
 /// the buffer offset; the surface is then bound and appended to
 /// ctx.resources.  Without a buffer, marg.target_size zero bytes are
 /// appended.
 ///
 void
 kernel::constant_argument::bind(exec_context &ctx,
                                 const module::argument &marg) {
    align(ctx.input, marg.target_align);

    if (!buf) {
       // Null pointer.
       allocate(ctx.input, marg.target_size);
       return;
    }

    resource &r = buf->resource_in(*ctx.q);
    // Computed before the surface is pushed, so it holds the index the
    // surface is about to get.
    const auto handle = ctx.resources.size() << 24 | r.offset[0];
    auto value = bytes(handle);

    extend(value, module::argument::zero_ext, marg.target_size);
    byteswap(value, ctx.q->device().endianness());
    insert(ctx.input, value);

    st = r.bind_surface(*ctx.q, false);
    ctx.resources.push_back(st);
 }

 ///
 /// Unbind the surface obtained in bind(), if a buffer is assigned.
 ///
 void
 kernel::constant_argument::unbind(exec_context &ctx) {
    if (!buf)
       return;

    auto &&res = buf->resource_in(*ctx.q);

    res.unbind_surface(*ctx.q, st);
 }

 void
 kernel::image_rd_argument::set(size_t size, const void *value) {
    if (!value)
       throw error(CL_INVALID_ARG_VALUE);

    if (size != sizeof(cl_mem))
       throw error(CL_INVALID_ARG_SIZE);

    img = &obj<image>(*(cl_mem *)value);
    _set = true;
 }

 ///
 /// Append the index the image's sampler view will have in ctx.sviews
 /// to the kernel input buffer, then bind the sampler view and append
 /// it to ctx.sviews.
 ///
 void
 kernel::image_rd_argument::bind(exec_context &ctx,
                                 const module::argument &marg) {
    align(ctx.input, marg.target_align);

    // Taken before the push below, so this is the slot the view gets.
    auto slot = bytes(ctx.sviews.size());

    extend(slot, module::argument::zero_ext, marg.target_size);
    byteswap(slot, ctx.q->device().endianness());
    insert(ctx.input, slot);

    auto &&res = img->resource_in(*ctx.q);

    st = res.bind_sampler_view(*ctx.q);
    ctx.sviews.push_back(st);
 }

 ///
 /// Unbind the sampler view obtained in bind().
 ///
 void
 kernel::image_rd_argument::unbind(exec_context &ctx) {
    auto &&res = img->resource_in(*ctx.q);

    res.unbind_sampler_view(*ctx.q, st);
 }

 void
 kernel::image_wr_argument::set(size_t size, const void *value) {
    if (!value)
       throw error(CL_INVALID_ARG_VALUE);

    if (size != sizeof(cl_mem))
       throw error(CL_INVALID_ARG_SIZE);

    img = &obj<image>(*(cl_mem *)value);
    _set = true;
 }

 ///
 /// Append the index the image view will have in ctx.iviews to the
 /// kernel input buffer, then create the image view and append it to
 /// ctx.iviews.
 ///
 void
 kernel::image_wr_argument::bind(exec_context &ctx,
                                 const module::argument &marg) {
    align(ctx.input, marg.target_align);

    // Taken before the push below, so this is the slot the view gets.
    auto slot = bytes(ctx.iviews.size());

    extend(slot, module::argument::zero_ext, marg.target_size);
    byteswap(slot, ctx.q->device().endianness());
    insert(ctx.input, slot);

    auto &&res = img->resource_in(*ctx.q);

    ctx.iviews.push_back(res.create_image_view(*ctx.q));
 }

 ///
 /// Nothing to release here; exec_context::unbind() clears the image
 /// view list.
 ///
 void
 kernel::image_wr_argument::unbind(exec_context &ctx) {
 }

 void
 kernel::sampler_argument::set(size_t size, const void *value) {
    if (!value)
       throw error(CL_INVALID_SAMPLER);

    if (size != sizeof(cl_sampler))
       throw error(CL_INVALID_ARG_SIZE);

    s = &obj(*(cl_sampler *)value);
    _set = true;
 }

 ///
 /// Bind the sampler on the queue of \a ctx and append the resulting
 /// state to ctx.samplers.  Nothing is written to the kernel input
 /// buffer.
 ///
 void
 kernel::sampler_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
    // Keep the state so unbind() can release it.
    st = s->bind(*ctx.q);
    ctx.samplers.push_back(st);
 }

 ///
 /// Release the sampler state obtained in bind().
 ///
 void
 kernel::sampler_argument::unbind(exec_context &ctx) {
    s->unbind(*ctx.q, st);
 }
	//
	// Copyright 2012 Francisco Jerez
	//
	// Permission is hereby granted, free of charge, to any person obtaining a
	// copy of this software and associated documentation files (the "Software"),
	// to deal in the Software without restriction, including without limitation
	// the rights to use, copy, modify, merge, publish, distribute, sublicense,
	// and/or sell copies of the Software, and to permit persons to whom the
	// Software is furnished to do so, subject to the following conditions:
	//
	// The above copyright notice and this permission notice shall be included in
	// all copies or substantial portions of the Software.
	//
	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
	// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
	// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
	// OTHER DEALINGS IN THE SOFTWARE.
	//

	#include "core/kernel.hpp"
	#include "core/resource.hpp"
	#include "util/factor.hpp"
	#include "util/u_math.h"
	#include "pipe/p_context.h"

	using namespace clover;

	///
	/// Create the kernel \a name of program \a prog.
	///
	/// One explicit argument object is created for every entry of \a margs
	/// whose semantic is "general".  In addition, for each device of \a prog
	/// whose binary contains a data_constant section with the id referenced
	/// by the kernel symbol, a read-only root_buffer holding a copy of that
	/// section is stored in _constant_buffers.
	///
	kernel::kernel(clover::program &prog, const std::string &name,
	               const std::vector<module::argument> &margs) :
	   program(prog), _name(name), exec(*this),
	   program_ref(prog._kernel_ref_counter) {
	   for (auto &marg : margs) {
	      if (marg.semantic == module::argument::general)
	         _args.emplace_back(argument::create(marg));
	   }
	   for (auto &dev : prog.devices()) {
	      auto &m = prog.build(dev).binary;
	      auto msym = find(name_equals(name), m.syms);
	      const auto f = id_type_equals(msym.section, module::section::data_constant);
	      if (!any_of(f, m.secs))
	         continue;

	      auto mconst = find(f, m.secs);
	      // The flags are combined with a bitwise OR.
	      auto rb = std::make_unique<root_buffer>(prog.context(),
	                                              CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
	                                              mconst.size, mconst.data.data());
	      _constant_buffers.emplace(&dev, std::move(rb));
	   }
	}

	template<typename V>
	static inline std::vector<uint>
	pad_vector(command_queue &q, const V &v, uint x) {
	std::vector<uint> w { v.begin(), v.end() };
	w.resize(q.device().max_block_size().size(), x);
	return w;
	}

	void
	kernel::launch(command_queue &q,
	const std::vector<size_t> &grid_offset,
	const std::vector<size_t> &grid_size,
	const std::vector<size_t> &block_size) {
	const auto m = program().build(q.device()).binary;
	const auto reduced_grid_size =
	map(divides(), grid_size, block_size);
	void *st = exec.bind(&q, grid_offset);
	struct pipe_grid_info info = {};

	// The handles are created during exec_context::bind(), so we need make
	// sure to call exec_context::bind() before retrieving them.
	std::vector<uint32_t *> g_handles = map([&](size_t h) {
	return (uint32_t *)&exec.input[h];
	}, exec.g_handles);

	q.pipe->bind_compute_state(q.pipe, st);
	q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
	0, exec.samplers.size(),
	exec.samplers.data());

	q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
	exec.sviews.size(), exec.sviews.data());
	q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
	exec.iviews.size(), exec.iviews.data());
	q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
	exec.resources.data());
	q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
	exec.g_buffers.data(), g_handles.data());

	// Fill information for the launch_grid() call.
	info.work_dim = grid_size.size();
	copy(pad_vector(q, block_size, 1), info.block);
	copy(pad_vector(q, reduced_grid_size, 1), info.grid);
	info.pc = find(name_equals(_name), m.syms).offset;
	info.input = exec.input.data();

	q.pipe->launch_grid(q.pipe, &info);

	q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
	q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
	q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
	exec.iviews.size(), NULL);
	q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
	exec.sviews.size(), NULL);
	q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
	exec.samplers.size(), NULL);

	q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
	exec.unbind();
	}

	size_t
	kernel::mem_local() const {
	size_t sz = 0;

	for (auto &arg : args()) {
	if (dynamic_cast<local_argument *>(&arg))
	sz += arg.storage();
	}

	return sz;
	}

	size_t
	kernel::mem_private() const {
	return 0;
	}

	const std::string &
	kernel::name() const {
	return _name;
	}

	std::vector<size_t>
	kernel::optimal_block_size(const command_queue &q,
	const std::vector<size_t> &grid_size) const {
	return factor::find_grid_optimal_factor<size_t>(
	q.device().max_threads_per_block(), q.device().max_block_size(),
	grid_size);
	}

	std::vector<size_t>
	kernel::required_block_size() const {
	return find(name_equals(_name), program().symbols()).reqd_work_group_size;
	}

	kernel::argument_range
	kernel::args() {
	return map(derefs(), _args);
	}

	kernel::const_argument_range
	kernel::args() const {
	return map(derefs(), _args);
	}

	std::vector<clover::module::arg_info>
	kernel::args_infos() {
	std::vector<clover::module::arg_info> infos;
	for (auto &marg: find(name_equals(_name), program().symbols()).args)
	if (marg.semantic == clover::module::argument::general)
	infos.emplace_back(marg.info);

	return infos;
	}

	const module &
	kernel::module(const command_queue &q) const {
	return program().build(q.device()).binary;
	}

	kernel::exec_context::exec_context(kernel &kern) :
	kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
	}

	kernel::exec_context::~exec_context() {
	if (st)
	q->pipe->delete_compute_state(q->pipe, st);
	}

	///
	/// Bind all kernel arguments for execution on queue \a _q and return
	/// the compute state to use, creating a new one if required.
	///
	/// Explicit arguments are taken in order from kern._args; implicit ones
	/// (grid dimension, grid offset, image size/format, constant buffer)
	/// are created on the fly from the module argument description.
	///
	/// Throws error(CL_OUT_OF_RESOURCES) if the driver fails to create the
	/// compute state; the bound arguments are released first in that case.
	///
	void *
	kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
	                           const std::vector<size_t> &grid_offset) {
	   // After the swap, q is the queue being bound and _q holds whichever
	   // queue was stored before (null on first use, see the constructor).
	   std::swap(q, _q);

	   // Bind kernel arguments.
	   auto &m = kern.program().build(q->device()).binary;
	   auto msym = find(name_equals(kern.name()), m.syms);
	   auto margs = msym.args;
	   auto msec = find(id_type_equals(msym.section, module::section::text_executable), m.secs);
	   auto explicit_arg = kern._args.begin();

	   for (auto &marg : margs) {
	      switch (marg.semantic) {
	      case module::argument::general:
	         // Dereference the iterator to reach the owning pointer, and
	         // pass this context by reference.
	         (*(explicit_arg++))->bind(*this, marg);
	         break;

	      case module::argument::grid_dimension: {
	         const cl_uint dimension = grid_offset.size();
	         auto arg = argument::create(marg);

	         arg->set(sizeof(dimension), &dimension);
	         arg->bind(*this, marg);
	         break;
	      }
	      case module::argument::grid_offset: {
	         for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
	            auto arg = argument::create(marg);

	            arg->set(sizeof(x), &x);
	            arg->bind(*this, marg);
	         }
	         break;
	      }
	      case module::argument::image_size: {
	         // Refers to the explicit argument bound most recently, which
	         // must be an image_argument (dynamic_cast throws otherwise).
	         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
	         std::vector<cl_uint> image_size{
	               static_cast<cl_uint>(img->width()),
	               static_cast<cl_uint>(img->height()),
	               static_cast<cl_uint>(img->depth())};
	         for (auto x : image_size) {
	            auto arg = argument::create(marg);

	            arg->set(sizeof(x), &x);
	            arg->bind(*this, marg);
	         }
	         break;
	      }
	      case module::argument::image_format: {
	         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
	         cl_image_format fmt = img->format();
	         std::vector<cl_uint> image_format{
	               static_cast<cl_uint>(fmt.image_channel_data_type),
	               static_cast<cl_uint>(fmt.image_channel_order)};
	         for (auto x : image_format) {
	            auto arg = argument::create(marg);

	            arg->set(sizeof(x), &x);
	            arg->bind(*this, marg);
	         }
	         break;
	      }
	      case module::argument::constant_buffer: {
	         auto arg = argument::create(marg);
	         cl_mem buf = kern._constant_buffers.at(&q->device()).get();
	         arg->set(q->device().address_bits() / 8, &buf);
	         arg->bind(*this, marg);
	         break;
	      }
	      }
	   }

	   // Create a new compute state if anything changed.
	   if (!st || q != _q ||
	       cs.req_local_mem != mem_local ||
	       cs.req_input_mem != input.size()) {
	      // The old state is deleted through the previously stored queue.
	      if (st)
	         _q->pipe->delete_compute_state(_q->pipe, st);

	      cs.ir_type = q->device().ir_format();
	      cs.prog = &(msec.data[0]);
	      cs.req_local_mem = mem_local;
	      cs.req_input_mem = input.size();
	      st = q->pipe->create_compute_state(q->pipe, &cs);
	      if (!st) {
	         unbind(); // Cleanup
	         throw error(CL_OUT_OF_RESOURCES);
	      }
	   }

	   return st;
	}

	void
	kernel::exec_context::unbind() {
	for (auto &arg : kern.args())
	arg.unbind(*this);

	input.clear();
	samplers.clear();
	sviews.clear();
	iviews.clear();
	resources.clear();
	g_buffers.clear();
	g_handles.clear();
	mem_local = 0;
	}

	namespace {
	///
	/// Return the object representation of \a x as a byte vector of
	/// sizeof(x) elements, in native byte order.
	///
	template<typename T>
	std::vector<uint8_t>
	bytes(const T& x) {
	   // The casts must yield byte pointers, not byte values.
	   return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
	}

	///
	/// Transform buffer \a v from the native byte order into the byte
	/// order specified by \a e.
	///
	template<typename T>
	void
	byteswap(T &v, pipe_endian e) {
	if (PIPE_ENDIAN_NATIVE != e)
	std::reverse(v.begin(), v.end());
	}

	///
	/// Pad buffer \a v to the next multiple of \a n.
	///
	template<typename T>
	void
	align(T &v, size_t n) {
	v.resize(util_align_npot(v.size(), n));
	}

	bool
	msb(const std::vector<uint8_t> &s) {
	if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
	return s.back() & 0x80;
	else
	return s.front() & 0x80;
	}

	///
	/// Resize buffer \a v to size \a n using sign or zero extension
	/// according to \a ext.
	///
	template<typename T>
	void
	extend(T &v, enum module::argument::ext_type ext, size_t n) {
	const size_t m = std::min(v.size(), n);
	const bool sign_ext = (ext == module::argument::sign_ext);
	const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
	T w(n, fill);

	if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
	std::copy_n(v.begin(), m, w.begin());
	else
	std::copy_n(v.end() - m, m, w.end() - m);

	std::swap(v, w);
	}

	///
	/// Append buffer \a w to \a v.
	///
	template<typename T>
	void
	insert(T &v, const T &w) {
	v.insert(v.end(), w.begin(), w.end());
	}

	///
	/// Append \a n elements to the end of buffer \a v.
	///
	template<typename T>
	size_t
	allocate(T &v, size_t n) {
	size_t pos = v.size();
	v.resize(pos + n);
	return pos;
	}
	}

	std::unique_ptr<kernel::argument>
	kernel::argument::create(const module::argument &marg) {
	switch (marg.type) {
	case module::argument::scalar:
	return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));

	case module::argument::global:
	return std::unique_ptr<kernel::argument>(new global_argument);

	case module::argument::local:
	return std::unique_ptr<kernel::argument>(new local_argument);

	case module::argument::constant:
	return std::unique_ptr<kernel::argument>(new constant_argument);

	case module::argument::image2d_rd:
	case module::argument::image3d_rd:
	return std::unique_ptr<kernel::argument>(new image_rd_argument);

	case module::argument::image2d_wr:
	case module::argument::image3d_wr:
	return std::unique_ptr<kernel::argument>(new image_wr_argument);

	case module::argument::sampler:
	return std::unique_ptr<kernel::argument>(new sampler_argument);

	}
	throw error(CL_INVALID_KERNEL_DEFINITION);
	}

	kernel::argument::argument() : _set(false) {
	}

	bool
	kernel::argument::set() const {
	return _set;
	}

	size_t
	kernel::argument::storage() const {
	return 0;
	}

	kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
	}

	///
	/// Store a copy of the \a size bytes pointed to by \a value.
	///
	/// Throws error(CL_INVALID_ARG_VALUE) if \a value is null and
	/// error(CL_INVALID_ARG_SIZE) if \a size differs from the size this
	/// argument was created with.
	///
	void
	kernel::scalar_argument::set(size_t size, const void *value) {
	   if (!value)
	      throw error(CL_INVALID_ARG_VALUE);

	   if (size != this->size)
	      throw error(CL_INVALID_ARG_SIZE);

	   // Copy the byte range [value, value + size).
	   v = { (uint8_t *)value, (uint8_t *)value + size };
	   _set = true;
	}

	void
	kernel::scalar_argument::bind(exec_context &ctx,
	const module::argument &marg) {
	auto w = v;

	extend(w, marg.ext_type, marg.target_size);
	byteswap(w, ctx.q->device().endianness());
	align(ctx.input, marg.target_align);
	insert(ctx.input, w);
	}

	void
	kernel::scalar_argument::unbind(exec_context &ctx) {
	}

	void
	kernel::global_argument::set(size_t size, const void *value) {
	if (size != sizeof(cl_mem))
	throw error(CL_INVALID_ARG_SIZE);

	buf = pobj<buffer>(value ? (cl_mem )value : NULL);
	svm = nullptr;
	_set = true;
	}

	void
	kernel::global_argument::set_svm(const void *value) {
	svm = value;
	buf = nullptr;
	_set = true;
	}

	void
	kernel::global_argument::bind(exec_context &ctx,
	const module::argument &marg) {
	align(ctx.input, marg.target_align);

	if (buf) {
	const resource &r = buf->resource_in(*ctx.q);
	ctx.g_handles.push_back(ctx.input.size());
	ctx.g_buffers.push_back(r.pipe);

	// How to handle multi-demensional offsets?
	// We don't need to. Buffer offsets are always
	// one-dimensional.
	auto v = bytes(r.offset[0]);
	extend(v, marg.ext_type, marg.target_size);
	byteswap(v, ctx.q->device().endianness());
	insert(ctx.input, v);
	} else if (svm) {
	auto v = bytes(svm);
	extend(v, marg.ext_type, marg.target_size);
	byteswap(v, ctx.q->device().endianness());
	insert(ctx.input, v);
	} else {
	// Null pointer.
	allocate(ctx.input, marg.target_size);
	}
	}

	void
	kernel::global_argument::unbind(exec_context &ctx) {
	}

	size_t
	kernel::local_argument::storage() const {
	return _storage;
	}

	void
	kernel::local_argument::set(size_t size, const void *value) {
	if (value)
	throw error(CL_INVALID_ARG_VALUE);

	if (!size)
	throw error(CL_INVALID_ARG_SIZE);

	_storage = size;
	_set = true;
	}

	void
	kernel::local_argument::bind(exec_context &ctx,
	const module::argument &marg) {
	auto v = bytes(ctx.mem_local);

	extend(v, module::argument::zero_ext, marg.target_size);
	byteswap(v, ctx.q->device().endianness());
	align(ctx.input, marg.target_align);
	insert(ctx.input, v);

	ctx.mem_local += _storage;
	}

	void
	kernel::local_argument::unbind(exec_context &ctx) {
	}

	void
	kernel::constant_argument::set(size_t size, const void *value) {
	if (size != sizeof(cl_mem))
	throw error(CL_INVALID_ARG_SIZE);

	buf = pobj<buffer>(value ? (cl_mem )value : NULL);
	_set = true;
	}

	///
	/// Append this argument to the kernel input buffer of \a ctx.
	///
	/// For a buffer object the value written combines the index the
	/// surface will have in ctx.resources (shifted left by 24 bits) with
	/// the buffer offset; the surface is then bound and appended to
	/// ctx.resources.  Without a buffer, marg.target_size zero bytes are
	/// appended.
	///
	void
	kernel::constant_argument::bind(exec_context &ctx,
	                                const module::argument &marg) {
	   align(ctx.input, marg.target_align);

	   if (buf) {
	      resource &r = buf->resource_in(*ctx.q);
	      // Resource index and offset are combined with a bitwise OR.
	      auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);

	      extend(v, module::argument::zero_ext, marg.target_size);
	      byteswap(v, ctx.q->device().endianness());
	      insert(ctx.input, v);

	      st = r.bind_surface(*ctx.q, false);
	      ctx.resources.push_back(st);
	   } else {
	      // Null pointer.
	      allocate(ctx.input, marg.target_size);
	   }
	}

	///
	/// Unbind the surface obtained in bind(), if a buffer is assigned.
	///
	void
	kernel::constant_argument::unbind(exec_context &ctx) {
	   // ctx.q is a pointer-like handle; the resource calls take the
	   // queue itself.
	   if (buf)
	      buf->resource_in(*ctx.q).unbind_surface(*ctx.q, st);
	}

	void
	kernel::image_rd_argument::set(size_t size, const void *value) {
	if (!value)
	throw error(CL_INVALID_ARG_VALUE);

	if (size != sizeof(cl_mem))
	throw error(CL_INVALID_ARG_SIZE);

	img = &obj<image>((cl_mem )value);
	_set = true;
	}

	///
	/// Append the index the image's sampler view will have in ctx.sviews
	/// to the kernel input buffer, then bind the sampler view and append
	/// it to ctx.sviews.
	///
	void
	kernel::image_rd_argument::bind(exec_context &ctx,
	                                const module::argument &marg) {
	   auto v = bytes(ctx.sviews.size());

	   extend(v, module::argument::zero_ext, marg.target_size);
	   byteswap(v, ctx.q->device().endianness());
	   align(ctx.input, marg.target_align);
	   insert(ctx.input, v);

	   // The resource calls take the queue itself, not the handle to it.
	   st = img->resource_in(*ctx.q).bind_sampler_view(*ctx.q);
	   ctx.sviews.push_back(st);
	}

	///
	/// Unbind the sampler view obtained in bind().
	///
	void
	kernel::image_rd_argument::unbind(exec_context &ctx) {
	   // The resource calls take the queue itself, not the handle to it.
	   img->resource_in(*ctx.q).unbind_sampler_view(*ctx.q, st);
	}

	void
	kernel::image_wr_argument::set(size_t size, const void *value) {
	if (!value)
	throw error(CL_INVALID_ARG_VALUE);

	if (size != sizeof(cl_mem))
	throw error(CL_INVALID_ARG_SIZE);

	img = &obj<image>((cl_mem )value);
	_set = true;
	}

	///
	/// Append the index the image view will have in ctx.iviews to the
	/// kernel input buffer, then create the image view and append it to
	/// ctx.iviews.
	///
	void
	kernel::image_wr_argument::bind(exec_context &ctx,
	                                const module::argument &marg) {
	   auto v = bytes(ctx.iviews.size());

	   extend(v, module::argument::zero_ext, marg.target_size);
	   byteswap(v, ctx.q->device().endianness());
	   align(ctx.input, marg.target_align);
	   insert(ctx.input, v);
	   // The resource calls take the queue itself, not the handle to it.
	   ctx.iviews.push_back(img->resource_in(*ctx.q).create_image_view(*ctx.q));
	}

	void
	kernel::image_wr_argument::unbind(exec_context &ctx) {
	}

	void
	kernel::sampler_argument::set(size_t size, const void *value) {
	if (!value)
	throw error(CL_INVALID_SAMPLER);

	if (size != sizeof(cl_sampler))
	throw error(CL_INVALID_ARG_SIZE);

	s = &obj((cl_sampler )value);
	_set = true;
	}

	void
	kernel::sampler_argument::bind(exec_context &ctx,
	const module::argument &marg) {
	st = s->bind(*ctx.q);
	ctx.samplers.push_back(st);
	}

	void
	kernel::sampler_argument::unbind(exec_context &ctx) {
	s->unbind(*ctx.q, st);
	}