| /* |
| * Copyright (C) 2019 Collabora, Ltd. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| * |
| */ |
| |
| #include "pan_context.h" |
| #include "pan_job.h" |
| #include "pan_allocate.h" |
| #include "util/bitset.h" |
| |
| /* |
| * Within a batch (panfrost_job), there are various types of Mali jobs: |
| * |
| * - SET_VALUE: initializes tiler |
| * - VERTEX: runs a vertex shader |
| * - TILER: runs tiling and sets up a fragment shader |
 * - FRAGMENT: runs fragment shaders and writes out the framebuffer
| * - COMPUTE: runs a compute shader |
| * - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost) |
| * - GEOMETRY: runs a geometry shader (unimplemented) |
| * - CACHE_FLUSH: unseen in the wild, theoretically cache flush |
| * |
| * In between a full batch and a single Mali job is the "job chain", a series |
| * of Mali jobs together forming a linked list. Within the job chain, each Mali |
| * job can set (up to) two dependencies on other earlier jobs in the chain. |
| * This dependency graph forms a scoreboard. The general idea of a scoreboard |
| * applies: when there is a data dependency of job B on job A, job B sets one |
| * of its dependency indices to job A, ensuring that job B won't start until |
| * job A finishes. |
| * |
| * More specifically, here are a set of rules: |
| * |
| * - A set value job must appear if and only if there is at least one tiler job. |
| * |
| * - Vertex jobs and tiler jobs are independent. |
| * |
 * - A tiler job must have a dependency on its data source. If it's getting
 *   data from a vertex job, it depends on the vertex job. If it's getting its
 *   data directly from software, the dependency slot is left null (zero).
| * |
 * - The first tiler job must depend on the set value job, if it is present.
| * |
| * - Tiler jobs must be strictly ordered. So each tiler job must depend on the |
| * previous job in the chain. |
| * |
 * - Linking jobs via the next_job field has no bearing on the order of
 *   execution; it merely establishes the linked list of jobs, EXCEPT:
| * |
| * - A job's dependencies must appear earlier in the linked list (job chain). |
| * |
| * Justification for each rule: |
| * |
 * - Set value jobs set up tiling, essentially. If tiling occurs, they are
 *   needed; if it does not, we must not emit them, since tiling would then be
 *   partially initialized with nothing to consume it.
| * |
 * - The hardware has no notion of a combined "vertex/tiler job" -- at least
 *   not on Midgard; Bifrost introduces fused jobs, which are handled
 *   separately. Vertex and tiler jobs are independent units that take in
 *   data, process it, and spit out data.
| * |
| * - Any job must depend on its data source, in fact, or risk a |
| * read-before-write hazard. Tiler jobs get their data from vertex jobs, ergo |
| * tiler jobs depend on the corresponding vertex job (if it's there). |
| * |
 * - In fact, all tiling depends on the set value job. Since tiler jobs also
 *   depend on each other in order, making the first tiler job depend on the
 *   set value job ensures every tiler job transitively depends on it.
| * |
| * - The tiler is not thread-safe; this dependency prevents race conditions |
| * between two different jobs trying to write to the tiler outputs at the |
| * same time. |
| * |
 * - Internally, jobs are scoreboarded; the next_job fields just form a linked
 *   list to allow the jobs to be read in; the execution order comes from
 *   resolving the dependency fields instead.
| * |
| * - The hardware cannot set a dependency on a job it doesn't know about yet, |
| * and dependencies are processed in-order of the next job fields. |
| * |
| */ |
| |
| /* Accessor to set the next job field */ |
| |
static void
panfrost_set_job_next(struct mali_job_descriptor_header *first, mali_ptr next)
{
        /* mali_ptr is always 64-bit; casting through uintptr_t would truncate
         * GPU addresses on 32-bit hosts */
        if (first->job_descriptor_size)
                first->next_job_64 = (u64) next;
        else
                first->next_job_32 = (u32) next;
}
| |
| /* Coerce a panfrost_transfer to a header */ |
| |
| static inline struct mali_job_descriptor_header * |
| job_descriptor_header(struct panfrost_transfer t) |
| { |
| return (struct mali_job_descriptor_header *) t.cpu; |
| } |
| |
static void
panfrost_assign_index(
                struct panfrost_job *job,
                struct panfrost_transfer transfer)
{
        /* Assign a 1-indexed job index; index 0 is reserved to mean "no
         * dependency" in the dependency slots */
        unsigned index = ++job->job_index;
        job_descriptor_header(transfer)->job_index = index;
}
| |
/* Helper to add a dependency between two jobs: the depender is made to wait
 * on the dependent */

static void
panfrost_add_dependency(
                struct panfrost_transfer depender,
                struct panfrost_transfer dependent)
{
        struct mali_job_descriptor_header *first =
                job_descriptor_header(dependent);

        struct mali_job_descriptor_header *second =
                job_descriptor_header(depender);

        /* Ensure both jobs have been assigned indices */
        assert(second->job_index);
        assert(first->job_index);

        /* Look for an open dependency slot on the depender */

        if (!second->job_dependency_index_1)
                second->job_dependency_index_1 = first->job_index;
        else if (!second->job_dependency_index_2)
                second->job_dependency_index_2 = first->job_index;
        else
                unreachable("No available slot for new dependency");
}
| |
| /* Queues a job WITHOUT updating pointers. Be careful. */ |
| |
| static void |
| panfrost_scoreboard_queue_job_internal( |
| struct panfrost_job *batch, |
| struct panfrost_transfer job) |
| { |
| panfrost_assign_index(batch, job); |
| |
| /* Queue a pointer to the job */ |
| util_dynarray_append(&batch->headers, void*, job.cpu); |
| util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu); |
| } |
| |
/* Queues a compute job, with no special dependencies. This is a bit of a
 * misnomer -- internally, all job types are queued with this function; outside
 * of this file, it's used only for pure compute jobs */
| |
| void |
| panfrost_scoreboard_queue_compute_job( |
| struct panfrost_job *batch, |
| struct panfrost_transfer job) |
| { |
| panfrost_scoreboard_queue_job_internal(batch, job); |
| |
| /* Update the linked list metadata as appropriate */ |
| batch->last_job = job; |
| |
| if (!batch->first_job.gpu) |
| batch->first_job = job; |
| } |
| |
/* Queues a vertex job. There are no special dependencies yet, but if
 * tiling is required (any time rasterizer discard is disabled), we have
 * some extra bookkeeping for later */
| |
| void |
| panfrost_scoreboard_queue_vertex_job( |
| struct panfrost_job *batch, |
| struct panfrost_transfer vertex, |
| bool requires_tiling) |
| { |
| panfrost_scoreboard_queue_compute_job(batch, vertex); |
| |
| if (requires_tiling && !batch->first_vertex_for_tiler.gpu) |
| batch->first_vertex_for_tiler = vertex; |
| } |
| |
| /* Queues a tiler job, respecting the dependency of each tiler job on the |
| * previous */ |
| |
| void |
| panfrost_scoreboard_queue_tiler_job( |
| struct panfrost_job *batch, |
| struct panfrost_transfer tiler) |
| { |
| panfrost_scoreboard_queue_compute_job(batch, tiler); |
| |
| if (!batch->first_tiler.gpu) |
| batch->first_tiler = tiler; |
| |
| if (batch->last_tiler.gpu) |
| panfrost_add_dependency(tiler, batch->last_tiler); |
| |
| batch->last_tiler = tiler; |
| } |
| |
| /* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if |
| * fused jobs are not supported (the default until Bifrost rolls out) */ |
| |
| void |
| panfrost_scoreboard_queue_fused_job( |
| struct panfrost_job *batch, |
| struct panfrost_transfer vertex, |
| struct panfrost_transfer tiler) |
| { |
| panfrost_scoreboard_queue_vertex_job(batch, vertex, true); |
| panfrost_scoreboard_queue_tiler_job(batch, tiler); |
| panfrost_add_dependency(tiler, vertex); |
| } |
| |
| /* Queues a fused (vertex/tiler) job prepended *before* the usual set, used for |
| * wallpaper blits */ |
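
/* For example, if the chain previously contained tiler jobs T1 and T2 (with
 * T2 depending on T1), prepending a wallpaper pair (V0, T0) makes T0 depend
 * on V0 and T1 depend on T0, so the effective tiler order becomes T0, T1,
 * T2. */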
| |
| void |
| panfrost_scoreboard_queue_fused_job_prepend( |
| struct panfrost_job *batch, |
| struct panfrost_transfer vertex, |
| struct panfrost_transfer tiler) |
| { |
| /* Sanity check */ |
| assert(batch->last_tiler.gpu); |
| assert(batch->first_tiler.gpu); |
| |
| /* First, we add the vertex job directly to the queue, forcing it to |
| * the front */ |
| |
| panfrost_scoreboard_queue_job_internal(batch, vertex); |
| batch->first_job = vertex; |
| batch->first_vertex_for_tiler = vertex; |
| |
        /* Similarly, we add the tiler job directly to the queue, forcing it
         * to the front (second place). We manually set its dependency on the
         * vertex job (since the pair is pseudo-fused), and force the
         * now-second tiler job to depend on us, since all tiler jobs are
         * linked in order and we're injecting ourselves at the front */
| |
| panfrost_scoreboard_queue_job_internal(batch, tiler); |
| panfrost_add_dependency(tiler, vertex); |
| panfrost_add_dependency(batch->first_tiler, tiler); |
| batch->first_tiler = tiler; |
| } |
| |
| /* Generates a set value job, used below as part of TILER job scheduling. */ |
| |
| static struct panfrost_transfer |
| panfrost_set_value_job(struct panfrost_context *ctx, mali_ptr polygon_list) |
| { |
| struct mali_job_descriptor_header job = { |
| .job_type = JOB_TYPE_SET_VALUE, |
| .job_descriptor_size = 1, |
| }; |
| |
| struct mali_payload_set_value payload = { |
| .out = polygon_list, |
| .unknown = 0x3, |
| }; |
| |
        /* The payload immediately follows the job header in memory */
        struct panfrost_transfer transfer =
                panfrost_allocate_transient(ctx, sizeof(job) + sizeof(payload));
| memcpy(transfer.cpu, &job, sizeof(job)); |
| memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload)); |
| |
| return transfer; |
| } |
| |
/* If there are any tiler jobs, there needs to be a corresponding set value
 * job, which the first tiler job in the batch depends on. */
| |
| static void |
| panfrost_scoreboard_set_value(struct panfrost_job *batch) |
| { |
| /* Check if we even need tiling */ |
| if (!batch->last_tiler.gpu) |
| return; |
| |
| /* Okay, we do. Let's generate it */ |
| |
| struct panfrost_context *ctx = batch->ctx; |
| mali_ptr polygon_list = ctx->tiler_polygon_list.bo->gpu; |
| |
| struct panfrost_transfer job = |
| panfrost_set_value_job(ctx, polygon_list); |
| |
| /* Queue it */ |
| panfrost_scoreboard_queue_compute_job(batch, job); |
| |
| /* Tiler jobs need us */ |
| panfrost_add_dependency(batch->first_tiler, job); |
| } |
| |
/* Once all jobs have been added to a batch and we're ready to submit, we need
 * to order them to set each of the next_job fields, obeying the golden rule:
 * "A job's dependencies must appear earlier in the job chain than itself".
 * Fortunately, computing this job chain is a well-studied graph theory problem
 * known as "topological sorting", which admits linear-time algorithms. We let
 * each job be a node and each dependency a directed edge, so the entire set of
 * jobs forms a dependency graph. This graph is inherently acyclic, as
 * otherwise there would be unresolvable dependencies.
 *
 * We implement Kahn's algorithm here to compute the next_job chain:
 * https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
 *
 * A few implementation notes: we represent S explicitly with a bitset, and L
 * implicitly in the next_job fields. The indices of the bitset are off-by-one:
 * nodes are numbered [0, node_count - 1], whereas job_index in the hardware
 * and the dependency fields are numbered [1, node_count].
 *
 * We represent edge removal implicitly with another pair of bitsets, rather
 * than actually removing the edges, since the hardware still needs the
 * dependency fields intact.
 */
| |
#define DESCRIPTOR_FOR_NODE(idx) \
        *(util_dynarray_element(&batch->headers, \
                struct mali_job_descriptor_header*, idx))

#define GPU_ADDRESS_FOR_NODE(idx) \
        *(util_dynarray_element(&batch->gpu_headers, \
                mali_ptr, idx))
| |
| void |
| panfrost_scoreboard_link_batch(struct panfrost_job *batch) |
| { |
| /* Finalize the batch */ |
| panfrost_scoreboard_set_value(batch); |
| |
| /* Let no_incoming represent the set S described. */ |
| |
| unsigned node_count = batch->job_index; |
| |
| size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD); |
| BITSET_WORD *no_incoming = calloc(sz, 1); |
| |
| /* Sets for edges being removed in dep 1 or 2 respectively */ |
| |
| BITSET_WORD *edge_removal_1 = calloc(sz, 1); |
| BITSET_WORD *edge_removal_2 = calloc(sz, 1); |
| |
| /* We compute no_incoming by traversing the batch. */ |
| |
| for (unsigned i = 0; i < node_count; ++i) { |
| struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i); |
| |
| unsigned dep_1 = node->job_dependency_index_1; |
| unsigned dep_2 = node->job_dependency_index_2; |
| |
| if (!(dep_1 || dep_2)) |
| BITSET_SET(no_incoming, i); |
| } |
| |
        /* No next_job fields are set at the beginning, so L is implicitly the
         * empty set. As next_job fields are filled in, jobs are implicitly
         * appended to L; we only track its tail explicitly. */
| |
| struct mali_job_descriptor_header *tail = NULL; |
| |
| /* We iterate, popping off elements of S. A simple foreach won't do, |
| * since we mutate S as we go (even adding elements) */ |
| |
| unsigned arr_size = BITSET_WORDS(node_count); |
| |
| for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size); |
| (node_n_1 != 0); |
| node_n_1 = __bitset_ffs(no_incoming, arr_size)) { |
| |
| unsigned node_n = node_n_1 - 1; |
| |
| /* We've got a node n, pop it off */ |
| BITSET_CLEAR(no_incoming, node_n); |
| |
| /* Add it to the list */ |
| struct mali_job_descriptor_header *n = |
| DESCRIPTOR_FOR_NODE(node_n); |
| |
| mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n); |
| |
| if (tail) { |
| /* Link us to the last node */ |
| panfrost_set_job_next(tail, addr); |
| } else { |
                        /* We are the first node, so record the head of the
                         * chain */
| batch->first_job.cpu = (uint8_t *) n; |
| batch->first_job.gpu = addr; |
| } |
| |
| tail = n; |
| |
| /* Scan dependencies */ |
| for (unsigned node_m = 0; node_m < node_count; ++node_m) { |
| struct mali_job_descriptor_header *m = |
| DESCRIPTOR_FOR_NODE(node_m); |
| |
| /* Get the deps, accounting for removal */ |
| unsigned dep_1 = m->job_dependency_index_1; |
| unsigned dep_2 = m->job_dependency_index_2; |
| |
| if (BITSET_TEST(edge_removal_1, node_m)) |
| dep_1 = 0; |
| |
| if (BITSET_TEST(edge_removal_2, node_m)) |
| dep_2 = 0; |
| |
| /* Pretend to remove edges */ |
| if (dep_1 == node_n_1) { |
| BITSET_SET(edge_removal_1, node_m); |
| dep_1 = 0; |
| } else if (dep_2 == node_n_1) { |
| BITSET_SET(edge_removal_2, node_m); |
| dep_2 = 0; |
| } else { |
| /* This node has no relevant dependencies */ |
| continue; |
| } |
| |
                        /* Are there any edges left? If not, add node_m to S */
| bool has_edges = dep_1 || dep_2; |
| |
| if (!has_edges) |
| BITSET_SET(no_incoming, node_m); |
| } |
| } |
| |
| } |