/*
* Copyright (C) 2019 Collabora, Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include "pan_context.h"
#include "pan_job.h"
#include "pan_allocate.h"
#include "util/bitset.h"
/*
* Within a batch (panfrost_job), there are various types of Mali jobs:
*
* - SET_VALUE: initializes tiler
* - VERTEX: runs a vertex shader
* - TILER: runs tiling and sets up a fragment shader
* - FRAGMENT: runs fragment shaders and writes out
* - COMPUTE: runs a compute shader
* - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost)
* - GEOMETRY: runs a geometry shader (unimplemented)
* - CACHE_FLUSH: unseen in the wild, theoretically cache flush
*
* In between a full batch and a single Mali job is the "job chain", a series
* of Mali jobs together forming a linked list. Within the job chain, each Mali
* job can set (up to) two dependencies on other earlier jobs in the chain.
* This dependency graph forms a scoreboard. The general idea of a scoreboard
* applies: when there is a data dependency of job B on job A, job B sets one
* of its dependency indices to job A, ensuring that job B won't start until
* job A finishes.
*
* More specifically, here are a set of rules:
*
* - A set value job must appear if and only if there is at least one tiler job.
*
* - Vertex jobs and tiler jobs are independent.
*
* - A tiler job must have a dependency on its data source. If it's getting
* data from a vertex job, it depends on the vertex job. If it's getting data
* from software, this is null.
*
* - The first vertex job used as the input to tiling must depend on the set
* value job, if it is present.
*
* - Tiler jobs must be strictly ordered. So each tiler job must depend on the
* previous tiler job in the chain.
*
* - Linking jobs via next_job has no bearing on the order of execution; it
* just establishes the linked list of jobs, EXCEPT:
*
* - A job's dependencies must appear earlier in the linked list (job chain).
*
* Justification for each rule:
*
* - Set value jobs set up tiling, essentially. If tiling occurs, they are
* needed; if it does not, we must not emit them, since tiling would then
* partially occur, which is incorrect.
*
* - The hardware has no notion of a combined "vertex/tiler job", at least not
* on the hardware we target (other revisions have fused jobs, which
* complicate matters). Vertex and tiler jobs are independent units that take
* in data, process it, and spit out data.
*
* - Any job must depend on its data source, or else it risks a
* read-before-write hazard. Tiler jobs get their data from vertex jobs, ergo
* tiler jobs depend on the corresponding vertex job (if there is one).
*
* - In fact, all of tiling depends on the set value job, but tiler jobs
* depend on the corresponding vertex jobs and on each other, so this rule
* ensures each tiler job transitively depends on the set value job.
*
* - The tiler is not thread-safe; this dependency prevents race conditions
* between two different jobs trying to write to the tiler outputs at the
* same time.
*
* - Internally, jobs are scoreboarded; the next_job fields just form a linked
* list to allow the jobs to be read in; the execution order comes from
* resolving the dependency fields instead.
*
* - The hardware cannot set a dependency on a job it doesn't know about yet,
* and dependencies are processed in the order given by the next_job fields.
*
*/
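
/* As a concrete illustration (hypothetical indices, but following the helpers
 * in this file): a batch with two draws, each queued as a fused vertex/tiler
 * pair, plus the set value job queued at link time, is scoreboarded as:
 *
 *   index 1: VERTEX (draw 0)  no dependencies
 *   index 2: TILER  (draw 0)  dep_1 = 1 (its vertex), dep_2 = 5 (set value)
 *   index 3: VERTEX (draw 1)  no dependencies
 *   index 4: TILER  (draw 1)  dep_1 = 2 (previous tiler), dep_2 = 3 (its vertex)
 *   index 5: SET_VALUE        no dependencies
 *
 * Note that job 2 depends on job 5, which is queued later; the topological
 * sort at the bottom of this file orders the next_job chain so the set value
 * job still appears before the first tiler job. */
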
/* Accessor to set the next job field */
static void
panfrost_set_job_next(struct mali_job_descriptor_header *first, mali_ptr next)
{
        if (first->job_descriptor_size)
                first->next_job_64 = (u64) (uintptr_t) next;
        else
                first->next_job_32 = (u32) (uintptr_t) next;
}

/* Coerce a panfrost_transfer to a header */
static inline struct mali_job_descriptor_header *
job_descriptor_header(struct panfrost_transfer t)
{
        return (struct mali_job_descriptor_header *) t.cpu;
}

static void
panfrost_assign_index(
        struct panfrost_job *job,
        struct panfrost_transfer transfer)
{
        /* Assign the index */
        unsigned index = ++job->job_index;
        job_descriptor_header(transfer)->job_index = index;
}

/* Helper to add a dependency to a job: makes `depender` wait on `dependent`
 * by writing the latter's job index into one of the former's two dependency
 * slots */
static void
panfrost_add_dependency(
        struct panfrost_transfer depender,
        struct panfrost_transfer dependent)
{
        struct mali_job_descriptor_header *first =
                job_descriptor_header(dependent);
        struct mali_job_descriptor_header *second =
                job_descriptor_header(depender);

        /* Ensure we're ready for dependencies */
        assert(second->job_index);
        assert(first->job_index);

        /* Look for an open slot */
        if (!second->job_dependency_index_1)
                second->job_dependency_index_1 = first->job_index;
        else if (!second->job_dependency_index_2)
                second->job_dependency_index_2 = first->job_index;
        else
                unreachable("No available slot for new dependency");
}

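/* For example, the call panfrost_add_dependency(tiler, vertex) made when
 * queueing a fused pair below writes the vertex job's index into one of the
 * tiler job's dependency slots, so the tiler cannot start until the vertex
 * job has finished. */
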
/* Queues a job WITHOUT updating pointers. Be careful. */
static void
panfrost_scoreboard_queue_job_internal(
        struct panfrost_job *batch,
        struct panfrost_transfer job)
{
        panfrost_assign_index(batch, job);

        /* Queue a pointer to the job */
        util_dynarray_append(&batch->headers, void*, job.cpu);
        util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu);
}

/* Queues a compute job, with no special dependencies. This is a bit of a
* misnomer -- internally, all job types are queued with this function, but
* outside of this file, it's for pure compute jobs */
void
panfrost_scoreboard_queue_compute_job(
        struct panfrost_job *batch,
        struct panfrost_transfer job)
{
        panfrost_scoreboard_queue_job_internal(batch, job);

        /* Update the linked list metadata as appropriate */
        batch->last_job = job;

        if (!batch->first_job.gpu)
                batch->first_job = job;
}

/* Queues a vertex job. There are no special dependencies yet, but if
* tiling is required (any time 'rasterizer discard' is disabled), we have
* some extra bookkeeping for later */
void
panfrost_scoreboard_queue_vertex_job(
        struct panfrost_job *batch,
        struct panfrost_transfer vertex,
        bool requires_tiling)
{
        panfrost_scoreboard_queue_compute_job(batch, vertex);

        if (requires_tiling && !batch->first_vertex_for_tiler.gpu)
                batch->first_vertex_for_tiler = vertex;
}

/* Queues a tiler job, respecting the dependency of each tiler job on the
* previous */
void
panfrost_scoreboard_queue_tiler_job(
        struct panfrost_job *batch,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_compute_job(batch, tiler);

        if (!batch->first_tiler.gpu)
                batch->first_tiler = tiler;

        if (batch->last_tiler.gpu)
                panfrost_add_dependency(tiler, batch->last_tiler);

        batch->last_tiler = tiler;
}

/* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if
* fused jobs are not supported (the default until Bifrost rolls out) */
void
panfrost_scoreboard_queue_fused_job(
        struct panfrost_job *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_vertex_job(batch, vertex, true);
        panfrost_scoreboard_queue_tiler_job(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
}

/* Queues a fused (vertex/tiler) job prepended *before* the usual set, used for
* wallpaper blits */
void
panfrost_scoreboard_queue_fused_job_prepend(
        struct panfrost_job *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        /* Sanity check */
        assert(batch->last_tiler.gpu);
        assert(batch->first_tiler.gpu);

        /* First, we add the vertex job directly to the queue, forcing it to
         * the front */
        panfrost_scoreboard_queue_job_internal(batch, vertex);
        batch->first_job = vertex;
        batch->first_vertex_for_tiler = vertex;

        /* Similarly, we add the tiler job directly to the queue, forcing it to
         * the front (second place), manually adding the tiler's dependency on
         * the vertex job (since this is pseudofused) and forcing a dependency
         * of the now-second tiler on us (since all tiler jobs are linked in
         * order and we're injecting ourselves at the front) */
        panfrost_scoreboard_queue_job_internal(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
        panfrost_add_dependency(batch->first_tiler, tiler);
        batch->first_tiler = tiler;
}

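/* To illustrate with hypothetical names: if the batch already contained tiler
 * jobs T1 and T2 (T2 depending on T1), prepending a wallpaper pair V0/T0
 * leaves T0 depending on V0 and T1 depending on T0, so the wallpaper draw is
 * processed by the tiler before every pre-existing tiler job. */
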
/* Generates a set value job, used below as part of TILER job scheduling. */
static struct panfrost_transfer
panfrost_set_value_job(struct panfrost_context *ctx, mali_ptr polygon_list)
{
        struct mali_job_descriptor_header job = {
                .job_type = JOB_TYPE_SET_VALUE,
                .job_descriptor_size = 1,
        };

        struct mali_payload_set_value payload = {
                .out = polygon_list,
                .unknown = 0x3,
        };

        struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(payload));
        memcpy(transfer.cpu, &job, sizeof(job));
        memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));

        return transfer;
}

/* If there are any tiler jobs, there needs to be a corresponding set value job
* linked to the first vertex job feeding into tiling. */
static void
panfrost_scoreboard_set_value(struct panfrost_job *batch)
{
        /* Check if we even need tiling */
        if (!batch->last_tiler.gpu)
                return;

        /* Okay, we do. Let's generate it */
        struct panfrost_context *ctx = batch->ctx;
        mali_ptr polygon_list = ctx->tiler_polygon_list.bo->gpu;

        struct panfrost_transfer job =
                panfrost_set_value_job(ctx, polygon_list);

        /* Queue it */
        panfrost_scoreboard_queue_compute_job(batch, job);

        /* Tiler jobs need us */
        panfrost_add_dependency(batch->first_tiler, job);
}

/* Once all jobs have been added to a batch and we're ready to submit, we need
* to order them to set each of the next_job fields, obeying the golden rule:
* "A job's dependencies must appear earlier in the job chain than itself".
* Fortunately, computing this job chain is a well-studied graph theory problem
* known as "topological sorting", which has linear time algorithms. We let
* each job represent a node, each dependency a directed edge, and the entire
* set of jobs form a dependency graph. This graph is inherently acyclic; a
* cycle would mean unresolvable dependencies.
*
* We implement Kahn's algorithm here to compute the next_job chain:
* https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
*
* A few implementation notes: we represent S explicitly with a bitset, L
* implicitly in the next_job fields. The indices of the bitset are off-by-one:
* nodes are numbered [0, node_count - 1], whereas in reality job_index in the
* hardware and dependencies are [1, node_count].
*
* We represent edge removal implicitly with another pair of bitsets, rather
* than explicitly removing the edges, since we need to keep the dependencies
* there for the hardware.
*/
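
/* As a small worked example of the algorithm (hypothetical indices): suppose
 * three jobs were queued in the order 1, 2, 3, with job 1 depending on job 3
 * and job 2 depending on job 1. Initially S = {3}, the only node without
 * dependencies. Popping 3 clears job 1's only dependency, so S = {1};
 * popping 1 clears job 2's dependency, so S = {2}; popping 2 finishes. The
 * resulting next_job chain is 3, 1, 2, in which every job appears after the
 * jobs it depends on. */

/* Helpers mapping a 0-based node number to the CPU-side descriptor and GPU
 * address of the corresponding queued job */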
#define DESCRIPTOR_FOR_NODE(count) \
        *(util_dynarray_element(&batch->headers, \
                struct mali_job_descriptor_header*, count))

#define GPU_ADDRESS_FOR_NODE(count) \
        *(util_dynarray_element(&batch->gpu_headers, \
                mali_ptr, count))

void
panfrost_scoreboard_link_batch(struct panfrost_job *batch)
{
        /* Finalize the batch */
        panfrost_scoreboard_set_value(batch);

        /* Let no_incoming represent the set S described above */
        unsigned node_count = batch->job_index;

        size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD);
        BITSET_WORD *no_incoming = calloc(sz, 1);

        /* Sets for edges being removed in dep 1 or 2 respectively */
        BITSET_WORD *edge_removal_1 = calloc(sz, 1);
        BITSET_WORD *edge_removal_2 = calloc(sz, 1);

        /* We compute no_incoming by traversing the batch; a node has no
         * incoming edges if it has no dependencies */
        for (unsigned i = 0; i < node_count; ++i) {
                struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i);

                unsigned dep_1 = node->job_dependency_index_1;
                unsigned dep_2 = node->job_dependency_index_2;

                if (!(dep_1 || dep_2))
                        BITSET_SET(no_incoming, i);
        }
        /* No next_job fields are set at the beginning, so L is implicitly the
         * empty set; as next_job fields are filled in, L implicitly grows.
         * tail explicitly tracks the tail of L */
        struct mali_job_descriptor_header *tail = NULL;

        /* We iterate, popping off elements of S. A simple foreach won't do,
         * since we mutate S as we go (even adding elements) */

        unsigned arr_size = BITSET_WORDS(node_count);

        for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size);
             (node_n_1 != 0);
             node_n_1 = __bitset_ffs(no_incoming, arr_size)) {

                unsigned node_n = node_n_1 - 1;

                /* We've got a node n, pop it off */
                BITSET_CLEAR(no_incoming, node_n);

                /* Add it to the list */
                struct mali_job_descriptor_header *n =
                        DESCRIPTOR_FOR_NODE(node_n);
                mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n);

                if (tail) {
                        /* Link us to the last node */
                        panfrost_set_job_next(tail, addr);
                } else {
                        /* We are the first/last node */
                        batch->first_job.cpu = (uint8_t *) n;
                        batch->first_job.gpu = addr;
                }

                tail = n;

                /* Scan dependencies */
                for (unsigned node_m = 0; node_m < node_count; ++node_m) {
                        struct mali_job_descriptor_header *m =
                                DESCRIPTOR_FOR_NODE(node_m);

                        /* Get the deps, accounting for removal */
                        unsigned dep_1 = m->job_dependency_index_1;
                        unsigned dep_2 = m->job_dependency_index_2;

                        if (BITSET_TEST(edge_removal_1, node_m))
                                dep_1 = 0;

                        if (BITSET_TEST(edge_removal_2, node_m))
                                dep_2 = 0;

                        /* Pretend to remove edges */
                        if (dep_1 == node_n_1) {
                                BITSET_SET(edge_removal_1, node_m);
                                dep_1 = 0;
                        } else if (dep_2 == node_n_1) {
                                BITSET_SET(edge_removal_2, node_m);
                                dep_2 = 0;
                        } else {
                                /* This node has no relevant dependencies */
                                continue;
                        }

                        /* Are there any edges left on m? If not, add m to S */
                        bool has_edges = dep_1 || dep_2;

                        if (!has_edges)
                                BITSET_SET(no_incoming, node_m);
                }
        }

        /* The scratch bitsets are only needed for the sort itself */
        free(no_incoming);
        free(edge_removal_1);
        free(edge_removal_2);
}
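
/* A minimal sketch of the intended flow, assuming the batch and the job
 * transfers are built elsewhere (only the names defined in this file are
 * real):
 *
 *     panfrost_scoreboard_queue_fused_job(batch, vertex, tiler);
 *     ... more draws ...
 *     panfrost_scoreboard_link_batch(batch);
 *     // batch->first_job now heads a chain obeying the dependency rules
 */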