panfrost: Avoid copying job descriptors around when we can

Job descriptors are written section by section and are never modified
after they have been emitted. Let's avoid copying things around by allocating
descriptors upfront and letting the scoreboard logic only write the
header section.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6797>
diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index 499b4a2..b15e6b8 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -1771,42 +1771,32 @@
 
 void
 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
-                                void *vertex_job,
-                                void *tiler_job)
+                                const struct panfrost_transfer *vertex_job,
+                                const struct panfrost_transfer *tiler_job)
 {
         struct panfrost_context *ctx = batch->ctx;
-        struct panfrost_device *device = pan_device(ctx->base.screen);
         bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
-        void *vp = vertex_job + MALI_JOB_HEADER_LENGTH;
-        size_t vp_size = MALI_COMPUTE_JOB_LENGTH -
-                         MALI_JOB_HEADER_LENGTH;
-        void *tp = tiler_job + MALI_JOB_HEADER_LENGTH;
-        bool is_bifrost = device->quirks & IS_BIFROST;
-        size_t tp_size = (is_bifrost ?
-                          MALI_BIFROST_TILER_JOB_LENGTH :
-                          MALI_MIDGARD_TILER_JOB_LENGTH) -
-                         MALI_JOB_HEADER_LENGTH;
 
         if (wallpapering) {
                 /* Inject in reverse order, with "predicted" job indices.
                  * THIS IS A HACK XXX */
-                panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
-                                 batch->scoreboard.job_index + 2, tp, tp_size, true);
-                panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
-                                 vp, vp_size, true);
+
+                panfrost_add_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
+                                 batch->scoreboard.job_index + 2, tiler_job, true);
+                panfrost_add_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
+                                 vertex_job, true);
                 return;
         }
 
         /* If rasterizer discard is enable, only submit the vertex */
 
-        unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
-                                           vp, vp_size, false);
+        unsigned vertex = panfrost_add_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
+                                           vertex_job, false);
 
         if (ctx->rasterizer->base.rasterizer_discard)
                 return;
 
-        panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
-                         false);
+        panfrost_add_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tiler_job, false);
 }
 
 /* TODO: stop hardcoding this */
diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.h b/src/gallium/drivers/panfrost/pan_cmdstream.h
index ea4729b..20abf61 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.h
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.h
@@ -84,8 +84,8 @@
 
 void
 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
-                                void *vertex_job,
-                                void *tiler_job);
+                                const struct panfrost_transfer *vertex_job,
+                                const struct panfrost_transfer *tiler_job);
 
 mali_ptr
 panfrost_emit_sample_locations(struct panfrost_batch *batch);
diff --git a/src/gallium/drivers/panfrost/pan_compute.c b/src/gallium/drivers/panfrost/pan_compute.c
index 43907b3..7498699 100644
--- a/src/gallium/drivers/panfrost/pan_compute.c
+++ b/src/gallium/drivers/panfrost/pan_compute.c
@@ -104,7 +104,10 @@
         ctx->compute_grid = info;
 
         /* TODO: Stub */
-        struct mali_compute_job_packed job = { 0 };
+        struct panfrost_transfer t =
+                panfrost_pool_alloc_aligned(&batch->pool,
+                                            MALI_COMPUTE_JOB_LENGTH,
+                                            64);
 
         /* We implement OpenCL inputs as uniforms (or a UBO -- same thing), so
          * reuse the graphics path for this by lowering to Gallium */
@@ -122,7 +125,7 @@
         /* Invoke according to the grid info */
 
         void *invocation =
-                pan_section_ptr(&job, COMPUTE_JOB, INVOCATION);
+                pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION);
         panfrost_pack_work_groups_compute(invocation,
                                           info->grid[0], info->grid[1],
                                           info->grid[2],
@@ -130,14 +133,14 @@
                                           info->block[2],
                                           false);
 
-        pan_section_pack(&job, COMPUTE_JOB, PARAMETERS, cfg) {
+        pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                 cfg.job_task_split =
                         util_logbase2_ceil(info->block[0] + 1) +
                         util_logbase2_ceil(info->block[1] + 1) +
                         util_logbase2_ceil(info->block[2] + 1);
         }
 
-        pan_section_pack(&job, COMPUTE_JOB, DRAW, cfg) {
+        pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
                 cfg.unknown_1 = (dev->quirks & IS_BIFROST) ? 0x2 : 0x6;
                 cfg.state = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_COMPUTE);
                 cfg.shared = panfrost_emit_shared_memory(batch, info);
@@ -149,12 +152,8 @@
                                 PIPE_SHADER_COMPUTE);
         }
 
-        panfrost_new_job(&batch->pool, &batch->scoreboard,
-                         MALI_JOB_TYPE_COMPUTE, true, 0,
-                         ((void *)&job) + MALI_JOB_HEADER_LENGTH,
-                         MALI_COMPUTE_JOB_LENGTH -
-                         MALI_JOB_HEADER_LENGTH,
-                         false);
+        /* Not a wallpaper injection: keep inject=false like the call this replaces */
+        panfrost_add_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_COMPUTE, true, 0, &t, false);
         panfrost_flush_all_batches(ctx, 0);
 }
 
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index bb79806..2e01e6b 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -454,13 +454,19 @@
         ctx->instance_count = info->instance_count;
         ctx->active_prim = info->mode;
 
-        /* bifrost tiler is bigger than midgard's one, so let's use it as a
-         * generic container for both.
-         */
-        struct mali_bifrost_tiler_job_packed tiler = {};
-        struct mali_compute_job_packed vertex = {};
-        unsigned vertex_count = ctx->vertex_count;
         bool is_bifrost = device->quirks & IS_BIFROST;
+        struct panfrost_transfer tiler =
+                panfrost_pool_alloc_aligned(&batch->pool,
+                                            is_bifrost ?
+                                            MALI_BIFROST_TILER_JOB_LENGTH :
+                                            MALI_MIDGARD_TILER_JOB_LENGTH,
+                                            64);
+        struct panfrost_transfer vertex =
+                panfrost_pool_alloc_aligned(&batch->pool,
+                                            MALI_COMPUTE_JOB_LENGTH,
+                                            64);
+
+        unsigned vertex_count = ctx->vertex_count;
 
         mali_ptr shared_mem = is_bifrost ?
                 panfrost_vt_emit_shared_memory(batch) :
@@ -506,9 +512,9 @@
 
         /* Fire off the draw itself */
         panfrost_draw_emit_vertex(batch, info, &invocation, shared_mem,
-                                  vs_vary, varyings, &vertex);
+                                  vs_vary, varyings, vertex.cpu);
         panfrost_draw_emit_tiler(batch, info, &invocation, shared_mem, indices,
-                                 fs_vary, varyings, pos, psiz, &tiler);
+                                 fs_vary, varyings, pos, psiz, tiler.cpu);
         panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
 
         /* Adjust the batch stack size based on the new shader stack sizes. */
diff --git a/src/panfrost/lib/pan_blit.c b/src/panfrost/lib/pan_blit.c
index dd4934d..ff1da6f 100644
--- a/src/panfrost/lib/pan_blit.c
+++ b/src/panfrost/lib/pan_blit.c
@@ -340,9 +340,10 @@
                 }
         }
 
-        struct mali_midgard_tiler_job_packed payload = {};
+        struct panfrost_transfer t =
+                panfrost_pool_alloc_aligned(pool, MALI_MIDGARD_TILER_JOB_LENGTH, 64);
 
-        pan_section_pack(&payload, MIDGARD_TILER_JOB, DRAW, cfg) {
+        pan_section_pack(t.cpu, MIDGARD_TILER_JOB, DRAW, cfg) {
                 cfg.unknown_1 = 0x7;
                 cfg.position = coordinates;
                 cfg.textures = panfrost_pool_upload(pool, &texture_t.gpu, sizeof(texture_t.gpu));
@@ -354,16 +355,14 @@
                 cfg.shared = fbd;
         }
 
-        pan_section_pack(&payload, MIDGARD_TILER_JOB, PRIMITIVE, cfg) {
+        pan_section_pack(t.cpu, MIDGARD_TILER_JOB, PRIMITIVE, cfg) {
                 cfg.draw_mode = MALI_DRAW_MODE_TRIANGLES;
                 cfg.index_count = vertex_count;
                 cfg.unknown_3 = 6;
         }
 
-        panfrost_pack_work_groups_compute(pan_section_ptr(&payload, MIDGARD_TILER_JOB, INVOCATION),
+        panfrost_pack_work_groups_compute(pan_section_ptr(t.cpu, MIDGARD_TILER_JOB, INVOCATION),
                                           1, vertex_count, 1, 1, 1, 1, true);
 
-        panfrost_new_job(pool, scoreboard, MALI_JOB_TYPE_TILER, false, 0,
-                         pan_section_ptr(&payload, MIDGARD_TILER_JOB, INVOCATION),
-                         MALI_MIDGARD_TILER_JOB_LENGTH - MALI_JOB_HEADER_LENGTH, true);
+        panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_TILER, false, 0, &t, true);
 }
diff --git a/src/panfrost/lib/pan_scoreboard.c b/src/panfrost/lib/pan_scoreboard.c
index 22f3eaa..b0e2cb3 100644
--- a/src/panfrost/lib/pan_scoreboard.c
+++ b/src/panfrost/lib/pan_scoreboard.c
@@ -106,13 +106,13 @@
  * not wallpapering and set this, dragons will eat you. */
 
 unsigned
-panfrost_new_job(
+panfrost_add_job(
                 struct pan_pool *pool,
                 struct pan_scoreboard *scoreboard,
                 enum mali_job_type type,
                 bool barrier,
                 unsigned local_dep,
-                void *payload, size_t payload_size,
+                const struct panfrost_transfer *job,
                 bool inject)
 {
         unsigned global_dep = 0;
@@ -133,24 +133,19 @@
         /* Assign the index */
         unsigned index = ++scoreboard->job_index;
 
-        struct panfrost_transfer transfer =
-                panfrost_pool_alloc_aligned(pool, MALI_JOB_HEADER_LENGTH + payload_size, 64);
-
-        pan_pack(transfer.cpu, JOB_HEADER, job) {
-                job.type = type;
-                job.barrier = barrier;
-                job.index = index;
-                job.dependency_1 = local_dep;
-                job.dependency_2 = global_dep;
+        pan_pack(job->cpu, JOB_HEADER, header) {
+                header.type = type;
+                header.barrier = barrier;
+                header.index = index;
+                header.dependency_1 = local_dep;
+                header.dependency_2 = global_dep;
 
                 if (inject)
-                        job.next = scoreboard->first_job;
+                        header.next = scoreboard->first_job;
         }
 
-        memcpy(transfer.cpu + MALI_JOB_HEADER_LENGTH, payload, payload_size);
-
         if (inject) {
-                scoreboard->first_job = transfer.gpu;
+                scoreboard->first_job = job->gpu;
                 return index;
         }
 
@@ -164,13 +159,13 @@
                  * TODO: Find a way to defer last job header emission until we
                  * have a new job to queue or the batch is ready for execution.
                  */
-                scoreboard->prev_job->opaque[6] = transfer.gpu;
-                scoreboard->prev_job->opaque[7] = transfer.gpu >> 32;
+                scoreboard->prev_job->opaque[6] = job->gpu;
+                scoreboard->prev_job->opaque[7] = job->gpu >> 32;
 	} else {
-                scoreboard->first_job = transfer.gpu;
+                scoreboard->first_job = job->gpu;
         }
 
-        scoreboard->prev_job = (struct mali_job_header_packed *)transfer.cpu;
+        scoreboard->prev_job = (struct mali_job_header_packed *)job->cpu;
         return index;
 }
 
diff --git a/src/panfrost/lib/pan_scoreboard.h b/src/panfrost/lib/pan_scoreboard.h
index 53c65d4..17fbc32 100644
--- a/src/panfrost/lib/pan_scoreboard.h
+++ b/src/panfrost/lib/pan_scoreboard.h
@@ -49,13 +49,13 @@
 };
 
 unsigned
-panfrost_new_job(
+panfrost_add_job(
                 struct pan_pool *pool,
                 struct pan_scoreboard *scoreboard,
                 enum mali_job_type type,
                 bool barrier,
                 unsigned local_dep,
-                void *payload, size_t payload_size,
+                const struct panfrost_transfer *job,
                 bool inject);
 
 void panfrost_scoreboard_initialize_tiler(