panfrost: XMLify invocations

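Pack and decode the invocation words through the XML-defined
MALI_INVOCATION descriptor instead of open-coding the invocation_count /
invocation_shifts bitfields. panfrost_pack_work_groups_compute() now fills
a struct mali_invocation_packed, and panfrost_pack_work_groups_fused() is
removed.

Not so bad :)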

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6476>
diff --git a/src/panfrost/bifrost/test/bi_submit.c b/src/panfrost/bifrost/test/bi_submit.c
index 8b46569..73c971d 100644
--- a/src/panfrost/bifrost/test/bi_submit.c
+++ b/src/panfrost/bifrost/test/bi_submit.c
@@ -199,11 +199,15 @@
                 },
         };
 
-        panfrost_pack_work_groups_compute(&payload.prefix,
+        struct mali_invocation_packed invocation;
+
+        panfrost_pack_work_groups_compute(&invocation,
                         1, 1, 1,
                         1, 1, 1,
                         true);
 
+        payload.prefix.invocation = invocation;
+
         struct panfrost_bo *bos[] = {
                 scratchpad, shmem, shader, shader_desc, ubo, var, attr
         };
diff --git a/src/panfrost/include/panfrost-job.h b/src/panfrost/include/panfrost-job.h
index 8cb30ed..879023e 100644
--- a/src/panfrost/include/panfrost-job.h
+++ b/src/panfrost/include/panfrost-job.h
@@ -421,42 +421,7 @@
  */
 
 struct mali_vertex_tiler_prefix {
-        /* This is a dynamic bitfield containing the following things in this order:
-         *
-         * - gl_WorkGroupSize.x
-         * - gl_WorkGroupSize.y
-         * - gl_WorkGroupSize.z
-         * - gl_NumWorkGroups.x
-         * - gl_NumWorkGroups.y
-         * - gl_NumWorkGroups.z
-         *
-         * The number of bits allocated for each number is based on the *_shift
-         * fields below. For example, workgroups_y_shift gives the bit that
-         * gl_NumWorkGroups.y starts at, and workgroups_z_shift gives the bit
-         * that gl_NumWorkGroups.z starts at (and therefore one after the bit
-         * that gl_NumWorkGroups.y ends at). The actual value for each gl_*
-         * value is one more than the stored value, since if any of the values
-         * are zero, then there would be no invocations (and hence no job). If
-         * there were 0 bits allocated to a given field, then it must be zero,
-         * and hence the real value is one.
-         *
-         * Vertex jobs reuse the same job dispatch mechanism as compute jobs,
-         * effectively doing glDispatchCompute(1, vertex_count, instance_count)
-         * where vertex count is the number of vertices.
-         */
-        u32 invocation_count;
-
-        /* Bitfield for shifts:
-         *
-         * size_y_shift : 5
-         * size_z_shift : 5
-         * workgroups_x_shift : 6
-         * workgroups_y_shift : 6
-         * workgroups_z_shift : 6
-         * workgroups_x_shift_2 : 4
-         */
-        u32 invocation_shifts;
-
+        struct mali_invocation_packed invocation;
         struct mali_primitive_packed primitive;
 } __attribute__((packed));
 
diff --git a/src/panfrost/lib/decode.c b/src/panfrost/lib/decode.c
index 78b0a54..c6e23b1 100644
--- a/src/panfrost/lib/decode.c
+++ b/src/panfrost/lib/decode.c
@@ -1248,21 +1248,17 @@
         /* Decode invocation_count. See the comment before the definition of
          * invocation_count for an explanation.
          */
+        struct MALI_INVOCATION invocation;
+        struct mali_invocation_packed invocation_packed = p->invocation;
+        MALI_INVOCATION_unpack((const uint8_t *) &invocation_packed, &invocation);
 
-        unsigned size_y_shift = bits(p->invocation_shifts, 0, 5);
-        unsigned size_z_shift = bits(p->invocation_shifts, 5, 10);
-        unsigned workgroups_x_shift = bits(p->invocation_shifts, 10, 16);
-        unsigned workgroups_y_shift = bits(p->invocation_shifts, 16, 22);
-        unsigned workgroups_z_shift = bits(p->invocation_shifts, 22, 28);
-        unsigned workgroups_x_shift_2 = bits(p->invocation_shifts, 28, 32);
+        unsigned size_x = bits(invocation.invocations, 0, invocation.size_y_shift) + 1;
+        unsigned size_y = bits(invocation.invocations, invocation.size_y_shift, invocation.size_z_shift) + 1;
+        unsigned size_z = bits(invocation.invocations, invocation.size_z_shift, invocation.workgroups_x_shift) + 1;
 
-        unsigned size_x = bits(p->invocation_count, 0, size_y_shift) + 1;
-        unsigned size_y = bits(p->invocation_count, size_y_shift, size_z_shift) + 1;
-        unsigned size_z = bits(p->invocation_count, size_z_shift, workgroups_x_shift) + 1;
-
-        unsigned groups_x = bits(p->invocation_count, workgroups_x_shift, workgroups_y_shift) + 1;
-        unsigned groups_y = bits(p->invocation_count, workgroups_y_shift, workgroups_z_shift) + 1;
-        unsigned groups_z = bits(p->invocation_count, workgroups_z_shift, 32) + 1;
+        unsigned groups_x = bits(invocation.invocations, invocation.workgroups_x_shift, invocation.workgroups_y_shift) + 1;
+        unsigned groups_y = bits(invocation.invocations, invocation.workgroups_y_shift, invocation.workgroups_z_shift) + 1;
+        unsigned groups_z = bits(invocation.invocations, invocation.workgroups_z_shift, 32) + 1;
 
         /* Even though we have this decoded, we want to ensure that the
          * representation is "unique" so we don't lose anything by printing only
@@ -1272,30 +1268,17 @@
          * decode and pack it ourselves! If it is bit exact with what we
          * decoded, we're good to go. */
 
-        struct mali_vertex_tiler_prefix ref;
+        struct mali_invocation_packed ref;
         panfrost_pack_work_groups_compute(&ref, groups_x, groups_y, groups_z, size_x, size_y, size_z, graphics);
 
-        bool canonical =
-                (p->invocation_count == ref.invocation_count) &&
-                (p->invocation_shifts == ref.invocation_shifts);
-
-        if (!canonical) {
+        if (memcmp(&ref, &invocation_packed, sizeof(ref))) {
                 pandecode_msg("XXX: non-canonical workgroups packing\n");
-                pandecode_msg("expected: %X, %X",
-                                ref.invocation_count,
-                                ref.invocation_shifts);
-
-                pandecode_prop("invocation_count = 0x%" PRIx32, p->invocation_count);
-                pandecode_prop("size_y_shift = %d", size_y_shift);
-                pandecode_prop("size_z_shift = %d", size_z_shift);
-                pandecode_prop("workgroups_x_shift = %d", workgroups_x_shift);
-                pandecode_prop("workgroups_y_shift = %d", workgroups_y_shift);
-                pandecode_prop("workgroups_z_shift = %d", workgroups_z_shift);
-                pandecode_prop("workgroups_x_shift_2 = %d", workgroups_x_shift_2);
+                MALI_INVOCATION_print(pandecode_dump_stream, &invocation, 1 * 2);
         }
 
         /* Regardless, print the decode */
-        pandecode_msg("size (%d, %d, %d), count (%d, %d, %d)\n",
+        fprintf(pandecode_dump_stream,
+                        "Invocation (%d, %d, %d) x (%d, %d, %d)\n",
                         size_x, size_y, size_z,
                         groups_x, groups_y, groups_z);
 
diff --git a/src/panfrost/lib/pan_blit.c b/src/panfrost/lib/pan_blit.c
index f9ca87e..79a9120 100644
--- a/src/panfrost/lib/pan_blit.c
+++ b/src/panfrost/lib/pan_blit.c
@@ -346,6 +346,7 @@
         struct midgard_payload_vertex_tiler payload = {};
         struct mali_primitive_packed primitive;
         struct mali_draw_packed draw;
+        struct mali_invocation_packed invocation;
 
         pan_pack(&draw, DRAW, cfg) {
                 cfg.unknown_1 = 0x7;
@@ -365,10 +366,11 @@
                 cfg.unknown_3 = 6;
         }
 
-        memcpy(&payload.prefix.primitive, &primitive, MALI_DRAW_LENGTH);
-        memcpy(&payload.postfix, &draw, MALI_DRAW_LENGTH);
+        panfrost_pack_work_groups_compute(&invocation, 1, vertex_count, 1, 1, 1, 1, true);
 
-        panfrost_pack_work_groups_compute(&payload.prefix, 1, vertex_count, 1, 1, 1, 1, true);
+        payload.prefix.primitive = primitive;
+        memcpy(&payload.postfix, &draw, MALI_DRAW_LENGTH);
+        payload.prefix.invocation = invocation;
 
         panfrost_new_job(pool, scoreboard, MALI_JOB_TYPE_TILER, false, 0, &payload, sizeof(payload), true);
 }
diff --git a/src/panfrost/lib/pan_encoder.h b/src/panfrost/lib/pan_encoder.h
index 0471701..9433f02 100644
--- a/src/panfrost/lib/pan_encoder.h
+++ b/src/panfrost/lib/pan_encoder.h
@@ -34,7 +34,7 @@
 
 void
 panfrost_pack_work_groups_compute(
-        struct mali_vertex_tiler_prefix *out,
+        struct mali_invocation_packed *out,
         unsigned num_x,
         unsigned num_y,
         unsigned num_z,
@@ -43,17 +43,6 @@
         unsigned size_z,
         bool quirk_graphics);
 
-void
-panfrost_pack_work_groups_fused(
-        struct mali_vertex_tiler_prefix *vertex,
-        struct mali_vertex_tiler_prefix *tiler,
-        unsigned num_x,
-        unsigned num_y,
-        unsigned num_z,
-        unsigned size_x,
-        unsigned size_y,
-        unsigned size_z);
-
 /* Tiler structure size computation */
 
 unsigned
diff --git a/src/panfrost/lib/pan_invocation.c b/src/panfrost/lib/pan_invocation.c
index cfb5bec..4c0f645 100644
--- a/src/panfrost/lib/pan_invocation.c
+++ b/src/panfrost/lib/pan_invocation.c
@@ -41,7 +41,7 @@
 
 void
 panfrost_pack_work_groups_compute(
-        struct mali_vertex_tiler_prefix *out,
+        struct mali_invocation_packed *out,
         unsigned num_x,
         unsigned num_y,
         unsigned num_z,
@@ -77,53 +77,24 @@
                 shifts[i + 1] = shifts[i] + bit_count;
         }
 
-        /* Quirk: for non-instanced graphics, the blob sets workgroups_z_shift
-         * = 32. This doesn't appear to matter to the hardware, but it's good
-         * to be bit-identical. */
+        pan_pack(out, INVOCATION, cfg) {
+                cfg.invocations = packed;
+                cfg.size_y_shift = shifts[1];
+                cfg.size_z_shift = shifts[2];
+                cfg.workgroups_x_shift = shifts[3];
+                cfg.workgroups_y_shift = shifts[4];
+                cfg.workgroups_z_shift = shifts[5];
 
-        if (quirk_graphics && (num_z <= 1))
-                shifts[5] = 32;
+                /* Quirk: for non-instanced graphics, the blob sets
+                 * workgroups_z_shift = 32. This doesn't appear to matter to
+                 * the hardware, but it's good to be bit-identical. */
 
-        /* Quirk: for graphics, workgroups_x_shift_2 must be at least 2,
-         * whereas for OpenCL it is simply equal to workgroups_x_shift. For GL
-         * compute, it is always 2 if no barriers are in use, but is equal to
-         * workgroups_x_shift is barriers are in use. */
+                if (quirk_graphics && (num_z <= 1))
+                        cfg.workgroups_z_shift = 32;
 
-        unsigned shift_2 = shifts[3];
+                /* Quirk: >= 2 for graphics (we emit exactly 2). For compute, 2
+                 * without barriers, else workgroups_x_shift (which we emit) */
 
-        if (quirk_graphics)
-                shift_2 = MAX2(shift_2, 2);
-
-        /* Pack them in */
-        uint32_t packed_shifts =
-                (shifts[1] << 0) |
-                (shifts[2] << 5) |
-                (shifts[3] << 10) |
-                (shifts[4] << 16) |
-                (shifts[5] << 22) |
-                (shift_2 << 28);
-
-        /* Upload the packed bitfields */
-        out->invocation_count = packed;
-        out->invocation_shifts = packed_shifts;
+                cfg.unknown_shift = quirk_graphics ? 2 : cfg.workgroups_x_shift;
+        }
 }
-
-/* Packs vertex/tiler descriptors simultaneously */
-void
-panfrost_pack_work_groups_fused(
-        struct mali_vertex_tiler_prefix *vertex,
-        struct mali_vertex_tiler_prefix *tiler,
-        unsigned num_x,
-        unsigned num_y,
-        unsigned num_z,
-        unsigned size_x,
-        unsigned size_y,
-        unsigned size_z)
-{
-        panfrost_pack_work_groups_compute(vertex, num_x, num_y, num_z, size_x, size_y, size_z, true);
-
-        /* Copy results over */
-        tiler->invocation_count = vertex->invocation_count;
-        tiler->invocation_shifts = vertex->invocation_shifts;
-}
-