pan/bi: Eliminate writemasks in the IR

Since the hardware doesn't support them, they're a burden to deal with,
so let's ensure we never get to a place where we would need to at all.

Disables COMBINE lowering for now.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4766>
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 3fc819c..6e148b4 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -114,8 +114,7 @@
  * the end of a clause. Implies ADD */
 #define BI_SCHED_HI_LATENCY (1 << 7)
 
-/* Intrinsic is vectorized and should read 4 components in the first source
- * regardless of writemask */
+/* Intrinsic is vectorized and acts with `vector_channels` components */
 #define BI_VECTOR (1 << 8)
 
 /* Use a data register for src0/dest respectively, bypassing the usual
@@ -229,6 +228,10 @@
         unsigned dest;
         unsigned src[BIR_SRC_COUNT];
 
+        /* 32-bit word offset for destination, added to the register number in
+         * RA when lowering combines */
+        unsigned dest_offset;
+
         /* If one of the sources has BIR_INDEX_CONSTANT */
         union {
                 uint64_t u64;
@@ -246,14 +249,6 @@
         /* Round mode (requires BI_ROUNDMODE) */
         enum bifrost_roundmode roundmode;
 
-        /* Writemask (bit for each affected byte). This is quite restricted --
-         * ALU ops can only write to a single channel (exception: <32 in which
-         * you can write to 32/N contiguous aligned channels). Load/store can
-         * only write to all channels at once, in a sense. But it's still
-         * better to use this generic form than have synthetic ops flying
-         * about, since we're not essentially vector for RA purposes. */
-        uint16_t writemask;
-
         /* Destination type. Usually the type of the instruction
          * itself, but if sources and destination have different
          * types, the type of the destination wins (so f2i would be
@@ -269,6 +264,9 @@
          * selection, so we don't have to special case extraction. */
         uint8_t swizzle[BIR_SRC_COUNT][NIR_MAX_VEC_COMPONENTS];
 
+        /* For VECTOR ops, how many channels are written? */
+        unsigned vector_channels;
+
         /* A class-specific op from which the actual opcode can be derived
          * (along with the above information) */
 
@@ -299,9 +297,6 @@
 
                 /* For BLEND -- the location 0-7 */
                 unsigned blend_location;
-
-                /* For STORE, STORE_VAR -- channel count */
-                unsigned store_channels;
         };
 } bi_instruction;
 
@@ -578,11 +573,11 @@
 bool bi_is_src_swizzled(bi_instruction *ins, unsigned s);
 bool bi_has_arg(bi_instruction *ins, unsigned arg);
 uint16_t bi_from_bytemask(uint16_t bytemask, unsigned bytes);
-unsigned bi_get_component_count(bi_instruction *ins, unsigned s);
-unsigned bi_load32_components(bi_instruction *ins);
+unsigned bi_get_component_count(bi_instruction *ins, signed s);
 uint16_t bi_bytemask_of_read_components(bi_instruction *ins, unsigned node);
 uint64_t bi_get_immediate(bi_instruction *ins, unsigned index);
 bool bi_writes_component(bi_instruction *ins, unsigned comp);
+unsigned bi_writemask(bi_instruction *ins);
 
 /* BIR passes */