src/intel/compiler/brw_fs_scoreboard.cpp - platform/external/mesa3d - Gitiles

 /*
  * Copyright © 2019 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  */

 /** @file brw_fs_scoreboard.cpp
  *
  * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
  * data coherency between register reads and writes in previous generations.
  * This lowering pass runs after register allocation in order to make up for
  * it.
  *
  * It works by performing global dataflow analysis in order to determine the
  * set of potential dependencies of every instruction in the shader, and then
  * inserts any required SWSB annotations and additional SYNC instructions in
  * order to guarantee data coherency.
  *
  * WARNING - Access of the following (rarely used) ARF registers is not
  *           tracked here, and require the RegDist SWSB annotation to be set
  *           to 1 by the generator in order to avoid data races:
  *
  *  - sp stack pointer
  *  - sr0 state register
  *  - cr0 control register
  *  - ip instruction pointer
  *  - tm0 timestamp register
  *  - dbg0 debug register
  *
  * The following ARF registers don't need to be tracked here because data
  * coherency is still provided transparently by the hardware:
  *
  *  - f0-1 flag registers
  *  - n0 notification register
  *  - tdr0 thread dependency register
  */

 #include "brw_fs.h"
 #include "brw_cfg.h"

 using namespace brw;

 namespace {
    /**
     * In-order instruction accounting.
     * @{
     */

    /**
     * Number of in-order hardware instructions contained in this IR
     * instruction.  This determines the increment applied to the RegDist
     * counter calculated for any ordered dependency that crosses this
     * instruction.
     */
    unsigned
    ordered_unit(const fs_inst *inst)
    {
       switch (inst->opcode) {
       case BRW_OPCODE_SYNC:
       case BRW_OPCODE_DO:
       case SHADER_OPCODE_UNDEF:
       case FS_OPCODE_PLACEHOLDER_HALT:
       case FS_OPCODE_SCHEDULING_FENCE:
          return 0;
       default:
          /* Note that the following is inaccurate for virtual instructions
           * that expand to more in-order instructions than assumed here, but
           * that can only lead to suboptimal execution ordering, data
           * coherency won't be impacted.  Providing exact RegDist counts for
           * each virtual instruction would allow better ALU performance, but
           * it would require keeping this switch statement in perfect sync
           * with the generator in order to avoid data corruption.  Lesson is
           * (again) don't use virtual instructions if you want optimal
           * scheduling.
           */
          return is_unordered(inst) ? 0 : 1;
       }
    }

    /**
     * Type for an instruction counter that increments for in-order
     * instructions only, arbitrarily denoted 'jp' throughout this lowering
     * pass in order to distinguish it from the regular instruction counter.
     */
    typedef int ordered_address;

    /**
     * Return the number of instructions in the program.
     */
    unsigned
    num_instructions(const backend_shader *shader)
    {
       return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
    }

    /**
     * Calculate the local ordered_address instruction counter at every
     * instruction of the shader for subsequent constant-time look-up.
     */
    ordered_address *
    ordered_inst_addresses(const fs_visitor *shader)
    {
       ordered_address *jps = new ordered_address[num_instructions(shader)];
       ordered_address jp = 0;
       unsigned ip = 0;

       foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
          jps[ip] = jp;
          jp += ordered_unit(inst);
          ip++;
       }

       return jps;
    }

    /**
     * Synchronization mode required for data manipulated by in-order
     * instructions.
     *
     * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
     * enum for additional type safety.  The hardware doesn't provide control
     * over the synchronization mode for RegDist annotations, this is only used
     * internally in this pass in order to optimize out redundant read
     * dependencies where possible.
     */
    enum tgl_regdist_mode {
       TGL_REGDIST_NULL = 0,
       TGL_REGDIST_SRC = 1,
       TGL_REGDIST_DST = 2
    };

    /**
     * Allow bitwise arithmetic of tgl_regdist_mode enums.
     */
    tgl_regdist_mode
    operator|(tgl_regdist_mode x, tgl_regdist_mode y)
    {
       return tgl_regdist_mode(unsigned(x) | unsigned(y));
    }

    tgl_regdist_mode
    operator&(tgl_regdist_mode x, tgl_regdist_mode y)
    {
       return tgl_regdist_mode(unsigned(x) & unsigned(y));
    }

    tgl_regdist_mode &
    operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
    {
       return x = x | y;
    }

    tgl_regdist_mode &
    operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
    {
       return x = x & y;
    }

    /** @} */

    /**
     * Representation of an equivalence relation among the set of unsigned
     * integers.
     *
     * Its initial state is the identity relation '~' such that i ~ j if and
     * only if i == j for every pair of unsigned integers i and j.
     */
    struct equivalence_relation {
       equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
       {
          for (unsigned i = 0; i < n; i++)
             is[i] = i;
       }

       ~equivalence_relation()
       {
          delete[] is;
       }

       /**
        * Return equivalence class index of the specified element.  Effectively
        * this is the numeric value of an arbitrary representative from the
        * equivalence class.
        *
        * Allows the evaluation of the equivalence relation according to the
        * rule that i ~ j if and only if lookup(i) == lookup(j).
        */
       unsigned
       lookup(unsigned i) const
       {
          if (i < n && is[i] != i)
             return lookup(is[i]);
          else
             return i;
       }

       /**
        * Create an array with the results of the lookup() method for
        * constant-time evaluation.
        */
       unsigned *
       flatten() const
       {
          unsigned *ids = new unsigned[n];

          for (unsigned i = 0; i < n; i++)
             ids[i] = lookup(i);

          return ids;
       }

       /**
        * Mutate the existing equivalence relation minimally by imposing the
        * additional requirement that i ~ j.
        *
        * The algorithm updates the internal representation recursively in
        * order to guarantee transitivity while preserving the previously
        * specified equivalence requirements.
        */
       unsigned
       link(unsigned i, unsigned j)
       {
          const unsigned k = lookup(i);
          assign(i, k);
          assign(j, k);
          return k;
       }

    private:
       equivalence_relation(const equivalence_relation &);

       equivalence_relation &
       operator=(const equivalence_relation &);

       /**
        * Assign the representative of \p from to be equivalent to \p to.
        *
        * At the same time the data structure is partially flattened as much as
        * it's possible without increasing the number of recursive calls.
        */
       void
       assign(unsigned from, unsigned to)
       {
          if (from != to) {
             assert(from < n);

             if (is[from] != from)
                assign(is[from], to);

             is[from] = to;
          }
       }

       unsigned *is;
       unsigned n;
    };

    /**
     * Representation of a data dependency between two instructions in the
     * program.
     * @{
     */
    struct dependency {
       /**
        * No dependency information.
        */
       dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
                      unordered(TGL_SBID_NULL), id(0),
                      exec_all(false) {}

       /**
        * Construct a dependency on the in-order instruction with the provided
        * ordered_address instruction counter.
        */
       dependency(tgl_regdist_mode mode, ordered_address jp, bool exec_all) :
          ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
          exec_all(exec_all) {}

       /**
        * Construct a dependency on the out-of-order instruction with the
        * specified synchronization token.
        */
       dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
          ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id),
          exec_all(exec_all) {}

       /**
        * Synchronization mode of in-order dependency, or zero if no in-order
        * dependency is present.
        */
       tgl_regdist_mode ordered;

       /**
        * Instruction counter of in-order dependency.
        *
        * For a dependency part of a different block in the program, this is
        * relative to the specific control flow path taken between the
        * dependency and the current block: It is the ordered_address such that
        * the difference between it and the ordered_address of the first
        * instruction of the current block is exactly the number of in-order
        * instructions across that control flow path.  It is not guaranteed to
        * be equal to the local ordered_address of the generating instruction
        * [as returned by ordered_inst_addresses()], except for block-local
        * dependencies.
        */
       ordered_address jp;

       /**
        * Synchronization mode of unordered dependency, or zero if no unordered
        * dependency is present.
        */
       tgl_sbid_mode unordered;

       /** Synchronization token of out-of-order dependency. */
       unsigned id;

       /**
        * Whether the dependency could be run with execution masking disabled,
        * which might lead to the unwanted execution of the generating
        * instruction in cases where a BB is executed with all channels
        * disabled due to hardware bug GEN:BUG:1407528679.
        */
       bool exec_all;

       /**
        * Trivial in-order dependency that's always satisfied.
        *
        * Note that unlike a default-constructed dependency() which is also
        * trivially satisfied, this is considered to provide dependency
        * information and can be used to clear a previously pending dependency
        * via shadow().
        */
       static const dependency done;

       friend bool
       operator==(const dependency &dep0, const dependency &dep1)
       {
          return dep0.ordered == dep1.ordered &&
                 dep0.jp == dep1.jp &&
                 dep0.unordered == dep1.unordered &&
                 dep0.id == dep1.id &&
                 dep0.exec_all == dep1.exec_all;
       }

       friend bool
       operator!=(const dependency &dep0, const dependency &dep1)
       {
          return !(dep0 == dep1);
       }
    };

    const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN, false);

    /**
     * Return whether \p dep contains any dependency information.
     */
    bool
    is_valid(const dependency &dep)
    {
       return dep.ordered || dep.unordered;
    }

    /**
     * Combine \p dep0 and \p dep1 into a single dependency object that is only
     * satisfied when both original dependencies are satisfied.  This might
     * involve updating the equivalence relation \p eq in order to make sure
     * that both out-of-order dependencies are assigned the same hardware SBID
     * as synchronization token.
     */
    dependency
    merge(equivalence_relation &eq,
          const dependency &dep0, const dependency &dep1)
    {
       dependency dep;

       if (dep0.ordered || dep1.ordered) {
          dep.ordered = dep0.ordered | dep1.ordered;
          dep.jp = MAX2(dep0.jp, dep1.jp);
       }

       if (dep0.unordered || dep1.unordered) {
          dep.unordered = dep0.unordered | dep1.unordered;
          dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                           dep1.unordered ? dep1.id : dep0.id);
       }

       dep.exec_all = dep0.exec_all || dep1.exec_all;

       return dep;
    }

    /**
     * Override dependency information of \p dep0 with that of \p dep1.
     */
    dependency
    shadow(const dependency &dep0, const dependency &dep1)
    {
       return is_valid(dep1) ? dep1 : dep0;
    }

    /**
     * Translate dependency information across the program.
     *
     * This returns a dependency on the same instruction translated to the
     * ordered_address space of a different block.  The correct shift for
     * transporting a dependency across an edge of the CFG is the difference
     * between the local ordered_address of the first instruction of the target
     * block and the local ordered_address of the instruction immediately after
     * the end of the origin block.
     */
    dependency
    transport(dependency dep, int delta)
    {
       if (dep.ordered && dep.jp > INT_MIN)
          dep.jp += delta;

       return dep;
    }

    /**
     * Return simplified dependency removing any synchronization modes not
     * applicable to an instruction reading the same register location.
     */
    dependency
    dependency_for_read(dependency dep)
    {
       dep.ordered &= TGL_REGDIST_DST;
       return dep;
    }

    /**
     * Return simplified dependency removing any synchronization modes not
     * applicable to an instruction \p inst writing the same register location.
     */
    dependency
    dependency_for_write(const fs_inst *inst, dependency dep)
    {
       if (!is_unordered(inst))
          dep.ordered &= TGL_REGDIST_DST;
       return dep;
    }

    /** @} */

    /**
     * Scoreboard representation.  This keeps track of the data dependencies of
     * registers with GRF granularity.
     */
    class scoreboard {
    public:
       /**
        * Look up the most current data dependency for register \p r.
        */
       dependency
       get(const fs_reg &r) const
       {
          if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
             return *p;
          else
             return dependency();
       }

       /**
        * Specify the most current data dependency for register \p r.
        */
       void
       set(const fs_reg &r, const dependency &d)
       {
          if (dependency *p = dep(r))
             *p = d;
       }

       /**
        * Component-wise merge() of corresponding dependencies from two
        * scoreboard objects.  \sa merge().
        */
       friend scoreboard
       merge(equivalence_relation &eq,
             const scoreboard &sb0, const scoreboard &sb1)
       {
          scoreboard sb;

          for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
             sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

          sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);

          for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
             sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]);

          return sb;
       }

       /**
        * Component-wise shadow() of corresponding dependencies from two
        * scoreboard objects.  \sa shadow().
        */
       friend scoreboard
       shadow(const scoreboard &sb0, const scoreboard &sb1)
       {
          scoreboard sb;

          for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
             sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

          sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);

          for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
             sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]);

          return sb;
       }

       /**
        * Component-wise transport() of dependencies from a scoreboard
        * object.  \sa transport().
        */
       friend scoreboard
       transport(const scoreboard &sb0, int delta)
       {
          scoreboard sb;

          for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
             sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

          sb.addr_dep = transport(sb0.addr_dep, delta);

          for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
             sb.accum_deps[i] = transport(sb0.accum_deps[i], delta);

          return sb;
       }

       friend bool
       operator==(const scoreboard &sb0, const scoreboard &sb1)
       {
          for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
             if (sb0.grf_deps[i] != sb1.grf_deps[i])
                return false;
          }

          if (sb0.addr_dep != sb1.addr_dep)
             return false;

          for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) {
             if (sb0.accum_deps[i] != sb1.accum_deps[i])
                return false;
          }

          return true;
       }

       friend bool
       operator!=(const scoreboard &sb0, const scoreboard &sb1)
       {
          return !(sb0 == sb1);
       }

    private:
       dependency grf_deps[BRW_MAX_GRF];
       dependency addr_dep;
       dependency accum_deps[10];

       dependency *
       dep(const fs_reg &r)
       {
          const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                                reg_offset(r) / REG_SIZE);

          return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                  r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] :
                  r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                                   reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                  r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                                   reg < BRW_ARF_FLAG ? &accum_deps[
                                      reg - BRW_ARF_ACCUMULATOR] :
                  NULL);
       }
    };

    /**
     * Dependency list handling.
     * @{
     */
    struct dependency_list {
       dependency_list() : deps(NULL), n(0) {}

       ~dependency_list()
       {
          free(deps);
       }

       void
       push_back(const dependency &dep)
       {
          deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
          deps[n++] = dep;
       }

       unsigned
       size() const
       {
          return n;
       }

       const dependency &
       operator[](unsigned i) const
       {
          assert(i < n);
          return deps[i];
       }

       dependency &
       operator[](unsigned i)
       {
          assert(i < n);
          return deps[i];
       }

    private:
       dependency_list(const dependency_list &);
       dependency_list &
       operator=(const dependency_list &);

       dependency *deps;
       unsigned n;
    };

    /**
     * Add dependency \p dep to the list of dependencies of an instruction
     * \p deps.
     */
    void
    add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
    {
       if (is_valid(dep)) {
          /* Translate the unordered dependency token first in order to keep
           * the list minimally redundant.
           */
          if (dep.unordered)
             dep.id = ids[dep.id];

          /* Try to combine the specified dependency with any existing ones. */
          for (unsigned i = 0; i < deps.size(); i++) {
             /* Don't combine otherwise matching dependencies if there is an
              * exec_all mismatch which would cause a SET dependency to gain an
              * exec_all flag, since that would prevent it from being baked
              * into the instruction we want to allocate an SBID for.
              */
             if (deps[i].exec_all != dep.exec_all &&
                 (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                 (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
                continue;

             if (dep.ordered && deps[i].ordered) {
                deps[i].jp = MAX2(deps[i].jp, dep.jp);
                deps[i].ordered |= dep.ordered;
                deps[i].exec_all |= dep.exec_all;
                dep.ordered = TGL_REGDIST_NULL;
             }

             if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
                deps[i].unordered |= dep.unordered;
                deps[i].exec_all |= dep.exec_all;
                dep.unordered = TGL_SBID_NULL;
             }
          }

          /* Add it to the end of the list if necessary. */
          if (is_valid(dep))
             deps.push_back(dep);
       }
    }

    /**
     * Construct a tgl_swsb annotation encoding any ordered dependencies from
     * the dependency list \p deps of an instruction with ordered_address \p
     * jp.  If \p exec_all is false only dependencies known to be executed with
     * channel masking applied will be considered in the calculation.
     */
    tgl_swsb
    ordered_dependency_swsb(const dependency_list &deps,
                            const ordered_address &jp,
                            bool exec_all)
    {
       unsigned min_dist = ~0u;

       for (unsigned i = 0; i < deps.size(); i++) {
          if (deps[i].ordered && exec_all >= deps[i].exec_all) {
             const unsigned dist = jp - deps[i].jp;
             const unsigned max_dist = 10;
             assert(jp > deps[i].jp);
             if (dist <= max_dist)
                min_dist = MIN3(min_dist, dist, 7);
          }
       }

       return { min_dist == ~0u ? 0 : min_dist };
    }

    /**
     * Return whether the dependency list \p deps of an instruction with
     * ordered_address \p jp has any non-trivial ordered dependencies.  If \p
     * exec_all is false only dependencies known to be executed with channel
     * masking applied will be considered in the calculation.
     */
    bool
    find_ordered_dependency(const dependency_list &deps,
                            const ordered_address &jp,
                            bool exec_all)
    {
       return ordered_dependency_swsb(deps, jp, exec_all).regdist;
    }

    /**
     * Return the full tgl_sbid_mode bitset for the first unordered dependency
     * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
     * no such dependency is present.  If \p exec_all is false only
     * dependencies known to be executed with channel masking applied will be
     * considered in the calculation.
     */
    tgl_sbid_mode
    find_unordered_dependency(const dependency_list &deps,
                              tgl_sbid_mode unordered,
                              bool exec_all)
    {
       if (unordered) {
          for (unsigned i = 0; i < deps.size(); i++) {
             if ((unordered & deps[i].unordered) &&
                 exec_all >= deps[i].exec_all)
                return deps[i].unordered;
          }
       }

       return TGL_SBID_NULL;
    }

    /**
     * Return the tgl_sbid_mode bitset of an unordered dependency from the list
     * \p deps that can be represented directly in the SWSB annotation of the
     * instruction without additional SYNC instructions, or zero if no such
     * dependency is present.
     */
    tgl_sbid_mode
    baked_unordered_dependency_mode(const fs_inst *inst,
                                    const dependency_list &deps,
                                    const ordered_address &jp)
    {
       const bool exec_all = inst->force_writemask_all;
       const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);

       if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
          return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
       else if (has_ordered && is_unordered(inst))
          return TGL_SBID_NULL;
       else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
                (!has_ordered || !is_unordered(inst)))
          return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
       else if (!has_ordered)
          return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
       else
          return TGL_SBID_NULL;
    }

    /** @} */

    /**
     * Shader instruction dependency calculation.
     * @{
     */

    /**
     * Update scoreboard object \p sb to account for the execution of
     * instruction \p inst.
     */
    void
    update_inst_scoreboard(const ordered_address *jps,
                           const fs_inst *inst, unsigned ip, scoreboard &sb)
    {
       const bool exec_all = inst->force_writemask_all;

       /* Track any source registers that may be fetched asynchronously by this
        * instruction, otherwise clear the dependency in order to avoid
        * subsequent redundant synchronization.
        */
       for (unsigned i = 0; i < inst->sources; i++) {
          const dependency rd_dep =
             (inst->is_payload(i) ||
              inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
             ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip], exec_all) :
             dependency::done;

          for (unsigned j = 0; j < regs_read(inst, i); j++)
             sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
       }

       if (is_send(inst) && inst->base_mrf != -1) {
          const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

          for (unsigned j = 0; j < inst->mlen; j++)
             sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
       }

       /* Track any destination registers of this instruction. */
       const dependency wr_dep =
          is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
          ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip], exec_all) :
          dependency();

       if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
           !inst->dst.is_null()) {
          for (unsigned j = 0; j < regs_written(inst); j++)
             sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
       }
    }

    /**
     * Calculate scoreboard objects locally that represent any pending (and
     * unconditionally resolved) dependencies at the end of each block of the
     * program.
     */
    scoreboard *
    gather_block_scoreboards(const fs_visitor *shader,
                             const ordered_address *jps)
    {
       scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
       unsigned ip = 0;

       foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
          update_inst_scoreboard(jps, inst, ip++, sbs[block->num]);

       return sbs;
    }

    /**
     * Propagate data dependencies globally through the control flow graph
     * until a fixed point is reached.
     *
     * Calculates the set of dependencies potentially pending at the beginning
     * of each block, and returns it as an array of scoreboard objects.
     */
    scoreboard *
    propagate_block_scoreboards(const fs_visitor *shader,
                                const ordered_address *jps,
                                equivalence_relation &eq)
    {
       const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
       scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
       scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

       for (bool progress = true; progress;) {
          progress = false;

          foreach_block(block, shader->cfg) {
             const scoreboard sb = shadow(in_sbs[block->num],
                                          delta_sbs[block->num]);

             if (sb != out_sbs[block->num]) {
                foreach_list_typed(bblock_link, child_link, link,
                                   &block->children) {
                   scoreboard &in_sb = in_sbs[child_link->block->num];
                   const int delta =
                      jps[child_link->block->start_ip] - jps[block->end_ip]
                      - ordered_unit(static_cast<const fs_inst *>(block->end()));

                   in_sb = merge(eq, in_sb, transport(sb, delta));
                }

                out_sbs[block->num] = sb;
                progress = true;
             }
          }
       }

       delete[] delta_sbs;
       delete[] out_sbs;

       return in_sbs;
    }

    /**
     * Return the list of potential dependencies of each instruction in the
     * shader based on the result of global dependency analysis.
     */
    dependency_list *
    gather_inst_dependencies(const fs_visitor *shader,
                             const ordered_address *jps)
    {
       equivalence_relation eq(num_instructions(shader));
       scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
       const unsigned *ids = eq.flatten();
       dependency_list *deps = new dependency_list[num_instructions(shader)];
       unsigned ip = 0;

       foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
          const bool exec_all = inst->force_writemask_all;
          scoreboard &sb = sbs[block->num];

          for (unsigned i = 0; i < inst->sources; i++) {
             for (unsigned j = 0; j < regs_read(inst, i); j++)
                add_dependency(ids, deps[ip], dependency_for_read(
                   sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
          }

          if (is_send(inst) && inst->base_mrf != -1) {
             for (unsigned j = 0; j < inst->mlen; j++)
                add_dependency(ids, deps[ip], dependency_for_read(
                   sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
          }

          if (is_unordered(inst))
             add_dependency(ids, deps[ip],
                            dependency(TGL_SBID_SET, ip, exec_all));

          if (!inst->no_dd_check) {
             if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
                for (unsigned j = 0; j < regs_written(inst); j++) {
                   add_dependency(ids, deps[ip], dependency_for_write(inst,
                      sb.get(byte_offset(inst->dst, REG_SIZE * j))));
                }
             }

             if (is_send(inst) && inst->base_mrf != -1) {
                for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
                   add_dependency(ids, deps[ip], dependency_for_write(inst,
                      sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
             }
          }

          update_inst_scoreboard(jps, inst, ip, sb);
          ip++;
       }

       delete[] sbs;
       delete[] ids;

       return deps;
    }

    /** @} */

    /**
     * Allocate SBID tokens to track the execution of every out-of-order
     * instruction of the shader.
     */
    dependency_list *
    allocate_inst_dependencies(const fs_visitor *shader,
                               const dependency_list *deps0)
    {
       /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
        *       shaders with a large number of SEND messages.
        */

       /* Allocate an unordered dependency ID to hardware SBID translation
        * table with as many entries as instructions there are in the shader,
        * which is the maximum number of unordered IDs we can find in the
        * program.
        */
       unsigned *ids = new unsigned[num_instructions(shader)];
       for (unsigned ip = 0; ip < num_instructions(shader); ip++)
          ids[ip] = ~0u;

       dependency_list *deps1 = new dependency_list[num_instructions(shader)];
       unsigned next_id = 0;

       for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
          for (unsigned i = 0; i < deps0[ip].size(); i++) {
             const dependency &dep = deps0[ip][i];

             if (dep.unordered && ids[dep.id] == ~0u)
                ids[dep.id] = (next_id++) & 0xf;

             add_dependency(ids, deps1[ip], dep);
          }
       }

       delete[] ids;

       return deps1;
    }

    /**
     * Emit dependency information provided by \p deps into the shader,
     * inserting additional SYNC instructions for dependencies that can't be
     * represented directly by annotating existing instructions.
     */
    void
    emit_inst_dependencies(fs_visitor *shader,
                           const ordered_address *jps,
                           const dependency_list *deps)
    {
       unsigned ip = 0;

       foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
          const bool exec_all = inst->force_writemask_all;
          tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip], exec_all);
          const tgl_sbid_mode unordered_mode =
             baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);

          for (unsigned i = 0; i < deps[ip].size(); i++) {
             const dependency &dep = deps[ip][i];

             if (dep.unordered) {
                if (unordered_mode == dep.unordered &&
                    exec_all >= dep.exec_all && !swsb.mode) {
                   /* Bake unordered dependency into the instruction's SWSB if
                    * possible, except in cases where the current instruction
                    * isn't marked NoMask but the dependency is, since that
                    * might lead to data coherency issues due to
                    * GEN:BUG:1407528679.
                    */
                   swsb.sbid = dep.id;
                   swsb.mode = dep.unordered;
                } else {
                   /* Emit dependency into the SWSB of an extra SYNC
                    * instruction.
                    */
                   const fs_builder ibld = fs_builder(shader, block, inst)
                                           .exec_all().group(1, 0);
                   fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                             brw_imm_ud(TGL_SYNC_NOP));
                   sync->sched.sbid = dep.id;
                   sync->sched.mode = dep.unordered;
                   assert(!(sync->sched.mode & TGL_SBID_SET));
                }
             }
          }

          for (unsigned i = 0; i < deps[ip].size(); i++) {
             const dependency &dep = deps[ip][i];

             if (dep.ordered && dep.exec_all > exec_all &&
                 find_ordered_dependency(deps[ip], jps[ip], true)) {
                /* If the current instruction is not marked NoMask but an
                 * ordered dependency is, perform the synchronization as a
                 * separate NoMask SYNC instruction in order to avoid data
                 * coherency issues due to GEN:BUG:1407528679.  The similar
                 * scenario with unordered dependencies should have been
                 * handled above.
                 */
                const fs_builder ibld = fs_builder(shader, block, inst)
                                        .exec_all().group(1, 0);
                fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                          brw_imm_ud(TGL_SYNC_NOP));
                sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
                break;
             }
          }

          /* Update the IR. */
          inst->sched = swsb;
          inst->no_dd_check = inst->no_dd_clear = false;
          ip++;
       }
    }
 }

 bool
 fs_visitor::lower_scoreboard()
 {
    if (devinfo->gen >= 12) {
       const ordered_address *jps = ordered_inst_addresses(this);
       const dependency_list *deps0 = gather_inst_dependencies(this, jps);
       const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
       emit_inst_dependencies(this, jps, deps1);
       delete[] deps1;
       delete[] deps0;
       delete[] jps;
    }

    return true;
 }