Edit

IABSD.fr/xenocara/lib/mesa/src/intel/compiler/brw_opt.cpp

Branch :

  • Show log

    Commit

  • Author : jsg
    Date : 2025-06-05 11:23:11
    Hash : 67d6f117
    Message : Import Mesa 25.0.7

  • lib/mesa/src/intel/compiler/brw_opt.cpp
  • /*
     * Copyright © 2010 Intel Corporation
     * SPDX-License-Identifier: MIT
     */
    
    #include "brw_eu.h"
    #include "brw_fs.h"
    #include "brw_builder.h"
    
    #include "dev/intel_debug.h"
    
    using namespace brw;
    
    void
    brw_optimize(fs_visitor &s)
    {
       const nir_shader *nir = s.nir;
    
       s.debug_optimizer(nir, "start", 0, 0);
    
       /* Start by validating the shader we currently have. */
       brw_validate(s);
    
       /* Track how much non-SSA at this point. */
       {
          const brw::def_analysis &defs = s.def_analysis.require();
          s.shader_stats.non_ssa_registers_after_nir =
             defs.count() - defs.ssa_count();
       }
    
       bool progress = false;
       int iteration = 0;
       int pass_num = 0;
    
    #define OPT(pass, ...) ({                                               \
          pass_num++;                                                       \
          bool this_progress = pass(s, ##__VA_ARGS__);                      \
                                                                            \
          if (this_progress)                                                \
             s.debug_optimizer(nir, #pass, iteration, pass_num);            \
                                                                            \
          brw_validate(s);                                                  \
                                                                            \
          progress = progress || this_progress;                             \
          this_progress;                                                    \
       })
    
       if (s.compiler->lower_dpas)
          OPT(brw_lower_dpas);
    
       OPT(brw_opt_split_virtual_grfs);
    
       /* Before anything else, eliminate dead code.  The results of some NIR
        * instructions may effectively be calculated twice.  Once when the
        * instruction is encountered, and again when the user of that result is
        * encountered.  Wipe those away before algebraic optimizations and
        * especially copy propagation can mix things up.
        */
       OPT(brw_opt_dead_code_eliminate);
    
       OPT(brw_opt_remove_extra_rounding_modes);
    
       OPT(brw_opt_eliminate_find_live_channel);
    
       do {
          progress = false;
          pass_num = 0;
          iteration++;
    
          OPT(brw_opt_algebraic);
          OPT(brw_opt_cse_defs);
          if (!OPT(brw_opt_copy_propagation_defs))
             OPT(brw_opt_copy_propagation);
          OPT(brw_opt_cmod_propagation);
          OPT(brw_opt_dead_code_eliminate);
          OPT(brw_opt_saturate_propagation);
          OPT(brw_opt_register_coalesce);
    
          OPT(brw_opt_compact_virtual_grfs);
       } while (progress);
    
       brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_OPT_LOOP);
    
       progress = false;
       pass_num = 0;
    
       if (OPT(brw_opt_combine_convergent_txf))
          OPT(brw_opt_copy_propagation_defs);
    
       if (OPT(brw_lower_pack)) {
          OPT(brw_opt_register_coalesce);
          OPT(brw_opt_dead_code_eliminate);
       }
    
       OPT(brw_lower_subgroup_ops);
       OPT(brw_lower_csel);
       OPT(brw_lower_simd_width);
       OPT(brw_lower_scalar_fp64_MAD);
       OPT(brw_lower_barycentrics);
       OPT(brw_lower_logical_sends);
    
       brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_EARLY_LOWERING);
    
       /* After logical SEND lowering. */
    
       if (!OPT(brw_opt_copy_propagation_defs))
          OPT(brw_opt_copy_propagation);
    
       /* Identify trailing zeros LOAD_PAYLOAD of sampler messages.
        * Do this before splitting SENDs.
        */
       if (OPT(brw_opt_zero_samples)) {
          if (!OPT(brw_opt_copy_propagation_defs)) {
             OPT(brw_opt_copy_propagation);
          }
       }
    
       if (s.devinfo->ver >= 30)
          OPT(brw_opt_send_to_send_gather);
    
       OPT(brw_opt_split_sends);
       OPT(brw_workaround_nomask_control_flow);
    
       if (progress) {
          /* Do both forms of copy propagation because it is important to
           * eliminate as many cases of load_payload-of-load_payload as possible.
           */
          OPT(brw_opt_copy_propagation_defs);
          OPT(brw_opt_copy_propagation);
    
          /* Run after logical send lowering to give it a chance to CSE the
           * LOAD_PAYLOAD instructions created to construct the payloads of
           * e.g. texturing messages in cases where it wasn't possible to CSE the
           * whole logical instruction.
           */
          OPT(brw_opt_cse_defs);
          OPT(brw_opt_register_coalesce);
          OPT(brw_opt_dead_code_eliminate);
       }
    
       OPT(brw_opt_remove_redundant_halts);
    
       if (OPT(brw_lower_load_payload)) {
          OPT(brw_opt_split_virtual_grfs);
    
          OPT(brw_opt_register_coalesce);
          OPT(brw_lower_simd_width);
          OPT(brw_opt_dead_code_eliminate);
       }
    
       brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING);
    
       OPT(brw_lower_alu_restrictions);
    
       OPT(brw_opt_combine_constants);
       if (OPT(brw_lower_integer_multiplication)) {
          /* If lower_integer_multiplication made progress, it may have produced
           * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
           * one more time to clean those up if they exist.
           */
          OPT(brw_lower_integer_multiplication);
       }
       OPT(brw_lower_sub_sat);
    
       progress = false;
       OPT(brw_lower_derivatives);
       OPT(brw_lower_regioning);
    
       /* Try both copy propagation passes.  The defs one will likely not be
        * able to handle everything at this point.
        */
       const bool cp1 = OPT(brw_opt_copy_propagation_defs);
       const bool cp2 = OPT(brw_opt_copy_propagation);
       if (cp1 || cp2)
          OPT(brw_opt_combine_constants);
    
       OPT(brw_opt_dead_code_eliminate);
       OPT(brw_opt_register_coalesce);
    
       if (progress)
          OPT(brw_lower_simd_width);
    
       if (s.devinfo->ver >= 30)
          OPT(brw_opt_send_gather_to_send);
    
       OPT(brw_lower_uniform_pull_constant_loads);
    
       /* Do this before brw_lower_send_descriptors. */
       OPT(brw_workaround_memory_fence_before_eot);
    
       if (OPT(brw_lower_send_descriptors)) {
          /* No need for standard copy_propagation since
           * brw_fs_opt_address_reg_load will only optimize defs.
           */
          if (OPT(brw_opt_copy_propagation_defs))
             OPT(brw_opt_algebraic);
          OPT(brw_opt_address_reg_load);
          OPT(brw_opt_dead_code_eliminate);
       }
    
       OPT(brw_lower_sends_overlapping_payload);
    
       OPT(brw_lower_indirect_mov);
    
       OPT(brw_lower_find_live_channel);
    
       OPT(brw_lower_load_subgroup_invocation);
    
       brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_LATE_LOWERING);
    }
    
    static unsigned
    load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
    {
       assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
       assert(size_read >= lp->header_size * REG_SIZE);
    
       unsigned i;
       unsigned size = lp->header_size * REG_SIZE;
       for (i = lp->header_size; size < size_read && i < lp->sources; i++)
          size += lp->exec_size * brw_type_size_bytes(lp->src[i].type);
    
       /* Size read must cover exactly a subset of sources. */
       assert(size == size_read);
       return i;
    }
    
    /**
     * Optimize sample messages that have constant zero values for the trailing
     * parameters. We can just reduce the message length for these
     * instructions instead of reserving a register for it. Trailing parameters
     * that aren't sent default to zero anyway. This will cause the dead code
     * eliminator to remove the MOV instruction that would otherwise be emitted to
     * set up the zero value.
     */
    
    bool
    brw_opt_zero_samples(fs_visitor &s)
    {
       bool progress = false;
    
       foreach_block_and_inst(block, fs_inst, send, s.cfg) {
          if (send->opcode != SHADER_OPCODE_SEND ||
              send->sfid != BRW_SFID_SAMPLER)
             continue;
    
          /* Wa_14012688258:
           *
           * Don't trim zeros at the end of payload for sample operations
           * in cube and cube arrays.
           */
          if (send->keep_payload_trailing_zeros)
             continue;
    
          /* This pass works on SENDs before splitting. */
          if (send->ex_mlen > 0)
             continue;
    
          fs_inst *lp = (fs_inst *) send->prev;
    
          if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
             continue;
    
          /* How much of the payload are actually read by this SEND. */
          const unsigned params =
             load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
    
          /* We don't want to remove the message header or the first parameter.
           * Removing the first parameter is not allowed, see the Haswell PRM
           * volume 7, page 149:
           *
           *     "Parameter 0 is required except for the sampleinfo message, which
           *      has no parameter 0"
           */
          const unsigned first_param_idx = lp->header_size;
          unsigned zero_size = 0;
          for (unsigned i = params - 1; i > first_param_idx; i--) {
             if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
                break;
             zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride;
          }
    
          /* Round down to ensure to only consider full registers. */
          const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
          if (zero_len > 0) {
             /* Note mlen is in REG_SIZE units. */
             send->mlen -= zero_len;
             progress = true;
          }
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
    
       return progress;
    }
    
    /**
     * Opportunistically split SEND message payloads.
     *
     * Gfx9+ supports "split" SEND messages, which take two payloads that are
     * implicitly concatenated.  If we find a SEND message with a single payload,
     * we can split that payload in two.  This results in smaller contiguous
     * register blocks for us to allocate.  But it can help beyond that, too.
     *
     * We try and split a LOAD_PAYLOAD between sources which change registers.
     * For example, a sampler message often contains a x/y/z coordinate that may
     * already be in a contiguous VGRF, combined with an LOD, shadow comparitor,
     * or array index, which comes from elsewhere.  In this case, the first few
     * sources will be different offsets of the same VGRF, then a later source
     * will be a different VGRF.  So we split there, possibly eliminating the
     * payload concatenation altogether.
     */
    bool
    brw_opt_split_sends(fs_visitor &s)
    {
       bool progress = false;
    
       foreach_block_and_inst(block, fs_inst, send, s.cfg) {
          if (send->opcode != SHADER_OPCODE_SEND ||
              send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0 ||
              send->src[2].file != VGRF)
             continue;
    
          /* Currently don't split sends that reuse a previously used payload. */
          fs_inst *lp = (fs_inst *) send->prev;
    
          if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
             continue;
    
          if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
             continue;
    
          /* Split either after the header (if present), or when consecutive
           * sources switch from one VGRF to a different one.
           */
          unsigned mid = lp->header_size;
          if (mid == 0) {
             for (mid = 1; mid < lp->sources; mid++) {
                if (lp->src[mid].file == BAD_FILE)
                   continue;
    
                if (lp->src[0].file != lp->src[mid].file ||
                    lp->src[0].nr != lp->src[mid].nr)
                   break;
             }
          }
    
          /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
           * find out how many sources from the payload does it really need.
           */
          const unsigned end =
             load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
    
          /* Nothing to split. */
          if (end <= mid)
             continue;
    
          const brw_builder ibld(&s, block, lp);
          fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
          fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);
    
          assert(lp1->size_written % REG_SIZE == 0);
          assert(lp2->size_written % REG_SIZE == 0);
          assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);
    
          lp1->dst = brw_vgrf(s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
          lp2->dst = brw_vgrf(s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);
    
          send->resize_sources(4);
          send->src[2] = lp1->dst;
          send->src[3] = lp2->dst;
          send->ex_mlen = lp2->size_written / REG_SIZE;
          send->mlen -= send->ex_mlen;
    
          progress = true;
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
    
       return progress;
    }
    
    /**
     * Remove redundant or useless halts.
     *
     * For example, we can eliminate halts in the following sequence:
     *
     * halt        (redundant with the next halt)
     * halt        (useless; jumps to the next instruction)
     * halt-target
     */
    bool
    brw_opt_remove_redundant_halts(fs_visitor &s)
    {
       bool progress = false;
    
       unsigned halt_count = 0;
       fs_inst *halt_target = NULL;
       bblock_t *halt_target_block = NULL;
       foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
          if (inst->opcode == BRW_OPCODE_HALT)
             halt_count++;
    
          if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
             halt_target = inst;
             halt_target_block = block;
             break;
          }
       }
    
       if (!halt_target) {
          assert(halt_count == 0);
          return false;
       }
    
       /* Delete any HALTs immediately before the halt target. */
       for (fs_inst *prev = (fs_inst *) halt_target->prev;
            !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
            prev = (fs_inst *) halt_target->prev) {
          prev->remove(halt_target_block);
          halt_count--;
          progress = true;
       }
    
       if (halt_count == 0) {
          halt_target->remove(halt_target_block);
          progress = true;
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
    
       return progress;
    }
    
    /**
     * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
     * flow.  We could probably do better here with some form of divergence
     * analysis.
     */
    bool
    brw_opt_eliminate_find_live_channel(fs_visitor &s)
    {
       bool progress = false;
       unsigned depth = 0;
    
       if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                          s.prog_data)) {
          /* The optimization below assumes that channel zero is live on thread
           * dispatch, which may not be the case if the fixed function dispatches
           * threads sparsely.
           */
          return false;
       }
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
          switch (inst->opcode) {
          case BRW_OPCODE_IF:
          case BRW_OPCODE_DO:
             depth++;
             break;
    
          case BRW_OPCODE_ENDIF:
          case BRW_OPCODE_WHILE:
             depth--;
             break;
    
          case BRW_OPCODE_HALT:
             /* This can potentially make control flow non-uniform until the end
              * of the program.
              */
             goto out;
    
          case SHADER_OPCODE_FIND_LIVE_CHANNEL:
             if (depth == 0) {
                inst->opcode = BRW_OPCODE_MOV;
                inst->src[0] = brw_imm_ud(0u);
                inst->force_writemask_all = true;
    
                /* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
                 * size_written set by hand to a smaller value. In this case,
                 * munge the exec_size to match.
                 */
                if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo)))
                   inst->exec_size = 8 * reg_unit(s.devinfo);
    
                inst->resize_sources(1);
                progress = true;
    
                /* emit_uniformize() frequently emits FIND_LIVE_CHANNEL paired
                 * with a BROADCAST.  Save some work for opt_copy_propagation
                 * and opt_algebraic by trivially cleaning up both together.
                 */
                assert(!inst->next->is_tail_sentinel());
                fs_inst *bcast = (fs_inst *) inst->next;
    
                /* Ignore stride when comparing */
                if (bcast->opcode == SHADER_OPCODE_BROADCAST &&
                    inst->dst.file == VGRF &&
                    inst->dst.file == bcast->src[1].file &&
                    inst->dst.nr == bcast->src[1].nr &&
                    inst->dst.offset == bcast->src[1].offset) {
                   bcast->opcode = BRW_OPCODE_MOV;
                   if (!is_uniform(bcast->src[0]))
                      bcast->src[0] = component(bcast->src[0], 0);
    
                   bcast->force_writemask_all = true;
                   bcast->exec_size = 8 * reg_unit(s.devinfo);
                   assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size));
                   bcast->resize_sources(1);
                }
             }
             break;
    
          default:
             break;
          }
       }
    
    out:
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
    
       return progress;
    }
    
    /**
     * Rounding modes for conversion instructions are included for each
     * conversion, but right now it is a state. So once it is set,
     * we don't need to call it again for subsequent calls.
     *
     * This is useful for vector/matrices conversions, as setting the
     * mode once is enough for the full vector/matrix
     */
    bool
    brw_opt_remove_extra_rounding_modes(fs_visitor &s)
    {
       bool progress = false;
       unsigned execution_mode = s.nir->info.float_controls_execution_mode;
    
       brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
       if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
            FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
            FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
           execution_mode)
          base_mode = BRW_RND_MODE_RTNE;
       if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
            FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
            FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
           execution_mode)
          base_mode = BRW_RND_MODE_RTZ;
    
       foreach_block (block, s.cfg) {
          brw_rnd_mode prev_mode = base_mode;
    
          foreach_inst_in_block_safe (fs_inst, inst, block) {
             if (inst->opcode == SHADER_OPCODE_RND_MODE) {
                assert(inst->src[0].file == IMM);
                const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
                if (mode == prev_mode) {
                   inst->remove(block);
                   progress = true;
                } else {
                   prev_mode = mode;
                }
             }
          }
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
    
       return progress;
    }
    
    bool
    brw_opt_send_to_send_gather(fs_visitor &s)
    {
       const intel_device_info *devinfo = s.devinfo;
       bool progress = false;
    
       assert(devinfo->ver >= 30);
    
       const unsigned unit = reg_unit(devinfo);
       assert(unit == 2);
    
       unsigned count = 0;
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
          if (inst->opcode != SHADER_OPCODE_SEND)
             continue;
    
          /* For 1-2 registers, send-gather offers no benefits over split-send. */
          if (inst->mlen + inst->ex_mlen <= 2 * unit)
             continue;
    
          assert(inst->mlen % unit == 0);
          assert(inst->ex_mlen % unit == 0);
    
          struct {
             brw_reg src;
             unsigned phys_len;
          } payload[2] = {
             { inst->src[2], inst->mlen / unit },
             { inst->src[3], inst->ex_mlen / unit },
          };
    
          const unsigned num_payload_sources = payload[0].phys_len + payload[1].phys_len;
    
          /* Limited by Src0.Length in the SEND instruction. */
          if (num_payload_sources > 15)
             continue;
    
          if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
             count++;
             continue;
          }
    
          inst->resize_sources(3 + num_payload_sources);
          /* Sources 0 and 1 remain the same.  Source 2 will be filled
           * after register allocation.
           */
          inst->src[2] = {};
    
          int idx = 3;
          for (unsigned p = 0; p < ARRAY_SIZE(payload); p++) {
             for (unsigned i = 0; i < payload[p].phys_len; i++) {
                inst->src[idx++] = byte_offset(payload[p].src,
                                               i * reg_unit(devinfo) * REG_SIZE);
             }
          }
          assert(idx == inst->sources);
    
          inst->opcode = SHADER_OPCODE_SEND_GATHER;
          inst->mlen = 0;
          inst->ex_mlen = 0;
    
          progress = true;
       }
    
       if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
          fprintf(stderr, "Ignored %u opportunities to try SEND_GATHER in %s shader.\n",
                  count, _mesa_shader_stage_to_string(s.stage));
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                                DEPENDENCY_INSTRUCTION_DATA_FLOW);
    
       return progress;
    }
    
    /* If after optimizations, the sources are *still* contiguous in a
     * SEND_GATHER, prefer to use the regular SEND, which would save
     * having to write the ARF scalar register.
     */
    bool
    brw_opt_send_gather_to_send(fs_visitor &s)
    {
       const intel_device_info *devinfo = s.devinfo;
       bool progress = false;
    
       assert(devinfo->ver >= 30);
    
       const unsigned unit = reg_unit(devinfo);
       assert(unit == 2);
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
          if (inst->opcode != SHADER_OPCODE_SEND_GATHER)
             continue;
    
          assert(inst->sources > 2);
          assert(inst->src[2].file == BAD_FILE);
    
          const int num_payload_sources = inst->sources - 3;
          assert(num_payload_sources > 0);
    
          /* Limited by Src0.Length in the SEND instruction. */
          assert(num_payload_sources < 16);
    
          /* Determine whether the sources are still spread in either one or two
           * spans.  In those cases the regular SEND instruction can be used
           * and there's no need to use SEND_GATHER (which would set ARF scalar register
           * adding an extra instruction).
           */
          const brw_reg *payload = &inst->src[3];
          brw_reg payload1       = payload[0];
          brw_reg payload2       = {};
          int payload1_len       = 0;
          int payload2_len       = 0;
    
          for (int i = 0; i < num_payload_sources; i++) {
             if (payload[i].file == VGRF &&
                 payload[i].nr == payload1.nr &&
                 payload[i].offset == payload1_len * REG_SIZE * unit)
                payload1_len++;
             else {
                payload2 = payload[i];
                break;
             }
          }
    
          if (payload2.file == VGRF) {
             for (int i = payload1_len; i < num_payload_sources; i++) {
                if (payload[i].file == VGRF &&
                    payload[i].nr == payload2.nr &&
                    payload[i].offset == payload2_len * REG_SIZE * unit)
                   payload2_len++;
                else
                   break;
             }
          } else {
             payload2 = brw_null_reg();
          }
    
          if (payload1_len + payload2_len != num_payload_sources)
             continue;
    
          /* Bspec 57058 (r64705) says
           *
           *    When a source data payload is used in dataport message, that payload
           *    must be specified as Source 1 portion of a Split Send message.
           *
           * But at this point the split point is not guaranteed to respect that.
           *
           * TODO: Pass LSC address length or infer it so valid splits can work.
           */
          if (payload2_len && (inst->sfid == GFX12_SFID_UGM ||
                               inst->sfid == GFX12_SFID_TGM ||
                               inst->sfid == GFX12_SFID_SLM ||
                               inst->sfid == BRW_SFID_URB)) {
             enum lsc_opcode lsc_op = lsc_msg_desc_opcode(devinfo, inst->desc);
             if (lsc_op_num_data_values(lsc_op) > 0)
                continue;
          }
    
          inst->resize_sources(4);
          inst->opcode  = SHADER_OPCODE_SEND;
          inst->src[2]  = payload1;
          inst->src[3]  = payload2;
          inst->mlen    = payload1_len * unit;
          inst->ex_mlen = payload2_len * unit;
    
          progress = true;
       }
    
       if (progress) {
          s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                                DEPENDENCY_INSTRUCTION_DATA_FLOW);
       }
    
       return progress;
    }