Edit

IABSD.fr/xenocara/lib/mesa/src/intel/compiler/brw_workaround.cpp

Branch :

  • Show log

    Commit

  • Author : jsg
    Date : 2025-06-05 11:23:11
    Hash : 67d6f117
    Message : Import Mesa 25.0.7

  • lib/mesa/src/intel/compiler/brw_workaround.cpp
  • /*
     * Copyright © 2010 Intel Corporation
     * SPDX-License-Identifier: MIT
     */
    
    #include "brw_fs.h"
    #include "brw_builder.h"
    
    using namespace brw;
    
    /* Wa_14015360517
     *
     * The first instruction of any kernel should have non-zero emask.
     * Make sure this happens by introducing a dummy mov instruction.
     */
    bool
    brw_workaround_emit_dummy_mov_instruction(fs_visitor &s)
    {
       if (!intel_needs_workaround(s.devinfo, 14015360517))
          return false;
    
       fs_inst *first_inst =
          s.cfg->first_block()->start();
    
       /* We can skip the WA if first instruction is marked with
        * force_writemask_all or exec_size equals dispatch_width.
        */
       if (first_inst->force_writemask_all ||
           first_inst->exec_size == s.dispatch_width)
          return false;
    
       /* Insert dummy mov as first instruction. */
       const brw_builder ubld =
          brw_builder(&s, s.cfg->first_block(), (fs_inst *)first_inst).exec_all().group(8, 0);
       ubld.MOV(ubld.null_reg_ud(), brw_imm_ud(0u));
    
       s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
       return true;
    }
    
    static bool
    needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
    {
       /* This workaround is about making sure that any instruction writing
        * through UGM has completed before we hit EOT.
        */
       if (inst->sfid != GFX12_SFID_UGM)
          return false;
    
       /* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
        * where the L1-cache override is NOT among {WB, WS, WT}
        */
       enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
       if (lsc_opcode_is_store(opcode)) {
          switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
          case LSC_CACHE_STORE_L1STATE_L3MOCS:
          case LSC_CACHE_STORE_L1WB_L3WB:
          case LSC_CACHE_STORE_L1S_L3UC:
          case LSC_CACHE_STORE_L1S_L3WB:
          case LSC_CACHE_STORE_L1WT_L3UC:
          case LSC_CACHE_STORE_L1WT_L3WB:
             return false;
    
          default:
             return true;
          }
       }
    
       /* Any UGM Atomic message WITHOUT return value */
       if (lsc_opcode_is_atomic(opcode) && inst->dst.is_null())
          return true;
    
       return false;
    }
    
    /* Wa_22013689345
     *
     * We need to emit UGM fence message before EOT, if shader has any UGM write
     * or atomic message.
     *
     * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
     *                We probably need a better criteria in needs_dummy_fence().
     */
    bool
    brw_workaround_memory_fence_before_eot(fs_visitor &s)
    {
       bool progress = false;
       bool has_ugm_write_or_atomic = false;
    
       if (!intel_needs_workaround(s.devinfo, 22013689345))
          return false;
    
       /* Needs to happen after brw_lower_logical_sends & before
        * brw_lower_send_descriptors.
        */
       assert(s.phase == BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING);
    
       foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
          if (!inst->eot) {
             if (needs_dummy_fence(s.devinfo, inst))
                has_ugm_write_or_atomic = true;
             continue;
          }
    
          if (!has_ugm_write_or_atomic)
             break;
    
          const brw_builder ibld(&s, block, inst);
          const brw_builder ubld = ibld.exec_all().group(1, 0);
    
          brw_reg dst = ubld.vgrf(BRW_TYPE_UD);
          fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_SEND, dst);
    
          dummy_fence->resize_sources(4);
          dummy_fence->src[0] = brw_imm_ud(0);
          dummy_fence->src[1] = brw_imm_ud(0);
          dummy_fence->src[2] = brw_vec8_grf(0, 0);
          dummy_fence->src[3] = brw_reg();
          dummy_fence->mlen = reg_unit(s.devinfo);
          dummy_fence->ex_mlen = 0;
          dummy_fence->sfid = GFX12_SFID_UGM;
          dummy_fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_TILE,
                                                 LSC_FLUSH_TYPE_NONE_6, false);
          ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
          progress = true;
          /* TODO: remove this break if we ever have shader with multiple EOT. */
          break;
       }
    
       if (progress) {
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
                                DEPENDENCY_VARIABLES);
       }
    
       return progress;
    }
    
    /**
     * Find the first instruction in the program that might start a region of
     * divergent control flow due to a HALT jump.  There is no
     * find_halt_control_flow_region_end(), the region of divergence extends until
     * the only SHADER_OPCODE_HALT_TARGET in the program.
     */
    static const fs_inst *
    find_halt_control_flow_region_start(const fs_visitor *v)
    {
       foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
          if (inst->opcode == BRW_OPCODE_HALT ||
              inst->opcode == SHADER_OPCODE_HALT_TARGET)
             return inst;
       }
    
       return NULL;
    }
    
    /**
     * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
     * can cause a BB to be executed with all channels disabled, which will lead
     * to the execution of any NoMask instructions in it, even though any
     * execution-masked instructions will be correctly shot down.  This may break
     * assumptions of some NoMask SEND messages whose descriptor depends on data
     * generated by live invocations of the shader.
     *
     * This avoids the problem by predicating certain instructions on an ANY
     * horizontal predicate that makes sure that their execution is omitted when
     * all channels of the program are disabled.
     */
    bool
    brw_workaround_nomask_control_flow(fs_visitor &s)
    {
       if (s.devinfo->ver != 12)
          return false;
    
       const brw_predicate pred = s.dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
                                  s.dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
                                  BRW_PREDICATE_ALIGN1_ANY8H;
       const fs_inst *halt_start = find_halt_control_flow_region_start(&s);
       unsigned depth = 0;
       bool progress = false;
    
       const fs_live_variables &live_vars = s.live_analysis.require();
    
       /* Scan the program backwards in order to be able to easily determine
        * whether the flag register is live at any point.
        */
       foreach_block_reverse_safe(block, s.cfg) {
          BITSET_WORD flag_liveout = live_vars.block_data[block->num]
                                                   .flag_liveout[0];
          STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);
    
          foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
             if (!inst->predicate && inst->exec_size >= 8)
                flag_liveout &= ~inst->flags_written(s.devinfo);
    
             switch (inst->opcode) {
             case BRW_OPCODE_DO:
             case BRW_OPCODE_IF:
                /* Note that this doesn't handle BRW_OPCODE_HALT since only
                 * the first one in the program closes the region of divergent
                 * control flow due to any HALT instructions -- Instead this is
                 * handled with the halt_start check below.
                 */
                depth--;
                break;
    
             case BRW_OPCODE_WHILE:
             case BRW_OPCODE_ENDIF:
             case SHADER_OPCODE_HALT_TARGET:
                depth++;
                break;
    
             default:
                /* Note that the vast majority of NoMask SEND instructions in the
                 * program are harmless while executed in a block with all
                 * channels disabled, since any instructions with side effects we
                 * could hit here should be execution-masked.
                 *
                 * The main concern is NoMask SEND instructions where the message
                 * descriptor or header depends on data generated by live
                 * invocations of the shader (RESINFO and
                 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
                 * computed surface index seem to be the only examples right now
                 * where this could easily lead to GPU hangs).  Unfortunately we
                 * have no straightforward way to detect that currently, so just
                 * predicate any NoMask SEND instructions we find under control
                 * flow.
                 *
                 * If this proves to have a measurable performance impact it can
                 * be easily extended with a whitelist of messages we know we can
                 * safely omit the predication for.
                 */
                if (depth && inst->force_writemask_all &&
                    is_send(inst) && !inst->predicate &&
                    !inst->has_no_mask_send_params) {
                   /* We need to load the execution mask into the flag register by
                    * using a builder with channel group matching the whole shader
                    * (rather than the default which is derived from the original
                    * instruction), in order to avoid getting a right-shifted
                    * value.
                    */
                   const brw_builder ubld = brw_builder(&s, block, inst)
                                           .exec_all().group(s.dispatch_width, 0);
                   const brw_reg flag = retype(brw_flag_reg(0, 0),
                                              BRW_TYPE_UD);
    
                   /* Due to the lack of flag register allocation we need to save
                    * and restore the flag register if it's live.
                    */
                   const bool save_flag = flag_liveout &
                                          brw_fs_flag_mask(flag, s.dispatch_width / 8);
                   const brw_reg tmp = ubld.group(8, 0).vgrf(flag.type);
    
                   if (save_flag) {
                      ubld.group(8, 0).UNDEF(tmp);
                      ubld.group(1, 0).MOV(tmp, flag);
                   }
    
                   ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);
    
                   set_predicate(pred, inst);
                   inst->flag_subreg = 0;
                   inst->predicate_trivial = true;
    
                   if (save_flag)
                      ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);
    
                   progress = true;
                }
                break;
             }
    
             if (inst == halt_start)
                depth--;
    
             flag_liveout |= inst->flags_read(s.devinfo);
          }
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
    
       return progress;
    }
    
    /**
     * flags_read() and flags_written() return flag access with byte granularity,
     * but for Flag Register PRM lists "Access Granularity: Word", so we can assume
     * accessing any part of a word will clear its register dependency.
     */
    static unsigned
    bytes_bitmask_to_words(unsigned b)
    {
       unsigned first_byte_mask = b & 0x55555555;
       unsigned second_byte_mask = b & 0xaaaaaaaa;
       return first_byte_mask |
              (first_byte_mask << 1) |
              second_byte_mask |
              (second_byte_mask >> 1);
    }
    
    /**
     * WaClearArfDependenciesBeforeEot
     *
     * Flag register dependency not cleared after EOT, so we have to source them
     * before EOT. We can do this with simple `mov(1) nullUD, f{0,1}UD`
     *
     * To avoid emitting MOVs when it's not needed, check if each block  reads all
     * the flags it sets. We might falsely determine register as unread if it'll be
     * accessed inside the next blocks, but this still should be good enough.
     */
    bool
    brw_workaround_source_arf_before_eot(fs_visitor &s)
    {
       bool progress = false;
    
       if (s.devinfo->ver != 9)
          return false;
    
       unsigned flags_unread = 0;
    
       foreach_block(block, s.cfg) {
          unsigned flags_unread_in_block = 0;
    
          foreach_inst_in_block(fs_inst, inst, block) {
             /* Instruction can read and write to the same flag, so the order is important */
             flags_unread_in_block &= ~bytes_bitmask_to_words(inst->flags_read(s.devinfo));
             flags_unread_in_block |= bytes_bitmask_to_words(inst->flags_written(s.devinfo));
    
             /* HALT does not start its block even though it can leave a dependency */
             if (inst->opcode == BRW_OPCODE_HALT ||
                 inst->opcode == SHADER_OPCODE_HALT_TARGET) {
                flags_unread |= flags_unread_in_block;
                flags_unread_in_block = 0;
             }
          }
    
          flags_unread |= flags_unread_in_block;
    
          if ((flags_unread & 0x0f) && (flags_unread & 0xf0))
             break;
       }
    
       if (flags_unread) {
          int eot_count = 0;
    
          foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg)
          {
             if (!inst->eot)
                continue;
    
             /* Currently, we always emit only one EOT per program,
              * this WA should be updated if it ever changes.
              */
             assert(++eot_count == 1);
    
             const brw_builder ibld(&s, block, inst);
             const brw_builder ubld = ibld.exec_all().group(1, 0);
    
             if (flags_unread & 0x0f)
                ubld.MOV(ubld.null_reg_ud(), retype(brw_flag_reg(0, 0), BRW_TYPE_UD));
    
             if (flags_unread & 0xf0)
                ubld.MOV(ubld.null_reg_ud(), retype(brw_flag_reg(1, 0), BRW_TYPE_UD));
          }
    
          progress = true;
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
       }
    
       return progress;
    }