Edit

IABSD.fr/xenocara/lib/mesa/src/amd/compiler/aco_reduce_assign.cpp

Branch :

  • Show log

    Commit

  • Author : jsg
    Date : 2025-06-05 11:23:11
    Hash : 67d6f117
    Message : Import Mesa 25.0.7

  • lib/mesa/src/amd/compiler/aco_reduce_assign.cpp
  • /*
     * Copyright © 2018 Valve Corporation
     * Copyright © 2018 Google
     *
     * SPDX-License-Identifier: MIT
     */
    
    #include "aco_builder.h"
    #include "aco_ir.h"
    
    #include <vector>
    
    /*
     * Insert p_start_linear_vgpr/p_end_linear_vgpr instructions right before RA to
     * correctly allocate temporaries for reductions that have to disrespect EXEC by
     * executing in WWM.
     */
    
    namespace aco {
    
    /*
     * Hands a linear VGPR temporary to every pseudo reduction, p_interp_gfx11 and
     * p_bpermute_permlane instruction in the program.
     *
     * The p_start_linear_vgpr defining a temporary is emitted either directly in
     * front of the instruction (when its block is the most recent top-level block)
     * or right after the p_logical_end of the most recent top-level block.
     * Temporaries started the second way are shared by following instructions and
     * are closed by a p_end_linear_vgpr placed behind the phis of the next
     * top-level block.
     *
     * program: the program whose blocks are scanned and modified in place.
     */
    void
    setup_reduce_temp(Program* program)
    {
       using InstrIter = std::vector<aco_ptr<Instruction>>::iterator;

       /* First pass: remember which blocks contain an instruction that needs a
        * temporary, and the largest size any of them asks for. */
       unsigned tmp_size = 0;
       std::vector<bool> block_needs_tmp(program->blocks.size());
       for (Block& blk : program->blocks) {
          for (aco_ptr<Instruction>& candidate : blk.instructions) {
             unsigned wanted;
             if (candidate->opcode == aco_opcode::p_interp_gfx11 ||
                 candidate->opcode == aco_opcode::p_bpermute_permlane)
                wanted = 1;
             else if (candidate->format == Format::PSEUDO_REDUCTION)
                wanted = candidate->operands[0].size();
             else
                continue;

             tmp_size = MAX2(tmp_size, wanted);
             block_needs_tmp[blk.index] = true;
          }
       }

       /* Nothing in the program needs a temporary. */
       if (tmp_size == 0)
          return;
       assert(tmp_size == 1 || tmp_size == 2);

       /* Both temporaries use the same linear VGPR class; the ids are placeholders
        * until a real temporary is allocated below. */
       const RegClass linear_rc = RegClass(RegType::vgpr, tmp_size).as_linear();
       Temp reduceTmp(0, linear_rc);
       Temp vtmp(0, linear_rc);

       unsigned last_top_level_block_idx = 0;

       /* Allocates a fresh temporary with tmp's register class, stores it in tmp and
        * emits the p_start_linear_vgpr that defines it.
        *
        * If blk is the last top-level block, the instruction goes right before pos
        * and pos is re-pointed at the instruction it referred to; false is returned.
        * Otherwise the instruction goes after the p_logical_end of the last top-level
        * block and true is returned, so the caller can record that block as the
        * place where the temporary lives. */
       auto start_linear_vgpr = [&](Temp& tmp, Block& blk, InstrIter& pos) -> bool {
          tmp = program->allocateTmp(tmp.regClass());
          aco_ptr<Instruction> start{
             create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
          start->definitions[0] = Definition(tmp);

          if (last_top_level_block_idx == blk.index) {
             pos = blk.instructions.insert(pos, std::move(start));
             ++pos;
             return false;
          }

          assert(last_top_level_block_idx < blk.index);
          std::vector<aco_ptr<Instruction>>& host =
             program->blocks[last_top_level_block_idx].instructions;
          /* Search backwards; base() of the match is the slot just past p_logical_end. */
          auto logical_end = std::find_if(host.rbegin(), host.rend(),
                                          [](const auto& candidate) {
                                             return candidate->opcode == aco_opcode::p_logical_end;
                                          });
          host.insert(logical_end.base(), std::move(start));
          return true;
       };

       /* Index of the top-level block currently holding a shared start instruction
        * for each temporary, or -1 if there is none. */
       int inserted_at = -1;
       int vtmp_inserted_at = -1;

       for (Block& block : program->blocks) {
          if (block.kind & block_kind_top_level) {
             last_top_level_block_idx = block.index;

             /* TODO: this could be improved in this case:
              *    start_linear_vgpr
              *    if (...) {
              *       use_linear_vgpr
              *    }
              *    end_linear_vgpr
              * Here, the linear vgpr is used before any phi copies, so this isn't necessary.
              */
             if (inserted_at >= 0) {
                const bool end_vtmp = vtmp_inserted_at >= 0;
                aco_ptr<Instruction> end{create_instruction(aco_opcode::p_end_linear_vgpr,
                                                            Format::PSEUDO, end_vtmp ? 2 : 1, 0)};
                end->operands[0] = Operand(reduceTmp);
                if (end_vtmp)
                   end->operands[1] = Operand(vtmp);

                /* The end marker has to come after the phis of this block. */
                InstrIter after_phis = block.instructions.begin();
                for (;;) {
                   const aco_opcode opc = (*after_phis)->opcode;
                   if (opc != aco_opcode::p_linear_phi && opc != aco_opcode::p_phi)
                      break;
                   ++after_phis;
                }
                block.instructions.insert(after_phis, std::move(end));

                inserted_at = -1;
                vtmp_inserted_at = -1;
             }
          }

          if (!block_needs_tmp[block.index])
             continue;

          for (InstrIter it = block.instructions.begin(); it != block.instructions.end(); ++it) {
             Instruction* instr = it->get();
             const bool wants_tmp = instr->format == Format::PSEUDO_REDUCTION ||
                                    instr->opcode == aco_opcode::p_interp_gfx11 ||
                                    instr->opcode == aco_opcode::p_bpermute_permlane;
             if (!wants_tmp)
                continue;

             /* inserted_at is only updated when the start went into an earlier
              * top-level block; a start emitted inside the current block is
              * intentionally not recorded, so later blocks insert their own. */
             if (static_cast<int>(last_top_level_block_idx) != inserted_at &&
                 start_linear_vgpr(reduceTmp, block, it))
                inserted_at = last_top_level_block_idx;

             /* Decide whether this instruction additionally needs the vector temporary. */
             bool need_vtmp = false;
             if (instr->isReduction()) {
                const ReduceOp op = instr->reduction().reduce_op;
                const unsigned cluster_size = instr->reduction().cluster_size;

                const bool op_needs_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
                                           op == fmin64 || op == fmax64 || op == umin64 ||
                                           op == umax64 || op == imin64 || op == imax64 ||
                                           op == imul64;
                const bool op_needs_vtmp_gfx10 = op == imul8 || op == imax8 || op == imin8 ||
                                                 op == umin8 || op == imul16 || op == imax16 ||
                                                 op == imin16 || op == umin16 || op == iadd64;
                const bool gfx10_plus = program->gfx_level >= GFX10;

                need_vtmp = op_needs_vtmp ||
                            (gfx10_plus && (cluster_size == 64 || op_needs_vtmp_gfx10)) ||
                            program->gfx_level <= GFX7 || cluster_size == 32;
             }

             /* Same placement rules as above, applied to the vector temporary. */
             if (need_vtmp && static_cast<int>(last_top_level_block_idx) != vtmp_inserted_at &&
                 start_linear_vgpr(vtmp, block, it))
                vtmp_inserted_at = last_top_level_block_idx;

             /* Finally wire the temporaries into the instruction's operands. */
             if (!instr->isReduction()) {
                assert(instr->opcode == aco_opcode::p_interp_gfx11 ||
                       instr->opcode == aco_opcode::p_bpermute_permlane);
                instr->operands[0] = Operand(reduceTmp);
                continue;
             }

             instr->operands[1] = Operand(reduceTmp);
             if (need_vtmp)
                instr->operands[2] = Operand(vtmp);
          }
       }
    }
    
    }; // namespace aco