Edit

IABSD.fr/xenocara/lib/mesa/src/intel/compiler/brw_lower_regioning.cpp

Branch :

  • Show log

    Commit

  • Author : jsg
    Date : 2025-06-05 11:23:11
    Hash : 67d6f117
    Message : Import Mesa 25.0.7

  • lib/mesa/src/intel/compiler/brw_lower_regioning.cpp
  • /*
     * Copyright © 2018 Intel Corporation
     *
     * Permission is hereby granted, free of charge, to any person obtaining a
     * copy of this software and associated documentation files (the "Software"),
     * to deal in the Software without restriction, including without limitation
     * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     * and/or sell copies of the Software, and to permit persons to whom the
     * Software is furnished to do so, subject to the following conditions:
     *
     * The above copyright notice and this permission notice (including the next
     * paragraph) shall be included in all copies or substantial portions of the
     * Software.
     *
     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     * IN THE SOFTWARE.
     */
    
    #include "brw_fs.h"
    #include "brw_cfg.h"
    #include "brw_builder.h"
    
    using namespace brw;
    
    namespace {
       /* From the SKL PRM Vol 2a, "Move":
        *
        * "A mov with the same source and destination type, no source modifier,
        *  and no saturation is a raw move. A packed byte destination region (B
        *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
        *  using raw move."
        */
       bool
       is_byte_raw_mov(const fs_inst *inst)
       {
          return brw_type_size_bytes(inst->dst.type) == 1 &&
                 inst->opcode == BRW_OPCODE_MOV &&
                 inst->src[0].type == inst->dst.type &&
                 !inst->saturate &&
                 !inst->src[0].negate &&
                 !inst->src[0].abs;
       }
    
       /*
        * Return an acceptable byte stride for the specified source of an
        * instruction affected by a regioning restriction.
        */
       unsigned
       required_src_byte_stride(const intel_device_info *devinfo, const fs_inst *inst,
                                unsigned i)
       {
          if (has_dst_aligned_region_restriction(devinfo, inst)) {
             return MAX2(brw_type_size_bytes(inst->dst.type),
                         byte_stride(inst->dst));
    
          } else if (has_subdword_integer_region_restriction(devinfo, inst,
                                                             &inst->src[i], 1)) {
             /* Use a stride of 32bits if possible, since that will guarantee that
              * the copy emitted to lower this region won't be affected by the
              * sub-dword integer region restrictions.  This may not be possible
              * for the second source of an instruction if we're required to use
              * packed data due to Wa_16012383669.
              */
             return (i == 1 ? brw_type_size_bytes(inst->src[i].type) : 4);
    
          } else {
             return byte_stride(inst->src[i]);
          }
       }
    
       /*
        * Return an acceptable byte sub-register offset for the specified source
        * of an instruction affected by a regioning restriction.
        */
       unsigned
       required_src_byte_offset(const intel_device_info *devinfo, const fs_inst *inst,
                                unsigned i)
       {
          if (has_dst_aligned_region_restriction(devinfo, inst)) {
             return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
    
          } else if (has_subdword_integer_region_restriction(devinfo, inst,
                                                             &inst->src[i], 1)) {
             const unsigned dst_byte_stride =
                MAX2(byte_stride(inst->dst), brw_type_size_bytes(inst->dst.type));
             const unsigned src_byte_stride = required_src_byte_stride(devinfo, inst, i);
             const unsigned dst_byte_offset =
                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
             const unsigned src_byte_offset =
                reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
    
             if (src_byte_stride > brw_type_size_bytes(inst->src[i].type)) {
                assert(src_byte_stride >= dst_byte_stride);
                /* The source is affected by the Xe2+ sub-dword integer regioning
                 * restrictions.  For the case of source 0 BSpec#56640 specifies a
                 * number of equations relating the source and destination
                 * sub-register numbers in all cases where a source stride of
                 * 32bits is allowed.  These equations have the form:
                 *
                 *   k * Dst.SubReg % m = Src.SubReg / l
                 *
                 * For some constants k, l and m different for each combination of
                 * source and destination types and strides.  The expression in
                 * the return statement below computes a valid source offset by
                 * inverting the equation like:
                 *
                 *   Src.SubReg = l * k * (Dst.SubReg % m)
                 *
                 * and then scaling by the element type sizes in order to get an
                 * expression in terms of byte offsets instead of sub-register
                 * numbers.  It can be easily verified that in all cases listed on
                 * the hardware spec where the source has a well-defined uniform
                 * stride the product l*k is equal to the ratio between the source
                 * and destination strides.
                 */
                const unsigned m = 64 * dst_byte_stride / src_byte_stride;
                return dst_byte_offset % m * src_byte_stride / dst_byte_stride;
             } else {
                assert(src_byte_stride == brw_type_size_bytes(inst->src[i].type));
                /* A packed source is required, likely due to the stricter
                 * requirements of the second source region.  The source being
                 * packed guarantees that the region of the original instruction
                 * will be valid, but the copy may break the regioning
                 * restrictions.  Do our best to try to prevent that from
                 * happening by making sure the offset of the temporary matches
                 * the original source based on the same equation above -- However
                 * that may not be sufficient if the source had a stride larger
                 * than 32bits, lowering the copy recursively may be necessary.
                 */
                return src_byte_offset * src_byte_stride / byte_stride(inst->src[i]);
             }
    
          } else {
             return reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
          }
       }
    
       /*
        * Return an acceptable byte stride for the destination of an instruction
        * that requires it to have some particular alignment.
        */
       unsigned
       required_dst_byte_stride(const fs_inst *inst)
       {
          if (inst->dst.is_accumulator()) {
             /* If the destination is an accumulator, insist that we leave the
              * stride alone.  We cannot "fix" accumulator destinations by writing
              * to a temporary and emitting a MOV into the original destination.
              * For multiply instructions (our one use of the accumulator), the
              * MUL writes the full 66 bits of the accumulator whereas the MOV we
              * would emit only writes 33 bits and leaves the top 33 bits
              * undefined.
              *
              * It's safe to just require the original stride here because the
              * lowering pass will detect the mismatch in has_invalid_src_region
              * and fix the sources of the multiply instead of the destination.
              */
             return inst->dst.hstride * brw_type_size_bytes(inst->dst.type);
          } else if (brw_type_size_bytes(inst->dst.type) < get_exec_type_size(inst) &&
              !is_byte_raw_mov(inst)) {
             return get_exec_type_size(inst);
          } else {
             /* Calculate the maximum byte stride and the minimum/maximum type
              * size across all source and destination operands we are required to
              * lower.
              */
             unsigned max_stride = inst->dst.stride * brw_type_size_bytes(inst->dst.type);
             unsigned min_size = brw_type_size_bytes(inst->dst.type);
             unsigned max_size = brw_type_size_bytes(inst->dst.type);
    
             for (unsigned i = 0; i < inst->sources; i++) {
                if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
                   const unsigned size = brw_type_size_bytes(inst->src[i].type);
                   max_stride = MAX2(max_stride, inst->src[i].stride * size);
                   min_size = MIN2(min_size, size);
                   max_size = MAX2(max_size, size);
                }
             }
    
             /* All operands involved in lowering need to fit in the calculated
              * stride.
              */
             assert(max_size <= 4 * min_size);
    
             /* Attempt to use the largest byte stride among all present operands,
              * but never exceed a stride of 4 since that would lead to illegal
              * destination regions during lowering.
              */
             return MIN2(max_stride, 4 * min_size);
          }
       }
    
       /*
        * Return an acceptable byte sub-register offset for the destination of an
        * instruction that requires it to be aligned to the sub-register offset of
        * the sources.
        */
       unsigned
       required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
       {
          for (unsigned i = 0; i < inst->sources; i++) {
             if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
                if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
                    reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
                   return 0;
          }
    
          return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
       }
    
       /*
        * Return the closest legal execution type for an instruction on
        * the specified platform.
        */
       brw_reg_type
       required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
       {
          const brw_reg_type t = get_exec_type(inst);
          const bool has_64bit = brw_type_is_float(t) ?
             devinfo->has_64bit_float : devinfo->has_64bit_int;
    
          switch (inst->opcode) {
          case SHADER_OPCODE_SHUFFLE:
             /* IVB has an issue (which we found empirically) where it reads
              * two address register components per channel for indirectly
              * addressed 64-bit sources.
              *
              * From the Cherryview PRM Vol 7. "Register Region Restrictions":
              *
              *    "When source or destination datatype is 64b or operation is
              *    integer DWord multiply, indirect addressing must not be
              *    used."
              *
              * Work around both of the above and handle platforms that
              * don't support 64-bit types at all.
              */
             if ((!devinfo->has_64bit_int ||
                  intel_device_info_is_9lp(devinfo) ||
                  devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
                return BRW_TYPE_UD;
             else if (has_dst_aligned_region_restriction(devinfo, inst))
                return brw_int_type(brw_type_size_bytes(t), false);
             else
                return t;
    
          case SHADER_OPCODE_SEL_EXEC:
             if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
                 brw_type_size_bytes(t) > 4)
                return BRW_TYPE_UD;
             else
                return t;
    
          case SHADER_OPCODE_QUAD_SWIZZLE:
             if (has_dst_aligned_region_restriction(devinfo, inst))
                return brw_int_type(brw_type_size_bytes(t), false);
             else
                return t;
    
          case SHADER_OPCODE_CLUSTER_BROADCAST:
             /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
              *
              *    "When source or destination datatype is 64b or operation is
              *    integer DWord multiply, indirect addressing must not be
              *    used."
              *
              * For MTL (verx10 == 125), float64 is supported, but int64 is not.
              * Therefore we need to lower cluster broadcast using 32-bit int ops.
              *
              * For gfx12.5+ platforms that support int64, the register regions
              * used by cluster broadcast aren't supported by the 64-bit pipeline.
              *
              * Work around the above and handle platforms that don't
              * support 64-bit types at all.
              */
             if ((!has_64bit || devinfo->verx10 >= 125 ||
                  intel_device_info_is_9lp(devinfo) ||
                  devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
                return BRW_TYPE_UD;
             else
                return brw_int_type(brw_type_size_bytes(t), false);
    
          default:
             return t;
          }
       }
    
       /*
        * Return whether the instruction has an unsupported channel bit layout
        * specified for the i-th source region.
        */
       bool
       has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
                              unsigned i)
       {
          /* Wa_22016140776:
           *
           *    Scalar broadcast on HF math (packed or unpacked) must not be used.
           *    Compiler must use a mov instruction to expand the scalar value to
           *    a vector before using in a HF (packed or unpacked) math operation.
           */
          if (inst->is_math() && intel_needs_workaround(devinfo, 22016140776) &&
              is_uniform(inst->src[i]) && inst->src[i].type == BRW_TYPE_HF) {
             return true;
          }
    
          if (is_send(inst) || inst->is_control_source(i) ||
              inst->opcode == BRW_OPCODE_DPAS) {
             return false;
          }
    
          const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
          const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
    
          return (has_dst_aligned_region_restriction(devinfo, inst) &&
                  !is_uniform(inst->src[i]) &&
                  (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
                   src_byte_offset != dst_byte_offset)) ||
                 (has_subdword_integer_region_restriction(devinfo, inst) &&
                  (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
                   src_byte_offset != required_src_byte_offset(devinfo, inst, i)));
       }
    
       /*
        * Return whether the instruction has an unsupported channel bit layout
        * specified for the destination region.
        */
       bool
       has_invalid_dst_region(const intel_device_info *devinfo,
                              const fs_inst *inst)
       {
          if (is_send(inst)) {
             return false;
          } else {
             const brw_reg_type exec_type = get_exec_type(inst);
             const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
             const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
                brw_type_size_bytes(inst->dst.type) < brw_type_size_bytes(exec_type);
    
             return (has_dst_aligned_region_restriction(devinfo, inst) &&
                     (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
                      required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
                    (is_narrowing_conversion &&
                     required_dst_byte_stride(inst) != byte_stride(inst->dst));
          }
       }
    
       /**
        * Return a non-zero value if the execution type of the instruction is
        * unsupported.  The destination and sources matching the returned mask
        * will be bit-cast to an integer type of appropriate size, lowering any
        * source or destination modifiers into separate MOV instructions.
        */
       unsigned
       has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
       {
          if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
             switch (inst->opcode) {
             case SHADER_OPCODE_SHUFFLE:
             case SHADER_OPCODE_QUAD_SWIZZLE:
             case SHADER_OPCODE_CLUSTER_BROADCAST:
             case SHADER_OPCODE_BROADCAST:
             case SHADER_OPCODE_MOV_INDIRECT:
                return 0x1;
    
             case SHADER_OPCODE_SEL_EXEC:
                return 0x3;
    
             default:
                unreachable("Unknown invalid execution type source mask.");
             }
          } else {
             return 0;
          }
       }
    
       /**
        * Return whether the instruction has an unsupported type conversion
        * that must be handled by expanding the source operand.
        */
       bool
       has_invalid_src_conversion(const intel_device_info *devinfo,
                                  const fs_inst *inst)
       {
          /* Scalar byte to float conversion is not allowed on DG2+ */
          return devinfo->verx10 >= 125 &&
                 inst->opcode == BRW_OPCODE_MOV &&
                 brw_type_is_float(inst->dst.type) &&
                 brw_type_size_bits(inst->src[0].type) == 8 &&
                 is_uniform(inst->src[0]);
       }
    
       /*
        * Return whether the instruction has unsupported source modifiers
        * specified for the i-th source region.
        */
       bool
       has_invalid_src_modifiers(const intel_device_info *devinfo,
                                 const fs_inst *inst, unsigned i)
       {
          return (!inst->can_do_source_mods(devinfo) &&
                  (inst->src[i].negate || inst->src[i].abs)) ||
                 ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
                  (inst->src[i].negate || inst->src[i].abs ||
                   inst->src[i].type != get_exec_type(inst))) ||
                 has_invalid_src_conversion(devinfo, inst);
       }
    
       /*
        * Return whether the instruction has an unsupported type conversion
        * specified for the destination.
        */
       bool
       has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
       {
          switch (inst->opcode) {
          case BRW_OPCODE_MOV:
             return false;
          case BRW_OPCODE_SEL:
             return inst->dst.type != get_exec_type(inst);
          default:
             /* FIXME: We assume the opcodes not explicitly mentioned before just
              * work fine with arbitrary conversions, unless they need to be
              * bit-cast.
              */
             return has_invalid_exec_type(devinfo, inst) &&
                    inst->dst.type != get_exec_type(inst);
          }
       }
    
       /**
        * Return whether the instruction has unsupported destination modifiers.
        */
       bool
       has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
       {
          return (has_invalid_exec_type(devinfo, inst) &&
                  (inst->saturate || inst->conditional_mod)) ||
                 has_invalid_conversion(devinfo, inst);
       }
    
       /**
        * Return whether the instruction has non-standard semantics for the
        * conditional mod which don't cause the flag register to be updated with
        * the comparison result.
        */
       bool
       has_inconsistent_cmod(const fs_inst *inst)
       {
          return inst->opcode == BRW_OPCODE_SEL ||
                 inst->opcode == BRW_OPCODE_CSEL ||
                 inst->opcode == BRW_OPCODE_IF ||
                 inst->opcode == BRW_OPCODE_WHILE;
       }
    
       bool
       lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
    }
    
    /**
     * Remove any modifiers from the \p i-th source region of the instruction,
     * including negate, abs and any implicit type conversion to the execution
     * type.  Instead any source modifiers will be implemented as a separate
     * MOV instruction prior to the original instruction.
     */
    bool
    brw_lower_src_modifiers(fs_visitor &s, bblock_t *block, fs_inst *inst, unsigned i)
    {
       assert(inst->components_read(i) == 1);
       assert(s.devinfo->has_integer_dword_mul ||
              inst->opcode != BRW_OPCODE_MUL ||
              brw_type_is_float(get_exec_type(inst)) ||
              MIN2(brw_type_size_bytes(inst->src[0].type), brw_type_size_bytes(inst->src[1].type)) >= 4 ||
              brw_type_size_bytes(inst->src[i].type) == get_exec_type_size(inst));
    
       const brw_builder ibld(&s, block, inst);
       const brw_reg tmp = ibld.vgrf(get_exec_type(inst));
    
       lower_instruction(&s, block, ibld.MOV(tmp, inst->src[i]));
       inst->src[i] = tmp;
    
       return true;
    }
    
    namespace {
       /**
        * Remove any modifiers from the destination region of the instruction,
        * including saturate, conditional mod and any implicit type conversion
        * from the execution type.  Instead any destination modifiers will be
        * implemented as a separate MOV instruction after the original
        * instruction.
        */
       bool
       lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
       {
          const brw_builder ibld(v, block, inst);
          const brw_reg_type type = get_exec_type(inst);
          /* Not strictly necessary, but if possible use a temporary with the same
           * channel alignment as the current destination in order to avoid
           * violating the restrictions enforced later on by lower_src_region()
           * and lower_dst_region(), which would introduce additional copy
           * instructions into the program unnecessarily.
           */
          const unsigned stride =
             brw_type_size_bytes(inst->dst.type) * inst->dst.stride <= brw_type_size_bytes(type) ? 1 :
             brw_type_size_bytes(inst->dst.type) * inst->dst.stride / brw_type_size_bytes(type);
          brw_reg tmp = ibld.vgrf(type, stride);
          ibld.UNDEF(tmp);
          tmp = horiz_stride(tmp, stride);
    
          /* Emit a MOV taking care of all the destination modifiers. */
          fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
          mov->saturate = inst->saturate;
          if (!has_inconsistent_cmod(inst))
             mov->conditional_mod = inst->conditional_mod;
          if (inst->opcode != BRW_OPCODE_SEL) {
             mov->predicate = inst->predicate;
             mov->predicate_inverse = inst->predicate_inverse;
          }
          mov->flag_subreg = inst->flag_subreg;
          lower_instruction(v, block, mov);
    
          /* Point the original instruction at the temporary, and clean up any
           * destination modifiers.
           */
          assert(inst->size_written == inst->dst.component_size(inst->exec_size));
          inst->dst = tmp;
          inst->size_written = inst->dst.component_size(inst->exec_size);
          inst->saturate = false;
          if (!has_inconsistent_cmod(inst))
             inst->conditional_mod = BRW_CONDITIONAL_NONE;
    
          assert(!inst->flags_written(v->devinfo) || !mov->predicate);
          return true;
       }
    
       /**
        * Remove any non-trivial shuffling of data from the \p i-th source region
        * of the instruction.  Instead implement the region as a series of integer
        * copies into a temporary with the same channel layout as the destination.
        */
       bool
       lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
       {
          assert(inst->components_read(i) == 1);
          const intel_device_info *devinfo = v->devinfo;
          const brw_builder ibld(v, block, inst);
          const unsigned stride = required_src_byte_stride(devinfo, inst, i) /
                                  brw_type_size_bytes(inst->src[i].type);
          assert(stride > 0);
          /* Calculate the size of the temporary allocation manually instead of
           * relying on the builder, since we may have to add some amount of
           * padding mandated by the hardware for Xe2+ instructions with sub-dword
           * integer regions.
           */
          const unsigned size =
             DIV_ROUND_UP(required_src_byte_offset(v->devinfo, inst, i) +
                          inst->exec_size * stride *
                          brw_type_size_bytes(inst->src[i].type),
                          reg_unit(devinfo) * REG_SIZE) * reg_unit(devinfo);
          brw_reg tmp = brw_vgrf(v->alloc.allocate(size), inst->src[i].type);
          ibld.UNDEF(tmp);
          tmp = byte_offset(horiz_stride(tmp, stride),
                            required_src_byte_offset(devinfo, inst, i));
    
          /* Emit a series of 32-bit integer copies with any source modifiers
           * cleaned up (because their semantics are dependent on the type).
           */
          const brw_reg_type raw_type = brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4),
                                                     false);
          const unsigned n = brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);
          brw_reg raw_src = inst->src[i];
          raw_src.negate = false;
          raw_src.abs = false;
    
          for (unsigned j = 0; j < n; j++) {
    	fs_inst *jnst = ibld.MOV(subscript(tmp, raw_type, j),
    				 subscript(raw_src, raw_type, j));
    	if (has_subdword_integer_region_restriction(devinfo, jnst)) {
               /* The copy isn't guaranteed to comply with all subdword integer
                * regioning restrictions in some cases.  Lower it recursively.
                */
    	   lower_instruction(v, block, jnst);
            }
          }
    
          /* Point the original instruction at the temporary, making sure to keep
           * any source modifiers in the instruction.
           */
          brw_reg lower_src = tmp;
          lower_src.negate = inst->src[i].negate;
          lower_src.abs = inst->src[i].abs;
          inst->src[i] = lower_src;
    
          return true;
       }
    
       /**
        * Remove any non-trivial shuffling of data from the destination region of
        * the instruction.  Instead implement the region as a series of integer
        * copies from a temporary with a channel layout compatible with the
        * sources.
        */
       bool
       lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
       {
          /* We cannot replace the result of an integer multiply which writes the
           * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
           * value whereas the MOV will act on only 32 or 33 bits of the
           * accumulator.
           */
          assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
                 brw_type_is_float(inst->dst.type));
    
          const brw_builder ibld(v, block, inst);
          const unsigned stride = required_dst_byte_stride(inst) /
                                  brw_type_size_bytes(inst->dst.type);
          assert(stride > 0);
          brw_reg tmp = ibld.vgrf(inst->dst.type, stride);
          ibld.UNDEF(tmp);
          tmp = horiz_stride(tmp, stride);
    
          if (!inst->dst.is_null()) {
             /* Emit a series of 32-bit integer copies from the temporary into the
              * original destination.
              */
             const brw_reg_type raw_type =
                brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4), false);
    
             const unsigned n =
                brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);
    
             if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
                /* Note that in general we cannot simply predicate the copies on
                 * the same flag register as the original instruction, since it
                 * may have been overwritten by the instruction itself.  Instead
                 * initialize the temporary with the previous contents of the
                 * destination register.
                 */
                for (unsigned j = 0; j < n; j++)
                   ibld.MOV(subscript(tmp, raw_type, j),
                            subscript(inst->dst, raw_type, j));
             }
    
             for (unsigned j = 0; j < n; j++) {
                fs_inst *jnst = ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                                               subscript(tmp, raw_type, j));
                if (has_subdword_integer_region_restriction(v->devinfo, jnst)) {
                   /* The copy isn't guaranteed to comply with all subdword integer
                    * regioning restrictions in some cases.  Lower it recursively.
                    */
                   lower_instruction(v, block, jnst);
                }
             }
    
             /* If the destination was an accumulator, after lowering it will be a
              * GRF. Clear writes_accumulator for the instruction.
              */
             if (inst->dst.is_accumulator())
                inst->writes_accumulator = false;
          }
    
          /* Point the original instruction at the temporary, making sure to keep
           * any destination modifiers in the instruction.
           */
          assert(inst->size_written == inst->dst.component_size(inst->exec_size));
          inst->dst = tmp;
          inst->size_written = inst->dst.component_size(inst->exec_size);
    
          return true;
       }
    
       /**
        * Change sources and destination of the instruction to an
        * appropriate legal type, splitting the instruction into multiple
        * ones of smaller execution type if necessary, to be used in cases
        * where the execution type of an instruction is unsupported.
        */
       bool
       lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
       {
          assert(inst->dst.type == get_exec_type(inst));
          const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
          const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
          const unsigned n = get_exec_type_size(inst) / brw_type_size_bytes(raw_type);
          const brw_builder ibld(v, block, inst);
    
          brw_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
          ibld.UNDEF(tmp);
          tmp = horiz_stride(tmp, inst->dst.stride);
    
          for (unsigned j = 0; j < n; j++) {
             fs_inst sub_inst = *inst;
    
             for (unsigned i = 0; i < inst->sources; i++) {
                if (mask & (1u << i)) {
                   assert(inst->src[i].type == inst->dst.type);
                   sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
                }
             }
    
             sub_inst.dst = subscript(tmp, raw_type, j);
    
             assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
             assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
             ibld.emit(sub_inst);
    
             fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                                     subscript(tmp, raw_type, j));
             if (inst->opcode != BRW_OPCODE_SEL) {
                mov->predicate = inst->predicate;
                mov->predicate_inverse = inst->predicate_inverse;
             }
             lower_instruction(v, block, mov);
          }
    
          inst->remove(block);
    
          return true;
       }
    
       /**
        * Fast-path for very specific kinds of invalid regions.
        *
        * Gfx12.5+ does not allow moves of B or UB sources to floating-point
        * destinations. This restriction can be resolved more efficiently than by
        * the general lowering in lower_src_modifiers or lower_src_region.
        */
       void
       lower_src_conversion(fs_visitor *v, bblock_t *block, fs_inst *inst)
       {
          const intel_device_info *devinfo = v->devinfo;
          const brw_builder ibld = brw_builder(v, block, inst).scalar_group();
    
          /* We only handle scalar conversions from small types for now. */
          assert(is_uniform(inst->src[0]));
    
          brw_reg tmp = ibld.vgrf(brw_type_with_size(inst->src[0].type, 32));
          fs_inst *mov = ibld.MOV(tmp, inst->src[0]);
    
          inst->src[0] = component(tmp, 0);
    
          /* Assert that neither the added MOV nor the original instruction will need
           * any additional lowering.
           */
          assert(!has_invalid_src_region(devinfo, mov, 0));
          assert(!has_invalid_src_modifiers(devinfo, mov, 0));
          assert(!has_invalid_dst_region(devinfo, mov));
    
          assert(!has_invalid_src_region(devinfo, inst, 0));
          assert(!has_invalid_src_modifiers(devinfo, inst, 0));
       }
    
       /**
        * Legalize the source and destination regioning controls of the specified
        * instruction.
        */
       bool
       lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
       {
          const intel_device_info *devinfo = v->devinfo;
          bool progress = false;
    
          /* BROADCAST is special. It's destination region is a bit of a lie, and
           * it gets lower in brw_eu_emit. For the purposes of region
           * restrictions, let's assume that the final code emission will do the
           * right thing. Doing a bunch of shuffling here is only going to make a
           * mess of things.
           */
          if (inst->opcode == SHADER_OPCODE_BROADCAST)
             return false;
    
          if (has_invalid_dst_modifiers(devinfo, inst))
             progress |= lower_dst_modifiers(v, block, inst);
    
          if (has_invalid_dst_region(devinfo, inst))
             progress |= lower_dst_region(v, block, inst);
    
          if (has_invalid_src_conversion(devinfo, inst)) {
             lower_src_conversion(v, block, inst);
             progress = true;
          }
    
          for (unsigned i = 0; i < inst->sources; i++) {
             if (has_invalid_src_modifiers(devinfo, inst, i))
                progress |= brw_lower_src_modifiers(*v, block, inst, i);
    
             if (has_invalid_src_region(devinfo, inst, i))
                progress |= lower_src_region(v, block, inst, i);
          }
    
          if (has_invalid_exec_type(devinfo, inst))
             progress |= lower_exec_type(v, block, inst);
    
          return progress;
       }
    }
    
    bool
    brw_lower_regioning(fs_visitor &s)
    {
       bool progress = false;
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg)
          progress |= lower_instruction(&s, block, inst);
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
    
       return progress;
    }