Edit

IABSD.fr/xenocara/lib/mesa/src/intel/compiler/brw_lower.cpp

Branch :

  • Show log

    Commit

  • Author : jsg
    Date : 2025-06-05 11:23:11
    Hash : 67d6f117
    Message : Import Mesa 25.0.7

  • lib/mesa/src/intel/compiler/brw_lower.cpp
  • /*
     * Copyright © 2010 Intel Corporation
     * SPDX-License-Identifier: MIT
     */
    
    #include "brw_fs.h"
    #include "brw_builder.h"
    
    using namespace brw;
    
    /**
     * Align16 3-source instructions cannot have scalar stride w/64-bit types.
     *
     * The Bspec says:
     *
     *    Replicate Control. This field is only present in three-source
     *    instructions, for each of the three source operands. It controls
     *    replication of the starting channel to all channels in the execution
     *    size. ChanSel does not apply when Replicate Control is set. This is
     *    applicable to 32b datatypes and 16b datatype. 64b datatypes cannot use
     *    the replicate control.
     *
     * In practice, this can only happen on Gfx9 with DF sources to MAD.  Since
     * the source is_scalar, this can be fixed by just making the stride=1. Also
     * clear is_scalar "just in case."
     */
    bool
    brw_lower_scalar_fp64_MAD(fs_visitor &s)
    {
       const intel_device_info *devinfo = s.devinfo;
       bool progress = false;
    
       if (devinfo->ver != 9)
          return false;
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
          if (inst->opcode == BRW_OPCODE_MAD &&
              inst->dst.type == BRW_TYPE_DF) {
             for (unsigned i = 0; i < 3; i++) {
                if (inst->src[i].is_scalar) {
                   inst->src[i].is_scalar = false;
                   inst->src[i].stride = 1;
                   progress = true;
                }
             }
          }
       }
    
       return progress;
    }
    
    bool
    brw_lower_load_payload(fs_visitor &s)
    {
       bool progress = false;
    
       foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
          if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
             continue;
    
          assert(inst->dst.file == VGRF);
          assert(inst->saturate == false);
          brw_reg dst = inst->dst;
    
          const brw_builder ibld(&s, block, inst);
          const brw_builder ubld = ibld.exec_all();
    
          for (uint8_t i = 0; i < inst->header_size;) {
             /* Number of header GRFs to initialize at once with a single MOV
              * instruction.
              */
             const unsigned n =
                (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
                 inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
                2 : 1;
    
             if (inst->src[i].file != BAD_FILE)
                ubld.group(8 * n, 0).MOV(retype(dst, BRW_TYPE_UD),
                                         retype(inst->src[i], BRW_TYPE_UD));
    
             dst = byte_offset(dst, n * REG_SIZE);
             i += n;
          }
    
          for (uint8_t i = inst->header_size; i < inst->sources; i++) {
             dst.type = inst->src[i].type;
             if (inst->src[i].file != BAD_FILE) {
                ibld.MOV(dst, inst->src[i]);
             }
             dst = offset(dst, ibld, 1);
          }
    
          inst->remove(block);
          progress = true;
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
    
       return progress;
    }
    
    /**
     * Lower CSEL with unsupported types to CMP+SEL.
     *
     * Or, for unsigned ==/!= comparisons, simply change the types.
     */
    bool
    brw_lower_csel(fs_visitor &s)
    {
       const intel_device_info *devinfo = s.devinfo;
       bool progress = false;
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
          if (inst->opcode != BRW_OPCODE_CSEL)
             continue;
    
          bool supported = false;
          enum brw_reg_type orig_type = inst->src[2].type;
          enum brw_reg_type new_type = orig_type;
    
          switch (orig_type) {
          case BRW_TYPE_F:
             /* Gfx9 CSEL can only do F */
             supported = true;
             break;
          case BRW_TYPE_HF:
          case BRW_TYPE_W:
          case BRW_TYPE_D:
             /* Gfx11+ CSEL can do HF, W, and D.  Note that we can't simply
              * retype integer ==/!= comparisons as float on earlier hardware
              * because it breaks for 0x8000000 and 0 (-0.0 == 0.0).
              */
             supported = devinfo->ver >= 11;
             break;
          case BRW_TYPE_UW:
          case BRW_TYPE_UD:
             /* CSEL doesn't support UW/UD but we can simply retype to use the
              * signed types when comparing with == or !=.
              */
             supported = devinfo->ver >= 11 &&
                         (inst->conditional_mod == BRW_CONDITIONAL_EQ ||
                          inst->conditional_mod == BRW_CONDITIONAL_NEQ);
    
             /* Bspec 47408, Gfx125+ CSEL does support the both signed and unsigned
              * integer types.
              */
             if (devinfo->verx10 < 125) {
                new_type = inst->src[2].type == BRW_TYPE_UD ?
                           BRW_TYPE_D : BRW_TYPE_W;
             }
             break;
          default:
             break;
          }
    
          if (!supported) {
             const brw_builder ibld(&s, block, inst);
    
             /* CSEL: dst = src2 <op> 0 ? src0 : src1 */
             brw_reg zero = brw_imm_reg(orig_type);
             ibld.CMP(retype(brw_null_reg(), orig_type),
                      inst->src[2], zero, inst->conditional_mod);
    
             inst->opcode = BRW_OPCODE_SEL;
             inst->predicate = BRW_PREDICATE_NORMAL;
             inst->conditional_mod = BRW_CONDITIONAL_NONE;
             inst->resize_sources(2);
             progress = true;
          } else if (new_type != orig_type) {
             inst->src[0].type = new_type;
             inst->src[1].type = new_type;
             inst->src[2].type = new_type;
             progress = true;
          }
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
    
       return progress;
    }
    
    bool
    brw_lower_sub_sat(fs_visitor &s)
    {
       bool progress = false;
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
          const brw_builder ibld(&s, block, inst);
    
          if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
              inst->opcode == SHADER_OPCODE_ISUB_SAT) {
             /* The fundamental problem is the hardware performs source negation
              * at the bit width of the source.  If the source is 0x80000000D, the
              * negation is 0x80000000D.  As a result, subtractSaturate(0,
              * 0x80000000) will produce 0x80000000 instead of 0x7fffffff.  There
              * are at least three ways to resolve this:
              *
              * 1. Use the accumulator for the negated source.  The accumulator is
              *    33 bits, so our source 0x80000000 is sign-extended to
              *    0x1800000000.  The negation of which is 0x080000000.  This
              *    doesn't help for 64-bit integers (which are already bigger than
              *    33 bits).  There are also only 8 accumulators, so SIMD16 or
              *    SIMD32 instructions would have to be split into multiple SIMD8
              *    instructions.
              *
              * 2. Use slightly different math.  For any n-bit value x, we know (x
              *    >> 1) != -(x >> 1).  We can use this fact to only do
              *    subtractions involving (x >> 1).  subtractSaturate(a, b) ==
              *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
              *
              * 3. For unsigned sources, it is sufficient to replace the
              *    subtractSaturate with (a > b) ? a - b : 0.
              *
              * It may also be possible to use the SUBB instruction.  This
              * implicitly writes the accumulator, so it could only be used in the
              * same situations as #1 above.  It is further limited by only
              * allowing UD sources.
              */
             if (inst->exec_size == 8 && inst->src[0].type != BRW_TYPE_Q &&
                 inst->src[0].type != BRW_TYPE_UQ) {
                brw_reg acc = retype(brw_acc_reg(inst->exec_size),
                                    inst->src[1].type);
    
                ibld.MOV(acc, inst->src[1]);
                fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
                add->saturate = true;
                add->src[0].negate = true;
             } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
                /* tmp = src1 >> 1;
                 * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
                 */
                fs_inst *add;
    
                brw_reg tmp = ibld.vgrf(inst->src[0].type);
                ibld.SHR(tmp, inst->src[1], brw_imm_d(1));
    
                brw_reg s1_sub_t = ibld.ADD(inst->src[1], negate(tmp));
                brw_reg sat_s0_sub_t = ibld.ADD(inst->src[0], negate(tmp), &add);
                add->saturate = true;
    
                add = ibld.ADD(inst->dst, sat_s0_sub_t, negate(s1_sub_t));
                add->saturate = true;
             } else {
                /* a > b ? a - b : 0 */
                ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                         BRW_CONDITIONAL_G);
    
                fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
                add->src[1].negate = !add->src[1].negate;
    
                ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
                   ->predicate = BRW_PREDICATE_NORMAL;
             }
    
             inst->remove(block);
             progress = true;
          }
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
    
       return progress;
    }
    
    /**
     * Transform barycentric vectors into the interleaved form expected by the PLN
     * instruction and returned by the Gfx7+ PI shared function.
     *
     * For channels 0-15 in SIMD16 mode they are expected to be laid out as
     * follows in the register file:
     *
     *    rN+0: X[0-7]
     *    rN+1: Y[0-7]
     *    rN+2: X[8-15]
     *    rN+3: Y[8-15]
     *
     * There is no need to handle SIMD32 here -- This is expected to be run after
     * SIMD lowering, since SIMD lowering relies on vectors having the standard
     * component layout.
     */
    bool
    brw_lower_barycentrics(fs_visitor &s)
    {
       const intel_device_info *devinfo = s.devinfo;
    
       if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
          return false;
    
       bool progress = false;
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
          if (inst->exec_size < 16)
             continue;
    
          const brw_builder ibld(&s, block, inst);
          const brw_builder ubld = ibld.exec_all().group(8, 0);
    
          switch (inst->opcode) {
          case BRW_OPCODE_PLN: {
             assert(inst->exec_size == 16);
             const brw_reg tmp = ibld.vgrf(inst->src[1].type, 2);
             brw_reg srcs[4];
    
             for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
                srcs[i] = horiz_offset(offset(inst->src[1], ibld, i % 2),
                                       8 * (i / 2));
    
             ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));
    
             inst->src[1] = tmp;
             progress = true;
             break;
          }
          case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
          case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
          case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
             assert(inst->exec_size == 16);
             const brw_reg tmp = ibld.vgrf(inst->dst.type, 2);
    
             for (unsigned i = 0; i < 2; i++) {
                for (unsigned g = 0; g < inst->exec_size / 8; g++) {
                   fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                      .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                        8 * g),
                                           offset(tmp, ubld, 2 * g + i));
                   mov->predicate = inst->predicate;
                   mov->predicate_inverse = inst->predicate_inverse;
                   mov->flag_subreg = inst->flag_subreg;
                }
             }
    
             inst->dst = tmp;
             progress = true;
             break;
          }
          default:
             break;
          }
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
    
       return progress;
    }
    
    /**
     * Lower a derivative instruction as the floating-point difference of two
     * swizzles of the source, specified as \p swz0 and \p swz1.
     */
    static bool
    lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst,
                     unsigned swz0, unsigned swz1)
    {
       const brw_builder ubld = brw_builder(&s, block, inst).exec_all();
       const brw_reg tmp0 = ubld.vgrf(inst->src[0].type);
       const brw_reg tmp1 = ubld.vgrf(inst->src[0].type);
    
       ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
       ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));
    
       inst->resize_sources(2);
       inst->src[0] = negate(tmp0);
       inst->src[1] = tmp1;
       inst->opcode = BRW_OPCODE_ADD;
    
       return true;
    }
    
    /**
     * Lower derivative instructions on platforms where codegen cannot implement
     * them efficiently (i.e. XeHP).
     */
    bool
    brw_lower_derivatives(fs_visitor &s)
    {
       bool progress = false;
    
       if (s.devinfo->verx10 < 125)
          return false;
    
       foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
          if (inst->opcode == FS_OPCODE_DDX_COARSE)
             progress |= lower_derivative(s, block, inst,
                                          BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);
    
          else if (inst->opcode == FS_OPCODE_DDX_FINE)
             progress |= lower_derivative(s, block, inst,
                                          BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);
    
          else if (inst->opcode == FS_OPCODE_DDY_COARSE)
             progress |= lower_derivative(s, block, inst,
                                          BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);
    
          else if (inst->opcode == FS_OPCODE_DDY_FINE)
             progress |= lower_derivative(s, block, inst,
                                          BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
    
       return progress;
    }
    
    bool
    brw_lower_find_live_channel(fs_visitor &s)
    {
       bool progress = false;
    
       bool packed_dispatch =
          brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                        s.prog_data);
       bool vmask =
          s.stage == MESA_SHADER_FRAGMENT &&
          brw_wm_prog_data(s.prog_data)->uses_vmask;
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
          if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
              inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL &&
              inst->opcode != SHADER_OPCODE_LOAD_LIVE_CHANNELS)
             continue;
    
          bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;
    
          /* Getting the first active channel index is easy on Gfx8: Just find
           * the first bit set in the execution mask.  The register exists on
           * HSW already but it reads back as all ones when the current
           * instruction has execution masking disabled, so it's kind of
           * useless there.
           */
    
          const brw_builder ibld(&s, block, inst);
          if (!inst->is_partial_write())
             ibld.emit_undef_for_dst(inst);
    
          const brw_builder ubld = brw_builder(&s, block, inst).exec_all().group(1, 0);
    
          brw_reg exec_mask = ubld.vgrf(BRW_TYPE_UD);
          ubld.UNDEF(exec_mask);
          ubld.emit(SHADER_OPCODE_READ_ARCH_REG, exec_mask,
                                                 retype(brw_mask_reg(0),
                                                        BRW_TYPE_UD));
    
          /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
           * so combine the execution and dispatch masks to obtain the true mask.
           *
           * If we're looking for the first live channel, and we have packed
           * dispatch, we can skip this step, as we know all dispatched channels
           * will appear at the front of the mask.
           */
          if (!(first && packed_dispatch)) {
             brw_reg mask = ubld.vgrf(BRW_TYPE_UD);
             ubld.UNDEF(mask);
             ubld.emit(SHADER_OPCODE_READ_ARCH_REG, mask,
                                                    retype(brw_sr0_reg(vmask ? 3 : 2),
                                                           BRW_TYPE_UD));
    
             /* Quarter control has the effect of magically shifting the value of
              * ce0 so you'll get the first/last active channel relative to the
              * specified quarter control as result.
              */
             if (inst->group > 0)
                ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));
    
             ubld.AND(mask, exec_mask, mask);
             exec_mask = mask;
          }
    
          switch (inst->opcode) {
          case SHADER_OPCODE_FIND_LIVE_CHANNEL:
             ubld.FBL(inst->dst, exec_mask);
             break;
    
          case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
             brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
             ubld.UNDEF(tmp);
             ubld.LZD(tmp, exec_mask);
             ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
             break;
          }
    
          case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
             ubld.MOV(inst->dst, exec_mask);
             break;
    
          default:
             unreachable("Impossible.");
          }
    
          inst->remove(block);
          progress = true;
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
    
       return progress;
    }
    
    /**
     * From the Skylake PRM Vol. 2a docs for sends:
     *
     *    "It is required that the second block of GRFs does not overlap with the
     *    first block."
     *
     * There are plenty of cases where we may accidentally violate this due to
     * having, for instance, both sources be the constant 0.  This little pass
     * just adds a new vgrf for the second payload and copies it over.
     */
    bool
    brw_lower_sends_overlapping_payload(fs_visitor &s)
    {
       bool progress = false;
    
       foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
          if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
              regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                              inst->src[3], inst->ex_mlen * REG_SIZE)) {
             const unsigned arg = inst->mlen < inst->ex_mlen ? 2 : 3;
             const unsigned len = MIN2(inst->mlen, inst->ex_mlen);
    
             brw_reg tmp = brw_vgrf(s.alloc.allocate(len),
                                   BRW_TYPE_UD);
    
             /* Sadly, we've lost all notion of channels and bit sizes at this
              * point.  Just WE_all it.
              */
             const brw_builder ibld = brw_builder(&s, block, inst).exec_all().group(16, 0);
             brw_reg copy_src = retype(inst->src[arg], BRW_TYPE_UD);
             brw_reg copy_dst = tmp;
             for (unsigned i = 0; i < len; i += 2) {
                if (len == i + 1) {
                   /* Only one register left; do SIMD8 */
                   ibld.group(8, 0).MOV(copy_dst, copy_src);
                } else {
                   ibld.MOV(copy_dst, copy_src);
                }
                copy_src = offset(copy_src, ibld, 1);
                copy_dst = offset(copy_dst, ibld, 1);
             }
             inst->src[arg] = tmp;
             progress = true;
          }
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
    
       return progress;
    }
    
    /**
     * Three source instruction must have a GRF destination register.
     * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
     */
    bool
    brw_lower_3src_null_dest(fs_visitor &s)
    {
       bool progress = false;
    
       foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
          if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
             inst->dst = brw_vgrf(s.alloc.allocate(s.dispatch_width / 8),
                                  inst->dst.type);
             progress = true;
          }
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                                DEPENDENCY_VARIABLES);
    
       return progress;
    }
    
    static bool
    unsupported_64bit_type(const intel_device_info *devinfo,
                           enum brw_reg_type type)
    {
       return (!devinfo->has_64bit_float && type == BRW_TYPE_DF) ||
              (!devinfo->has_64bit_int && (type == BRW_TYPE_UQ ||
                                           type == BRW_TYPE_Q));
    }
    
    /**
     * Perform lowering to legalize the IR for various ALU restrictions.
     *
     * For example:
     * - Splitting 64-bit MOV/SEL into 2x32-bit where needed
     */
    bool
    brw_lower_alu_restrictions(fs_visitor &s)
    {
       const intel_device_info *devinfo = s.devinfo;
       bool progress = false;
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
          switch (inst->opcode) {
          case BRW_OPCODE_MOV:
             if (unsupported_64bit_type(devinfo, inst->dst.type)) {
                assert(inst->dst.type == inst->src[0].type);
                assert(!inst->saturate);
                assert(!inst->src[0].abs);
                assert(!inst->src[0].negate);
                const brw_builder ibld(&s, block, inst);
    
                enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);
    
                if (!inst->is_partial_write())
                   ibld.emit_undef_for_dst(inst);
    
                ibld.MOV(subscript(inst->dst, type, 1),
                         subscript(inst->src[0], type, 1));
                ibld.MOV(subscript(inst->dst, type, 0),
                         subscript(inst->src[0], type, 0));
    
                inst->remove(block);
                progress = true;
             }
             break;
    
          case BRW_OPCODE_SEL:
             if (unsupported_64bit_type(devinfo, inst->dst.type)) {
                assert(inst->dst.type == inst->src[0].type);
                assert(!inst->saturate);
                assert(!inst->src[0].abs && !inst->src[0].negate);
                assert(!inst->src[1].abs && !inst->src[1].negate);
                assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
                const brw_builder ibld(&s, block, inst);
    
                enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);
    
                if (!inst->is_partial_write())
                   ibld.emit_undef_for_dst(inst);
    
                set_predicate(inst->predicate,
                              ibld.SEL(subscript(inst->dst, type, 0),
                                       subscript(inst->src[0], type, 0),
                                       subscript(inst->src[1], type, 0)));
                set_predicate(inst->predicate,
                              ibld.SEL(subscript(inst->dst, type, 1),
                                       subscript(inst->src[0], type, 1),
                                       subscript(inst->src[1], type, 1)));
    
                inst->remove(block);
                progress = true;
             }
             break;
    
          default:
             break;
          }
       }
    
       if (progress) {
          s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                                DEPENDENCY_INSTRUCTION_DETAIL);
       }
    
       return progress;
    }
    
    static void
    brw_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst *inst,
                                brw_reg *reg, bool compressed)
    {
       if (reg->file != VGRF)
          return;
    
       struct brw_reg new_reg;
    
       if (reg->stride == 0) {
          new_reg = brw_vec1_grf(reg->nr, 0);
       } else if (reg->stride > 4) {
          assert(reg != &inst->dst);
          assert(reg->stride * brw_type_size_bytes(reg->type) <= REG_SIZE);
          new_reg = brw_vecn_grf(1, reg->nr, 0);
          new_reg = stride(new_reg, reg->stride, 1, 0);
       } else {
          /* From the Haswell PRM:
           *
           *  "VertStride must be used to cross GRF register boundaries. This
           *   rule implies that elements within a 'Width' cannot cross GRF
           *   boundaries."
           *
           * The maximum width value that could satisfy this restriction is:
           */
          const unsigned reg_width =
             REG_SIZE / (reg->stride * brw_type_size_bytes(reg->type));
    
          /* Because the hardware can only split source regions at a whole
           * multiple of width during decompression (i.e. vertically), clamp
           * the value obtained above to the physical execution size of a
           * single decompressed chunk of the instruction:
           */
          const bool compressed = inst->dst.component_size(inst->exec_size) > REG_SIZE;
          const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                      inst->exec_size;
    
          /* XXX - The equation above is strictly speaking not correct on
           *       hardware that supports unbalanced GRF writes -- On Gfx9+
           *       each decompressed chunk of the instruction may have a
           *       different execution size when the number of components
           *       written to each destination GRF is not the same.
           */
    
          const unsigned max_hw_width = 16;
    
          const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
          new_reg = brw_vecn_grf(width, reg->nr, 0);
          new_reg = stride(new_reg, width * reg->stride, width, reg->stride);
       }
    
       new_reg = retype(new_reg, reg->type);
       new_reg = byte_offset(new_reg, reg->offset);
       new_reg.abs = reg->abs;
       new_reg.negate = reg->negate;
       new_reg.is_scalar = reg->is_scalar;
    
       *reg = new_reg;
    }
    
    void
    brw_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
    {
       assert(s.grf_used || !"Must be called after register allocation");
    
       foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
          /* If the instruction writes to more than one register, it needs to be
           * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
           * hardware figures out by itself what the right compression mode is,
           * but we still need to know whether the instruction is compressed to
           * set up the source register regions appropriately.
           *
           * XXX - This is wrong for instructions that write a single register but
           *       read more than one which should strictly speaking be treated as
           *       compressed.  For instructions that don't write any registers it
           *       relies on the destination being a null register of the correct
           *       type and regioning so the instruction is considered compressed
           *       or not accordingly.
           */
    
          const bool compressed =
               inst->dst.component_size(inst->exec_size) > REG_SIZE;
    
          brw_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->dst, compressed);
          for (int i = 0; i < inst->sources; i++) {
             brw_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->src[i], compressed);
          }
       }
    
       s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                             DEPENDENCY_VARIABLES);
    }
    
    static brw_reg
    brw_s0(enum brw_reg_type type, unsigned subnr)
    {
       return brw_make_reg(ARF,
                           BRW_ARF_SCALAR,
                           subnr,
                           0,
                           0,
                           type,
                           BRW_VERTICAL_STRIDE_0,
                           BRW_WIDTH_1,
                           BRW_HORIZONTAL_STRIDE_0,
                           BRW_SWIZZLE_XYZW,
                           WRITEMASK_XYZW);
    }
    
    static bool
    brw_lower_send_gather_inst(fs_visitor &s, bblock_t *block, fs_inst *inst)
    {
       const intel_device_info *devinfo = s.devinfo;
       assert(devinfo->ver >= 30);
    
       const unsigned unit = reg_unit(devinfo);
       assert(unit == 2);
    
       assert(inst->opcode == SHADER_OPCODE_SEND_GATHER);
       assert(inst->sources > 2);
       assert(inst->src[2].file == BAD_FILE);
    
       unsigned count = 0;
       uint8_t regs[16] = {};
    
       const unsigned num_payload_sources = inst->sources - 3;
       assert(num_payload_sources > 0);
    
       /* Limited by Src0.Length in the SEND instruction. */
       assert(num_payload_sources < 16);
    
       for (unsigned i = 3; i < inst->sources; i++) {
          assert(inst->src[i].file == FIXED_GRF);
          assert(inst->src[i].nr % reg_unit(devinfo) == 0);
    
          unsigned nr = phys_nr(devinfo, inst->src[i]);
          assert(nr <= UINT8_MAX);
          regs[count++] = nr;
       }
    
       /* Fill out ARF scalar register with the physical register numbers
        * and use SEND_GATHER.
        */
       brw_builder ubld = brw_builder(&s, block, inst).group(1, 0).exec_all();
       for (unsigned q = 0; q < DIV_ROUND_UP(count, 8); q++) {
          uint64_t v = 0;
          for (unsigned i = 0; i < 8; i++) {
             const uint64_t reg = regs[(q * 8) + i];
             v |= reg << (8 * i);
          }
          ubld.MOV(brw_s0(BRW_TYPE_UQ, q), brw_imm_uq(v));
       }
    
       inst->src[2] = brw_s0(BRW_TYPE_UD, 0);
       inst->mlen = count * unit;
    
       return true;
    }
    
    bool
    brw_lower_send_gather(fs_visitor &s)
    {
       assert(s.devinfo->ver >= 30);
       assert(s.grf_used || !"Must be called after register allocation");
    
       bool progress = false;
    
       foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
          if (inst->opcode == SHADER_OPCODE_SEND_GATHER)
             progress |= brw_lower_send_gather_inst(s, block, inst);
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                                DEPENDENCY_VARIABLES);
    
       return progress;
    }
    
    bool
    brw_lower_load_subgroup_invocation(fs_visitor &s)
    {
       bool progress = false;
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
          if (inst->opcode != SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION)
             continue;
    
          const brw_builder abld =
             brw_builder(&s, block, inst).annotate("SubgroupInvocation");
          const brw_builder ubld8 = abld.group(8, 0).exec_all();
          ubld8.UNDEF(inst->dst);
    
          if (inst->exec_size == 8) {
             assert(inst->dst.type == BRW_TYPE_UD);
             brw_reg uw = retype(inst->dst, BRW_TYPE_UW);
             ubld8.MOV(uw, brw_imm_v(0x76543210));
             ubld8.MOV(inst->dst, uw);
          } else {
             assert(inst->dst.type == BRW_TYPE_UW);
             ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
             ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
             if (inst->exec_size > 16) {
                const brw_builder ubld16 = abld.group(16, 0).exec_all();
                ubld16.ADD(byte_offset(inst->dst, 32), inst->dst, brw_imm_uw(16u));
             }
          }
    
          inst->remove(block);
          progress = true;
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
    
       return progress;
    }
    
    bool
    brw_lower_indirect_mov(fs_visitor &s)
    {
       bool progress = false;
    
       if (s.devinfo->ver < 20)
          return progress;
    
       foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
          if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT) {
             if (brw_type_size_bytes(inst->src[0].type) > 1 &&
                 brw_type_size_bytes(inst->dst.type) > 1) {
                continue;
             }
    
             assert(brw_type_size_bytes(inst->src[0].type) ==
                    brw_type_size_bytes(inst->dst.type));
    
             const brw_builder ibld(&s, block, inst);
    
             /* Extract unaligned part */
             uint16_t extra_offset = inst->src[0].offset & 0x1;
             brw_reg offset = ibld.ADD(inst->src[1], brw_imm_uw(extra_offset));
    
             /* Check if offset is odd or even so that we can choose either high or
              * low byte from the result.
              */
             brw_reg is_odd = ibld.AND(offset, brw_imm_ud(1));
    
             /* Make sure offset is word (2-bytes) aligned */
             offset = ibld.AND(offset, brw_imm_uw(~1));
    
             /* Indirect addressing(vx1 and vxh) not supported with UB/B datatype for
              * Src0, so change data type for src0 and dst to UW.
              */
             brw_reg dst = ibld.vgrf(BRW_TYPE_UW);
    
             /* Substract unaligned offset from src0 offset since we already
              * accounted unaligned part in the indirect byte offset.
              */
             brw_reg start = retype(inst->src[0], BRW_TYPE_UW);
             start.offset &= ~extra_offset;
    
             /* Adjust length to account extra offset. */
             assert(inst->src[2].file == IMM);
             brw_reg length = brw_imm_ud(inst->src[2].ud + extra_offset);
    
             ibld.emit(SHADER_OPCODE_MOV_INDIRECT, dst, start, offset, length);
    
             /* Select high byte if offset is odd otherwise select low byte. */
             brw_reg lo = ibld.AND(dst, brw_imm_uw(0xff));
             brw_reg hi = ibld.SHR(dst, brw_imm_uw(8));
             brw_reg result = ibld.vgrf(BRW_TYPE_UW);
             ibld.CSEL(result, hi, lo, is_odd, BRW_CONDITIONAL_NZ);
    
             /* Extra MOV needed here to convert back to the corresponding B type */
             ibld.MOV(inst->dst, result);
    
             inst->remove(block);
             progress = true;
          }
       }
    
       if (progress)
          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
    
       return progress;
    }