Edit

IABSD.fr/xenocara/lib/mesa/src/intel/compiler/brw_compile_mesh.cpp

Branch :

  • Show log

    Commit

  • Author : jsg
    Date : 2025-06-05 11:23:11
    Hash : 67d6f117
    Message : Import Mesa 25.0.7

  • lib/mesa/src/intel/compiler/brw_compile_mesh.cpp
  • /*
     * Copyright © 2021 Intel Corporation
     *
     * Permission is hereby granted, free of charge, to any person obtaining a
     * copy of this software and associated documentation files (the "Software"),
     * to deal in the Software without restriction, including without limitation
     * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     * and/or sell copies of the Software, and to permit persons to whom the
     * Software is furnished to do so, subject to the following conditions:
     *
     * The above copyright notice and this permission notice (including the next
     * paragraph) shall be included in all copies or substantial portions of the
     * Software.
     *
     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     * IN THE SOFTWARE.
     */
    
    #include <list>
    #include <vector>
    #include "brw_compiler.h"
    #include "brw_fs.h"
    #include "brw_builder.h"
    #include "brw_generator.h"
    #include "brw_nir.h"
    #include "brw_private.h"
    #include "compiler/nir/nir_builder.h"
    #include "dev/intel_debug.h"
    
    #include <memory>
    
    using namespace brw;
    
    static bool
    brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
                                       UNUSED const void *data)
    {
       if (instr->type != nir_instr_type_intrinsic)
          return false;
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
       return intrin->intrinsic == nir_intrinsic_load_uniform;
    }
    
    static nir_def *
    brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
                                     void *data)
    {
       assert(instr->type == nir_instr_type_intrinsic);
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
       assert(intrin->intrinsic == nir_intrinsic_load_uniform);
    
       /* Use the first few bytes of InlineData as push constants. */
       if (nir_src_is_const(intrin->src[0])) {
          int offset =
             BRW_TASK_MESH_PUSH_CONSTANTS_START_DW * 4 +
             nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
          int range = intrin->def.num_components * intrin->def.bit_size / 8;
          if ((offset + range) <= (int)(BRW_TASK_MESH_INLINE_DATA_SIZE_DW * 4)) {
             return nir_load_inline_data_intel(b,
                                               intrin->def.num_components,
                                               intrin->def.bit_size,
                                               .base = offset);
          }
       }
    
       return brw_nir_load_global_const(b, intrin,
                                        nir_load_inline_data_intel(b, 1, 64, 0), 0);
    }
    
    static bool
    brw_nir_lower_load_uniforms(nir_shader *nir,
                                const struct intel_device_info *devinfo)
    {
       return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
                                            brw_nir_lower_load_uniforms_impl,
                                            (void *)devinfo);
    }
    
    static inline int
    type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
    {
       return glsl_count_dword_slots(type, bindless);
    }
    
    /* TODO(mesh): Make this a common function. */
    static void
    shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
    {
       assert(glsl_type_is_vector_or_scalar(type));
    
       uint32_t comp_size = glsl_type_is_boolean(type)
          ? 4 : glsl_get_bit_size(type) / 8;
       unsigned length = glsl_get_vector_elements(type);
       *size = comp_size * length,
       *align = comp_size * (length == 3 ? 4 : length);
    }
    
    static bool
    brw_nir_lower_launch_mesh_workgroups_instr(nir_builder *b,
                                               nir_intrinsic_instr *intrin,
                                               void *data)
    {
       if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
          return false;
    
       b->cursor = nir_before_instr(&intrin->instr);
    
       nir_def *local_invocation_index = nir_load_local_invocation_index(b);
    
       /* Make sure that the mesh workgroup size is taken from the first invocation
        * (nir_intrinsic_launch_mesh_workgroups requirement)
        */
       nir_def *cmp = nir_ieq_imm(b, local_invocation_index, 0);
       nir_if *if_stmt = nir_push_if(b, cmp);
       {
          /* TUE header contains 4 words:
           *
           * - Word 0 for Task Count.
           *
           * - Words 1-3 used for "Dispatch Dimensions" feature, to allow mapping a
           *   3D dispatch into the 1D dispatch supported by HW.
           */
          nir_def *x = nir_channel(b, intrin->src[0].ssa, 0);
          nir_def *y = nir_channel(b, intrin->src[0].ssa, 1);
          nir_def *z = nir_channel(b, intrin->src[0].ssa, 2);
          nir_def *task_count = nir_imul(b, x, nir_imul(b, y, z));
          nir_def *tue_header = nir_vec4(b, task_count, x, y, z);
          nir_store_task_payload(b, tue_header, nir_imm_int(b, 0));
       }
       nir_pop_if(b, if_stmt);
    
       nir_instr_remove(&intrin->instr);
    
       return true;
    }
    
    static bool
    brw_nir_lower_launch_mesh_workgroups(nir_shader *nir)
    {
       return nir_shader_intrinsics_pass(nir,
                                           brw_nir_lower_launch_mesh_workgroups_instr,
                                           nir_metadata_none,
                                           NULL);
    }
    
    static void
    brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map)
    {
       memset(map, 0, sizeof(*map));
    
       NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
                type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
    
       /* From bspec: "It is suggested that SW reserve the 16 bytes following the
        * TUE Header, and therefore start the SW-defined data structure at 32B
        * alignment.  This allows the TUE Header to always be written as 32 bytes
        * with 32B alignment, the most optimal write performance case."
        */
       map->per_task_data_start_dw = 8;
    
       /* Lowering to explicit types will start offsets from task_payload_size, so
        * set it to start after the header.
        */
       nir->info.task_payload_size = map->per_task_data_start_dw * 4;
       NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
                nir_var_mem_task_payload, shared_type_info);
       NIR_PASS(_, nir, nir_lower_explicit_io,
                nir_var_mem_task_payload, nir_address_format_32bit_offset);
    
       map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
    }
    
    static void
    brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
    {
       fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
    }
    
    static bool
    brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
                                              nir_intrinsic_instr *intrin,
                                              void *data)
    {
       switch (intrin->intrinsic) {
       case nir_intrinsic_store_task_payload:
       case nir_intrinsic_load_task_payload: {
          nir_src *offset_src = nir_get_io_offset_src(intrin);
    
          if (nir_src_is_const(*offset_src))
             assert(nir_src_as_uint(*offset_src) % 4 == 0);
    
          b->cursor = nir_before_instr(&intrin->instr);
    
          /* Regular I/O uses dwords while explicit I/O used for task payload uses
           * bytes.  Normalize it to dwords.
           *
           * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
           */
    
          nir_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
          nir_src_rewrite(offset_src, offset);
    
          unsigned base = nir_intrinsic_base(intrin);
          assert(base % 4 == 0);
          nir_intrinsic_set_base(intrin, base / 4);
    
          return true;
       }
    
       default:
          return false;
       }
    }
    
    static bool
    brw_nir_adjust_task_payload_offsets(nir_shader *nir)
    {
       return nir_shader_intrinsics_pass(nir,
                                           brw_nir_adjust_task_payload_offsets_instr,
                                           nir_metadata_control_flow,
                                           NULL);
    }
    
    void
    brw_nir_adjust_payload(nir_shader *shader)
    {
       /* Adjustment of task payload offsets must be performed *after* last pass
        * which interprets them as bytes, because it changes their unit.
        */
       bool adjusted = false;
       NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets);
       if (adjusted) /* clean up the mess created by offset adjustments */
          NIR_PASS(_, shader, nir_opt_constant_folding);
    }
    
    static bool
    brw_nir_align_launch_mesh_workgroups_instr(nir_builder *b,
                                               nir_intrinsic_instr *intrin,
                                               void *data)
    {
       if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
          return false;
    
       /* nir_lower_task_shader uses "range" as task payload size. */
       unsigned range = nir_intrinsic_range(intrin);
       /* This will avoid special case in nir_lower_task_shader dealing with
        * not vec4-aligned payload when payload_in_shared workaround is enabled.
        */
       nir_intrinsic_set_range(intrin, ALIGN(range, 16));
    
       return true;
    }
    
    static bool
    brw_nir_align_launch_mesh_workgroups(nir_shader *nir)
    {
       return nir_shader_intrinsics_pass(nir,
                                           brw_nir_align_launch_mesh_workgroups_instr,
                                           nir_metadata_control_flow,
                                           NULL);
    }
    
    static bool
    lower_set_vtx_and_prim_to_temp_write(nir_builder *b,
                                         nir_intrinsic_instr *intrin,
                                         void *data)
    {
       if (intrin->intrinsic != nir_intrinsic_set_vertex_and_primitive_count)
          return false;
    
       /* Detect some cases of invalid primitive count. They might lead to URB
        * memory corruption, where workgroups overwrite each other output memory.
        */
       if (nir_src_is_const(intrin->src[1]) &&
           nir_src_as_uint(intrin->src[1]) > b->shader->info.mesh.max_primitives_out)
          unreachable("number of primitives bigger than max specified");
    
       b->cursor = nir_instr_remove(&intrin->instr);
    
       nir_variable *temporary_primitive_count = (nir_variable *)data;
       nir_store_var(b, temporary_primitive_count, intrin->src[1].ssa, 0x1);
    
       return true;
    }
    
    static bool
    brw_nir_lower_mesh_primitive_count(nir_shader *nir)
    {
       nir_function_impl *impl = nir_shader_get_entrypoint(nir);
    
       nir_variable *temporary_primitive_count =
          nir_local_variable_create(impl,
                                    glsl_uint_type(),
                                    "__temp_primitive_count");
    
       nir_shader_intrinsics_pass(nir,
                                  lower_set_vtx_and_prim_to_temp_write,
                                  nir_metadata_control_flow,
                                  temporary_primitive_count);
    
       nir_builder _b = nir_builder_at(nir_before_impl(impl)), *b = &_b;
    
       nir_store_var(b, temporary_primitive_count, nir_imm_int(b, 0), 0x1);
    
       b->cursor = nir_after_impl(impl);
    
       /* Have a single lane write the primitive count */
       nir_def *local_invocation_index = nir_load_local_invocation_index(b);
       nir_push_if(b, nir_ieq_imm(b, local_invocation_index, 0));
       {
          nir_variable *final_primitive_count =
             nir_create_variable_with_location(nir, nir_var_shader_out,
                                               VARYING_SLOT_PRIMITIVE_COUNT,
                                               glsl_uint_type());
          final_primitive_count->name = ralloc_strdup(final_primitive_count,
                                                      "gl_PrimitiveCountNV");
          final_primitive_count->data.interpolation = INTERP_MODE_NONE;
    
          nir_store_var(b, final_primitive_count,
                        nir_load_var(b, temporary_primitive_count), 0x1);
       }
       nir_pop_if(b, NULL);
    
       nir_metadata_preserve(impl, nir_metadata_none);
    
       nir->info.outputs_written |= VARYING_BIT_PRIMITIVE_COUNT;
    
       return true;
    }
    
    static void
    brw_emit_urb_fence(fs_visitor &s)
    {
       const brw_builder bld1 = brw_builder(&s).at_end().exec_all().group(1, 0);
       brw_reg dst = bld1.vgrf(BRW_TYPE_UD);
       fs_inst *fence = bld1.emit(SHADER_OPCODE_MEMORY_FENCE, dst,
                                  brw_vec8_grf(0, 0),
                                  brw_imm_ud(true),
                                  brw_imm_ud(0));
       fence->sfid = BRW_SFID_URB;
       /* The logical thing here would likely be a THREADGROUP fence but that's
        * still failing some tests like in dEQP-VK.mesh_shader.ext.query.*
        *
        * Gfx12.5 has a comment about this on BSpec 53533 :
        *
        *    "If fence scope is Local or Threadgroup, HW ignores the flush type
        *     and operates as if it was set to None (no flush)"
        *
        * Software workaround from HSD-22014129519 indicates that a GPU fence
        * resolves the issue.
        */
       fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_GPU,
                                        LSC_FLUSH_TYPE_NONE, true);
    
       bld1.emit(FS_OPCODE_SCHEDULING_FENCE, bld1.null_reg_ud(), &dst, 1);
    }
    
    static bool
    run_task_mesh(fs_visitor &s, bool allow_spilling)
    {
       assert(s.stage == MESA_SHADER_TASK ||
              s.stage == MESA_SHADER_MESH);
    
       s.payload_ = new task_mesh_thread_payload(s);
    
       nir_to_brw(&s);
    
       if (s.failed)
          return false;
    
       brw_emit_urb_fence(s);
    
       s.emit_cs_terminate();
    
       brw_calculate_cfg(s);
    
       brw_optimize(s);
    
       s.assign_curb_setup();
    
       brw_lower_3src_null_dest(s);
       brw_workaround_emit_dummy_mov_instruction(s);
    
       brw_allocate_registers(s, allow_spilling);
    
       brw_workaround_source_arf_before_eot(s);
    
       return !s.failed;
    }
    
    const unsigned *
    brw_compile_task(const struct brw_compiler *compiler,
                     struct brw_compile_task_params *params)
    {
       const struct intel_device_info *devinfo = compiler->devinfo;
       struct nir_shader *nir = params->base.nir;
       const struct brw_task_prog_key *key = params->key;
       struct brw_task_prog_data *prog_data = params->prog_data;
       const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TASK);
    
       brw_nir_lower_tue_outputs(nir, &prog_data->map);
    
       NIR_PASS(_, nir, brw_nir_align_launch_mesh_workgroups);
    
       nir_lower_task_shader_options lower_ts_opt = {
          .payload_to_shared_for_atomics = true,
          .payload_to_shared_for_small_types = true,
          /* The actual payload data starts after the TUE header and padding,
           * so skip those when copying.
           */
          .payload_offset_in_bytes = prog_data->map.per_task_data_start_dw * 4,
       };
       NIR_PASS(_, nir, nir_lower_task_shader, lower_ts_opt);
    
       NIR_PASS(_, nir, brw_nir_lower_launch_mesh_workgroups);
    
       prog_data->base.base.stage = MESA_SHADER_TASK;
       prog_data->base.base.total_shared = nir->info.shared_size;
       prog_data->base.base.total_scratch = 0;
    
       prog_data->base.local_size[0] = nir->info.workgroup_size[0];
       prog_data->base.local_size[1] = nir->info.workgroup_size[1];
       prog_data->base.local_size[2] = nir->info.workgroup_size[2];
    
       prog_data->uses_drawid =
          BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
    
       NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
       prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
    
       brw_simd_selection_state simd_state{
          .devinfo = compiler->devinfo,
          .prog_data = &prog_data->base,
          .required_width = brw_required_dispatch_width(&nir->info),
       };
    
       std::unique_ptr<fs_visitor> v[3];
    
       for (unsigned i = 0; i < 3; i++) {
          const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;
    
          if (!brw_simd_should_compile(simd_state, simd))
             continue;
    
          const unsigned dispatch_width = 8 << simd;
    
          nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
          brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
    
          NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
    
          brw_postprocess_nir(shader, compiler, debug_enabled,
                              key->base.robust_flags);
    
          v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
                                                 &key->base,
                                                 &prog_data->base.base,
                                                 shader, dispatch_width,
                                                 params->base.stats != NULL,
                                                 debug_enabled);
    
          if (prog_data->base.prog_mask) {
             unsigned first = ffs(prog_data->base.prog_mask) - 1;
             v[simd]->import_uniforms(v[first].get());
          }
    
          const bool allow_spilling = simd == 0 ||
             (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1));
          if (run_task_mesh(*v[simd], allow_spilling)) {
             brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
    
             if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers)
                break;
          } else {
             simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
          }
       }
    
       int selected_simd = brw_simd_select(simd_state);
       if (selected_simd < 0) {
          params->base.error_str =
             ralloc_asprintf(params->base.mem_ctx,
                             "Can't compile shader: "
                             "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
                             simd_state.error[0], simd_state.error[1],
                             simd_state.error[2]);
          return NULL;
       }
    
       fs_visitor *selected = v[selected_simd].get();
       prog_data->base.prog_mask = 1 << selected_simd;
       prog_data->base.base.grf_used = MAX2(prog_data->base.base.grf_used,
                                            selected->grf_used);
    
       if (unlikely(debug_enabled)) {
          fprintf(stderr, "Task Output ");
          brw_print_tue_map(stderr, &prog_data->map);
       }
    
       brw_generator g(compiler, &params->base, &prog_data->base.base,
                      MESA_SHADER_TASK);
       if (unlikely(debug_enabled)) {
          g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                         "%s task shader %s",
                                         nir->info.label ? nir->info.label
                                                         : "unnamed",
                                         nir->info.name));
       }
    
       g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
                       selected->performance_analysis.require(), params->base.stats);
       g.add_const_data(nir->constant_data, nir->constant_data_size);
       return g.get_assembly();
    }
    
    static void
    brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
    {
       if (!map)
          return;
    
       nir->info.task_payload_size = map->per_task_data_start_dw * 4;
    
       bool progress = false;
    
       NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
                nir_var_mem_task_payload, shared_type_info);
    
       if (progress) {
          /* The types for Task Output and Mesh Input should match, so their sizes
           * should also match.
           */
          assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
       } else {
          /* Mesh doesn't read any input, to make it clearer set the
           * task_payload_size to zero instead of keeping an incomplete size that
           * just includes the header.
           */
          nir->info.task_payload_size = 0;
       }
    
       NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload,
                nir_address_format_32bit_offset);
    }
    
    /* Attribute types. Flat attributes have to be a separate class because
     * flat and interpolated attributes can't share the same vec4 slot
     * (see 3DSTATE_SBE.ConstantInterpolationEnable).
     */
    enum {
       PRIM, /* per primitive */
       VERT, /* per vertex interpolated */
       VERT_FLAT, /* per vertex flat */
    };
    
    struct attr_desc {
       int location;
       const struct glsl_type *type;
       unsigned dwords;
       unsigned slots;
    };
    
    struct attr_type_info {
       /* order of attributes, negative values are holes */
       std::list<struct attr_desc> *order;
    
       /* attributes after which there's hole of size equal to array index */
       std::list<int> holes[5];
    };
    
    static void
    brw_mue_assign_position(const struct attr_desc *attr,
                            struct brw_mue_map *map,
                            unsigned start_dw)
    {
       bool is_array = glsl_type_is_array(attr->type);
       int location = attr->location;
       unsigned remaining = attr->dwords;
    
       for (unsigned slot = 0; slot < attr->slots; ++slot) {
          map->start_dw[location + slot] = start_dw;
    
          unsigned sz;
    
          if (is_array) {
             assert(attr->dwords % attr->slots == 0);
             sz = attr->dwords / attr->slots;
          } else {
             sz = MIN2(remaining, 4);
          }
    
          map->len_dw[location + slot] = sz;
          start_dw += sz;
          remaining -= sz;
       }
    }
    
    static nir_variable *
    brw_nir_find_complete_variable_with_location(nir_shader *shader,
                                                 nir_variable_mode mode,
                                                 int location)
    {
       nir_variable *best_var = NULL;
       unsigned last_size = 0;
    
       nir_foreach_variable_with_modes(var, shader, mode) {
          if (var->data.location != location)
             continue;
    
          unsigned new_size = glsl_count_dword_slots(var->type, false);
          if (new_size > last_size) {
             best_var = var;
             last_size = new_size;
          }
       }
    
       return best_var;
    }
    
    static unsigned
    brw_sum_size(const std::list<struct attr_desc> &orders)
    {
       unsigned sz = 0;
       for (auto it = orders.cbegin(); it != orders.cend(); ++it)
          sz += (*it).dwords;
       return sz;
    }
    
    /* Finds order of outputs which require minimum size, without splitting
     * of URB read/write messages (which operate on vec4-aligned memory).
     */
    static void
    brw_compute_mue_layout(const struct brw_compiler *compiler,
                           std::list<struct attr_desc> *orders,
                           uint64_t outputs_written,
                           struct nir_shader *nir,
                           bool *pack_prim_data_into_header,
                           bool *pack_vert_data_into_header)
    {
       const struct shader_info *info = &nir->info;
    
       struct attr_type_info data[3];
    
       if ((compiler->mesh.mue_header_packing & 1) == 0)
          *pack_prim_data_into_header = false;
       if ((compiler->mesh.mue_header_packing & 2) == 0)
          *pack_vert_data_into_header = false;
    
       for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
          data[i].order = &orders[i];
    
       /* If packing into header is enabled, add a hole of size 4 and add
        * a virtual location to keep the algorithm happy (it expects holes
        * to be preceded by some location). We'll remove those virtual
        * locations at the end.
        */
       const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
       assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);
    
       struct attr_desc d;
       d.location = virtual_header_location;
       d.type = NULL;
       d.dwords = 0;
       d.slots = 0;
    
       struct attr_desc h;
       h.location = -1;
       h.type = NULL;
       h.dwords = 4;
       h.slots = 0;
    
       if (*pack_prim_data_into_header) {
          orders[PRIM].push_back(d);
          orders[PRIM].push_back(h);
          data[PRIM].holes[4].push_back(virtual_header_location);
       }
    
       if (*pack_vert_data_into_header) {
          orders[VERT].push_back(d);
          orders[VERT].push_back(h);
          data[VERT].holes[4].push_back(virtual_header_location);
       }
    
       u_foreach_bit64(location, outputs_written) {
          if ((BITFIELD64_BIT(location) & outputs_written) == 0)
             continue;
    
          /* At this point there are both complete and split variables as
           * outputs. We need the complete variable to compute the required
           * size.
           */
          nir_variable *var =
                brw_nir_find_complete_variable_with_location(nir,
                                                             nir_var_shader_out,
                                                             location);
    
          d.location = location;
          d.type     = brw_nir_get_var_type(nir, var);
          d.dwords   = glsl_count_dword_slots(d.type, false);
          d.slots    = glsl_count_attribute_slots(d.type, false);
    
          struct attr_type_info *type_data;
    
          if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
             type_data = &data[PRIM];
          else if (var->data.interpolation == INTERP_MODE_FLAT)
             type_data = &data[VERT_FLAT];
          else
             type_data = &data[VERT];
    
          std::list<struct attr_desc> *order = type_data->order;
          std::list<int> *holes = type_data->holes;
    
          outputs_written &= ~BITFIELD64_RANGE(location, d.slots);
    
          /* special case to use hole of size 4 */
          if (d.dwords == 4 && !holes[4].empty()) {
             holes[4].pop_back();
    
             assert(order->front().location == virtual_header_location);
             order->pop_front();
    
             assert(order->front().location == -1);
             assert(order->front().dwords == 4);
             order->front() = d;
    
             continue;
          }
    
          int mod = d.dwords % 4;
          if (mod == 0) {
             order->push_back(d);
             continue;
          }
    
          h.location = -1;
          h.type = NULL;
          h.dwords = 4 - mod;
          h.slots = 0;
    
          if (!compiler->mesh.mue_compaction) {
             order->push_back(d);
             order->push_back(h);
             continue;
          }
    
          if (d.dwords > 4) {
             order->push_back(d);
             order->push_back(h);
             holes[h.dwords].push_back(location);
             continue;
          }
    
          assert(d.dwords < 4);
    
          unsigned found = 0;
          /* try to find the smallest hole big enough to hold this attribute */
          for (unsigned sz = d.dwords; sz <= 4; sz++){
             if (!holes[sz].empty()) {
                found = sz;
                break;
             }
          }
    
          /* append at the end if not found */
          if (found == 0) {
             order->push_back(d);
             order->push_back(h);
             holes[h.dwords].push_back(location);
    
             continue;
          }
    
          assert(found <= 4);
          assert(!holes[found].empty());
          int after_loc = holes[found].back();
          holes[found].pop_back();
    
          bool inserted_back = false;
    
          for (auto it = order->begin(); it != order->end(); ++it) {
             if ((*it).location != after_loc)
                continue;
    
             ++it;
             /* must be a hole */
             assert((*it).location < 0);
             /* and it must be big enough */
             assert(d.dwords <= (*it).dwords);
    
             if (d.dwords == (*it).dwords) {
                /* exact size, just replace */
                *it = d;
             } else {
                /* inexact size, shrink hole */
                (*it).dwords -= d.dwords;
                /* and insert new attribute before it */
                order->insert(it, d);
    
                /* Insert shrunk hole in a spot so that the order of attributes
                 * is preserved.
                 */
                std::list<int> &hole_list = holes[(*it).dwords];
                std::list<int>::iterator insert_before = hole_list.end();
    
                for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
                   if ((*it2) >= (int)location) {
                      insert_before = it2;
                      break;
                   }
                }
    
                hole_list.insert(insert_before, location);
             }
    
             inserted_back = true;
             break;
          }
    
          assert(inserted_back);
       }
    
       if (*pack_prim_data_into_header) {
          if (orders[PRIM].front().location == virtual_header_location)
             orders[PRIM].pop_front();
    
          if (!data[PRIM].holes[4].empty()) {
             *pack_prim_data_into_header = false;
    
             assert(orders[PRIM].front().location == -1);
             assert(orders[PRIM].front().dwords == 4);
             orders[PRIM].pop_front();
          }
    
          if (*pack_prim_data_into_header) {
             unsigned sz = brw_sum_size(orders[PRIM]);
    
             if (sz % 8 == 0 || sz % 8 > 4)
                *pack_prim_data_into_header = false;
          }
       }
    
       if (*pack_vert_data_into_header) {
          if (orders[VERT].front().location == virtual_header_location)
             orders[VERT].pop_front();
    
          if (!data[VERT].holes[4].empty()) {
             *pack_vert_data_into_header = false;
    
             assert(orders[VERT].front().location == -1);
             assert(orders[VERT].front().dwords == 4);
             orders[VERT].pop_front();
          }
    
          if (*pack_vert_data_into_header) {
             unsigned sz = brw_sum_size(orders[VERT]) +
                           brw_sum_size(orders[VERT_FLAT]);
    
             if (sz % 8 == 0 || sz % 8 > 4)
                *pack_vert_data_into_header = false;
          }
       }
    
    
       if (INTEL_DEBUG(DEBUG_MESH)) {
          fprintf(stderr, "MUE attribute order:\n");
          for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
             if (!orders[i].empty())
                fprintf(stderr, "%d: ", i);
             for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
                fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
             }
             if (!orders[i].empty())
                fprintf(stderr, "\n");
          }
       }
    }
    
    /* Mesh URB Entry consists of an initial section
     *
     *  - Primitive Count
     *  - Primitive Indices (from 0 to Max-1)
     *  - Padding to 32B if needed
     *
     * optionally followed by a section for per-primitive data,
     * in which each primitive (from 0 to Max-1) gets
     *
     *  - Primitive Header (e.g. ViewportIndex)
     *  - Primitive Custom Attributes
     *
     * then followed by a section for per-vertex data
     *
     *  - Vertex Header (e.g. Position)
     *  - Vertex Custom Attributes
     *
     * Each per-element section has a pitch and a starting offset.  All the
     * individual attributes offsets in start_dw are considering the first entry
     * of the section (i.e. where the Position for first vertex, or ViewportIndex
     * for first primitive).  Attributes for other elements are calculated using
     * the pitch.
     */
    static void
    brw_compute_mue_map(const struct brw_compiler *compiler,
                        struct nir_shader *nir, struct brw_mue_map *map,
                        enum brw_mesh_index_format index_format, bool compact_mue)
    {
       memset(map, 0, sizeof(*map));
    
       memset(&map->start_dw[0], -1, sizeof(map->start_dw));
       memset(&map->len_dw[0], 0, sizeof(map->len_dw));
    
       unsigned vertices_per_primitive =
          mesa_vertices_per_prim(nir->info.mesh.primitive_type);
    
       map->max_primitives = nir->info.mesh.max_primitives_out;
       map->max_vertices = nir->info.mesh.max_vertices_out;
    
       uint64_t outputs_written = nir->info.outputs_written;
    
       /* One dword for primitives count then K extra dwords for each primitive. */
       switch (index_format) {
       case BRW_INDEX_FORMAT_U32:
          map->per_primitive_indices_dw = vertices_per_primitive;
          break;
       case BRW_INDEX_FORMAT_U888X:
          map->per_primitive_indices_dw = 1;
          break;
       default:
          unreachable("invalid index format");
       }
    
       map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
                                           map->max_primitives + 1, 8);
    
       /* Assign initial section. */
       if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
          map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
          map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
          outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
       }
       if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
          map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
          map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
                map->per_primitive_indices_dw * map->max_primitives;
          outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
       }
    
       const uint64_t per_primitive_header_bits =
             BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
             BITFIELD64_BIT(VARYING_SLOT_LAYER) |
             BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
             BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
    
       const uint64_t per_vertex_header_bits =
             BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
             BITFIELD64_BIT(VARYING_SLOT_POS) |
             BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
             BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
    
       std::list<struct attr_desc> orders[3];
       uint64_t regular_outputs = outputs_written &
             ~(per_primitive_header_bits | per_vertex_header_bits);
    
       /* packing into prim header is possible only if prim header is present */
       map->user_data_in_primitive_header = compact_mue &&
             (outputs_written & per_primitive_header_bits) != 0;
    
       /* Packing into vert header is always possible, but we allow it only
        * if full vec4 is available (so point size is not used) and there's
        * nothing between it and normal vertex data (so no clip distances).
        */
       map->user_data_in_vertex_header = compact_mue &&
             (outputs_written & per_vertex_header_bits) ==
                   BITFIELD64_BIT(VARYING_SLOT_POS);
    
       if (outputs_written & per_primitive_header_bits) {
          bool zero_layer_viewport = false;
          if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
             map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
                   map->per_primitive_start_dw + 0;
             map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
             /* Wa_16020916187: force 0 writes to layer and viewport slots */
             zero_layer_viewport =
                intel_needs_workaround(compiler->devinfo, 16020916187);
          }
    
          if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) ||
              zero_layer_viewport) {
             map->start_dw[VARYING_SLOT_LAYER] =
                   map->per_primitive_start_dw + 1; /* RTAIndex */
             map->len_dw[VARYING_SLOT_LAYER] = 1;
          }
    
          if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) ||
              zero_layer_viewport) {
              map->start_dw[VARYING_SLOT_VIEWPORT] =
                    map->per_primitive_start_dw + 2;
              map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
          }
    
          if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
             map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
                   map->per_primitive_start_dw + 3;
             map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
          }
    
          map->per_primitive_header_size_dw = 8;
          outputs_written &= ~per_primitive_header_bits;
       } else {
          map->per_primitive_header_size_dw = 0;
       }
    
       map->per_primitive_data_size_dw = 0;
    
       /* For fast linked libraries, we can't pack the MUE, as the fragment shader
        * will be compiled without access to the MUE map and won't be able to find
        * out where everything is.
        * Instead, keep doing things as we did before the packing, just laying out
        * everything in varying order, which is how the FS will expect them.
        */
       if (compact_mue) {
          brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
                                 &map->user_data_in_primitive_header,
                                 &map->user_data_in_vertex_header);
    
          unsigned start_dw = map->per_primitive_start_dw;
          if (map->user_data_in_primitive_header)
             start_dw += 4; /* first 4 dwords are used */
          else
             start_dw += map->per_primitive_header_size_dw;
          unsigned header_used_dw = 0;
    
          for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
             int location = (*it).location;
             if (location < 0) {
                start_dw += (*it).dwords;
                if (map->user_data_in_primitive_header && header_used_dw < 4)
                   header_used_dw += (*it).dwords;
                else
                   map->per_primitive_data_size_dw += (*it).dwords;
                assert(header_used_dw <= 4);
                continue;
             }
    
             assert(map->start_dw[location] == -1);
    
             assert(location == VARYING_SLOT_PRIMITIVE_ID ||
                    location >= VARYING_SLOT_VAR0);
    
             brw_mue_assign_position(&*it, map, start_dw);
    
             start_dw += (*it).dwords;
             if (map->user_data_in_primitive_header && header_used_dw < 4)
                header_used_dw += (*it).dwords;
             else
                map->per_primitive_data_size_dw += (*it).dwords;
             assert(header_used_dw <= 4);
             outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
          }
       } else {
          unsigned start_dw = map->per_primitive_start_dw +
                              map->per_primitive_header_size_dw;
    
          uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
          while (per_prim_outputs) {
             uint64_t location = ffsll(per_prim_outputs) - 1;
    
             assert(map->start_dw[location] == -1);
             assert(location == VARYING_SLOT_PRIMITIVE_ID ||
                    location >= VARYING_SLOT_VAR0);
    
             nir_variable *var =
                brw_nir_find_complete_variable_with_location(nir,
                                                             nir_var_shader_out,
                                                             location);
             struct attr_desc d;
             d.location = location;
             d.type     = brw_nir_get_var_type(nir, var);
             d.dwords   = glsl_count_dword_slots(d.type, false);
             d.slots    = glsl_count_attribute_slots(d.type, false);
    
             brw_mue_assign_position(&d, map, start_dw);
    
             map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
             start_dw += ALIGN(d.dwords, 4);
    
             per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
          }
       }
    
       map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
                                           map->per_primitive_data_size_dw, 8);
    
       map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
                                        map->per_primitive_pitch_dw *
                                        map->max_primitives, 8);
    
       /* TODO(mesh): Multiview. */
       unsigned fixed_header_size = 8;
       map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
                                              nir->info.clip_distance_array_size +
                                              nir->info.cull_distance_array_size, 8);
    
       if (outputs_written & per_vertex_header_bits) {
          if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
             map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
             map->len_dw[VARYING_SLOT_PSIZ] = 1;
          }
    
          if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
             map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
             map->len_dw[VARYING_SLOT_POS] = 4;
          }
    
          if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
             map->start_dw[VARYING_SLOT_CLIP_DIST0] =
                   map->per_vertex_start_dw + fixed_header_size + 0;
             map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
          }
    
          if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
             map->start_dw[VARYING_SLOT_CLIP_DIST1] =
                   map->per_vertex_start_dw + fixed_header_size + 4;
             map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
          }
    
          outputs_written &= ~per_vertex_header_bits;
       }
    
       /* cull distances should be lowered earlier */
       assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
       assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));
    
       map->per_vertex_data_size_dw = 0;
    
       /* For fast linked libraries, we can't pack the MUE, as the fragment shader
        * will be compiled without access to the MUE map and won't be able to find
        * out where everything is.
        * Instead, keep doing things as we did before the packing, just laying out
        * everything in varying order, which is how the FS will expect them.
        */
       if (compact_mue) {
          unsigned start_dw = map->per_vertex_start_dw;
          if (!map->user_data_in_vertex_header)
             start_dw += map->per_vertex_header_size_dw;
    
          unsigned header_used_dw = 0;
          for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
             for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
                int location = (*it).location;
                if (location < 0) {
                   start_dw += (*it).dwords;
                   if (map->user_data_in_vertex_header && header_used_dw < 4) {
                      header_used_dw += (*it).dwords;
                      assert(header_used_dw <= 4);
                      if (header_used_dw == 4)
                         start_dw += 4; /* jump over gl_position */
                   } else {
                      map->per_vertex_data_size_dw += (*it).dwords;
                   }
                   continue;
                }
    
                assert(map->start_dw[location] == -1);
    
                assert(location >= VARYING_SLOT_VAR0);
    
                brw_mue_assign_position(&*it, map, start_dw);
    
                start_dw += (*it).dwords;
                if (map->user_data_in_vertex_header && header_used_dw < 4) {
                   header_used_dw += (*it).dwords;
                   assert(header_used_dw <= 4);
                   if (header_used_dw == 4)
                      start_dw += 4; /* jump over gl_position */
                } else {
                   map->per_vertex_data_size_dw += (*it).dwords;
                }
                outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
             }
          }
       } else {
          unsigned start_dw = map->per_vertex_start_dw +
                              map->per_vertex_header_size_dw;
    
          uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
          while (per_vertex_outputs) {
             uint64_t location = ffsll(per_vertex_outputs) - 1;
    
             assert(map->start_dw[location] == -1);
             assert(location >= VARYING_SLOT_VAR0);
    
             nir_variable *var =
                brw_nir_find_complete_variable_with_location(nir,
                                                             nir_var_shader_out,
                                                             location);
             struct attr_desc d;
             d.location = location;
             d.type     = brw_nir_get_var_type(nir, var);
             d.dwords   = glsl_count_dword_slots(d.type, false);
             d.slots    = glsl_count_attribute_slots(d.type, false);
    
             brw_mue_assign_position(&d, map, start_dw);
    
             map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
             start_dw += ALIGN(d.dwords, 4);
    
             per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
          }
       }
    
       map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
                                        map->per_vertex_data_size_dw, 8);
    
       map->size_dw =
          map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices;
    
       assert(map->size_dw % 8 == 0);
    }
    
    static void
    brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
    {
       fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
               map->size_dw, map->max_primitives, map->max_vertices);
       fprintf(fp, "  <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
               map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
               map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
               map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
       fprintf(fp, "  <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
               map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
               map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
               map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);
    
       fprintf(fp, "  ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
               map->per_primitive_start_dw,
               map->per_primitive_header_size_dw,
               map->per_primitive_data_size_dw,
               map->per_primitive_pitch_dw);
    
       for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
          if (map->start_dw[i] < 0)
             continue;
    
          const unsigned offset = map->start_dw[i];
          const unsigned len = map->len_dw[i];
    
          if (offset < map->per_primitive_start_dw ||
              offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
             continue;
    
          const char *name =
                gl_varying_slot_name_for_stage((gl_varying_slot)i,
                                               MESA_SHADER_MESH);
    
          fprintf(fp, "  <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
                  name, i);
       }
    
       fprintf(fp, "  ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
               map->per_vertex_start_dw,
               map->per_vertex_header_size_dw,
               map->per_vertex_data_size_dw,
               map->per_vertex_pitch_dw);
    
       for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
          if (map->start_dw[i] < 0)
             continue;
    
          const unsigned offset = map->start_dw[i];
          const unsigned len = map->len_dw[i];
    
          if (offset < map->per_vertex_start_dw ||
              offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
             continue;
    
          nir_variable *var =
                nir_find_variable_with_location(nir, nir_var_shader_out, i);
          bool flat = var->data.interpolation == INTERP_MODE_FLAT;
    
          const char *name =
                gl_varying_slot_name_for_stage((gl_varying_slot)i,
                                               MESA_SHADER_MESH);
    
          fprintf(fp, "  <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
                  name, i, flat ? " (flat)" : "");
       }
    
       fprintf(fp, "\n");
    }
    
    static void
    brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map)
    {
       nir_foreach_shader_out_variable(var, nir) {
          int location = var->data.location;
          assert(location >= 0);
          assert(map->start_dw[location] != -1);
          var->data.driver_location = map->start_dw[location];
       }
    
       NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
                type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
    }
    
    static void
    brw_nir_initialize_mue(nir_shader *nir,
                           const struct brw_mue_map *map,
                           unsigned dispatch_width)
    {
       assert(map->per_primitive_header_size_dw > 0);
    
       nir_builder b;
       nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
       b = nir_builder_at(nir_before_impl(entrypoint));
    
       nir_def *dw_off = nir_imm_int(&b, 0);
       nir_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0);
    
       /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? */
    
       assert(!nir->info.workgroup_size_variable);
       const unsigned workgroup_size = nir->info.workgroup_size[0] *
                                       nir->info.workgroup_size[1] *
                                       nir->info.workgroup_size[2];
    
       /* Invocations from a single workgroup will cooperate in zeroing MUE. */
    
       /* How many prims each invocation needs to cover without checking its index? */
       unsigned prims_per_inv = map->max_primitives / workgroup_size;
    
       /* Zero first 4 dwords of MUE Primitive Header:
        * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask.
        */
    
       nir_def *local_invocation_index = nir_load_local_invocation_index(&b);
    
       /* Zero primitive headers distanced by workgroup_size, starting from
        * invocation index.
        */
       for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) {
          nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
                                               prim_in_inv * workgroup_size);
    
          nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                         .base = (int)map->per_primitive_start_dw,
                                         .write_mask = WRITEMASK_XYZW,
                                         .component = 0,
                                         .src_type = nir_type_uint32);
       }
    
       /* How many prims are left? */
       unsigned remaining = map->max_primitives % workgroup_size;
    
       if (remaining) {
          /* Zero "remaining" primitive headers starting from the last one covered
           * by the loop above + workgroup_size.
           */
          nir_def *cmp = nir_ilt_imm(&b, local_invocation_index, remaining);
          nir_if *if_stmt = nir_push_if(&b, cmp);
          {
             nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
                                                   prims_per_inv * workgroup_size);
    
             nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                            .base = (int)map->per_primitive_start_dw,
                                            .write_mask = WRITEMASK_XYZW,
                                            .component = 0,
                                            .src_type = nir_type_uint32);
          }
          nir_pop_if(&b, if_stmt);
       }
    
       /* If there's more than one subgroup, then we need to wait for all of them
        * to finish initialization before we can proceed. Otherwise some subgroups
        * may start filling MUE before other finished initializing.
        */
       if (workgroup_size > dispatch_width) {
          nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP,
                             NIR_MEMORY_ACQ_REL, nir_var_shader_out);
       }
    
       if (remaining) {
          nir_metadata_preserve(entrypoint, nir_metadata_none);
       } else {
          nir_metadata_preserve(entrypoint, nir_metadata_control_flow);
       }
    }
    
    static void
    brw_nir_adjust_offset(nir_builder *b, nir_intrinsic_instr *intrin, uint32_t pitch)
    {
       nir_src *index_src = nir_get_io_arrayed_index_src(intrin);
       nir_src *offset_src = nir_get_io_offset_src(intrin);
    
       b->cursor = nir_before_instr(&intrin->instr);
       nir_def *offset =
          nir_iadd(b,
                   offset_src->ssa,
                   nir_imul_imm(b, index_src->ssa, pitch));
       nir_src_rewrite(offset_src, offset);
    }
    
    static bool
    brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b,
                                                    nir_intrinsic_instr *intrin,
                                                    void *data)
    {
       const struct brw_mue_map *map = (const struct brw_mue_map *) data;
    
       /* Remap per_vertex and per_primitive offsets using the extra source and
        * the pitch.
        */
       switch (intrin->intrinsic) {
       case nir_intrinsic_load_per_vertex_output:
       case nir_intrinsic_store_per_vertex_output:
          brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw);
    
          return true;
    
       case nir_intrinsic_load_per_primitive_output:
       case nir_intrinsic_store_per_primitive_output: {
          struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
          uint32_t pitch;
          if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES)
             pitch = map->per_primitive_indices_dw;
          else
             pitch = map->per_primitive_pitch_dw;
    
          brw_nir_adjust_offset(b, intrin, pitch);
    
          return true;
       }
    
       default:
          return false;
       }
    }
    
    static bool
    brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map)
    {
       return nir_shader_intrinsics_pass(nir,
                                           brw_nir_adjust_offset_for_arrayed_indices_instr,
                                           nir_metadata_control_flow,
                                           (void *)map);
    }
    
    struct index_packing_state {
       unsigned vertices_per_primitive;
       nir_variable *original_prim_indices;
       nir_variable *packed_prim_indices;
    };
    
    static bool
    brw_can_pack_primitive_indices(nir_shader *nir, struct index_packing_state *state)
    {
       /* can single index fit into one byte of U888X format? */
       if (nir->info.mesh.max_vertices_out > 255)
          return false;
    
       state->vertices_per_primitive =
             mesa_vertices_per_prim(nir->info.mesh.primitive_type);
       /* packing point indices doesn't help */
       if (state->vertices_per_primitive == 1)
          return false;
    
       state->original_prim_indices =
          nir_find_variable_with_location(nir,
                                          nir_var_shader_out,
                                          VARYING_SLOT_PRIMITIVE_INDICES);
       /* no indices = no changes to the shader, but it's still worth it,
        * because less URB space will be used
        */
       if (!state->original_prim_indices)
          return true;
    
       ASSERTED const struct glsl_type *type = state->original_prim_indices->type;
       assert(glsl_type_is_array(type));
       assert(glsl_type_is_vector(glsl_without_array(type)));
       assert(glsl_without_array(type)->vector_elements == state->vertices_per_primitive);
    
       nir_foreach_function_impl(impl, nir) {
          nir_foreach_block(block, impl) {
             nir_foreach_instr(instr, block) {
                if (instr->type != nir_instr_type_intrinsic)
                   continue;
    
                nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
    
                if (intrin->intrinsic != nir_intrinsic_store_deref) {
                   /* any unknown deref operation on primitive indices -> don't pack */
                   unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
                   for (unsigned i = 0; i < num_srcs; i++) {
                      nir_deref_instr *deref = nir_src_as_deref(intrin->src[i]);
                      if (!deref)
                         continue;
                      nir_variable *var = nir_deref_instr_get_variable(deref);
    
                      if (var == state->original_prim_indices)
                         return false;
                   }
    
                   continue;
                }
    
                nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
                if (!deref)
                   continue;
    
                nir_variable *var = nir_deref_instr_get_variable(deref);
                if (var != state->original_prim_indices)
                   continue;
    
                if (deref->deref_type != nir_deref_type_array)
                   return false; /* unknown chain of derefs */
    
                nir_deref_instr *var_deref = nir_src_as_deref(deref->parent);
                if (!var_deref || var_deref->deref_type != nir_deref_type_var)
                   return false; /* unknown chain of derefs */
    
                assert (var_deref->var == state->original_prim_indices);
    
                unsigned write_mask = nir_intrinsic_write_mask(intrin);
    
                /* If only some components are written, then we can't easily pack.
                 * In theory we could, by loading current dword value, bitmasking
                 * one byte and storing back the whole dword, but it would be slow
                 * and could actually decrease performance. TODO: reevaluate this
                 * once there will be something hitting this.
                 */
                if (write_mask != BITFIELD_MASK(state->vertices_per_primitive))
                   return false;
             }
          }
       }
    
       return true;
    }
    
    static bool
    brw_pack_primitive_indices_instr(nir_builder *b, nir_intrinsic_instr *intrin,
                                     void *data)
    {
       if (intrin->intrinsic != nir_intrinsic_store_deref)
          return false;
    
       nir_deref_instr *array_deref = nir_src_as_deref(intrin->src[0]);
       if (!array_deref || array_deref->deref_type != nir_deref_type_array)
          return false;
    
       nir_deref_instr *var_deref = nir_src_as_deref(array_deref->parent);
       if (!var_deref || var_deref->deref_type != nir_deref_type_var)
          return false;
    
       struct index_packing_state *state =
             (struct index_packing_state *)data;
    
       nir_variable *var = var_deref->var;
    
       if (var != state->original_prim_indices)
          return false;
    
       unsigned vertices_per_primitive = state->vertices_per_primitive;
    
       b->cursor = nir_before_instr(&intrin->instr);
    
       nir_deref_instr *new_var_deref =
             nir_build_deref_var(b, state->packed_prim_indices);
       nir_deref_instr *new_array_deref =
             nir_build_deref_array(b, new_var_deref, array_deref->arr.index.ssa);
    
       nir_src *data_src = &intrin->src[1];
       nir_def *data_def =
             data_src->ssa;
    
       nir_def *new_data =
             nir_ior(b, nir_ishl_imm(b, nir_channel(b, data_def, 0), 0),
                        nir_ishl_imm(b, nir_channel(b, data_def, 1), 8));
    
       if (vertices_per_primitive >= 3) {
          new_data =
                nir_ior(b, new_data,
                           nir_ishl_imm(b, nir_channel(b, data_def, 2), 16));
       }
    
       nir_build_store_deref(b, &new_array_deref->def, new_data);
    
       nir_instr_remove(&intrin->instr);
    
       return true;
    }
    
    static bool
    brw_pack_primitive_indices(nir_shader *nir, void *data)
    {
       struct index_packing_state *state = (struct index_packing_state *)data;
    
       const struct glsl_type *new_type =
             glsl_array_type(glsl_uint_type(),
                             nir->info.mesh.max_primitives_out,
                             0);
    
       state->packed_prim_indices =
             nir_variable_create(nir, nir_var_shader_out,
                                 new_type, "gl_PrimitiveIndicesPacked");
       state->packed_prim_indices->data.location = VARYING_SLOT_PRIMITIVE_INDICES;
       state->packed_prim_indices->data.interpolation = INTERP_MODE_NONE;
       state->packed_prim_indices->data.per_primitive = 1;
    
       return nir_shader_intrinsics_pass(nir, brw_pack_primitive_indices_instr,
                                           nir_metadata_control_flow,
                                           data);
    }
    
    static bool
    brw_mesh_autostrip_enable(const struct brw_compiler *compiler, struct nir_shader *nir,
                              struct brw_mue_map *map)
    {
       /* Auto-striping can be enabled when shader either doesn't write to
        * RTA Index and VP Index or writes the same values for all primitives.
        * Since determining whether shader writes the same value across the whole
        * workgroup (not just subgroup!) is tricky, we do the simplest possible
        * thing - say yes only when shader writes const values and they all match.
        *
        * TODO: improve this
        */
    
       if (compiler->devinfo->ver < 20)
          return false;
    
       const uint64_t outputs_written = nir->info.outputs_written;
    
       /* Wa_16020916187
        * We've allocated slots for layer/viewport in brw_compute_mue_map() if this
        * workaround is needed and will let brw_nir_initialize_mue() initialize
        * those to 0. The workaround also requires disabling autostrip.
        */
       if (intel_needs_workaround(compiler->devinfo, 16020916187) &&
           (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) & outputs_written))
           return false;
    
       if (map->start_dw[VARYING_SLOT_VIEWPORT] < 0 &&
           map->start_dw[VARYING_SLOT_LAYER] < 0)
          return true;
    
       nir_def *vp = NULL;
       nir_def *layer = NULL;
    
       nir_foreach_function(function, nir) {
          if (!function->impl)
             continue;
    
          nir_foreach_block(block, function->impl) {
             nir_foreach_instr(instr, block) {
                if (instr->type != nir_instr_type_intrinsic)
                   continue;
    
                nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
                if (intrin->intrinsic != nir_intrinsic_store_per_primitive_output)
                   continue;
    
                struct nir_io_semantics io = nir_intrinsic_io_semantics(intrin);
                bool is_vp = io.location == VARYING_SLOT_VIEWPORT;
                bool is_layer = io.location == VARYING_SLOT_LAYER;
                if (!is_vp && !is_layer)
                   continue;
    
                nir_src *src = &intrin->src[0];
    
                if (!nir_src_is_const(*src))
                   return false;
    
                nir_def **cmp;
                if (is_vp)
                   cmp = &vp;
                else
                   cmp = &layer;
    
                if (*cmp == NULL)
                   *cmp = src->ssa;
                else if (*cmp != src->ssa)
                   return false;
             }
          }
       }
    
       return true;
    }
    
    const unsigned *
    brw_compile_mesh(const struct brw_compiler *compiler,
                     struct brw_compile_mesh_params *params)
    {
       const struct intel_device_info *devinfo = compiler->devinfo;
       struct nir_shader *nir = params->base.nir;
       const struct brw_mesh_prog_key *key = params->key;
       struct brw_mesh_prog_data *prog_data = params->prog_data;
       const bool debug_enabled = brw_should_print_shader(nir, DEBUG_MESH);
    
       prog_data->base.base.stage = MESA_SHADER_MESH;
       prog_data->base.base.total_shared = nir->info.shared_size;
       prog_data->base.base.total_scratch = 0;
    
       prog_data->base.local_size[0] = nir->info.workgroup_size[0];
       prog_data->base.local_size[1] = nir->info.workgroup_size[1];
       prog_data->base.local_size[2] = nir->info.workgroup_size[2];
    
       prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1;
       prog_data->cull_distance_mask =
             ((1 << nir->info.cull_distance_array_size) - 1) <<
              nir->info.clip_distance_array_size;
       prog_data->primitive_type = nir->info.mesh.primitive_type;
    
       struct index_packing_state index_packing_state = {};
       if (brw_can_pack_primitive_indices(nir, &index_packing_state)) {
          if (index_packing_state.original_prim_indices)
             NIR_PASS(_, nir, brw_pack_primitive_indices, &index_packing_state);
          prog_data->index_format = BRW_INDEX_FORMAT_U888X;
       } else {
          prog_data->index_format = BRW_INDEX_FORMAT_U32;
       }
    
       prog_data->uses_drawid =
          BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
    
       NIR_PASS(_, nir, brw_nir_lower_mesh_primitive_count);
       NIR_PASS(_, nir, nir_opt_dce);
       NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_shader_out, NULL);
    
       brw_nir_lower_tue_inputs(nir, params->tue_map);
    
       brw_compute_mue_map(compiler, nir, &prog_data->map,
                           prog_data->index_format, key->compact_mue);
       brw_nir_lower_mue_outputs(nir, &prog_data->map);
    
       prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map);
    
       NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
       prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
    
       brw_simd_selection_state simd_state{
          .devinfo = compiler->devinfo,
          .prog_data = &prog_data->base,
          .required_width = brw_required_dispatch_width(&nir->info),
       };
    
       std::unique_ptr<fs_visitor> v[3];
    
       for (unsigned i = 0; i < 3; i++) {
          const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;
    
          if (!brw_simd_should_compile(simd_state, simd))
             continue;
    
          const unsigned dispatch_width = 8 << simd;
    
          nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
    
          /*
           * When Primitive Header is enabled, we may not generates writes to all
           * fields, so let's initialize everything.
           */
          if (prog_data->map.per_primitive_header_size_dw > 0)
             NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width);
    
          brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
    
          NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
          /* Load uniforms can do a better job for constants, so fold before it. */
          NIR_PASS(_, shader, nir_opt_constant_folding);
    
          NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
    
          brw_postprocess_nir(shader, compiler, debug_enabled,
                              key->base.robust_flags);
    
          v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
                                                 &key->base,
                                                 &prog_data->base.base,
                                                 shader, dispatch_width,
                                                 params->base.stats != NULL,
                                                 debug_enabled);
    
          if (prog_data->base.prog_mask) {
             unsigned first = ffs(prog_data->base.prog_mask) - 1;
             v[simd]->import_uniforms(v[first].get());
          }
    
          const bool allow_spilling = simd == 0 ||
             (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1));
          if (run_task_mesh(*v[simd], allow_spilling)) {
             brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
    
             if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers)
                break;
          } else {
             simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
          }
       }
    
       int selected_simd = brw_simd_select(simd_state);
       if (selected_simd < 0) {
          params->base.error_str =
             ralloc_asprintf(params->base.mem_ctx,
                             "Can't compile shader: "
                             "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
                             simd_state.error[0], simd_state.error[1],
                             simd_state.error[2]);
          return NULL;
       }
    
       fs_visitor *selected = v[selected_simd].get();
       prog_data->base.prog_mask = 1 << selected_simd;
       prog_data->base.base.grf_used = MAX2(prog_data->base.base.grf_used,
                                            selected->grf_used);
    
       if (unlikely(debug_enabled)) {
          if (params->tue_map) {
             fprintf(stderr, "Mesh Input ");
             brw_print_tue_map(stderr, params->tue_map);
          }
          fprintf(stderr, "Mesh Output ");
          brw_print_mue_map(stderr, &prog_data->map, nir);
       }
    
       brw_generator g(compiler, &params->base, &prog_data->base.base,
                      MESA_SHADER_MESH);
       if (unlikely(debug_enabled)) {
          g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                         "%s mesh shader %s",
                                         nir->info.label ? nir->info.label
                                                         : "unnamed",
                                         nir->info.name));
       }
    
       g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
                       selected->performance_analysis.require(), params->base.stats);
       g.add_const_data(nir->constant_data, nir->constant_data_size);
       return g.get_assembly();
    }