Hash :
19f40531
Author :
Date :
2022-02-11T11:17:22
Vulkan: Enable subgroup feature in GenerateMipmap.comp Enable subgroup feature to optimize shaders used for generating mipmaps. Remove definition of SPD_NO_WAVE_OPERATIONS to enable subgroups. Regenerate all internal shaders with target as vulkan 1.1 to enable subgroup feature. Bug: angleproject:7006 Change-Id: I36f50f3d27517fedb52e3028a8f6288347b1bfa0 Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/3453288 Reviewed-by: Shahbaz Youssefi <syoussefi@chromium.org> Commit-Queue: mohan maiya <m.maiya@samsung.com>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
//
// Copyright 2020 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// GenerateMipmap.comp: Generate mipmap of texture in a single pass. Uses AMD's FFX SPD located in
// third_party/ffx_spd/.
//
// Note that due to bugs in that code, we only support downsampling 6 levels at a time (instead of
// the 12 mips supported by FFX SPD). The issue is that FFX SPD tries to `imageLoad` from `dst[5]`
// with coordinates that can potentially be outside the level extents. This results in transparent
// black reads. A possible solution is to clamp the coordinates in `SpdLoad`. However, we opted to
// support only 6 levels at a time because:
//
// - On most Android vendors, which is our primary optimization target,
// maxPerStageDescriptorStorageImages is commonly 4, which means we cannot generate mipmaps for
// even 6 levels at a time anyway.
// - By removing support for >6 mips, we can remove the atomic counter logic required by FFX SPD to
// single out an invocation which will be downsampling the rest of the 6 mips. This makes the
// generation of the first 6 mips faster.
#version 450 core
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
// Select the image format layout qualifier used for the destination images (see the `dst`
// declaration below) from the shader-variant flags. The Is* flags are not defined in this file;
// they are expected to be provided by whatever builds the shader variants. The *_UseHalf variants
// pick the same format as their base variant; they additionally enable A_HALF further down.
#if IsRGBA8 || IsRGBA8_UseHalf
#define DST_FORMAT rgba8
#elif IsRGBA16 || IsRGBA16_UseHalf
#define DST_FORMAT rgba16
#elif IsRGBA32F
#define DST_FORMAT rgba32f
#else
// Fail compilation instead of building a variant without a destination format.
#error "Not all formats are accounted for"
#endif
// Select how many destination images are declared (the size of the `dst` array below). The
// DestSize* flags are not defined in this file; they are expected to be provided externally.
#if DestSize4
#define DST_COUNT 4
#elif DestSize6
#define DST_COUNT 6
#else
// Fail compilation instead of building a variant without a destination count.
#error "Not all destination sizes are accounted for"
#endif
// TODO: Support sRGB
// TODO: Support non-float formats
// Note: subgroup (wave) operations are in use; SPD_NO_WAVE_OPERATIONS is intentionally not defined.
// One-dimensional workgroup of 256 invocations.
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
// Destination images, written with imageStore() by the SpdStore* callbacks; the array element is
// chosen by the `mip` index those callbacks receive.
layout(set = 0, binding = 0, DST_FORMAT) uniform coherent image2D dst[DST_COUNT];
// Source image, sampled with texture() by the SpdLoadSourceImage* callbacks.
layout(set = 0, binding = 1) uniform sampler2D src;
// Parameters supplied through push constants.
layout(push_constant) uniform PushConstants {
// Inverse extents of src image for uv calculation.
// Used by the SpdLoadSourceImage* callbacks to turn integer coordinates into texture coordinates.
vec2 invSrcExtent;
// Number of levels to generate mipmaps for.
// Forwarded unchanged to SpdDownsample*/SpdDownsampleH in main().
uint levelCount;
} params;
// Configuration macros for ffx_a.h; they have to be defined before the include below.
// NOTE(review): A_GPU / A_GLSL presumably select the GPU + GLSL flavour of the portability types
// (AF4, AH4, AU1, ASU2, ...) used throughout this file - confirm against ffx_a.h.
#define A_GPU
#define A_GLSL
// For 8- and 16-bit-per-channel images, use half instead of float if supported.
// A_HALF also selects which set of callbacks (the *H ones or the float ones) this file defines.
#if IsRGBA8_UseHalf || IsRGBA16_UseHalf
#define A_HALF
#endif
#include "third_party/ffx_spd/ffx_a.h"
// Shared memory
// 16x16 array of intermediate colors, read and written only through the
// SpdLoadIntermediate*/SpdStoreIntermediate* callbacks. Its element type follows the precision
// path chosen by A_HALF.
#ifdef A_HALF
shared AH4 spd_intermediate[16][16];
#else
shared AF4 spd_intermediate[16][16];
#endif
// NOTE(review): not referenced anywhere else in the visible source; the atomic counter callbacks
// further down are defined as no-ops. Confirm that ffx_spd.h does not need it before removing.
shared AU1 spd_counter;
// Use a linear sampler to sample from mip 0 instead of multiple loads and manual averaging.
// The SpdLoadSourceImage* callbacks in this file accordingly perform a single texture() fetch.
// Defined ahead of the ffx_spd.h include further down, which presumably reads it.
#define SPD_LINEAR_SAMPLER
// Utility functions used by ffx_spd.h
#ifdef A_HALF
// NOTE(review): presumably tells ffx_spd.h to build only its packed (half-precision) code path,
// which would be why only the *H callbacks are defined in this branch - confirm against ffx_spd.h.
#define SPD_PACKED_ONLY
// Source fetch callback (half-precision path).
//
// Converts the integer coordinate `p` into a texture coordinate by scaling it with the inverse
// source extents and then adding one more inverse extent, i.e. the sample lands at (p + 1) in
// texel units. The color is fetched with a single texture() call and narrowed to AH4.
AH4 SpdLoadSourceImageH(ASU2 p)
{
    AF2 uv = p * params.invSrcExtent + params.invSrcExtent;
    AF4 sampled = texture(src, uv);
    return AH4(sampled);
}
// Destination read-back callback (half-precision path).
//
// In FFX SPD this callback takes a 32-bit signed integer 2D coordinate and loads the color of the
// 5th mip level, so that the last thread group can compute the subsequent mips from values that
// were produced by different thread groups.
//
// This shader never generates more than 6 levels, so the callback is unused; it ignores `p` and
// returns zero.
AH4 SpdLoadH(ASU2 p)
{
    const AH4 zeroColor = AH4(0);
    return zeroColor;
}
// Store callback (half-precision path): writes `value` to texel `p` of destination image
// dst[mip]. The half-precision color is widened to AF4 before the imageStore() call.
void SpdStoreH(ASU2 p, AH4 value, AU1 mip)
{
    AF4 widened = AF4(value);
    imageStore(dst[mip], p, widened);
}
// LDS load callback (half-precision path): returns the intermediate color stored at [x][y] of
// the shared spd_intermediate array.
AH4 SpdLoadIntermediateH(AU1 x, AU1 y)
{
    AH4 stored = spd_intermediate[x][y];
    return stored;
}
// LDS store callback (half-precision path): writes `value` to slot [x][y] of the shared
// spd_intermediate array.
void SpdStoreIntermediateH(AU1 x, AU1 y, AH4 value)
{
    spd_intermediate[x][y] = value;
}
// Reduction callback (half-precision path): collapses the four values of a 2x2 footprint into
// one output value by averaging them.
AH4 SpdReduce4H(AH4 v0, AH4 v1, AH4 v2, AH4 v3)
{
    // Accumulate left to right, in the same order as a plain v0 + v1 + v2 + v3 expression.
    AH4 total = v0 + v1;
    total += v2;
    total += v3;
    return total * AH1(0.25);
}
#else // A_HALF
// Source fetch callback (float path).
//
// Converts the integer coordinate `p` into a texture coordinate by scaling it with the inverse
// source extents and then adding one more inverse extent, i.e. the sample lands at (p + 1) in
// texel units. The color is fetched with a single texture() call.
AF4 SpdLoadSourceImage(ASU2 p)
{
    AF2 uv = p * params.invSrcExtent + params.invSrcExtent;
    AF4 sampled = texture(src, uv);
    return sampled;
}
// Destination read-back callback (float path).
//
// In FFX SPD this callback takes a 32-bit signed integer 2D coordinate and loads the color of the
// 5th mip level, so that the last thread group can compute the subsequent mips from values that
// were produced by different thread groups.
//
// This shader never generates more than 6 levels, so the callback is unused; it ignores `p` and
// returns zero.
AF4 SpdLoad(ASU2 p)
{
    const AF4 zeroColor = AF4(0);
    return zeroColor;
}
// Store callback (float path): writes `value` to texel `p` of destination image dst[mip].
void SpdStore(ASU2 p, AF4 value, AU1 mip)
{
    imageStore(dst[mip], p, value);
}
// LDS load callback (float path): returns the intermediate color stored at [x][y] of the shared
// spd_intermediate array.
AF4 SpdLoadIntermediate(AU1 x, AU1 y)
{
    AF4 stored = spd_intermediate[x][y];
    return stored;
}
// LDS store callback (float path): writes `value` to slot [x][y] of the shared spd_intermediate
// array.
void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value)
{
    spd_intermediate[x][y] = value;
}
// Reduction callback (float path): collapses the four values of a 2x2 footprint into one output
// value by averaging them.
AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3)
{
    // Accumulate left to right, in the same order as a plain v0 + v1 + v2 + v3 expression.
    AF4 total = v0 + v1;
    total += v2;
    total += v3;
    return total * 0.25;
}
#endif // A_HALF
// Define the atomic counter increase function. We don't support more than 6 mips, so these are
// unused. Returned value is arbitrary as SpdDownsample will early out before looking at it.
//
// Both are macros rather than functions: the increase expands to nothing and the getter expands
// to the constant 0. They are defined before ffx_spd.h is included on the following line.
#define SpdIncreaseAtomicCounter()
#define SpdGetAtomicCounter() 0
#include "third_party/ffx_spd/ffx_spd.h"
// Entry point: runs the FFX SPD downsample for this invocation, using the half-precision or the
// float entry point depending on A_HALF.
void main()
{
    const uvec2 groupId = gl_WorkGroupID.xy;
    const uint invocationIndex = gl_LocalInvocationIndex;
    const uint mipCount = params.levelCount;

    // NOTE(review): the trailing 0 is presumably SPD's work-group count, which would only matter
    // for the atomic-counter path that this shader disables - confirm against ffx_spd.h.
#ifdef A_HALF
    SpdDownsampleH(groupId, invocationIndex, mipCount, 0);
#else
    SpdDownsample(groupId, invocationIndex, mipCount, 0);
#endif
}