mirror of
				https://git.suyu.dev/suyu/suyu
				synced 2025-11-03 16:39:01 -06:00 
			
		
		
		
	astc_decoder: Compute offset swizzles in-shader
Removes the dependency on the swizzle table buffer and on the bytes_per_block_log2 uniform, which is constant for all ASTC texture sizes.
This commit is contained in:
		@@ -10,8 +10,7 @@
 | 
			
		||||
#define END_PUSH_CONSTANTS };
 | 
			
		||||
#define UNIFORM(n)
 | 
			
		||||
#define BINDING_INPUT_BUFFER 0
 | 
			
		||||
#define BINDING_SWIZZLE_BUFFER 1
 | 
			
		||||
#define BINDING_OUTPUT_IMAGE 2
 | 
			
		||||
#define BINDING_OUTPUT_IMAGE 1
 | 
			
		||||
 | 
			
		||||
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
 | 
			
		||||
 | 
			
		||||
@@ -19,7 +18,6 @@
 | 
			
		||||
#define END_PUSH_CONSTANTS
 | 
			
		||||
#define UNIFORM(n) layout(location = n) uniform
 | 
			
		||||
#define BINDING_INPUT_BUFFER 0
 | 
			
		||||
#define BINDING_SWIZZLE_BUFFER 1
 | 
			
		||||
#define BINDING_OUTPUT_IMAGE 0
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
@@ -28,13 +26,11 @@ layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
 | 
			
		||||
 | 
			
		||||
BEGIN_PUSH_CONSTANTS
 | 
			
		||||
UNIFORM(1) uvec2 block_dims;
 | 
			
		||||
 | 
			
		||||
UNIFORM(2) uint bytes_per_block_log2;
 | 
			
		||||
UNIFORM(3) uint layer_stride;
 | 
			
		||||
UNIFORM(4) uint block_size;
 | 
			
		||||
UNIFORM(5) uint x_shift;
 | 
			
		||||
UNIFORM(6) uint block_height;
 | 
			
		||||
UNIFORM(7) uint block_height_mask;
 | 
			
		||||
UNIFORM(2) uint layer_stride;
 | 
			
		||||
UNIFORM(3) uint block_size;
 | 
			
		||||
UNIFORM(4) uint x_shift;
 | 
			
		||||
UNIFORM(5) uint block_height;
 | 
			
		||||
UNIFORM(6) uint block_height_mask;
 | 
			
		||||
END_PUSH_CONSTANTS
 | 
			
		||||
 | 
			
		||||
struct EncodingData {
 | 
			
		||||
@@ -53,35 +49,17 @@ struct TexelWeightParams {
 | 
			
		||||
    bool void_extent_hdr;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Swizzle data
 | 
			
		||||
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
 | 
			
		||||
    uint swizzle_table[];
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 {
 | 
			
		||||
    uvec4 astc_data[];
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image;
 | 
			
		||||
 | 
			
		||||
const uint GOB_SIZE_X = 64;
 | 
			
		||||
const uint GOB_SIZE_Y = 8;
 | 
			
		||||
const uint GOB_SIZE_Z = 1;
 | 
			
		||||
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
 | 
			
		||||
 | 
			
		||||
const uint GOB_SIZE_X_SHIFT = 6;
 | 
			
		||||
const uint GOB_SIZE_Y_SHIFT = 3;
 | 
			
		||||
const uint GOB_SIZE_Z_SHIFT = 0;
 | 
			
		||||
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
 | 
			
		||||
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;
 | 
			
		||||
 | 
			
		||||
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
 | 
			
		||||
 | 
			
		||||
const int BLOCK_SIZE_IN_BYTES = 16;
 | 
			
		||||
 | 
			
		||||
const int BLOCK_INFO_ERROR = 0;
 | 
			
		||||
const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
 | 
			
		||||
const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
 | 
			
		||||
const int BLOCK_INFO_NORMAL = 3;
 | 
			
		||||
const uint BYTES_PER_BLOCK_LOG2 = 4;
 | 
			
		||||
 | 
			
		||||
const int JUST_BITS = 0;
 | 
			
		||||
const int QUINT = 1;
 | 
			
		||||
@@ -168,8 +146,10 @@ int texel_vector_index = 0;
 | 
			
		||||
uint unquantized_texel_weights[2][144];
 | 
			
		||||
 | 
			
		||||
uint SwizzleOffset(uvec2 pos) {
 | 
			
		||||
    pos = pos & SWIZZLE_MASK;
 | 
			
		||||
    return swizzle_table[pos.y * 64 + pos.x];
 | 
			
		||||
    uint x = pos.x;
 | 
			
		||||
    uint y = pos.y;
 | 
			
		||||
    return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 +
 | 
			
		||||
                          (y % 2) * 16 + (x % 16);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
 | 
			
		||||
@@ -1253,7 +1233,7 @@ void DecompressBlock(ivec3 coord) {
 | 
			
		||||
 | 
			
		||||
void main() {
 | 
			
		||||
    uvec3 pos = gl_GlobalInvocationID;
 | 
			
		||||
    pos.x <<= bytes_per_block_log2;
 | 
			
		||||
    pos.x <<= BYTES_PER_BLOCK_LOG2;
 | 
			
		||||
 | 
			
		||||
    // Read as soon as possible due to its latency
 | 
			
		||||
    const uint swizzle = SwizzleOffset(pos.xy);
 | 
			
		||||
 
 | 
			
		||||
@@ -68,7 +68,6 @@ UtilShaders::~UtilShaders() = default;
 | 
			
		||||
void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
 | 
			
		||||
                             std::span<const VideoCommon::SwizzleParameters> swizzles) {
 | 
			
		||||
    static constexpr GLuint BINDING_INPUT_BUFFER = 0;
 | 
			
		||||
    static constexpr GLuint BINDING_SWIZZLE_BUFFER = 1;
 | 
			
		||||
    static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
 | 
			
		||||
 | 
			
		||||
    const Extent2D tile_size{
 | 
			
		||||
@@ -76,10 +75,9 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
 | 
			
		||||
        .height = VideoCore::Surface::DefaultBlockHeight(image.info.format),
 | 
			
		||||
    };
 | 
			
		||||
    program_manager.BindComputeProgram(astc_decoder_program.handle);
 | 
			
		||||
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
 | 
			
		||||
 | 
			
		||||
    glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
 | 
			
		||||
    glUniform2ui(1, tile_size.width, tile_size.height);
 | 
			
		||||
 | 
			
		||||
    // Ensure buffer data is valid before dispatching
 | 
			
		||||
    glFlush();
 | 
			
		||||
    for (const SwizzleParameters& swizzle : swizzles) {
 | 
			
		||||
@@ -90,13 +88,13 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
 | 
			
		||||
        const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
 | 
			
		||||
        ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0}));
 | 
			
		||||
        ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0}));
 | 
			
		||||
        ASSERT(params.bytes_per_block_log2 == 4);
 | 
			
		||||
 | 
			
		||||
        glUniform1ui(2, params.bytes_per_block_log2);
 | 
			
		||||
        glUniform1ui(3, params.layer_stride);
 | 
			
		||||
        glUniform1ui(4, params.block_size);
 | 
			
		||||
        glUniform1ui(5, params.x_shift);
 | 
			
		||||
        glUniform1ui(6, params.block_height);
 | 
			
		||||
        glUniform1ui(7, params.block_height_mask);
 | 
			
		||||
        glUniform1ui(2, params.layer_stride);
 | 
			
		||||
        glUniform1ui(3, params.block_size);
 | 
			
		||||
        glUniform1ui(4, params.x_shift);
 | 
			
		||||
        glUniform1ui(5, params.block_height);
 | 
			
		||||
        glUniform1ui(6, params.block_height_mask);
 | 
			
		||||
 | 
			
		||||
        // ASTC texture data
 | 
			
		||||
        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
 | 
			
		||||
 
 | 
			
		||||
@@ -34,9 +34,8 @@ using Tegra::Texture::SWIZZLE_TABLE;
 | 
			
		||||
namespace {
 | 
			
		||||
 | 
			
		||||
constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0;
 | 
			
		||||
constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 1;
 | 
			
		||||
constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 2;
 | 
			
		||||
constexpr size_t ASTC_NUM_BINDINGS = 3;
 | 
			
		||||
constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 1;
 | 
			
		||||
constexpr size_t ASTC_NUM_BINDINGS = 2;
 | 
			
		||||
 | 
			
		||||
template <size_t size>
 | 
			
		||||
inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{
 | 
			
		||||
@@ -80,13 +79,6 @@ constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCR
 | 
			
		||||
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
 | 
			
		||||
        .pImmutableSamplers = nullptr,
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        .binding = ASTC_BINDING_SWIZZLE_BUFFER,
 | 
			
		||||
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
 | 
			
		||||
        .descriptorCount = 1,
 | 
			
		||||
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
 | 
			
		||||
        .pImmutableSamplers = nullptr,
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        .binding = ASTC_BINDING_OUTPUT_IMAGE,
 | 
			
		||||
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
 | 
			
		||||
@@ -98,12 +90,12 @@ constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCR
 | 
			
		||||
 | 
			
		||||
constexpr DescriptorBankInfo ASTC_BANK_INFO{
 | 
			
		||||
    .uniform_buffers = 0,
 | 
			
		||||
    .storage_buffers = 2,
 | 
			
		||||
    .storage_buffers = 1,
 | 
			
		||||
    .texture_buffers = 0,
 | 
			
		||||
    .image_buffers = 0,
 | 
			
		||||
    .textures = 0,
 | 
			
		||||
    .images = 1,
 | 
			
		||||
    .score = 3,
 | 
			
		||||
    .score = 2,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{
 | 
			
		||||
@@ -125,14 +117,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS>
 | 
			
		||||
            .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
 | 
			
		||||
            .stride = sizeof(DescriptorUpdateEntry),
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER,
 | 
			
		||||
            .dstArrayElement = 0,
 | 
			
		||||
            .descriptorCount = 1,
 | 
			
		||||
            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
 | 
			
		||||
            .offset = ASTC_BINDING_SWIZZLE_BUFFER * sizeof(DescriptorUpdateEntry),
 | 
			
		||||
            .stride = sizeof(DescriptorUpdateEntry),
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            .dstBinding = ASTC_BINDING_OUTPUT_IMAGE,
 | 
			
		||||
            .dstArrayElement = 0,
 | 
			
		||||
@@ -145,7 +129,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS>
 | 
			
		||||
 | 
			
		||||
struct AstcPushConstants {
 | 
			
		||||
    std::array<u32, 2> blocks_dims;
 | 
			
		||||
    u32 bytes_per_block_log2;
 | 
			
		||||
    u32 layer_stride;
 | 
			
		||||
    u32 block_size;
 | 
			
		||||
    u32 x_shift;
 | 
			
		||||
@@ -336,42 +319,6 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_,
 | 
			
		||||
 | 
			
		||||
ASTCDecoderPass::~ASTCDecoderPass() = default;
 | 
			
		||||
 | 
			
		||||
void ASTCDecoderPass::MakeDataBuffer() {
 | 
			
		||||
    constexpr size_t TOTAL_BUFFER_SIZE = sizeof(SWIZZLE_TABLE);
 | 
			
		||||
    data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
 | 
			
		||||
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
 | 
			
		||||
        .pNext = nullptr,
 | 
			
		||||
        .flags = 0,
 | 
			
		||||
        .size = TOTAL_BUFFER_SIZE,
 | 
			
		||||
        .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
 | 
			
		||||
        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
 | 
			
		||||
        .queueFamilyIndexCount = 0,
 | 
			
		||||
        .pQueueFamilyIndices = nullptr,
 | 
			
		||||
    });
 | 
			
		||||
    data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload);
 | 
			
		||||
 | 
			
		||||
    const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload);
 | 
			
		||||
    std::memcpy(staging_ref.mapped_span.data(), &SWIZZLE_TABLE, sizeof(SWIZZLE_TABLE));
 | 
			
		||||
 | 
			
		||||
    scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer,
 | 
			
		||||
                      TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) {
 | 
			
		||||
        static constexpr VkMemoryBarrier write_barrier{
 | 
			
		||||
            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
 | 
			
		||||
            .pNext = nullptr,
 | 
			
		||||
            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
 | 
			
		||||
            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
 | 
			
		||||
        };
 | 
			
		||||
        const VkBufferCopy copy{
 | 
			
		||||
            .srcOffset = offset,
 | 
			
		||||
            .dstOffset = 0,
 | 
			
		||||
            .size = TOTAL_BUFFER_SIZE,
 | 
			
		||||
        };
 | 
			
		||||
        cmdbuf.CopyBuffer(src, dst, copy);
 | 
			
		||||
        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
 | 
			
		||||
                               0, write_barrier);
 | 
			
		||||
    });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
 | 
			
		||||
                               std::span<const VideoCommon::SwizzleParameters> swizzles) {
 | 
			
		||||
    using namespace VideoCommon::Accelerated;
 | 
			
		||||
@@ -380,9 +327,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
 | 
			
		||||
        VideoCore::Surface::DefaultBlockHeight(image.info.format),
 | 
			
		||||
    };
 | 
			
		||||
    scheduler.RequestOutsideRenderPassOperationContext();
 | 
			
		||||
    if (!data_buffer) {
 | 
			
		||||
        MakeDataBuffer();
 | 
			
		||||
    }
 | 
			
		||||
    const VkPipeline vk_pipeline = *pipeline;
 | 
			
		||||
    const VkImageAspectFlags aspect_mask = image.AspectMask();
 | 
			
		||||
    const VkImage vk_image = image.Handle();
 | 
			
		||||
@@ -421,7 +365,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
 | 
			
		||||
        update_descriptor_queue.Acquire();
 | 
			
		||||
        update_descriptor_queue.AddBuffer(map.buffer, input_offset,
 | 
			
		||||
                                          image.guest_size_bytes - swizzle.buffer_offset);
 | 
			
		||||
        update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(SWIZZLE_TABLE));
 | 
			
		||||
        update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level));
 | 
			
		||||
        const void* const descriptor_data{update_descriptor_queue.UpdateData()};
 | 
			
		||||
 | 
			
		||||
@@ -429,11 +372,11 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
 | 
			
		||||
        const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
 | 
			
		||||
        ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0}));
 | 
			
		||||
        ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0}));
 | 
			
		||||
        ASSERT(params.bytes_per_block_log2 == 4);
 | 
			
		||||
        scheduler.Record([this, num_dispatches_x, num_dispatches_y, num_dispatches_z, block_dims,
 | 
			
		||||
                          params, descriptor_data](vk::CommandBuffer cmdbuf) {
 | 
			
		||||
            const AstcPushConstants uniforms{
 | 
			
		||||
                .blocks_dims = block_dims,
 | 
			
		||||
                .bytes_per_block_log2 = params.bytes_per_block_log2,
 | 
			
		||||
                .layer_stride = params.layer_stride,
 | 
			
		||||
                .block_size = params.block_size,
 | 
			
		||||
                .x_shift = params.x_shift,
 | 
			
		||||
 
 | 
			
		||||
@@ -96,15 +96,10 @@ public:
 | 
			
		||||
                  std::span<const VideoCommon::SwizzleParameters> swizzles);
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
    void MakeDataBuffer();
 | 
			
		||||
 | 
			
		||||
    VKScheduler& scheduler;
 | 
			
		||||
    StagingBufferPool& staging_buffer_pool;
 | 
			
		||||
    VKUpdateDescriptorQueue& update_descriptor_queue;
 | 
			
		||||
    MemoryAllocator& memory_allocator;
 | 
			
		||||
 | 
			
		||||
    vk::Buffer data_buffer;
 | 
			
		||||
    MemoryCommit data_buffer_commit;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
} // namespace Vulkan
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user