glasm: Use storage buffers instead of global memory when possible
This commit is contained in:
		@@ -195,7 +195,12 @@ void BufferCacheRuntime::BindComputeUniformBuffer(u32 binding_index, Buffer& buf
 | 
			
		||||
 | 
			
		||||
void BufferCacheRuntime::BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer,
 | 
			
		||||
                                           u32 offset, u32 size, bool is_written) {
 | 
			
		||||
    if (use_assembly_shaders) {
 | 
			
		||||
    if (use_storage_buffers) {
 | 
			
		||||
        const GLuint base_binding = graphics_base_storage_bindings[stage];
 | 
			
		||||
        const GLuint binding = base_binding + binding_index;
 | 
			
		||||
        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, buffer.Handle(),
 | 
			
		||||
                          static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
 | 
			
		||||
    } else {
 | 
			
		||||
        const BindlessSSBO ssbo{
 | 
			
		||||
            .address = buffer.HostGpuAddr() + offset,
 | 
			
		||||
            .length = static_cast<GLsizei>(size),
 | 
			
		||||
@@ -204,17 +209,19 @@ void BufferCacheRuntime::BindStorageBuffer(size_t stage, u32 binding_index, Buff
 | 
			
		||||
        buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY);
 | 
			
		||||
        glProgramLocalParametersI4uivNV(PROGRAM_LUT[stage], binding_index, 1,
 | 
			
		||||
                                        reinterpret_cast<const GLuint*>(&ssbo));
 | 
			
		||||
    } else {
 | 
			
		||||
        const GLuint base_binding = graphics_base_storage_bindings[stage];
 | 
			
		||||
        const GLuint binding = base_binding + binding_index;
 | 
			
		||||
        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, buffer.Handle(),
 | 
			
		||||
                          static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void BufferCacheRuntime::BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset,
 | 
			
		||||
                                                  u32 size, bool is_written) {
 | 
			
		||||
    if (use_assembly_shaders) {
 | 
			
		||||
    if (use_storage_buffers) {
 | 
			
		||||
        if (size != 0) {
 | 
			
		||||
            glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, buffer.Handle(),
 | 
			
		||||
                              static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
 | 
			
		||||
        } else {
 | 
			
		||||
            glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, 0, 0, 0);
 | 
			
		||||
        }
 | 
			
		||||
    } else {
 | 
			
		||||
        const BindlessSSBO ssbo{
 | 
			
		||||
            .address = buffer.HostGpuAddr() + offset,
 | 
			
		||||
            .length = static_cast<GLsizei>(size),
 | 
			
		||||
@@ -223,11 +230,6 @@ void BufferCacheRuntime::BindComputeStorageBuffer(u32 binding_index, Buffer& buf
 | 
			
		||||
        buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY);
 | 
			
		||||
        glProgramLocalParametersI4uivNV(GL_COMPUTE_PROGRAM_NV, binding_index, 1,
 | 
			
		||||
                                        reinterpret_cast<const GLuint*>(&ssbo));
 | 
			
		||||
    } else if (size == 0) {
 | 
			
		||||
        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, 0, 0, 0);
 | 
			
		||||
    } else {
 | 
			
		||||
        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, buffer.Handle(),
 | 
			
		||||
                          static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -147,6 +147,10 @@ public:
 | 
			
		||||
        image_handles = image_handles_;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    void SetEnableStorageBuffers(bool use_storage_buffers_) {
 | 
			
		||||
        use_storage_buffers = use_storage_buffers_;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
    static constexpr std::array PABO_LUT{
 | 
			
		||||
        GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV,          GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
 | 
			
		||||
@@ -160,6 +164,8 @@ private:
 | 
			
		||||
    bool use_assembly_shaders = false;
 | 
			
		||||
    bool has_unified_vertex_buffers = false;
 | 
			
		||||
 | 
			
		||||
    bool use_storage_buffers = false;
 | 
			
		||||
 | 
			
		||||
    u32 max_attributes = 0;
 | 
			
		||||
 | 
			
		||||
    std::array<GLuint, 5> graphics_base_uniform_bindings{};
 | 
			
		||||
 
 | 
			
		||||
@@ -17,6 +17,15 @@ using VideoCommon::ImageId;
 | 
			
		||||
constexpr u32 MAX_TEXTURES = 64;
 | 
			
		||||
constexpr u32 MAX_IMAGES = 16;
 | 
			
		||||
 | 
			
		||||
/// Returns the sum of the `count` member of every descriptor in `range`.
/// An empty range yields zero.
template <typename Range>
u32 AccumulateCount(const Range& range) {
    u32 total = 0;
    for (const auto& descriptor : range) {
        total += descriptor.count;
    }
    return total;
}
 | 
			
		||||
 | 
			
		||||
size_t ComputePipelineKey::Hash() const noexcept {
 | 
			
		||||
    return static_cast<size_t>(
 | 
			
		||||
        Common::CityHash64(reinterpret_cast<const char*>(this), sizeof *this));
 | 
			
		||||
@@ -26,31 +35,31 @@ bool ComputePipelineKey::operator==(const ComputePipelineKey& rhs) const noexcep
 | 
			
		||||
    return std::memcmp(this, &rhs, sizeof *this) == 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Constructs a compute pipeline and precomputes its descriptor counts.
///
/// Besides storing the references and programs it is given, the constructor:
///  - caches the number of texture buffers and image buffers described by `info_`,
///  - asserts that the total texture and image counts fit MAX_TEXTURES / MAX_IMAGES,
///  - decides whether storage buffers are used (`use_storage_buffers`), and
///  - records whether the pipeline writes global memory (`writes_global_memory`).
///
/// @param device            Queried for the GLASM storage buffer block limit.
/// @param info_             Shader info whose descriptor lists are counted.
/// @param source_program_   Program object moved into the pipeline.
/// @param assembly_program_ Assembly program moved into the pipeline; a non-zero handle
///                          marks the pipeline as GLASM.
ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cache_,
                                 BufferCache& buffer_cache_, Tegra::MemoryManager& gpu_memory_,
                                 Tegra::Engines::KeplerCompute& kepler_compute_,
                                 ProgramManager& program_manager_, const Shader::Info& info_,
                                 OGLProgram source_program_, OGLAssemblyProgram assembly_program_)
    : texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, gpu_memory{gpu_memory_},
      kepler_compute{kepler_compute_}, program_manager{program_manager_}, info{info_},
      source_program{std::move(source_program_)}, assembly_program{std::move(assembly_program_)} {
    num_texture_buffers = AccumulateCount(info.texture_buffer_descriptors);
    num_image_buffers = AccumulateCount(info.image_buffer_descriptors);

    // Texture buffers count against the texture limit together with regular textures.
    const u32 num_textures{num_texture_buffers + AccumulateCount(info.texture_descriptors)};
    ASSERT(num_textures <= MAX_TEXTURES);

    // Image buffers count against the image limit together with regular images.
    const u32 num_images{num_image_buffers + AccumulateCount(info.image_descriptors)};
    ASSERT(num_images <= MAX_IMAGES);

    const bool is_glasm{assembly_program.handle != 0};
    const u32 num_storage_buffers{AccumulateCount(info.storage_buffers_descriptors)};
    // The comparison is inclusive so it matches the `<=` test that
    // ShaderCache::CreateComputePipeline applies when it sets glasm_use_storage_buffers.
    // With a strict `<`, a shader using exactly the maximum number of blocks would be
    // emitted for storage buffers while this pipeline takes the other binding path.
    use_storage_buffers =
        !is_glasm || num_storage_buffers <= device.GetMaxGLASMStorageBufferBlocks();
    // Global memory writes are only reported when storage buffers are not in use.
    writes_global_memory = !use_storage_buffers &&
                           std::ranges::any_of(info.storage_buffers_descriptors,
                                               [](const auto& desc) { return desc.is_written; });
}
 | 
			
		||||
 | 
			
		||||
void ComputePipeline::Configure() {
 | 
			
		||||
@@ -150,6 +159,7 @@ void ComputePipeline::Configure() {
 | 
			
		||||
 | 
			
		||||
    buffer_cache.UpdateComputeBuffers();
 | 
			
		||||
 | 
			
		||||
    buffer_cache.runtime.SetEnableStorageBuffers(use_storage_buffers);
 | 
			
		||||
    buffer_cache.runtime.SetImagePointers(textures.data(), images.data());
 | 
			
		||||
    buffer_cache.BindHostComputeBuffers();
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -28,6 +28,7 @@ struct Info;
 | 
			
		||||
 | 
			
		||||
namespace OpenGL {
 | 
			
		||||
 | 
			
		||||
class Device;
 | 
			
		||||
class ProgramManager;
 | 
			
		||||
 | 
			
		||||
struct ComputePipelineKey {
 | 
			
		||||
@@ -49,14 +50,18 @@ static_assert(std::is_trivially_constructible_v<ComputePipelineKey>);
 | 
			
		||||
 | 
			
		||||
class ComputePipeline {
 | 
			
		||||
public:
 | 
			
		||||
    explicit ComputePipeline(TextureCache& texture_cache_, BufferCache& buffer_cache_,
 | 
			
		||||
                             Tegra::MemoryManager& gpu_memory_,
 | 
			
		||||
    explicit ComputePipeline(const Device& device, TextureCache& texture_cache_,
 | 
			
		||||
                             BufferCache& buffer_cache_, Tegra::MemoryManager& gpu_memory_,
 | 
			
		||||
                             Tegra::Engines::KeplerCompute& kepler_compute_,
 | 
			
		||||
                             ProgramManager& program_manager_, const Shader::Info& info_,
 | 
			
		||||
                             OGLProgram source_program_, OGLAssemblyProgram assembly_program_);
 | 
			
		||||
 | 
			
		||||
    void Configure();
 | 
			
		||||
 | 
			
		||||
    [[nodiscard]] bool WritesGlobalMemory() const noexcept {
 | 
			
		||||
        return writes_global_memory;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
    TextureCache& texture_cache;
 | 
			
		||||
    BufferCache& buffer_cache;
 | 
			
		||||
@@ -70,6 +75,9 @@ private:
 | 
			
		||||
 | 
			
		||||
    u32 num_texture_buffers{};
 | 
			
		||||
    u32 num_image_buffers{};
 | 
			
		||||
 | 
			
		||||
    bool use_storage_buffers{};
 | 
			
		||||
    bool writes_global_memory{};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
} // namespace OpenGL
 | 
			
		||||
 
 | 
			
		||||
@@ -135,13 +135,13 @@ Device::Device() {
 | 
			
		||||
            "Beta driver 443.24 is known to have issues. There might be performance issues.");
 | 
			
		||||
        disable_fast_buffer_sub_data = true;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    max_uniform_buffers = BuildMaxUniformBuffers();
 | 
			
		||||
    uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
 | 
			
		||||
    shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
 | 
			
		||||
    max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
 | 
			
		||||
    max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
 | 
			
		||||
    max_compute_shared_memory_size = GetInteger<u32>(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE);
 | 
			
		||||
    max_glasm_storage_buffer_blocks = GetInteger<u32>(GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS);
 | 
			
		||||
    has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group &&
 | 
			
		||||
                          GLAD_GL_NV_shader_thread_shuffle;
 | 
			
		||||
    has_shader_ballot = GLAD_GL_ARB_shader_ballot;
 | 
			
		||||
@@ -236,22 +236,6 @@ std::string Device::GetVendorName() const {
 | 
			
		||||
    return vendor_name;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Device::Device(std::nullptr_t) {
 | 
			
		||||
    max_uniform_buffers.fill(std::numeric_limits<u32>::max());
 | 
			
		||||
    uniform_buffer_alignment = 4;
 | 
			
		||||
    shader_storage_alignment = 4;
 | 
			
		||||
    max_vertex_attributes = 16;
 | 
			
		||||
    max_varyings = 15;
 | 
			
		||||
    max_compute_shared_memory_size = 0x10000;
 | 
			
		||||
    has_warp_intrinsics = true;
 | 
			
		||||
    has_shader_ballot = true;
 | 
			
		||||
    has_vertex_viewport_layer = true;
 | 
			
		||||
    has_image_load_formatted = true;
 | 
			
		||||
    has_texture_shadow_lod = true;
 | 
			
		||||
    has_variable_aoffi = true;
 | 
			
		||||
    has_depth_buffer_float = true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
bool Device::TestVariableAoffi() {
 | 
			
		||||
    return TestProgram(R"(#version 430 core
 | 
			
		||||
// This is a unit test, please ignore me on apitrace bug reports.
 | 
			
		||||
 
 | 
			
		||||
@@ -13,7 +13,6 @@ namespace OpenGL {
 | 
			
		||||
class Device {
 | 
			
		||||
public:
 | 
			
		||||
    explicit Device();
 | 
			
		||||
    explicit Device(std::nullptr_t);
 | 
			
		||||
 | 
			
		||||
    [[nodiscard]] std::string GetVendorName() const;
 | 
			
		||||
 | 
			
		||||
@@ -41,6 +40,10 @@ public:
 | 
			
		||||
        return max_compute_shared_memory_size;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Returns the storage buffer block limit stored in
    /// `max_glasm_storage_buffer_blocks`.
    u32 GetMaxGLASMStorageBufferBlocks() const {
        return this->max_glasm_storage_buffer_blocks;
    }
 | 
			
		||||
 | 
			
		||||
    bool HasWarpIntrinsics() const {
 | 
			
		||||
        return has_warp_intrinsics;
 | 
			
		||||
    }
 | 
			
		||||
@@ -124,6 +127,7 @@ private:
 | 
			
		||||
    u32 max_vertex_attributes{};
 | 
			
		||||
    u32 max_varyings{};
 | 
			
		||||
    u32 max_compute_shared_memory_size{};
 | 
			
		||||
    u32 max_glasm_storage_buffer_blocks{};
 | 
			
		||||
    bool has_warp_intrinsics{};
 | 
			
		||||
    bool has_shader_ballot{};
 | 
			
		||||
    bool has_vertex_viewport_layer{};
 | 
			
		||||
 
 | 
			
		||||
@@ -25,7 +25,7 @@ constexpr u32 MAX_TEXTURES = 64;
 | 
			
		||||
constexpr u32 MAX_IMAGES = 8;
 | 
			
		||||
 | 
			
		||||
template <typename Range>
 | 
			
		||||
u32 AccumulateCount(Range&& range) {
 | 
			
		||||
u32 AccumulateCount(const Range& range) {
 | 
			
		||||
    u32 num{};
 | 
			
		||||
    for (const auto& desc : range) {
 | 
			
		||||
        num += desc.count;
 | 
			
		||||
@@ -70,8 +70,8 @@ bool GraphicsPipelineKey::operator==(const GraphicsPipelineKey& rhs) const noexc
 | 
			
		||||
    return std::memcmp(this, &rhs, Size()) == 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
GraphicsPipeline::GraphicsPipeline(TextureCache& texture_cache_, BufferCache& buffer_cache_,
 | 
			
		||||
                                   Tegra::MemoryManager& gpu_memory_,
 | 
			
		||||
GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_cache_,
 | 
			
		||||
                                   BufferCache& buffer_cache_, Tegra::MemoryManager& gpu_memory_,
 | 
			
		||||
                                   Tegra::Engines::Maxwell3D& maxwell3d_,
 | 
			
		||||
                                   ProgramManager& program_manager_, StateTracker& state_tracker_,
 | 
			
		||||
                                   OGLProgram program_,
 | 
			
		||||
@@ -90,6 +90,7 @@ GraphicsPipeline::GraphicsPipeline(TextureCache& texture_cache_, BufferCache& bu
 | 
			
		||||
    }
 | 
			
		||||
    u32 num_textures{};
 | 
			
		||||
    u32 num_images{};
 | 
			
		||||
    u32 num_storage_buffers{};
 | 
			
		||||
    for (size_t stage = 0; stage < base_uniform_bindings.size(); ++stage) {
 | 
			
		||||
        const auto& info{stage_infos[stage]};
 | 
			
		||||
        if (stage < 4) {
 | 
			
		||||
@@ -109,11 +110,20 @@ GraphicsPipeline::GraphicsPipeline(TextureCache& texture_cache_, BufferCache& bu
 | 
			
		||||
 | 
			
		||||
        num_textures += AccumulateCount(info.texture_descriptors);
 | 
			
		||||
        num_images += AccumulateCount(info.image_descriptors);
 | 
			
		||||
        num_storage_buffers += AccumulateCount(info.storage_buffers_descriptors);
 | 
			
		||||
 | 
			
		||||
        writes_global_memory |= std::ranges::any_of(
 | 
			
		||||
            info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; });
 | 
			
		||||
    }
 | 
			
		||||
    ASSERT(num_textures <= MAX_TEXTURES);
 | 
			
		||||
    ASSERT(num_images <= MAX_IMAGES);
 | 
			
		||||
 | 
			
		||||
    if (assembly_programs[0].handle != 0 && xfb_state) {
 | 
			
		||||
    const bool assembly_shaders{assembly_programs[0].handle != 0};
 | 
			
		||||
    use_storage_buffers =
 | 
			
		||||
        !assembly_shaders || num_storage_buffers <= device.GetMaxGLASMStorageBufferBlocks();
 | 
			
		||||
    writes_global_memory &= !use_storage_buffers;
 | 
			
		||||
 | 
			
		||||
    if (assembly_shaders && xfb_state) {
 | 
			
		||||
        GenerateTransformFeedbackState(*xfb_state);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
@@ -137,6 +147,7 @@ void GraphicsPipeline::Configure(bool is_indexed) {
 | 
			
		||||
 | 
			
		||||
    buffer_cache.runtime.SetBaseUniformBindings(base_uniform_bindings);
 | 
			
		||||
    buffer_cache.runtime.SetBaseStorageBindings(base_storage_bindings);
 | 
			
		||||
    buffer_cache.runtime.SetEnableStorageBuffers(use_storage_buffers);
 | 
			
		||||
 | 
			
		||||
    const auto& regs{maxwell3d.regs};
 | 
			
		||||
    const bool via_header_index{regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex};
 | 
			
		||||
 
 | 
			
		||||
@@ -20,6 +20,7 @@
 | 
			
		||||
 | 
			
		||||
namespace OpenGL {
 | 
			
		||||
 | 
			
		||||
class Device;
 | 
			
		||||
class ProgramManager;
 | 
			
		||||
 | 
			
		||||
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 | 
			
		||||
@@ -60,8 +61,8 @@ static_assert(std::is_trivially_constructible_v<GraphicsPipelineKey>);
 | 
			
		||||
 | 
			
		||||
class GraphicsPipeline {
 | 
			
		||||
public:
 | 
			
		||||
    explicit GraphicsPipeline(TextureCache& texture_cache_, BufferCache& buffer_cache_,
 | 
			
		||||
                              Tegra::MemoryManager& gpu_memory_,
 | 
			
		||||
    explicit GraphicsPipeline(const Device& device, TextureCache& texture_cache_,
 | 
			
		||||
                              BufferCache& buffer_cache_, Tegra::MemoryManager& gpu_memory_,
 | 
			
		||||
                              Tegra::Engines::Maxwell3D& maxwell3d_,
 | 
			
		||||
                              ProgramManager& program_manager_, StateTracker& state_tracker_,
 | 
			
		||||
                              OGLProgram program_,
 | 
			
		||||
@@ -77,6 +78,10 @@ public:
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    [[nodiscard]] bool WritesGlobalMemory() const noexcept {
 | 
			
		||||
        return writes_global_memory;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
    void GenerateTransformFeedbackState(const VideoCommon::TransformFeedbackState& xfb_state);
 | 
			
		||||
 | 
			
		||||
@@ -99,6 +104,9 @@ private:
 | 
			
		||||
    std::array<u32, 5> num_texture_buffers{};
 | 
			
		||||
    std::array<u32, 5> num_image_buffers{};
 | 
			
		||||
 | 
			
		||||
    bool use_storage_buffers{};
 | 
			
		||||
    bool writes_global_memory{};
 | 
			
		||||
 | 
			
		||||
    static constexpr std::size_t XFB_ENTRY_STRIDE = 3;
 | 
			
		||||
    GLsizei num_xfb_attribs{};
 | 
			
		||||
    GLsizei num_xfb_strides{};
 | 
			
		||||
 
 | 
			
		||||
@@ -268,19 +268,21 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
 | 
			
		||||
    EndTransformFeedback();
 | 
			
		||||
 | 
			
		||||
    ++num_queued_commands;
 | 
			
		||||
    has_written_global_memory |= pipeline->WritesGlobalMemory();
 | 
			
		||||
 | 
			
		||||
    gpu.TickWork();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void RasterizerOpenGL::DispatchCompute() {
 | 
			
		||||
    ComputePipeline* const program{shader_cache.CurrentComputePipeline()};
 | 
			
		||||
    if (!program) {
 | 
			
		||||
    ComputePipeline* const pipeline{shader_cache.CurrentComputePipeline()};
 | 
			
		||||
    if (!pipeline) {
 | 
			
		||||
        return;
 | 
			
		||||
    }
 | 
			
		||||
    program->Configure();
 | 
			
		||||
    pipeline->Configure();
 | 
			
		||||
    const auto& qmd{kepler_compute.launch_description};
 | 
			
		||||
    glDispatchCompute(qmd.grid_dim_x, qmd.grid_dim_y, qmd.grid_dim_z);
 | 
			
		||||
    ++num_queued_commands;
 | 
			
		||||
    has_written_global_memory |= pipeline->WritesGlobalMemory();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) {
 | 
			
		||||
@@ -449,9 +451,8 @@ void RasterizerOpenGL::FlushCommands() {
 | 
			
		||||
 | 
			
		||||
    // Make sure memory stored from the previous GL command stream is visible
 | 
			
		||||
    // This is only needed on assembly shaders where we write to GPU memory with raw pointers
 | 
			
		||||
    // TODO: Call this only when NV_shader_buffer_load or NV_shader_buffer_store have been used
 | 
			
		||||
    //       and prefer using NV_shader_storage_buffer_object when possible
 | 
			
		||||
    if (Settings::values.use_assembly_shaders.GetValue()) {
 | 
			
		||||
    if (has_written_global_memory) {
 | 
			
		||||
        has_written_global_memory = false;
 | 
			
		||||
        glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
 | 
			
		||||
    }
 | 
			
		||||
    glFlush();
 | 
			
		||||
 
 | 
			
		||||
@@ -225,7 +225,8 @@ private:
 | 
			
		||||
    std::array<GLuint, MAX_IMAGES> image_handles{};
 | 
			
		||||
 | 
			
		||||
    /// Number of commands queued to the OpenGL driver. Reset on flush.
 | 
			
		||||
    std::size_t num_queued_commands = 0;
 | 
			
		||||
    size_t num_queued_commands = 0;
 | 
			
		||||
    bool has_written_global_memory = false;
 | 
			
		||||
 | 
			
		||||
    u32 last_clip_distance_mask = 0;
 | 
			
		||||
};
 | 
			
		||||
 
 | 
			
		||||
@@ -157,7 +157,8 @@ GLenum AssemblyStage(size_t stage_index) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Shader::RuntimeInfo MakeRuntimeInfo(const GraphicsPipelineKey& key,
 | 
			
		||||
                                    const Shader::IR::Program& program) {
 | 
			
		||||
                                    const Shader::IR::Program& program,
 | 
			
		||||
                                    bool glasm_use_storage_buffers) {
 | 
			
		||||
    Shader::RuntimeInfo info;
 | 
			
		||||
    switch (program.stage) {
 | 
			
		||||
    case Shader::Stage::TessellationEval:
 | 
			
		||||
@@ -220,6 +221,7 @@ Shader::RuntimeInfo MakeRuntimeInfo(const GraphicsPipelineKey& key,
 | 
			
		||||
        info.input_topology = Shader::InputTopology::TrianglesAdjacency;
 | 
			
		||||
        break;
 | 
			
		||||
    }
 | 
			
		||||
    info.glasm_use_storage_buffers = glasm_use_storage_buffers;
 | 
			
		||||
    return info;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -435,7 +437,8 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
 | 
			
		||||
    ShaderPools& pools, const GraphicsPipelineKey& key, std::span<Shader::Environment* const> envs,
 | 
			
		||||
    bool build_in_parallel) {
 | 
			
		||||
    LOG_INFO(Render_OpenGL, "0x{:016x}", key.Hash());
 | 
			
		||||
    size_t env_index{0};
 | 
			
		||||
    size_t env_index{};
 | 
			
		||||
    u32 total_storage_buffers{};
 | 
			
		||||
    std::array<Shader::IR::Program, Maxwell::MaxShaderProgram> programs;
 | 
			
		||||
    for (size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
 | 
			
		||||
        if (key.unique_hashes[index] == 0) {
 | 
			
		||||
@@ -447,7 +450,14 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
 | 
			
		||||
        const u32 cfg_offset{static_cast<u32>(env.StartAddress() + sizeof(Shader::ProgramHeader))};
 | 
			
		||||
        Shader::Maxwell::Flow::CFG cfg(env, pools.flow_block, cfg_offset);
 | 
			
		||||
        programs[index] = TranslateProgram(pools.inst, pools.block, env, cfg);
 | 
			
		||||
 | 
			
		||||
        for (const auto& desc : programs[index].info.storage_buffers_descriptors) {
 | 
			
		||||
            total_storage_buffers += desc.count;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    const u32 glasm_storage_buffer_limit{device.GetMaxGLASMStorageBufferBlocks()};
 | 
			
		||||
    const bool glasm_use_storage_buffers{total_storage_buffers <= glasm_storage_buffer_limit};
 | 
			
		||||
 | 
			
		||||
    std::array<const Shader::Info*, Maxwell::MaxShaderStage> infos{};
 | 
			
		||||
 | 
			
		||||
    OGLProgram source_program;
 | 
			
		||||
@@ -466,7 +476,7 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
 | 
			
		||||
        const size_t stage_index{index - 1};
 | 
			
		||||
        infos[stage_index] = &program.info;
 | 
			
		||||
 | 
			
		||||
        const Shader::RuntimeInfo runtime_info{MakeRuntimeInfo(key, program)};
 | 
			
		||||
        const auto runtime_info{MakeRuntimeInfo(key, program, glasm_use_storage_buffers)};
 | 
			
		||||
        if (device.UseAssemblyShaders()) {
 | 
			
		||||
            const std::string code{EmitGLASM(profile, runtime_info, program, binding)};
 | 
			
		||||
            assembly_programs[stage_index] = CompileProgram(code, AssemblyStage(stage_index));
 | 
			
		||||
@@ -479,7 +489,7 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
 | 
			
		||||
        LinkProgram(source_program.handle);
 | 
			
		||||
    }
 | 
			
		||||
    return std::make_unique<GraphicsPipeline>(
 | 
			
		||||
        texture_cache, buffer_cache, gpu_memory, maxwell3d, program_manager, state_tracker,
 | 
			
		||||
        device, texture_cache, buffer_cache, gpu_memory, maxwell3d, program_manager, state_tracker,
 | 
			
		||||
        std::move(source_program), std::move(assembly_programs), infos,
 | 
			
		||||
        key.xfb_enabled != 0 ? &key.xfb_state : nullptr);
 | 
			
		||||
}
 | 
			
		||||
@@ -508,10 +518,18 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline(ShaderPools&
 | 
			
		||||
 | 
			
		||||
    Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()};
 | 
			
		||||
    Shader::IR::Program program{TranslateProgram(pools.inst, pools.block, env, cfg)};
 | 
			
		||||
 | 
			
		||||
    u32 num_storage_buffers{};
 | 
			
		||||
    for (const auto& desc : program.info.storage_buffers_descriptors) {
 | 
			
		||||
        num_storage_buffers += desc.count;
 | 
			
		||||
    }
 | 
			
		||||
    Shader::RuntimeInfo info;
 | 
			
		||||
    info.glasm_use_storage_buffers = num_storage_buffers <= device.GetMaxGLASMStorageBufferBlocks();
 | 
			
		||||
 | 
			
		||||
    OGLAssemblyProgram asm_program;
 | 
			
		||||
    OGLProgram source_program;
 | 
			
		||||
    if (device.UseAssemblyShaders()) {
 | 
			
		||||
        const std::string code{EmitGLASM(profile, program)};
 | 
			
		||||
        const std::string code{EmitGLASM(profile, info, program)};
 | 
			
		||||
        asm_program = CompileProgram(code, GL_COMPUTE_PROGRAM_NV);
 | 
			
		||||
    } else {
 | 
			
		||||
        const std::vector<u32> code{EmitSPIRV(profile, program)};
 | 
			
		||||
@@ -519,7 +537,7 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline(ShaderPools&
 | 
			
		||||
        AddShader(GL_COMPUTE_SHADER, source_program.handle, code);
 | 
			
		||||
        LinkProgram(source_program.handle);
 | 
			
		||||
    }
 | 
			
		||||
    return std::make_unique<ComputePipeline>(texture_cache, buffer_cache, gpu_memory,
 | 
			
		||||
    return std::make_unique<ComputePipeline>(device, texture_cache, buffer_cache, gpu_memory,
 | 
			
		||||
                                             kepler_compute, program_manager, program.info,
 | 
			
		||||
                                             std::move(source_program), std::move(asm_program));
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user