mirror of
				https://git.suyu.dev/suyu/suyu
				synced 2025-11-03 16:39:01 -06:00 
			
		
		
		
	Merge pull request #10469 from Kelebek1/bc_state
Move buffer bindings to per-channel state
This commit is contained in:
		@@ -2,6 +2,8 @@
 | 
			
		||||
// SPDX-License-Identifier: GPL-3.0-or-later
 | 
			
		||||
 | 
			
		||||
#include "common/microprofile.h"
 | 
			
		||||
#include "video_core/buffer_cache/buffer_cache_base.h"
 | 
			
		||||
#include "video_core/control/channel_state_cache.inc"
 | 
			
		||||
 | 
			
		||||
namespace VideoCommon {
 | 
			
		||||
 | 
			
		||||
@@ -9,4 +11,6 @@ MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 12
 | 
			
		||||
MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128));
 | 
			
		||||
MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128));
 | 
			
		||||
 | 
			
		||||
template class VideoCommon::ChannelSetupCaches<VideoCommon::BufferCacheChannelInfo>;
 | 
			
		||||
 | 
			
		||||
} // namespace VideoCommon
 | 
			
		||||
 
 | 
			
		||||
@@ -64,17 +64,22 @@ void BufferCache<P>::RunGarbageCollector() {
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::TickFrame() {
 | 
			
		||||
    // Calculate hits and shots and move hit bits to the right
 | 
			
		||||
    const u32 hits = std::reduce(uniform_cache_hits.begin(), uniform_cache_hits.end());
 | 
			
		||||
    const u32 shots = std::reduce(uniform_cache_shots.begin(), uniform_cache_shots.end());
 | 
			
		||||
    std::copy_n(uniform_cache_hits.begin(), uniform_cache_hits.size() - 1,
 | 
			
		||||
                uniform_cache_hits.begin() + 1);
 | 
			
		||||
    std::copy_n(uniform_cache_shots.begin(), uniform_cache_shots.size() - 1,
 | 
			
		||||
                uniform_cache_shots.begin() + 1);
 | 
			
		||||
    uniform_cache_hits[0] = 0;
 | 
			
		||||
    uniform_cache_shots[0] = 0;
 | 
			
		||||
 | 
			
		||||
    const u32 hits = std::reduce(channel_state->uniform_cache_hits.begin(),
 | 
			
		||||
                                 channel_state->uniform_cache_hits.end());
 | 
			
		||||
    const u32 shots = std::reduce(channel_state->uniform_cache_shots.begin(),
 | 
			
		||||
                                  channel_state->uniform_cache_shots.end());
 | 
			
		||||
    std::copy_n(channel_state->uniform_cache_hits.begin(),
 | 
			
		||||
                channel_state->uniform_cache_hits.size() - 1,
 | 
			
		||||
                channel_state->uniform_cache_hits.begin() + 1);
 | 
			
		||||
    std::copy_n(channel_state->uniform_cache_shots.begin(),
 | 
			
		||||
                channel_state->uniform_cache_shots.size() - 1,
 | 
			
		||||
                channel_state->uniform_cache_shots.begin() + 1);
 | 
			
		||||
    channel_state->uniform_cache_hits[0] = 0;
 | 
			
		||||
    channel_state->uniform_cache_shots[0] = 0;
 | 
			
		||||
 | 
			
		||||
    const bool skip_preferred = hits * 256 < shots * 251;
 | 
			
		||||
    uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;
 | 
			
		||||
    channel_state->uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;
 | 
			
		||||
 | 
			
		||||
    // If we can obtain the memory info, use it instead of the estimate.
 | 
			
		||||
    if (runtime.CanReportMemoryUsage()) {
 | 
			
		||||
@@ -164,10 +169,10 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
 | 
			
		||||
    BufferId buffer_a;
 | 
			
		||||
    BufferId buffer_b;
 | 
			
		||||
    do {
 | 
			
		||||
        has_deleted_buffers = false;
 | 
			
		||||
        channel_state->has_deleted_buffers = false;
 | 
			
		||||
        buffer_a = FindBuffer(*cpu_src_address, static_cast<u32>(amount));
 | 
			
		||||
        buffer_b = FindBuffer(*cpu_dest_address, static_cast<u32>(amount));
 | 
			
		||||
    } while (has_deleted_buffers);
 | 
			
		||||
    } while (channel_state->has_deleted_buffers);
 | 
			
		||||
    auto& src_buffer = slot_buffers[buffer_a];
 | 
			
		||||
    auto& dest_buffer = slot_buffers[buffer_b];
 | 
			
		||||
    SynchronizeBuffer(src_buffer, *cpu_src_address, static_cast<u32>(amount));
 | 
			
		||||
@@ -272,30 +277,30 @@ void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr
 | 
			
		||||
        .size = size,
 | 
			
		||||
        .buffer_id = BufferId{},
 | 
			
		||||
    };
 | 
			
		||||
    uniform_buffers[stage][index] = binding;
 | 
			
		||||
    channel_state->uniform_buffers[stage][index] = binding;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::DisableGraphicsUniformBuffer(size_t stage, u32 index) {
 | 
			
		||||
    uniform_buffers[stage][index] = NULL_BINDING;
 | 
			
		||||
    channel_state->uniform_buffers[stage][index] = NULL_BINDING;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UpdateGraphicsBuffers(bool is_indexed) {
 | 
			
		||||
    MICROPROFILE_SCOPE(GPU_PrepareBuffers);
 | 
			
		||||
    do {
 | 
			
		||||
        has_deleted_buffers = false;
 | 
			
		||||
        channel_state->has_deleted_buffers = false;
 | 
			
		||||
        DoUpdateGraphicsBuffers(is_indexed);
 | 
			
		||||
    } while (has_deleted_buffers);
 | 
			
		||||
    } while (channel_state->has_deleted_buffers);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UpdateComputeBuffers() {
 | 
			
		||||
    MICROPROFILE_SCOPE(GPU_PrepareBuffers);
 | 
			
		||||
    do {
 | 
			
		||||
        has_deleted_buffers = false;
 | 
			
		||||
        channel_state->has_deleted_buffers = false;
 | 
			
		||||
        DoUpdateComputeBuffers();
 | 
			
		||||
    } while (has_deleted_buffers);
 | 
			
		||||
    } while (channel_state->has_deleted_buffers);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
@@ -338,98 +343,102 @@ template <class P>
 | 
			
		||||
void BufferCache<P>::SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask,
 | 
			
		||||
                                            const UniformBufferSizes* sizes) {
 | 
			
		||||
    if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
 | 
			
		||||
        if (enabled_uniform_buffer_masks != mask) {
 | 
			
		||||
        if (channel_state->enabled_uniform_buffer_masks != mask) {
 | 
			
		||||
            if constexpr (IS_OPENGL) {
 | 
			
		||||
                fast_bound_uniform_buffers.fill(0);
 | 
			
		||||
                channel_state->fast_bound_uniform_buffers.fill(0);
 | 
			
		||||
            }
 | 
			
		||||
            dirty_uniform_buffers.fill(~u32{0});
 | 
			
		||||
            uniform_buffer_binding_sizes.fill({});
 | 
			
		||||
            channel_state->dirty_uniform_buffers.fill(~u32{0});
 | 
			
		||||
            channel_state->uniform_buffer_binding_sizes.fill({});
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    enabled_uniform_buffer_masks = mask;
 | 
			
		||||
    uniform_buffer_sizes = sizes;
 | 
			
		||||
    channel_state->enabled_uniform_buffer_masks = mask;
 | 
			
		||||
    channel_state->uniform_buffer_sizes = sizes;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::SetComputeUniformBufferState(u32 mask,
 | 
			
		||||
                                                  const ComputeUniformBufferSizes* sizes) {
 | 
			
		||||
    enabled_compute_uniform_buffer_mask = mask;
 | 
			
		||||
    compute_uniform_buffer_sizes = sizes;
 | 
			
		||||
    channel_state->enabled_compute_uniform_buffer_mask = mask;
 | 
			
		||||
    channel_state->compute_uniform_buffer_sizes = sizes;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UnbindGraphicsStorageBuffers(size_t stage) {
 | 
			
		||||
    enabled_storage_buffers[stage] = 0;
 | 
			
		||||
    written_storage_buffers[stage] = 0;
 | 
			
		||||
    channel_state->enabled_storage_buffers[stage] = 0;
 | 
			
		||||
    channel_state->written_storage_buffers[stage] = 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index,
 | 
			
		||||
                                               u32 cbuf_offset, bool is_written) {
 | 
			
		||||
    enabled_storage_buffers[stage] |= 1U << ssbo_index;
 | 
			
		||||
    written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index;
 | 
			
		||||
    channel_state->enabled_storage_buffers[stage] |= 1U << ssbo_index;
 | 
			
		||||
    channel_state->written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index;
 | 
			
		||||
 | 
			
		||||
    const auto& cbufs = maxwell3d->state.shader_stages[stage];
 | 
			
		||||
    const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset;
 | 
			
		||||
    storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr, cbuf_index, is_written);
 | 
			
		||||
    channel_state->storage_buffers[stage][ssbo_index] =
 | 
			
		||||
        StorageBufferBinding(ssbo_addr, cbuf_index, is_written);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UnbindGraphicsTextureBuffers(size_t stage) {
 | 
			
		||||
    enabled_texture_buffers[stage] = 0;
 | 
			
		||||
    written_texture_buffers[stage] = 0;
 | 
			
		||||
    image_texture_buffers[stage] = 0;
 | 
			
		||||
    channel_state->enabled_texture_buffers[stage] = 0;
 | 
			
		||||
    channel_state->written_texture_buffers[stage] = 0;
 | 
			
		||||
    channel_state->image_texture_buffers[stage] = 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr,
 | 
			
		||||
                                               u32 size, PixelFormat format, bool is_written,
 | 
			
		||||
                                               bool is_image) {
 | 
			
		||||
    enabled_texture_buffers[stage] |= 1U << tbo_index;
 | 
			
		||||
    written_texture_buffers[stage] |= (is_written ? 1U : 0U) << tbo_index;
 | 
			
		||||
    channel_state->enabled_texture_buffers[stage] |= 1U << tbo_index;
 | 
			
		||||
    channel_state->written_texture_buffers[stage] |= (is_written ? 1U : 0U) << tbo_index;
 | 
			
		||||
    if constexpr (SEPARATE_IMAGE_BUFFERS_BINDINGS) {
 | 
			
		||||
        image_texture_buffers[stage] |= (is_image ? 1U : 0U) << tbo_index;
 | 
			
		||||
        channel_state->image_texture_buffers[stage] |= (is_image ? 1U : 0U) << tbo_index;
 | 
			
		||||
    }
 | 
			
		||||
    texture_buffers[stage][tbo_index] = GetTextureBufferBinding(gpu_addr, size, format);
 | 
			
		||||
    channel_state->texture_buffers[stage][tbo_index] =
 | 
			
		||||
        GetTextureBufferBinding(gpu_addr, size, format);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UnbindComputeStorageBuffers() {
 | 
			
		||||
    enabled_compute_storage_buffers = 0;
 | 
			
		||||
    written_compute_storage_buffers = 0;
 | 
			
		||||
    image_compute_texture_buffers = 0;
 | 
			
		||||
    channel_state->enabled_compute_storage_buffers = 0;
 | 
			
		||||
    channel_state->written_compute_storage_buffers = 0;
 | 
			
		||||
    channel_state->image_compute_texture_buffers = 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
 | 
			
		||||
                                              bool is_written) {
 | 
			
		||||
    enabled_compute_storage_buffers |= 1U << ssbo_index;
 | 
			
		||||
    written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index;
 | 
			
		||||
    channel_state->enabled_compute_storage_buffers |= 1U << ssbo_index;
 | 
			
		||||
    channel_state->written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index;
 | 
			
		||||
 | 
			
		||||
    const auto& launch_desc = kepler_compute->launch_description;
 | 
			
		||||
    ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0);
 | 
			
		||||
 | 
			
		||||
    const auto& cbufs = launch_desc.const_buffer_config;
 | 
			
		||||
    const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset;
 | 
			
		||||
    compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr, cbuf_index, is_written);
 | 
			
		||||
    channel_state->compute_storage_buffers[ssbo_index] =
 | 
			
		||||
        StorageBufferBinding(ssbo_addr, cbuf_index, is_written);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UnbindComputeTextureBuffers() {
 | 
			
		||||
    enabled_compute_texture_buffers = 0;
 | 
			
		||||
    written_compute_texture_buffers = 0;
 | 
			
		||||
    image_compute_texture_buffers = 0;
 | 
			
		||||
    channel_state->enabled_compute_texture_buffers = 0;
 | 
			
		||||
    channel_state->written_compute_texture_buffers = 0;
 | 
			
		||||
    channel_state->image_compute_texture_buffers = 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size,
 | 
			
		||||
                                              PixelFormat format, bool is_written, bool is_image) {
 | 
			
		||||
    enabled_compute_texture_buffers |= 1U << tbo_index;
 | 
			
		||||
    written_compute_texture_buffers |= (is_written ? 1U : 0U) << tbo_index;
 | 
			
		||||
    channel_state->enabled_compute_texture_buffers |= 1U << tbo_index;
 | 
			
		||||
    channel_state->written_compute_texture_buffers |= (is_written ? 1U : 0U) << tbo_index;
 | 
			
		||||
    if constexpr (SEPARATE_IMAGE_BUFFERS_BINDINGS) {
 | 
			
		||||
        image_compute_texture_buffers |= (is_image ? 1U : 0U) << tbo_index;
 | 
			
		||||
        channel_state->image_compute_texture_buffers |= (is_image ? 1U : 0U) << tbo_index;
 | 
			
		||||
    }
 | 
			
		||||
    compute_texture_buffers[tbo_index] = GetTextureBufferBinding(gpu_addr, size, format);
 | 
			
		||||
    channel_state->compute_texture_buffers[tbo_index] =
 | 
			
		||||
        GetTextureBufferBinding(gpu_addr, size, format);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
@@ -672,10 +681,10 @@ bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) {
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::BindHostIndexBuffer() {
 | 
			
		||||
    Buffer& buffer = slot_buffers[index_buffer.buffer_id];
 | 
			
		||||
    TouchBuffer(buffer, index_buffer.buffer_id);
 | 
			
		||||
    const u32 offset = buffer.Offset(index_buffer.cpu_addr);
 | 
			
		||||
    const u32 size = index_buffer.size;
 | 
			
		||||
    Buffer& buffer = slot_buffers[channel_state->index_buffer.buffer_id];
 | 
			
		||||
    TouchBuffer(buffer, channel_state->index_buffer.buffer_id);
 | 
			
		||||
    const u32 offset = buffer.Offset(channel_state->index_buffer.cpu_addr);
 | 
			
		||||
    const u32 size = channel_state->index_buffer.size;
 | 
			
		||||
    const auto& draw_state = maxwell3d->draw_manager->GetDrawState();
 | 
			
		||||
    if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] {
 | 
			
		||||
        if constexpr (USE_MEMORY_MAPS) {
 | 
			
		||||
@@ -689,7 +698,7 @@ void BufferCache<P>::BindHostIndexBuffer() {
 | 
			
		||||
            buffer.ImmediateUpload(0, draw_state.inline_index_draw_indexes);
 | 
			
		||||
        }
 | 
			
		||||
    } else {
 | 
			
		||||
        SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
 | 
			
		||||
        SynchronizeBuffer(buffer, channel_state->index_buffer.cpu_addr, size);
 | 
			
		||||
    }
 | 
			
		||||
    if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
 | 
			
		||||
        const u32 new_offset =
 | 
			
		||||
@@ -706,7 +715,7 @@ template <class P>
 | 
			
		||||
void BufferCache<P>::BindHostVertexBuffers() {
 | 
			
		||||
    auto& flags = maxwell3d->dirty.flags;
 | 
			
		||||
    for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
 | 
			
		||||
        const Binding& binding = vertex_buffers[index];
 | 
			
		||||
        const Binding& binding = channel_state->vertex_buffers[index];
 | 
			
		||||
        Buffer& buffer = slot_buffers[binding.buffer_id];
 | 
			
		||||
        TouchBuffer(buffer, binding.buffer_id);
 | 
			
		||||
        SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
 | 
			
		||||
@@ -729,19 +738,19 @@ void BufferCache<P>::BindHostDrawIndirectBuffers() {
 | 
			
		||||
        SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
 | 
			
		||||
    };
 | 
			
		||||
    if (current_draw_indirect->include_count) {
 | 
			
		||||
        bind_buffer(count_buffer_binding);
 | 
			
		||||
        bind_buffer(channel_state->count_buffer_binding);
 | 
			
		||||
    }
 | 
			
		||||
    bind_buffer(indirect_buffer_binding);
 | 
			
		||||
    bind_buffer(channel_state->indirect_buffer_binding);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
 | 
			
		||||
    u32 dirty = ~0U;
 | 
			
		||||
    if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
 | 
			
		||||
        dirty = std::exchange(dirty_uniform_buffers[stage], 0);
 | 
			
		||||
        dirty = std::exchange(channel_state->dirty_uniform_buffers[stage], 0);
 | 
			
		||||
    }
 | 
			
		||||
    u32 binding_index = 0;
 | 
			
		||||
    ForEachEnabledBit(enabled_uniform_buffer_masks[stage], [&](u32 index) {
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_uniform_buffer_masks[stage], [&](u32 index) {
 | 
			
		||||
        const bool needs_bind = ((dirty >> index) & 1) != 0;
 | 
			
		||||
        BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind);
 | 
			
		||||
        if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
 | 
			
		||||
@@ -753,13 +762,13 @@ void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index,
 | 
			
		||||
                                                   bool needs_bind) {
 | 
			
		||||
    const Binding& binding = uniform_buffers[stage][index];
 | 
			
		||||
    const Binding& binding = channel_state->uniform_buffers[stage][index];
 | 
			
		||||
    const VAddr cpu_addr = binding.cpu_addr;
 | 
			
		||||
    const u32 size = std::min(binding.size, (*uniform_buffer_sizes)[stage][index]);
 | 
			
		||||
    const u32 size = std::min(binding.size, (*channel_state->uniform_buffer_sizes)[stage][index]);
 | 
			
		||||
    Buffer& buffer = slot_buffers[binding.buffer_id];
 | 
			
		||||
    TouchBuffer(buffer, binding.buffer_id);
 | 
			
		||||
    const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID &&
 | 
			
		||||
                                 size <= uniform_buffer_skip_cache_size &&
 | 
			
		||||
                                 size <= channel_state->uniform_buffer_skip_cache_size &&
 | 
			
		||||
                                 !memory_tracker.IsRegionGpuModified(cpu_addr, size);
 | 
			
		||||
    if (use_fast_buffer) {
 | 
			
		||||
        if constexpr (IS_OPENGL) {
 | 
			
		||||
@@ -767,11 +776,11 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
 | 
			
		||||
                // Fast path for Nvidia
 | 
			
		||||
                const bool should_fast_bind =
 | 
			
		||||
                    !HasFastUniformBufferBound(stage, binding_index) ||
 | 
			
		||||
                    uniform_buffer_binding_sizes[stage][binding_index] != size;
 | 
			
		||||
                    channel_state->uniform_buffer_binding_sizes[stage][binding_index] != size;
 | 
			
		||||
                if (should_fast_bind) {
 | 
			
		||||
                    // We only have to bind when the currently bound buffer is not the fast version
 | 
			
		||||
                    fast_bound_uniform_buffers[stage] |= 1U << binding_index;
 | 
			
		||||
                    uniform_buffer_binding_sizes[stage][binding_index] = size;
 | 
			
		||||
                    channel_state->fast_bound_uniform_buffers[stage] |= 1U << binding_index;
 | 
			
		||||
                    channel_state->uniform_buffer_binding_sizes[stage][binding_index] = size;
 | 
			
		||||
                    runtime.BindFastUniformBuffer(stage, binding_index, size);
 | 
			
		||||
                }
 | 
			
		||||
                const auto span = ImmediateBufferWithData(cpu_addr, size);
 | 
			
		||||
@@ -780,8 +789,8 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        if constexpr (IS_OPENGL) {
 | 
			
		||||
            fast_bound_uniform_buffers[stage] |= 1U << binding_index;
 | 
			
		||||
            uniform_buffer_binding_sizes[stage][binding_index] = size;
 | 
			
		||||
            channel_state->fast_bound_uniform_buffers[stage] |= 1U << binding_index;
 | 
			
		||||
            channel_state->uniform_buffer_binding_sizes[stage][binding_index] = size;
 | 
			
		||||
        }
 | 
			
		||||
        // Stream buffer path to avoid stalling on non-Nvidia drivers or Vulkan
 | 
			
		||||
        const std::span<u8> span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
 | 
			
		||||
@@ -791,15 +800,15 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
 | 
			
		||||
    // Classic cached path
 | 
			
		||||
    const bool sync_cached = SynchronizeBuffer(buffer, cpu_addr, size);
 | 
			
		||||
    if (sync_cached) {
 | 
			
		||||
        ++uniform_cache_hits[0];
 | 
			
		||||
        ++channel_state->uniform_cache_hits[0];
 | 
			
		||||
    }
 | 
			
		||||
    ++uniform_cache_shots[0];
 | 
			
		||||
    ++channel_state->uniform_cache_shots[0];
 | 
			
		||||
 | 
			
		||||
    // Skip binding if it's not needed and if the bound buffer is not the fast version
 | 
			
		||||
    // This exists to avoid instances where the fast buffer is bound and a GPU write happens
 | 
			
		||||
    needs_bind |= HasFastUniformBufferBound(stage, binding_index);
 | 
			
		||||
    if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
 | 
			
		||||
        needs_bind |= uniform_buffer_binding_sizes[stage][binding_index] != size;
 | 
			
		||||
        needs_bind |= channel_state->uniform_buffer_binding_sizes[stage][binding_index] != size;
 | 
			
		||||
    }
 | 
			
		||||
    if (!needs_bind) {
 | 
			
		||||
        return;
 | 
			
		||||
@@ -807,14 +816,14 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
 | 
			
		||||
    const u32 offset = buffer.Offset(cpu_addr);
 | 
			
		||||
    if constexpr (IS_OPENGL) {
 | 
			
		||||
        // Fast buffer will be unbound
 | 
			
		||||
        fast_bound_uniform_buffers[stage] &= ~(1U << binding_index);
 | 
			
		||||
        channel_state->fast_bound_uniform_buffers[stage] &= ~(1U << binding_index);
 | 
			
		||||
 | 
			
		||||
        // Mark the index as dirty if offset doesn't match
 | 
			
		||||
        const bool is_copy_bind = offset != 0 && !runtime.SupportsNonZeroUniformOffset();
 | 
			
		||||
        dirty_uniform_buffers[stage] |= (is_copy_bind ? 1U : 0U) << index;
 | 
			
		||||
        channel_state->dirty_uniform_buffers[stage] |= (is_copy_bind ? 1U : 0U) << index;
 | 
			
		||||
    }
 | 
			
		||||
    if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
 | 
			
		||||
        uniform_buffer_binding_sizes[stage][binding_index] = size;
 | 
			
		||||
        channel_state->uniform_buffer_binding_sizes[stage][binding_index] = size;
 | 
			
		||||
    }
 | 
			
		||||
    if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
 | 
			
		||||
        runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size);
 | 
			
		||||
@@ -826,15 +835,15 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
 | 
			
		||||
    u32 binding_index = 0;
 | 
			
		||||
    ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
 | 
			
		||||
        const Binding& binding = storage_buffers[stage][index];
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_storage_buffers[stage], [&](u32 index) {
 | 
			
		||||
        const Binding& binding = channel_state->storage_buffers[stage][index];
 | 
			
		||||
        Buffer& buffer = slot_buffers[binding.buffer_id];
 | 
			
		||||
        TouchBuffer(buffer, binding.buffer_id);
 | 
			
		||||
        const u32 size = binding.size;
 | 
			
		||||
        SynchronizeBuffer(buffer, binding.cpu_addr, size);
 | 
			
		||||
 | 
			
		||||
        const u32 offset = buffer.Offset(binding.cpu_addr);
 | 
			
		||||
        const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0;
 | 
			
		||||
        const bool is_written = ((channel_state->written_storage_buffers[stage] >> index) & 1) != 0;
 | 
			
		||||
        if constexpr (NEEDS_BIND_STORAGE_INDEX) {
 | 
			
		||||
            runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written);
 | 
			
		||||
            ++binding_index;
 | 
			
		||||
@@ -846,8 +855,8 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::BindHostGraphicsTextureBuffers(size_t stage) {
 | 
			
		||||
    ForEachEnabledBit(enabled_texture_buffers[stage], [&](u32 index) {
 | 
			
		||||
        const TextureBufferBinding& binding = texture_buffers[stage][index];
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_texture_buffers[stage], [&](u32 index) {
 | 
			
		||||
        const TextureBufferBinding& binding = channel_state->texture_buffers[stage][index];
 | 
			
		||||
        Buffer& buffer = slot_buffers[binding.buffer_id];
 | 
			
		||||
        const u32 size = binding.size;
 | 
			
		||||
        SynchronizeBuffer(buffer, binding.cpu_addr, size);
 | 
			
		||||
@@ -855,7 +864,7 @@ void BufferCache<P>::BindHostGraphicsTextureBuffers(size_t stage) {
 | 
			
		||||
        const u32 offset = buffer.Offset(binding.cpu_addr);
 | 
			
		||||
        const PixelFormat format = binding.format;
 | 
			
		||||
        if constexpr (SEPARATE_IMAGE_BUFFERS_BINDINGS) {
 | 
			
		||||
            if (((image_texture_buffers[stage] >> index) & 1) != 0) {
 | 
			
		||||
            if (((channel_state->image_texture_buffers[stage] >> index) & 1) != 0) {
 | 
			
		||||
                runtime.BindImageBuffer(buffer, offset, size, format);
 | 
			
		||||
            } else {
 | 
			
		||||
                runtime.BindTextureBuffer(buffer, offset, size, format);
 | 
			
		||||
@@ -872,7 +881,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() {
 | 
			
		||||
        return;
 | 
			
		||||
    }
 | 
			
		||||
    for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
 | 
			
		||||
        const Binding& binding = transform_feedback_buffers[index];
 | 
			
		||||
        const Binding& binding = channel_state->transform_feedback_buffers[index];
 | 
			
		||||
        Buffer& buffer = slot_buffers[binding.buffer_id];
 | 
			
		||||
        TouchBuffer(buffer, binding.buffer_id);
 | 
			
		||||
        const u32 size = binding.size;
 | 
			
		||||
@@ -887,15 +896,16 @@ template <class P>
 | 
			
		||||
void BufferCache<P>::BindHostComputeUniformBuffers() {
 | 
			
		||||
    if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
 | 
			
		||||
        // Mark all uniform buffers as dirty
 | 
			
		||||
        dirty_uniform_buffers.fill(~u32{0});
 | 
			
		||||
        fast_bound_uniform_buffers.fill(0);
 | 
			
		||||
        channel_state->dirty_uniform_buffers.fill(~u32{0});
 | 
			
		||||
        channel_state->fast_bound_uniform_buffers.fill(0);
 | 
			
		||||
    }
 | 
			
		||||
    u32 binding_index = 0;
 | 
			
		||||
    ForEachEnabledBit(enabled_compute_uniform_buffer_mask, [&](u32 index) {
 | 
			
		||||
        const Binding& binding = compute_uniform_buffers[index];
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_compute_uniform_buffer_mask, [&](u32 index) {
 | 
			
		||||
        const Binding& binding = channel_state->compute_uniform_buffers[index];
 | 
			
		||||
        Buffer& buffer = slot_buffers[binding.buffer_id];
 | 
			
		||||
        TouchBuffer(buffer, binding.buffer_id);
 | 
			
		||||
        const u32 size = std::min(binding.size, (*compute_uniform_buffer_sizes)[index]);
 | 
			
		||||
        const u32 size =
 | 
			
		||||
            std::min(binding.size, (*channel_state->compute_uniform_buffer_sizes)[index]);
 | 
			
		||||
        SynchronizeBuffer(buffer, binding.cpu_addr, size);
 | 
			
		||||
 | 
			
		||||
        const u32 offset = buffer.Offset(binding.cpu_addr);
 | 
			
		||||
@@ -911,15 +921,16 @@ void BufferCache<P>::BindHostComputeUniformBuffers() {
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::BindHostComputeStorageBuffers() {
 | 
			
		||||
    u32 binding_index = 0;
 | 
			
		||||
    ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
 | 
			
		||||
        const Binding& binding = compute_storage_buffers[index];
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_compute_storage_buffers, [&](u32 index) {
 | 
			
		||||
        const Binding& binding = channel_state->compute_storage_buffers[index];
 | 
			
		||||
        Buffer& buffer = slot_buffers[binding.buffer_id];
 | 
			
		||||
        TouchBuffer(buffer, binding.buffer_id);
 | 
			
		||||
        const u32 size = binding.size;
 | 
			
		||||
        SynchronizeBuffer(buffer, binding.cpu_addr, size);
 | 
			
		||||
 | 
			
		||||
        const u32 offset = buffer.Offset(binding.cpu_addr);
 | 
			
		||||
        const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0;
 | 
			
		||||
        const bool is_written =
 | 
			
		||||
            ((channel_state->written_compute_storage_buffers >> index) & 1) != 0;
 | 
			
		||||
        if constexpr (NEEDS_BIND_STORAGE_INDEX) {
 | 
			
		||||
            runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written);
 | 
			
		||||
            ++binding_index;
 | 
			
		||||
@@ -931,8 +942,8 @@ void BufferCache<P>::BindHostComputeStorageBuffers() {
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::BindHostComputeTextureBuffers() {
 | 
			
		||||
    ForEachEnabledBit(enabled_compute_texture_buffers, [&](u32 index) {
 | 
			
		||||
        const TextureBufferBinding& binding = compute_texture_buffers[index];
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_compute_texture_buffers, [&](u32 index) {
 | 
			
		||||
        const TextureBufferBinding& binding = channel_state->compute_texture_buffers[index];
 | 
			
		||||
        Buffer& buffer = slot_buffers[binding.buffer_id];
 | 
			
		||||
        const u32 size = binding.size;
 | 
			
		||||
        SynchronizeBuffer(buffer, binding.cpu_addr, size);
 | 
			
		||||
@@ -940,7 +951,7 @@ void BufferCache<P>::BindHostComputeTextureBuffers() {
 | 
			
		||||
        const u32 offset = buffer.Offset(binding.cpu_addr);
 | 
			
		||||
        const PixelFormat format = binding.format;
 | 
			
		||||
        if constexpr (SEPARATE_IMAGE_BUFFERS_BINDINGS) {
 | 
			
		||||
            if (((image_compute_texture_buffers >> index) & 1) != 0) {
 | 
			
		||||
            if (((channel_state->image_compute_texture_buffers >> index) & 1) != 0) {
 | 
			
		||||
                runtime.BindImageBuffer(buffer, offset, size, format);
 | 
			
		||||
            } else {
 | 
			
		||||
                runtime.BindTextureBuffer(buffer, offset, size, format);
 | 
			
		||||
@@ -954,7 +965,7 @@ void BufferCache<P>::BindHostComputeTextureBuffers() {
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
 | 
			
		||||
    do {
 | 
			
		||||
        has_deleted_buffers = false;
 | 
			
		||||
        channel_state->has_deleted_buffers = false;
 | 
			
		||||
        if (is_indexed) {
 | 
			
		||||
            UpdateIndexBuffer();
 | 
			
		||||
        }
 | 
			
		||||
@@ -968,7 +979,7 @@ void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
 | 
			
		||||
        if (current_draw_indirect) {
 | 
			
		||||
            UpdateDrawIndirect();
 | 
			
		||||
        }
 | 
			
		||||
    } while (has_deleted_buffers);
 | 
			
		||||
    } while (channel_state->has_deleted_buffers);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
@@ -999,7 +1010,7 @@ void BufferCache<P>::UpdateIndexBuffer() {
 | 
			
		||||
            slot_buffers.erase(inline_buffer_id);
 | 
			
		||||
            inline_buffer_id = CreateBuffer(0, buffer_size);
 | 
			
		||||
        }
 | 
			
		||||
        index_buffer = Binding{
 | 
			
		||||
        channel_state->index_buffer = Binding{
 | 
			
		||||
            .cpu_addr = 0,
 | 
			
		||||
            .size = inline_index_size,
 | 
			
		||||
            .buffer_id = inline_buffer_id,
 | 
			
		||||
@@ -1015,10 +1026,10 @@ void BufferCache<P>::UpdateIndexBuffer() {
 | 
			
		||||
        (index_buffer_ref.count + index_buffer_ref.first) * index_buffer_ref.FormatSizeInBytes();
 | 
			
		||||
    const u32 size = std::min(address_size, draw_size);
 | 
			
		||||
    if (size == 0 || !cpu_addr) {
 | 
			
		||||
        index_buffer = NULL_BINDING;
 | 
			
		||||
        channel_state->index_buffer = NULL_BINDING;
 | 
			
		||||
        return;
 | 
			
		||||
    }
 | 
			
		||||
    index_buffer = Binding{
 | 
			
		||||
    channel_state->index_buffer = Binding{
 | 
			
		||||
        .cpu_addr = *cpu_addr,
 | 
			
		||||
        .size = size,
 | 
			
		||||
        .buffer_id = FindBuffer(*cpu_addr, size),
 | 
			
		||||
@@ -1051,13 +1062,13 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) {
 | 
			
		||||
    const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
 | 
			
		||||
    u32 size = address_size; // TODO: Analyze stride and number of vertices
 | 
			
		||||
    if (array.enable == 0 || size == 0 || !cpu_addr) {
 | 
			
		||||
        vertex_buffers[index] = NULL_BINDING;
 | 
			
		||||
        channel_state->vertex_buffers[index] = NULL_BINDING;
 | 
			
		||||
        return;
 | 
			
		||||
    }
 | 
			
		||||
    if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) {
 | 
			
		||||
        size = static_cast<u32>(gpu_memory->MaxContinuousRange(gpu_addr_begin, size));
 | 
			
		||||
    }
 | 
			
		||||
    vertex_buffers[index] = Binding{
 | 
			
		||||
    channel_state->vertex_buffers[index] = Binding{
 | 
			
		||||
        .cpu_addr = *cpu_addr,
 | 
			
		||||
        .size = size,
 | 
			
		||||
        .buffer_id = FindBuffer(*cpu_addr, size),
 | 
			
		||||
@@ -1079,23 +1090,24 @@ void BufferCache<P>::UpdateDrawIndirect() {
 | 
			
		||||
        };
 | 
			
		||||
    };
 | 
			
		||||
    if (current_draw_indirect->include_count) {
 | 
			
		||||
        update(current_draw_indirect->count_start_address, sizeof(u32), count_buffer_binding);
 | 
			
		||||
        update(current_draw_indirect->count_start_address, sizeof(u32),
 | 
			
		||||
               channel_state->count_buffer_binding);
 | 
			
		||||
    }
 | 
			
		||||
    update(current_draw_indirect->indirect_start_address, current_draw_indirect->buffer_size,
 | 
			
		||||
           indirect_buffer_binding);
 | 
			
		||||
           channel_state->indirect_buffer_binding);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
 | 
			
		||||
    ForEachEnabledBit(enabled_uniform_buffer_masks[stage], [&](u32 index) {
 | 
			
		||||
        Binding& binding = uniform_buffers[stage][index];
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_uniform_buffer_masks[stage], [&](u32 index) {
 | 
			
		||||
        Binding& binding = channel_state->uniform_buffers[stage][index];
 | 
			
		||||
        if (binding.buffer_id) {
 | 
			
		||||
            // Already updated
 | 
			
		||||
            return;
 | 
			
		||||
        }
 | 
			
		||||
        // Mark as dirty
 | 
			
		||||
        if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
 | 
			
		||||
            dirty_uniform_buffers[stage] |= 1U << index;
 | 
			
		||||
            channel_state->dirty_uniform_buffers[stage] |= 1U << index;
 | 
			
		||||
        }
 | 
			
		||||
        // Resolve buffer
 | 
			
		||||
        binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
 | 
			
		||||
@@ -1104,10 +1116,10 @@ void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UpdateStorageBuffers(size_t stage) {
 | 
			
		||||
    const u32 written_mask = written_storage_buffers[stage];
 | 
			
		||||
    ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
 | 
			
		||||
    const u32 written_mask = channel_state->written_storage_buffers[stage];
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_storage_buffers[stage], [&](u32 index) {
 | 
			
		||||
        // Resolve buffer
 | 
			
		||||
        Binding& binding = storage_buffers[stage][index];
 | 
			
		||||
        Binding& binding = channel_state->storage_buffers[stage][index];
 | 
			
		||||
        const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
 | 
			
		||||
        binding.buffer_id = buffer_id;
 | 
			
		||||
        // Mark buffer as written if needed
 | 
			
		||||
@@ -1119,11 +1131,11 @@ void BufferCache<P>::UpdateStorageBuffers(size_t stage) {
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UpdateTextureBuffers(size_t stage) {
 | 
			
		||||
    ForEachEnabledBit(enabled_texture_buffers[stage], [&](u32 index) {
 | 
			
		||||
        Binding& binding = texture_buffers[stage][index];
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_texture_buffers[stage], [&](u32 index) {
 | 
			
		||||
        Binding& binding = channel_state->texture_buffers[stage][index];
 | 
			
		||||
        binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
 | 
			
		||||
        // Mark buffer as written if needed
 | 
			
		||||
        if (((written_texture_buffers[stage] >> index) & 1) != 0) {
 | 
			
		||||
        if (((channel_state->written_texture_buffers[stage] >> index) & 1) != 0) {
 | 
			
		||||
            MarkWrittenBuffer(binding.buffer_id, binding.cpu_addr, binding.size);
 | 
			
		||||
        }
 | 
			
		||||
    });
 | 
			
		||||
@@ -1146,11 +1158,11 @@ void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) {
 | 
			
		||||
    const u32 size = binding.size;
 | 
			
		||||
    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
 | 
			
		||||
    if (binding.enable == 0 || size == 0 || !cpu_addr) {
 | 
			
		||||
        transform_feedback_buffers[index] = NULL_BINDING;
 | 
			
		||||
        channel_state->transform_feedback_buffers[index] = NULL_BINDING;
 | 
			
		||||
        return;
 | 
			
		||||
    }
 | 
			
		||||
    const BufferId buffer_id = FindBuffer(*cpu_addr, size);
 | 
			
		||||
    transform_feedback_buffers[index] = Binding{
 | 
			
		||||
    channel_state->transform_feedback_buffers[index] = Binding{
 | 
			
		||||
        .cpu_addr = *cpu_addr,
 | 
			
		||||
        .size = size,
 | 
			
		||||
        .buffer_id = buffer_id,
 | 
			
		||||
@@ -1160,8 +1172,8 @@ void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) {
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UpdateComputeUniformBuffers() {
 | 
			
		||||
    ForEachEnabledBit(enabled_compute_uniform_buffer_mask, [&](u32 index) {
 | 
			
		||||
        Binding& binding = compute_uniform_buffers[index];
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_compute_uniform_buffer_mask, [&](u32 index) {
 | 
			
		||||
        Binding& binding = channel_state->compute_uniform_buffers[index];
 | 
			
		||||
        binding = NULL_BINDING;
 | 
			
		||||
        const auto& launch_desc = kepler_compute->launch_description;
 | 
			
		||||
        if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) {
 | 
			
		||||
@@ -1178,12 +1190,12 @@ void BufferCache<P>::UpdateComputeUniformBuffers() {
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UpdateComputeStorageBuffers() {
 | 
			
		||||
    ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_compute_storage_buffers, [&](u32 index) {
 | 
			
		||||
        // Resolve buffer
 | 
			
		||||
        Binding& binding = compute_storage_buffers[index];
 | 
			
		||||
        Binding& binding = channel_state->compute_storage_buffers[index];
 | 
			
		||||
        binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
 | 
			
		||||
        // Mark as written if needed
 | 
			
		||||
        if (((written_compute_storage_buffers >> index) & 1) != 0) {
 | 
			
		||||
        if (((channel_state->written_compute_storage_buffers >> index) & 1) != 0) {
 | 
			
		||||
            MarkWrittenBuffer(binding.buffer_id, binding.cpu_addr, binding.size);
 | 
			
		||||
        }
 | 
			
		||||
    });
 | 
			
		||||
@@ -1191,11 +1203,11 @@ void BufferCache<P>::UpdateComputeStorageBuffers() {
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::UpdateComputeTextureBuffers() {
 | 
			
		||||
    ForEachEnabledBit(enabled_compute_texture_buffers, [&](u32 index) {
 | 
			
		||||
        Binding& binding = compute_texture_buffers[index];
 | 
			
		||||
    ForEachEnabledBit(channel_state->enabled_compute_texture_buffers, [&](u32 index) {
 | 
			
		||||
        Binding& binding = channel_state->compute_texture_buffers[index];
 | 
			
		||||
        binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
 | 
			
		||||
        // Mark as written if needed
 | 
			
		||||
        if (((written_compute_texture_buffers >> index) & 1) != 0) {
 | 
			
		||||
        if (((channel_state->written_compute_texture_buffers >> index) & 1) != 0) {
 | 
			
		||||
            MarkWrittenBuffer(binding.buffer_id, binding.cpu_addr, binding.size);
 | 
			
		||||
        }
 | 
			
		||||
    });
 | 
			
		||||
@@ -1610,13 +1622,13 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
 | 
			
		||||
    const auto replace = [scalar_replace](std::span<Binding> bindings) {
 | 
			
		||||
        std::ranges::for_each(bindings, scalar_replace);
 | 
			
		||||
    };
 | 
			
		||||
    scalar_replace(index_buffer);
 | 
			
		||||
    replace(vertex_buffers);
 | 
			
		||||
    std::ranges::for_each(uniform_buffers, replace);
 | 
			
		||||
    std::ranges::for_each(storage_buffers, replace);
 | 
			
		||||
    replace(transform_feedback_buffers);
 | 
			
		||||
    replace(compute_uniform_buffers);
 | 
			
		||||
    replace(compute_storage_buffers);
 | 
			
		||||
    scalar_replace(channel_state->index_buffer);
 | 
			
		||||
    replace(channel_state->vertex_buffers);
 | 
			
		||||
    std::ranges::for_each(channel_state->uniform_buffers, replace);
 | 
			
		||||
    std::ranges::for_each(channel_state->storage_buffers, replace);
 | 
			
		||||
    replace(channel_state->transform_feedback_buffers);
 | 
			
		||||
    replace(channel_state->compute_uniform_buffers);
 | 
			
		||||
    replace(channel_state->compute_storage_buffers);
 | 
			
		||||
 | 
			
		||||
    // Mark the whole buffer as CPU written to stop tracking CPU writes
 | 
			
		||||
    if (!do_not_mark) {
 | 
			
		||||
@@ -1634,8 +1646,8 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
 | 
			
		||||
template <class P>
 | 
			
		||||
void BufferCache<P>::NotifyBufferDeletion() {
 | 
			
		||||
    if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
 | 
			
		||||
        dirty_uniform_buffers.fill(~u32{0});
 | 
			
		||||
        uniform_buffer_binding_sizes.fill({});
 | 
			
		||||
        channel_state->dirty_uniform_buffers.fill(~u32{0});
 | 
			
		||||
        channel_state->uniform_buffer_binding_sizes.fill({});
 | 
			
		||||
    }
 | 
			
		||||
    auto& flags = maxwell3d->dirty.flags;
 | 
			
		||||
    flags[Dirty::IndexBuffer] = true;
 | 
			
		||||
@@ -1643,13 +1655,12 @@ void BufferCache<P>::NotifyBufferDeletion() {
 | 
			
		||||
    for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
 | 
			
		||||
        flags[Dirty::VertexBuffer0 + index] = true;
 | 
			
		||||
    }
 | 
			
		||||
    has_deleted_buffers = true;
 | 
			
		||||
    channel_state->has_deleted_buffers = true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr,
 | 
			
		||||
                                                                      u32 cbuf_index,
 | 
			
		||||
                                                                      bool is_written) const {
 | 
			
		||||
Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index,
 | 
			
		||||
                                             bool is_written) const {
 | 
			
		||||
    const GPUVAddr gpu_addr = gpu_memory->Read<u64>(ssbo_addr);
 | 
			
		||||
    const auto size = [&]() {
 | 
			
		||||
        const bool is_nvn_cbuf = cbuf_index == 0;
 | 
			
		||||
@@ -1681,8 +1692,8 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
typename BufferCache<P>::TextureBufferBinding BufferCache<P>::GetTextureBufferBinding(
 | 
			
		||||
    GPUVAddr gpu_addr, u32 size, PixelFormat format) {
 | 
			
		||||
TextureBufferBinding BufferCache<P>::GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size,
 | 
			
		||||
                                                             PixelFormat format) {
 | 
			
		||||
    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
 | 
			
		||||
    TextureBufferBinding binding;
 | 
			
		||||
    if (!cpu_addr || size == 0) {
 | 
			
		||||
@@ -1721,7 +1732,7 @@ std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) {
 | 
			
		||||
template <class P>
 | 
			
		||||
bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept {
 | 
			
		||||
    if constexpr (IS_OPENGL) {
 | 
			
		||||
        return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0;
 | 
			
		||||
        return ((channel_state->fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0;
 | 
			
		||||
    } else {
 | 
			
		||||
        // Only OpenGL has fast uniform buffers
 | 
			
		||||
        return false;
 | 
			
		||||
@@ -1730,14 +1741,14 @@ bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index)
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
std::pair<typename BufferCache<P>::Buffer*, u32> BufferCache<P>::GetDrawIndirectCount() {
 | 
			
		||||
    auto& buffer = slot_buffers[count_buffer_binding.buffer_id];
 | 
			
		||||
    return std::make_pair(&buffer, buffer.Offset(count_buffer_binding.cpu_addr));
 | 
			
		||||
    auto& buffer = slot_buffers[channel_state->count_buffer_binding.buffer_id];
 | 
			
		||||
    return std::make_pair(&buffer, buffer.Offset(channel_state->count_buffer_binding.cpu_addr));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
std::pair<typename BufferCache<P>::Buffer*, u32> BufferCache<P>::GetDrawIndirectBuffer() {
 | 
			
		||||
    auto& buffer = slot_buffers[indirect_buffer_binding.buffer_id];
 | 
			
		||||
    return std::make_pair(&buffer, buffer.Offset(indirect_buffer_binding.cpu_addr));
 | 
			
		||||
    auto& buffer = slot_buffers[channel_state->indirect_buffer_binding.buffer_id];
 | 
			
		||||
    return std::make_pair(&buffer, buffer.Offset(channel_state->indirect_buffer_binding.cpu_addr));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
} // namespace VideoCommon
 | 
			
		||||
 
 | 
			
		||||
@@ -86,8 +86,78 @@ enum class ObtainBufferOperation : u32 {
 | 
			
		||||
    MarkQuery = 3,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <typename P>
 | 
			
		||||
class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
 | 
			
		||||
static constexpr BufferId NULL_BUFFER_ID{0};
 | 
			
		||||
static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB);
 | 
			
		||||
 | 
			
		||||
struct Binding {
 | 
			
		||||
    VAddr cpu_addr{};
 | 
			
		||||
    u32 size{};
 | 
			
		||||
    BufferId buffer_id;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
struct TextureBufferBinding : Binding {
 | 
			
		||||
    PixelFormat format;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
static constexpr Binding NULL_BINDING{
 | 
			
		||||
    .cpu_addr = 0,
 | 
			
		||||
    .size = 0,
 | 
			
		||||
    .buffer_id = NULL_BUFFER_ID,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class BufferCacheChannelInfo : public ChannelInfo {
 | 
			
		||||
public:
 | 
			
		||||
    BufferCacheChannelInfo() = delete;
 | 
			
		||||
    BufferCacheChannelInfo(Tegra::Control::ChannelState& state) noexcept : ChannelInfo(state) {}
 | 
			
		||||
    BufferCacheChannelInfo(const BufferCacheChannelInfo& state) = delete;
 | 
			
		||||
    BufferCacheChannelInfo& operator=(const BufferCacheChannelInfo&) = delete;
 | 
			
		||||
 | 
			
		||||
    Binding index_buffer;
 | 
			
		||||
    std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
 | 
			
		||||
    std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
 | 
			
		||||
    std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
 | 
			
		||||
    std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers;
 | 
			
		||||
    std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
 | 
			
		||||
    Binding count_buffer_binding;
 | 
			
		||||
    Binding indirect_buffer_binding;
 | 
			
		||||
 | 
			
		||||
    std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
 | 
			
		||||
    std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
 | 
			
		||||
    std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS> compute_texture_buffers;
 | 
			
		||||
 | 
			
		||||
    std::array<u32, NUM_STAGES> enabled_uniform_buffer_masks{};
 | 
			
		||||
    u32 enabled_compute_uniform_buffer_mask = 0;
 | 
			
		||||
 | 
			
		||||
    const UniformBufferSizes* uniform_buffer_sizes{};
 | 
			
		||||
    const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{};
 | 
			
		||||
 | 
			
		||||
    std::array<u32, NUM_STAGES> enabled_storage_buffers{};
 | 
			
		||||
    std::array<u32, NUM_STAGES> written_storage_buffers{};
 | 
			
		||||
    u32 enabled_compute_storage_buffers = 0;
 | 
			
		||||
    u32 written_compute_storage_buffers = 0;
 | 
			
		||||
 | 
			
		||||
    std::array<u32, NUM_STAGES> enabled_texture_buffers{};
 | 
			
		||||
    std::array<u32, NUM_STAGES> written_texture_buffers{};
 | 
			
		||||
    std::array<u32, NUM_STAGES> image_texture_buffers{};
 | 
			
		||||
    u32 enabled_compute_texture_buffers = 0;
 | 
			
		||||
    u32 written_compute_texture_buffers = 0;
 | 
			
		||||
    u32 image_compute_texture_buffers = 0;
 | 
			
		||||
 | 
			
		||||
    std::array<u32, 16> uniform_cache_hits{};
 | 
			
		||||
    std::array<u32, 16> uniform_cache_shots{};
 | 
			
		||||
 | 
			
		||||
    u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE;
 | 
			
		||||
 | 
			
		||||
    bool has_deleted_buffers = false;
 | 
			
		||||
 | 
			
		||||
    std::array<u32, NUM_STAGES> dirty_uniform_buffers{};
 | 
			
		||||
    std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{};
 | 
			
		||||
    std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>
 | 
			
		||||
        uniform_buffer_binding_sizes{};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class P>
 | 
			
		||||
class BufferCache : public VideoCommon::ChannelSetupCaches<BufferCacheChannelInfo> {
 | 
			
		||||
    // Page size for caching purposes.
 | 
			
		||||
    // This is unrelated to the CPU page size and it can be changed as it seems optimal.
 | 
			
		||||
    static constexpr u32 CACHING_PAGEBITS = 16;
 | 
			
		||||
@@ -104,8 +174,6 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelI
 | 
			
		||||
    static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS;
 | 
			
		||||
    static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = P::IMPLEMENTS_ASYNC_DOWNLOADS;
 | 
			
		||||
 | 
			
		||||
    static constexpr BufferId NULL_BUFFER_ID{0};
 | 
			
		||||
 | 
			
		||||
    static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB;
 | 
			
		||||
    static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB;
 | 
			
		||||
    static constexpr s64 TARGET_THRESHOLD = 4_GiB;
 | 
			
		||||
@@ -149,8 +217,6 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelI
 | 
			
		||||
    using OverlapSection = boost::icl::inter_section<int>;
 | 
			
		||||
    using OverlapCounter = boost::icl::split_interval_map<VAddr, int>;
 | 
			
		||||
 | 
			
		||||
    struct Empty {};
 | 
			
		||||
 | 
			
		||||
    struct OverlapResult {
 | 
			
		||||
        std::vector<BufferId> ids;
 | 
			
		||||
        VAddr begin;
 | 
			
		||||
@@ -158,25 +224,7 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelI
 | 
			
		||||
        bool has_stream_leap = false;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    struct Binding {
 | 
			
		||||
        VAddr cpu_addr{};
 | 
			
		||||
        u32 size{};
 | 
			
		||||
        BufferId buffer_id;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    struct TextureBufferBinding : Binding {
 | 
			
		||||
        PixelFormat format;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    static constexpr Binding NULL_BINDING{
 | 
			
		||||
        .cpu_addr = 0,
 | 
			
		||||
        .size = 0,
 | 
			
		||||
        .buffer_id = NULL_BUFFER_ID,
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
public:
 | 
			
		||||
    static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB);
 | 
			
		||||
 | 
			
		||||
    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
 | 
			
		||||
                         Core::Memory::Memory& cpu_memory_, Runtime& runtime_);
 | 
			
		||||
 | 
			
		||||
@@ -496,51 +544,6 @@ private:
 | 
			
		||||
 | 
			
		||||
    u32 last_index_count = 0;
 | 
			
		||||
 | 
			
		||||
    Binding index_buffer;
 | 
			
		||||
    std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
 | 
			
		||||
    std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
 | 
			
		||||
    std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
 | 
			
		||||
    std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers;
 | 
			
		||||
    std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
 | 
			
		||||
    Binding count_buffer_binding;
 | 
			
		||||
    Binding indirect_buffer_binding;
 | 
			
		||||
 | 
			
		||||
    std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
 | 
			
		||||
    std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
 | 
			
		||||
    std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS> compute_texture_buffers;
 | 
			
		||||
 | 
			
		||||
    std::array<u32, NUM_STAGES> enabled_uniform_buffer_masks{};
 | 
			
		||||
    u32 enabled_compute_uniform_buffer_mask = 0;
 | 
			
		||||
 | 
			
		||||
    const UniformBufferSizes* uniform_buffer_sizes{};
 | 
			
		||||
    const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{};
 | 
			
		||||
 | 
			
		||||
    std::array<u32, NUM_STAGES> enabled_storage_buffers{};
 | 
			
		||||
    std::array<u32, NUM_STAGES> written_storage_buffers{};
 | 
			
		||||
    u32 enabled_compute_storage_buffers = 0;
 | 
			
		||||
    u32 written_compute_storage_buffers = 0;
 | 
			
		||||
 | 
			
		||||
    std::array<u32, NUM_STAGES> enabled_texture_buffers{};
 | 
			
		||||
    std::array<u32, NUM_STAGES> written_texture_buffers{};
 | 
			
		||||
    std::array<u32, NUM_STAGES> image_texture_buffers{};
 | 
			
		||||
    u32 enabled_compute_texture_buffers = 0;
 | 
			
		||||
    u32 written_compute_texture_buffers = 0;
 | 
			
		||||
    u32 image_compute_texture_buffers = 0;
 | 
			
		||||
 | 
			
		||||
    std::array<u32, 16> uniform_cache_hits{};
 | 
			
		||||
    std::array<u32, 16> uniform_cache_shots{};
 | 
			
		||||
 | 
			
		||||
    u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE;
 | 
			
		||||
 | 
			
		||||
    bool has_deleted_buffers = false;
 | 
			
		||||
 | 
			
		||||
    std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
 | 
			
		||||
        dirty_uniform_buffers{};
 | 
			
		||||
    std::conditional_t<IS_OPENGL, std::array<u32, NUM_STAGES>, Empty> fast_bound_uniform_buffers{};
 | 
			
		||||
    std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS,
 | 
			
		||||
                       std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>, Empty>
 | 
			
		||||
        uniform_buffer_binding_sizes{};
 | 
			
		||||
 | 
			
		||||
    MemoryTracker memory_tracker;
 | 
			
		||||
    IntervalSet uncommitted_ranges;
 | 
			
		||||
    IntervalSet common_ranges;
 | 
			
		||||
 
 | 
			
		||||
@@ -117,7 +117,7 @@ BufferCacheRuntime::BufferCacheRuntime(const Device& device_)
 | 
			
		||||
    for (auto& stage_uniforms : fast_uniforms) {
 | 
			
		||||
        for (OGLBuffer& buffer : stage_uniforms) {
 | 
			
		||||
            buffer.Create();
 | 
			
		||||
            glNamedBufferData(buffer.handle, BufferCache::DEFAULT_SKIP_CACHE_SIZE, nullptr,
 | 
			
		||||
            glNamedBufferData(buffer.handle, VideoCommon::DEFAULT_SKIP_CACHE_SIZE, nullptr,
 | 
			
		||||
                              GL_STREAM_DRAW);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user