Merge pull request #2592 from FernandoS27/sync1

Implement GPU Synchronization Mechanisms & Correct NVFlinger
src/video_core/engines/maxwell_3d.cpp

@@ -525,8 +525,9 @@ void Maxwell3D::ProcessSyncPoint() {
     const u32 sync_point = regs.sync_info.sync_point.Value();
     const u32 increment = regs.sync_info.increment.Value();
     const u32 cache_flush = regs.sync_info.unknown.Value();
-    LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment,
-              cache_flush);
+    if (increment) {
+        system.GPU().IncrementSyncPoint(sync_point);
+    }
 }

 void Maxwell3D::DrawArrays() {
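An aside on the register decode above (not part of the diff): regs.sync_info packs the syncpoint index and the increment/cache-flush flags into a single 32-bit register, and the .Value() accessors extract the bitfields. A minimal stand-alone sketch of the same decode using plain masking; the field widths and positions here are illustrative assumptions, not the documented Maxwell layout:

#include <cstdint>

// Hypothetical packing: low 16 bits = syncpoint index, bit 20 = increment,
// bit 21 = cache flush. The real register layout may differ.
struct SyncInfoSketch {
    std::uint32_t raw;
    std::uint32_t sync_point() const { return raw & 0xFFFFu; }
    bool increment() const { return (raw >> 20) & 1u; }
    bool cache_flush() const { return (raw >> 21) & 1u; }
};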
src/video_core/gpu.cpp

@@ -29,7 +29,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
     UNREACHABLE();
 }

-GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
+GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
+    : system{system}, renderer{renderer}, is_async{is_async} {
     auto& rasterizer{renderer.Rasterizer()};
     memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
     dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);

@@ -74,6 +75,51 @@ const DmaPusher& GPU::DmaPusher() const {
     return *dma_pusher;
 }

+void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
+    syncpoints[syncpoint_id]++;
+    std::lock_guard lock{sync_mutex};
+    if (!syncpt_interrupts[syncpoint_id].empty()) {
+        u32 value = syncpoints[syncpoint_id].load();
+        auto it = syncpt_interrupts[syncpoint_id].begin();
+        while (it != syncpt_interrupts[syncpoint_id].end()) {
+            if (value >= *it) {
+                TriggerCpuInterrupt(syncpoint_id, *it);
+                it = syncpt_interrupts[syncpoint_id].erase(it);
+                continue;
+            }
+            it++;
+        }
+    }
+}
+
+u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
+    return syncpoints[syncpoint_id].load();
+}
+
+void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+    auto& interrupt = syncpt_interrupts[syncpoint_id];
+    bool contains = std::any_of(interrupt.begin(), interrupt.end(),
+                                [value](u32 in_value) { return in_value == value; });
+    if (contains) {
+        return;
+    }
+    syncpt_interrupts[syncpoint_id].emplace_back(value);
+}
+
+bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+    std::lock_guard lock{sync_mutex};
+    auto& interrupt = syncpt_interrupts[syncpoint_id];
+    const auto iter =
+        std::find_if(interrupt.begin(), interrupt.end(),
+                     [value](u32 interrupt_value) { return value == interrupt_value; });
+
+    if (iter == interrupt.end()) {
+        return false;
+    }
+    interrupt.erase(iter);
+    return true;
+}
+
 u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
     ASSERT(format != RenderTargetFormat::NONE);
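Taken together, these additions are a textbook host-syncpoint scheme: one atomic counter per syncpoint, plus a mutex-guarded list of pending threshold values that fire a CPU interrupt once the counter reaches them. A self-contained sketch of the same pattern follows; the class and method names are ours, the MaxSyncPoints value is an assumption, and OnThresholdReached stands in for the virtual TriggerCpuInterrupt:

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <list>
#include <mutex>

constexpr std::size_t MaxSyncPoints = 192; // assumed; mirrors Service::Nvidia::MaxSyncPoints

class SyncpointTable {
public:
    // Bump the counter, then fire every registered threshold it has now passed.
    void Increment(std::uint32_t id) {
        const std::uint32_t value = ++counters[id];
        std::lock_guard lock{mutex};
        auto& waiters = thresholds[id];
        for (auto it = waiters.begin(); it != waiters.end();) {
            if (value >= *it) {
                OnThresholdReached(id, *it); // stand-in for TriggerCpuInterrupt
                it = waiters.erase(it);
            } else {
                ++it;
            }
        }
    }

    // Queue an interrupt for when the counter reaches 'value'.
    void RegisterThreshold(std::uint32_t id, std::uint32_t value) {
        std::lock_guard lock{mutex};
        thresholds[id].push_back(value);
    }

private:
    void OnThresholdReached(std::uint32_t, std::uint32_t) {}

    std::array<std::atomic<std::uint32_t>, MaxSyncPoints> counters{};
    std::array<std::list<std::uint32_t>, MaxSyncPoints> thresholds;
    std::mutex mutex;
};

As in the diff, the counter is bumped atomically before the lock is taken, so readers of GetSyncpointValue never block; only the waiter list needs the mutex.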
src/video_core/gpu.h

@@ -5,8 +5,12 @@
 #pragma once

 #include <array>
+#include <atomic>
+#include <list>
 #include <memory>
+#include <mutex>
 #include "common/common_types.h"
+#include "core/hle/service/nvdrv/nvdata.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "video_core/dma_pusher.h"

@@ -127,7 +131,7 @@ class MemoryManager;

 class GPU {
 public:
-    explicit GPU(Core::System& system, VideoCore::RendererBase& renderer);
+    explicit GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async);

     virtual ~GPU();

@@ -170,6 +174,22 @@ public:
     /// Returns a reference to the GPU DMA pusher.
     Tegra::DmaPusher& DmaPusher();

+    void IncrementSyncPoint(u32 syncpoint_id);
+
+    u32 GetSyncpointValue(u32 syncpoint_id) const;
+
+    void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+    bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+    std::unique_lock<std::mutex> LockSync() {
+        return std::unique_lock{sync_mutex};
+    }
+
+    bool IsAsync() const {
+        return is_async;
+    }
+
     /// Returns a const reference to the GPU DMA pusher.
     const Tegra::DmaPusher& DmaPusher() const;

@@ -239,6 +259,9 @@ public:
     /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
     virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;

+protected:
+    virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0;
+
 private:
     void ProcessBindMethod(const MethodCall& method_call);
     void ProcessSemaphoreTriggerMethod();

@@ -257,6 +280,7 @@ private:
 protected:
     std::unique_ptr<Tegra::DmaPusher> dma_pusher;
     VideoCore::RendererBase& renderer;
+    Core::System& system;

 private:
     std::unique_ptr<Tegra::MemoryManager> memory_manager;

@@ -273,6 +297,14 @@ private:
     std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
     /// Inline memory engine
     std::unique_ptr<Engines::KeplerMemory> kepler_memory;
+
+    std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
+
+    std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
+
+    std::mutex sync_mutex;
+
+    const bool is_async;
 };

 #define ASSERT_REG_POSITION(field_name, position)                                                  \
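One detail worth noting in this header: RegisterSyncptInterrupt (see gpu.cpp above) does not take sync_mutex itself, so LockSync() exists for callers to close the check-then-register race against IncrementSyncPoint. A hedged sketch of a hypothetical waiter using this API; QueueSyncpointWait is our name, not part of the PR:

#include "video_core/gpu.h"

// Hypothetical caller, e.g. an nvdrv event-wait path. Returns true if the
// wait was queued, false if the syncpoint had already passed 'threshold'.
bool QueueSyncpointWait(Tegra::GPU& gpu, u32 syncpoint_id, u32 threshold) {
    const auto lock = gpu.LockSync(); // hold sync_mutex across check + register
    if (gpu.GetSyncpointValue(syncpoint_id) >= threshold) {
        return false; // already signaled; no interrupt needed
    }
    gpu.RegisterSyncptInterrupt(syncpoint_id, threshold);
    return true;
}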
src/video_core/gpu_asynch.cpp

@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include "core/core.h"
+#include "core/hardware_interrupt_manager.h"
 #include "video_core/gpu_asynch.h"
 #include "video_core/gpu_thread.h"
 #include "video_core/renderer_base.h"

@@ -9,7 +11,7 @@
 namespace VideoCommon {

 GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
-    : GPU(system, renderer), gpu_thread{system} {}
+    : GPU(system, renderer, true), gpu_thread{system} {}

 GPUAsynch::~GPUAsynch() = default;

@@ -38,4 +40,9 @@ void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
     gpu_thread.FlushAndInvalidateRegion(addr, size);
 }

+void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
+    auto& interrupt_manager = system.InterruptManager();
+    interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
+}
+
 } // namespace VideoCommon
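The asynchronous backend hands completed syncpoints to the CPU side through the new hardware interrupt manager. A rough stand-alone sketch of that producer/consumer hand-off; this is a stand-in shape for illustration, not Core::Hardware::InterruptManager's actual interface:

#include <cstdint>
#include <functional>
#include <mutex>
#include <utility>
#include <vector>

// Stand-in for an interrupt manager: the GPU thread posts (syncpoint, value)
// pairs; the CPU/service side drains them on its own schedule.
class InterruptManagerSketch {
public:
    // Called from the GPU thread when a syncpoint threshold is crossed.
    void GPUInterruptSyncpt(std::uint32_t syncpoint_id, std::uint32_t value) {
        std::lock_guard lock{mutex};
        pending.push_back({syncpoint_id, value});
    }

    // Called from the CPU side, e.g. to signal nvdrv events.
    void Drain(const std::function<void(std::uint32_t, std::uint32_t)>& signal) {
        std::lock_guard lock{mutex};
        for (const auto& [id, value] : pending) {
            signal(id, value);
        }
        pending.clear();
    }

private:
    std::mutex mutex;
    std::vector<std::pair<std::uint32_t, std::uint32_t>> pending;
};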
src/video_core/gpu_asynch.h

@@ -27,6 +27,9 @@ public:
     void InvalidateRegion(CacheAddr addr, u64 size) override;
     void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;

+protected:
+    void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
+
 private:
     GPUThread::ThreadManager gpu_thread;
 };
src/video_core/gpu_synch.cpp

@@ -8,7 +8,7 @@
 namespace VideoCommon {

 GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
-    : GPU(system, renderer) {}
+    : GPU(system, renderer, false) {}

 GPUSynch::~GPUSynch() = default;
src/video_core/gpu_synch.h

@@ -25,6 +25,10 @@ public:
     void FlushRegion(CacheAddr addr, u64 size) override;
     void InvalidateRegion(CacheAddr addr, u64 size) override;
     void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
+
+protected:
+    void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id,
+                             [[maybe_unused]] u32 value) const override {}
 };

 } // namespace VideoCommon
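Design note: TriggerCpuInterrupt is a protected pure virtual on the GPU base class, so the synchronous backend above can stub it out (the CPU executes GPU commands inline, so there is no other thread to interrupt) while the asynchronous backend routes it across threads. In miniature, under those assumptions:

#include <cstdio>

class GpuBase {
public:
    virtual ~GpuBase() = default;
    // Simplified hook site; the real code fires per (syncpoint, value) pair.
    void IncrementSyncPoint() { TriggerCpuInterrupt(); }
protected:
    virtual void TriggerCpuInterrupt() const = 0;
};

class GpuSync : public GpuBase {
protected:
    void TriggerCpuInterrupt() const override {} // no-op: CPU is already here
};

class GpuAsync : public GpuBase {
protected:
    void TriggerCpuInterrupt() const override {
        std::puts("post interrupt to CPU thread"); // stand-in for InterruptManager
    }
};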
src/video_core/gpu_thread.cpp

@@ -21,7 +21,8 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
     MicroProfileOnThreadCreate("GpuThread");

     // Wait for first GPU command before acquiring the window context
-    state.WaitForCommands();
+    while (state.queue.Empty())
+        ;

     // If emulation was stopped during disk shader loading, abort before trying to acquire context
     if (!state.is_running) {

@@ -32,7 +33,6 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p

     CommandDataContainer next;
     while (state.is_running) {
-        state.WaitForCommands();
         while (!state.queue.Empty()) {
             state.queue.Pop(next);
             if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {

@@ -49,8 +49,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
             } else {
                 UNREACHABLE();
             }
-            state.signaled_fence = next.fence;
-            state.TrySynchronize();
+            state.signaled_fence.store(next.fence);
         }
     }
 }

@@ -89,12 +88,7 @@ void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
 }

 void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) {
-    if (state.queue.Empty()) {
-        // It's quicker to invalidate a single region on the CPU if the queue is already empty
-        system.Renderer().Rasterizer().InvalidateRegion(addr, size);
-    } else {
-        PushCommand(InvalidateRegionCommand(addr, size));
-    }
+    system.Renderer().Rasterizer().InvalidateRegion(addr, size);
 }

 void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {

@@ -105,22 +99,13 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
 u64 ThreadManager::PushCommand(CommandData&& command_data) {
     const u64 fence{++state.last_fence};
     state.queue.Push(CommandDataContainer(std::move(command_data), fence));
-    state.SignalCommands();
     return fence;
 }

 MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
 void SynchState::WaitForSynchronization(u64 fence) {
-    if (signaled_fence >= fence) {
-        return;
-    }
-
-    // Wait for the GPU to be idle (all commands to be executed)
-    {
-        MICROPROFILE_SCOPE(GPU_wait);
-        std::unique_lock lock{synchronization_mutex};
-        synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; });
-    }
+    while (signaled_fence.load() < fence)
+        ;
 }

 } // namespace VideoCommon::GPUThread
src/video_core/gpu_thread.h

@@ -88,41 +88,9 @@ struct CommandDataContainer {
 /// Struct used to synchronize the GPU thread
 struct SynchState final {
     std::atomic_bool is_running{true};
     std::atomic_int queued_frame_count{};
-    std::mutex synchronization_mutex;
-    std::mutex commands_mutex;
-    std::condition_variable commands_condition;
-    std::condition_variable synchronization_condition;
-
-    /// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU
-    /// synchronized. This is entirely empirical.
-    bool IsSynchronized() const {
-        constexpr std::size_t max_queue_gap{5};
-        return queue.Size() <= max_queue_gap;
-    }
-
-    void TrySynchronize() {
-        if (IsSynchronized()) {
-            std::lock_guard lock{synchronization_mutex};
-            synchronization_condition.notify_one();
-        }
-    }

     void WaitForSynchronization(u64 fence);

-    void SignalCommands() {
-        if (queue.Empty()) {
-            return;
-        }
-
-        commands_condition.notify_one();
-    }
-
-    void WaitForCommands() {
-        std::unique_lock lock{commands_mutex};
-        commands_condition.wait(lock, [this] { return !queue.Empty(); });
-    }
-
     using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
     CommandQueue queue;
     u64 last_fence{};
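The gpu_thread changes drop the condition-variable handshake in favor of busy-waiting on an atomic fence counter, trading CPU time for wake-up latency now that nvdrv events can wait on GPU fences. A minimal sketch of the resulting fence protocol, with our own names:

#include <atomic>
#include <cstdint>

struct FenceState {
    std::atomic<std::uint64_t> last_fence{};     // producer side (CPU submit)
    std::atomic<std::uint64_t> signaled_fence{}; // consumer side (GPU thread)

    // CPU: stamp a submission with the next fence value.
    std::uint64_t Submit() { return ++last_fence; }

    // GPU thread: publish completion after executing a submission.
    void Signal(std::uint64_t fence) { signaled_fence.store(fence); }

    // CPU: spin until the GPU thread has caught up to 'fence'.
    void WaitFor(std::uint64_t fence) const {
        while (signaled_fence.load() < fence)
            ; // busy-wait, mirroring SynchState::WaitForSynchronization
    }
};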