6 files changed, 64 insertions, 33 deletions
diff --git a/src/common/threadsafe_queue.h b/src/common/threadsafe_queue.h
index a4647314a..ad04df8ca 100644
--- a/src/common/threadsafe_queue.h
+++ b/src/common/threadsafe_queue.h
@@ -83,11 +83,15 @@ public:
        return true;
    }
-    T PopWait() {
+    void Wait() {
        if (Empty()) {
            std::unique_lock lock{cv_mutex};
            cv.wait(lock, [this]() { return !Empty(); });
        }
+    }
+    T PopWait() {
+        Wait();
        T t;
        Pop(t);
        return t;
@@ -156,6 +160,10 @@ public:
        return spsc_queue.Pop(t);
    }
+    void Wait() {
+        spsc_queue.Wait();
+    }
    T PopWait() {
        return spsc_queue.PopWait();
    }
diff --git a/src/core/core.cpp b/src/core/core.cpp
index 305f56ff1..56b47e671 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -296,7 +296,7 @@ struct System::Impl {
        exit_lock = false;
        if (gpu_core) {
-            gpu_core->WaitIdle();
+            gpu_core->ShutDown();
        }
        services.reset();
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index c61f44619..009c6f574 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -517,8 +517,8 @@ void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
    interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
 }
-void GPU::WaitIdle() const {
+void GPU::ShutDown() {
-    gpu_thread.WaitIdle();
+    gpu_thread.ShutDown();
 }
 void GPU::OnCommandListEnd() {
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index b2ee45496..ecab35d3b 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -219,8 +219,8 @@ public:
        return *shader_notify;
    }
-    // Waits for the GPU to finish working
+    /// Requests that the GPU thread stop and joins it; repeated calls are no-ops.
-    void WaitIdle() const;
+    void ShutDown();
    /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
    void WaitFence(u32 syncpoint_id, u32 value);
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 99353f15f..7addfbc7b 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -29,8 +29,7 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
    system.RegisterHostThread();
    // Wait for first GPU command before acquiring the window context
-    while (state.queue.Empty())
+    state.queue.Wait();
-        ;
    // If emulation was stopped during disk shader loading, abort before trying to acquire context
    if (!state.is_running) {
@@ -57,11 +56,17 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
        } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
            rasterizer->OnCPUWrite(invalidate->addr, invalidate->size);
        } else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
-            return;
+            ASSERT(state.is_running == false);
        } else {
            UNREACHABLE();
        }
        state.signaled_fence.store(next.fence);
+        if (next.block) {
+            // Take write_lock before notifying so that the wakeup cannot slip in between a
+            // waiter's predicate check and its wait on the condition variable (lost wakeup).
+            std::lock_guard lk(state.write_lock);
+            state.cv.notify_all();
+        }
    }
 }
@@ -69,13 +74,7 @@ ThreadManager::ThreadManager(Core::System& system_, bool is_async_)
    : system{system_}, is_async{is_async_} {}
 ThreadManager::~ThreadManager() {
-    if (!thread.joinable()) {
+    ShutDown();
-        return;
-    }
-    // Notify GPU thread that a shutdown is pending
-    PushCommand(EndProcessingCommand());
-    thread.join();
 }
 void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
@@ -112,9 +111,8 @@ void ThreadManager::FlushRegion(VAddr addr, u64 size) {
    case Settings::GPUAccuracy::Extreme: {
        auto& gpu = system.GPU();
        u64 fence = gpu.RequestFlush(addr, size);
-        PushCommand(GPUTickCommand());
+        PushCommand(GPUTickCommand(), true);
-        while (fence > gpu.CurrentFlushRequestFence()) {
+        ASSERT(fence <= gpu.CurrentFlushRequestFence());
-        }
        break;
    }
    default:
@@ -131,23 +129,45 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
    rasterizer->OnCPUWrite(addr, size);
 }
-void ThreadManager::WaitIdle() const {
+void ThreadManager::ShutDown() {
-    while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) &&
+    if (!state.is_running) {
-           system.IsPoweredOn()) {
+        return;
    }
+    {
+        std::lock_guard lk(state.write_lock);
+        state.is_running = false;
+        state.cv.notify_all();
+    }
+    if (!thread.joinable()) {
+        return;
+    }
+    // Notify GPU thread that a shutdown is pending
+    PushCommand(EndProcessingCommand());
+    thread.join();
 }
 void ThreadManager::OnCommandListEnd() {
    PushCommand(OnCommandListEndCommand());
 }
-u64 ThreadManager::PushCommand(CommandData&& command_data) {
+u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) {
-    const u64 fence{++state.last_fence};
-    state.queue.Push(CommandDataContainer(std::move(command_data), fence));
    if (!is_async) {
        // In synchronous GPU mode, block the caller until the command has executed
-        WaitIdle();
+        block = true;
+    }
+    std::unique_lock lk(state.write_lock);
+    const u64 fence{++state.last_fence};
+    state.queue.Push(CommandDataContainer(std::move(command_data), fence, block));
+    if (block) {
+        state.cv.wait(lk, [this, fence] {
+            return fence <= state.signaled_fence.load(std::memory_order_relaxed) ||
+                   !state.is_running;
+        });
    }
    return fence;
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 18269e51c..11a648f38 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -90,21 +90,24 @@ using CommandData =
 struct CommandDataContainer {
    CommandDataContainer() = default;
-    explicit CommandDataContainer(CommandData&& data_, u64 next_fence_)
+    explicit CommandDataContainer(CommandData&& data_, u64 next_fence_, bool block_)
-        : data{std::move(data_)}, fence{next_fence_} {}
+        : data{std::move(data_)}, fence{next_fence_}, block(block_) {}
    CommandData data;
    u64 fence{};
+    bool block{};
 };
 /// Struct used to synchronize the GPU thread
 struct SynchState final {
    std::atomic_bool is_running{true};
-    using CommandQueue = Common::MPSCQueue<CommandDataContainer>;
+    using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
+    std::mutex write_lock;
    CommandQueue queue;
    u64 last_fence{};
    std::atomic<u64> signaled_fence{};
+    std::condition_variable cv;
 };
 /// Class used to manage the GPU thread
@@ -132,14 +135,14 @@ public:
    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
    void FlushAndInvalidateRegion(VAddr addr, u64 size);
-    // Wait until the gpu thread is idle.
+    /// Marks the GPU thread as no longer running, wakes any blocked callers, and joins the thread.
-    void WaitIdle() const;
+    void ShutDown();
    void OnCommandListEnd();
 private:
    /// Pushes a command to be executed by the GPU thread
-    u64 PushCommand(CommandData&& command_data);
+    u64 PushCommand(CommandData&& command_data, bool block = false);
    Core::System& system;
    const bool is_async;