From eb67a45ca82bc01ac843c853fd3c17f2a90e0250 Mon Sep 17 00:00:00 2001
From: ameerj
Date: Mon, 26 Oct 2020 23:07:36 -0400
Subject: video_core: NVDEC Implementation

This commit aims to implement the NVDEC (Nvidia Decoder) functionality, with video frame decoding being handled by the FFmpeg library.

The process begins with Ioctl commands being sent to the NVDEC and VIC (Video Image Composer) emulated devices. These allocate the necessary GPU buffers for the frame data, along with providing information on the incoming video data. A Submit command then signals the GPU to process and decode the frame data.

To decode the frame, the respective codec's header must be manually composed from the information provided by NVDEC, then sent with the raw frame data to the ffmpeg library.

Currently, H264 and VP9 are supported, with VP9 having some minor artifacting issues related mainly to the reference frame composition in its uncompressed header.

Async GPU is not properly implemented at the moment.

Co-Authored-By: David <25727384+ogniK5377@users.noreply.github.com>
---
 src/video_core/gpu.h | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'src/video_core/gpu.h')
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 2d15d1c6f..b8c613b11 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -13,6 +13,7 @@
 #include "common/common_types.h"
 #include "core/hle/service/nvdrv/nvdata.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
+#include "video_core/cdma_pusher.h"
 #include "video_core/dma_pusher.h"
 
 using CacheAddr = std::uintptr_t;
@@ -157,7 +158,7 @@ public:
               method_count(method_count) {}
     };
 
-    explicit GPU(Core::System& system, bool is_async);
+    explicit GPU(Core::System& system, bool is_async, bool use_nvdec);
     virtual ~GPU();
 
     /// Binds a renderer to the GPU.
@@ -209,6 +210,15 @@ public:
     /// Returns a reference to the GPU DMA pusher.
     Tegra::DmaPusher& DmaPusher();
 
+    /// Returns a const reference to the GPU DMA pusher.
+    const Tegra::DmaPusher& DmaPusher() const;
+
+    /// Returns a reference to the GPU CDMA pusher.
+    Tegra::CDmaPusher& CDmaPusher();
+
+    /// Returns a const reference to the GPU CDMA pusher.
+    const Tegra::CDmaPusher& CDmaPusher() const;
+
     VideoCore::RendererBase& Renderer() {
         return *renderer;
     }
@@ -249,8 +259,9 @@ public:
         return is_async;
     }
 
-    /// Returns a const reference to the GPU DMA pusher.
-    const Tegra::DmaPusher& DmaPusher() const;
+    bool UseNvdec() const {
+        return use_nvdec;
+    }
 
     struct Regs {
         static constexpr size_t NUM_REGS = 0x40;
@@ -311,6 +322,9 @@ public:
     /// Push GPU command entries to be processed
     virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
 
+    /// Push GPU command buffer entries to be processed
+    virtual void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) = 0;
+
     /// Swap buffers (render frame)
     virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
 
@@ -349,7 +363,9 @@ protected:
     Core::System& system;
     std::unique_ptr<Tegra::MemoryManager> memory_manager;
     std::unique_ptr<Tegra::DmaPusher> dma_pusher;
+    std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
     std::unique_ptr<VideoCore::RendererBase> renderer;
+    const bool use_nvdec;
 
 private:
     /// Mapping of command subchannels to their bound engine ids
@@ -372,6 +388,7 @@ private:
     std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
 
     std::mutex sync_mutex;
+    std::mutex device_mutex;
 
     std::condition_variable sync_cv;
 
-- 
cgit v1.2.3


From 6053b955525be69eb73a928a7bdd43ba8f5e69a7 Mon Sep 17 00:00:00 2001
From: bunnei
Date: Mon, 26 Oct 2020 22:11:41 -0700
Subject: video_core: gpu: Implement WaitFence and IncrementSyncPoint.

---
 src/video_core/gpu.h | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'src/video_core/gpu.h')

diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index b8c613b11..5444b49f3 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -263,6 +263,24 @@ public:
         return use_nvdec;
     }
 
+    enum class FenceOperation : u32 {
+        Acquire = 0,
+        Increment = 1,
+    };
+
+    union FenceAction {
+        u32 raw;
+        BitField<0, 1, FenceOperation> op;
+        BitField<8, 24, u32> syncpoint_id;
+
+        static constexpr CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
+            FenceAction result{};
+            result.op.Assign(op);
+            result.syncpoint_id.Assign(syncpoint_id);
+            return {result.raw};
+        }
+    };
+
     struct Regs {
         static constexpr size_t NUM_REGS = 0x40;
 
@@ -291,10 +309,7 @@ public:
                 u32 semaphore_acquire;
                 u32 semaphore_release;
                 u32 fence_value;
-                union {
-                    BitField<4, 4, u32> operation;
-                    BitField<8, 8, u32> id;
-                } fence_action;
+                FenceAction fence_action;
                 INSERT_UNION_PADDING_WORDS(0xE2);
 
                 // Puller state
@@ -342,6 +357,8 @@ protected:
 
 private:
     void ProcessBindMethod(const MethodCall& method_call);
+    void ProcessFenceActionMethod();
+    void ProcessWaitForInterruptMethod();
     void ProcessSemaphoreTriggerMethod();
     void ProcessSemaphoreRelease();
     void ProcessSemaphoreAcquire();
-- 
cgit v1.2.3


From 6f006d051e1fad075048ea5664e1ef0605e48a46 Mon Sep 17 00:00:00 2001
From: Lioncash
Date: Wed, 4 Nov 2020 20:41:16 -0500
Subject: General: Fix clang build

Allows building on clang to work again
---
 src/video_core/gpu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/video_core/gpu.h')

diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 5444b49f3..cf5235a79 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -273,7 +273,7 @@ public:
         BitField<0, 1, FenceOperation> op;
         BitField<8, 24, u32> syncpoint_id;
 
-        static constexpr CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
+        static CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
             FenceAction result{};
             result.op.Assign(op);
             result.syncpoint_id.Assign(syncpoint_id);
-- 
cgit v1.2.3


From b928fca114d732971541e2bcbd7e9f40aa5497f2 Mon Sep 17 00:00:00 2001
From: Lioncash
Date: Tue, 17 Nov 2020 07:14:44 -0500
Subject: gpu: Make use of [[nodiscard]] where applicable

---
 src/video_core/gpu.h | 66 ++++++++++++++++++++++++++++------------------------
 1 file changed, 35 insertions(+), 31 deletions(-)

(limited to 'src/video_core/gpu.h')

diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index cf5235a79..21410e125 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -17,11 +17,11 @@
 #include "video_core/dma_pusher.h"
 
 using CacheAddr = std::uintptr_t;
-inline CacheAddr ToCacheAddr(const void* host_ptr) {
+[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) {
     return reinterpret_cast<CacheAddr>(host_ptr);
 }
 
-inline u8* FromCacheAddr(CacheAddr cache_addr) {
+[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) {
     return reinterpret_cast<u8*>(cache_addr);
 }
 
@@ -149,13 +149,13 @@ public:
         u32 subchannel{};
         u32 method_count{};
 
-        bool IsLastCall() const {
-            return method_count <= 1;
-        }
-
         MethodCall(u32 method, u32 argument, u32 subchannel = 0, u32 method_count = 0)
             : method(method), argument(argument), subchannel(subchannel),
               method_count(method_count) {}
+
+        [[nodiscard]] bool IsLastCall() const {
+            return method_count <= 1;
+        }
     };
 
     explicit GPU(Core::System& system, bool is_async, bool use_nvdec);
@@ -179,10 +179,10 @@ public:
     virtual void OnCommandListEnd();
 
     /// Request a host GPU memory flush from the CPU.
-    u64 RequestFlush(VAddr addr, std::size_t size);
+    [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size);
 
     /// Obtains current flush request fence id.
-    u64 CurrentFlushRequestFence() const {
+    [[nodiscard]] u64 CurrentFlushRequestFence() const {
         return current_flush_fence.load(std::memory_order_relaxed);
     }
 
@@ -190,48 +190,52 @@ public:
     void TickWork();
 
     /// Returns a reference to the Maxwell3D GPU engine.
-    Engines::Maxwell3D& Maxwell3D();
+    [[nodiscard]] Engines::Maxwell3D& Maxwell3D();
 
     /// Returns a const reference to the Maxwell3D GPU engine.
-    const Engines::Maxwell3D& Maxwell3D() const;
+    [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const;
 
     /// Returns a reference to the KeplerCompute GPU engine.
-    Engines::KeplerCompute& KeplerCompute();
+    [[nodiscard]] Engines::KeplerCompute& KeplerCompute();
 
     /// Returns a reference to the KeplerCompute GPU engine.
-    const Engines::KeplerCompute& KeplerCompute() const;
+    [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const;
 
     /// Returns a reference to the GPU memory manager.
-    Tegra::MemoryManager& MemoryManager();
+    [[nodiscard]] Tegra::MemoryManager& MemoryManager();
 
     /// Returns a const reference to the GPU memory manager.
-    const Tegra::MemoryManager& MemoryManager() const;
+    [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const;
 
     /// Returns a reference to the GPU DMA pusher.
-    Tegra::DmaPusher& DmaPusher();
+    [[nodiscard]] Tegra::DmaPusher& DmaPusher();
 
     /// Returns a const reference to the GPU DMA pusher.
-    const Tegra::DmaPusher& DmaPusher() const;
+    [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const;
 
     /// Returns a reference to the GPU CDMA pusher.
-    Tegra::CDmaPusher& CDmaPusher();
+    [[nodiscard]] Tegra::CDmaPusher& CDmaPusher();
 
     /// Returns a const reference to the GPU CDMA pusher.
-    const Tegra::CDmaPusher& CDmaPusher() const;
+    [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const;
 
-    VideoCore::RendererBase& Renderer() {
+    /// Returns a reference to the underlying renderer.
+    [[nodiscard]] VideoCore::RendererBase& Renderer() {
         return *renderer;
     }
 
-    const VideoCore::RendererBase& Renderer() const {
+    /// Returns a const reference to the underlying renderer.
+    [[nodiscard]] const VideoCore::RendererBase& Renderer() const {
         return *renderer;
     }
 
-    VideoCore::ShaderNotify& ShaderNotify() {
+    /// Returns a reference to the shader notifier.
+    [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() {
         return *shader_notify;
     }
 
-    const VideoCore::ShaderNotify& ShaderNotify() const {
+    /// Returns a const reference to the shader notifier.
+    [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const {
         return *shader_notify;
     }
 
@@ -243,23 +247,23 @@ public:
 
     void IncrementSyncPoint(u32 syncpoint_id);
 
-    u32 GetSyncpointValue(u32 syncpoint_id) const;
+    [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const;
 
     void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
 
-    bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
+    [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
 
-    u64 GetTicks() const;
+    [[nodiscard]] u64 GetTicks() const;
 
-    std::unique_lock<std::mutex> LockSync() {
+    [[nodiscard]] std::unique_lock<std::mutex> LockSync() {
         return std::unique_lock{sync_mutex};
     }
 
-    bool IsAsync() const {
+    [[nodiscard]] bool IsAsync() const {
         return is_async;
     }
 
-    bool UseNvdec() const {
+    [[nodiscard]] bool UseNvdec() const {
         return use_nvdec;
     }
 
@@ -273,7 +277,7 @@ public:
         BitField<0, 1, FenceOperation> op;
         BitField<8, 24, u32> syncpoint_id;
 
-        static CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
+        [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
             FenceAction result{};
             result.op.Assign(op);
             result.syncpoint_id.Assign(syncpoint_id);
@@ -291,7 +295,7 @@ public:
                     u32 address_high;
                     u32 address_low;
 
-                    GPUVAddr SemaphoreAddress() const {
+                    [[nodiscard]] GPUVAddr SemaphoreAddress() const {
                         return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
                                                      address_low);
                     }
@@ -374,7 +378,7 @@ private:
                                u32 methods_pending);
 
     /// Determines where the method should be executed.
-    bool ExecuteMethodOnEngine(u32 method);
+    [[nodiscard]] bool ExecuteMethodOnEngine(u32 method);
 
 protected:
     Core::System& system;
-- 
cgit v1.2.3


From 677a8b208d47d0d2397197ce74c7039a8ea79d20 Mon Sep 17 00:00:00 2001
From: Lioncash
Date: Fri, 4 Dec 2020 14:39:12 -0500
Subject: video_core: Resolve more variable shadowing scenarios

Resolves variable shadowing scenarios up to the end of the OpenGL code
to make it nicer to review. The rest will be resolved in a following
commit.
---
 src/video_core/gpu.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'src/video_core/gpu.h')

diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 21410e125..660641d04 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -149,16 +149,16 @@ public:
         u32 subchannel{};
         u32 method_count{};
 
-        MethodCall(u32 method, u32 argument, u32 subchannel = 0, u32 method_count = 0)
-            : method(method), argument(argument), subchannel(subchannel),
-              method_count(method_count) {}
+        explicit MethodCall(u32 method_, u32 argument_, u32 subchannel_ = 0, u32 method_count_ = 0)
+            : method(method_), argument(argument_), subchannel(subchannel_),
+              method_count(method_count_) {}
 
         [[nodiscard]] bool IsLastCall() const {
             return method_count <= 1;
         }
     };
 
-    explicit GPU(Core::System& system, bool is_async, bool use_nvdec);
+    explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_);
     virtual ~GPU();
 
     /// Binds a renderer to the GPU.
@@ -414,8 +414,8 @@ private:
     std::condition_variable sync_cv;
 
     struct FlushRequest {
-        FlushRequest(u64 fence, VAddr addr, std::size_t size)
-            : fence{fence}, addr{addr}, size{size} {}
+        explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_)
+            : fence{fence_}, addr{addr_}, size{size_} {}
         u64 fence;
         VAddr addr;
         std::size_t size;
-- 
cgit v1.2.3


From 14c825bd1c37b2444e858bf1a75fb77455b4eb52 Mon Sep 17 00:00:00 2001
From: bunnei
Date: Fri, 11 Dec 2020 22:26:14 -0800
Subject: video_core: gpu: Refactor out synchronous/asynchronous GPU
 implementations.

- We must always use a GPU thread now, even with synchronous GPU.
---
 src/video_core/gpu.h | 55 ++++++++++++++++++----------------------------------
 1 file changed, 19 insertions(+), 36 deletions(-)

(limited to 'src/video_core/gpu.h')

diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 660641d04..a2bb4d82d 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -15,6 +15,8 @@
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "video_core/cdma_pusher.h"
 #include "video_core/dma_pusher.h"
+#include "video_core/framebuffer_config.h"
+#include "video_core/gpu_thread.h"
 
 using CacheAddr = std::uintptr_t;
 [[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) {
@@ -101,28 +103,6 @@ enum class DepthFormat : u32 {
 struct CommandListHeader;
 class DebugContext;
 
-/**
- * Struct describing framebuffer configuration
- */
-struct FramebufferConfig {
-    enum class PixelFormat : u32 {
-        A8B8G8R8_UNORM = 1,
-        RGB565_UNORM = 4,
-        B8G8R8A8_UNORM = 5,
-    };
-
-    VAddr address;
-    u32 offset;
-    u32 width;
-    u32 height;
-    u32 stride;
-    PixelFormat pixel_format;
-
-    using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
-    TransformFlags transform_flags;
-    Common::Rectangle<int> crop_rect;
-};
-
 namespace Engines {
 class Fermi2D;
 class Maxwell3D;
@@ -141,7 +121,7 @@ enum class EngineID {
 
 class MemoryManager;
 
-class GPU {
+class GPU final {
 public:
     struct MethodCall {
         u32 method{};
@@ -159,7 +139,7 @@ public:
     };
 
     explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_);
-    virtual ~GPU();
+    ~GPU();
 
     /// Binds a renderer to the GPU.
     void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer);
@@ -176,7 +156,7 @@ public:
     /// Synchronizes CPU writes with Host GPU memory.
     void SyncGuestHost();
     /// Signal the ending of command list.
-    virtual void OnCommandListEnd();
+    void OnCommandListEnd();
 
     /// Request a host GPU memory flush from the CPU.
     [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size);
@@ -240,7 +220,7 @@ public:
     }
 
     // Waits for the GPU to finish working
-    virtual void WaitIdle() const = 0;
+    void WaitIdle() const;
 
     /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
     void WaitFence(u32 syncpoint_id, u32 value);
@@ -330,34 +310,34 @@ public:
     /// Performs any additional setup necessary in order to begin GPU emulation.
     /// This can be used to launch any necessary threads and register any necessary
     /// core timing events.
-    virtual void Start() = 0;
+    void Start();
 
     /// Obtain the CPU Context
-    virtual void ObtainContext() = 0;
+    void ObtainContext();
 
     /// Release the CPU Context
-    virtual void ReleaseContext() = 0;
+    void ReleaseContext();
 
     /// Push GPU command entries to be processed
-    virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
+    void PushGPUEntries(Tegra::CommandList&& entries);
 
     /// Push GPU command buffer entries to be processed
-    virtual void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) = 0;
+    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries);
 
     /// Swap buffers (render frame)
-    virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
 
     /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
-    virtual void FlushRegion(VAddr addr, u64 size) = 0;
+    void FlushRegion(VAddr addr, u64 size);
 
     /// Notify rasterizer that any caches of the specified region should be invalidated
-    virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
+    void InvalidateRegion(VAddr addr, u64 size);
 
     /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
-    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
+    void FlushAndInvalidateRegion(VAddr addr, u64 size);
 
 protected:
-    virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0;
+    void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const;
 
 private:
     void ProcessBindMethod(const MethodCall& method_call);
@@ -426,6 +406,9 @@ private:
     u64 last_flush_fence{};
     std::mutex flush_request_mutex;
 
+    VideoCommon::GPUThread::ThreadManager gpu_thread;
+    std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
+
     const bool is_async;
 };
 
-- 
cgit v1.2.3


From 40571c073faa02a6a4301e7f0ce365ef50a400aa Mon Sep 17 00:00:00 2001
From: bunnei
Date: Sat, 12 Dec 2020 00:24:33 -0800
Subject: video_core: gpu: Implement synchronous mode using threaded GPU.

---
 src/video_core/gpu.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/video_core/gpu.h')

diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index a2bb4d82d..d81e38680 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -406,10 +406,10 @@ private:
     u64 last_flush_fence{};
     std::mutex flush_request_mutex;
 
+    const bool is_async;
+
     VideoCommon::GPUThread::ThreadManager gpu_thread;
     std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
-
-    const bool is_async;
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
-- 
cgit v1.2.3