3 files changed, 23 insertions, 13 deletions
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
index 39a58b685..2e2b0ae1c 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -128,11 +128,9 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector<
    return 0;
 }
-static void PushGPUEntries(const std::vector<Tegra::CommandListHeader>& entries) {
+static void PushGPUEntries(Tegra::CommandList&& entries) {
    auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()};
-    for (const auto& entry : entries) {
+    dma_pusher.Push(std::move(entries));
-        dma_pusher.Push(entry);
-    }
    dma_pusher.DispatchCalls();
 }
@@ -149,11 +147,11 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
                                   params.num_entries * sizeof(Tegra::CommandListHeader),
               "Incorrect input size");
-    std::vector<Tegra::CommandListHeader> entries(params.num_entries);
+    Tegra::CommandList entries(params.num_entries);
    std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],
                params.num_entries * sizeof(Tegra::CommandListHeader));
-    PushGPUEntries(entries);
+    PushGPUEntries(std::move(entries));
    params.fence_out.id = 0;
    params.fence_out.value = 0;
@@ -170,11 +168,11 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output)
    LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
                params.address, params.num_entries, params.flags);
-    std::vector<Tegra::CommandListHeader> entries(params.num_entries);
+    Tegra::CommandList entries(params.num_entries);
    Memory::ReadBlock(params.address, entries.data(),
                      params.num_entries * sizeof(Tegra::CommandListHeader));
-    PushGPUEntries(entries);
+    PushGPUEntries(std::move(entries));
    params.fence_out.id = 0;
    params.fence_out.value = 0;
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 23ec97944..63a958f11 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -23,6 +23,8 @@ void DmaPusher::DispatchCalls() {
    // On entering GPU code, assume all memory may be touched by the ARM core.
    gpu.Maxwell3D().dirty_flags.OnMemoryWrite();
+    dma_pushbuffer_subindex = 0;
    while (Core::System::GetInstance().IsPoweredOn()) {
        if (!Step()) {
            break;
@@ -89,11 +91,17 @@ bool DmaPusher::Step() {
        }
    } else if (ib_enable && !dma_pushbuffer.empty()) {
        // Current pushbuffer empty, but we have more IB entries to read
-        const CommandListHeader& command_list_header{dma_pushbuffer.front()};
+        const CommandList& command_list{dma_pushbuffer.front()};
+        const CommandListHeader& command_list_header{command_list[dma_pushbuffer_subindex++]};
        dma_get = command_list_header.addr;
        dma_put = dma_get + command_list_header.size * sizeof(u32);
        non_main = command_list_header.is_non_main;
-        dma_pushbuffer.pop();
+        if (dma_pushbuffer_subindex >= command_list.size()) {
+            // We've gone through the current list, remove it from the queue
+            dma_pushbuffer.pop();
+            dma_pushbuffer_subindex = 0;
+        }
    } else {
        // Otherwise, pushbuffer empty and IB empty or nonexistent - nothing to do
        return {};
diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h
index 39d98e46e..16e0697c4 100644
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -4,6 +4,7 @@
 #pragma once
+#include <vector>
 #include <queue>
 #include "common/bit_field.h"
@@ -45,6 +46,8 @@ static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect
 class GPU;
+using CommandList = std::vector<Tegra::CommandListHeader>;
 /**
 * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the
 * emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled
@@ -57,8 +60,8 @@ public:
    explicit DmaPusher(GPU& gpu);
    ~DmaPusher();
-    void Push(const CommandListHeader& command_list_header) {
+    void Push(CommandList&& entries) {
-        dma_pushbuffer.push(command_list_header);
+        dma_pushbuffer.push(std::move(entries));
    }
    void DispatchCalls();
@@ -72,7 +75,8 @@ private:
    GPU& gpu;
-    std::queue<CommandListHeader> dma_pushbuffer;
+    std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed
+    std::size_t dma_pushbuffer_subindex{};  ///< Index within a command list within the pushbuffer
    struct DmaState {
        u32 method;            ///< Current method

diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 39a58b685..2e2b0ae1c 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -128,11 +128,9 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector<
128	return 0;	128	return 0;
129	}	129	}
130		130
131	static void PushGPUEntries(const std::vector<Tegra::CommandListHeader>& entries) {	131	static void PushGPUEntries(Tegra::CommandList&& entries) {
132	auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()};	132	auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()};
133	for (const auto& entry : entries) {	133	dma_pusher.Push(std::move(entries));
134	dma_pusher.Push(entry);
135	}
136	dma_pusher.DispatchCalls();	134	dma_pusher.DispatchCalls();
137	}	135	}
138		136
@@ -149,11 +147,11 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
149	params.num_entries * sizeof(Tegra::CommandListHeader),	147	params.num_entries * sizeof(Tegra::CommandListHeader),
150	"Incorrect input size");	148	"Incorrect input size");
151		149
152	std::vector<Tegra::CommandListHeader> entries(params.num_entries);	150	Tegra::CommandList entries(params.num_entries);
153	std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],	151	std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],
154	params.num_entries * sizeof(Tegra::CommandListHeader));	152	params.num_entries * sizeof(Tegra::CommandListHeader));
155		153
156	PushGPUEntries(entries);	154	PushGPUEntries(std::move(entries));
157		155
158	params.fence_out.id = 0;	156	params.fence_out.id = 0;
159	params.fence_out.value = 0;	157	params.fence_out.value = 0;
@@ -170,11 +168,11 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output)
170	LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",	168	LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
171	params.address, params.num_entries, params.flags);	169	params.address, params.num_entries, params.flags);
172		170
173	std::vector<Tegra::CommandListHeader> entries(params.num_entries);	171	Tegra::CommandList entries(params.num_entries);
174	Memory::ReadBlock(params.address, entries.data(),	172	Memory::ReadBlock(params.address, entries.data(),
175	params.num_entries * sizeof(Tegra::CommandListHeader));	173	params.num_entries * sizeof(Tegra::CommandListHeader));
176		174
177	PushGPUEntries(entries);	175	PushGPUEntries(std::move(entries));
178		176
179	params.fence_out.id = 0;	177	params.fence_out.id = 0;
180	params.fence_out.value = 0;	178	params.fence_out.value = 0;


diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 23ec97944..63a958f11 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp
@@ -23,6 +23,8 @@ void DmaPusher::DispatchCalls() {
23	// On entering GPU code, assume all memory may be touched by the ARM core.	23	// On entering GPU code, assume all memory may be touched by the ARM core.
24	gpu.Maxwell3D().dirty_flags.OnMemoryWrite();	24	gpu.Maxwell3D().dirty_flags.OnMemoryWrite();
25		25
		26	dma_pushbuffer_subindex = 0;
		27
26	while (Core::System::GetInstance().IsPoweredOn()) {	28	while (Core::System::GetInstance().IsPoweredOn()) {
27	if (!Step()) {	29	if (!Step()) {
28	break;	30	break;
@@ -89,11 +91,17 @@ bool DmaPusher::Step() {
89	}	91	}
90	} else if (ib_enable && !dma_pushbuffer.empty()) {	92	} else if (ib_enable && !dma_pushbuffer.empty()) {
91	// Current pushbuffer empty, but we have more IB entries to read	93	// Current pushbuffer empty, but we have more IB entries to read
92	const CommandListHeader& command_list_header{dma_pushbuffer.front()};	94	const CommandList& command_list{dma_pushbuffer.front()};
		95	const CommandListHeader& command_list_header{command_list[dma_pushbuffer_subindex++]};
93	dma_get = command_list_header.addr;	96	dma_get = command_list_header.addr;
94	dma_put = dma_get + command_list_header.size * sizeof(u32);	97	dma_put = dma_get + command_list_header.size * sizeof(u32);
95	non_main = command_list_header.is_non_main;	98	non_main = command_list_header.is_non_main;
96	dma_pushbuffer.pop();	99
		100	if (dma_pushbuffer_subindex >= command_list.size()) {
		101	// We've gone through the current list, remove it from the queue
		102	dma_pushbuffer.pop();
		103	dma_pushbuffer_subindex = 0;
		104	}
97	} else {	105	} else {
98	// Otherwise, pushbuffer empty and IB empty or nonexistent - nothing to do	106	// Otherwise, pushbuffer empty and IB empty or nonexistent - nothing to do
99	return {};	107	return {};


diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 39d98e46e..16e0697c4 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h
@@ -4,6 +4,7 @@
4		4
5	#pragma once	5	#pragma once
6		6
		7	#include <vector>
7	#include <queue>	8	#include <queue>
8		9
9	#include "common/bit_field.h"	10	#include "common/bit_field.h"
@@ -45,6 +46,8 @@ static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect
45		46
46	class GPU;	47	class GPU;
47		48
		49	using CommandList = std::vector<Tegra::CommandListHeader>;
		50
48	/**	51	/**
49	* The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the	52	* The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the
50	* emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled	53	* emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled
@@ -57,8 +60,8 @@ public:
57	explicit DmaPusher(GPU& gpu);	60	explicit DmaPusher(GPU& gpu);
58	~DmaPusher();	61	~DmaPusher();
59		62
60	void Push(const CommandListHeader& command_list_header) {	63	void Push(CommandList&& entries) {
61	dma_pushbuffer.push(command_list_header);	64	dma_pushbuffer.push(std::move(entries));
62	}	65	}
63		66
64	void DispatchCalls();	67	void DispatchCalls();
@@ -72,7 +75,8 @@ private:
72		75
73	GPU& gpu;	76	GPU& gpu;
74		77
75	std::queue<CommandListHeader> dma_pushbuffer;	78	std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed
		79	std::size_t dma_pushbuffer_subindex{}; ///< Index within a command list within the pushbuffer
76		80
77	struct DmaState {	81	struct DmaState {
78	u32 method; ///< Current method	82	u32 method; ///< Current method