summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: bunnei, 2021-02-18 15:12:07 -0800
committer: GitHub, 2021-02-18 15:12:07 -0800
commit 9cae3e6e90f840903a0072b916e49f24d0f6cb10 (patch)
tree 79511308066a4fbc11aa2e9058b0aa65772cc30a /src
parent Merge pull request #5955 from yuzu-emu/revert-3603-port-5123 (diff)
parent rebase, fix name shadowing, more const (diff)
download yuzu-9cae3e6e90f840903a0072b916e49f24d0f6cb10.tar.gz
yuzu-9cae3e6e90f840903a0072b916e49f24d0f6cb10.tar.xz
yuzu-9cae3e6e90f840903a0072b916e49f24d0f6cb10.zip
Merge pull request #4973 from ameerj/nvdec-opt
nvdec: Reuse allocated buffers and general cleanup
Diffstat (limited to 'src')
-rw-r--r-- src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp | 3
-rw-r--r-- src/core/hle/service/nvdrv/devices/nvhost_vic.cpp | 7
-rw-r--r-- src/video_core/cdma_pusher.cpp | 63
-rw-r--r-- src/video_core/cdma_pusher.h | 33
-rw-r--r-- src/video_core/command_classes/codecs/codec.cpp | 7
-rw-r--r-- src/video_core/command_classes/nvdec.cpp | 8
-rw-r--r-- src/video_core/command_classes/nvdec.h | 2
-rw-r--r-- src/video_core/command_classes/vic.cpp | 45
-rw-r--r-- src/video_core/command_classes/vic.h | 51
-rw-r--r-- src/video_core/gpu.cpp | 6
-rw-r--r-- src/video_core/gpu_thread.cpp | 3
11 files changed, 79 insertions, 149 deletions
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
index 36970f828..ecba1dba1 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@@ -34,8 +34,7 @@ NvResult nvhost_nvdec::Ioctl1(Ioctl command, const std::vector<u8>& input,
34 case 0xa: { 34 case 0xa: {
35 if (command.length == 0x1c) { 35 if (command.length == 0x1c) {
36 LOG_INFO(Service_NVDRV, "NVDEC video stream ended"); 36 LOG_INFO(Service_NVDRV, "NVDEC video stream ended");
37 Tegra::ChCommandHeaderList cmdlist(1); 37 Tegra::ChCommandHeaderList cmdlist{{0xDEADB33F}};
38 cmdlist[0] = Tegra::ChCommandHeader{0xDEADB33F};
39 system.GPU().PushCommandBuffer(cmdlist); 38 system.GPU().PushCommandBuffer(cmdlist);
40 } 39 }
41 return UnmapBuffer(input, output); 40 return UnmapBuffer(input, output);
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
index 72499654c..70849a9bd 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -28,8 +28,13 @@ NvResult nvhost_vic::Ioctl1(Ioctl command, const std::vector<u8>& input, std::ve
28 return GetWaitbase(input, output); 28 return GetWaitbase(input, output);
29 case 0x9: 29 case 0x9:
30 return MapBuffer(input, output); 30 return MapBuffer(input, output);
31 case 0xa: 31 case 0xa: {
32 if (command.length == 0x1c) {
33 Tegra::ChCommandHeaderList cmdlist{{0xDEADB33F}};
34 system.GPU().PushCommandBuffer(cmdlist);
35 }
32 return UnmapBuffer(input, output); 36 return UnmapBuffer(input, output);
37 }
33 default: 38 default:
34 break; 39 break;
35 } 40 }
diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp
index 33b3c060b..a3fda1094 100644
--- a/src/video_core/cdma_pusher.cpp
+++ b/src/video_core/cdma_pusher.cpp
@@ -37,59 +37,43 @@ CDmaPusher::CDmaPusher(GPU& gpu_)
37 37
38CDmaPusher::~CDmaPusher() = default; 38CDmaPusher::~CDmaPusher() = default;
39 39
40void CDmaPusher::Push(ChCommandHeaderList&& entries) { 40void CDmaPusher::ProcessEntries(ChCommandHeaderList&& entries) {
41 cdma_queue.push(std::move(entries)); 41 for (const auto& value : entries) {
42}
43
44void CDmaPusher::DispatchCalls() {
45 while (!cdma_queue.empty()) {
46 Step();
47 }
48}
49
50void CDmaPusher::Step() {
51 const auto entries{cdma_queue.front()};
52 cdma_queue.pop();
53
54 std::vector<u32> values(entries.size());
55 std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32));
56
57 for (const u32 value : values) {
58 if (mask != 0) { 42 if (mask != 0) {
59 const auto lbs = static_cast<u32>(std::countr_zero(mask)); 43 const auto lbs = static_cast<u32>(std::countr_zero(mask));
60 mask &= ~(1U << lbs); 44 mask &= ~(1U << lbs);
61 ExecuteCommand(static_cast<u32>(offset + lbs), value); 45 ExecuteCommand(offset + lbs, value.raw);
62 continue; 46 continue;
63 } else if (count != 0) { 47 } else if (count != 0) {
64 --count; 48 --count;
65 ExecuteCommand(static_cast<u32>(offset), value); 49 ExecuteCommand(offset, value.raw);
66 if (incrementing) { 50 if (incrementing) {
67 ++offset; 51 ++offset;
68 } 52 }
69 continue; 53 continue;
70 } 54 }
71 const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf); 55 const auto mode = value.submission_mode.Value();
72 switch (mode) { 56 switch (mode) {
73 case ChSubmissionMode::SetClass: { 57 case ChSubmissionMode::SetClass: {
74 mask = value & 0x3f; 58 mask = value.value & 0x3f;
75 offset = (value >> 16) & 0xfff; 59 offset = value.method_offset;
76 current_class = static_cast<ChClassId>((value >> 6) & 0x3ff); 60 current_class = static_cast<ChClassId>((value.value >> 6) & 0x3ff);
77 break; 61 break;
78 } 62 }
79 case ChSubmissionMode::Incrementing: 63 case ChSubmissionMode::Incrementing:
80 case ChSubmissionMode::NonIncrementing: 64 case ChSubmissionMode::NonIncrementing:
81 count = value & 0xffff; 65 count = value.value;
82 offset = (value >> 16) & 0xfff; 66 offset = value.method_offset;
83 incrementing = mode == ChSubmissionMode::Incrementing; 67 incrementing = mode == ChSubmissionMode::Incrementing;
84 break; 68 break;
85 case ChSubmissionMode::Mask: 69 case ChSubmissionMode::Mask:
86 mask = value & 0xffff; 70 mask = value.value;
87 offset = (value >> 16) & 0xfff; 71 offset = value.method_offset;
88 break; 72 break;
89 case ChSubmissionMode::Immediate: { 73 case ChSubmissionMode::Immediate: {
90 const u32 data = value & 0xfff; 74 const u32 data = value.value & 0xfff;
91 offset = (value >> 16) & 0xfff; 75 offset = value.method_offset;
92 ExecuteCommand(static_cast<u32>(offset), data); 76 ExecuteCommand(offset, data);
93 break; 77 break;
94 } 78 }
95 default: 79 default:
@@ -102,8 +86,8 @@ void CDmaPusher::Step() {
102void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { 86void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) {
103 switch (current_class) { 87 switch (current_class) {
104 case ChClassId::NvDec: 88 case ChClassId::NvDec:
105 ThiStateWrite(nvdec_thi_state, state_offset, {data}); 89 ThiStateWrite(nvdec_thi_state, offset, data);
106 switch (static_cast<ThiMethod>(state_offset)) { 90 switch (static_cast<ThiMethod>(offset)) {
107 case ThiMethod::IncSyncpt: { 91 case ThiMethod::IncSyncpt: {
108 LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); 92 LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method");
109 const auto syncpoint_id = static_cast<u32>(data & 0xFF); 93 const auto syncpoint_id = static_cast<u32>(data & 0xFF);
@@ -120,7 +104,7 @@ void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) {
120 LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", 104 LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}",
121 static_cast<u32>(nvdec_thi_state.method_0)); 105 static_cast<u32>(nvdec_thi_state.method_0));
122 nvdec_processor->ProcessMethod(static_cast<Nvdec::Method>(nvdec_thi_state.method_0), 106 nvdec_processor->ProcessMethod(static_cast<Nvdec::Method>(nvdec_thi_state.method_0),
123 {data}); 107 data);
124 break; 108 break;
125 default: 109 default:
126 break; 110 break;
@@ -144,7 +128,7 @@ void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) {
144 case ThiMethod::SetMethod1: 128 case ThiMethod::SetMethod1:
145 LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", 129 LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})",
146 static_cast<u32>(vic_thi_state.method_0), data); 130 static_cast<u32>(vic_thi_state.method_0), data);
147 vic_processor->ProcessMethod(static_cast<Vic::Method>(vic_thi_state.method_0), {data}); 131 vic_processor->ProcessMethod(static_cast<Vic::Method>(vic_thi_state.method_0), data);
148 break; 132 break;
149 default: 133 default:
150 break; 134 break;
@@ -153,7 +137,7 @@ void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) {
153 case ChClassId::Host1x: 137 case ChClassId::Host1x:
154 // This device is mainly for syncpoint synchronization 138 // This device is mainly for syncpoint synchronization
155 LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); 139 LOG_DEBUG(Service_NVDRV, "Host1X Class Method");
156 host1x_processor->ProcessMethod(static_cast<Host1x::Method>(state_offset), {data}); 140 host1x_processor->ProcessMethod(static_cast<Host1x::Method>(offset), data);
157 break; 141 break;
158 default: 142 default:
159 UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class)); 143 UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class));
@@ -161,10 +145,9 @@ void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) {
161 } 145 }
162} 146}
163 147
164void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, 148void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, u32 argument) {
165 const std::vector<u32>& arguments) { 149 u8* const offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset;
166 u8* const state_offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset; 150 std::memcpy(offset_ptr, &argument, sizeof(u32));
167 std::memcpy(state_offset_ptr, arguments.data(), sizeof(u32) * arguments.size());
168} 151}
169 152
170} // namespace Tegra 153} // namespace Tegra
diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h
index e5f212c1a..1bada44dd 100644
--- a/src/video_core/cdma_pusher.h
+++ b/src/video_core/cdma_pusher.h
@@ -5,9 +5,7 @@
5#pragma once 5#pragma once
6 6
7#include <memory> 7#include <memory>
8#include <unordered_map>
9#include <vector> 8#include <vector>
10#include <queue>
11 9
12#include "common/bit_field.h" 10#include "common/bit_field.h"
13#include "common/common_types.h" 11#include "common/common_types.h"
@@ -16,9 +14,9 @@
16namespace Tegra { 14namespace Tegra {
17 15
18class GPU; 16class GPU;
17class Host1x;
19class Nvdec; 18class Nvdec;
20class Vic; 19class Vic;
21class Host1x;
22 20
23enum class ChSubmissionMode : u32 { 21enum class ChSubmissionMode : u32 {
24 SetClass = 0, 22 SetClass = 0,
@@ -48,16 +46,10 @@ enum class ChClassId : u32 {
48 NvDec = 0xf0 46 NvDec = 0xf0
49}; 47};
50 48
51enum class ChMethod : u32 {
52 Empty = 0,
53 SetMethod = 0x10,
54 SetData = 0x11,
55};
56
57union ChCommandHeader { 49union ChCommandHeader {
58 u32 raw; 50 u32 raw;
59 BitField<0, 16, u32> value; 51 BitField<0, 16, u32> value;
60 BitField<16, 12, ChMethod> method_offset; 52 BitField<16, 12, u32> method_offset;
61 BitField<28, 4, ChSubmissionMode> submission_mode; 53 BitField<28, 4, ChSubmissionMode> submission_mode;
62}; 54};
63static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size"); 55static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size");
@@ -99,21 +91,15 @@ public:
99 explicit CDmaPusher(GPU& gpu_); 91 explicit CDmaPusher(GPU& gpu_);
100 ~CDmaPusher(); 92 ~CDmaPusher();
101 93
102 /// Push NVDEC command buffer entries into queue 94 /// Process the command entry
103 void Push(ChCommandHeaderList&& entries); 95 void ProcessEntries(ChCommandHeaderList&& entries);
104
105 /// Process queued command buffer entries
106 void DispatchCalls();
107
108 /// Process one queue element
109 void Step();
110 96
97private:
111 /// Invoke command class devices to execute the command based on the current state 98 /// Invoke command class devices to execute the command based on the current state
112 void ExecuteCommand(u32 state_offset, u32 data); 99 void ExecuteCommand(u32 state_offset, u32 data);
113 100
114private:
115 /// Write arguments value to the ThiRegisters member at the specified offset 101 /// Write arguments value to the ThiRegisters member at the specified offset
116 void ThiStateWrite(ThiRegisters& state, u32 state_offset, const std::vector<u32>& arguments); 102 void ThiStateWrite(ThiRegisters& state, u32 offset, u32 argument);
117 103
118 GPU& gpu; 104 GPU& gpu;
119 std::shared_ptr<Tegra::Nvdec> nvdec_processor; 105 std::shared_ptr<Tegra::Nvdec> nvdec_processor;
@@ -124,13 +110,10 @@ private:
124 ThiRegisters vic_thi_state{}; 110 ThiRegisters vic_thi_state{};
125 ThiRegisters nvdec_thi_state{}; 111 ThiRegisters nvdec_thi_state{};
126 112
127 s32 count{}; 113 u32 count{};
128 s32 offset{}; 114 u32 offset{};
129 u32 mask{}; 115 u32 mask{};
130 bool incrementing{}; 116 bool incrementing{};
131
132 // Queue of command lists to be processed
133 std::queue<ChCommandHeaderList> cdma_queue;
134}; 117};
135 118
136} // namespace Tegra 119} // namespace Tegra
diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp
index 39bc923a5..d02dc6260 100644
--- a/src/video_core/command_classes/codecs/codec.cpp
+++ b/src/video_core/command_classes/codecs/codec.cpp
@@ -44,8 +44,10 @@ Codec::~Codec() {
44} 44}
45 45
46void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { 46void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
47 LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", codec); 47 if (current_codec != codec) {
48 current_codec = codec; 48 LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", static_cast<u32>(codec));
49 current_codec = codec;
50 }
49} 51}
50 52
51void Codec::StateWrite(u32 offset, u64 arguments) { 53void Codec::StateWrite(u32 offset, u64 arguments) {
@@ -55,7 +57,6 @@ void Codec::StateWrite(u32 offset, u64 arguments) {
55 57
56void Codec::Decode() { 58void Codec::Decode() {
57 bool is_first_frame = false; 59 bool is_first_frame = false;
58
59 if (!initialized) { 60 if (!initialized) {
60 if (current_codec == NvdecCommon::VideoCodec::H264) { 61 if (current_codec == NvdecCommon::VideoCodec::H264) {
61 av_codec = avcodec_find_decoder(AV_CODEC_ID_H264); 62 av_codec = avcodec_find_decoder(AV_CODEC_ID_H264);
diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp
index 79e1f4e13..e4f919afd 100644
--- a/src/video_core/command_classes/nvdec.cpp
+++ b/src/video_core/command_classes/nvdec.cpp
@@ -12,16 +12,16 @@ Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {}
12 12
13Nvdec::~Nvdec() = default; 13Nvdec::~Nvdec() = default;
14 14
15void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) { 15void Nvdec::ProcessMethod(Method method, u32 argument) {
16 if (method == Method::SetVideoCodec) { 16 if (method == Method::SetVideoCodec) {
17 codec->StateWrite(static_cast<u32>(method), arguments[0]); 17 codec->StateWrite(static_cast<u32>(method), argument);
18 } else { 18 } else {
19 codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8); 19 codec->StateWrite(static_cast<u32>(method), static_cast<u64>(argument) << 8);
20 } 20 }
21 21
22 switch (method) { 22 switch (method) {
23 case Method::SetVideoCodec: 23 case Method::SetVideoCodec:
24 codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0])); 24 codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(argument));
25 break; 25 break;
26 case Method::Execute: 26 case Method::Execute:
27 Execute(); 27 Execute();
diff --git a/src/video_core/command_classes/nvdec.h b/src/video_core/command_classes/nvdec.h
index e4877c533..e66be80b8 100644
--- a/src/video_core/command_classes/nvdec.h
+++ b/src/video_core/command_classes/nvdec.h
@@ -23,7 +23,7 @@ public:
23 ~Nvdec(); 23 ~Nvdec();
24 24
25 /// Writes the method into the state, Invoke Execute() if encountered 25 /// Writes the method into the state, Invoke Execute() if encountered
26 void ProcessMethod(Method method, const std::vector<u32>& arguments); 26 void ProcessMethod(Method method, u32 argument);
27 27
28 /// Return most recently decoded frame 28 /// Return most recently decoded frame
29 [[nodiscard]] AVFramePtr GetFrame(); 29 [[nodiscard]] AVFramePtr GetFrame();
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp
index 2b7569335..0a8b82f2b 100644
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@@ -18,18 +18,14 @@ extern "C" {
18namespace Tegra { 18namespace Tegra {
19 19
20Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) 20Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
21 : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {} 21 : gpu(gpu_),
22Vic::~Vic() = default; 22 nvdec_processor(std::move(nvdec_processor_)), converted_frame_buffer{nullptr, av_free} {}
23 23
24void Vic::VicStateWrite(u32 offset, u32 arguments) { 24Vic::~Vic() = default;
25 u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32);
26 std::memcpy(state_offset, &arguments, sizeof(u32));
27}
28 25
29void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) { 26void Vic::ProcessMethod(Method method, u32 argument) {
30 LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", method); 27 LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method));
31 VicStateWrite(static_cast<u32>(method), arguments[0]); 28 const u64 arg = static_cast<u64>(argument) << 8;
32 const u64 arg = static_cast<u64>(arguments[0]) << 8;
33 switch (method) { 29 switch (method) {
34 case Method::Execute: 30 case Method::Execute:
35 Execute(); 31 Execute();
@@ -53,8 +49,7 @@ void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) {
53 49
54void Vic::Execute() { 50void Vic::Execute() {
55 if (output_surface_luma_address == 0) { 51 if (output_surface_luma_address == 0) {
56 LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Received 0x{:X}", 52 LOG_ERROR(Service_NVDRV, "VIC Luma address not set.");
57 vic_state.output_surface.luma_offset);
58 return; 53 return;
59 } 54 }
60 const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)}; 55 const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
@@ -89,8 +84,10 @@ void Vic::Execute() {
89 // Get Converted frame 84 // Get Converted frame
90 const std::size_t linear_size = frame->width * frame->height * 4; 85 const std::size_t linear_size = frame->width * frame->height * 4;
91 86
92 using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>; 87 // Only allocate frame_buffer once per stream, as the size is not expected to change
93 AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free}; 88 if (!converted_frame_buffer) {
89 converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(linear_size)), av_free};
90 }
94 91
95 const int converted_stride{frame->width * 4}; 92 const int converted_stride{frame->width * 4};
96 u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; 93 u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
@@ -104,12 +101,12 @@ void Vic::Execute() {
104 const u32 block_height = static_cast<u32>(config.block_linear_height_log2); 101 const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
105 const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, 102 const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
106 block_height, 0); 103 block_height, 0);
107 std::vector<u8> swizzled_data(size); 104 luma_buffer.resize(size);
108 Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4, 105 Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4,
109 frame->width, 4, swizzled_data.data(), 106 frame->width, 4, luma_buffer.data(),
110 converted_frame_buffer.get(), block_height, 0, 0); 107 converted_frame_buffer.get(), block_height, 0, 0);
111 108
112 gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size); 109 gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
113 } else { 110 } else {
114 // send pitch linear frame 111 // send pitch linear frame
115 gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, 112 gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
@@ -132,15 +129,15 @@ void Vic::Execute() {
132 const auto stride = frame->linesize[0]; 129 const auto stride = frame->linesize[0];
133 const auto half_stride = frame->linesize[1]; 130 const auto half_stride = frame->linesize[1];
134 131
135 std::vector<u8> luma_buffer(aligned_width * surface_height); 132 luma_buffer.resize(aligned_width * surface_height);
136 std::vector<u8> chroma_buffer(aligned_width * half_height); 133 chroma_buffer.resize(aligned_width * half_height);
137 134
138 // Populate luma buffer 135 // Populate luma buffer
139 for (std::size_t y = 0; y < surface_height - 1; ++y) { 136 for (std::size_t y = 0; y < surface_height - 1; ++y) {
140 std::size_t src = y * stride; 137 const std::size_t src = y * stride;
141 std::size_t dst = y * aligned_width; 138 const std::size_t dst = y * aligned_width;
142 139
143 std::size_t size = surface_width; 140 const std::size_t size = surface_width;
144 141
145 for (std::size_t offset = 0; offset < size; ++offset) { 142 for (std::size_t offset = 0; offset < size; ++offset) {
146 luma_buffer[dst + offset] = luma_ptr[src + offset]; 143 luma_buffer[dst + offset] = luma_ptr[src + offset];
@@ -151,8 +148,8 @@ void Vic::Execute() {
151 148
152 // Populate chroma buffer from both channels with interleaving. 149 // Populate chroma buffer from both channels with interleaving.
153 for (std::size_t y = 0; y < half_height; ++y) { 150 for (std::size_t y = 0; y < half_height; ++y) {
154 std::size_t src = y * half_stride; 151 const std::size_t src = y * half_stride;
155 std::size_t dst = y * aligned_width; 152 const std::size_t dst = y * aligned_width;
156 153
157 for (std::size_t x = 0; x < half_width; ++x) { 154 for (std::size_t x = 0; x < half_width; ++x) {
158 chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x]; 155 chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h
index 8c4e284a1..f5a2ed100 100644
--- a/src/video_core/command_classes/vic.h
+++ b/src/video_core/command_classes/vic.h
@@ -15,43 +15,6 @@ namespace Tegra {
15class GPU; 15class GPU;
16class Nvdec; 16class Nvdec;
17 17
18struct PlaneOffsets {
19 u32 luma_offset{};
20 u32 chroma_u_offset{};
21 u32 chroma_v_offset{};
22};
23
24struct VicRegisters {
25 INSERT_PADDING_WORDS(64);
26 u32 nop{};
27 INSERT_PADDING_WORDS(15);
28 u32 pm_trigger{};
29 INSERT_PADDING_WORDS(47);
30 u32 set_application_id{};
31 u32 set_watchdog_timer{};
32 INSERT_PADDING_WORDS(17);
33 u32 context_save_area{};
34 u32 context_switch{};
35 INSERT_PADDING_WORDS(43);
36 u32 execute{};
37 INSERT_PADDING_WORDS(63);
38 std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{};
39 u32 picture_index{};
40 u32 control_params{};
41 u32 config_struct_offset{};
42 u32 filter_struct_offset{};
43 u32 palette_offset{};
44 u32 hist_offset{};
45 u32 context_id{};
46 u32 fce_ucode_size{};
47 PlaneOffsets output_surface{};
48 u32 fce_ucode_offset{};
49 INSERT_PADDING_WORDS(4);
50 std::array<u32, 8> slot_context_id{};
51 INSERT_PADDING_WORDS(16);
52};
53static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size");
54
55class Vic { 18class Vic {
56public: 19public:
57 enum class Method : u32 { 20 enum class Method : u32 {
@@ -67,14 +30,11 @@ public:
67 ~Vic(); 30 ~Vic();
68 31
69 /// Write to the device state. 32 /// Write to the device state.
70 void ProcessMethod(Method method, const std::vector<u32>& arguments); 33 void ProcessMethod(Method method, u32 argument);
71 34
72private: 35private:
73 void Execute(); 36 void Execute();
74 37
75 void VicStateWrite(u32 offset, u32 arguments);
76 VicRegisters vic_state{};
77
78 enum class VideoPixelFormat : u64_le { 38 enum class VideoPixelFormat : u64_le {
79 RGBA8 = 0x1f, 39 RGBA8 = 0x1f,
80 BGRA8 = 0x20, 40 BGRA8 = 0x20,
@@ -88,8 +48,6 @@ private:
88 BitField<9, 2, u64_le> chroma_loc_vert; 48 BitField<9, 2, u64_le> chroma_loc_vert;
89 BitField<11, 4, u64_le> block_linear_kind; 49 BitField<11, 4, u64_le> block_linear_kind;
90 BitField<15, 4, u64_le> block_linear_height_log2; 50 BitField<15, 4, u64_le> block_linear_height_log2;
91 BitField<19, 3, u64_le> reserved0;
92 BitField<22, 10, u64_le> reserved1;
93 BitField<32, 14, u64_le> surface_width_minus1; 51 BitField<32, 14, u64_le> surface_width_minus1;
94 BitField<46, 14, u64_le> surface_height_minus1; 52 BitField<46, 14, u64_le> surface_height_minus1;
95 }; 53 };
@@ -97,6 +55,13 @@ private:
97 GPU& gpu; 55 GPU& gpu;
98 std::shared_ptr<Tegra::Nvdec> nvdec_processor; 56 std::shared_ptr<Tegra::Nvdec> nvdec_processor;
99 57
58 /// Avoid reallocation of the following buffers every frame, as their
59 /// size does not change during a stream
60 using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
61 AVMallocPtr converted_frame_buffer;
62 std::vector<u8> luma_buffer;
63 std::vector<u8> chroma_buffer;
64
100 GPUVAddr config_struct_address{}; 65 GPUVAddr config_struct_address{};
101 GPUVAddr output_surface_luma_address{}; 66 GPUVAddr output_surface_luma_address{};
102 GPUVAddr output_surface_chroma_u_address{}; 67 GPUVAddr output_surface_chroma_u_address{};
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 2a9bd4121..51c63af4a 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -30,8 +30,7 @@ MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
30 30
31GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) 31GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
32 : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, 32 : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
33 dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, 33 dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, use_nvdec{use_nvdec_},
34 cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_},
35 maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, 34 maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
36 fermi_2d{std::make_unique<Engines::Fermi2D>()}, 35 fermi_2d{std::make_unique<Engines::Fermi2D>()},
37 kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, 36 kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
@@ -494,8 +493,7 @@ void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
494 // TODO(ameerj): RE proper async nvdec operation 493 // TODO(ameerj): RE proper async nvdec operation
495 // gpu_thread.SubmitCommandBuffer(std::move(entries)); 494 // gpu_thread.SubmitCommandBuffer(std::move(entries));
496 495
497 cdma_pusher->Push(std::move(entries)); 496 cdma_pusher->ProcessEntries(std::move(entries));
498 cdma_pusher->DispatchCalls();
499} 497}
500 498
501void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { 499void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 50319f1d5..eb0e43c0c 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -48,8 +48,7 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
48 dma_pusher.DispatchCalls(); 48 dma_pusher.DispatchCalls();
49 } else if (auto* command_list = std::get_if<SubmitChCommandEntries>(&next.data)) { 49 } else if (auto* command_list = std::get_if<SubmitChCommandEntries>(&next.data)) {
50 // NVDEC 50 // NVDEC
51 cdma_pusher.Push(std::move(command_list->entries)); 51 cdma_pusher.ProcessEntries(std::move(command_list->entries));
52 cdma_pusher.DispatchCalls();
53 } else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) { 52 } else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) {
54 renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr); 53 renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
55 } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) { 54 } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {