diff options
| author | 2021-02-18 15:12:07 -0800 | |
|---|---|---|
| committer | 2021-02-18 15:12:07 -0800 | |
| commit | 9cae3e6e90f840903a0072b916e49f24d0f6cb10 (patch) | |
| tree | 79511308066a4fbc11aa2e9058b0aa65772cc30a /src | |
| parent | Merge pull request #5955 from yuzu-emu/revert-3603-port-5123 (diff) | |
| parent | rebase, fix name shadowing, more const (diff) | |
| download | yuzu-9cae3e6e90f840903a0072b916e49f24d0f6cb10.tar.gz yuzu-9cae3e6e90f840903a0072b916e49f24d0f6cb10.tar.xz yuzu-9cae3e6e90f840903a0072b916e49f24d0f6cb10.zip | |
Merge pull request #4973 from ameerj/nvdec-opt
nvdec: Reuse allocated buffers and general cleanup
Diffstat (limited to 'src')
| -rw-r--r-- | src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp | 3 | ||||
| -rw-r--r-- | src/core/hle/service/nvdrv/devices/nvhost_vic.cpp | 7 | ||||
| -rw-r--r-- | src/video_core/cdma_pusher.cpp | 63 | ||||
| -rw-r--r-- | src/video_core/cdma_pusher.h | 33 | ||||
| -rw-r--r-- | src/video_core/command_classes/codecs/codec.cpp | 7 | ||||
| -rw-r--r-- | src/video_core/command_classes/nvdec.cpp | 8 | ||||
| -rw-r--r-- | src/video_core/command_classes/nvdec.h | 2 | ||||
| -rw-r--r-- | src/video_core/command_classes/vic.cpp | 45 | ||||
| -rw-r--r-- | src/video_core/command_classes/vic.h | 51 | ||||
| -rw-r--r-- | src/video_core/gpu.cpp | 6 | ||||
| -rw-r--r-- | src/video_core/gpu_thread.cpp | 3 |
11 files changed, 79 insertions, 149 deletions
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp index 36970f828..ecba1dba1 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp | |||
| @@ -34,8 +34,7 @@ NvResult nvhost_nvdec::Ioctl1(Ioctl command, const std::vector<u8>& input, | |||
| 34 | case 0xa: { | 34 | case 0xa: { |
| 35 | if (command.length == 0x1c) { | 35 | if (command.length == 0x1c) { |
| 36 | LOG_INFO(Service_NVDRV, "NVDEC video stream ended"); | 36 | LOG_INFO(Service_NVDRV, "NVDEC video stream ended"); |
| 37 | Tegra::ChCommandHeaderList cmdlist(1); | 37 | Tegra::ChCommandHeaderList cmdlist{{0xDEADB33F}}; |
| 38 | cmdlist[0] = Tegra::ChCommandHeader{0xDEADB33F}; | ||
| 39 | system.GPU().PushCommandBuffer(cmdlist); | 38 | system.GPU().PushCommandBuffer(cmdlist); |
| 40 | } | 39 | } |
| 41 | return UnmapBuffer(input, output); | 40 | return UnmapBuffer(input, output); |
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp index 72499654c..70849a9bd 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp | |||
| @@ -28,8 +28,13 @@ NvResult nvhost_vic::Ioctl1(Ioctl command, const std::vector<u8>& input, std::ve | |||
| 28 | return GetWaitbase(input, output); | 28 | return GetWaitbase(input, output); |
| 29 | case 0x9: | 29 | case 0x9: |
| 30 | return MapBuffer(input, output); | 30 | return MapBuffer(input, output); |
| 31 | case 0xa: | 31 | case 0xa: { |
| 32 | if (command.length == 0x1c) { | ||
| 33 | Tegra::ChCommandHeaderList cmdlist{{0xDEADB33F}}; | ||
| 34 | system.GPU().PushCommandBuffer(cmdlist); | ||
| 35 | } | ||
| 32 | return UnmapBuffer(input, output); | 36 | return UnmapBuffer(input, output); |
| 37 | } | ||
| 33 | default: | 38 | default: |
| 34 | break; | 39 | break; |
| 35 | } | 40 | } |
diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp index 33b3c060b..a3fda1094 100644 --- a/src/video_core/cdma_pusher.cpp +++ b/src/video_core/cdma_pusher.cpp | |||
| @@ -37,59 +37,43 @@ CDmaPusher::CDmaPusher(GPU& gpu_) | |||
| 37 | 37 | ||
| 38 | CDmaPusher::~CDmaPusher() = default; | 38 | CDmaPusher::~CDmaPusher() = default; |
| 39 | 39 | ||
| 40 | void CDmaPusher::Push(ChCommandHeaderList&& entries) { | 40 | void CDmaPusher::ProcessEntries(ChCommandHeaderList&& entries) { |
| 41 | cdma_queue.push(std::move(entries)); | 41 | for (const auto& value : entries) { |
| 42 | } | ||
| 43 | |||
| 44 | void CDmaPusher::DispatchCalls() { | ||
| 45 | while (!cdma_queue.empty()) { | ||
| 46 | Step(); | ||
| 47 | } | ||
| 48 | } | ||
| 49 | |||
| 50 | void CDmaPusher::Step() { | ||
| 51 | const auto entries{cdma_queue.front()}; | ||
| 52 | cdma_queue.pop(); | ||
| 53 | |||
| 54 | std::vector<u32> values(entries.size()); | ||
| 55 | std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32)); | ||
| 56 | |||
| 57 | for (const u32 value : values) { | ||
| 58 | if (mask != 0) { | 42 | if (mask != 0) { |
| 59 | const auto lbs = static_cast<u32>(std::countr_zero(mask)); | 43 | const auto lbs = static_cast<u32>(std::countr_zero(mask)); |
| 60 | mask &= ~(1U << lbs); | 44 | mask &= ~(1U << lbs); |
| 61 | ExecuteCommand(static_cast<u32>(offset + lbs), value); | 45 | ExecuteCommand(offset + lbs, value.raw); |
| 62 | continue; | 46 | continue; |
| 63 | } else if (count != 0) { | 47 | } else if (count != 0) { |
| 64 | --count; | 48 | --count; |
| 65 | ExecuteCommand(static_cast<u32>(offset), value); | 49 | ExecuteCommand(offset, value.raw); |
| 66 | if (incrementing) { | 50 | if (incrementing) { |
| 67 | ++offset; | 51 | ++offset; |
| 68 | } | 52 | } |
| 69 | continue; | 53 | continue; |
| 70 | } | 54 | } |
| 71 | const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf); | 55 | const auto mode = value.submission_mode.Value(); |
| 72 | switch (mode) { | 56 | switch (mode) { |
| 73 | case ChSubmissionMode::SetClass: { | 57 | case ChSubmissionMode::SetClass: { |
| 74 | mask = value & 0x3f; | 58 | mask = value.value & 0x3f; |
| 75 | offset = (value >> 16) & 0xfff; | 59 | offset = value.method_offset; |
| 76 | current_class = static_cast<ChClassId>((value >> 6) & 0x3ff); | 60 | current_class = static_cast<ChClassId>((value.value >> 6) & 0x3ff); |
| 77 | break; | 61 | break; |
| 78 | } | 62 | } |
| 79 | case ChSubmissionMode::Incrementing: | 63 | case ChSubmissionMode::Incrementing: |
| 80 | case ChSubmissionMode::NonIncrementing: | 64 | case ChSubmissionMode::NonIncrementing: |
| 81 | count = value & 0xffff; | 65 | count = value.value; |
| 82 | offset = (value >> 16) & 0xfff; | 66 | offset = value.method_offset; |
| 83 | incrementing = mode == ChSubmissionMode::Incrementing; | 67 | incrementing = mode == ChSubmissionMode::Incrementing; |
| 84 | break; | 68 | break; |
| 85 | case ChSubmissionMode::Mask: | 69 | case ChSubmissionMode::Mask: |
| 86 | mask = value & 0xffff; | 70 | mask = value.value; |
| 87 | offset = (value >> 16) & 0xfff; | 71 | offset = value.method_offset; |
| 88 | break; | 72 | break; |
| 89 | case ChSubmissionMode::Immediate: { | 73 | case ChSubmissionMode::Immediate: { |
| 90 | const u32 data = value & 0xfff; | 74 | const u32 data = value.value & 0xfff; |
| 91 | offset = (value >> 16) & 0xfff; | 75 | offset = value.method_offset; |
| 92 | ExecuteCommand(static_cast<u32>(offset), data); | 76 | ExecuteCommand(offset, data); |
| 93 | break; | 77 | break; |
| 94 | } | 78 | } |
| 95 | default: | 79 | default: |
| @@ -102,8 +86,8 @@ void CDmaPusher::Step() { | |||
| 102 | void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { | 86 | void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { |
| 103 | switch (current_class) { | 87 | switch (current_class) { |
| 104 | case ChClassId::NvDec: | 88 | case ChClassId::NvDec: |
| 105 | ThiStateWrite(nvdec_thi_state, state_offset, {data}); | 89 | ThiStateWrite(nvdec_thi_state, offset, data); |
| 106 | switch (static_cast<ThiMethod>(state_offset)) { | 90 | switch (static_cast<ThiMethod>(offset)) { |
| 107 | case ThiMethod::IncSyncpt: { | 91 | case ThiMethod::IncSyncpt: { |
| 108 | LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); | 92 | LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); |
| 109 | const auto syncpoint_id = static_cast<u32>(data & 0xFF); | 93 | const auto syncpoint_id = static_cast<u32>(data & 0xFF); |
| @@ -120,7 +104,7 @@ void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { | |||
| 120 | LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", | 104 | LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", |
| 121 | static_cast<u32>(nvdec_thi_state.method_0)); | 105 | static_cast<u32>(nvdec_thi_state.method_0)); |
| 122 | nvdec_processor->ProcessMethod(static_cast<Nvdec::Method>(nvdec_thi_state.method_0), | 106 | nvdec_processor->ProcessMethod(static_cast<Nvdec::Method>(nvdec_thi_state.method_0), |
| 123 | {data}); | 107 | data); |
| 124 | break; | 108 | break; |
| 125 | default: | 109 | default: |
| 126 | break; | 110 | break; |
| @@ -144,7 +128,7 @@ void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { | |||
| 144 | case ThiMethod::SetMethod1: | 128 | case ThiMethod::SetMethod1: |
| 145 | LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", | 129 | LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", |
| 146 | static_cast<u32>(vic_thi_state.method_0), data); | 130 | static_cast<u32>(vic_thi_state.method_0), data); |
| 147 | vic_processor->ProcessMethod(static_cast<Vic::Method>(vic_thi_state.method_0), {data}); | 131 | vic_processor->ProcessMethod(static_cast<Vic::Method>(vic_thi_state.method_0), data); |
| 148 | break; | 132 | break; |
| 149 | default: | 133 | default: |
| 150 | break; | 134 | break; |
| @@ -153,7 +137,7 @@ void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { | |||
| 153 | case ChClassId::Host1x: | 137 | case ChClassId::Host1x: |
| 154 | // This device is mainly for syncpoint synchronization | 138 | // This device is mainly for syncpoint synchronization |
| 155 | LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); | 139 | LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); |
| 156 | host1x_processor->ProcessMethod(static_cast<Host1x::Method>(state_offset), {data}); | 140 | host1x_processor->ProcessMethod(static_cast<Host1x::Method>(offset), data); |
| 157 | break; | 141 | break; |
| 158 | default: | 142 | default: |
| 159 | UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class)); | 143 | UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class)); |
| @@ -161,10 +145,9 @@ void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { | |||
| 161 | } | 145 | } |
| 162 | } | 146 | } |
| 163 | 147 | ||
| 164 | void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, | 148 | void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, u32 argument) { |
| 165 | const std::vector<u32>& arguments) { | 149 | u8* const offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset; |
| 166 | u8* const state_offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset; | 150 | std::memcpy(offset_ptr, &argument, sizeof(u32)); |
| 167 | std::memcpy(state_offset_ptr, arguments.data(), sizeof(u32) * arguments.size()); | ||
| 168 | } | 151 | } |
| 169 | 152 | ||
| 170 | } // namespace Tegra | 153 | } // namespace Tegra |
diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index e5f212c1a..1bada44dd 100644 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h | |||
| @@ -5,9 +5,7 @@ | |||
| 5 | #pragma once | 5 | #pragma once |
| 6 | 6 | ||
| 7 | #include <memory> | 7 | #include <memory> |
| 8 | #include <unordered_map> | ||
| 9 | #include <vector> | 8 | #include <vector> |
| 10 | #include <queue> | ||
| 11 | 9 | ||
| 12 | #include "common/bit_field.h" | 10 | #include "common/bit_field.h" |
| 13 | #include "common/common_types.h" | 11 | #include "common/common_types.h" |
| @@ -16,9 +14,9 @@ | |||
| 16 | namespace Tegra { | 14 | namespace Tegra { |
| 17 | 15 | ||
| 18 | class GPU; | 16 | class GPU; |
| 17 | class Host1x; | ||
| 19 | class Nvdec; | 18 | class Nvdec; |
| 20 | class Vic; | 19 | class Vic; |
| 21 | class Host1x; | ||
| 22 | 20 | ||
| 23 | enum class ChSubmissionMode : u32 { | 21 | enum class ChSubmissionMode : u32 { |
| 24 | SetClass = 0, | 22 | SetClass = 0, |
| @@ -48,16 +46,10 @@ enum class ChClassId : u32 { | |||
| 48 | NvDec = 0xf0 | 46 | NvDec = 0xf0 |
| 49 | }; | 47 | }; |
| 50 | 48 | ||
| 51 | enum class ChMethod : u32 { | ||
| 52 | Empty = 0, | ||
| 53 | SetMethod = 0x10, | ||
| 54 | SetData = 0x11, | ||
| 55 | }; | ||
| 56 | |||
| 57 | union ChCommandHeader { | 49 | union ChCommandHeader { |
| 58 | u32 raw; | 50 | u32 raw; |
| 59 | BitField<0, 16, u32> value; | 51 | BitField<0, 16, u32> value; |
| 60 | BitField<16, 12, ChMethod> method_offset; | 52 | BitField<16, 12, u32> method_offset; |
| 61 | BitField<28, 4, ChSubmissionMode> submission_mode; | 53 | BitField<28, 4, ChSubmissionMode> submission_mode; |
| 62 | }; | 54 | }; |
| 63 | static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size"); | 55 | static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size"); |
| @@ -99,21 +91,15 @@ public: | |||
| 99 | explicit CDmaPusher(GPU& gpu_); | 91 | explicit CDmaPusher(GPU& gpu_); |
| 100 | ~CDmaPusher(); | 92 | ~CDmaPusher(); |
| 101 | 93 | ||
| 102 | /// Push NVDEC command buffer entries into queue | 94 | /// Process the command entry |
| 103 | void Push(ChCommandHeaderList&& entries); | 95 | void ProcessEntries(ChCommandHeaderList&& entries); |
| 104 | |||
| 105 | /// Process queued command buffer entries | ||
| 106 | void DispatchCalls(); | ||
| 107 | |||
| 108 | /// Process one queue element | ||
| 109 | void Step(); | ||
| 110 | 96 | ||
| 97 | private: | ||
| 111 | /// Invoke command class devices to execute the command based on the current state | 98 | /// Invoke command class devices to execute the command based on the current state |
| 112 | void ExecuteCommand(u32 state_offset, u32 data); | 99 | void ExecuteCommand(u32 state_offset, u32 data); |
| 113 | 100 | ||
| 114 | private: | ||
| 115 | /// Write arguments value to the ThiRegisters member at the specified offset | 101 | /// Write arguments value to the ThiRegisters member at the specified offset |
| 116 | void ThiStateWrite(ThiRegisters& state, u32 state_offset, const std::vector<u32>& arguments); | 102 | void ThiStateWrite(ThiRegisters& state, u32 offset, u32 argument); |
| 117 | 103 | ||
| 118 | GPU& gpu; | 104 | GPU& gpu; |
| 119 | std::shared_ptr<Tegra::Nvdec> nvdec_processor; | 105 | std::shared_ptr<Tegra::Nvdec> nvdec_processor; |
| @@ -124,13 +110,10 @@ private: | |||
| 124 | ThiRegisters vic_thi_state{}; | 110 | ThiRegisters vic_thi_state{}; |
| 125 | ThiRegisters nvdec_thi_state{}; | 111 | ThiRegisters nvdec_thi_state{}; |
| 126 | 112 | ||
| 127 | s32 count{}; | 113 | u32 count{}; |
| 128 | s32 offset{}; | 114 | u32 offset{}; |
| 129 | u32 mask{}; | 115 | u32 mask{}; |
| 130 | bool incrementing{}; | 116 | bool incrementing{}; |
| 131 | |||
| 132 | // Queue of command lists to be processed | ||
| 133 | std::queue<ChCommandHeaderList> cdma_queue; | ||
| 134 | }; | 117 | }; |
| 135 | 118 | ||
| 136 | } // namespace Tegra | 119 | } // namespace Tegra |
diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp index 39bc923a5..d02dc6260 100644 --- a/src/video_core/command_classes/codecs/codec.cpp +++ b/src/video_core/command_classes/codecs/codec.cpp | |||
| @@ -44,8 +44,10 @@ Codec::~Codec() { | |||
| 44 | } | 44 | } |
| 45 | 45 | ||
| 46 | void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { | 46 | void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { |
| 47 | LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", codec); | 47 | if (current_codec != codec) { |
| 48 | current_codec = codec; | 48 | LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", static_cast<u32>(codec)); |
| 49 | current_codec = codec; | ||
| 50 | } | ||
| 49 | } | 51 | } |
| 50 | 52 | ||
| 51 | void Codec::StateWrite(u32 offset, u64 arguments) { | 53 | void Codec::StateWrite(u32 offset, u64 arguments) { |
| @@ -55,7 +57,6 @@ void Codec::StateWrite(u32 offset, u64 arguments) { | |||
| 55 | 57 | ||
| 56 | void Codec::Decode() { | 58 | void Codec::Decode() { |
| 57 | bool is_first_frame = false; | 59 | bool is_first_frame = false; |
| 58 | |||
| 59 | if (!initialized) { | 60 | if (!initialized) { |
| 60 | if (current_codec == NvdecCommon::VideoCodec::H264) { | 61 | if (current_codec == NvdecCommon::VideoCodec::H264) { |
| 61 | av_codec = avcodec_find_decoder(AV_CODEC_ID_H264); | 62 | av_codec = avcodec_find_decoder(AV_CODEC_ID_H264); |
diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp index 79e1f4e13..e4f919afd 100644 --- a/src/video_core/command_classes/nvdec.cpp +++ b/src/video_core/command_classes/nvdec.cpp | |||
| @@ -12,16 +12,16 @@ Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {} | |||
| 12 | 12 | ||
| 13 | Nvdec::~Nvdec() = default; | 13 | Nvdec::~Nvdec() = default; |
| 14 | 14 | ||
| 15 | void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) { | 15 | void Nvdec::ProcessMethod(Method method, u32 argument) { |
| 16 | if (method == Method::SetVideoCodec) { | 16 | if (method == Method::SetVideoCodec) { |
| 17 | codec->StateWrite(static_cast<u32>(method), arguments[0]); | 17 | codec->StateWrite(static_cast<u32>(method), argument); |
| 18 | } else { | 18 | } else { |
| 19 | codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8); | 19 | codec->StateWrite(static_cast<u32>(method), static_cast<u64>(argument) << 8); |
| 20 | } | 20 | } |
| 21 | 21 | ||
| 22 | switch (method) { | 22 | switch (method) { |
| 23 | case Method::SetVideoCodec: | 23 | case Method::SetVideoCodec: |
| 24 | codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0])); | 24 | codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(argument)); |
| 25 | break; | 25 | break; |
| 26 | case Method::Execute: | 26 | case Method::Execute: |
| 27 | Execute(); | 27 | Execute(); |
diff --git a/src/video_core/command_classes/nvdec.h b/src/video_core/command_classes/nvdec.h index e4877c533..e66be80b8 100644 --- a/src/video_core/command_classes/nvdec.h +++ b/src/video_core/command_classes/nvdec.h | |||
| @@ -23,7 +23,7 @@ public: | |||
| 23 | ~Nvdec(); | 23 | ~Nvdec(); |
| 24 | 24 | ||
| 25 | /// Writes the method into the state, Invoke Execute() if encountered | 25 | /// Writes the method into the state, Invoke Execute() if encountered |
| 26 | void ProcessMethod(Method method, const std::vector<u32>& arguments); | 26 | void ProcessMethod(Method method, u32 argument); |
| 27 | 27 | ||
| 28 | /// Return most recently decoded frame | 28 | /// Return most recently decoded frame |
| 29 | [[nodiscard]] AVFramePtr GetFrame(); | 29 | [[nodiscard]] AVFramePtr GetFrame(); |
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index 2b7569335..0a8b82f2b 100644 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp | |||
| @@ -18,18 +18,14 @@ extern "C" { | |||
| 18 | namespace Tegra { | 18 | namespace Tegra { |
| 19 | 19 | ||
| 20 | Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) | 20 | Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) |
| 21 | : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {} | 21 | : gpu(gpu_), |
| 22 | Vic::~Vic() = default; | 22 | nvdec_processor(std::move(nvdec_processor_)), converted_frame_buffer{nullptr, av_free} {} |
| 23 | 23 | ||
| 24 | void Vic::VicStateWrite(u32 offset, u32 arguments) { | 24 | Vic::~Vic() = default; |
| 25 | u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32); | ||
| 26 | std::memcpy(state_offset, &arguments, sizeof(u32)); | ||
| 27 | } | ||
| 28 | 25 | ||
| 29 | void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) { | 26 | void Vic::ProcessMethod(Method method, u32 argument) { |
| 30 | LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", method); | 27 | LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method)); |
| 31 | VicStateWrite(static_cast<u32>(method), arguments[0]); | 28 | const u64 arg = static_cast<u64>(argument) << 8; |
| 32 | const u64 arg = static_cast<u64>(arguments[0]) << 8; | ||
| 33 | switch (method) { | 29 | switch (method) { |
| 34 | case Method::Execute: | 30 | case Method::Execute: |
| 35 | Execute(); | 31 | Execute(); |
| @@ -53,8 +49,7 @@ void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) { | |||
| 53 | 49 | ||
| 54 | void Vic::Execute() { | 50 | void Vic::Execute() { |
| 55 | if (output_surface_luma_address == 0) { | 51 | if (output_surface_luma_address == 0) { |
| 56 | LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Received 0x{:X}", | 52 | LOG_ERROR(Service_NVDRV, "VIC Luma address not set."); |
| 57 | vic_state.output_surface.luma_offset); | ||
| 58 | return; | 53 | return; |
| 59 | } | 54 | } |
| 60 | const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)}; | 55 | const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)}; |
| @@ -89,8 +84,10 @@ void Vic::Execute() { | |||
| 89 | // Get Converted frame | 84 | // Get Converted frame |
| 90 | const std::size_t linear_size = frame->width * frame->height * 4; | 85 | const std::size_t linear_size = frame->width * frame->height * 4; |
| 91 | 86 | ||
| 92 | using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>; | 87 | // Only allocate frame_buffer once per stream, as the size is not expected to change |
| 93 | AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free}; | 88 | if (!converted_frame_buffer) { |
| 89 | converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(linear_size)), av_free}; | ||
| 90 | } | ||
| 94 | 91 | ||
| 95 | const int converted_stride{frame->width * 4}; | 92 | const int converted_stride{frame->width * 4}; |
| 96 | u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; | 93 | u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; |
| @@ -104,12 +101,12 @@ void Vic::Execute() { | |||
| 104 | const u32 block_height = static_cast<u32>(config.block_linear_height_log2); | 101 | const u32 block_height = static_cast<u32>(config.block_linear_height_log2); |
| 105 | const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, | 102 | const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, |
| 106 | block_height, 0); | 103 | block_height, 0); |
| 107 | std::vector<u8> swizzled_data(size); | 104 | luma_buffer.resize(size); |
| 108 | Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4, | 105 | Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4, |
| 109 | frame->width, 4, swizzled_data.data(), | 106 | frame->width, 4, luma_buffer.data(), |
| 110 | converted_frame_buffer.get(), block_height, 0, 0); | 107 | converted_frame_buffer.get(), block_height, 0, 0); |
| 111 | 108 | ||
| 112 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size); | 109 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); |
| 113 | } else { | 110 | } else { |
| 114 | // send pitch linear frame | 111 | // send pitch linear frame |
| 115 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, | 112 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, |
| @@ -132,15 +129,15 @@ void Vic::Execute() { | |||
| 132 | const auto stride = frame->linesize[0]; | 129 | const auto stride = frame->linesize[0]; |
| 133 | const auto half_stride = frame->linesize[1]; | 130 | const auto half_stride = frame->linesize[1]; |
| 134 | 131 | ||
| 135 | std::vector<u8> luma_buffer(aligned_width * surface_height); | 132 | luma_buffer.resize(aligned_width * surface_height); |
| 136 | std::vector<u8> chroma_buffer(aligned_width * half_height); | 133 | chroma_buffer.resize(aligned_width * half_height); |
| 137 | 134 | ||
| 138 | // Populate luma buffer | 135 | // Populate luma buffer |
| 139 | for (std::size_t y = 0; y < surface_height - 1; ++y) { | 136 | for (std::size_t y = 0; y < surface_height - 1; ++y) { |
| 140 | std::size_t src = y * stride; | 137 | const std::size_t src = y * stride; |
| 141 | std::size_t dst = y * aligned_width; | 138 | const std::size_t dst = y * aligned_width; |
| 142 | 139 | ||
| 143 | std::size_t size = surface_width; | 140 | const std::size_t size = surface_width; |
| 144 | 141 | ||
| 145 | for (std::size_t offset = 0; offset < size; ++offset) { | 142 | for (std::size_t offset = 0; offset < size; ++offset) { |
| 146 | luma_buffer[dst + offset] = luma_ptr[src + offset]; | 143 | luma_buffer[dst + offset] = luma_ptr[src + offset]; |
| @@ -151,8 +148,8 @@ void Vic::Execute() { | |||
| 151 | 148 | ||
| 152 | // Populate chroma buffer from both channels with interleaving. | 149 | // Populate chroma buffer from both channels with interleaving. |
| 153 | for (std::size_t y = 0; y < half_height; ++y) { | 150 | for (std::size_t y = 0; y < half_height; ++y) { |
| 154 | std::size_t src = y * half_stride; | 151 | const std::size_t src = y * half_stride; |
| 155 | std::size_t dst = y * aligned_width; | 152 | const std::size_t dst = y * aligned_width; |
| 156 | 153 | ||
| 157 | for (std::size_t x = 0; x < half_width; ++x) { | 154 | for (std::size_t x = 0; x < half_width; ++x) { |
| 158 | chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x]; | 155 | chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x]; |
diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h index 8c4e284a1..f5a2ed100 100644 --- a/src/video_core/command_classes/vic.h +++ b/src/video_core/command_classes/vic.h | |||
| @@ -15,43 +15,6 @@ namespace Tegra { | |||
| 15 | class GPU; | 15 | class GPU; |
| 16 | class Nvdec; | 16 | class Nvdec; |
| 17 | 17 | ||
| 18 | struct PlaneOffsets { | ||
| 19 | u32 luma_offset{}; | ||
| 20 | u32 chroma_u_offset{}; | ||
| 21 | u32 chroma_v_offset{}; | ||
| 22 | }; | ||
| 23 | |||
| 24 | struct VicRegisters { | ||
| 25 | INSERT_PADDING_WORDS(64); | ||
| 26 | u32 nop{}; | ||
| 27 | INSERT_PADDING_WORDS(15); | ||
| 28 | u32 pm_trigger{}; | ||
| 29 | INSERT_PADDING_WORDS(47); | ||
| 30 | u32 set_application_id{}; | ||
| 31 | u32 set_watchdog_timer{}; | ||
| 32 | INSERT_PADDING_WORDS(17); | ||
| 33 | u32 context_save_area{}; | ||
| 34 | u32 context_switch{}; | ||
| 35 | INSERT_PADDING_WORDS(43); | ||
| 36 | u32 execute{}; | ||
| 37 | INSERT_PADDING_WORDS(63); | ||
| 38 | std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{}; | ||
| 39 | u32 picture_index{}; | ||
| 40 | u32 control_params{}; | ||
| 41 | u32 config_struct_offset{}; | ||
| 42 | u32 filter_struct_offset{}; | ||
| 43 | u32 palette_offset{}; | ||
| 44 | u32 hist_offset{}; | ||
| 45 | u32 context_id{}; | ||
| 46 | u32 fce_ucode_size{}; | ||
| 47 | PlaneOffsets output_surface{}; | ||
| 48 | u32 fce_ucode_offset{}; | ||
| 49 | INSERT_PADDING_WORDS(4); | ||
| 50 | std::array<u32, 8> slot_context_id{}; | ||
| 51 | INSERT_PADDING_WORDS(16); | ||
| 52 | }; | ||
| 53 | static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size"); | ||
| 54 | |||
| 55 | class Vic { | 18 | class Vic { |
| 56 | public: | 19 | public: |
| 57 | enum class Method : u32 { | 20 | enum class Method : u32 { |
| @@ -67,14 +30,11 @@ public: | |||
| 67 | ~Vic(); | 30 | ~Vic(); |
| 68 | 31 | ||
| 69 | /// Write to the device state. | 32 | /// Write to the device state. |
| 70 | void ProcessMethod(Method method, const std::vector<u32>& arguments); | 33 | void ProcessMethod(Method method, u32 argument); |
| 71 | 34 | ||
| 72 | private: | 35 | private: |
| 73 | void Execute(); | 36 | void Execute(); |
| 74 | 37 | ||
| 75 | void VicStateWrite(u32 offset, u32 arguments); | ||
| 76 | VicRegisters vic_state{}; | ||
| 77 | |||
| 78 | enum class VideoPixelFormat : u64_le { | 38 | enum class VideoPixelFormat : u64_le { |
| 79 | RGBA8 = 0x1f, | 39 | RGBA8 = 0x1f, |
| 80 | BGRA8 = 0x20, | 40 | BGRA8 = 0x20, |
| @@ -88,8 +48,6 @@ private: | |||
| 88 | BitField<9, 2, u64_le> chroma_loc_vert; | 48 | BitField<9, 2, u64_le> chroma_loc_vert; |
| 89 | BitField<11, 4, u64_le> block_linear_kind; | 49 | BitField<11, 4, u64_le> block_linear_kind; |
| 90 | BitField<15, 4, u64_le> block_linear_height_log2; | 50 | BitField<15, 4, u64_le> block_linear_height_log2; |
| 91 | BitField<19, 3, u64_le> reserved0; | ||
| 92 | BitField<22, 10, u64_le> reserved1; | ||
| 93 | BitField<32, 14, u64_le> surface_width_minus1; | 51 | BitField<32, 14, u64_le> surface_width_minus1; |
| 94 | BitField<46, 14, u64_le> surface_height_minus1; | 52 | BitField<46, 14, u64_le> surface_height_minus1; |
| 95 | }; | 53 | }; |
| @@ -97,6 +55,13 @@ private: | |||
| 97 | GPU& gpu; | 55 | GPU& gpu; |
| 98 | std::shared_ptr<Tegra::Nvdec> nvdec_processor; | 56 | std::shared_ptr<Tegra::Nvdec> nvdec_processor; |
| 99 | 57 | ||
| 58 | /// Avoid reallocation of the following buffers every frame, as their | ||
| 59 | /// size does not change during a stream | ||
| 60 | using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>; | ||
| 61 | AVMallocPtr converted_frame_buffer; | ||
| 62 | std::vector<u8> luma_buffer; | ||
| 63 | std::vector<u8> chroma_buffer; | ||
| 64 | |||
| 100 | GPUVAddr config_struct_address{}; | 65 | GPUVAddr config_struct_address{}; |
| 101 | GPUVAddr output_surface_luma_address{}; | 66 | GPUVAddr output_surface_luma_address{}; |
| 102 | GPUVAddr output_surface_chroma_u_address{}; | 67 | GPUVAddr output_surface_chroma_u_address{}; |
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 2a9bd4121..51c63af4a 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp | |||
| @@ -30,8 +30,7 @@ MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); | |||
| 30 | 30 | ||
| 31 | GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) | 31 | GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) |
| 32 | : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, | 32 | : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, |
| 33 | dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, | 33 | dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, use_nvdec{use_nvdec_}, |
| 34 | cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_}, | ||
| 35 | maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, | 34 | maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, |
| 36 | fermi_2d{std::make_unique<Engines::Fermi2D>()}, | 35 | fermi_2d{std::make_unique<Engines::Fermi2D>()}, |
| 37 | kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, | 36 | kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, |
| @@ -494,8 +493,7 @@ void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { | |||
| 494 | // TODO(ameerj): RE proper async nvdec operation | 493 | // TODO(ameerj): RE proper async nvdec operation |
| 495 | // gpu_thread.SubmitCommandBuffer(std::move(entries)); | 494 | // gpu_thread.SubmitCommandBuffer(std::move(entries)); |
| 496 | 495 | ||
| 497 | cdma_pusher->Push(std::move(entries)); | 496 | cdma_pusher->ProcessEntries(std::move(entries)); |
| 498 | cdma_pusher->DispatchCalls(); | ||
| 499 | } | 497 | } |
| 500 | 498 | ||
| 501 | void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { | 499 | void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { |
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 50319f1d5..eb0e43c0c 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp | |||
| @@ -48,8 +48,7 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, | |||
| 48 | dma_pusher.DispatchCalls(); | 48 | dma_pusher.DispatchCalls(); |
| 49 | } else if (auto* command_list = std::get_if<SubmitChCommandEntries>(&next.data)) { | 49 | } else if (auto* command_list = std::get_if<SubmitChCommandEntries>(&next.data)) { |
| 50 | // NVDEC | 50 | // NVDEC |
| 51 | cdma_pusher.Push(std::move(command_list->entries)); | 51 | cdma_pusher.ProcessEntries(std::move(command_list->entries)); |
| 52 | cdma_pusher.DispatchCalls(); | ||
| 53 | } else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) { | 52 | } else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) { |
| 54 | renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr); | 53 | renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr); |
| 55 | } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) { | 54 | } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) { |