diff options
| author | 2021-02-18 15:12:07 -0800 | |
|---|---|---|
| committer | 2021-02-18 15:12:07 -0800 | |
| commit | 9cae3e6e90f840903a0072b916e49f24d0f6cb10 (patch) | |
| tree | 79511308066a4fbc11aa2e9058b0aa65772cc30a /src/video_core/command_classes | |
| parent | Merge pull request #5955 from yuzu-emu/revert-3603-port-5123 (diff) | |
| parent | rebase, fix name shadowing, more const (diff) | |
| download | yuzu-9cae3e6e90f840903a0072b916e49f24d0f6cb10.tar.gz yuzu-9cae3e6e90f840903a0072b916e49f24d0f6cb10.tar.xz yuzu-9cae3e6e90f840903a0072b916e49f24d0f6cb10.zip | |
Merge pull request #4973 from ameerj/nvdec-opt
nvdec: Reuse allocated buffers and general cleanup
Diffstat (limited to 'src/video_core/command_classes')
| -rw-r--r-- | src/video_core/command_classes/codecs/codec.cpp | 7 | ||||
| -rw-r--r-- | src/video_core/command_classes/nvdec.cpp | 8 | ||||
| -rw-r--r-- | src/video_core/command_classes/nvdec.h | 2 | ||||
| -rw-r--r-- | src/video_core/command_classes/vic.cpp | 45 | ||||
| -rw-r--r-- | src/video_core/command_classes/vic.h | 51 |
5 files changed, 38 insertions, 75 deletions
diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp index 39bc923a5..d02dc6260 100644 --- a/src/video_core/command_classes/codecs/codec.cpp +++ b/src/video_core/command_classes/codecs/codec.cpp | |||
| @@ -44,8 +44,10 @@ Codec::~Codec() { | |||
| 44 | } | 44 | } |
| 45 | 45 | ||
| 46 | void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { | 46 | void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { |
| 47 | LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", codec); | 47 | if (current_codec != codec) { |
| 48 | current_codec = codec; | 48 | LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", static_cast<u32>(codec)); |
| 49 | current_codec = codec; | ||
| 50 | } | ||
| 49 | } | 51 | } |
| 50 | 52 | ||
| 51 | void Codec::StateWrite(u32 offset, u64 arguments) { | 53 | void Codec::StateWrite(u32 offset, u64 arguments) { |
| @@ -55,7 +57,6 @@ void Codec::StateWrite(u32 offset, u64 arguments) { | |||
| 55 | 57 | ||
| 56 | void Codec::Decode() { | 58 | void Codec::Decode() { |
| 57 | bool is_first_frame = false; | 59 | bool is_first_frame = false; |
| 58 | |||
| 59 | if (!initialized) { | 60 | if (!initialized) { |
| 60 | if (current_codec == NvdecCommon::VideoCodec::H264) { | 61 | if (current_codec == NvdecCommon::VideoCodec::H264) { |
| 61 | av_codec = avcodec_find_decoder(AV_CODEC_ID_H264); | 62 | av_codec = avcodec_find_decoder(AV_CODEC_ID_H264); |
diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp index 79e1f4e13..e4f919afd 100644 --- a/src/video_core/command_classes/nvdec.cpp +++ b/src/video_core/command_classes/nvdec.cpp | |||
| @@ -12,16 +12,16 @@ Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {} | |||
| 12 | 12 | ||
| 13 | Nvdec::~Nvdec() = default; | 13 | Nvdec::~Nvdec() = default; |
| 14 | 14 | ||
| 15 | void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) { | 15 | void Nvdec::ProcessMethod(Method method, u32 argument) { |
| 16 | if (method == Method::SetVideoCodec) { | 16 | if (method == Method::SetVideoCodec) { |
| 17 | codec->StateWrite(static_cast<u32>(method), arguments[0]); | 17 | codec->StateWrite(static_cast<u32>(method), argument); |
| 18 | } else { | 18 | } else { |
| 19 | codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8); | 19 | codec->StateWrite(static_cast<u32>(method), static_cast<u64>(argument) << 8); |
| 20 | } | 20 | } |
| 21 | 21 | ||
| 22 | switch (method) { | 22 | switch (method) { |
| 23 | case Method::SetVideoCodec: | 23 | case Method::SetVideoCodec: |
| 24 | codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0])); | 24 | codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(argument)); |
| 25 | break; | 25 | break; |
| 26 | case Method::Execute: | 26 | case Method::Execute: |
| 27 | Execute(); | 27 | Execute(); |
diff --git a/src/video_core/command_classes/nvdec.h b/src/video_core/command_classes/nvdec.h index e4877c533..e66be80b8 100644 --- a/src/video_core/command_classes/nvdec.h +++ b/src/video_core/command_classes/nvdec.h | |||
| @@ -23,7 +23,7 @@ public: | |||
| 23 | ~Nvdec(); | 23 | ~Nvdec(); |
| 24 | 24 | ||
| 25 | /// Writes the method into the state, Invoke Execute() if encountered | 25 | /// Writes the method into the state, Invoke Execute() if encountered |
| 26 | void ProcessMethod(Method method, const std::vector<u32>& arguments); | 26 | void ProcessMethod(Method method, u32 argument); |
| 27 | 27 | ||
| 28 | /// Return most recently decoded frame | 28 | /// Return most recently decoded frame |
| 29 | [[nodiscard]] AVFramePtr GetFrame(); | 29 | [[nodiscard]] AVFramePtr GetFrame(); |
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index 2b7569335..0a8b82f2b 100644 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp | |||
| @@ -18,18 +18,14 @@ extern "C" { | |||
| 18 | namespace Tegra { | 18 | namespace Tegra { |
| 19 | 19 | ||
| 20 | Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) | 20 | Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) |
| 21 | : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {} | 21 | : gpu(gpu_), |
| 22 | Vic::~Vic() = default; | 22 | nvdec_processor(std::move(nvdec_processor_)), converted_frame_buffer{nullptr, av_free} {} |
| 23 | 23 | ||
| 24 | void Vic::VicStateWrite(u32 offset, u32 arguments) { | 24 | Vic::~Vic() = default; |
| 25 | u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32); | ||
| 26 | std::memcpy(state_offset, &arguments, sizeof(u32)); | ||
| 27 | } | ||
| 28 | 25 | ||
| 29 | void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) { | 26 | void Vic::ProcessMethod(Method method, u32 argument) { |
| 30 | LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", method); | 27 | LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method)); |
| 31 | VicStateWrite(static_cast<u32>(method), arguments[0]); | 28 | const u64 arg = static_cast<u64>(argument) << 8; |
| 32 | const u64 arg = static_cast<u64>(arguments[0]) << 8; | ||
| 33 | switch (method) { | 29 | switch (method) { |
| 34 | case Method::Execute: | 30 | case Method::Execute: |
| 35 | Execute(); | 31 | Execute(); |
| @@ -53,8 +49,7 @@ void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) { | |||
| 53 | 49 | ||
| 54 | void Vic::Execute() { | 50 | void Vic::Execute() { |
| 55 | if (output_surface_luma_address == 0) { | 51 | if (output_surface_luma_address == 0) { |
| 56 | LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Received 0x{:X}", | 52 | LOG_ERROR(Service_NVDRV, "VIC Luma address not set."); |
| 57 | vic_state.output_surface.luma_offset); | ||
| 58 | return; | 53 | return; |
| 59 | } | 54 | } |
| 60 | const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)}; | 55 | const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)}; |
| @@ -89,8 +84,10 @@ void Vic::Execute() { | |||
| 89 | // Get Converted frame | 84 | // Get Converted frame |
| 90 | const std::size_t linear_size = frame->width * frame->height * 4; | 85 | const std::size_t linear_size = frame->width * frame->height * 4; |
| 91 | 86 | ||
| 92 | using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>; | 87 | // Only allocate frame_buffer once per stream, as the size is not expected to change |
| 93 | AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free}; | 88 | if (!converted_frame_buffer) { |
| 89 | converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(linear_size)), av_free}; | ||
| 90 | } | ||
| 94 | 91 | ||
| 95 | const int converted_stride{frame->width * 4}; | 92 | const int converted_stride{frame->width * 4}; |
| 96 | u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; | 93 | u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; |
| @@ -104,12 +101,12 @@ void Vic::Execute() { | |||
| 104 | const u32 block_height = static_cast<u32>(config.block_linear_height_log2); | 101 | const u32 block_height = static_cast<u32>(config.block_linear_height_log2); |
| 105 | const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, | 102 | const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, |
| 106 | block_height, 0); | 103 | block_height, 0); |
| 107 | std::vector<u8> swizzled_data(size); | 104 | luma_buffer.resize(size); |
| 108 | Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4, | 105 | Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4, |
| 109 | frame->width, 4, swizzled_data.data(), | 106 | frame->width, 4, luma_buffer.data(), |
| 110 | converted_frame_buffer.get(), block_height, 0, 0); | 107 | converted_frame_buffer.get(), block_height, 0, 0); |
| 111 | 108 | ||
| 112 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size); | 109 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); |
| 113 | } else { | 110 | } else { |
| 114 | // send pitch linear frame | 111 | // send pitch linear frame |
| 115 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, | 112 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, |
| @@ -132,15 +129,15 @@ void Vic::Execute() { | |||
| 132 | const auto stride = frame->linesize[0]; | 129 | const auto stride = frame->linesize[0]; |
| 133 | const auto half_stride = frame->linesize[1]; | 130 | const auto half_stride = frame->linesize[1]; |
| 134 | 131 | ||
| 135 | std::vector<u8> luma_buffer(aligned_width * surface_height); | 132 | luma_buffer.resize(aligned_width * surface_height); |
| 136 | std::vector<u8> chroma_buffer(aligned_width * half_height); | 133 | chroma_buffer.resize(aligned_width * half_height); |
| 137 | 134 | ||
| 138 | // Populate luma buffer | 135 | // Populate luma buffer |
| 139 | for (std::size_t y = 0; y < surface_height - 1; ++y) { | 136 | for (std::size_t y = 0; y < surface_height - 1; ++y) { |
| 140 | std::size_t src = y * stride; | 137 | const std::size_t src = y * stride; |
| 141 | std::size_t dst = y * aligned_width; | 138 | const std::size_t dst = y * aligned_width; |
| 142 | 139 | ||
| 143 | std::size_t size = surface_width; | 140 | const std::size_t size = surface_width; |
| 144 | 141 | ||
| 145 | for (std::size_t offset = 0; offset < size; ++offset) { | 142 | for (std::size_t offset = 0; offset < size; ++offset) { |
| 146 | luma_buffer[dst + offset] = luma_ptr[src + offset]; | 143 | luma_buffer[dst + offset] = luma_ptr[src + offset]; |
| @@ -151,8 +148,8 @@ void Vic::Execute() { | |||
| 151 | 148 | ||
| 152 | // Populate chroma buffer from both channels with interleaving. | 149 | // Populate chroma buffer from both channels with interleaving. |
| 153 | for (std::size_t y = 0; y < half_height; ++y) { | 150 | for (std::size_t y = 0; y < half_height; ++y) { |
| 154 | std::size_t src = y * half_stride; | 151 | const std::size_t src = y * half_stride; |
| 155 | std::size_t dst = y * aligned_width; | 152 | const std::size_t dst = y * aligned_width; |
| 156 | 153 | ||
| 157 | for (std::size_t x = 0; x < half_width; ++x) { | 154 | for (std::size_t x = 0; x < half_width; ++x) { |
| 158 | chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x]; | 155 | chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x]; |
diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h index 8c4e284a1..f5a2ed100 100644 --- a/src/video_core/command_classes/vic.h +++ b/src/video_core/command_classes/vic.h | |||
| @@ -15,43 +15,6 @@ namespace Tegra { | |||
| 15 | class GPU; | 15 | class GPU; |
| 16 | class Nvdec; | 16 | class Nvdec; |
| 17 | 17 | ||
| 18 | struct PlaneOffsets { | ||
| 19 | u32 luma_offset{}; | ||
| 20 | u32 chroma_u_offset{}; | ||
| 21 | u32 chroma_v_offset{}; | ||
| 22 | }; | ||
| 23 | |||
| 24 | struct VicRegisters { | ||
| 25 | INSERT_PADDING_WORDS(64); | ||
| 26 | u32 nop{}; | ||
| 27 | INSERT_PADDING_WORDS(15); | ||
| 28 | u32 pm_trigger{}; | ||
| 29 | INSERT_PADDING_WORDS(47); | ||
| 30 | u32 set_application_id{}; | ||
| 31 | u32 set_watchdog_timer{}; | ||
| 32 | INSERT_PADDING_WORDS(17); | ||
| 33 | u32 context_save_area{}; | ||
| 34 | u32 context_switch{}; | ||
| 35 | INSERT_PADDING_WORDS(43); | ||
| 36 | u32 execute{}; | ||
| 37 | INSERT_PADDING_WORDS(63); | ||
| 38 | std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{}; | ||
| 39 | u32 picture_index{}; | ||
| 40 | u32 control_params{}; | ||
| 41 | u32 config_struct_offset{}; | ||
| 42 | u32 filter_struct_offset{}; | ||
| 43 | u32 palette_offset{}; | ||
| 44 | u32 hist_offset{}; | ||
| 45 | u32 context_id{}; | ||
| 46 | u32 fce_ucode_size{}; | ||
| 47 | PlaneOffsets output_surface{}; | ||
| 48 | u32 fce_ucode_offset{}; | ||
| 49 | INSERT_PADDING_WORDS(4); | ||
| 50 | std::array<u32, 8> slot_context_id{}; | ||
| 51 | INSERT_PADDING_WORDS(16); | ||
| 52 | }; | ||
| 53 | static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size"); | ||
| 54 | |||
| 55 | class Vic { | 18 | class Vic { |
| 56 | public: | 19 | public: |
| 57 | enum class Method : u32 { | 20 | enum class Method : u32 { |
| @@ -67,14 +30,11 @@ public: | |||
| 67 | ~Vic(); | 30 | ~Vic(); |
| 68 | 31 | ||
| 69 | /// Write to the device state. | 32 | /// Write to the device state. |
| 70 | void ProcessMethod(Method method, const std::vector<u32>& arguments); | 33 | void ProcessMethod(Method method, u32 argument); |
| 71 | 34 | ||
| 72 | private: | 35 | private: |
| 73 | void Execute(); | 36 | void Execute(); |
| 74 | 37 | ||
| 75 | void VicStateWrite(u32 offset, u32 arguments); | ||
| 76 | VicRegisters vic_state{}; | ||
| 77 | |||
| 78 | enum class VideoPixelFormat : u64_le { | 38 | enum class VideoPixelFormat : u64_le { |
| 79 | RGBA8 = 0x1f, | 39 | RGBA8 = 0x1f, |
| 80 | BGRA8 = 0x20, | 40 | BGRA8 = 0x20, |
| @@ -88,8 +48,6 @@ private: | |||
| 88 | BitField<9, 2, u64_le> chroma_loc_vert; | 48 | BitField<9, 2, u64_le> chroma_loc_vert; |
| 89 | BitField<11, 4, u64_le> block_linear_kind; | 49 | BitField<11, 4, u64_le> block_linear_kind; |
| 90 | BitField<15, 4, u64_le> block_linear_height_log2; | 50 | BitField<15, 4, u64_le> block_linear_height_log2; |
| 91 | BitField<19, 3, u64_le> reserved0; | ||
| 92 | BitField<22, 10, u64_le> reserved1; | ||
| 93 | BitField<32, 14, u64_le> surface_width_minus1; | 51 | BitField<32, 14, u64_le> surface_width_minus1; |
| 94 | BitField<46, 14, u64_le> surface_height_minus1; | 52 | BitField<46, 14, u64_le> surface_height_minus1; |
| 95 | }; | 53 | }; |
| @@ -97,6 +55,13 @@ private: | |||
| 97 | GPU& gpu; | 55 | GPU& gpu; |
| 98 | std::shared_ptr<Tegra::Nvdec> nvdec_processor; | 56 | std::shared_ptr<Tegra::Nvdec> nvdec_processor; |
| 99 | 57 | ||
| 58 | /// Avoid reallocation of the following buffers every frame, as their | ||
| 59 | /// size does not change during a stream | ||
| 60 | using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>; | ||
| 61 | AVMallocPtr converted_frame_buffer; | ||
| 62 | std::vector<u8> luma_buffer; | ||
| 63 | std::vector<u8> chroma_buffer; | ||
| 64 | |||
| 100 | GPUVAddr config_struct_address{}; | 65 | GPUVAddr config_struct_address{}; |
| 101 | GPUVAddr output_surface_luma_address{}; | 66 | GPUVAddr output_surface_luma_address{}; |
| 102 | GPUVAddr output_surface_chroma_u_address{}; | 67 | GPUVAddr output_surface_chroma_u_address{}; |