diff options
| author | 2021-08-04 03:43:11 +0000 | |
|---|---|---|
| committer | 2021-08-03 23:43:11 -0400 | |
| commit | 2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02 (patch) | |
| tree | 7cb19f8de5b5b37db87fa331d9b3c951ce372b4b /src | |
| parent | Merge pull request #6805 from lat9nq/fix-user-profiles (diff) | |
| download | yuzu-2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02.tar.gz yuzu-2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02.tar.xz yuzu-2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02.zip | |
nvdec: Implement VA-API hardware video acceleration (#6713)
* nvdec: VA-API
* Verify formatting
* Forgot a semicolon for Windows
* Clarify comment about AV_PIX_FMT_NV12
* Fix assert log spam from missing negation
* vic: Remove forgotten debug code
* Address lioncash's review
* Mention VA-API is Intel/AMD
* Address v1993's review
* Hopefully fix CMakeLists style this time
* vic: Improve cache locality
* vic: Fix off-by-one error
* codec: Async
* codec: Forgot the GetValue()
* nvdec: Address ameerj's review
* codec: Fallback to CPU without VA-API support
* cmake: Address lat9nq's review
* cmake: Make VA-API optional
* vaapi: Multiple GPU
* Apply suggestions from code review
Co-authored-by: Ameer J <52414509+ameerj@users.noreply.github.com>
* nvdec: Address ameerj's review
* codec: Use anonymous instead of static
* nvdec: Remove enum and fix memory leak
* nvdec: Address ameerj's review
* codec: Remove preparation for threading
Co-authored-by: Ameer J <52414509+ameerj@users.noreply.github.com>
Diffstat (limited to '')
| -rw-r--r-- | src/video_core/CMakeLists.txt | 5 | ||||
| -rw-r--r-- | src/video_core/command_classes/codecs/codec.cpp | 144 | ||||
| -rw-r--r-- | src/video_core/command_classes/codecs/codec.h | 4 | ||||
| -rw-r--r-- | src/video_core/command_classes/vic.cpp | 87 | ||||
| -rw-r--r-- | src/video_core/command_classes/vic.h | 7 |
5 files changed, 175 insertions, 72 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 333f6f35f..1eb67c051 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt | |||
| @@ -1,5 +1,10 @@ | |||
| 1 | add_subdirectory(host_shaders) | 1 | add_subdirectory(host_shaders) |
| 2 | 2 | ||
| 3 | if(LIBVA_FOUND) | ||
| 4 | set_source_files_properties(command_classes/codecs/codec.cpp | ||
| 5 | PROPERTIES COMPILE_DEFINITIONS LIBVA_FOUND=1) | ||
| 6 | endif() | ||
| 7 | |||
| 3 | add_library(video_core STATIC | 8 | add_library(video_core STATIC |
| 4 | buffer_cache/buffer_base.h | 9 | buffer_cache/buffer_base.h |
| 5 | buffer_cache/buffer_cache.cpp | 10 | buffer_cache/buffer_cache.cpp |
diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp index 1b4bbc8ac..f798a0053 100644 --- a/src/video_core/command_classes/codecs/codec.cpp +++ b/src/video_core/command_classes/codecs/codec.cpp | |||
| @@ -2,7 +2,6 @@ | |||
| 2 | // Licensed under GPLv2 or any later version | 2 | // Licensed under GPLv2 or any later version |
| 3 | // Refer to the license.txt file included. | 3 | // Refer to the license.txt file included. |
| 4 | 4 | ||
| 5 | #include <cstring> | ||
| 6 | #include <fstream> | 5 | #include <fstream> |
| 7 | #include <vector> | 6 | #include <vector> |
| 8 | #include "common/assert.h" | 7 | #include "common/assert.h" |
| @@ -17,10 +16,47 @@ extern "C" { | |||
| 17 | } | 16 | } |
| 18 | 17 | ||
| 19 | namespace Tegra { | 18 | namespace Tegra { |
| 19 | #if defined(LIBVA_FOUND) | ||
| 20 | // Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c originally under MIT license | ||
| 21 | namespace { | ||
| 22 | constexpr std::array<const char*, 2> VAAPI_DRIVERS = { | ||
| 23 | "i915", | ||
| 24 | "amdgpu", | ||
| 25 | }; | ||
| 26 | |||
| 27 | AVPixelFormat GetHwFormat(AVCodecContext*, const AVPixelFormat* pix_fmts) { | ||
| 28 | for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) { | ||
| 29 | if (*p == AV_PIX_FMT_VAAPI) { | ||
| 30 | return AV_PIX_FMT_VAAPI; | ||
| 31 | } | ||
| 32 | } | ||
| 33 | LOG_INFO(Service_NVDRV, "Could not find compatible GPU AV format, falling back to CPU"); | ||
| 34 | return *pix_fmts; | ||
| 35 | } | ||
| 36 | |||
| 37 | bool CreateVaapiHwdevice(AVBufferRef** av_hw_device) { | ||
| 38 | AVDictionary* hwdevice_options = nullptr; | ||
| 39 | av_dict_set(&hwdevice_options, "connection_type", "drm", 0); | ||
| 40 | for (const auto& driver : VAAPI_DRIVERS) { | ||
| 41 | av_dict_set(&hwdevice_options, "kernel_driver", driver, 0); | ||
| 42 | const int hwdevice_error = av_hwdevice_ctx_create(av_hw_device, AV_HWDEVICE_TYPE_VAAPI, | ||
| 43 | nullptr, hwdevice_options, 0); | ||
| 44 | if (hwdevice_error >= 0) { | ||
| 45 | LOG_INFO(Service_NVDRV, "Using VA-API with {}", driver); | ||
| 46 | av_dict_free(&hwdevice_options); | ||
| 47 | return true; | ||
| 48 | } | ||
| 49 | LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed {}", hwdevice_error); | ||
| 50 | } | ||
| 51 | LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed for all drivers"); | ||
| 52 | av_dict_free(&hwdevice_options); | ||
| 53 | return false; | ||
| 54 | } | ||
| 55 | } // namespace | ||
| 56 | #endif | ||
| 20 | 57 | ||
| 21 | void AVFrameDeleter(AVFrame* ptr) { | 58 | void AVFrameDeleter(AVFrame* ptr) { |
| 22 | av_frame_unref(ptr); | 59 | av_frame_free(&ptr); |
| 23 | av_free(ptr); | ||
| 24 | } | 60 | } |
| 25 | 61 | ||
| 26 | Codec::Codec(GPU& gpu_, const NvdecCommon::NvdecRegisters& regs) | 62 | Codec::Codec(GPU& gpu_, const NvdecCommon::NvdecRegisters& regs) |
| @@ -32,19 +68,31 @@ Codec::~Codec() { | |||
| 32 | return; | 68 | return; |
| 33 | } | 69 | } |
| 34 | // Free libav memory | 70 | // Free libav memory |
| 35 | AVFrame* av_frame{nullptr}; | ||
| 36 | avcodec_send_packet(av_codec_ctx, nullptr); | 71 | avcodec_send_packet(av_codec_ctx, nullptr); |
| 37 | av_frame = av_frame_alloc(); | 72 | AVFrame* av_frame = av_frame_alloc(); |
| 38 | avcodec_receive_frame(av_codec_ctx, av_frame); | 73 | avcodec_receive_frame(av_codec_ctx, av_frame); |
| 39 | avcodec_flush_buffers(av_codec_ctx); | 74 | avcodec_flush_buffers(av_codec_ctx); |
| 40 | 75 | av_frame_free(&av_frame); | |
| 41 | av_frame_unref(av_frame); | ||
| 42 | av_free(av_frame); | ||
| 43 | avcodec_close(av_codec_ctx); | 76 | avcodec_close(av_codec_ctx); |
| 77 | av_buffer_unref(&av_hw_device); | ||
| 78 | } | ||
| 79 | |||
| 80 | void Codec::InitializeHwdec() { | ||
| 81 | // Prioritize integrated GPU to mitigate bandwidth bottlenecks | ||
| 82 | #if defined(LIBVA_FOUND) | ||
| 83 | if (CreateVaapiHwdevice(&av_hw_device)) { | ||
| 84 | const auto hw_device_ctx = av_buffer_ref(av_hw_device); | ||
| 85 | ASSERT_MSG(hw_device_ctx, "av_buffer_ref failed"); | ||
| 86 | av_codec_ctx->hw_device_ctx = hw_device_ctx; | ||
| 87 | av_codec_ctx->get_format = GetHwFormat; | ||
| 88 | return; | ||
| 89 | } | ||
| 90 | #endif | ||
| 91 | // TODO more GPU accelerated decoders | ||
| 44 | } | 92 | } |
| 45 | 93 | ||
| 46 | void Codec::Initialize() { | 94 | void Codec::Initialize() { |
| 47 | AVCodecID codec{AV_CODEC_ID_NONE}; | 95 | AVCodecID codec; |
| 48 | switch (current_codec) { | 96 | switch (current_codec) { |
| 49 | case NvdecCommon::VideoCodec::H264: | 97 | case NvdecCommon::VideoCodec::H264: |
| 50 | codec = AV_CODEC_ID_H264; | 98 | codec = AV_CODEC_ID_H264; |
| @@ -53,22 +101,24 @@ void Codec::Initialize() { | |||
| 53 | codec = AV_CODEC_ID_VP9; | 101 | codec = AV_CODEC_ID_VP9; |
| 54 | break; | 102 | break; |
| 55 | default: | 103 | default: |
| 104 | UNIMPLEMENTED_MSG("Unknown codec {}", current_codec); | ||
| 56 | return; | 105 | return; |
| 57 | } | 106 | } |
| 58 | av_codec = avcodec_find_decoder(codec); | 107 | av_codec = avcodec_find_decoder(codec); |
| 59 | av_codec_ctx = avcodec_alloc_context3(av_codec); | 108 | av_codec_ctx = avcodec_alloc_context3(av_codec); |
| 60 | av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); | 109 | av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); |
| 61 | 110 | InitializeHwdec(); | |
| 62 | // TODO(ameerj): libavcodec gpu hw acceleration | 111 | if (!av_codec_ctx->hw_device_ctx) { |
| 63 | 112 | LOG_INFO(Service_NVDRV, "Using FFmpeg software decoding"); | |
| 113 | } | ||
| 64 | const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr); | 114 | const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr); |
| 65 | if (av_error < 0) { | 115 | if (av_error < 0) { |
| 66 | LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed."); | 116 | LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed."); |
| 67 | avcodec_close(av_codec_ctx); | 117 | avcodec_close(av_codec_ctx); |
| 118 | av_buffer_unref(&av_hw_device); | ||
| 68 | return; | 119 | return; |
| 69 | } | 120 | } |
| 70 | initialized = true; | 121 | initialized = true; |
| 71 | return; | ||
| 72 | } | 122 | } |
| 73 | 123 | ||
| 74 | void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { | 124 | void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { |
| @@ -80,36 +130,64 @@ void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { | |||
| 80 | 130 | ||
| 81 | void Codec::Decode() { | 131 | void Codec::Decode() { |
| 82 | const bool is_first_frame = !initialized; | 132 | const bool is_first_frame = !initialized; |
| 83 | if (!initialized) { | 133 | if (is_first_frame) { |
| 84 | Initialize(); | 134 | Initialize(); |
| 85 | } | 135 | } |
| 86 | |||
| 87 | bool vp9_hidden_frame = false; | 136 | bool vp9_hidden_frame = false; |
| 88 | AVPacket packet{}; | ||
| 89 | av_init_packet(&packet); | ||
| 90 | std::vector<u8> frame_data; | 137 | std::vector<u8> frame_data; |
| 91 | |||
| 92 | if (current_codec == NvdecCommon::VideoCodec::H264) { | 138 | if (current_codec == NvdecCommon::VideoCodec::H264) { |
| 93 | frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame); | 139 | frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame); |
| 94 | } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { | 140 | } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { |
| 95 | frame_data = vp9_decoder->ComposeFrameHeader(state); | 141 | frame_data = vp9_decoder->ComposeFrameHeader(state); |
| 96 | vp9_hidden_frame = vp9_decoder->WasFrameHidden(); | 142 | vp9_hidden_frame = vp9_decoder->WasFrameHidden(); |
| 97 | } | 143 | } |
| 98 | 144 | AVPacket packet{}; | |
| 145 | av_init_packet(&packet); | ||
| 99 | packet.data = frame_data.data(); | 146 | packet.data = frame_data.data(); |
| 100 | packet.size = static_cast<s32>(frame_data.size()); | 147 | packet.size = static_cast<s32>(frame_data.size()); |
| 101 | 148 | if (const int ret = avcodec_send_packet(av_codec_ctx, &packet); ret) { | |
| 102 | avcodec_send_packet(av_codec_ctx, &packet); | 149 | LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", ret); |
| 103 | 150 | return; | |
| 104 | if (!vp9_hidden_frame) { | 151 | } |
| 105 | // Only receive/store visible frames | 152 | // Only receive/store visible frames |
| 106 | AVFramePtr frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter}; | 153 | if (vp9_hidden_frame) { |
| 107 | avcodec_receive_frame(av_codec_ctx, frame.get()); | 154 | return; |
| 108 | av_frames.push(std::move(frame)); | 155 | } |
| 109 | // Limit queue to 10 frames. Workaround for ZLA decode and queue spam | 156 | AVFrame* hw_frame = av_frame_alloc(); |
| 110 | if (av_frames.size() > 10) { | 157 | AVFrame* sw_frame = hw_frame; |
| 111 | av_frames.pop(); | 158 | ASSERT_MSG(hw_frame, "av_frame_alloc hw_frame failed"); |
| 112 | } | 159 | if (const int ret = avcodec_receive_frame(av_codec_ctx, hw_frame); ret) { |
| 160 | LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame error {}", ret); | ||
| 161 | av_frame_free(&hw_frame); | ||
| 162 | return; | ||
| 163 | } | ||
| 164 | if (!hw_frame->width || !hw_frame->height) { | ||
| 165 | LOG_WARNING(Service_NVDRV, "Zero width or height in frame"); | ||
| 166 | av_frame_free(&hw_frame); | ||
| 167 | return; | ||
| 168 | } | ||
| 169 | #if defined(LIBVA_FOUND) | ||
| 170 | // Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c under MIT license | ||
| 171 | if (hw_frame->format == AV_PIX_FMT_VAAPI) { | ||
| 172 | sw_frame = av_frame_alloc(); | ||
| 173 | ASSERT_MSG(sw_frame, "av_frame_alloc sw_frame failed"); | ||
| 174 | // Can't use AV_PIX_FMT_YUV420P and share code with software decoding in vic.cpp | ||
| 175 | // because Intel drivers crash unless using AV_PIX_FMT_NV12 | ||
| 176 | sw_frame->format = AV_PIX_FMT_NV12; | ||
| 177 | const int transfer_data_ret = av_hwframe_transfer_data(sw_frame, hw_frame, 0); | ||
| 178 | ASSERT_MSG(!transfer_data_ret, "av_hwframe_transfer_data error {}", transfer_data_ret); | ||
| 179 | av_frame_free(&hw_frame); | ||
| 180 | } | ||
| 181 | #endif | ||
| 182 | if (sw_frame->format != AV_PIX_FMT_YUV420P && sw_frame->format != AV_PIX_FMT_NV12) { | ||
| 183 | UNIMPLEMENTED_MSG("Unexpected video format from host graphics: {}", sw_frame->format); | ||
| 184 | av_frame_free(&sw_frame); | ||
| 185 | return; | ||
| 186 | } | ||
| 187 | av_frames.push(AVFramePtr{sw_frame, AVFrameDeleter}); | ||
| 188 | if (av_frames.size() > 10) { | ||
| 189 | LOG_TRACE(Service_NVDRV, "av_frames.push overflow dropped frame"); | ||
| 190 | av_frames.pop(); | ||
| 113 | } | 191 | } |
| 114 | } | 192 | } |
| 115 | 193 | ||
| @@ -119,7 +197,6 @@ AVFramePtr Codec::GetCurrentFrame() { | |||
| 119 | if (av_frames.empty()) { | 197 | if (av_frames.empty()) { |
| 120 | return AVFramePtr{nullptr, AVFrameDeleter}; | 198 | return AVFramePtr{nullptr, AVFrameDeleter}; |
| 121 | } | 199 | } |
| 122 | |||
| 123 | AVFramePtr frame = std::move(av_frames.front()); | 200 | AVFramePtr frame = std::move(av_frames.front()); |
| 124 | av_frames.pop(); | 201 | av_frames.pop(); |
| 125 | return frame; | 202 | return frame; |
| @@ -144,6 +221,5 @@ std::string_view Codec::GetCurrentCodecName() const { | |||
| 144 | default: | 221 | default: |
| 145 | return "Unknown"; | 222 | return "Unknown"; |
| 146 | } | 223 | } |
| 147 | }; | 224 | } |
| 148 | |||
| 149 | } // namespace Tegra | 225 | } // namespace Tegra |
diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h index 96c823c76..71936203f 100644 --- a/src/video_core/command_classes/codecs/codec.h +++ b/src/video_core/command_classes/codecs/codec.h | |||
| @@ -22,7 +22,6 @@ extern "C" { | |||
| 22 | 22 | ||
| 23 | namespace Tegra { | 23 | namespace Tegra { |
| 24 | class GPU; | 24 | class GPU; |
| 25 | struct VicRegisters; | ||
| 26 | 25 | ||
| 27 | void AVFrameDeleter(AVFrame* ptr); | 26 | void AVFrameDeleter(AVFrame* ptr); |
| 28 | using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>; | 27 | using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>; |
| @@ -55,10 +54,13 @@ public: | |||
| 55 | [[nodiscard]] std::string_view GetCurrentCodecName() const; | 54 | [[nodiscard]] std::string_view GetCurrentCodecName() const; |
| 56 | 55 | ||
| 57 | private: | 56 | private: |
| 57 | void InitializeHwdec(); | ||
| 58 | |||
| 58 | bool initialized{}; | 59 | bool initialized{}; |
| 59 | NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None}; | 60 | NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None}; |
| 60 | 61 | ||
| 61 | AVCodec* av_codec{nullptr}; | 62 | AVCodec* av_codec{nullptr}; |
| 63 | AVBufferRef* av_hw_device{nullptr}; | ||
| 62 | AVCodecContext* av_codec_ctx{nullptr}; | 64 | AVCodecContext* av_codec_ctx{nullptr}; |
| 63 | 65 | ||
| 64 | GPU& gpu; | 66 | GPU& gpu; |
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index ffb7c82a1..d5e77941c 100644 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp | |||
| @@ -46,11 +46,8 @@ void Vic::ProcessMethod(Method method, u32 argument) { | |||
| 46 | case Method::SetOutputSurfaceLumaOffset: | 46 | case Method::SetOutputSurfaceLumaOffset: |
| 47 | output_surface_luma_address = arg; | 47 | output_surface_luma_address = arg; |
| 48 | break; | 48 | break; |
| 49 | case Method::SetOutputSurfaceChromaUOffset: | 49 | case Method::SetOutputSurfaceChromaOffset: |
| 50 | output_surface_chroma_u_address = arg; | 50 | output_surface_chroma_address = arg; |
| 51 | break; | ||
| 52 | case Method::SetOutputSurfaceChromaVOffset: | ||
| 53 | output_surface_chroma_v_address = arg; | ||
| 54 | break; | 51 | break; |
| 55 | default: | 52 | default: |
| 56 | break; | 53 | break; |
| @@ -65,11 +62,10 @@ void Vic::Execute() { | |||
| 65 | const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)}; | 62 | const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)}; |
| 66 | const AVFramePtr frame_ptr = nvdec_processor->GetFrame(); | 63 | const AVFramePtr frame_ptr = nvdec_processor->GetFrame(); |
| 67 | const auto* frame = frame_ptr.get(); | 64 | const auto* frame = frame_ptr.get(); |
| 68 | if (!frame || frame->width == 0 || frame->height == 0) { | 65 | if (!frame) { |
| 69 | return; | 66 | return; |
| 70 | } | 67 | } |
| 71 | const VideoPixelFormat pixel_format = | 68 | const auto pixel_format = static_cast<VideoPixelFormat>(config.pixel_format.Value()); |
| 72 | static_cast<VideoPixelFormat>(config.pixel_format.Value()); | ||
| 73 | switch (pixel_format) { | 69 | switch (pixel_format) { |
| 74 | case VideoPixelFormat::BGRA8: | 70 | case VideoPixelFormat::BGRA8: |
| 75 | case VideoPixelFormat::RGBA8: { | 71 | case VideoPixelFormat::RGBA8: { |
| @@ -83,16 +79,18 @@ void Vic::Execute() { | |||
| 83 | sws_freeContext(scaler_ctx); | 79 | sws_freeContext(scaler_ctx); |
| 84 | scaler_ctx = nullptr; | 80 | scaler_ctx = nullptr; |
| 85 | 81 | ||
| 86 | // FFmpeg returns all frames in YUV420, convert it into expected format | 82 | // Frames are decoded into either YUV420 or NV12 formats. Convert to desired format |
| 87 | scaler_ctx = | 83 | scaler_ctx = sws_getContext(frame->width, frame->height, |
| 88 | sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width, | 84 | static_cast<AVPixelFormat>(frame->format), frame->width, |
| 89 | frame->height, target_format, 0, nullptr, nullptr, nullptr); | 85 | frame->height, target_format, 0, nullptr, nullptr, nullptr); |
| 90 | 86 | ||
| 91 | scaler_width = frame->width; | 87 | scaler_width = frame->width; |
| 92 | scaler_height = frame->height; | 88 | scaler_height = frame->height; |
| 93 | } | 89 | } |
| 94 | // Get Converted frame | 90 | // Get Converted frame |
| 95 | const std::size_t linear_size = frame->width * frame->height * 4; | 91 | const u32 width = static_cast<u32>(frame->width); |
| 92 | const u32 height = static_cast<u32>(frame->height); | ||
| 93 | const std::size_t linear_size = width * height * 4; | ||
| 96 | 94 | ||
| 97 | // Only allocate frame_buffer once per stream, as the size is not expected to change | 95 | // Only allocate frame_buffer once per stream, as the size is not expected to change |
| 98 | if (!converted_frame_buffer) { | 96 | if (!converted_frame_buffer) { |
| @@ -109,11 +107,10 @@ void Vic::Execute() { | |||
| 109 | if (blk_kind != 0) { | 107 | if (blk_kind != 0) { |
| 110 | // swizzle pitch linear to block linear | 108 | // swizzle pitch linear to block linear |
| 111 | const u32 block_height = static_cast<u32>(config.block_linear_height_log2); | 109 | const u32 block_height = static_cast<u32>(config.block_linear_height_log2); |
| 112 | const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, | 110 | const auto size = |
| 113 | block_height, 0); | 111 | Tegra::Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); |
| 114 | luma_buffer.resize(size); | 112 | luma_buffer.resize(size); |
| 115 | Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4, | 113 | Tegra::Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(), |
| 116 | frame->width, 4, luma_buffer.data(), | ||
| 117 | converted_frame_buffer.get(), block_height, 0, 0); | 114 | converted_frame_buffer.get(), block_height, 0, 0); |
| 118 | 115 | ||
| 119 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); | 116 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); |
| @@ -131,41 +128,65 @@ void Vic::Execute() { | |||
| 131 | const std::size_t surface_height = config.surface_height_minus1 + 1; | 128 | const std::size_t surface_height = config.surface_height_minus1 + 1; |
| 132 | const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width)); | 129 | const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width)); |
| 133 | const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height)); | 130 | const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height)); |
| 134 | const std::size_t half_width = frame_width / 2; | 131 | const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL; |
| 135 | const std::size_t half_height = frame_height / 2; | ||
| 136 | const std::size_t aligned_width = (surface_width + 0xff) & ~0xff; | ||
| 137 | 132 | ||
| 138 | const auto* luma_ptr = frame->data[0]; | ||
| 139 | const auto* chroma_b_ptr = frame->data[1]; | ||
| 140 | const auto* chroma_r_ptr = frame->data[2]; | ||
| 141 | const auto stride = static_cast<size_t>(frame->linesize[0]); | 133 | const auto stride = static_cast<size_t>(frame->linesize[0]); |
| 142 | const auto half_stride = static_cast<size_t>(frame->linesize[1]); | ||
| 143 | 134 | ||
| 144 | luma_buffer.resize(aligned_width * surface_height); | 135 | luma_buffer.resize(aligned_width * surface_height); |
| 145 | chroma_buffer.resize(aligned_width * surface_height / 2); | 136 | chroma_buffer.resize(aligned_width * surface_height / 2); |
| 146 | 137 | ||
| 147 | // Populate luma buffer | 138 | // Populate luma buffer |
| 139 | const u8* luma_src = frame->data[0]; | ||
| 148 | for (std::size_t y = 0; y < frame_height; ++y) { | 140 | for (std::size_t y = 0; y < frame_height; ++y) { |
| 149 | const std::size_t src = y * stride; | 141 | const std::size_t src = y * stride; |
| 150 | const std::size_t dst = y * aligned_width; | 142 | const std::size_t dst = y * aligned_width; |
| 151 | for (std::size_t x = 0; x < frame_width; ++x) { | 143 | for (std::size_t x = 0; x < frame_width; ++x) { |
| 152 | luma_buffer[dst + x] = luma_ptr[src + x]; | 144 | luma_buffer[dst + x] = luma_src[src + x]; |
| 153 | } | 145 | } |
| 154 | } | 146 | } |
| 155 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), | 147 | gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), |
| 156 | luma_buffer.size()); | 148 | luma_buffer.size()); |
| 157 | 149 | ||
| 158 | // Populate chroma buffer from both channels with interleaving. | 150 | // Chroma |
| 159 | for (std::size_t y = 0; y < half_height; ++y) { | 151 | const std::size_t half_height = frame_height / 2; |
| 160 | const std::size_t src = y * half_stride; | 152 | const auto half_stride = static_cast<size_t>(frame->linesize[1]); |
| 161 | const std::size_t dst = y * aligned_width; | ||
| 162 | 153 | ||
| 163 | for (std::size_t x = 0; x < half_width; ++x) { | 154 | switch (frame->format) { |
| 164 | chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x]; | 155 | case AV_PIX_FMT_YUV420P: { |
| 165 | chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x]; | 156 | // Frame from FFmpeg software |
| 157 | // Populate chroma buffer from both channels with interleaving. | ||
| 158 | const std::size_t half_width = frame_width / 2; | ||
| 159 | const u8* chroma_b_src = frame->data[1]; | ||
| 160 | const u8* chroma_r_src = frame->data[2]; | ||
| 161 | for (std::size_t y = 0; y < half_height; ++y) { | ||
| 162 | const std::size_t src = y * half_stride; | ||
| 163 | const std::size_t dst = y * aligned_width; | ||
| 164 | |||
| 165 | for (std::size_t x = 0; x < half_width; ++x) { | ||
| 166 | chroma_buffer[dst + x * 2] = chroma_b_src[src + x]; | ||
| 167 | chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x]; | ||
| 168 | } | ||
| 169 | } | ||
| 170 | break; | ||
| 171 | } | ||
| 172 | case AV_PIX_FMT_NV12: { | ||
| 173 | // Frame from VA-API hardware | ||
| 174 | // This is already interleaved so just copy | ||
| 175 | const u8* chroma_src = frame->data[1]; | ||
| 176 | for (std::size_t y = 0; y < half_height; ++y) { | ||
| 177 | const std::size_t src = y * stride; | ||
| 178 | const std::size_t dst = y * aligned_width; | ||
| 179 | for (std::size_t x = 0; x < frame_width; ++x) { | ||
| 180 | chroma_buffer[dst + x] = chroma_src[src + x]; | ||
| 181 | } | ||
| 166 | } | 182 | } |
| 183 | break; | ||
| 184 | } | ||
| 185 | default: | ||
| 186 | UNREACHABLE(); | ||
| 187 | break; | ||
| 167 | } | 188 | } |
| 168 | gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(), | 189 | gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(), |
| 169 | chroma_buffer.size()); | 190 | chroma_buffer.size()); |
| 170 | break; | 191 | break; |
| 171 | } | 192 | } |
diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h index f5a2ed100..74246e08c 100644 --- a/src/video_core/command_classes/vic.h +++ b/src/video_core/command_classes/vic.h | |||
| @@ -22,8 +22,8 @@ public: | |||
| 22 | SetControlParams = 0x1c1, | 22 | SetControlParams = 0x1c1, |
| 23 | SetConfigStructOffset = 0x1c2, | 23 | SetConfigStructOffset = 0x1c2, |
| 24 | SetOutputSurfaceLumaOffset = 0x1c8, | 24 | SetOutputSurfaceLumaOffset = 0x1c8, |
| 25 | SetOutputSurfaceChromaUOffset = 0x1c9, | 25 | SetOutputSurfaceChromaOffset = 0x1c9, |
| 26 | SetOutputSurfaceChromaVOffset = 0x1ca | 26 | SetOutputSurfaceChromaUnusedOffset = 0x1ca |
| 27 | }; | 27 | }; |
| 28 | 28 | ||
| 29 | explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor); | 29 | explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor); |
| @@ -64,8 +64,7 @@ private: | |||
| 64 | 64 | ||
| 65 | GPUVAddr config_struct_address{}; | 65 | GPUVAddr config_struct_address{}; |
| 66 | GPUVAddr output_surface_luma_address{}; | 66 | GPUVAddr output_surface_luma_address{}; |
| 67 | GPUVAddr output_surface_chroma_u_address{}; | 67 | GPUVAddr output_surface_chroma_address{}; |
| 68 | GPUVAddr output_surface_chroma_v_address{}; | ||
| 69 | 68 | ||
| 70 | SwsContext* scaler_ctx{}; | 69 | SwsContext* scaler_ctx{}; |
| 71 | s32 scaler_width{}; | 70 | s32 scaler_width{}; |