nvdec: Implement VA-API hardware video acceleration (#6713)

* nvdec: VA-API
* Verify formatting
* Forgot a semicolon for Windows
* Clarify comment about AV_PIX_FMT_NV12
* Fix assert log spam from missing negation
* vic: Remove forgotten debug code
* Address lioncash's review
* Mention VA-API is Intel/AMD
* Address v1993's review
* Hopefully fix CMakeLists style this time
* vic: Improve cache locality
* vic: Fix off-by-one error
* codec: Async
* codec: Forgot the GetValue()
* nvdec: Address ameerj's review
* codec: Fallback to CPU without VA-API support
* cmake: Address lat9nq's review
* cmake: Make VA-API optional
* vaapi: Multiple GPU
* Apply suggestions from code review

Co-authored-by: Ameer J <52414509+ameerj@users.noreply.github.com>

* nvdec: Address ameerj's review
* codec: Use anonymous instead of static
* nvdec: Remove enum and fix memory leak
* nvdec: Address ameerj's review
* codec: Remove preparation for threading

Co-authored-by: Ameer J <52414509+ameerj@users.noreply.github.com>
author: yzct12345 2021-08-04 03:43:11 +0000
committer: GitHub 2021-08-03 23:43:11 -0400
commit: 2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02 (patch)
tree: 7cb19f8de5b5b37db87fa331d9b3c951ce372b4b /src
parent: Merge pull request #6805 from lat9nq/fix-user-profiles (diff)
download: yuzu-2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02.tar.gz
yuzu-2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02.tar.xz
yuzu-2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02.zip
5 files changed, 175 insertions, 72 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 333f6f35f..1eb67c051 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,5 +1,10 @@
 add_subdirectory(host_shaders)
+if(LIBVA_FOUND)
+    set_source_files_properties(command_classes/codecs/codec.cpp
+        PROPERTIES COMPILE_DEFINITIONS LIBVA_FOUND=1)
+endif()
 add_library(video_core STATIC
    buffer_cache/buffer_base.h
    buffer_cache/buffer_cache.cpp
diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp
index 1b4bbc8ac..f798a0053 100644
--- a/src/video_core/command_classes/codecs/codec.cpp
+++ b/src/video_core/command_classes/codecs/codec.cpp
@@ -2,7 +2,6 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
-#include <cstring>
 #include <fstream>
 #include <vector>
 #include "common/assert.h"
@@ -17,10 +16,47 @@ extern "C" {
 }
 namespace Tegra {
+#if defined(LIBVA_FOUND)
+// Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c originally under MIT license
+namespace {
+constexpr std::array<const char*, 2> VAAPI_DRIVERS = {
+    "i915",
+    "amdgpu",
+};
+AVPixelFormat GetHwFormat(AVCodecContext*, const AVPixelFormat* pix_fmts) {
+    for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) {
+        if (*p == AV_PIX_FMT_VAAPI) {
+            return AV_PIX_FMT_VAAPI;
+        }
+    }
+    LOG_INFO(Service_NVDRV, "Could not find compatible GPU AV format, falling back to CPU");
+    return *pix_fmts;
+}
+bool CreateVaapiHwdevice(AVBufferRef** av_hw_device) {
+    AVDictionary* hwdevice_options = nullptr;
+    av_dict_set(&hwdevice_options, "connection_type", "drm", 0);
+    for (const auto& driver : VAAPI_DRIVERS) {
+        av_dict_set(&hwdevice_options, "kernel_driver", driver, 0);
+        const int hwdevice_error = av_hwdevice_ctx_create(av_hw_device, AV_HWDEVICE_TYPE_VAAPI,
+                                                          nullptr, hwdevice_options, 0);
+        if (hwdevice_error >= 0) {
+            LOG_INFO(Service_NVDRV, "Using VA-API with {}", driver);
+            av_dict_free(&hwdevice_options);
+            return true;
+        }
+        LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed {}", hwdevice_error);
+    }
+    LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed for all drivers");
+    av_dict_free(&hwdevice_options);
+    return false;
+}
+} // namespace
+#endif
 void AVFrameDeleter(AVFrame* ptr) {
-    av_frame_unref(ptr);
+    av_frame_free(&ptr);
-    av_free(ptr);
 }
 Codec::Codec(GPU& gpu_, const NvdecCommon::NvdecRegisters& regs)
@@ -32,19 +68,31 @@ Codec::~Codec() {
        return;
    }
    // Free libav memory
-    AVFrame* av_frame{nullptr};
    avcodec_send_packet(av_codec_ctx, nullptr);
-    av_frame = av_frame_alloc();
+    AVFrame* av_frame = av_frame_alloc();
    avcodec_receive_frame(av_codec_ctx, av_frame);
    avcodec_flush_buffers(av_codec_ctx);
+    av_frame_free(&av_frame);
-    av_frame_unref(av_frame);
-    av_free(av_frame);
    avcodec_close(av_codec_ctx);
+    av_buffer_unref(&av_hw_device);
+}
+void Codec::InitializeHwdec() {
+    // Prioritize integrated GPU to mitigate bandwidth bottlenecks
+#if defined(LIBVA_FOUND)
+    if (CreateVaapiHwdevice(&av_hw_device)) {
+        const auto hw_device_ctx = av_buffer_ref(av_hw_device);
+        ASSERT_MSG(hw_device_ctx, "av_buffer_ref failed");
+        av_codec_ctx->hw_device_ctx = hw_device_ctx;
+        av_codec_ctx->get_format = GetHwFormat;
+        return;
+    }
+#endif
+    // TODO more GPU accelerated decoders
 }
 void Codec::Initialize() {
-    AVCodecID codec{AV_CODEC_ID_NONE};
+    AVCodecID codec;
    switch (current_codec) {
    case NvdecCommon::VideoCodec::H264:
        codec = AV_CODEC_ID_H264;
@@ -53,22 +101,24 @@ void Codec::Initialize() {
        codec = AV_CODEC_ID_VP9;
        break;
    default:
+        UNIMPLEMENTED_MSG("Unknown codec {}", current_codec);
        return;
    }
    av_codec = avcodec_find_decoder(codec);
    av_codec_ctx = avcodec_alloc_context3(av_codec);
    av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
+    InitializeHwdec();
-    // TODO(ameerj): libavcodec gpu hw acceleration
+    if (!av_codec_ctx->hw_device_ctx) {
+        LOG_INFO(Service_NVDRV, "Using FFmpeg software decoding");
+    }
    const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);
    if (av_error < 0) {
        LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");
        avcodec_close(av_codec_ctx);
+        av_buffer_unref(&av_hw_device);
        return;
    }
    initialized = true;
-    return;
 }
 void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
@@ -80,36 +130,64 @@ void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
 void Codec::Decode() {
    const bool is_first_frame = !initialized;
-    if (!initialized) {
+    if (is_first_frame) {
        Initialize();
    }
    bool vp9_hidden_frame = false;
-    AVPacket packet{};
-    av_init_packet(&packet);
    std::vector<u8> frame_data;
    if (current_codec == NvdecCommon::VideoCodec::H264) {
        frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame);
    } else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
        frame_data = vp9_decoder->ComposeFrameHeader(state);
        vp9_hidden_frame = vp9_decoder->WasFrameHidden();
    }
+    AVPacket packet{};
+    av_init_packet(&packet);
    packet.data = frame_data.data();
    packet.size = static_cast<s32>(frame_data.size());
+    if (const int ret = avcodec_send_packet(av_codec_ctx, &packet); ret) {
-    avcodec_send_packet(av_codec_ctx, &packet);
+        LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", ret);
+        return;
-    if (!vp9_hidden_frame) {
+    }
-        // Only receive/store visible frames
+    // Only receive/store visible frames
-        AVFramePtr frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter};
+    if (vp9_hidden_frame) {
-        avcodec_receive_frame(av_codec_ctx, frame.get());
+        return;
-        av_frames.push(std::move(frame));
+    }
-        // Limit queue to 10 frames. Workaround for ZLA decode and queue spam
+    AVFrame* hw_frame = av_frame_alloc();
-        if (av_frames.size() > 10) {
+    AVFrame* sw_frame = hw_frame;
-            av_frames.pop();
+    ASSERT_MSG(hw_frame, "av_frame_alloc hw_frame failed");
-        }
+    if (const int ret = avcodec_receive_frame(av_codec_ctx, hw_frame); ret) {
+        LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame error {}", ret);
+        av_frame_free(&hw_frame);
+        return;
+    }
+    if (!hw_frame->width || !hw_frame->height) {
+        LOG_WARNING(Service_NVDRV, "Zero width or height in frame");
+        av_frame_free(&hw_frame);
+        return;
+    }
+#if defined(LIBVA_FOUND)
+    // Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c under MIT license
+    if (hw_frame->format == AV_PIX_FMT_VAAPI) {
+        sw_frame = av_frame_alloc();
+        ASSERT_MSG(sw_frame, "av_frame_alloc sw_frame failed");
+        // Can't use AV_PIX_FMT_YUV420P and share code with software decoding in vic.cpp
+        // because Intel drivers crash unless using AV_PIX_FMT_NV12
+        sw_frame->format = AV_PIX_FMT_NV12;
+        const int transfer_data_ret = av_hwframe_transfer_data(sw_frame, hw_frame, 0);
+        ASSERT_MSG(!transfer_data_ret, "av_hwframe_transfer_data error {}", transfer_data_ret);
+        av_frame_free(&hw_frame);
+    }
+#endif
+    if (sw_frame->format != AV_PIX_FMT_YUV420P && sw_frame->format != AV_PIX_FMT_NV12) {
+        UNIMPLEMENTED_MSG("Unexpected video format from host graphics: {}", sw_frame->format);
+        av_frame_free(&sw_frame);
+        return;
+    }
+    av_frames.push(AVFramePtr{sw_frame, AVFrameDeleter});
+    if (av_frames.size() > 10) {
+        LOG_TRACE(Service_NVDRV, "av_frames.push overflow dropped frame");
+        av_frames.pop();
    }
 }
@@ -119,7 +197,6 @@ AVFramePtr Codec::GetCurrentFrame() {
    if (av_frames.empty()) {
        return AVFramePtr{nullptr, AVFrameDeleter};
    }
    AVFramePtr frame = std::move(av_frames.front());
    av_frames.pop();
    return frame;
@@ -144,6 +221,5 @@ std::string_view Codec::GetCurrentCodecName() const {
    default:
        return "Unknown";
    }
-};
+}
 } // namespace Tegra
diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h
index 96c823c76..71936203f 100644
--- a/src/video_core/command_classes/codecs/codec.h
+++ b/src/video_core/command_classes/codecs/codec.h
@@ -22,7 +22,6 @@ extern "C" {
 namespace Tegra {
 class GPU;
-struct VicRegisters;
 void AVFrameDeleter(AVFrame* ptr);
 using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>;
@@ -55,10 +54,13 @@ public:
    [[nodiscard]] std::string_view GetCurrentCodecName() const;
 private:
+    void InitializeHwdec();
    bool initialized{};
    NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None};
    AVCodec* av_codec{nullptr};
+    AVBufferRef* av_hw_device{nullptr};
    AVCodecContext* av_codec_ctx{nullptr};
    GPU& gpu;
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp
index ffb7c82a1..d5e77941c 100644
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@@ -46,11 +46,8 @@ void Vic::ProcessMethod(Method method, u32 argument) {
    case Method::SetOutputSurfaceLumaOffset:
        output_surface_luma_address = arg;
        break;
-    case Method::SetOutputSurfaceChromaUOffset:
+    case Method::SetOutputSurfaceChromaOffset:
-        output_surface_chroma_u_address = arg;
+        output_surface_chroma_address = arg;
-        break;
-    case Method::SetOutputSurfaceChromaVOffset:
-        output_surface_chroma_v_address = arg;
        break;
    default:
        break;
@@ -65,11 +62,10 @@ void Vic::Execute() {
    const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
    const AVFramePtr frame_ptr = nvdec_processor->GetFrame();
    const auto* frame = frame_ptr.get();
-    if (!frame || frame->width == 0 || frame->height == 0) {
+    if (!frame) {
        return;
    }
-    const VideoPixelFormat pixel_format =
+    const auto pixel_format = static_cast<VideoPixelFormat>(config.pixel_format.Value());
-        static_cast<VideoPixelFormat>(config.pixel_format.Value());
    switch (pixel_format) {
    case VideoPixelFormat::BGRA8:
    case VideoPixelFormat::RGBA8: {
@@ -83,16 +79,18 @@ void Vic::Execute() {
            sws_freeContext(scaler_ctx);
            scaler_ctx = nullptr;
-            // FFmpeg returns all frames in YUV420, convert it into expected format
+            // Frames are decoded into either YUV420 or NV12 formats. Convert to desired format
-            scaler_ctx =
+            scaler_ctx = sws_getContext(frame->width, frame->height,
-                sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width,
+                                        static_cast<AVPixelFormat>(frame->format), frame->width,
-                               frame->height, target_format, 0, nullptr, nullptr, nullptr);
+                                        frame->height, target_format, 0, nullptr, nullptr, nullptr);
            scaler_width = frame->width;
            scaler_height = frame->height;
        }
        // Get Converted frame
-        const std::size_t linear_size = frame->width * frame->height * 4;
+        const u32 width = static_cast<u32>(frame->width);
+        const u32 height = static_cast<u32>(frame->height);
+        const std::size_t linear_size = width * height * 4;
        // Only allocate frame_buffer once per stream, as the size is not expected to change
        if (!converted_frame_buffer) {
@@ -109,11 +107,10 @@ void Vic::Execute() {
        if (blk_kind != 0) {
            // swizzle pitch linear to block linear
            const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
-            const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
+            const auto size =
-                                                            block_height, 0);
+                Tegra::Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
            luma_buffer.resize(size);
-            Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4,
+            Tegra::Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(),
-                                           frame->width, 4, luma_buffer.data(),
                                           converted_frame_buffer.get(), block_height, 0, 0);
            gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
@@ -131,41 +128,65 @@ void Vic::Execute() {
        const std::size_t surface_height = config.surface_height_minus1 + 1;
        const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width));
        const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height));
-        const std::size_t half_width = frame_width / 2;
+        const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL;
-        const std::size_t half_height = frame_height / 2;
-        const std::size_t aligned_width = (surface_width + 0xff) & ~0xff;
-        const auto* luma_ptr = frame->data[0];
-        const auto* chroma_b_ptr = frame->data[1];
-        const auto* chroma_r_ptr = frame->data[2];
        const auto stride = static_cast<size_t>(frame->linesize[0]);
-        const auto half_stride = static_cast<size_t>(frame->linesize[1]);
        luma_buffer.resize(aligned_width * surface_height);
        chroma_buffer.resize(aligned_width * surface_height / 2);
        // Populate luma buffer
+        const u8* luma_src = frame->data[0];
        for (std::size_t y = 0; y < frame_height; ++y) {
            const std::size_t src = y * stride;
            const std::size_t dst = y * aligned_width;
            for (std::size_t x = 0; x < frame_width; ++x) {
-                luma_buffer[dst + x] = luma_ptr[src + x];
+                luma_buffer[dst + x] = luma_src[src + x];
            }
        }
        gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
                                       luma_buffer.size());
-        // Populate chroma buffer from both channels with interleaving.
+        // Chroma
-        for (std::size_t y = 0; y < half_height; ++y) {
+        const std::size_t half_height = frame_height / 2;
-            const std::size_t src = y * half_stride;
+        const auto half_stride = static_cast<size_t>(frame->linesize[1]);
-            const std::size_t dst = y * aligned_width;
-            for (std::size_t x = 0; x < half_width; ++x) {
+        switch (frame->format) {
-                chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
+        case AV_PIX_FMT_YUV420P: {
-                chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];
+            // Frame from FFmpeg software
+            // Populate chroma buffer from both channels with interleaving.
+            const std::size_t half_width = frame_width / 2;
+            const u8* chroma_b_src = frame->data[1];
+            const u8* chroma_r_src = frame->data[2];
+            for (std::size_t y = 0; y < half_height; ++y) {
+                const std::size_t src = y * half_stride;
+                const std::size_t dst = y * aligned_width;
+                for (std::size_t x = 0; x < half_width; ++x) {
+                    chroma_buffer[dst + x * 2] = chroma_b_src[src + x];
+                    chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x];
+                }
+            }
+            break;
+        }
+        case AV_PIX_FMT_NV12: {
+            // Frame from VA-API hardware
+            // This is already interleaved so just copy
+            const u8* chroma_src = frame->data[1];
+            for (std::size_t y = 0; y < half_height; ++y) {
+                const std::size_t src = y * stride;
+                const std::size_t dst = y * aligned_width;
+                for (std::size_t x = 0; x < frame_width; ++x) {
+                    chroma_buffer[dst + x] = chroma_src[src + x];
+                }
            }
+            break;
+        }
+        default:
+            UNREACHABLE();
+            break;
        }
-        gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
+        gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(),
                                       chroma_buffer.size());
        break;
    }
diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h
index f5a2ed100..74246e08c 100644
--- a/src/video_core/command_classes/vic.h
+++ b/src/video_core/command_classes/vic.h
@@ -22,8 +22,8 @@ public:
        SetControlParams = 0x1c1,
        SetConfigStructOffset = 0x1c2,
        SetOutputSurfaceLumaOffset = 0x1c8,
-        SetOutputSurfaceChromaUOffset = 0x1c9,
+        SetOutputSurfaceChromaOffset = 0x1c9,
-        SetOutputSurfaceChromaVOffset = 0x1ca
+        SetOutputSurfaceChromaUnusedOffset = 0x1ca
    };
    explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor);
@@ -64,8 +64,7 @@ private:
    GPUVAddr config_struct_address{};
    GPUVAddr output_surface_luma_address{};
-    GPUVAddr output_surface_chroma_u_address{};
+    GPUVAddr output_surface_chroma_address{};
-    GPUVAddr output_surface_chroma_v_address{};
    SwsContext* scaler_ctx{};
    s32 scaler_width{};
author	yzct12345	2021-08-04 03:43:11 +0000
committer	GitHub	2021-08-03 23:43:11 -0400
commit	2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02 (patch)
tree	7cb19f8de5b5b37db87fa331d9b3c951ce372b4b /src
parent	Merge pull request #6805 from lat9nq/fix-user-profiles (diff)
download	yuzu-2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02.tar.gz yuzu-2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02.tar.xz yuzu-2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02.zip

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 333f6f35f..1eb67c051 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt
@@ -1,5 +1,10 @@
1	add_subdirectory(host_shaders)	1	add_subdirectory(host_shaders)
2		2
		3	if(LIBVA_FOUND)
		4	set_source_files_properties(command_classes/codecs/codec.cpp
		5	PROPERTIES COMPILE_DEFINITIONS LIBVA_FOUND=1)
		6	endif()
		7
3	add_library(video_core STATIC	8	add_library(video_core STATIC
4	buffer_cache/buffer_base.h	9	buffer_cache/buffer_base.h
5	buffer_cache/buffer_cache.cpp	10	buffer_cache/buffer_cache.cpp


diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp index 1b4bbc8ac..f798a0053 100644 --- a/src/video_core/command_classes/codecs/codec.cpp +++ b/src/video_core/command_classes/codecs/codec.cpp
@@ -2,7 +2,6 @@
2	// Licensed under GPLv2 or any later version	2	// Licensed under GPLv2 or any later version
3	// Refer to the license.txt file included.	3	// Refer to the license.txt file included.
4		4
5	#include <cstring>
6	#include <fstream>	5	#include <fstream>
7	#include <vector>	6	#include <vector>
8	#include "common/assert.h"	7	#include "common/assert.h"
@@ -17,10 +16,47 @@ extern "C" {
17	}	16	}
18		17
19	namespace Tegra {	18	namespace Tegra {
		19	#if defined(LIBVA_FOUND)
		20	// Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c originally under MIT license
		21	namespace {
		22	constexpr std::array<const char*, 2> VAAPI_DRIVERS = {
		23	"i915",
		24	"amdgpu",
		25	};
		26
		27	AVPixelFormat GetHwFormat(AVCodecContext*, const AVPixelFormat* pix_fmts) {
		28	for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) {
		29	if (*p == AV_PIX_FMT_VAAPI) {
		30	return AV_PIX_FMT_VAAPI;
		31	}
		32	}
		33	LOG_INFO(Service_NVDRV, "Could not find compatible GPU AV format, falling back to CPU");
		34	return *pix_fmts;
		35	}
		36
		37	bool CreateVaapiHwdevice(AVBufferRef** av_hw_device) {
		38	AVDictionary* hwdevice_options = nullptr;
		39	av_dict_set(&hwdevice_options, "connection_type", "drm", 0);
		40	for (const auto& driver : VAAPI_DRIVERS) {
		41	av_dict_set(&hwdevice_options, "kernel_driver", driver, 0);
		42	const int hwdevice_error = av_hwdevice_ctx_create(av_hw_device, AV_HWDEVICE_TYPE_VAAPI,
		43	nullptr, hwdevice_options, 0);
		44	if (hwdevice_error >= 0) {
		45	LOG_INFO(Service_NVDRV, "Using VA-API with {}", driver);
		46	av_dict_free(&hwdevice_options);
		47	return true;
		48	}
		49	LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed {}", hwdevice_error);
		50	}
		51	LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed for all drivers");
		52	av_dict_free(&hwdevice_options);
		53	return false;
		54	}
		55	} // namespace
		56	#endif
20		57
21	void AVFrameDeleter(AVFrame* ptr) {	58	void AVFrameDeleter(AVFrame* ptr) {
22	av_frame_unref(ptr);	59	av_frame_free(&ptr);
23	av_free(ptr);
24	}	60	}
25		61
26	Codec::Codec(GPU& gpu_, const NvdecCommon::NvdecRegisters& regs)	62	Codec::Codec(GPU& gpu_, const NvdecCommon::NvdecRegisters& regs)
@@ -32,19 +68,31 @@ Codec::~Codec() {
32	return;	68	return;
33	}	69	}
34	// Free libav memory	70	// Free libav memory
35	AVFrame* av_frame{nullptr};
36	avcodec_send_packet(av_codec_ctx, nullptr);	71	avcodec_send_packet(av_codec_ctx, nullptr);
37	av_frame = av_frame_alloc();	72	AVFrame* av_frame = av_frame_alloc();
38	avcodec_receive_frame(av_codec_ctx, av_frame);	73	avcodec_receive_frame(av_codec_ctx, av_frame);
39	avcodec_flush_buffers(av_codec_ctx);	74	avcodec_flush_buffers(av_codec_ctx);
40		75	av_frame_free(&av_frame);
41	av_frame_unref(av_frame);
42	av_free(av_frame);
43	avcodec_close(av_codec_ctx);	76	avcodec_close(av_codec_ctx);
		77	av_buffer_unref(&av_hw_device);
		78	}
		79
		80	void Codec::InitializeHwdec() {
		81	// Prioritize integrated GPU to mitigate bandwidth bottlenecks
		82	#if defined(LIBVA_FOUND)
		83	if (CreateVaapiHwdevice(&av_hw_device)) {
		84	const auto hw_device_ctx = av_buffer_ref(av_hw_device);
		85	ASSERT_MSG(hw_device_ctx, "av_buffer_ref failed");
		86	av_codec_ctx->hw_device_ctx = hw_device_ctx;
		87	av_codec_ctx->get_format = GetHwFormat;
		88	return;
		89	}
		90	#endif
		91	// TODO more GPU accelerated decoders
44	}	92	}
45		93
46	void Codec::Initialize() {	94	void Codec::Initialize() {
47	AVCodecID codec{AV_CODEC_ID_NONE};	95	AVCodecID codec;
48	switch (current_codec) {	96	switch (current_codec) {
49	case NvdecCommon::VideoCodec::H264:	97	case NvdecCommon::VideoCodec::H264:
50	codec = AV_CODEC_ID_H264;	98	codec = AV_CODEC_ID_H264;
@@ -53,22 +101,24 @@ void Codec::Initialize() {
53	codec = AV_CODEC_ID_VP9;	101	codec = AV_CODEC_ID_VP9;
54	break;	102	break;
55	default:	103	default:
		104	UNIMPLEMENTED_MSG("Unknown codec {}", current_codec);
56	return;	105	return;
57	}	106	}
58	av_codec = avcodec_find_decoder(codec);	107	av_codec = avcodec_find_decoder(codec);
59	av_codec_ctx = avcodec_alloc_context3(av_codec);	108	av_codec_ctx = avcodec_alloc_context3(av_codec);
60	av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);	109	av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
61		110	InitializeHwdec();
62	// TODO(ameerj): libavcodec gpu hw acceleration	111	if (!av_codec_ctx->hw_device_ctx) {
63		112	LOG_INFO(Service_NVDRV, "Using FFmpeg software decoding");
		113	}
64	const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);	114	const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);
65	if (av_error < 0) {	115	if (av_error < 0) {
66	LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");	116	LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");
67	avcodec_close(av_codec_ctx);	117	avcodec_close(av_codec_ctx);
		118	av_buffer_unref(&av_hw_device);
68	return;	119	return;
69	}	120	}
70	initialized = true;	121	initialized = true;
71	return;
72	}	122	}
73		123
74	void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {	124	void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
@@ -80,36 +130,64 @@ void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
80		130
81	void Codec::Decode() {	131	void Codec::Decode() {
82	const bool is_first_frame = !initialized;	132	const bool is_first_frame = !initialized;
83	if (!initialized) {	133	if (is_first_frame) {
84	Initialize();	134	Initialize();
85	}	135	}
86
87	bool vp9_hidden_frame = false;	136	bool vp9_hidden_frame = false;
88	AVPacket packet{};
89	av_init_packet(&packet);
90	std::vector<u8> frame_data;	137	std::vector<u8> frame_data;
91
92	if (current_codec == NvdecCommon::VideoCodec::H264) {	138	if (current_codec == NvdecCommon::VideoCodec::H264) {
93	frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame);	139	frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame);
94	} else if (current_codec == NvdecCommon::VideoCodec::Vp9) {	140	} else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
95	frame_data = vp9_decoder->ComposeFrameHeader(state);	141	frame_data = vp9_decoder->ComposeFrameHeader(state);
96	vp9_hidden_frame = vp9_decoder->WasFrameHidden();	142	vp9_hidden_frame = vp9_decoder->WasFrameHidden();
97	}	143	}
98		144	AVPacket packet{};
		145	av_init_packet(&packet);
99	packet.data = frame_data.data();	146	packet.data = frame_data.data();
100	packet.size = static_cast<s32>(frame_data.size());	147	packet.size = static_cast<s32>(frame_data.size());
101		148	if (const int ret = avcodec_send_packet(av_codec_ctx, &packet); ret) {
102	avcodec_send_packet(av_codec_ctx, &packet);	149	LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", ret);
103		150	return;
104	if (!vp9_hidden_frame) {	151	}
105	// Only receive/store visible frames	152	// Only receive/store visible frames
106	AVFramePtr frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter};	153	if (vp9_hidden_frame) {
107	avcodec_receive_frame(av_codec_ctx, frame.get());	154	return;
108	av_frames.push(std::move(frame));	155	}
109	// Limit queue to 10 frames. Workaround for ZLA decode and queue spam	156	AVFrame* hw_frame = av_frame_alloc();
110	if (av_frames.size() > 10) {	157	AVFrame* sw_frame = hw_frame;
111	av_frames.pop();	158	ASSERT_MSG(hw_frame, "av_frame_alloc hw_frame failed");
112	}	159	if (const int ret = avcodec_receive_frame(av_codec_ctx, hw_frame); ret) {
		160	LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame error {}", ret);
		161	av_frame_free(&hw_frame);
		162	return;
		163	}
		164	if (!hw_frame->width || !hw_frame->height) {
		165	LOG_WARNING(Service_NVDRV, "Zero width or height in frame");
		166	av_frame_free(&hw_frame);
		167	return;
		168	}
		169	#if defined(LIBVA_FOUND)
		170	// Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c under MIT license
		171	if (hw_frame->format == AV_PIX_FMT_VAAPI) {
		172	sw_frame = av_frame_alloc();
		173	ASSERT_MSG(sw_frame, "av_frame_alloc sw_frame failed");
		174	// Can't use AV_PIX_FMT_YUV420P and share code with software decoding in vic.cpp
		175	// because Intel drivers crash unless using AV_PIX_FMT_NV12
		176	sw_frame->format = AV_PIX_FMT_NV12;
		177	const int transfer_data_ret = av_hwframe_transfer_data(sw_frame, hw_frame, 0);
		178	ASSERT_MSG(!transfer_data_ret, "av_hwframe_transfer_data error {}", transfer_data_ret);
		179	av_frame_free(&hw_frame);
		180	}
		181	#endif
		182	if (sw_frame->format != AV_PIX_FMT_YUV420P && sw_frame->format != AV_PIX_FMT_NV12) {
		183	UNIMPLEMENTED_MSG("Unexpected video format from host graphics: {}", sw_frame->format);
		184	av_frame_free(&sw_frame);
		185	return;
		186	}
		187	av_frames.push(AVFramePtr{sw_frame, AVFrameDeleter});
		188	if (av_frames.size() > 10) {
		189	LOG_TRACE(Service_NVDRV, "av_frames.push overflow dropped frame");
		190	av_frames.pop();
113	}	191	}
114	}	192	}
115		193
@@ -119,7 +197,6 @@ AVFramePtr Codec::GetCurrentFrame() {
119	if (av_frames.empty()) {	197	if (av_frames.empty()) {
120	return AVFramePtr{nullptr, AVFrameDeleter};	198	return AVFramePtr{nullptr, AVFrameDeleter};
121	}	199	}
122
123	AVFramePtr frame = std::move(av_frames.front());	200	AVFramePtr frame = std::move(av_frames.front());
124	av_frames.pop();	201	av_frames.pop();
125	return frame;	202	return frame;
@@ -144,6 +221,5 @@ std::string_view Codec::GetCurrentCodecName() const {
144	default:	221	default:
145	return "Unknown";	222	return "Unknown";
146	}	223	}
147	};	224	}
148
149	} // namespace Tegra	225	} // namespace Tegra


diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h index 96c823c76..71936203f 100644 --- a/src/video_core/command_classes/codecs/codec.h +++ b/src/video_core/command_classes/codecs/codec.h
@@ -22,7 +22,6 @@ extern "C" {
22		22
23	namespace Tegra {	23	namespace Tegra {
24	class GPU;	24	class GPU;
25	struct VicRegisters;
26		25
27	void AVFrameDeleter(AVFrame* ptr);	26	void AVFrameDeleter(AVFrame* ptr);
28	using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>;	27	using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>;
@@ -55,10 +54,13 @@ public:
55	[[nodiscard]] std::string_view GetCurrentCodecName() const;	54	[[nodiscard]] std::string_view GetCurrentCodecName() const;
56		55
57	private:	56	private:
		57	void InitializeHwdec();
		58
58	bool initialized{};	59	bool initialized{};
59	NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None};	60	NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None};
60		61
61	AVCodec* av_codec{nullptr};	62	AVCodec* av_codec{nullptr};
		63	AVBufferRef* av_hw_device{nullptr};
62	AVCodecContext* av_codec_ctx{nullptr};	64	AVCodecContext* av_codec_ctx{nullptr};
63		65
64	GPU& gpu;	66	GPU& gpu;


diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index ffb7c82a1..d5e77941c 100644 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp
@@ -46,11 +46,8 @@ void Vic::ProcessMethod(Method method, u32 argument) {
46	case Method::SetOutputSurfaceLumaOffset:	46	case Method::SetOutputSurfaceLumaOffset:
47	output_surface_luma_address = arg;	47	output_surface_luma_address = arg;
48	break;	48	break;
49	case Method::SetOutputSurfaceChromaUOffset:	49	case Method::SetOutputSurfaceChromaOffset:
50	output_surface_chroma_u_address = arg;	50	output_surface_chroma_address = arg;
51	break;
52	case Method::SetOutputSurfaceChromaVOffset:
53	output_surface_chroma_v_address = arg;
54	break;	51	break;
55	default:	52	default:
56	break;	53	break;
@@ -65,11 +62,10 @@ void Vic::Execute() {
65	const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};	62	const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
66	const AVFramePtr frame_ptr = nvdec_processor->GetFrame();	63	const AVFramePtr frame_ptr = nvdec_processor->GetFrame();
67	const auto* frame = frame_ptr.get();	64	const auto* frame = frame_ptr.get();
68	if (!frame \|\| frame->width == 0 \|\| frame->height == 0) {	65	if (!frame) {
69	return;	66	return;
70	}	67	}
71	const VideoPixelFormat pixel_format =	68	const auto pixel_format = static_cast<VideoPixelFormat>(config.pixel_format.Value());
72	static_cast<VideoPixelFormat>(config.pixel_format.Value());
73	switch (pixel_format) {	69	switch (pixel_format) {
74	case VideoPixelFormat::BGRA8:	70	case VideoPixelFormat::BGRA8:
75	case VideoPixelFormat::RGBA8: {	71	case VideoPixelFormat::RGBA8: {
@@ -83,16 +79,18 @@ void Vic::Execute() {
83	sws_freeContext(scaler_ctx);	79	sws_freeContext(scaler_ctx);
84	scaler_ctx = nullptr;	80	scaler_ctx = nullptr;
85		81
86	// FFmpeg returns all frames in YUV420, convert it into expected format	82	// Frames are decoded into either YUV420 or NV12 formats. Convert to desired format
87	scaler_ctx =	83	scaler_ctx = sws_getContext(frame->width, frame->height,
88	sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width,	84	static_cast<AVPixelFormat>(frame->format), frame->width,
89	frame->height, target_format, 0, nullptr, nullptr, nullptr);	85	frame->height, target_format, 0, nullptr, nullptr, nullptr);
90		86
91	scaler_width = frame->width;	87	scaler_width = frame->width;
92	scaler_height = frame->height;	88	scaler_height = frame->height;
93	}	89	}
94	// Get Converted frame	90	// Get Converted frame
95	const std::size_t linear_size = frame->width * frame->height * 4;	91	const u32 width = static_cast<u32>(frame->width);
		92	const u32 height = static_cast<u32>(frame->height);
		93	const std::size_t linear_size = width * height * 4;
96		94
97	// Only allocate frame_buffer once per stream, as the size is not expected to change	95	// Only allocate frame_buffer once per stream, as the size is not expected to change
98	if (!converted_frame_buffer) {	96	if (!converted_frame_buffer) {
@@ -109,11 +107,10 @@ void Vic::Execute() {
109	if (blk_kind != 0) {	107	if (blk_kind != 0) {
110	// swizzle pitch linear to block linear	108	// swizzle pitch linear to block linear
111	const u32 block_height = static_cast<u32>(config.block_linear_height_log2);	109	const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
112	const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,	110	const auto size =
113	block_height, 0);	111	Tegra::Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
114	luma_buffer.resize(size);	112	luma_buffer.resize(size);
115	Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4,	113	Tegra::Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(),
116	frame->width, 4, luma_buffer.data(),
117	converted_frame_buffer.get(), block_height, 0, 0);	114	converted_frame_buffer.get(), block_height, 0, 0);
118		115
119	gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);	116	gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
@@ -131,41 +128,65 @@ void Vic::Execute() {
131	const std::size_t surface_height = config.surface_height_minus1 + 1;	128	const std::size_t surface_height = config.surface_height_minus1 + 1;
132	const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width));	129	const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width));
133	const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height));	130	const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height));
134	const std::size_t half_width = frame_width / 2;	131	const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL;
135	const std::size_t half_height = frame_height / 2;
136	const std::size_t aligned_width = (surface_width + 0xff) & ~0xff;
137		132
138	const auto* luma_ptr = frame->data[0];
139	const auto* chroma_b_ptr = frame->data[1];
140	const auto* chroma_r_ptr = frame->data[2];
141	const auto stride = static_cast<size_t>(frame->linesize[0]);	133	const auto stride = static_cast<size_t>(frame->linesize[0]);
142	const auto half_stride = static_cast<size_t>(frame->linesize[1]);
143		134
144	luma_buffer.resize(aligned_width * surface_height);	135	luma_buffer.resize(aligned_width * surface_height);
145	chroma_buffer.resize(aligned_width * surface_height / 2);	136	chroma_buffer.resize(aligned_width * surface_height / 2);
146		137
147	// Populate luma buffer	138	// Populate luma buffer
		139	const u8* luma_src = frame->data[0];
148	for (std::size_t y = 0; y < frame_height; ++y) {	140	for (std::size_t y = 0; y < frame_height; ++y) {
149	const std::size_t src = y * stride;	141	const std::size_t src = y * stride;
150	const std::size_t dst = y * aligned_width;	142	const std::size_t dst = y * aligned_width;
151	for (std::size_t x = 0; x < frame_width; ++x) {	143	for (std::size_t x = 0; x < frame_width; ++x) {
152	luma_buffer[dst + x] = luma_ptr[src + x];	144	luma_buffer[dst + x] = luma_src[src + x];
153	}	145	}
154	}	146	}
155	gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),	147	gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
156	luma_buffer.size());	148	luma_buffer.size());
157		149
158	// Populate chroma buffer from both channels with interleaving.	150	// Chroma
159	for (std::size_t y = 0; y < half_height; ++y) {	151	const std::size_t half_height = frame_height / 2;
160	const std::size_t src = y * half_stride;	152	const auto half_stride = static_cast<size_t>(frame->linesize[1]);
161	const std::size_t dst = y * aligned_width;
162		153
163	for (std::size_t x = 0; x < half_width; ++x) {	154	switch (frame->format) {
164	chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];	155	case AV_PIX_FMT_YUV420P: {
165	chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];	156	// Frame from FFmpeg software
		157	// Populate chroma buffer from both channels with interleaving.
		158	const std::size_t half_width = frame_width / 2;
		159	const u8* chroma_b_src = frame->data[1];
		160	const u8* chroma_r_src = frame->data[2];
		161	for (std::size_t y = 0; y < half_height; ++y) {
		162	const std::size_t src = y * half_stride;
		163	const std::size_t dst = y * aligned_width;
		164
		165	for (std::size_t x = 0; x < half_width; ++x) {
		166	chroma_buffer[dst + x * 2] = chroma_b_src[src + x];
		167	chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x];
		168	}
		169	}
		170	break;
		171	}
		172	case AV_PIX_FMT_NV12: {
		173	// Frame from VA-API hardware
		174	// This is already interleaved so just copy
		175	const u8* chroma_src = frame->data[1];
		176	for (std::size_t y = 0; y < half_height; ++y) {
		177	const std::size_t src = y * stride;
		178	const std::size_t dst = y * aligned_width;
		179	for (std::size_t x = 0; x < frame_width; ++x) {
		180	chroma_buffer[dst + x] = chroma_src[src + x];
		181	}
166	}	182	}
		183	break;
		184	}
		185	default:
		186	UNREACHABLE();
		187	break;
167	}	188	}
168	gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),	189	gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(),
169	chroma_buffer.size());	190	chroma_buffer.size());
170	break;	191	break;
171	}	192	}


diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h index f5a2ed100..74246e08c 100644 --- a/src/video_core/command_classes/vic.h +++ b/src/video_core/command_classes/vic.h
@@ -22,8 +22,8 @@ public:
22	SetControlParams = 0x1c1,	22	SetControlParams = 0x1c1,
23	SetConfigStructOffset = 0x1c2,	23	SetConfigStructOffset = 0x1c2,
24	SetOutputSurfaceLumaOffset = 0x1c8,	24	SetOutputSurfaceLumaOffset = 0x1c8,
25	SetOutputSurfaceChromaUOffset = 0x1c9,	25	SetOutputSurfaceChromaOffset = 0x1c9,
26	SetOutputSurfaceChromaVOffset = 0x1ca	26	SetOutputSurfaceChromaUnusedOffset = 0x1ca
27	};	27	};
28		28
29	explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor);	29	explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor);
@@ -64,8 +64,7 @@ private:
64		64
65	GPUVAddr config_struct_address{};	65	GPUVAddr config_struct_address{};
66	GPUVAddr output_surface_luma_address{};	66	GPUVAddr output_surface_luma_address{};
67	GPUVAddr output_surface_chroma_u_address{};	67	GPUVAddr output_surface_chroma_address{};
68	GPUVAddr output_surface_chroma_v_address{};
69		68
70	SwsContext* scaler_ctx{};	69	SwsContext* scaler_ctx{};
71	s32 scaler_width{};	70	s32 scaler_width{};