9 files changed, 115 insertions, 57 deletions
diff --git a/src/common/color.h b/src/common/color.h
index 24a445dac..0379040be 100644
--- a/src/common/color.h
+++ b/src/common/color.h
@@ -4,6 +4,8 @@
 #pragma once
+#include <cstring>
 #include "common/common_types.h"
 #include "common/swap.h"
 #include "common/vector_math.h"
@@ -55,7 +57,7 @@ constexpr u8 Convert8To6(u8 value) {
 * @param bytes Pointer to encoded source color
 * @return Result color decoded as Math::Vec4<u8>
 */
-inline const Math::Vec4<u8> DecodeRGBA8(const u8* bytes) {
+inline Math::Vec4<u8> DecodeRGBA8(const u8* bytes) {
    return {bytes[3], bytes[2], bytes[1], bytes[0]};
 }
@@ -64,7 +66,7 @@ inline const Math::Vec4<u8> DecodeRGBA8(const u8* bytes) {
 * @param bytes Pointer to encoded source color
 * @return Result color decoded as Math::Vec4<u8>
 */
-inline const Math::Vec4<u8> DecodeRGB8(const u8* bytes) {
+inline Math::Vec4<u8> DecodeRGB8(const u8* bytes) {
    return {bytes[2], bytes[1], bytes[0], 255};
 }
@@ -73,7 +75,7 @@ inline const Math::Vec4<u8> DecodeRGB8(const u8* bytes) {
 * @param bytes Pointer to encoded source color
 * @return Result color decoded as Math::Vec4<u8>
 */
-inline const Math::Vec4<u8> DecodeRG8(const u8* bytes) {
+inline Math::Vec4<u8> DecodeRG8(const u8* bytes) {
    return {bytes[1], bytes[0], 0, 255};
 }
@@ -82,8 +84,9 @@ inline const Math::Vec4<u8> DecodeRG8(const u8* bytes) {
 * @param bytes Pointer to encoded source color
 * @return Result color decoded as Math::Vec4<u8>
 */
-inline const Math::Vec4<u8> DecodeRGB565(const u8* bytes) {
+inline Math::Vec4<u8> DecodeRGB565(const u8* bytes) {
-    const u16_le pixel = *reinterpret_cast<const u16_le*>(bytes);
+    u16_le pixel;
+    std::memcpy(&pixel, bytes, sizeof(pixel));
    return {Convert5To8((pixel >> 11) & 0x1F), Convert6To8((pixel >> 5) & 0x3F),
            Convert5To8(pixel & 0x1F), 255};
 }
@@ -93,8 +96,9 @@ inline const Math::Vec4<u8> DecodeRGB565(const u8* bytes) {
 * @param bytes Pointer to encoded source color
 * @return Result color decoded as Math::Vec4<u8>
 */
-inline const Math::Vec4<u8> DecodeRGB5A1(const u8* bytes) {
+inline Math::Vec4<u8> DecodeRGB5A1(const u8* bytes) {
-    const u16_le pixel = *reinterpret_cast<const u16_le*>(bytes);
+    u16_le pixel;
+    std::memcpy(&pixel, bytes, sizeof(pixel));
    return {Convert5To8((pixel >> 11) & 0x1F), Convert5To8((pixel >> 6) & 0x1F),
            Convert5To8((pixel >> 1) & 0x1F), Convert1To8(pixel & 0x1)};
 }
@@ -104,8 +108,9 @@ inline const Math::Vec4<u8> DecodeRGB5A1(const u8* bytes) {
 * @param bytes Pointer to encoded source color
 * @return Result color decoded as Math::Vec4<u8>
 */
-inline const Math::Vec4<u8> DecodeRGBA4(const u8* bytes) {
+inline Math::Vec4<u8> DecodeRGBA4(const u8* bytes) {
-    const u16_le pixel = *reinterpret_cast<const u16_le*>(bytes);
+    u16_le pixel;
+    std::memcpy(&pixel, bytes, sizeof(pixel));
    return {Convert4To8((pixel >> 12) & 0xF), Convert4To8((pixel >> 8) & 0xF),
            Convert4To8((pixel >> 4) & 0xF), Convert4To8(pixel & 0xF)};
 }
@@ -116,7 +121,9 @@ inline const Math::Vec4<u8> DecodeRGBA4(const u8* bytes) {
 * @return Depth value as an u32
 */
 inline u32 DecodeD16(const u8* bytes) {
-    return *reinterpret_cast<const u16_le*>(bytes);
+    u16_le data;
+    std::memcpy(&data, bytes, sizeof(data));
+    return data;
 }
 /**
@@ -133,7 +140,7 @@ inline u32 DecodeD24(const u8* bytes) {
 * @param bytes Pointer to encoded source values
 * @return Resulting values stored as a Math::Vec2
 */
-inline const Math::Vec2<u32> DecodeD24S8(const u8* bytes) {
+inline Math::Vec2<u32> DecodeD24S8(const u8* bytes) {
    return {static_cast<u32>((bytes[2] << 16) | (bytes[1] << 8) | bytes[0]), bytes[3]};
 }
@@ -175,8 +182,10 @@ inline void EncodeRG8(const Math::Vec4<u8>& color, u8* bytes) {
 * @param bytes Destination pointer to store encoded color
 */
 inline void EncodeRGB565(const Math::Vec4<u8>& color, u8* bytes) {
-    *reinterpret_cast<u16_le*>(bytes) =
+    const u16_le data =
        (Convert8To5(color.r()) << 11) | (Convert8To6(color.g()) << 5) | Convert8To5(color.b());
+    std::memcpy(bytes, &data, sizeof(data));
 }
 /**
@@ -185,9 +194,10 @@ inline void EncodeRGB565(const Math::Vec4<u8>& color, u8* bytes) {
 * @param bytes Destination pointer to store encoded color
 */
 inline void EncodeRGB5A1(const Math::Vec4<u8>& color, u8* bytes) {
-    *reinterpret_cast<u16_le*>(bytes) = (Convert8To5(color.r()) << 11) |
+    const u16_le data = (Convert8To5(color.r()) << 11) | (Convert8To5(color.g()) << 6) |
-                                        (Convert8To5(color.g()) << 6) |
+                        (Convert8To5(color.b()) << 1) | Convert8To1(color.a());
-                                        (Convert8To5(color.b()) << 1) | Convert8To1(color.a());
+    std::memcpy(bytes, &data, sizeof(data));
 }
 /**
@@ -196,9 +206,10 @@ inline void EncodeRGB5A1(const Math::Vec4<u8>& color, u8* bytes) {
 * @param bytes Destination pointer to store encoded color
 */
 inline void EncodeRGBA4(const Math::Vec4<u8>& color, u8* bytes) {
-    *reinterpret_cast<u16_le*>(bytes) = (Convert8To4(color.r()) << 12) |
+    const u16_le data = (Convert8To4(color.r()) << 12) | (Convert8To4(color.g()) << 8) |
-                                        (Convert8To4(color.g()) << 8) |
+                        (Convert8To4(color.b()) << 4) | Convert8To4(color.a());
-                                        (Convert8To4(color.b()) << 4) | Convert8To4(color.a());
+    std::memcpy(bytes, &data, sizeof(data));
 }
 /**
@@ -207,7 +218,8 @@ inline void EncodeRGBA4(const Math::Vec4<u8>& color, u8* bytes) {
 * @param bytes Pointer where to store the encoded value
 */
 inline void EncodeD16(u32 value, u8* bytes) {
-    *reinterpret_cast<u16_le*>(bytes) = value & 0xFFFF;
+    const u16_le data = static_cast<u16>(value);
+    std::memcpy(bytes, &data, sizeof(data));
 }
 /**
diff --git a/src/common/vector_math.h b/src/common/vector_math.h
index 5c94fcda3..8feb49941 100644
--- a/src/common/vector_math.h
+++ b/src/common/vector_math.h
@@ -78,7 +78,7 @@ public:
    }
    template <typename U = T>
-    constexpr Vec2<std::enable_if_t<std::is_signed<U>::value, U>> operator-() const {
+    constexpr Vec2<std::enable_if_t<std::is_signed_v<U>, U>> operator-() const {
        return {-x, -y};
    }
    constexpr Vec2<decltype(T{} * T{})> operator*(const Vec2& other) const {
@@ -227,7 +227,7 @@ public:
    }
    template <typename U = T>
-    constexpr Vec3<std::enable_if_t<std::is_signed<U>::value, U>> operator-() const {
+    constexpr Vec3<std::enable_if_t<std::is_signed_v<U>, U>> operator-() const {
        return {-x, -y, -z};
    }
@@ -436,7 +436,7 @@ public:
    }
    template <typename U = T>
-    constexpr Vec4<std::enable_if_t<std::is_signed<U>::value, U>> operator-() const {
+    constexpr Vec4<std::enable_if_t<std::is_signed_v<U>, U>> operator-() const {
        return {-x, -y, -z, -w};
    }
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 440505c9d..874eddd78 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -34,6 +34,7 @@ enum class RenderTargetFormat : u32 {
    RG16_FLOAT = 0xDE,
    R11G11B10_FLOAT = 0xE0,
    R32_FLOAT = 0xE5,
+    B5G6R5_UNORM = 0xE8,
    R16_FLOAT = 0xF2,
    R8_UNORM = 0xF3,
 };
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index bf6b5c3a0..546e86532 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -324,6 +324,11 @@ std::pair<Surface, Surface> RasterizerOpenGL::ConfigureFramebuffers(bool using_c
                                                                    bool using_depth_fb) {
    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    if (regs.rt[0].format == Tegra::RenderTargetFormat::NONE) {
+        LOG_ERROR(HW_GPU, "RenderTargetFormat is not configured");
+        using_color_fb = false;
+    }
    // TODO(bunnei): Implement this
    const bool has_stencil = false;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 257aa9571..f6efce818 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -109,6 +109,9 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
    {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
     true},                                                                                 // DXT45
    {GL_COMPRESSED_RED_RGTC1, GL_RED, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, true}, // DXN1
+    {GL_COMPRESSED_RG_RGTC2, GL_RG, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
+     true},                                                                     // DXN2UNORM
+    {GL_COMPRESSED_SIGNED_RG_RGTC2, GL_RG, GL_INT, ComponentType::SNorm, true}, // DXN2SNORM
    {GL_COMPRESSED_RGBA_BPTC_UNORM_ARB, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
     true},                                                                    // BC7U
    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},        // ASTC_2D_4X4
@@ -218,6 +221,7 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr),
        MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::RGBA32UI>,
        MortonCopy<true, PixelFormat::DXT1>,         MortonCopy<true, PixelFormat::DXT23>,
        MortonCopy<true, PixelFormat::DXT45>,        MortonCopy<true, PixelFormat::DXN1>,
+        MortonCopy<true, PixelFormat::DXN2UNORM>,    MortonCopy<true, PixelFormat::DXN2SNORM>,
        MortonCopy<true, PixelFormat::BC7U>,         MortonCopy<true, PixelFormat::ASTC_2D_4X4>,
        MortonCopy<true, PixelFormat::G8R8>,         MortonCopy<true, PixelFormat::BGRA8>,
        MortonCopy<true, PixelFormat::RGBA32F>,      MortonCopy<true, PixelFormat::RG32F>,
@@ -242,7 +246,10 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr),
        MortonCopy<false, PixelFormat::RGBA16F>,
        MortonCopy<false, PixelFormat::R11FG11FB10F>,
        MortonCopy<false, PixelFormat::RGBA32UI>,
-        // TODO(Subv): Swizzling DXT1/DXT23/DXT45/DXN1/BC7U/ASTC_2D_4X4 formats is not supported
+        // TODO(Subv): Swizzling DXT1/DXT23/DXT45/DXN1/DXN2/BC7U/ASTC_2D_4X4 formats is not
+        // supported
+        nullptr,
+        nullptr,
        nullptr,
        nullptr,
        nullptr,
@@ -447,22 +454,24 @@ MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 64
 void CachedSurface::LoadGLBuffer() {
    ASSERT(params.type != SurfaceType::Fill);
-    u8* const texture_src_data = Memory::GetPointer(params.GetCpuAddr());
+    const u8* const texture_src_data = Memory::GetPointer(params.GetCpuAddr());
    ASSERT(texture_src_data);
-    gl_buffer.resize(params.width * params.height * GetGLBytesPerPixel(params.pixel_format));
+    const u32 bytes_per_pixel = GetGLBytesPerPixel(params.pixel_format);
+    const u32 copy_size = params.width * params.height * bytes_per_pixel;
    MICROPROFILE_SCOPE(OpenGL_SurfaceLoad);
-    if (!params.is_tiled) {
+    if (params.is_tiled) {
-        const u32 bytes_per_pixel{params.GetFormatBpp() >> 3};
+        gl_buffer.resize(copy_size);
-        std::memcpy(gl_buffer.data(), texture_src_data,
-                    bytes_per_pixel * params.width * params.height);
-    } else {
        morton_to_gl_fns[static_cast<size_t>(params.pixel_format)](
            params.width, params.block_height, params.height, gl_buffer.data(), params.addr);
+    } else {
+        const u8* const texture_src_data_end = texture_src_data + copy_size;
+        gl_buffer.assign(texture_src_data, texture_src_data_end);
    }
    ConvertFormatAsNeeded_LoadGLBuffer(gl_buffer, params.pixel_format, params.width, params.height);
@@ -757,10 +766,12 @@ void RasterizerCacheOpenGL::FlushRegion(Tegra::GPUVAddr /*addr*/, size_t /*size*
 }
 void RasterizerCacheOpenGL::InvalidateRegion(Tegra::GPUVAddr addr, size_t size) {
-    for (const auto& pair : surface_cache) {
+    for (auto iter = surface_cache.cbegin(); iter != surface_cache.cend();) {
-        const auto& surface{pair.second};
+        const auto& surface{iter->second};
        const auto& params{surface->GetSurfaceParams()};
+        ++iter;
        if (params.IsOverlappingRegion(addr, size)) {
            UnregisterSurface(surface);
        }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 0c6652c7a..26e2ee203 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -35,31 +35,33 @@ struct SurfaceParams {
        DXT23 = 9,
        DXT45 = 10,
        DXN1 = 11, // This is also known as BC4
-        BC7U = 12,
+        DXN2UNORM = 12,
-        ASTC_2D_4X4 = 13,
+        DXN2SNORM = 13,
-        G8R8 = 14,
+        BC7U = 14,
-        BGRA8 = 15,
+        ASTC_2D_4X4 = 15,
-        RGBA32F = 16,
+        G8R8 = 16,
-        RG32F = 17,
+        BGRA8 = 17,
-        R32F = 18,
+        RGBA32F = 18,
-        R16F = 19,
+        RG32F = 19,
-        R16UNORM = 20,
+        R32F = 20,
-        RG16 = 21,
+        R16F = 21,
-        RG16F = 22,
+        R16UNORM = 22,
-        RG16UI = 23,
+        RG16 = 23,
-        RG16I = 24,
+        RG16F = 24,
-        RG16S = 25,
+        RG16UI = 25,
-        RGB32F = 26,
+        RG16I = 26,
-        SRGBA8 = 27,
+        RG16S = 27,
+        RGB32F = 28,
+        SRGBA8 = 29,
        MaxColorFormat,
        // DepthStencil formats
-        Z24S8 = 28,
+        Z24S8 = 30,
-        S8Z24 = 29,
+        S8Z24 = 31,
-        Z32F = 30,
+        Z32F = 32,
-        Z16 = 31,
+        Z16 = 33,
-        Z32FS8 = 32,
+        Z32FS8 = 34,
        MaxDepthStencilFormat,
@@ -109,6 +111,8 @@ struct SurfaceParams {
            4, // DXT23
            4, // DXT45
            4, // DXN1
+            4, // DXN2UNORM
+            4, // DXN2SNORM
            4, // BC7U
            4, // ASTC_2D_4X4
            1, // G8R8
@@ -153,6 +157,8 @@ struct SurfaceParams {
            128, // DXT23
            128, // DXT45
            64,  // DXN1
+            128, // DXN2UNORM
+            128, // DXN2SNORM
            128, // BC7U
            32,  // ASTC_2D_4X4
            16,  // G8R8
@@ -221,6 +227,8 @@ struct SurfaceParams {
            return PixelFormat::RG32F;
        case Tegra::RenderTargetFormat::R11G11B10_FLOAT:
            return PixelFormat::R11FG11FB10F;
+        case Tegra::RenderTargetFormat::B5G6R5_UNORM:
+            return PixelFormat::B5G6R5;
        case Tegra::RenderTargetFormat::RGBA32_UINT:
            return PixelFormat::RGBA32UI;
        case Tegra::RenderTargetFormat::R8_UNORM:
@@ -303,6 +311,16 @@ struct SurfaceParams {
            return PixelFormat::DXT45;
        case Tegra::Texture::TextureFormat::DXN1:
            return PixelFormat::DXN1;
+        case Tegra::Texture::TextureFormat::DXN2:
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::UNORM:
+                return PixelFormat::DXN2UNORM;
+            case Tegra::Texture::ComponentType::SNORM:
+                return PixelFormat::DXN2SNORM;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
        case Tegra::Texture::TextureFormat::BC7U:
            return PixelFormat::BC7U;
        case Tegra::Texture::TextureFormat::ASTC_2D_4X4:
@@ -360,6 +378,9 @@ struct SurfaceParams {
            return Tegra::Texture::TextureFormat::DXT45;
        case PixelFormat::DXN1:
            return Tegra::Texture::TextureFormat::DXN1;
+        case PixelFormat::DXN2UNORM:
+        case PixelFormat::DXN2SNORM:
+            return Tegra::Texture::TextureFormat::DXN2;
        case PixelFormat::BC7U:
            return Tegra::Texture::TextureFormat::BC7U;
        case PixelFormat::ASTC_2D_4X4:
@@ -441,6 +462,7 @@ struct SurfaceParams {
        case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
        case Tegra::RenderTargetFormat::R8_UNORM:
        case Tegra::RenderTargetFormat::RG16_UNORM:
+        case Tegra::RenderTargetFormat::B5G6R5_UNORM:
            return ComponentType::UNorm;
        case Tegra::RenderTargetFormat::RG16_SNORM:
            return ComponentType::SNorm;
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index dd240a4ce..ea7779429 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -657,16 +657,17 @@ private:
     * @param instr Instruction to generate the if condition for.
     * @returns string containing the predicate condition.
     */
-    std::string GetPredicateCondition(u64 index, bool negate) const {
+    std::string GetPredicateCondition(u64 index, bool negate) {
        using Tegra::Shader::Pred;
        std::string variable;
        // Index 7 is used as an 'Always True' condition.
-        if (index == static_cast<u64>(Pred::UnusedIndex))
+        if (index == static_cast<u64>(Pred::UnusedIndex)) {
            variable = "true";
-        else
+        } else {
            variable = 'p' + std::to_string(index) + '_' + suffix;
+            declr_predicates.insert(variable);
+        }
        if (negate) {
            return "!(" + variable + ')';
        }
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 500d4d4b1..43be69dd1 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -31,6 +31,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
            return GL_UNSIGNED_BYTE;
        case Maxwell::VertexAttribute::Size::Size_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
            return GL_UNSIGNED_SHORT;
        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
            return GL_UNSIGNED_INT_2_10_10_10_REV;
@@ -85,6 +86,8 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
 inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
    switch (topology) {
+    case Maxwell::PrimitiveTopology::Points:
+        return GL_POINTS;
    case Maxwell::PrimitiveTopology::Triangles:
        return GL_TRIANGLES;
    case Maxwell::PrimitiveTopology::TriangleStrip:
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 65db84ad3..7ea66584c 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -54,6 +54,7 @@ u32 BytesPerPixel(TextureFormat format) {
        return 8;
    case TextureFormat::DXT23:
    case TextureFormat::DXT45:
+    case TextureFormat::DXN2:
    case TextureFormat::BC7U:
        // In this case a 'pixel' actually refers to a 4x4 tile.
        return 16;
@@ -113,6 +114,7 @@ std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width,
    case TextureFormat::DXT23:
    case TextureFormat::DXT45:
    case TextureFormat::DXN1:
+    case TextureFormat::DXN2:
    case TextureFormat::BC7U:
        // In the DXT and DXN formats, each 4x4 tile is swizzled instead of just individual pixel
        // values.
@@ -179,6 +181,7 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat
    case TextureFormat::DXT23:
    case TextureFormat::DXT45:
    case TextureFormat::DXN1:
+    case TextureFormat::DXN2:
    case TextureFormat::BC7U:
    case TextureFormat::ASTC_2D_4X4:
    case TextureFormat::A8R8G8B8: