8 files changed, 203 insertions, 21 deletions
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
index 7555bbe7d..8d194e175 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
@@ -167,10 +167,11 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& ou
    auto& system_instance = Core::System::GetInstance();
    // Remove this memory region from the rasterizer cache.
-    system_instance.Renderer().Rasterizer().FlushAndInvalidateRegion(params.offset,
-                                                                     itr->second.size);
    auto& gpu = system_instance.GPU();
+    auto cpu_addr = gpu.MemoryManager().GpuToCpuAddress(params.offset);
+    ASSERT(cpu_addr);
+    system_instance.Renderer().Rasterizer().FlushAndInvalidateRegion(*cpu_addr, itr->second.size);
    params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size);
    buffer_mappings.erase(itr->second.offset);
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 81d15c62a..2a6e8bbbb 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -36,9 +36,9 @@ public:
            RenderTargetFormat format;
            BitField<0, 1, u32> linear;
            union {
-                BitField<0, 4, u32> block_depth;
+                BitField<0, 4, u32> block_width;
                BitField<4, 4, u32> block_height;
-                BitField<8, 4, u32> block_width;
+                BitField<8, 4, u32> block_depth;
            };
            u32 depth;
            u32 layer;
@@ -53,10 +53,20 @@ public:
                                             address_low);
            }
+            u32 BlockWidth() const {
+                // block_width holds log2 of the block width; decode it to the actual width.
+                return 1 << block_width;
+            }
            u32 BlockHeight() const {
                // The block height is stored in log2 format.
                return 1 << block_height;
            }
+            u32 BlockDepth() const {
+                // block_depth holds log2 of the block depth; decode it to the actual depth.
+                return 1 << block_depth;
+            }
        };
        static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 20e1884da..c8d1b6478 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -347,6 +347,16 @@ public:
            DecrWrap = 8,
        };
+        enum class MemoryLayout : u32 { // Memory layout selector: 0 = linear, 1 = block linear
+            Linear = 0,
+            BlockLinear = 1,
+        };
+        enum class InvMemoryLayout : u32 { // Inverted encoding: 0 = block linear, 1 = linear
+            BlockLinear = 0,
+            Linear = 1,
+        };
        struct Cull {
            enum class FrontFace : u32 {
                ClockWise = 0x0900,
@@ -432,7 +442,12 @@ public:
            u32 width;
            u32 height;
            Tegra::RenderTargetFormat format;
-            u32 block_dimensions;
+            union {
+                BitField<0, 3, u32> block_width;
+                BitField<4, 3, u32> block_height;
+                BitField<8, 3, u32> block_depth;
+                BitField<12, 1, InvMemoryLayout> type;
+            } memory_layout;
            u32 array_mode;
            u32 layer_stride;
            u32 base_layer;
@@ -562,7 +577,12 @@ public:
                    u32 address_high;
                    u32 address_low;
                    Tegra::DepthFormat format;
-                    u32 block_dimensions;
+                    union {
+                        BitField<0, 4, u32> block_width;
+                        BitField<4, 4, u32> block_height;
+                        BitField<8, 4, u32> block_depth;
+                        BitField<20, 1, InvMemoryLayout> type;
+                    } memory_layout;
                    u32 layer_stride;
                    GPUVAddr Address() const {
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 550ab1148..9a59b65b3 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -214,6 +214,18 @@ enum class IMinMaxExchange : u64 {
    XHi = 3,
 };
+enum class VmadType : u64 { // Selects which part of a VMAD operand is used (non-byte mode)
+    Size16_Low = 0,  // Low 16 bits of the operand (operand & 0xffff)
+    Size16_High = 1, // High 16 bits of the operand (operand >> 16)
+    Size32 = 2,      // Not implemented by the decompiler; it asserts on this value
+    Invalid = 3,     // The decompiler forces the whole instruction result to zero
+};
+enum class VmadShr : u64 { // Optional right shift applied to the VMAD result
+    Shr7 = 1,  // result >> 7
+    Shr15 = 2, // result >> 15
+};
 enum class XmadMode : u64 {
    None = 0,
    CLo = 1,
@@ -452,6 +464,7 @@ union Instruction {
    BitField<48, 16, u64> opcode;
    union {
+        BitField<20, 16, u64> imm20_16;
        BitField<20, 19, u64> imm20_19;
        BitField<20, 32, s64> imm20_32;
        BitField<45, 1, u64> negate_b;
@@ -493,6 +506,10 @@ union Instruction {
            }
        } lop3;
+        u16 GetImm20_16() const { // Returns the raw imm20_16 bitfield value as an unsigned 16-bit
+            return static_cast<u16>(imm20_16);
+        }
        u32 GetImm20_19() const {
            u32 imm{static_cast<u32>(imm20_19)};
            imm <<= 12;
@@ -1017,6 +1034,23 @@ union Instruction {
    } isberd;
    union {
+        BitField<48, 1, u64> signed_a; // Operand A is treated as signed
+        BitField<38, 1, u64> is_byte_chunk_a; // 0: decoder uses byte_height_a, else: type_a
+        BitField<36, 2, VmadType> type_a; // Shares bits 36-37 with byte_height_a
+        BitField<36, 2, u64> byte_height_a; // Index of the byte extracted from operand A
+        BitField<49, 1, u64> signed_b; // Operand B is treated as signed
+        BitField<50, 1, u64> use_register_b; // Operand B is read from gpr20, else from imm20_16
+        BitField<30, 1, u64> is_byte_chunk_b; // 0: decoder uses byte_height_b, else: type_b
+        BitField<28, 2, VmadType> type_b; // Shares bits 28-29 with byte_height_b
+        BitField<28, 2, u64> byte_height_b; // Index of the byte extracted from operand B
+        BitField<51, 2, VmadShr> shr; // Right shift applied to (a * b + c)
+        BitField<55, 1, u64> saturate; // Saturates the result (a * b + c)
+        BitField<47, 1, u64> cc; // Forwarded as the final argument of SetRegisterToInteger
+    } vmad;
+    union {
        BitField<20, 16, u64> imm20_16;
        BitField<36, 1, u64> product_shift_left;
        BitField<37, 1, u64> merge_37;
@@ -1083,6 +1117,7 @@ public:
        IPA,
        OUT_R, // Emit vertex/primitive
        ISBERD,
+        VMAD,
        FFMA_IMM, // Fused Multiply and Add
        FFMA_CR,
        FFMA_RC,
@@ -1320,6 +1355,7 @@ private:
            INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),
            INST("1111101111100---", Id::OUT_R, Type::Trivial, "OUT_R"),
            INST("1110111111010---", Id::ISBERD, Type::Trivial, "ISBERD"),
+            INST("01011111--------", Id::VMAD, Type::Trivial, "VMAD"),
            INST("0011001-1-------", Id::FFMA_IMM, Type::Ffma, "FFMA_IMM"),
            INST("010010011-------", Id::FFMA_CR, Type::Ffma, "FFMA_CR"),
            INST("010100011-------", Id::FFMA_RC, Type::Ffma, "FFMA_RC"),
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 56ff83eff..65a220c41 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -45,7 +45,9 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
    SurfaceParams params{};
    params.addr = TryGetCpuAddr(config.tic.Address());
    params.is_tiled = config.tic.IsTiled();
+    params.block_width = params.is_tiled ? config.tic.BlockWidth() : 0,
    params.block_height = params.is_tiled ? config.tic.BlockHeight() : 0,
+    params.block_depth = params.is_tiled ? config.tic.BlockDepth() : 0,
    params.pixel_format =
        PixelFormatFromTextureFormat(config.tic.format, config.tic.r_type.Value());
    params.component_type = ComponentTypeFromTexture(config.tic.r_type.Value());
@@ -97,8 +99,11 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
    const auto& config{Core::System::GetInstance().GPU().Maxwell3D().regs.rt[index]};
    SurfaceParams params{};
    params.addr = TryGetCpuAddr(config.Address());
-    params.is_tiled = true;
+    params.is_tiled =
-    params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight;
+        config.memory_layout.type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear;
+    params.block_width = 1 << config.memory_layout.block_width;
+    params.block_height = 1 << config.memory_layout.block_height;
+    params.block_depth = 1 << config.memory_layout.block_depth;
    params.pixel_format = PixelFormatFromRenderTargetFormat(config.format);
    params.component_type = ComponentTypeFromRenderTarget(config.format);
    params.type = GetFormatType(params.pixel_format);
@@ -120,13 +125,16 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
    return params;
 }
-/*static*/ SurfaceParams SurfaceParams::CreateForDepthBuffer(u32 zeta_width, u32 zeta_height,
+/*static*/ SurfaceParams SurfaceParams::CreateForDepthBuffer(
-                                                             Tegra::GPUVAddr zeta_address,
+    u32 zeta_width, u32 zeta_height, Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format,
-                                                             Tegra::DepthFormat format) {
+    u32 block_width, u32 block_height, u32 block_depth,
+    Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type) {
    SurfaceParams params{};
    params.addr = TryGetCpuAddr(zeta_address);
-    params.is_tiled = true;
+    params.is_tiled = type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear;
-    params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight;
+    params.block_width = 1 << std::min(block_width, 5U);
+    params.block_height = 1 << std::min(block_height, 5U);
+    params.block_depth = 1 << std::min(block_depth, 5U);
    params.pixel_format = PixelFormatFromDepthFormat(format);
    params.component_type = ComponentTypeFromDepthFormat(format);
    params.type = GetFormatType(params.pixel_format);
@@ -148,7 +156,9 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
    SurfaceParams params{};
    params.addr = TryGetCpuAddr(config.Address());
    params.is_tiled = !config.linear;
-    params.block_height = params.is_tiled ? config.BlockHeight() : 0,
+    params.block_width = params.is_tiled ? std::min(config.BlockWidth(), 32U) : 0,
+    params.block_height = params.is_tiled ? std::min(config.BlockHeight(), 32U) : 0,
+    params.block_depth = params.is_tiled ? std::min(config.BlockDepth(), 32U) : 0,
    params.pixel_format = PixelFormatFromRenderTargetFormat(config.format);
    params.component_type = ComponentTypeFromRenderTarget(config.format);
    params.type = GetFormatType(params.pixel_format);
@@ -818,6 +828,11 @@ void CachedSurface::LoadGLBuffer() {
    if (params.is_tiled) {
        gl_buffer.resize(total_size);
+        ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}",
+                   params.block_width, static_cast<u32>(params.target));
+        ASSERT_MSG(params.block_depth == 1, "Block depth is defined as {} on texture type {}",
+                   params.block_depth, static_cast<u32>(params.target));
        // TODO(bunnei): This only unswizzles and copies a 2D texture - we do not yet know how to do
        // this for 3D textures, etc.
        switch (params.target) {
@@ -989,7 +1004,9 @@ Surface RasterizerCacheOpenGL::GetDepthBufferSurface(bool preserve_contents) {
    }
    SurfaceParams depth_params{SurfaceParams::CreateForDepthBuffer(
-        regs.zeta_width, regs.zeta_height, regs.zeta.Address(), regs.zeta.format)};
+        regs.zeta_width, regs.zeta_height, regs.zeta.Address(), regs.zeta.format,
+        regs.zeta.memory_layout.block_width, regs.zeta.memory_layout.block_height,
+        regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)};
    return GetSurface(depth_params, preserve_contents);
 }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 0b4940b3c..66d98ad4e 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -716,9 +716,10 @@ struct SurfaceParams {
    static SurfaceParams CreateForFramebuffer(std::size_t index);
    /// Creates SurfaceParams for a depth buffer configuration
-    static SurfaceParams CreateForDepthBuffer(u32 zeta_width, u32 zeta_height,
+    static SurfaceParams CreateForDepthBuffer(
-                                              Tegra::GPUVAddr zeta_address,
+        u32 zeta_width, u32 zeta_height, Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format,
-                                              Tegra::DepthFormat format);
+        u32 block_width, u32 block_height, u32 block_depth,
+        Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type);
    /// Creates SurfaceParams for a Fermi2D surface copy
    static SurfaceParams CreateForFermiCopySurface(
@@ -733,7 +734,9 @@ struct SurfaceParams {
    VAddr addr;
    bool is_tiled;
+    u32 block_width;
    u32 block_height;
+    u32 block_depth;
    PixelFormat pixel_format;
    ComponentType component_type;
    SurfaceType type;
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index c82a0dcfa..8dfb49507 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -2953,6 +2953,88 @@ private:
                LOG_WARNING(HW_GPU, "DEPBAR instruction is stubbed");
                break;
            }
+            case OpCode::Id::VMAD: { // Writes (a * b + c), optionally shifted/saturated, to gpr0
+                const bool signed_a = instr.vmad.signed_a == 1;
+                const bool signed_b = instr.vmad.signed_b == 1;
+                const bool result_signed = signed_a || signed_b; // Signed if either operand is
+                boost::optional<std::string> forced_result; // Set when an operand type overrides the result
+                auto Unpack = [&](const std::string& op, bool is_chunk, bool is_signed,
+                                  Tegra::Shader::VmadType type, u64 byte_height) { // Builds the sub-word expression
+                    const std::string value = [&]() {
+                        if (!is_chunk) { // Byte mode: extract byte number byte_height
+                            const auto offset = static_cast<u32>(byte_height * 8);
+                            return "((" + op + " >> " + std::to_string(offset) + ") & 0xff)";
+                        }
+                        const std::string zero = "0";
+                        switch (type) {
+                        case Tegra::Shader::VmadType::Size16_Low:
+                            return '(' + op + " & 0xffff)";
+                        case Tegra::Shader::VmadType::Size16_High:
+                            return '(' + op + " >> 16)";
+                        case Tegra::Shader::VmadType::Size32:
+                            // TODO(Rodrigo): From my hardware tests it becomes a bit "mad" when
+                            // this type is used (1 * 1 + 0 == 0x5b800000). Until a better
+                            // explanation is found: assert.
+                            UNREACHABLE_MSG("Unimplemented");
+                            return zero;
+                        case Tegra::Shader::VmadType::Invalid:
+                            // Note(Rodrigo): This flag is invalid according to nvdisasm. From my
+                            // testing (even though it's invalid) this makes the whole instruction
+                            // assign zero to target register.
+                            forced_result = boost::make_optional(zero);
+                            return zero;
+                        default:
+                            UNREACHABLE();
+                            return zero;
+                        }
+                    }();
+                    if (is_signed) { // Signed operands are wrapped in an int(...) conversion
+                        return "int(" + value + ')';
+                    }
+                    return value;
+                };
+                const std::string op_a = Unpack(regs.GetRegisterAsInteger(instr.gpr8, 0, false),
+                                                instr.vmad.is_byte_chunk_a != 0, signed_a,
+                                                instr.vmad.type_a, instr.vmad.byte_height_a);
+                std::string op_b;
+                if (instr.vmad.use_register_b) {
+                    op_b = Unpack(regs.GetRegisterAsInteger(instr.gpr20, 0, false),
+                                  instr.vmad.is_byte_chunk_b != 0, signed_b, instr.vmad.type_b,
+                                  instr.vmad.byte_height_b);
+                } else { // Operand B is the 16-bit immediate, reinterpreted as s16 when signed
+                    op_b = '(' +
+                           std::to_string(signed_b ? static_cast<s16>(instr.alu.GetImm20_16())
+                                                   : instr.alu.GetImm20_16()) +
+                           ')';
+                }
+                const std::string op_c = regs.GetRegisterAsInteger(instr.gpr39, 0, result_signed);
+                std::string result;
+                if (forced_result) { // An Invalid operand type replaces the computed expression
+                    result = *forced_result;
+                } else {
+                    result = '(' + op_a + " * " + op_b + " + " + op_c + ')';
+                    switch (instr.vmad.shr) { // Any other shr value leaves the result unshifted
+                    case Tegra::Shader::VmadShr::Shr7:
+                        result = '(' + result + " >> 7)";
+                        break;
+                    case Tegra::Shader::VmadShr::Shr15:
+                        result = '(' + result + " >> 15)";
+                        break;
+                    }
+                }
+                regs.SetRegisterToInteger(instr.gpr0, result_signed, 1, result, 1, 1,
+                                          instr.vmad.saturate == 1, 0, Register::Size::Word,
+                                          instr.vmad.cc);
+                break;
+            }
            default: {
                LOG_CRITICAL(HW_GPU, "Unhandled instruction: {}", opcode->GetName());
                UNREACHABLE();
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index 8f31d825a..58d17abcb 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -161,7 +161,9 @@ struct TICEntry {
        BitField<21, 3, TICHeaderVersion> header_version;
    };
    union {
+        BitField<0, 3, u32> block_width;
        BitField<3, 3, u32> block_height;
+        BitField<6, 3, u32> block_depth;
        // High 16 bits of the pitch value
        BitField<0, 16, u32> pitch_high;
@@ -202,13 +204,24 @@ struct TICEntry {
        return depth_minus_1 + 1;
    }
+    u32 BlockWidth() const {
+        ASSERT(IsTiled());
+        // The block width is stored in log2 format.
+        return 1 << block_width;
+    }
    u32 BlockHeight() const {
-        ASSERT(header_version == TICHeaderVersion::BlockLinear ||
+        ASSERT(IsTiled());
-               header_version == TICHeaderVersion::BlockLinearColorKey);
        // The block height is stored in log2 format.
        return 1 << block_height;
    }
+    u32 BlockDepth() const {
+        ASSERT(IsTiled());
+        // The block depth is stored in log2 format.
+        return 1 << block_depth;
+    }
    bool IsTiled() const {
        return header_version == TICHeaderVersion::BlockLinear ||
               header_version == TICHeaderVersion::BlockLinearColorKey;

diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp index 7555bbe7d..8d194e175 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
@@ -167,10 +167,11 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& ou
167	auto& system_instance = Core::System::GetInstance();	167	auto& system_instance = Core::System::GetInstance();
168		168
169	// Remove this memory region from the rasterizer cache.	169	// Remove this memory region from the rasterizer cache.
170	system_instance.Renderer().Rasterizer().FlushAndInvalidateRegion(params.offset,
171	itr->second.size);
172
173	auto& gpu = system_instance.GPU();	170	auto& gpu = system_instance.GPU();
		171	auto cpu_addr = gpu.MemoryManager().GpuToCpuAddress(params.offset);
		172	ASSERT(cpu_addr);
		173	system_instance.Renderer().Rasterizer().FlushAndInvalidateRegion(*cpu_addr, itr->second.size);
		174
174	params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size);	175	params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size);
175		176
176	buffer_mappings.erase(itr->second.offset);	177	buffer_mappings.erase(itr->second.offset);


diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 81d15c62a..2a6e8bbbb 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h
@@ -36,9 +36,9 @@ public:
36	RenderTargetFormat format;	36	RenderTargetFormat format;
37	BitField<0, 1, u32> linear;	37	BitField<0, 1, u32> linear;
38	union {	38	union {
39	BitField<0, 4, u32> block_depth;	39	BitField<0, 4, u32> block_width;
40	BitField<4, 4, u32> block_height;	40	BitField<4, 4, u32> block_height;
41	BitField<8, 4, u32> block_width;	41	BitField<8, 4, u32> block_depth;
42	};	42	};
43	u32 depth;	43	u32 depth;
44	u32 layer;	44	u32 layer;
@@ -53,10 +53,20 @@ public:
53	address_low);	53	address_low);
54	}	54	}
55		55
		56	u32 BlockWidth() const {
		57	// The block width is stored in log2 format.
		58	return 1 << block_width;
		59	}
		60
56	u32 BlockHeight() const {	61	u32 BlockHeight() const {
57	// The block height is stored in log2 format.	62	// The block height is stored in log2 format.
58	return 1 << block_height;	63	return 1 << block_height;
59	}	64	}
		65
		66	u32 BlockDepth() const {
		67	// The block depth is stored in log2 format.
		68	return 1 << block_depth;
		69	}
60	};	70	};
61	static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");	71	static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
62		72


diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 20e1884da..c8d1b6478 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h
@@ -347,6 +347,16 @@ public:
347	DecrWrap = 8,	347	DecrWrap = 8,
348	};	348	};
349		349
		350	enum class MemoryLayout : u32 {
		351	Linear = 0,
		352	BlockLinear = 1,
		353	};
		354
		355	enum class InvMemoryLayout : u32 {
		356	BlockLinear = 0,
		357	Linear = 1,
		358	};
		359
350	struct Cull {	360	struct Cull {
351	enum class FrontFace : u32 {	361	enum class FrontFace : u32 {
352	ClockWise = 0x0900,	362	ClockWise = 0x0900,
@@ -432,7 +442,12 @@ public:
432	u32 width;	442	u32 width;
433	u32 height;	443	u32 height;
434	Tegra::RenderTargetFormat format;	444	Tegra::RenderTargetFormat format;
435	u32 block_dimensions;	445	union {
		446	BitField<0, 3, u32> block_width;
		447	BitField<4, 3, u32> block_height;
		448	BitField<8, 3, u32> block_depth;
		449	BitField<12, 1, InvMemoryLayout> type;
		450	} memory_layout;
436	u32 array_mode;	451	u32 array_mode;
437	u32 layer_stride;	452	u32 layer_stride;
438	u32 base_layer;	453	u32 base_layer;
@@ -562,7 +577,12 @@ public:
562	u32 address_high;	577	u32 address_high;
563	u32 address_low;	578	u32 address_low;
564	Tegra::DepthFormat format;	579	Tegra::DepthFormat format;
565	u32 block_dimensions;	580	union {
		581	BitField<0, 4, u32> block_width;
		582	BitField<4, 4, u32> block_height;
		583	BitField<8, 4, u32> block_depth;
		584	BitField<20, 1, InvMemoryLayout> type;
		585	} memory_layout;
566	u32 layer_stride;	586	u32 layer_stride;
567		587
568	GPUVAddr Address() const {	588	GPUVAddr Address() const {


diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 550ab1148..9a59b65b3 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h
@@ -214,6 +214,18 @@ enum class IMinMaxExchange : u64 {
214	XHi = 3,	214	XHi = 3,
215	};	215	};
216		216
		217	enum class VmadType : u64 {
		218	Size16_Low = 0,
		219	Size16_High = 1,
		220	Size32 = 2,
		221	Invalid = 3,
		222	};
		223
		224	enum class VmadShr : u64 {
		225	Shr7 = 1,
		226	Shr15 = 2,
		227	};
		228
217	enum class XmadMode : u64 {	229	enum class XmadMode : u64 {
218	None = 0,	230	None = 0,
219	CLo = 1,	231	CLo = 1,
@@ -452,6 +464,7 @@ union Instruction {
452	BitField<48, 16, u64> opcode;	464	BitField<48, 16, u64> opcode;
453		465
454	union {	466	union {
		467	BitField<20, 16, u64> imm20_16;
455	BitField<20, 19, u64> imm20_19;	468	BitField<20, 19, u64> imm20_19;
456	BitField<20, 32, s64> imm20_32;	469	BitField<20, 32, s64> imm20_32;
457	BitField<45, 1, u64> negate_b;	470	BitField<45, 1, u64> negate_b;
@@ -493,6 +506,10 @@ union Instruction {
493	}	506	}
494	} lop3;	507	} lop3;
495		508
		509	u16 GetImm20_16() const {
		510	return static_cast<u16>(imm20_16);
		511	}
		512
496	u32 GetImm20_19() const {	513	u32 GetImm20_19() const {
497	u32 imm{static_cast<u32>(imm20_19)};	514	u32 imm{static_cast<u32>(imm20_19)};
498	imm <<= 12;	515	imm <<= 12;
@@ -1017,6 +1034,23 @@ union Instruction {
1017	} isberd;	1034	} isberd;
1018		1035
1019	union {	1036	union {
		1037	BitField<48, 1, u64> signed_a;
		1038	BitField<38, 1, u64> is_byte_chunk_a;
		1039	BitField<36, 2, VmadType> type_a;
		1040	BitField<36, 2, u64> byte_height_a;
		1041
		1042	BitField<49, 1, u64> signed_b;
		1043	BitField<50, 1, u64> use_register_b;
		1044	BitField<30, 1, u64> is_byte_chunk_b;
		1045	BitField<28, 2, VmadType> type_b;
		1046	BitField<28, 2, u64> byte_height_b;
		1047
		1048	BitField<51, 2, VmadShr> shr;
		1049	BitField<55, 1, u64> saturate; // Saturates the result (a * b + c)
		1050	BitField<47, 1, u64> cc;
		1051	} vmad;
		1052
		1053	union {
1020	BitField<20, 16, u64> imm20_16;	1054	BitField<20, 16, u64> imm20_16;
1021	BitField<36, 1, u64> product_shift_left;	1055	BitField<36, 1, u64> product_shift_left;
1022	BitField<37, 1, u64> merge_37;	1056	BitField<37, 1, u64> merge_37;
@@ -1083,6 +1117,7 @@ public:
1083	IPA,	1117	IPA,
1084	OUT_R, // Emit vertex/primitive	1118	OUT_R, // Emit vertex/primitive
1085	ISBERD,	1119	ISBERD,
		1120	VMAD,
1086	FFMA_IMM, // Fused Multiply and Add	1121	FFMA_IMM, // Fused Multiply and Add
1087	FFMA_CR,	1122	FFMA_CR,
1088	FFMA_RC,	1123	FFMA_RC,
@@ -1320,6 +1355,7 @@ private:
1320	INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),	1355	INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),
1321	INST("1111101111100---", Id::OUT_R, Type::Trivial, "OUT_R"),	1356	INST("1111101111100---", Id::OUT_R, Type::Trivial, "OUT_R"),
1322	INST("1110111111010---", Id::ISBERD, Type::Trivial, "ISBERD"),	1357	INST("1110111111010---", Id::ISBERD, Type::Trivial, "ISBERD"),
		1358	INST("01011111--------", Id::VMAD, Type::Trivial, "VMAD"),
1323	INST("0011001-1-------", Id::FFMA_IMM, Type::Ffma, "FFMA_IMM"),	1359	INST("0011001-1-------", Id::FFMA_IMM, Type::Ffma, "FFMA_IMM"),
1324	INST("010010011-------", Id::FFMA_CR, Type::Ffma, "FFMA_CR"),	1360	INST("010010011-------", Id::FFMA_CR, Type::Ffma, "FFMA_CR"),
1325	INST("010100011-------", Id::FFMA_RC, Type::Ffma, "FFMA_RC"),	1361	INST("010100011-------", Id::FFMA_RC, Type::Ffma, "FFMA_RC"),


diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index 56ff83eff..65a220c41 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -45,7 +45,9 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
45	SurfaceParams params{};	45	SurfaceParams params{};
46	params.addr = TryGetCpuAddr(config.tic.Address());	46	params.addr = TryGetCpuAddr(config.tic.Address());
47	params.is_tiled = config.tic.IsTiled();	47	params.is_tiled = config.tic.IsTiled();
		48	params.block_width = params.is_tiled ? config.tic.BlockWidth() : 0,
48	params.block_height = params.is_tiled ? config.tic.BlockHeight() : 0,	49	params.block_height = params.is_tiled ? config.tic.BlockHeight() : 0,
		50	params.block_depth = params.is_tiled ? config.tic.BlockDepth() : 0,
49	params.pixel_format =	51	params.pixel_format =
50	PixelFormatFromTextureFormat(config.tic.format, config.tic.r_type.Value());	52	PixelFormatFromTextureFormat(config.tic.format, config.tic.r_type.Value());
51	params.component_type = ComponentTypeFromTexture(config.tic.r_type.Value());	53	params.component_type = ComponentTypeFromTexture(config.tic.r_type.Value());
@@ -97,8 +99,11 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
97	const auto& config{Core::System::GetInstance().GPU().Maxwell3D().regs.rt[index]};	99	const auto& config{Core::System::GetInstance().GPU().Maxwell3D().regs.rt[index]};
98	SurfaceParams params{};	100	SurfaceParams params{};
99	params.addr = TryGetCpuAddr(config.Address());	101	params.addr = TryGetCpuAddr(config.Address());
100	params.is_tiled = true;	102	params.is_tiled =
101	params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight;	103	config.memory_layout.type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear;
		104	params.block_width = 1 << config.memory_layout.block_width;
		105	params.block_height = 1 << config.memory_layout.block_height;
		106	params.block_depth = 1 << config.memory_layout.block_depth;
102	params.pixel_format = PixelFormatFromRenderTargetFormat(config.format);	107	params.pixel_format = PixelFormatFromRenderTargetFormat(config.format);
103	params.component_type = ComponentTypeFromRenderTarget(config.format);	108	params.component_type = ComponentTypeFromRenderTarget(config.format);
104	params.type = GetFormatType(params.pixel_format);	109	params.type = GetFormatType(params.pixel_format);
@@ -120,13 +125,16 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
120	return params;	125	return params;
121	}	126	}
122		127
123	/static/ SurfaceParams SurfaceParams::CreateForDepthBuffer(u32 zeta_width, u32 zeta_height,	128	/static/ SurfaceParams SurfaceParams::CreateForDepthBuffer(
124	Tegra::GPUVAddr zeta_address,	129	u32 zeta_width, u32 zeta_height, Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format,
125	Tegra::DepthFormat format) {	130	u32 block_width, u32 block_height, u32 block_depth,
		131	Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type) {
126	SurfaceParams params{};	132	SurfaceParams params{};
127	params.addr = TryGetCpuAddr(zeta_address);	133	params.addr = TryGetCpuAddr(zeta_address);
128	params.is_tiled = true;	134	params.is_tiled = type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear;
129	params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight;	135	params.block_width = 1 << std::min(block_width, 5U);
		136	params.block_height = 1 << std::min(block_height, 5U);
		137	params.block_depth = 1 << std::min(block_depth, 5U);
130	params.pixel_format = PixelFormatFromDepthFormat(format);	138	params.pixel_format = PixelFormatFromDepthFormat(format);
131	params.component_type = ComponentTypeFromDepthFormat(format);	139	params.component_type = ComponentTypeFromDepthFormat(format);
132	params.type = GetFormatType(params.pixel_format);	140	params.type = GetFormatType(params.pixel_format);
@@ -148,7 +156,9 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
148	SurfaceParams params{};	156	SurfaceParams params{};
149	params.addr = TryGetCpuAddr(config.Address());	157	params.addr = TryGetCpuAddr(config.Address());
150	params.is_tiled = !config.linear;	158	params.is_tiled = !config.linear;
151	params.block_height = params.is_tiled ? config.BlockHeight() : 0,	159	params.block_width = params.is_tiled ? std::min(config.BlockWidth(), 32U) : 0,
		160	params.block_height = params.is_tiled ? std::min(config.BlockHeight(), 32U) : 0,
		161	params.block_depth = params.is_tiled ? std::min(config.BlockDepth(), 32U) : 0,
152	params.pixel_format = PixelFormatFromRenderTargetFormat(config.format);	162	params.pixel_format = PixelFormatFromRenderTargetFormat(config.format);
153	params.component_type = ComponentTypeFromRenderTarget(config.format);	163	params.component_type = ComponentTypeFromRenderTarget(config.format);
154	params.type = GetFormatType(params.pixel_format);	164	params.type = GetFormatType(params.pixel_format);
@@ -818,6 +828,11 @@ void CachedSurface::LoadGLBuffer() {
818	if (params.is_tiled) {	828	if (params.is_tiled) {
819	gl_buffer.resize(total_size);	829	gl_buffer.resize(total_size);
820		830
		831	ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}",
		832	params.block_width, static_cast<u32>(params.target));
		833	ASSERT_MSG(params.block_depth == 1, "Block depth is defined as {} on texture type {}",
		834	params.block_depth, static_cast<u32>(params.target));
		835
821	// TODO(bunnei): This only unswizzles and copies a 2D texture - we do not yet know how to do	836	// TODO(bunnei): This only unswizzles and copies a 2D texture - we do not yet know how to do
822	// this for 3D textures, etc.	837	// this for 3D textures, etc.
823	switch (params.target) {	838	switch (params.target) {
@@ -989,7 +1004,9 @@ Surface RasterizerCacheOpenGL::GetDepthBufferSurface(bool preserve_contents) {
989	}	1004	}
990		1005
991	SurfaceParams depth_params{SurfaceParams::CreateForDepthBuffer(	1006	SurfaceParams depth_params{SurfaceParams::CreateForDepthBuffer(
992	regs.zeta_width, regs.zeta_height, regs.zeta.Address(), regs.zeta.format)};	1007	regs.zeta_width, regs.zeta_height, regs.zeta.Address(), regs.zeta.format,
		1008	regs.zeta.memory_layout.block_width, regs.zeta.memory_layout.block_height,
		1009	regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)};
993		1010
994	return GetSurface(depth_params, preserve_contents);	1011	return GetSurface(depth_params, preserve_contents);
995	}	1012	}


diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index 0b4940b3c..66d98ad4e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -716,9 +716,10 @@ struct SurfaceParams {
716	static SurfaceParams CreateForFramebuffer(std::size_t index);	716	static SurfaceParams CreateForFramebuffer(std::size_t index);
717		717
718	/// Creates SurfaceParams for a depth buffer configuration	718	/// Creates SurfaceParams for a depth buffer configuration
719	static SurfaceParams CreateForDepthBuffer(u32 zeta_width, u32 zeta_height,	719	static SurfaceParams CreateForDepthBuffer(
720	Tegra::GPUVAddr zeta_address,	720	u32 zeta_width, u32 zeta_height, Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format,
721	Tegra::DepthFormat format);	721	u32 block_width, u32 block_height, u32 block_depth,
		722	Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type);
722		723
723	/// Creates SurfaceParams for a Fermi2D surface copy	724	/// Creates SurfaceParams for a Fermi2D surface copy
724	static SurfaceParams CreateForFermiCopySurface(	725	static SurfaceParams CreateForFermiCopySurface(
@@ -733,7 +734,9 @@ struct SurfaceParams {
733		734
734	VAddr addr;	735	VAddr addr;
735	bool is_tiled;	736	bool is_tiled;
		737	u32 block_width;
736	u32 block_height;	738	u32 block_height;
		739	u32 block_depth;
737	PixelFormat pixel_format;	740	PixelFormat pixel_format;
738	ComponentType component_type;	741	ComponentType component_type;
739	SurfaceType type;	742	SurfaceType type;


diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index c82a0dcfa..8dfb49507 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -2953,6 +2953,88 @@ private:
2953	LOG_WARNING(HW_GPU, "DEPBAR instruction is stubbed");	2953	LOG_WARNING(HW_GPU, "DEPBAR instruction is stubbed");
2954	break;	2954	break;
2955	}	2955	}
		2956	case OpCode::Id::VMAD: {
		2957	const bool signed_a = instr.vmad.signed_a == 1;
		2958	const bool signed_b = instr.vmad.signed_b == 1;
		2959	const bool result_signed = signed_a || signed_b;
		2960	boost::optional<std::string> forced_result;
		2961
		2962	auto Unpack = [&](const std::string& op, bool is_chunk, bool is_signed,
		2963	Tegra::Shader::VmadType type, u64 byte_height) {
		2964	const std::string value = [&]() {
		2965	if (!is_chunk) {
		2966	const auto offset = static_cast<u32>(byte_height * 8);
		2967	return "((" + op + " >> " + std::to_string(offset) + ") & 0xff)";
		2968	}
		2969	const std::string zero = "0";
		2970
		2971	switch (type) {
		2972	case Tegra::Shader::VmadType::Size16_Low:
		2973	return '(' + op + " & 0xffff)";
		2974	case Tegra::Shader::VmadType::Size16_High:
		2975	return '(' + op + " >> 16)";
		2976	case Tegra::Shader::VmadType::Size32:
		2977	// TODO(Rodrigo): From my hardware tests it becomes a bit "mad" when
		2978	// this type is used (1 * 1 + 0 == 0x5b800000). Until a better
		2979	// explanation is found: assert.
		2980	UNREACHABLE_MSG("Unimplemented");
		2981	return zero;
		2982	case Tegra::Shader::VmadType::Invalid:
		2983	// Note(Rodrigo): This flag is invalid according to nvdisasm. From my
		2984	// testing (even though it's invalid) this makes the whole instruction
		2985	// assign zero to target register.
		2986	forced_result = boost::make_optional(zero);
		2987	return zero;
		2988	default:
		2989	UNREACHABLE();
		2990	return zero;
		2991	}
		2992	}();
		2993
		2994	if (is_signed) {
		2995	return "int(" + value + ')';
		2996	}
		2997	return value;
		2998	};
		2999
		3000	const std::string op_a = Unpack(regs.GetRegisterAsInteger(instr.gpr8, 0, false),
		3001	instr.vmad.is_byte_chunk_a != 0, signed_a,
		3002	instr.vmad.type_a, instr.vmad.byte_height_a);
		3003
		3004	std::string op_b;
		3005	if (instr.vmad.use_register_b) {
		3006	op_b = Unpack(regs.GetRegisterAsInteger(instr.gpr20, 0, false),
		3007	instr.vmad.is_byte_chunk_b != 0, signed_b, instr.vmad.type_b,
		3008	instr.vmad.byte_height_b);
		3009	} else {
		3010	op_b = '(' +
		3011	std::to_string(signed_b ? static_cast<s16>(instr.alu.GetImm20_16())
		3012	: instr.alu.GetImm20_16()) +
		3013	')';
		3014	}
		3015
		3016	const std::string op_c = regs.GetRegisterAsInteger(instr.gpr39, 0, result_signed);
		3017
		3018	std::string result;
		3019	if (forced_result) {
		3020	result = *forced_result;
		3021	} else {
		3022	result = '(' + op_a + " * " + op_b + " + " + op_c + ')';
		3023
		3024	switch (instr.vmad.shr) {
		3025	case Tegra::Shader::VmadShr::Shr7:
		3026	result = '(' + result + " >> 7)";
		3027	break;
		3028	case Tegra::Shader::VmadShr::Shr15:
		3029	result = '(' + result + " >> 15)";
		3030	break;
		3031	}
		3032	}
		3033	regs.SetRegisterToInteger(instr.gpr0, result_signed, 1, result, 1, 1,
		3034	instr.vmad.saturate == 1, 0, Register::Size::Word,
		3035	instr.vmad.cc);
		3036	break;
		3037	}
2956	default: {	3038	default: {
2957	LOG_CRITICAL(HW_GPU, "Unhandled instruction: {}", opcode->GetName());	3039	LOG_CRITICAL(HW_GPU, "Unhandled instruction: {}", opcode->GetName());
2958	UNREACHABLE();	3040	UNREACHABLE();


diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index 8f31d825a..58d17abcb 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h
@@ -161,7 +161,9 @@ struct TICEntry {
161	BitField<21, 3, TICHeaderVersion> header_version;	161	BitField<21, 3, TICHeaderVersion> header_version;
162	};	162	};
163	union {	163	union {
		164	BitField<0, 3, u32> block_width;
164	BitField<3, 3, u32> block_height;	165	BitField<3, 3, u32> block_height;
		166	BitField<6, 3, u32> block_depth;
165		167
166	// High 16 bits of the pitch value	168	// High 16 bits of the pitch value
167	BitField<0, 16, u32> pitch_high;	169	BitField<0, 16, u32> pitch_high;
@@ -202,13 +204,24 @@ struct TICEntry {
202	return depth_minus_1 + 1;	204	return depth_minus_1 + 1;
203	}	205	}
204		206
		207	u32 BlockWidth() const {
		208	ASSERT(IsTiled());
		209	// The block width is stored in log2 format.
		210	return 1 << block_width;
		211	}
		212
205	u32 BlockHeight() const {	213	u32 BlockHeight() const {
206	ASSERT(header_version == TICHeaderVersion::BlockLinear ||	214	ASSERT(IsTiled());
207	header_version == TICHeaderVersion::BlockLinearColorKey);
208	// The block height is stored in log2 format.	215	// The block height is stored in log2 format.
209	return 1 << block_height;	216	return 1 << block_height;
210	}	217	}
211		218
		219	u32 BlockDepth() const {
		220	ASSERT(IsTiled());
		221	// The block depth is stored in log2 format.
		222	return 1 << block_depth;
		223	}
		224
212	bool IsTiled() const {	225	bool IsTiled() const {
213	return header_version == TICHeaderVersion::BlockLinear ||	226	return header_version == TICHeaderVersion::BlockLinear ||
214	header_version == TICHeaderVersion::BlockLinearColorKey;	227	header_version == TICHeaderVersion::BlockLinearColorKey;