-rw-r--r--  CMakeModules/GenerateSCMRev.cmake | 1
-rw-r--r--  src/common/CMakeLists.txt | 1
-rw-r--r--  src/common/alignment.h | 60
-rw-r--r--  src/core/hle/kernel/code_set.h | 3
-rw-r--r--  src/core/hle/kernel/physical_memory.h | 19
-rw-r--r--  src/core/hle/kernel/process.cpp | 6
-rw-r--r--  src/core/hle/kernel/shared_memory.cpp | 6
-rw-r--r--  src/core/hle/kernel/shared_memory.h | 13
-rw-r--r--  src/core/hle/kernel/transfer_memory.cpp | 2
-rw-r--r--  src/core/hle/kernel/transfer_memory.h | 3
-rw-r--r--  src/core/hle/kernel/vm_manager.cpp | 15
-rw-r--r--  src/core/hle/kernel/vm_manager.h | 9
-rw-r--r--  src/core/hle/service/ns/pl_u.cpp | 12
-rw-r--r--  src/core/loader/elf.cpp | 2
-rw-r--r--  src/core/loader/kip.cpp | 2
-rw-r--r--  src/core/loader/nro.cpp | 2
-rw-r--r--  src/core/loader/nso.cpp | 2
-rw-r--r--  src/video_core/CMakeLists.txt | 5
-rw-r--r--  src/video_core/buffer_cache.h | 299
-rw-r--r--  src/video_core/buffer_cache/buffer_block.h | 76
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h | 447
-rw-r--r--  src/video_core/buffer_cache/map_interval.h | 89
-rw-r--r--  src/video_core/dma_pusher.cpp | 1
-rw-r--r--  src/video_core/engines/fermi_2d.cpp | 3
-rw-r--r--  src/video_core/engines/fermi_2d.h | 3
-rw-r--r--  src/video_core/engines/kepler_memory.cpp | 2
-rw-r--r--  src/video_core/engines/kepler_memory.h | 1
-rw-r--r--  src/video_core/engines/maxwell_3d.cpp | 22
-rw-r--r--  src/video_core/engines/maxwell_3d.h | 2
-rw-r--r--  src/video_core/engines/maxwell_dma.cpp | 48
-rw-r--r--  src/video_core/engines/maxwell_dma.h | 9
-rw-r--r--  src/video_core/engines/shader_bytecode.h | 27
-rw-r--r--  src/video_core/gpu.cpp | 8
-rw-r--r--  src/video_core/gpu.h | 8
-rw-r--r--  src/video_core/rasterizer_interface.h | 3
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache.cpp | 52
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache.h | 39
-rw-r--r--  src/video_core/renderer_opengl/gl_device.cpp | 3
-rw-r--r--  src/video_core/renderer_opengl/gl_device.h | 5
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp | 8
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.h | 1
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_cache.cpp | 14
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_cache.h | 1
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 67
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.cpp | 4
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.h | 2
-rw-r--r--  src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 43
-rw-r--r--  src/video_core/shader/decode.cpp | 1
-rw-r--r--  src/video_core/shader/decode/conversion.cpp | 44
-rw-r--r--  src/video_core/shader/decode/float_set.cpp | 1
-rw-r--r--  src/video_core/shader/decode/float_set_predicate.cpp | 10
-rw-r--r--  src/video_core/shader/decode/half_set_predicate.cpp | 2
-rw-r--r--  src/video_core/shader/decode/integer_set.cpp | 1
-rw-r--r--  src/video_core/shader/decode/integer_set_predicate.cpp | 1
-rw-r--r--  src/video_core/shader/decode/other.cpp | 7
-rw-r--r--  src/video_core/shader/decode/predicate_set_register.cpp | 1
-rw-r--r--  src/video_core/shader/decode/warp.cpp | 55
-rw-r--r--  src/video_core/shader/node.h | 30
-rw-r--r--  src/video_core/shader/shader_ir.cpp | 5
-rw-r--r--  src/video_core/shader/shader_ir.h | 4
-rw-r--r--  src/video_core/texture_cache/surface_params.h | 1
-rw-r--r--  src/video_core/texture_cache/texture_cache.h | 2
-rw-r--r--  src/video_core/textures/decoders.cpp | 14
-rw-r--r--  src/video_core/textures/decoders.h | 3
-rw-r--r--  src/video_core/textures/texture.h | 2
-rw-r--r--  src/yuzu/configuration/config.cpp | 1
-rw-r--r--  src/yuzu/main.cpp | 19
-rw-r--r--  src/yuzu/main.h | 3
-rw-r--r--  src/yuzu_tester/yuzu.cpp | 3
69 files changed, 1188 insertions(+), 472 deletions(-)
diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake
index abdc74428..a1ace89cb 100644
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -81,6 +81,7 @@ set(HASH_FILES
81 "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp" 81 "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"
82 "${VIDEO_CORE}/shader/decode/shift.cpp" 82 "${VIDEO_CORE}/shader/decode/shift.cpp"
83 "${VIDEO_CORE}/shader/decode/video.cpp" 83 "${VIDEO_CORE}/shader/decode/video.cpp"
84 "${VIDEO_CORE}/shader/decode/warp.cpp"
84 "${VIDEO_CORE}/shader/decode/xmad.cpp" 85 "${VIDEO_CORE}/shader/decode/xmad.cpp"
85 "${VIDEO_CORE}/shader/control_flow.cpp" 86 "${VIDEO_CORE}/shader/control_flow.cpp"
86 "${VIDEO_CORE}/shader/control_flow.h" 87 "${VIDEO_CORE}/shader/control_flow.h"
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 2b4266f29..01abdb3bb 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -55,6 +55,7 @@ add_custom_command(OUTPUT scm_rev.cpp
55 "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp" 55 "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"
56 "${VIDEO_CORE}/shader/decode/shift.cpp" 56 "${VIDEO_CORE}/shader/decode/shift.cpp"
57 "${VIDEO_CORE}/shader/decode/video.cpp" 57 "${VIDEO_CORE}/shader/decode/video.cpp"
58 "${VIDEO_CORE}/shader/decode/warp.cpp"
58 "${VIDEO_CORE}/shader/decode/xmad.cpp" 59 "${VIDEO_CORE}/shader/decode/xmad.cpp"
59 "${VIDEO_CORE}/shader/control_flow.cpp" 60 "${VIDEO_CORE}/shader/control_flow.cpp"
60 "${VIDEO_CORE}/shader/control_flow.h" 61 "${VIDEO_CORE}/shader/control_flow.h"
diff --git a/src/common/alignment.h b/src/common/alignment.h
index 617b14d9b..88d5d3a65 100644
--- a/src/common/alignment.h
+++ b/src/common/alignment.h
@@ -3,6 +3,7 @@
 #pragma once
 
 #include <cstddef>
+#include <memory>
 #include <type_traits>
 
 namespace Common {
@@ -37,4 +38,63 @@ constexpr bool IsWordAligned(T value) {
     return (value & 0b11) == 0;
 }
 
+template <typename T, std::size_t Align = 16>
+class AlignmentAllocator {
+public:
+    using value_type = T;
+    using size_type = std::size_t;
+    using difference_type = std::ptrdiff_t;
+
+    using pointer = T*;
+    using const_pointer = const T*;
+
+    using reference = T&;
+    using const_reference = const T&;
+
+public:
+    pointer address(reference r) noexcept {
+        return std::addressof(r);
+    }
+
+    const_pointer address(const_reference r) const noexcept {
+        return std::addressof(r);
+    }
+
+    pointer allocate(size_type n) {
+        return static_cast<pointer>(::operator new (n, std::align_val_t{Align}));
+    }
+
+    void deallocate(pointer p, size_type) {
+        ::operator delete (p, std::align_val_t{Align});
+    }
+
+    void construct(pointer p, const value_type& wert) {
+        new (p) value_type(wert);
+    }
+
+    void destroy(pointer p) {
+        p->~value_type();
+    }
+
+    size_type max_size() const noexcept {
+        return size_type(-1) / sizeof(value_type);
+    }
+
+    template <typename T2>
+    struct rebind {
+        using other = AlignmentAllocator<T2, Align>;
+    };
+
+    bool operator!=(const AlignmentAllocator<T, Align>& other) const noexcept {
+        return !(*this == other);
+    }
+
+    // Returns true if and only if storage allocated from *this
+    // can be deallocated from other, and vice versa.
+    // Always returns true for stateless allocators.
+    bool operator==(const AlignmentAllocator<T, Align>& other) const noexcept {
+        return true;
+    }
+};
+
 } // namespace Common
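The allocator added above is what the new Kernel::PhysicalMemory alias (introduced below) plugs into std::vector: allocate() goes through C++17 aligned operator new, so the vector's backing storage starts on an Align-byte boundary. A minimal usage sketch, assuming common/alignment.h from this change is on the include path (the assertion is illustrative only):

#include <cassert>
#include <cstdint>
#include <vector>

#include "common/alignment.h"

int main() {
    // 256-byte alignment matches the requirement documented in physical_memory.h below.
    std::vector<std::uint8_t, Common::AlignmentAllocator<std::uint8_t, 256>> block(0x1000);

    // data() comes from ::operator new(size, std::align_val_t{256}), so it is 256-byte aligned.
    assert(reinterpret_cast<std::uintptr_t>(block.data()) % 256 == 0);
    return 0;
}

Note that allocate() forwards the element count n directly to operator new as a byte count; that works here because the allocator is only instantiated with single-byte elements (u8).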
diff --git a/src/core/hle/kernel/code_set.h b/src/core/hle/kernel/code_set.h
index 879957dcb..d8ad54030 100644
--- a/src/core/hle/kernel/code_set.h
+++ b/src/core/hle/kernel/code_set.h
@@ -8,6 +8,7 @@
 #include <vector>
 
 #include "common/common_types.h"
+#include "core/hle/kernel/physical_memory.h"
 
 namespace Kernel {
 
@@ -77,7 +78,7 @@ struct CodeSet final {
     }
 
     /// The overall data that backs this code set.
-    std::vector<u8> memory;
+    Kernel::PhysicalMemory memory;
 
     /// The segments that comprise this code set.
     std::array<Segment, 3> segments;
diff --git a/src/core/hle/kernel/physical_memory.h b/src/core/hle/kernel/physical_memory.h
new file mode 100644
index 000000000..090565310
--- /dev/null
+++ b/src/core/hle/kernel/physical_memory.h
@@ -0,0 +1,19 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include "common/alignment.h"
8
9namespace Kernel {
10
11// This encapsulation serves 2 purposes:
12// - First, to encapsulate host physical memory under a single type and set a
13// standard for managing it.
14// - Second, to ensure all host backing memory used is aligned to 256 bytes due
15// to strict alignment restrictions on GPU memory.
16
17using PhysicalMemory = std::vector<u8, Common::AlignmentAllocator<u8, 256>>;
18
19} // namespace Kernel
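Since PhysicalMemory keeps std::vector's interface and only swaps the allocator, the call sites changed below (process.cpp, shared_memory, transfer_memory, vm_manager and the loaders) only need the type name replaced. A hypothetical helper, mirroring the pattern used in process.cpp below, just to illustrate the drop-in nature (not part of this change):

#include <cstddef>
#include <memory>

#include "core/hle/kernel/physical_memory.h"

// Hypothetical: allocate a zero-filled, 256-byte-aligned backing block of size bytes.
std::shared_ptr<Kernel::PhysicalMemory> MakeBackingBlock(std::size_t size) {
    // Same member calls as before (data(), size(), resize(), ...); only the alignment
    // of the underlying storage differs from a plain std::vector<u8>.
    return std::make_shared<Kernel::PhysicalMemory>(size);
}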
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index 92169a97b..e80a12ac3 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -247,7 +247,7 @@ VAddr Process::CreateTLSRegion() {
     ASSERT(region_address.Succeeded());
 
     const auto map_result = vm_manager.MapMemoryBlock(
-        *region_address, std::make_shared<std::vector<u8>>(Memory::PAGE_SIZE), 0,
+        *region_address, std::make_shared<PhysicalMemory>(Memory::PAGE_SIZE), 0,
         Memory::PAGE_SIZE, MemoryState::ThreadLocal);
     ASSERT(map_result.Succeeded());
 
@@ -277,7 +277,7 @@ void Process::FreeTLSRegion(VAddr tls_address) {
 }
 
 void Process::LoadModule(CodeSet module_, VAddr base_addr) {
-    const auto memory = std::make_shared<std::vector<u8>>(std::move(module_.memory));
+    const auto memory = std::make_shared<PhysicalMemory>(std::move(module_.memory));
 
     const auto MapSegment = [&](const CodeSet::Segment& segment, VMAPermission permissions,
                                 MemoryState memory_state) {
@@ -327,7 +327,7 @@ void Process::AllocateMainThreadStack(u64 stack_size) {
     // Allocate and map the main thread stack
     const VAddr mapping_address = vm_manager.GetTLSIORegionEndAddress() - main_thread_stack_size;
     vm_manager
-        .MapMemoryBlock(mapping_address, std::make_shared<std::vector<u8>>(main_thread_stack_size),
+        .MapMemoryBlock(mapping_address, std::make_shared<PhysicalMemory>(main_thread_stack_size),
                         0, main_thread_stack_size, MemoryState::Stack)
         .Unwrap();
 }
diff --git a/src/core/hle/kernel/shared_memory.cpp b/src/core/hle/kernel/shared_memory.cpp
index f15c5ee36..a815c4eea 100644
--- a/src/core/hle/kernel/shared_memory.cpp
+++ b/src/core/hle/kernel/shared_memory.cpp
@@ -28,7 +28,7 @@ SharedPtr<SharedMemory> SharedMemory::Create(KernelCore& kernel, Process* owner_
     shared_memory->other_permissions = other_permissions;
 
     if (address == 0) {
-        shared_memory->backing_block = std::make_shared<std::vector<u8>>(size);
+        shared_memory->backing_block = std::make_shared<Kernel::PhysicalMemory>(size);
         shared_memory->backing_block_offset = 0;
 
         // Refresh the address mappings for the current process.
@@ -59,8 +59,8 @@ SharedPtr<SharedMemory> SharedMemory::Create(KernelCore& kernel, Process* owner_
 }
 
 SharedPtr<SharedMemory> SharedMemory::CreateForApplet(
-    KernelCore& kernel, std::shared_ptr<std::vector<u8>> heap_block, std::size_t offset, u64 size,
-    MemoryPermission permissions, MemoryPermission other_permissions, std::string name) {
+    KernelCore& kernel, std::shared_ptr<Kernel::PhysicalMemory> heap_block, std::size_t offset,
+    u64 size, MemoryPermission permissions, MemoryPermission other_permissions, std::string name) {
     SharedPtr<SharedMemory> shared_memory(new SharedMemory(kernel));
 
     shared_memory->owner_process = nullptr;
diff --git a/src/core/hle/kernel/shared_memory.h b/src/core/hle/kernel/shared_memory.h
index c2b6155e1..01ca6dcd2 100644
--- a/src/core/hle/kernel/shared_memory.h
+++ b/src/core/hle/kernel/shared_memory.h
@@ -10,6 +10,7 @@
 
 #include "common/common_types.h"
 #include "core/hle/kernel/object.h"
+#include "core/hle/kernel/physical_memory.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/result.h"
 
@@ -62,12 +63,10 @@ public:
     * block.
     * @param name Optional object name, used for debugging purposes.
     */
-    static SharedPtr<SharedMemory> CreateForApplet(KernelCore& kernel,
-                                                   std::shared_ptr<std::vector<u8>> heap_block,
-                                                   std::size_t offset, u64 size,
-                                                   MemoryPermission permissions,
-                                                   MemoryPermission other_permissions,
-                                                   std::string name = "Unknown Applet");
+    static SharedPtr<SharedMemory> CreateForApplet(
+        KernelCore& kernel, std::shared_ptr<Kernel::PhysicalMemory> heap_block, std::size_t offset,
+        u64 size, MemoryPermission permissions, MemoryPermission other_permissions,
+        std::string name = "Unknown Applet");
 
     std::string GetTypeName() const override {
         return "SharedMemory";
@@ -135,7 +134,7 @@ private:
     ~SharedMemory() override;
 
     /// Backing memory for this shared memory block.
-    std::shared_ptr<std::vector<u8>> backing_block;
+    std::shared_ptr<PhysicalMemory> backing_block;
     /// Offset into the backing block for this shared memory.
     std::size_t backing_block_offset = 0;
     /// Size of the memory block. Page-aligned.
diff --git a/src/core/hle/kernel/transfer_memory.cpp b/src/core/hle/kernel/transfer_memory.cpp
index 26c4e5e67..1113c815e 100644
--- a/src/core/hle/kernel/transfer_memory.cpp
+++ b/src/core/hle/kernel/transfer_memory.cpp
@@ -47,7 +47,7 @@ ResultCode TransferMemory::MapMemory(VAddr address, u64 size, MemoryPermission p
         return ERR_INVALID_STATE;
     }
 
-    backing_block = std::make_shared<std::vector<u8>>(size);
+    backing_block = std::make_shared<PhysicalMemory>(size);
 
     const auto map_state = owner_permissions == MemoryPermission::None
                                ? MemoryState::TransferMemoryIsolated
diff --git a/src/core/hle/kernel/transfer_memory.h b/src/core/hle/kernel/transfer_memory.h
index a140b1e2b..6be9dc094 100644
--- a/src/core/hle/kernel/transfer_memory.h
+++ b/src/core/hle/kernel/transfer_memory.h
@@ -8,6 +8,7 @@
 #include <vector>
 
 #include "core/hle/kernel/object.h"
+#include "core/hle/kernel/physical_memory.h"
 
 union ResultCode;
 
@@ -82,7 +83,7 @@ private:
     ~TransferMemory() override;
 
     /// Memory block backing this instance.
-    std::shared_ptr<std::vector<u8>> backing_block;
+    std::shared_ptr<PhysicalMemory> backing_block;
 
     /// The base address for the memory managed by this instance.
     VAddr base_address = 0;
diff --git a/src/core/hle/kernel/vm_manager.cpp b/src/core/hle/kernel/vm_manager.cpp
index 4f45fb03b..40cea1e7c 100644
--- a/src/core/hle/kernel/vm_manager.cpp
+++ b/src/core/hle/kernel/vm_manager.cpp
@@ -5,6 +5,7 @@
 #include <algorithm>
 #include <iterator>
 #include <utility>
+#include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/memory_hook.h"
@@ -103,7 +104,7 @@ bool VMManager::IsValidHandle(VMAHandle handle) const {
 }
 
 ResultVal<VMManager::VMAHandle> VMManager::MapMemoryBlock(VAddr target,
-                                                          std::shared_ptr<std::vector<u8>> block,
+                                                          std::shared_ptr<PhysicalMemory> block,
                                                           std::size_t offset, u64 size,
                                                           MemoryState state, VMAPermission perm) {
     ASSERT(block != nullptr);
@@ -260,7 +261,7 @@ ResultVal<VAddr> VMManager::SetHeapSize(u64 size) {
 
     if (heap_memory == nullptr) {
         // Initialize heap
-        heap_memory = std::make_shared<std::vector<u8>>(size);
+        heap_memory = std::make_shared<PhysicalMemory>(size);
         heap_end = heap_region_base + size;
     } else {
         UnmapRange(heap_region_base, GetCurrentHeapSize());
@@ -341,7 +342,7 @@ ResultCode VMManager::MapPhysicalMemory(VAddr target, u64 size) {
         const auto map_size = std::min(end_addr - cur_addr, vma_end - cur_addr);
         if (vma.state == MemoryState::Unmapped) {
             const auto map_res =
-                MapMemoryBlock(cur_addr, std::make_shared<std::vector<u8>>(map_size, 0), 0,
+                MapMemoryBlock(cur_addr, std::make_shared<PhysicalMemory>(map_size, 0), 0,
                                map_size, MemoryState::Heap, VMAPermission::ReadWrite);
             result = map_res.Code();
             if (result.IsError()) {
@@ -442,7 +443,7 @@ ResultCode VMManager::UnmapPhysicalMemory(VAddr target, u64 size) {
     if (result.IsError()) {
         for (const auto [map_address, map_size] : unmapped_regions) {
             const auto remap_res =
-                MapMemoryBlock(map_address, std::make_shared<std::vector<u8>>(map_size, 0), 0,
+                MapMemoryBlock(map_address, std::make_shared<PhysicalMemory>(map_size, 0), 0,
                                map_size, MemoryState::Heap, VMAPermission::None);
             ASSERT_MSG(remap_res.Succeeded(), "UnmapPhysicalMemory re-map on error");
         }
@@ -593,7 +594,7 @@ ResultCode VMManager::MirrorMemory(VAddr dst_addr, VAddr src_addr, u64 size, Mem
     ASSERT_MSG(vma_offset + size <= vma->second.size,
                "Shared memory exceeds bounds of mapped block");
 
-    const std::shared_ptr<std::vector<u8>>& backing_block = vma->second.backing_block;
+    const std::shared_ptr<PhysicalMemory>& backing_block = vma->second.backing_block;
     const std::size_t backing_block_offset = vma->second.offset + vma_offset;
 
     CASCADE_RESULT(auto new_vma,
@@ -606,7 +607,7 @@ ResultCode VMManager::MirrorMemory(VAddr dst_addr, VAddr src_addr, u64 size, Mem
     return RESULT_SUCCESS;
 }
 
-void VMManager::RefreshMemoryBlockMappings(const std::vector<u8>* block) {
+void VMManager::RefreshMemoryBlockMappings(const PhysicalMemory* block) {
     // If this ever proves to have a noticeable performance impact, allow users of the function to
     // specify a specific range of addresses to limit the scan to.
     for (const auto& p : vma_map) {
@@ -764,7 +765,7 @@ void VMManager::MergeAdjacentVMA(VirtualMemoryArea& left, const VirtualMemoryAre
                            right.backing_block->begin() + right.offset + right.size);
     } else {
         // Slow case: make a new memory block for left and right.
-        auto new_memory = std::make_shared<std::vector<u8>>();
+        auto new_memory = std::make_shared<PhysicalMemory>();
         new_memory->insert(new_memory->end(), left.backing_block->begin() + left.offset,
                            left.backing_block->begin() + left.offset + left.size);
         new_memory->insert(new_memory->end(), right.backing_block->begin() + right.offset,
diff --git a/src/core/hle/kernel/vm_manager.h b/src/core/hle/kernel/vm_manager.h
index 0aecb7499..b18cde619 100644
--- a/src/core/hle/kernel/vm_manager.h
+++ b/src/core/hle/kernel/vm_manager.h
@@ -11,6 +11,7 @@
11#include "common/common_types.h" 11#include "common/common_types.h"
12#include "common/memory_hook.h" 12#include "common/memory_hook.h"
13#include "common/page_table.h" 13#include "common/page_table.h"
14#include "core/hle/kernel/physical_memory.h"
14#include "core/hle/result.h" 15#include "core/hle/result.h"
15#include "core/memory.h" 16#include "core/memory.h"
16 17
@@ -290,7 +291,7 @@ struct VirtualMemoryArea {
 
     // Settings for type = AllocatedMemoryBlock
     /// Memory block backing this VMA.
-    std::shared_ptr<std::vector<u8>> backing_block = nullptr;
+    std::shared_ptr<PhysicalMemory> backing_block = nullptr;
     /// Offset into the backing_memory the mapping starts from.
     std::size_t offset = 0;
 
@@ -348,7 +349,7 @@ public:
     * @param size Size of the mapping.
     * @param state MemoryState tag to attach to the VMA.
     */
-    ResultVal<VMAHandle> MapMemoryBlock(VAddr target, std::shared_ptr<std::vector<u8>> block,
+    ResultVal<VMAHandle> MapMemoryBlock(VAddr target, std::shared_ptr<PhysicalMemory> block,
                                         std::size_t offset, u64 size, MemoryState state,
                                         VMAPermission perm = VMAPermission::ReadWrite);
 
@@ -547,7 +548,7 @@ public:
     * Scans all VMAs and updates the page table range of any that use the given vector as backing
     * memory. This should be called after any operation that causes reallocation of the vector.
     */
-    void RefreshMemoryBlockMappings(const std::vector<u8>* block);
+    void RefreshMemoryBlockMappings(const PhysicalMemory* block);
 
     /// Dumps the address space layout to the log, for debugging
     void LogLayout() const;
@@ -777,7 +778,7 @@ private:
     // the entire virtual address space extents that bound the allocations, including any holes.
     // This makes deallocation and reallocation of holes fast and keeps process memory contiguous
     // in the emulator address space, allowing Memory::GetPointer to be reasonably safe.
-    std::shared_ptr<std::vector<u8>> heap_memory;
+    std::shared_ptr<PhysicalMemory> heap_memory;
 
     // The end of the currently allocated heap. This is not an inclusive
     // end of the range. This is essentially 'base_address + current_size'.
diff --git a/src/core/hle/service/ns/pl_u.cpp b/src/core/hle/service/ns/pl_u.cpp
index ad176f89d..2a522136d 100644
--- a/src/core/hle/service/ns/pl_u.cpp
+++ b/src/core/hle/service/ns/pl_u.cpp
@@ -77,7 +77,7 @@ enum class LoadState : u32 {
     Done = 1,
 };
 
-static void DecryptSharedFont(const std::vector<u32>& input, std::vector<u8>& output,
+static void DecryptSharedFont(const std::vector<u32>& input, Kernel::PhysicalMemory& output,
                               std::size_t& offset) {
     ASSERT_MSG(offset + (input.size() * sizeof(u32)) < SHARED_FONT_MEM_SIZE,
                "Shared fonts exceeds 17mb!");
@@ -94,7 +94,7 @@ static void DecryptSharedFont(const std::vector<u32>& input, std::vector<u8>& ou
     offset += transformed_font.size() * sizeof(u32);
 }
 
-static void EncryptSharedFont(const std::vector<u8>& input, std::vector<u8>& output,
+static void EncryptSharedFont(const std::vector<u8>& input, Kernel::PhysicalMemory& output,
                               std::size_t& offset) {
     ASSERT_MSG(offset + input.size() + 8 < SHARED_FONT_MEM_SIZE, "Shared fonts exceeds 17mb!");
     const u32 KEY = EXPECTED_MAGIC ^ EXPECTED_RESULT;
@@ -121,7 +121,7 @@ struct PL_U::Impl {
         return shared_font_regions.at(index);
     }
 
-    void BuildSharedFontsRawRegions(const std::vector<u8>& input) {
+    void BuildSharedFontsRawRegions(const Kernel::PhysicalMemory& input) {
         // As we can derive the xor key we can just populate the offsets
         // based on the shared memory dump
         unsigned cur_offset = 0;
@@ -144,7 +144,7 @@ struct PL_U::Impl {
     Kernel::SharedPtr<Kernel::SharedMemory> shared_font_mem;
 
     /// Backing memory for the shared font data
-    std::shared_ptr<std::vector<u8>> shared_font;
+    std::shared_ptr<Kernel::PhysicalMemory> shared_font;
 
     // Automatically populated based on shared_fonts dump or system archives.
     std::vector<FontRegion> shared_font_regions;
@@ -166,7 +166,7 @@ PL_U::PL_U() : ServiceFramework("pl:u"), impl{std::make_unique<Impl>()} {
     // Rebuild shared fonts from data ncas
     if (nand->HasEntry(static_cast<u64>(FontArchives::Standard),
                        FileSys::ContentRecordType::Data)) {
-        impl->shared_font = std::make_shared<std::vector<u8>>(SHARED_FONT_MEM_SIZE);
+        impl->shared_font = std::make_shared<Kernel::PhysicalMemory>(SHARED_FONT_MEM_SIZE);
         for (auto font : SHARED_FONTS) {
             const auto nca =
                 nand->GetEntry(static_cast<u64>(font.first), FileSys::ContentRecordType::Data);
@@ -207,7 +207,7 @@ PL_U::PL_U() : ServiceFramework("pl:u"), impl{std::make_unique<Impl>()} {
         }
 
     } else {
-        impl->shared_font = std::make_shared<std::vector<u8>>(
+        impl->shared_font = std::make_shared<Kernel::PhysicalMemory>(
             SHARED_FONT_MEM_SIZE); // Shared memory needs to always be allocated and a fixed size
 
         const std::string user_path = FileUtil::GetUserPath(FileUtil::UserPath::SysDataDir);
diff --git a/src/core/loader/elf.cpp b/src/core/loader/elf.cpp
index 6d4b02375..f1795fdd6 100644
--- a/src/core/loader/elf.cpp
+++ b/src/core/loader/elf.cpp
@@ -295,7 +295,7 @@ Kernel::CodeSet ElfReader::LoadInto(VAddr vaddr) {
         }
     }
 
-    std::vector<u8> program_image(total_image_size);
+    Kernel::PhysicalMemory program_image(total_image_size);
     std::size_t current_image_position = 0;
 
     Kernel::CodeSet codeset;
diff --git a/src/core/loader/kip.cpp b/src/core/loader/kip.cpp
index 70051c13a..474b55cb1 100644
--- a/src/core/loader/kip.cpp
+++ b/src/core/loader/kip.cpp
@@ -69,7 +69,7 @@ AppLoader::LoadResult AppLoader_KIP::Load(Kernel::Process& process) {
 
     const VAddr base_address = process.VMManager().GetCodeRegionBaseAddress();
     Kernel::CodeSet codeset;
-    std::vector<u8> program_image;
+    Kernel::PhysicalMemory program_image;
 
     const auto load_segment = [&program_image](Kernel::CodeSet::Segment& segment,
                                                const std::vector<u8>& data, u32 offset) {
diff --git a/src/core/loader/nro.cpp b/src/core/loader/nro.cpp
index 6a0ca389b..e92e2e06e 100644
--- a/src/core/loader/nro.cpp
+++ b/src/core/loader/nro.cpp
@@ -143,7 +143,7 @@ static bool LoadNroImpl(Kernel::Process& process, const std::vector<u8>& data,
     }
 
     // Build program image
-    std::vector<u8> program_image(PageAlignSize(nro_header.file_size));
+    Kernel::PhysicalMemory program_image(PageAlignSize(nro_header.file_size));
     std::memcpy(program_image.data(), data.data(), program_image.size());
     if (program_image.size() != PageAlignSize(nro_header.file_size)) {
         return {};
diff --git a/src/core/loader/nso.cpp b/src/core/loader/nso.cpp
index 29311404a..70c90109f 100644
--- a/src/core/loader/nso.cpp
+++ b/src/core/loader/nso.cpp
@@ -89,7 +89,7 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::Process& process,
 
     // Build program image
     Kernel::CodeSet codeset;
-    std::vector<u8> program_image;
+    Kernel::PhysicalMemory program_image;
     for (std::size_t i = 0; i < nso_header.segments.size(); ++i) {
         std::vector<u8> data =
             file.ReadBytes(nso_header.segments_compressed_size[i], nso_header.segments[i].offset);
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 7c18c27b3..e2f85c5f1 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_library(video_core STATIC
-    buffer_cache.h
+    buffer_cache/buffer_block.h
+    buffer_cache/buffer_cache.h
+    buffer_cache/map_interval.h
     dma_pusher.cpp
     dma_pusher.h
     debug_utils/debug_utils.cpp
@@ -100,6 +102,7 @@ add_library(video_core STATIC
     shader/decode/integer_set.cpp
     shader/decode/half_set.cpp
     shader/decode/video.cpp
+    shader/decode/warp.cpp
     shader/decode/xmad.cpp
     shader/decode/other.cpp
     shader/control_flow.cpp
diff --git a/src/video_core/buffer_cache.h b/src/video_core/buffer_cache.h
deleted file mode 100644
index 6f868b8b4..000000000
--- a/src/video_core/buffer_cache.h
+++ /dev/null
@@ -1,299 +0,0 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <memory>
9#include <mutex>
10#include <unordered_map>
11#include <unordered_set>
12#include <utility>
13#include <vector>
14
15#include "common/alignment.h"
16#include "common/common_types.h"
17#include "core/core.h"
18#include "video_core/memory_manager.h"
19#include "video_core/rasterizer_cache.h"
20
21namespace VideoCore {
22class RasterizerInterface;
23}
24
25namespace VideoCommon {
26
27template <typename BufferStorageType>
28class CachedBuffer final : public RasterizerCacheObject {
29public:
30 explicit CachedBuffer(VAddr cpu_addr, u8* host_ptr)
31 : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr} {}
32 ~CachedBuffer() override = default;
33
34 VAddr GetCpuAddr() const override {
35 return cpu_addr;
36 }
37
38 std::size_t GetSizeInBytes() const override {
39 return size;
40 }
41
42 u8* GetWritableHostPtr() const {
43 return host_ptr;
44 }
45
46 std::size_t GetSize() const {
47 return size;
48 }
49
50 std::size_t GetCapacity() const {
51 return capacity;
52 }
53
54 bool IsInternalized() const {
55 return is_internal;
56 }
57
58 const BufferStorageType& GetBuffer() const {
59 return buffer;
60 }
61
62 void SetSize(std::size_t new_size) {
63 size = new_size;
64 }
65
66 void SetInternalState(bool is_internal_) {
67 is_internal = is_internal_;
68 }
69
70 BufferStorageType ExchangeBuffer(BufferStorageType buffer_, std::size_t new_capacity) {
71 capacity = new_capacity;
72 std::swap(buffer, buffer_);
73 return buffer_;
74 }
75
76private:
77 u8* host_ptr{};
78 VAddr cpu_addr{};
79 std::size_t size{};
80 std::size_t capacity{};
81 bool is_internal{};
82 BufferStorageType buffer;
83};
84
85template <typename BufferStorageType, typename BufferType, typename StreamBuffer>
86class BufferCache : public RasterizerCache<std::shared_ptr<CachedBuffer<BufferStorageType>>> {
87public:
88 using Buffer = std::shared_ptr<CachedBuffer<BufferStorageType>>;
89 using BufferInfo = std::pair<const BufferType*, u64>;
90
91 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
92 std::unique_ptr<StreamBuffer> stream_buffer)
93 : RasterizerCache<Buffer>{rasterizer}, system{system},
94 stream_buffer{std::move(stream_buffer)}, stream_buffer_handle{
95 this->stream_buffer->GetHandle()} {}
96 ~BufferCache() = default;
97
98 void Unregister(const Buffer& entry) override {
99 std::lock_guard lock{RasterizerCache<Buffer>::mutex};
100 if (entry->IsInternalized()) {
101 internalized_entries.erase(entry->GetCacheAddr());
102 }
103 ReserveBuffer(entry);
104 RasterizerCache<Buffer>::Unregister(entry);
105 }
106
107 void TickFrame() {
108 marked_for_destruction_index =
109 (marked_for_destruction_index + 1) % marked_for_destruction_ring_buffer.size();
110 MarkedForDestruction().clear();
111 }
112
113 BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
114 bool internalize = false, bool is_written = false) {
115 std::lock_guard lock{RasterizerCache<Buffer>::mutex};
116
117 auto& memory_manager = system.GPU().MemoryManager();
118 const auto host_ptr = memory_manager.GetPointer(gpu_addr);
119 if (!host_ptr) {
120 return {GetEmptyBuffer(size), 0};
121 }
122 const auto cache_addr = ToCacheAddr(host_ptr);
123
124 // Cache management is a big overhead, so only cache entries with a given size.
125 // TODO: Figure out which size is the best for given games.
126 constexpr std::size_t max_stream_size = 0x800;
127 if (!internalize && size < max_stream_size &&
128 internalized_entries.find(cache_addr) == internalized_entries.end()) {
129 return StreamBufferUpload(host_ptr, size, alignment);
130 }
131
132 auto entry = RasterizerCache<Buffer>::TryGet(cache_addr);
133 if (!entry) {
134 return FixedBufferUpload(gpu_addr, host_ptr, size, internalize, is_written);
135 }
136
137 if (entry->GetSize() < size) {
138 IncreaseBufferSize(entry, size);
139 }
140 if (is_written) {
141 entry->MarkAsModified(true, *this);
142 }
143 return {ToHandle(entry->GetBuffer()), 0};
144 }
145
146 /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
147 BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
148 std::size_t alignment = 4) {
149 std::lock_guard lock{RasterizerCache<Buffer>::mutex};
150 return StreamBufferUpload(raw_pointer, size, alignment);
151 }
152
153 void Map(std::size_t max_size) {
154 std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
155 buffer_offset = buffer_offset_base;
156 }
157
158 /// Finishes the upload stream, returns true on bindings invalidation.
159 bool Unmap() {
160 stream_buffer->Unmap(buffer_offset - buffer_offset_base);
161 return std::exchange(invalidated, false);
162 }
163
164 virtual const BufferType* GetEmptyBuffer(std::size_t size) = 0;
165
166protected:
167 void FlushObjectInner(const Buffer& entry) override {
168 DownloadBufferData(entry->GetBuffer(), 0, entry->GetSize(), entry->GetWritableHostPtr());
169 }
170
171 virtual BufferStorageType CreateBuffer(std::size_t size) = 0;
172
173 virtual const BufferType* ToHandle(const BufferStorageType& storage) = 0;
174
175 virtual void UploadBufferData(const BufferStorageType& buffer, std::size_t offset,
176 std::size_t size, const u8* data) = 0;
177
178 virtual void DownloadBufferData(const BufferStorageType& buffer, std::size_t offset,
179 std::size_t size, u8* data) = 0;
180
181 virtual void CopyBufferData(const BufferStorageType& src, const BufferStorageType& dst,
182 std::size_t src_offset, std::size_t dst_offset,
183 std::size_t size) = 0;
184
185private:
186 BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
187 std::size_t alignment) {
188 AlignBuffer(alignment);
189 const std::size_t uploaded_offset = buffer_offset;
190 std::memcpy(buffer_ptr, raw_pointer, size);
191
192 buffer_ptr += size;
193 buffer_offset += size;
194 return {&stream_buffer_handle, uploaded_offset};
195 }
196
197 BufferInfo FixedBufferUpload(GPUVAddr gpu_addr, u8* host_ptr, std::size_t size,
198 bool internalize, bool is_written) {
199 auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
200 const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
201 ASSERT(cpu_addr);
202
203 auto entry = GetUncachedBuffer(*cpu_addr, host_ptr);
204 entry->SetSize(size);
205 entry->SetInternalState(internalize);
206 RasterizerCache<Buffer>::Register(entry);
207
208 if (internalize) {
209 internalized_entries.emplace(ToCacheAddr(host_ptr));
210 }
211 if (is_written) {
212 entry->MarkAsModified(true, *this);
213 }
214
215 if (entry->GetCapacity() < size) {
216 MarkedForDestruction().push_back(entry->ExchangeBuffer(CreateBuffer(size), size));
217 }
218
219 UploadBufferData(entry->GetBuffer(), 0, size, host_ptr);
220 return {ToHandle(entry->GetBuffer()), 0};
221 }
222
223 void IncreaseBufferSize(Buffer& entry, std::size_t new_size) {
224 const std::size_t old_size = entry->GetSize();
225 if (entry->GetCapacity() < new_size) {
226 const auto& old_buffer = entry->GetBuffer();
227 auto new_buffer = CreateBuffer(new_size);
228
229 // Copy bits from the old buffer to the new buffer.
230 CopyBufferData(old_buffer, new_buffer, 0, 0, old_size);
231 MarkedForDestruction().push_back(
232 entry->ExchangeBuffer(std::move(new_buffer), new_size));
233
234 // This buffer could have been used
235 invalidated = true;
236 }
237 // Upload the new bits.
238 const std::size_t size_diff = new_size - old_size;
239 UploadBufferData(entry->GetBuffer(), old_size, size_diff, entry->GetHostPtr() + old_size);
240
241 // Update entry's size in the object and in the cache.
242 Unregister(entry);
243
244 entry->SetSize(new_size);
245 RasterizerCache<Buffer>::Register(entry);
246 }
247
248 Buffer GetUncachedBuffer(VAddr cpu_addr, u8* host_ptr) {
249 if (auto entry = TryGetReservedBuffer(host_ptr)) {
250 return entry;
251 }
252 return std::make_shared<CachedBuffer<BufferStorageType>>(cpu_addr, host_ptr);
253 }
254
255 Buffer TryGetReservedBuffer(u8* host_ptr) {
256 const auto it = buffer_reserve.find(ToCacheAddr(host_ptr));
257 if (it == buffer_reserve.end()) {
258 return {};
259 }
260 auto& reserve = it->second;
261 auto entry = reserve.back();
262 reserve.pop_back();
263 return entry;
264 }
265
266 void ReserveBuffer(Buffer entry) {
267 buffer_reserve[entry->GetCacheAddr()].push_back(std::move(entry));
268 }
269
270 void AlignBuffer(std::size_t alignment) {
271 // Align the offset, not the mapped pointer
272 const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
273 buffer_ptr += offset_aligned - buffer_offset;
274 buffer_offset = offset_aligned;
275 }
276
277 std::vector<BufferStorageType>& MarkedForDestruction() {
278 return marked_for_destruction_ring_buffer[marked_for_destruction_index];
279 }
280
281 Core::System& system;
282
283 std::unique_ptr<StreamBuffer> stream_buffer;
284 BufferType stream_buffer_handle{};
285
286 bool invalidated = false;
287
288 u8* buffer_ptr = nullptr;
289 u64 buffer_offset = 0;
290 u64 buffer_offset_base = 0;
291
292 std::size_t marked_for_destruction_index = 0;
293 std::array<std::vector<BufferStorageType>, 4> marked_for_destruction_ring_buffer;
294
295 std::unordered_set<CacheAddr> internalized_entries;
296 std::unordered_map<CacheAddr, std::vector<Buffer>> buffer_reserve;
297};
298
299} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
new file mode 100644
index 000000000..4b9193182
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -0,0 +1,76 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <unordered_set>
8#include <utility>
9
10#include "common/alignment.h"
11#include "common/common_types.h"
12#include "video_core/gpu.h"
13
14namespace VideoCommon {
15
16class BufferBlock {
17public:
18 bool Overlaps(const CacheAddr start, const CacheAddr end) const {
19 return (cache_addr < end) && (cache_addr_end > start);
20 }
21
22 bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const {
23 return cache_addr <= other_start && other_end <= cache_addr_end;
24 }
25
26 u8* GetWritableHostPtr() const {
27 return FromCacheAddr(cache_addr);
28 }
29
30 u8* GetWritableHostPtr(std::size_t offset) const {
31 return FromCacheAddr(cache_addr + offset);
32 }
33
34 std::size_t GetOffset(const CacheAddr in_addr) {
35 return static_cast<std::size_t>(in_addr - cache_addr);
36 }
37
38 CacheAddr GetCacheAddr() const {
39 return cache_addr;
40 }
41
42 CacheAddr GetCacheAddrEnd() const {
43 return cache_addr_end;
44 }
45
46 void SetCacheAddr(const CacheAddr new_addr) {
47 cache_addr = new_addr;
48 cache_addr_end = new_addr + size;
49 }
50
51 std::size_t GetSize() const {
52 return size;
53 }
54
55 void SetEpoch(u64 new_epoch) {
56 epoch = new_epoch;
57 }
58
59 u64 GetEpoch() {
60 return epoch;
61 }
62
63protected:
64 explicit BufferBlock(CacheAddr cache_addr, const std::size_t size) : size{size} {
65 SetCacheAddr(cache_addr);
66 }
67 ~BufferBlock() = default;
68
69private:
70 CacheAddr cache_addr{};
71 CacheAddr cache_addr_end{};
72 std::size_t size{};
73 u64 epoch{};
74};
75
76} // namespace VideoCommon
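A BufferBlock is essentially a half-open interval [cache_addr, cache_addr + size) over the cache address space. A standalone restatement of its interval math, using hypothetical names rather than the class itself (whose constructor is protected and which depends on CacheAddr from video_core):

#include <cassert>
#include <cstddef>
#include <cstdint>

using CacheAddr = std::uintptr_t; // stand-in for the alias used by video_core

// Hypothetical restatement of BufferBlock's Overlaps/IsInside/GetOffset.
struct Interval {
    CacheAddr addr;
    std::size_t size;

    CacheAddr End() const { return addr + size; }

    // True if [start, end) shares at least one byte with this block.
    bool Overlaps(CacheAddr start, CacheAddr end) const {
        return addr < end && End() > start;
    }

    // True if [start, end) is fully contained in this block.
    bool IsInside(CacheAddr start, CacheAddr end) const {
        return addr <= start && end <= End();
    }

    std::size_t GetOffset(CacheAddr in_addr) const {
        return static_cast<std::size_t>(in_addr - addr);
    }
};

int main() {
    const Interval block{0x1000, 0x800}; // covers [0x1000, 0x1800)
    assert(block.Overlaps(0x17ff, 0x2000));  // the last byte of the block overlaps
    assert(!block.Overlaps(0x1800, 0x2000)); // the end address is exclusive
    assert(block.IsInside(0x1200, 0x1400));
    assert(block.GetOffset(0x1200) == 0x200);
    return 0;
}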
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
new file mode 100644
index 000000000..2442ddfd6
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -0,0 +1,447 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <memory>
9#include <mutex>
10#include <unordered_map>
11#include <unordered_set>
12#include <utility>
13#include <vector>
14
15#include "common/alignment.h"
16#include "common/common_types.h"
17#include "core/core.h"
18#include "video_core/buffer_cache/buffer_block.h"
19#include "video_core/buffer_cache/map_interval.h"
20#include "video_core/memory_manager.h"
21#include "video_core/rasterizer_interface.h"
22
23namespace VideoCommon {
24
25using MapInterval = std::shared_ptr<MapIntervalBase>;
26
27template <typename TBuffer, typename TBufferType, typename StreamBuffer>
28class BufferCache {
29public:
30 using BufferInfo = std::pair<const TBufferType*, u64>;
31
32 BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
33 bool is_written = false) {
34 std::lock_guard lock{mutex};
35
36 auto& memory_manager = system.GPU().MemoryManager();
37 const auto host_ptr = memory_manager.GetPointer(gpu_addr);
38 if (!host_ptr) {
39 return {GetEmptyBuffer(size), 0};
40 }
41 const auto cache_addr = ToCacheAddr(host_ptr);
42
43 // Cache management is a big overhead, so only cache entries with a given size.
44 // TODO: Figure out which size is the best for given games.
45 constexpr std::size_t max_stream_size = 0x800;
46 if (size < max_stream_size) {
47 if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) {
48 return StreamBufferUpload(host_ptr, size, alignment);
49 }
50 }
51
52 auto block = GetBlock(cache_addr, size);
53 auto map = MapAddress(block, gpu_addr, cache_addr, size);
54 if (is_written) {
55 map->MarkAsModified(true, GetModifiedTicks());
56 if (!map->IsWritten()) {
57 map->MarkAsWritten(true);
58 MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
59 }
60 } else {
61 if (map->IsWritten()) {
62 WriteBarrier();
63 }
64 }
65
66 const u64 offset = static_cast<u64>(block->GetOffset(cache_addr));
67
68 return {ToHandle(block), offset};
69 }
70
71 /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
72 BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
73 std::size_t alignment = 4) {
74 std::lock_guard lock{mutex};
75 return StreamBufferUpload(raw_pointer, size, alignment);
76 }
77
78 void Map(std::size_t max_size) {
79 std::lock_guard lock{mutex};
80
81 std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
82 buffer_offset = buffer_offset_base;
83 }
84
85 /// Finishes the upload stream, returns true on bindings invalidation.
86 bool Unmap() {
87 std::lock_guard lock{mutex};
88
89 stream_buffer->Unmap(buffer_offset - buffer_offset_base);
90 return std::exchange(invalidated, false);
91 }
92
93 void TickFrame() {
94 ++epoch;
95 while (!pending_destruction.empty()) {
96 if (pending_destruction.front()->GetEpoch() + 1 > epoch) {
97 break;
98 }
99 pending_destruction.pop_front();
100 }
101 }
102
103 /// Write any cached resources overlapping the specified region back to memory
104 void FlushRegion(CacheAddr addr, std::size_t size) {
105 std::lock_guard lock{mutex};
106
107 std::vector<MapInterval> objects = GetMapsInRange(addr, size);
108 std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) {
109 return a->GetModificationTick() < b->GetModificationTick();
110 });
111 for (auto& object : objects) {
112 if (object->IsModified() && object->IsRegistered()) {
113 FlushMap(object);
114 }
115 }
116 }
117
118 /// Mark the specified region as being invalidated
119 void InvalidateRegion(CacheAddr addr, u64 size) {
120 std::lock_guard lock{mutex};
121
122 std::vector<MapInterval> objects = GetMapsInRange(addr, size);
123 for (auto& object : objects) {
124 if (object->IsRegistered()) {
125 Unregister(object);
126 }
127 }
128 }
129
130 virtual const TBufferType* GetEmptyBuffer(std::size_t size) = 0;
131
132protected:
133 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
134 std::unique_ptr<StreamBuffer> stream_buffer)
135 : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
136 stream_buffer_handle{this->stream_buffer->GetHandle()} {}
137
138 ~BufferCache() = default;
139
140 virtual const TBufferType* ToHandle(const TBuffer& storage) = 0;
141
142 virtual void WriteBarrier() = 0;
143
144 virtual TBuffer CreateBlock(CacheAddr cache_addr, std::size_t size) = 0;
145
146 virtual void UploadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size,
147 const u8* data) = 0;
148
149 virtual void DownloadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size,
150 u8* data) = 0;
151
152 virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset,
153 std::size_t dst_offset, std::size_t size) = 0;
154
155 /// Register an object into the cache
156 void Register(const MapInterval& new_map, bool inherit_written = false) {
157 const CacheAddr cache_ptr = new_map->GetStart();
158 const std::optional<VAddr> cpu_addr =
159 system.GPU().MemoryManager().GpuToCpuAddress(new_map->GetGpuAddress());
160 if (!cache_ptr || !cpu_addr) {
161 LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
162 new_map->GetGpuAddress());
163 return;
164 }
165 const std::size_t size = new_map->GetEnd() - new_map->GetStart();
166 new_map->SetCpuAddress(*cpu_addr);
167 new_map->MarkAsRegistered(true);
168 const IntervalType interval{new_map->GetStart(), new_map->GetEnd()};
169 mapped_addresses.insert({interval, new_map});
170 rasterizer.UpdatePagesCachedCount(*cpu_addr, size, 1);
171 if (inherit_written) {
172 MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1);
173 new_map->MarkAsWritten(true);
174 }
175 }
176
177 /// Unregisters an object from the cache
178 void Unregister(MapInterval& map) {
179 const std::size_t size = map->GetEnd() - map->GetStart();
180 rasterizer.UpdatePagesCachedCount(map->GetCpuAddress(), size, -1);
181 map->MarkAsRegistered(false);
182 if (map->IsWritten()) {
183 UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
184 }
185 const IntervalType delete_interval{map->GetStart(), map->GetEnd()};
186 mapped_addresses.erase(delete_interval);
187 }
188
189private:
190 MapInterval CreateMap(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr) {
191 return std::make_shared<MapIntervalBase>(start, end, gpu_addr);
192 }
193
194 MapInterval MapAddress(const TBuffer& block, const GPUVAddr gpu_addr,
195 const CacheAddr cache_addr, const std::size_t size) {
196
197 std::vector<MapInterval> overlaps = GetMapsInRange(cache_addr, size);
198 if (overlaps.empty()) {
199 const CacheAddr cache_addr_end = cache_addr + size;
200 MapInterval new_map = CreateMap(cache_addr, cache_addr_end, gpu_addr);
201 u8* host_ptr = FromCacheAddr(cache_addr);
202 UploadBlockData(block, block->GetOffset(cache_addr), size, host_ptr);
203 Register(new_map);
204 return new_map;
205 }
206
207 const CacheAddr cache_addr_end = cache_addr + size;
208 if (overlaps.size() == 1) {
209 MapInterval& current_map = overlaps[0];
210 if (current_map->IsInside(cache_addr, cache_addr_end)) {
211 return current_map;
212 }
213 }
214 CacheAddr new_start = cache_addr;
215 CacheAddr new_end = cache_addr_end;
216 bool write_inheritance = false;
217 bool modified_inheritance = false;
218 // Calculate new buffer parameters
219 for (auto& overlap : overlaps) {
220 new_start = std::min(overlap->GetStart(), new_start);
221 new_end = std::max(overlap->GetEnd(), new_end);
222 write_inheritance |= overlap->IsWritten();
223 modified_inheritance |= overlap->IsModified();
224 }
225 GPUVAddr new_gpu_addr = gpu_addr + new_start - cache_addr;
226 for (auto& overlap : overlaps) {
227 Unregister(overlap);
228 }
229 UpdateBlock(block, new_start, new_end, overlaps);
230 MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr);
231 if (modified_inheritance) {
232 new_map->MarkAsModified(true, GetModifiedTicks());
233 }
234 Register(new_map, write_inheritance);
235 return new_map;
236 }
237
238 void UpdateBlock(const TBuffer& block, CacheAddr start, CacheAddr end,
239 std::vector<MapInterval>& overlaps) {
240 const IntervalType base_interval{start, end};
241 IntervalSet interval_set{};
242 interval_set.add(base_interval);
243 for (auto& overlap : overlaps) {
244 const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()};
245 interval_set.subtract(subtract);
246 }
247 for (auto& interval : interval_set) {
248 std::size_t size = interval.upper() - interval.lower();
249 if (size > 0) {
250 u8* host_ptr = FromCacheAddr(interval.lower());
251 UploadBlockData(block, block->GetOffset(interval.lower()), size, host_ptr);
252 }
253 }
254 }
255
256 std::vector<MapInterval> GetMapsInRange(CacheAddr addr, std::size_t size) {
257 if (size == 0) {
258 return {};
259 }
260
261 std::vector<MapInterval> objects{};
262 const IntervalType interval{addr, addr + size};
263 for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) {
264 objects.push_back(pair.second);
265 }
266
267 return objects;
268 }
269
270 /// Returns a ticks counter used for tracking when cached objects were last modified
271 u64 GetModifiedTicks() {
272 return ++modified_ticks;
273 }
274
275 void FlushMap(MapInterval map) {
276 std::size_t size = map->GetEnd() - map->GetStart();
277 TBuffer block = blocks[map->GetStart() >> block_page_bits];
278 u8* host_ptr = FromCacheAddr(map->GetStart());
279 DownloadBlockData(block, block->GetOffset(map->GetStart()), size, host_ptr);
280 map->MarkAsModified(false, 0);
281 }
282
283 BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
284 std::size_t alignment) {
285 AlignBuffer(alignment);
286 const std::size_t uploaded_offset = buffer_offset;
287 std::memcpy(buffer_ptr, raw_pointer, size);
288
289 buffer_ptr += size;
290 buffer_offset += size;
291 return {&stream_buffer_handle, uploaded_offset};
292 }
293
294 void AlignBuffer(std::size_t alignment) {
295 // Align the offset, not the mapped pointer
296 const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
297 buffer_ptr += offset_aligned - buffer_offset;
298 buffer_offset = offset_aligned;
299 }
300
301 TBuffer EnlargeBlock(TBuffer buffer) {
302 const std::size_t old_size = buffer->GetSize();
303 const std::size_t new_size = old_size + block_page_size;
304 const CacheAddr cache_addr = buffer->GetCacheAddr();
305 TBuffer new_buffer = CreateBlock(cache_addr, new_size);
306 CopyBlock(buffer, new_buffer, 0, 0, old_size);
307 buffer->SetEpoch(epoch);
308 pending_destruction.push_back(buffer);
309 const CacheAddr cache_addr_end = cache_addr + new_size - 1;
310 u64 page_start = cache_addr >> block_page_bits;
311 const u64 page_end = cache_addr_end >> block_page_bits;
312 while (page_start <= page_end) {
313 blocks[page_start] = new_buffer;
314 ++page_start;
315 }
316 return new_buffer;
317 }
318
319 TBuffer MergeBlocks(TBuffer first, TBuffer second) {
320 const std::size_t size_1 = first->GetSize();
321 const std::size_t size_2 = second->GetSize();
322 const CacheAddr first_addr = first->GetCacheAddr();
323 const CacheAddr second_addr = second->GetCacheAddr();
324 const CacheAddr new_addr = std::min(first_addr, second_addr);
325 const std::size_t new_size = size_1 + size_2;
326 TBuffer new_buffer = CreateBlock(new_addr, new_size);
327 CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1);
328 CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2);
329 first->SetEpoch(epoch);
330 second->SetEpoch(epoch);
331 pending_destruction.push_back(first);
332 pending_destruction.push_back(second);
333 const CacheAddr cache_addr_end = new_addr + new_size - 1;
334 u64 page_start = new_addr >> block_page_bits;
335 const u64 page_end = cache_addr_end >> block_page_bits;
336 while (page_start <= page_end) {
337 blocks[page_start] = new_buffer;
338 ++page_start;
339 }
340 return new_buffer;
341 }
342
343 TBuffer GetBlock(const CacheAddr cache_addr, const std::size_t size) {
344 TBuffer found{};
345 const CacheAddr cache_addr_end = cache_addr + size - 1;
346 u64 page_start = cache_addr >> block_page_bits;
347 const u64 page_end = cache_addr_end >> block_page_bits;
348 while (page_start <= page_end) {
349 auto it = blocks.find(page_start);
350 if (it == blocks.end()) {
351 if (found) {
352 found = EnlargeBlock(found);
353 } else {
354 const CacheAddr start_addr = (page_start << block_page_bits);
355 found = CreateBlock(start_addr, block_page_size);
356 blocks[page_start] = found;
357 }
358 } else {
359 if (found) {
360 if (found == it->second) {
361 ++page_start;
362 continue;
363 }
364 found = MergeBlocks(found, it->second);
365 } else {
366 found = it->second;
367 }
368 }
369 ++page_start;
370 }
371 return found;
372 }
373
374 void MarkRegionAsWritten(const CacheAddr start, const CacheAddr end) {
375 u64 page_start = start >> write_page_bit;
376 const u64 page_end = end >> write_page_bit;
377 while (page_start <= page_end) {
378 auto it = written_pages.find(page_start);
379 if (it != written_pages.end()) {
380 it->second = it->second + 1;
381 } else {
382 written_pages[page_start] = 1;
383 }
384 page_start++;
385 }
386 }
387
388 void UnmarkRegionAsWritten(const CacheAddr start, const CacheAddr end) {
389 u64 page_start = start >> write_page_bit;
390 const u64 page_end = end >> write_page_bit;
391 while (page_start <= page_end) {
392 auto it = written_pages.find(page_start);
393 if (it != written_pages.end()) {
394 if (it->second > 1) {
395 it->second = it->second - 1;
396 } else {
397 written_pages.erase(it);
398 }
399 }
400 page_start++;
401 }
402 }
403
404 bool IsRegionWritten(const CacheAddr start, const CacheAddr end) const {
405 u64 page_start = start >> write_page_bit;
406 const u64 page_end = end >> write_page_bit;
407 while (page_start <= page_end) {
408 if (written_pages.count(page_start) > 0) {
409 return true;
410 }
411 page_start++;
412 }
413 return false;
414 }
415
416 VideoCore::RasterizerInterface& rasterizer;
417 Core::System& system;
418 std::unique_ptr<StreamBuffer> stream_buffer;
419
420 TBufferType stream_buffer_handle{};
421
422 bool invalidated = false;
423
424 u8* buffer_ptr = nullptr;
425 u64 buffer_offset = 0;
426 u64 buffer_offset_base = 0;
427
428 using IntervalSet = boost::icl::interval_set<CacheAddr>;
429 using IntervalCache = boost::icl::interval_map<CacheAddr, MapInterval>;
430 using IntervalType = typename IntervalCache::interval_type;
431 IntervalCache mapped_addresses{};
432
433 static constexpr u64 write_page_bit{11};
434 std::unordered_map<u64, u32> written_pages{};
435
436 static constexpr u64 block_page_bits{21};
437 static constexpr u64 block_page_size{1 << block_page_bits};
438 std::unordered_map<u64, TBuffer> blocks{};
439
440 std::list<TBuffer> pending_destruction{};
441 u64 epoch{};
442 u64 modified_ticks{};
443
444 std::recursive_mutex mutex;
445};
446
447} // namespace VideoCommon
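
The cache above addresses backing blocks in 2 MiB pages (block_page_bits = 21) and tracks CPU-written regions in 2 KiB pages (write_page_bit = 11). A minimal standalone sketch of the page-range arithmetic shared by GetBlock, MarkRegionAsWritten and IsRegionWritten, assuming CacheAddr is the pointer-sized unsigned integer implied by ToCacheAddr/FromCacheAddr:

    #include <cstddef>
    #include <cstdint>
    #include <utility>

    using CacheAddr = std::uintptr_t; // assumption: the same alias ToCacheAddr/FromCacheAddr cast to

    constexpr std::uint64_t block_page_bits = 21; // 2 MiB buffer blocks
    constexpr std::uint64_t write_page_bit = 11;  // 2 KiB write-tracking pages

    // First and last block page touched by the byte range [addr, addr + size).
    constexpr std::pair<std::uint64_t, std::uint64_t> BlockPageRange(CacheAddr addr, std::size_t size) {
        return {addr >> block_page_bits, (addr + size - 1) >> block_page_bits};
    }

For example, a 3 MiB map that starts on a block boundary spans two block pages, which is why GetBlock either enlarges an existing block or merges neighbouring ones until a single TBuffer covers the whole range.
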
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
new file mode 100644
index 000000000..3a104d5cd
--- /dev/null
+++ b/src/video_core/buffer_cache/map_interval.h
@@ -0,0 +1,89 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include "common/common_types.h"
8#include "video_core/gpu.h"
9
10namespace VideoCommon {
11
12class MapIntervalBase {
13public:
14 MapIntervalBase(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr)
15 : start{start}, end{end}, gpu_addr{gpu_addr} {}
16
17 void SetCpuAddress(VAddr new_cpu_addr) {
18 cpu_addr = new_cpu_addr;
19 }
20
21 VAddr GetCpuAddress() const {
22 return cpu_addr;
23 }
24
25 GPUVAddr GetGpuAddress() const {
26 return gpu_addr;
27 }
28
29 bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const {
30 return (start <= other_start && other_end <= end);
31 }
32
33 bool operator==(const MapIntervalBase& rhs) const {
34 return std::tie(start, end) == std::tie(rhs.start, rhs.end);
35 }
36
37 bool operator!=(const MapIntervalBase& rhs) const {
38 return !operator==(rhs);
39 }
40
41 void MarkAsRegistered(const bool registered) {
42 is_registered = registered;
43 }
44
45 bool IsRegistered() const {
46 return is_registered;
47 }
48
49 CacheAddr GetStart() const {
50 return start;
51 }
52
53 CacheAddr GetEnd() const {
54 return end;
55 }
56
57 void MarkAsModified(const bool is_modified_, const u64 tick) {
58 is_modified = is_modified_;
59 ticks = tick;
60 }
61
62 bool IsModified() const {
63 return is_modified;
64 }
65
66 u64 GetModificationTick() const {
67 return ticks;
68 }
69
70 void MarkAsWritten(const bool is_written_) {
71 is_written = is_written_;
72 }
73
74 bool IsWritten() const {
75 return is_written;
76 }
77
78private:
79 CacheAddr start;
80 CacheAddr end;
81 GPUVAddr gpu_addr;
82 VAddr cpu_addr{};
83 bool is_written{};
84 bool is_modified{};
85 bool is_registered{};
86 u64 ticks{};
87};
88
89} // namespace VideoCommon
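
MapIntervalBase is a plain value type; the buffer cache stores shared handles to these objects and relies on the inclusive IsInside check when resolving overlaps. A small usage sketch under that assumption (the addresses are hypothetical):

    #include <memory>

    #include "video_core/buffer_cache/map_interval.h"

    using MapInterval = std::shared_ptr<VideoCommon::MapIntervalBase>; // assumption: handle type used by the cache

    bool Example() {
        const MapInterval map = std::make_shared<VideoCommon::MapIntervalBase>(
            /*start=*/0x1000, /*end=*/0x2000, /*gpu_addr=*/0x8000);
        map->SetCpuAddress(0x4000);
        map->MarkAsModified(true, /*tick=*/1);
        // IsInside is inclusive on both endpoints, so [0x1400, 0x1800] is covered.
        return map->IsInside(0x1400, 0x1800);
    }

The modified flag set here is what FlushMap later clears after downloading the block, while the written flag pairs with the write-page reference counting shown above.
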
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index bd036cbe8..0094fd715 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -31,6 +31,7 @@ void DmaPusher::DispatchCalls() {
31 break; 31 break;
32 } 32 }
33 } 33 }
34 gpu.FlushCommands();
34} 35}
35 36
36bool DmaPusher::Step() { 37bool DmaPusher::Step() {
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 0ee228e28..98a8b5337 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -10,8 +10,7 @@
10 10
11namespace Tegra::Engines { 11namespace Tegra::Engines {
12 12
13Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) 13Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
14 : rasterizer{rasterizer}, memory_manager{memory_manager} {}
15 14
16void Fermi2D::CallMethod(const GPU::MethodCall& method_call) { 15void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
17 ASSERT_MSG(method_call.method < Regs::NUM_REGS, 16 ASSERT_MSG(method_call.method < Regs::NUM_REGS,
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 05421d185..0901cf2fa 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -33,7 +33,7 @@ namespace Tegra::Engines {
33 33
34class Fermi2D final { 34class Fermi2D final {
35public: 35public:
36 explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager); 36 explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer);
37 ~Fermi2D() = default; 37 ~Fermi2D() = default;
38 38
39 /// Write the value to the register identified by method. 39 /// Write the value to the register identified by method.
@@ -145,7 +145,6 @@ public:
145 145
146private: 146private:
147 VideoCore::RasterizerInterface& rasterizer; 147 VideoCore::RasterizerInterface& rasterizer;
148 MemoryManager& memory_manager;
149 148
150 /// Performs the copy from the source surface to the destination surface as configured in the 149 /// Performs the copy from the source surface to the destination surface as configured in the
151 /// registers. 150 /// registers.
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 44279de00..fa4a7c5c1 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -15,7 +15,7 @@
15namespace Tegra::Engines { 15namespace Tegra::Engines {
16 16
17KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager) 17KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager)
18 : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {} 18 : system{system}, upload_state{memory_manager, regs.upload} {}
19 19
20KeplerMemory::~KeplerMemory() = default; 20KeplerMemory::~KeplerMemory() = default;
21 21
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index f3bc675a9..e0e25c321 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -65,7 +65,6 @@ public:
65 65
66private: 66private:
67 Core::System& system; 67 Core::System& system;
68 MemoryManager& memory_manager;
69 Upload::State upload_state; 68 Upload::State upload_state;
70}; 69};
71 70
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 125c53360..f5158d219 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -249,16 +249,10 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
249 executing_macro = 0; 249 executing_macro = 0;
250 250
251 // Lookup the macro offset 251 // Lookup the macro offset
252 const u32 entry{(method - MacroRegistersStart) >> 1}; 252 const u32 entry = ((method - MacroRegistersStart) >> 1) % macro_positions.size();
253 const auto& search{macro_offsets.find(entry)};
254 if (search == macro_offsets.end()) {
255 LOG_CRITICAL(HW_GPU, "macro not found for method 0x{:X}!", method);
256 UNREACHABLE();
257 return;
258 }
259 253
260 // Execute the current macro. 254 // Execute the current macro.
261 macro_interpreter.Execute(search->second, std::move(parameters)); 255 macro_interpreter.Execute(macro_positions[entry], std::move(parameters));
262} 256}
263 257
264void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { 258void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
@@ -421,7 +415,7 @@ void Maxwell3D::ProcessMacroUpload(u32 data) {
421} 415}
422 416
423void Maxwell3D::ProcessMacroBind(u32 data) { 417void Maxwell3D::ProcessMacroBind(u32 data) {
424 macro_offsets[regs.macros.entry] = data; 418 macro_positions[regs.macros.entry++] = data;
425} 419}
426 420
427void Maxwell3D::ProcessQueryGet() { 421void Maxwell3D::ProcessQueryGet() {
@@ -524,7 +518,7 @@ void Maxwell3D::ProcessQueryCondition() {
524void Maxwell3D::ProcessSyncPoint() { 518void Maxwell3D::ProcessSyncPoint() {
525 const u32 sync_point = regs.sync_info.sync_point.Value(); 519 const u32 sync_point = regs.sync_info.sync_point.Value();
526 const u32 increment = regs.sync_info.increment.Value(); 520 const u32 increment = regs.sync_info.increment.Value();
527 const u32 cache_flush = regs.sync_info.unknown.Value(); 521 [[maybe_unused]] const u32 cache_flush = regs.sync_info.unknown.Value();
528 if (increment) { 522 if (increment) {
529 system.GPU().IncrementSyncPoint(sync_point); 523 system.GPU().IncrementSyncPoint(sync_point);
530 } 524 }
@@ -626,10 +620,10 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
626 Texture::TICEntry tic_entry; 620 Texture::TICEntry tic_entry;
627 memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); 621 memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
628 622
629 const auto r_type{tic_entry.r_type.Value()}; 623 [[maybe_unused]] const auto r_type{tic_entry.r_type.Value()};
630 const auto g_type{tic_entry.g_type.Value()}; 624 [[maybe_unused]] const auto g_type{tic_entry.g_type.Value()};
631 const auto b_type{tic_entry.b_type.Value()}; 625 [[maybe_unused]] const auto b_type{tic_entry.b_type.Value()};
632 const auto a_type{tic_entry.a_type.Value()}; 626 [[maybe_unused]] const auto a_type{tic_entry.a_type.Value()};
633 627
634 // TODO(Subv): Different data types for separate components are not supported 628 // TODO(Subv): Different data types for separate components are not supported
635 DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type); 629 DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 1ee982b76..0184342a0 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1270,7 +1270,7 @@ private:
1270 MemoryManager& memory_manager; 1270 MemoryManager& memory_manager;
1271 1271
1272 /// Start offsets of each macro in macro_memory 1272 /// Start offsets of each macro in macro_memory
1273 std::unordered_map<u32, u32> macro_offsets; 1273 std::array<u32, 0x80> macro_positions = {};
1274 1274
1275 /// Memory for macro code 1275 /// Memory for macro code
1276 MacroMemory macro_memory; 1276 MacroMemory macro_memory;
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index a28c04473..ad8453c5f 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -5,18 +5,17 @@
5#include "common/assert.h" 5#include "common/assert.h"
6#include "common/logging/log.h" 6#include "common/logging/log.h"
7#include "core/core.h" 7#include "core/core.h"
8#include "core/settings.h"
8#include "video_core/engines/maxwell_3d.h" 9#include "video_core/engines/maxwell_3d.h"
9#include "video_core/engines/maxwell_dma.h" 10#include "video_core/engines/maxwell_dma.h"
10#include "video_core/memory_manager.h" 11#include "video_core/memory_manager.h"
11#include "video_core/rasterizer_interface.h"
12#include "video_core/renderer_base.h" 12#include "video_core/renderer_base.h"
13#include "video_core/textures/decoders.h" 13#include "video_core/textures/decoders.h"
14 14
15namespace Tegra::Engines { 15namespace Tegra::Engines {
16 16
17MaxwellDMA::MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer, 17MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager)
18 MemoryManager& memory_manager) 18 : system{system}, memory_manager{memory_manager} {}
19 : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
20 19
21void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) { 20void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
22 ASSERT_MSG(method_call.method < Regs::NUM_REGS, 21 ASSERT_MSG(method_call.method < Regs::NUM_REGS,
@@ -84,13 +83,17 @@ void MaxwellDMA::HandleCopy() {
84 ASSERT(regs.exec.enable_2d == 1); 83 ASSERT(regs.exec.enable_2d == 1);
85 84
86 if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { 85 if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
87 ASSERT(regs.src_params.size_z == 1); 86 ASSERT(regs.src_params.BlockDepth() == 0);
88 // If the input is tiled and the output is linear, deswizzle the input and copy it over. 87 // If the input is tiled and the output is linear, deswizzle the input and copy it over.
89 const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x; 88 const u32 bytes_per_pixel = regs.dst_pitch / regs.x_count;
90 const std::size_t src_size = Texture::CalculateSize( 89 const std::size_t src_size = Texture::CalculateSize(
91 true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 90 true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
92 regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); 91 regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
93 92
93 const std::size_t src_layer_size = Texture::CalculateSize(
94 true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 1,
95 regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
96
94 const std::size_t dst_size = regs.dst_pitch * regs.y_count; 97 const std::size_t dst_size = regs.dst_pitch * regs.y_count;
95 98
96 if (read_buffer.size() < src_size) { 99 if (read_buffer.size() < src_size) {
@@ -104,23 +107,23 @@ void MaxwellDMA::HandleCopy() {
104 memory_manager.ReadBlock(source, read_buffer.data(), src_size); 107 memory_manager.ReadBlock(source, read_buffer.data(), src_size);
105 memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); 108 memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
106 109
107 Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch, 110 Texture::UnswizzleSubrect(
108 regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(), 111 regs.x_count, regs.y_count, regs.dst_pitch, regs.src_params.size_x, bytes_per_pixel,
109 write_buffer.data(), regs.src_params.BlockHeight(), 112 read_buffer.data() + src_layer_size * regs.src_params.pos_z, write_buffer.data(),
110 regs.src_params.pos_x, regs.src_params.pos_y); 113 regs.src_params.BlockHeight(), regs.src_params.pos_x, regs.src_params.pos_y);
111 114
112 memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); 115 memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
113 } else { 116 } else {
114 ASSERT(regs.dst_params.BlockDepth() == 0); 117 ASSERT(regs.dst_params.BlockDepth() == 0);
115 118
116 const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count; 119 const u32 bytes_per_pixel = regs.src_pitch / regs.x_count;
117 120
118 const std::size_t dst_size = Texture::CalculateSize( 121 const std::size_t dst_size = Texture::CalculateSize(
119 true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 122 true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
120 regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); 123 regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
121 124
122 const std::size_t dst_layer_size = Texture::CalculateSize( 125 const std::size_t dst_layer_size = Texture::CalculateSize(
123 true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1, 126 true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
124 regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); 127 regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
125 128
126 const std::size_t src_size = regs.src_pitch * regs.y_count; 129 const std::size_t src_size = regs.src_pitch * regs.y_count;
@@ -133,14 +136,19 @@ void MaxwellDMA::HandleCopy() {
133 write_buffer.resize(dst_size); 136 write_buffer.resize(dst_size);
134 } 137 }
135 138
136 memory_manager.ReadBlock(source, read_buffer.data(), src_size); 139 if (Settings::values.use_accurate_gpu_emulation) {
137 memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); 140 memory_manager.ReadBlock(source, read_buffer.data(), src_size);
141 memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
142 } else {
143 memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size);
144 memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size);
145 }
138 146
139 // If the input is linear and the output is tiled, swizzle the input and copy it over. 147 // If the input is linear and the output is tiled, swizzle the input and copy it over.
140 Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, 148 Texture::SwizzleSubrect(
141 src_bytes_per_pixel, 149 regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, bytes_per_pixel,
142 write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, 150 write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, read_buffer.data(),
143 read_buffer.data(), regs.dst_params.BlockHeight()); 151 regs.dst_params.BlockHeight(), regs.dst_params.pos_x, regs.dst_params.pos_y);
144 152
145 memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); 153 memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
146 } 154 }
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 17b015ca7..93808a9bb 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -20,10 +20,6 @@ namespace Tegra {
20class MemoryManager; 20class MemoryManager;
21} 21}
22 22
23namespace VideoCore {
24class RasterizerInterface;
25}
26
27namespace Tegra::Engines { 23namespace Tegra::Engines {
28 24
29/** 25/**
@@ -33,8 +29,7 @@ namespace Tegra::Engines {
33 29
34class MaxwellDMA final { 30class MaxwellDMA final {
35public: 31public:
36 explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer, 32 explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager);
37 MemoryManager& memory_manager);
38 ~MaxwellDMA() = default; 33 ~MaxwellDMA() = default;
39 34
40 /// Write the value to the register identified by method. 35 /// Write the value to the register identified by method.
@@ -180,8 +175,6 @@ public:
180private: 175private:
181 Core::System& system; 176 Core::System& system;
182 177
183 VideoCore::RasterizerInterface& rasterizer;
184
185 MemoryManager& memory_manager; 178 MemoryManager& memory_manager;
186 179
187 std::vector<u8> read_buffer; 180 std::vector<u8> read_buffer;
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 083ee3304..c3678b9ea 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -538,6 +538,12 @@ enum class PhysicalAttributeDirection : u64 {
538 Output = 1, 538 Output = 1,
539}; 539};
540 540
541enum class VoteOperation : u64 {
542 All = 0, // allThreadsNV
543 Any = 1, // anyThreadNV
544 Eq = 2, // allThreadsEqualNV
545};
546
541union Instruction { 547union Instruction {
542 Instruction& operator=(const Instruction& instr) { 548 Instruction& operator=(const Instruction& instr) {
543 value = instr.value; 549 value = instr.value;
@@ -565,6 +571,13 @@ union Instruction {
565 } nop; 571 } nop;
566 572
567 union { 573 union {
574 BitField<48, 2, VoteOperation> operation;
575 BitField<45, 3, u64> dest_pred;
576 BitField<39, 3, u64> value;
577 BitField<42, 1, u64> negate_value;
578 } vote;
579
580 union {
568 BitField<8, 8, Register> gpr; 581 BitField<8, 8, Register> gpr;
569 BitField<20, 24, s64> offset; 582 BitField<20, 24, s64> offset;
570 } gmem; 583 } gmem;
@@ -873,6 +886,7 @@ union Instruction {
873 union { 886 union {
874 BitField<0, 3, u64> pred0; 887 BitField<0, 3, u64> pred0;
875 BitField<3, 3, u64> pred3; 888 BitField<3, 3, u64> pred3;
889 BitField<6, 1, u64> neg_b;
876 BitField<7, 1, u64> abs_a; 890 BitField<7, 1, u64> abs_a;
877 BitField<39, 3, u64> pred39; 891 BitField<39, 3, u64> pred39;
878 BitField<42, 1, u64> neg_pred; 892 BitField<42, 1, u64> neg_pred;
@@ -1006,7 +1020,6 @@ union Instruction {
1006 } iset; 1020 } iset;
1007 1021
1008 union { 1022 union {
1009 BitField<41, 2, u64> selector; // i2i and i2f only
1010 BitField<45, 1, u64> negate_a; 1023 BitField<45, 1, u64> negate_a;
1011 BitField<49, 1, u64> abs_a; 1024 BitField<49, 1, u64> abs_a;
1012 BitField<10, 2, Register::Size> src_size; 1025 BitField<10, 2, Register::Size> src_size;
@@ -1023,8 +1036,6 @@ union Instruction {
1023 } f2i; 1036 } f2i;
1024 1037
1025 union { 1038 union {
1026 BitField<8, 2, Register::Size> src_size;
1027 BitField<10, 2, Register::Size> dst_size;
1028 BitField<39, 4, u64> rounding; 1039 BitField<39, 4, u64> rounding;
1029 // H0, H1 extract for F16 missing 1040 // H0, H1 extract for F16 missing
1030 BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value 1041 BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value
@@ -1034,6 +1045,13 @@ union Instruction {
1034 } 1045 }
1035 } f2f; 1046 } f2f;
1036 1047
1048 union {
1049 BitField<41, 2, u64> selector;
1050 } int_src;
1051
1052 union {
1053 BitField<41, 1, u64> selector;
1054 } float_src;
1037 } conversion; 1055 } conversion;
1038 1056
1039 union { 1057 union {
@@ -1489,6 +1507,7 @@ public:
1489 SYNC, 1507 SYNC,
1490 BRK, 1508 BRK,
1491 DEPBAR, 1509 DEPBAR,
1510 VOTE,
1492 BFE_C, 1511 BFE_C,
1493 BFE_R, 1512 BFE_R,
1494 BFE_IMM, 1513 BFE_IMM,
@@ -1651,6 +1670,7 @@ public:
1651 Hfma2, 1670 Hfma2,
1652 Flow, 1671 Flow,
1653 Synch, 1672 Synch,
1673 Warp,
1654 Memory, 1674 Memory,
1655 Texture, 1675 Texture,
1656 Image, 1676 Image,
@@ -1777,6 +1797,7 @@ private:
1777 INST("111000110100---", Id::BRK, Type::Flow, "BRK"), 1797 INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
1778 INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"), 1798 INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
1779 INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"), 1799 INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
1800 INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),
1780 INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"), 1801 INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
1781 INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"), 1802 INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
1782 INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"), 1803 INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"),
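
The new VoteOperation enum and vote union expose the VOTE fields to the shader decoder. A short sketch of reading them from a raw opcode word, following the same aggregate-initialization pattern the decoder uses (the opcode value itself is hypothetical):

    #include "common/common_types.h"
    #include "video_core/engines/shader_bytecode.h"

    using Tegra::Shader::Instruction;
    using Tegra::Shader::VoteOperation;

    // Extracts the VOTE fields declared above from a raw 64-bit opcode word.
    VoteOperation DescribeVote(u64 raw_opcode) {
        const Instruction instr = {raw_opcode};
        const u64 dest_pred = instr.vote.dest_pred;        // bits 45-47: destination predicate
        const u64 source_pred = instr.vote.value;          // bits 39-41: source predicate
        const bool negate = instr.vote.negate_value != 0;  // bit 42
        (void)dest_pred; (void)source_pred; (void)negate;
        return instr.vote.operation;                       // bits 48-49: All, Any or Eq
    }

The decoder's new Warp type routes these instructions to DecodeWarp, and the decompilers later in this change lower them to the NV ballot/vote intrinsics or to stubs when those extensions are unavailable.
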
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 3006d8059..2c47541cb 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -23,9 +23,9 @@ GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
23 memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer); 23 memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
24 dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); 24 dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
25 maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager); 25 maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
26 fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager); 26 fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer);
27 kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager); 27 kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager);
28 maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager); 28 maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager);
29 kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager); 29 kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);
30} 30}
31 31
@@ -108,6 +108,10 @@ bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
108 return true; 108 return true;
109} 109}
110 110
111void GPU::FlushCommands() {
112 renderer.Rasterizer().FlushCommands();
113}
114
111u32 RenderTargetBytesPerPixel(RenderTargetFormat format) { 115u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
112 ASSERT(format != RenderTargetFormat::NONE); 116 ASSERT(format != RenderTargetFormat::NONE);
113 117
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 1a7f5bdf2..78bc0601a 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -19,6 +19,10 @@ inline CacheAddr ToCacheAddr(const void* host_ptr) {
19 return reinterpret_cast<CacheAddr>(host_ptr); 19 return reinterpret_cast<CacheAddr>(host_ptr);
20} 20}
21 21
22inline u8* FromCacheAddr(CacheAddr cache_addr) {
23 return reinterpret_cast<u8*>(cache_addr);
24}
25
22namespace Core { 26namespace Core {
23class System; 27class System;
24} 28}
@@ -149,6 +153,8 @@ public:
149 /// Calls a GPU method. 153 /// Calls a GPU method.
150 void CallMethod(const MethodCall& method_call); 154 void CallMethod(const MethodCall& method_call);
151 155
156 void FlushCommands();
157
152 /// Returns a reference to the Maxwell3D GPU engine. 158 /// Returns a reference to the Maxwell3D GPU engine.
153 Engines::Maxwell3D& Maxwell3D(); 159 Engines::Maxwell3D& Maxwell3D();
154 160
@@ -274,8 +280,8 @@ private:
274 280
275protected: 281protected:
276 std::unique_ptr<Tegra::DmaPusher> dma_pusher; 282 std::unique_ptr<Tegra::DmaPusher> dma_pusher;
277 VideoCore::RendererBase& renderer;
278 Core::System& system; 283 Core::System& system;
284 VideoCore::RendererBase& renderer;
279 285
280private: 286private:
281 std::unique_ptr<Tegra::MemoryManager> memory_manager; 287 std::unique_ptr<Tegra::MemoryManager> memory_manager;
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 9881df0d5..6b3f2d50a 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -50,6 +50,9 @@ public:
50 /// and invalidated 50 /// and invalidated
51 virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0; 51 virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
52 52
53 /// Notify the rasterizer to send all written commands to the host GPU.
54 virtual void FlushCommands() = 0;
55
53 /// Notify rasterizer that a frame is about to finish 56 /// Notify rasterizer that a frame is about to finish
54 virtual void TickFrame() = 0; 57 virtual void TickFrame() = 0;
55 58
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 2a9b523f5..f8a807c84 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -7,28 +7,41 @@
7#include <glad/glad.h> 7#include <glad/glad.h>
8 8
9#include "common/assert.h" 9#include "common/assert.h"
10#include "common/microprofile.h"
11#include "video_core/rasterizer_interface.h"
10#include "video_core/renderer_opengl/gl_buffer_cache.h" 12#include "video_core/renderer_opengl/gl_buffer_cache.h"
11#include "video_core/renderer_opengl/gl_rasterizer.h" 13#include "video_core/renderer_opengl/gl_rasterizer.h"
12#include "video_core/renderer_opengl/gl_resource_manager.h" 14#include "video_core/renderer_opengl/gl_resource_manager.h"
13 15
14namespace OpenGL { 16namespace OpenGL {
15 17
18MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
19
20CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size)
21 : VideoCommon::BufferBlock{cache_addr, size} {
22 gl_buffer.Create();
23 glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
24}
25
26CachedBufferBlock::~CachedBufferBlock() = default;
27
16OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, 28OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
17 std::size_t stream_size) 29 std::size_t stream_size)
18 : VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer>{ 30 : VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{
19 rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {} 31 rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}
20 32
21OGLBufferCache::~OGLBufferCache() = default; 33OGLBufferCache::~OGLBufferCache() = default;
22 34
23OGLBuffer OGLBufferCache::CreateBuffer(std::size_t size) { 35Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) {
24 OGLBuffer buffer; 36 return std::make_shared<CachedBufferBlock>(cache_addr, size);
25 buffer.Create(); 37}
26 glNamedBufferData(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); 38
27 return buffer; 39void OGLBufferCache::WriteBarrier() {
40 glMemoryBarrier(GL_ALL_BARRIER_BITS);
28} 41}
29 42
30const GLuint* OGLBufferCache::ToHandle(const OGLBuffer& buffer) { 43const GLuint* OGLBufferCache::ToHandle(const Buffer& buffer) {
31 return &buffer.handle; 44 return buffer->GetHandle();
32} 45}
33 46
34const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) { 47const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
@@ -36,23 +49,24 @@ const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
36 return &null_buffer; 49 return &null_buffer;
37} 50}
38 51
39void OGLBufferCache::UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, 52void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
40 const u8* data) { 53 const u8* data) {
41 glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), 54 glNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
42 static_cast<GLsizeiptr>(size), data); 55 static_cast<GLsizeiptr>(size), data);
43} 56}
44 57
45void OGLBufferCache::DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, 58void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
46 std::size_t size, u8* data) { 59 u8* data) {
47 glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), 60 MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
61 glGetNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
48 static_cast<GLsizeiptr>(size), data); 62 static_cast<GLsizeiptr>(size), data);
49} 63}
50 64
51void OGLBufferCache::CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, 65void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
52 std::size_t src_offset, std::size_t dst_offset, 66 std::size_t dst_offset, std::size_t size) {
53 std::size_t size) { 67 glCopyNamedBufferSubData(*src->GetHandle(), *dst->GetHandle(),
54 glCopyNamedBufferSubData(src.handle, dst.handle, static_cast<GLintptr>(src_offset), 68 static_cast<GLintptr>(src_offset), static_cast<GLintptr>(dst_offset),
55 static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); 69 static_cast<GLsizeiptr>(size));
56} 70}
57 71
58} // namespace OpenGL 72} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 8c8ac4038..022e7bfa9 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -7,7 +7,7 @@
7#include <memory> 7#include <memory>
8 8
9#include "common/common_types.h" 9#include "common/common_types.h"
10#include "video_core/buffer_cache.h" 10#include "video_core/buffer_cache/buffer_cache.h"
11#include "video_core/rasterizer_cache.h" 11#include "video_core/rasterizer_cache.h"
12#include "video_core/renderer_opengl/gl_resource_manager.h" 12#include "video_core/renderer_opengl/gl_resource_manager.h"
13#include "video_core/renderer_opengl/gl_stream_buffer.h" 13#include "video_core/renderer_opengl/gl_stream_buffer.h"
@@ -21,7 +21,24 @@ namespace OpenGL {
21class OGLStreamBuffer; 21class OGLStreamBuffer;
22class RasterizerOpenGL; 22class RasterizerOpenGL;
23 23
24class OGLBufferCache final : public VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer> { 24class CachedBufferBlock;
25
26using Buffer = std::shared_ptr<CachedBufferBlock>;
27
28class CachedBufferBlock : public VideoCommon::BufferBlock {
29public:
30 explicit CachedBufferBlock(CacheAddr cache_addr, const std::size_t size);
31 ~CachedBufferBlock();
32
33 const GLuint* GetHandle() const {
34 return &gl_buffer.handle;
35 }
36
37private:
38 OGLBuffer gl_buffer{};
39};
40
41class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> {
25public: 42public:
26 explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, 43 explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
27 std::size_t stream_size); 44 std::size_t stream_size);
@@ -30,18 +47,20 @@ public:
30 const GLuint* GetEmptyBuffer(std::size_t) override; 47 const GLuint* GetEmptyBuffer(std::size_t) override;
31 48
32protected: 49protected:
33 OGLBuffer CreateBuffer(std::size_t size) override; 50 Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override;
51
52 void WriteBarrier() override;
34 53
35 const GLuint* ToHandle(const OGLBuffer& buffer) override; 54 const GLuint* ToHandle(const Buffer& buffer) override;
36 55
37 void UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, 56 void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
38 const u8* data) override; 57 const u8* data) override;
39 58
40 void DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, 59 void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
41 u8* data) override; 60 u8* data) override;
42 61
43 void CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, std::size_t src_offset, 62 void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
44 std::size_t dst_offset, std::size_t size) override; 63 std::size_t dst_offset, std::size_t size) override;
45}; 64};
46 65
47} // namespace OpenGL 66} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 85424a4c9..03d434b28 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -27,6 +27,8 @@ Device::Device() {
27 shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); 27 shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
28 max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); 28 max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
29 max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); 29 max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
30 has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group &&
31 GLAD_GL_NV_shader_thread_shuffle;
30 has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; 32 has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
31 has_variable_aoffi = TestVariableAoffi(); 33 has_variable_aoffi = TestVariableAoffi();
32 has_component_indexing_bug = TestComponentIndexingBug(); 34 has_component_indexing_bug = TestComponentIndexingBug();
@@ -36,6 +38,7 @@ Device::Device(std::nullptr_t) {
36 uniform_buffer_alignment = 0; 38 uniform_buffer_alignment = 0;
37 max_vertex_attributes = 16; 39 max_vertex_attributes = 16;
38 max_varyings = 15; 40 max_varyings = 15;
41 has_warp_intrinsics = true;
39 has_vertex_viewport_layer = true; 42 has_vertex_viewport_layer = true;
40 has_variable_aoffi = true; 43 has_variable_aoffi = true;
41 has_component_indexing_bug = false; 44 has_component_indexing_bug = false;
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index dc883722d..3ef7c6dd8 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -30,6 +30,10 @@ public:
30 return max_varyings; 30 return max_varyings;
31 } 31 }
32 32
33 bool HasWarpIntrinsics() const {
34 return has_warp_intrinsics;
35 }
36
33 bool HasVertexViewportLayer() const { 37 bool HasVertexViewportLayer() const {
34 return has_vertex_viewport_layer; 38 return has_vertex_viewport_layer;
35 } 39 }
@@ -50,6 +54,7 @@ private:
50 std::size_t shader_storage_alignment{}; 54 std::size_t shader_storage_alignment{};
51 u32 max_vertex_attributes{}; 55 u32 max_vertex_attributes{};
52 u32 max_varyings{}; 56 u32 max_varyings{};
57 bool has_warp_intrinsics{};
53 bool has_vertex_viewport_layer{}; 58 bool has_vertex_viewport_layer{};
54 bool has_variable_aoffi{}; 59 bool has_variable_aoffi{};
55 bool has_component_indexing_bug{}; 60 bool has_component_indexing_bug{};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index c28ae795c..bb09ecd52 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -708,8 +708,6 @@ void RasterizerOpenGL::DrawArrays() {
708 return; 708 return;
709 } 709 }
710 710
711 const auto& regs = gpu.regs;
712
713 SyncColorMask(); 711 SyncColorMask();
714 SyncFragmentColorClampState(); 712 SyncFragmentColorClampState();
715 SyncMultiSampleState(); 713 SyncMultiSampleState();
@@ -863,6 +861,10 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
863 InvalidateRegion(addr, size); 861 InvalidateRegion(addr, size);
864} 862}
865 863
864void RasterizerOpenGL::FlushCommands() {
865 glFlush();
866}
867
866void RasterizerOpenGL::TickFrame() { 868void RasterizerOpenGL::TickFrame() {
867 buffer_cache.TickFrame(); 869 buffer_cache.TickFrame();
868} 870}
@@ -976,7 +978,7 @@ void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entr
976 GPUVAddr gpu_addr, std::size_t size) { 978 GPUVAddr gpu_addr, std::size_t size) {
977 const auto alignment{device.GetShaderStorageBufferAlignment()}; 979 const auto alignment{device.GetShaderStorageBufferAlignment()};
978 const auto [ssbo, buffer_offset] = 980 const auto [ssbo, buffer_offset] =
979 buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten()); 981 buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten());
980 bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); 982 bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
981} 983}
982 984
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 8b123c48d..9d20a4fbf 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -63,6 +63,7 @@ public:
63 void FlushRegion(CacheAddr addr, u64 size) override; 63 void FlushRegion(CacheAddr addr, u64 size) override;
64 void InvalidateRegion(CacheAddr addr, u64 size) override; 64 void InvalidateRegion(CacheAddr addr, u64 size) override;
65 void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override; 65 void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
66 void FlushCommands() override;
66 void TickFrame() override; 67 void TickFrame() override;
67 bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, 68 bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
68 const Tegra::Engines::Fermi2D::Regs::Surface& dst, 69 const Tegra::Engines::Fermi2D::Regs::Surface& dst,
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 1c90facc3..cf6a5cddf 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -212,7 +212,9 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
212 const auto texture_buffer_usage{variant.texture_buffer_usage}; 212 const auto texture_buffer_usage{variant.texture_buffer_usage};
213 213
214 std::string source = "#version 430 core\n" 214 std::string source = "#version 430 core\n"
215 "#extension GL_ARB_separate_shader_objects : enable\n"; 215 "#extension GL_ARB_separate_shader_objects : enable\n"
216 "#extension GL_NV_gpu_shader5 : enable\n"
217 "#extension GL_NV_shader_thread_group : enable\n";
216 if (entries.shader_viewport_layer_array) { 218 if (entries.shader_viewport_layer_array) {
217 source += "#extension GL_ARB_shader_viewport_layer_array : enable\n"; 219 source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
218 } 220 }
@@ -247,20 +249,24 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
247 if (!texture_buffer_usage.test(i)) { 249 if (!texture_buffer_usage.test(i)) {
248 continue; 250 continue;
249 } 251 }
250 source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i); 252 source += fmt::format("#define SAMPLER_{}_IS_BUFFER\n", i);
253 }
254 if (texture_buffer_usage.any()) {
255 source += '\n';
251 } 256 }
252 257
253 if (program_type == ProgramType::Geometry) { 258 if (program_type == ProgramType::Geometry) {
254 const auto [glsl_topology, debug_name, max_vertices] = 259 const auto [glsl_topology, debug_name, max_vertices] =
255 GetPrimitiveDescription(primitive_mode); 260 GetPrimitiveDescription(primitive_mode);
256 261
257 source += "layout (" + std::string(glsl_topology) + ") in;\n"; 262 source += "layout (" + std::string(glsl_topology) + ") in;\n\n";
258 source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n'; 263 source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
259 } 264 }
260 if (program_type == ProgramType::Compute) { 265 if (program_type == ProgramType::Compute) {
261 source += "layout (local_size_variable) in;\n"; 266 source += "layout (local_size_variable) in;\n";
262 } 267 }
263 268
269 source += '\n';
264 source += code; 270 source += code;
265 271
266 OGLShader shader; 272 OGLShader shader;
@@ -289,7 +295,7 @@ std::set<GLenum> GetSupportedFormats() {
289 295
290CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type, 296CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type,
291 GLShader::ProgramResult result) 297 GLShader::ProgramResult result)
292 : RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr}, 298 : RasterizerCacheObject{params.host_ptr}, cpu_addr{params.cpu_addr},
293 unique_identifier{params.unique_identifier}, program_type{program_type}, 299 unique_identifier{params.unique_identifier}, program_type{program_type},
294 disk_cache{params.disk_cache}, precompiled_programs{params.precompiled_programs}, 300 disk_cache{params.disk_cache}, precompiled_programs{params.precompiled_programs},
295 entries{result.second}, code{std::move(result.first)}, shader_length{entries.shader_length} {} 301 entries{result.second}, code{std::move(result.first)}, shader_length{entries.shader_length} {}
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index a3106a0ff..2c8faf855 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -106,7 +106,6 @@ private:
106 106
107 ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant) const; 107 ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant) const;
108 108
109 u8* host_ptr{};
110 VAddr cpu_addr{}; 109 VAddr cpu_addr{};
111 u64 unique_identifier{}; 110 u64 unique_identifier{};
112 ProgramType program_type{}; 111 ProgramType program_type{};
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index ffe26b241..359d58cbe 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -565,7 +565,7 @@ private:
565 case Tegra::Shader::ImageType::Texture1D: 565 case Tegra::Shader::ImageType::Texture1D:
566 return "image1D"; 566 return "image1D";
567 case Tegra::Shader::ImageType::TextureBuffer: 567 case Tegra::Shader::ImageType::TextureBuffer:
568 return "bufferImage"; 568 return "imageBuffer";
569 case Tegra::Shader::ImageType::Texture1DArray: 569 case Tegra::Shader::ImageType::Texture1DArray:
570 return "image1DArray"; 570 return "image1DArray";
571 case Tegra::Shader::ImageType::Texture2D: 571 case Tegra::Shader::ImageType::Texture2D:
@@ -1136,6 +1136,16 @@ private:
1136 Type::Float); 1136 Type::Float);
1137 } 1137 }
1138 1138
1139 std::string FCastHalf0(Operation operation) {
1140 const std::string op_a = VisitOperand(operation, 0, Type::HalfFloat);
1141 return fmt::format("({})[0]", op_a);
1142 }
1143
1144 std::string FCastHalf1(Operation operation) {
1145 const std::string op_a = VisitOperand(operation, 0, Type::HalfFloat);
1146 return fmt::format("({})[1]", op_a);
1147 }
1148
1139 template <Type type> 1149 template <Type type>
1140 std::string Min(Operation operation) { 1150 std::string Min(Operation operation) {
1141 return GenerateBinaryCall(operation, "min", type, type, type); 1151 return GenerateBinaryCall(operation, "min", type, type, type);
@@ -1292,6 +1302,11 @@ private:
1292 return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat)); 1302 return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat));
1293 } 1303 }
1294 1304
1305 std::string HCastFloat(Operation operation) {
1306 const std::string op_a = VisitOperand(operation, 0, Type::Float);
1307 return fmt::format("fromHalf2(vec2({}, 0.0f))", op_a);
1308 }
1309
1295 std::string HUnpack(Operation operation) { 1310 std::string HUnpack(Operation operation) {
1296 const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)}; 1311 const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)};
1297 const auto value = [&]() -> std::string { 1312 const auto value = [&]() -> std::string {
@@ -1720,6 +1735,48 @@ private:
1720 return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')'; 1735 return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';
1721 } 1736 }
1722 1737
1738 std::string BallotThread(Operation operation) {
1739 const std::string value = VisitOperand(operation, 0, Type::Bool);
1740 if (!device.HasWarpIntrinsics()) {
1741 LOG_ERROR(Render_OpenGL,
1742 "Nvidia warp intrinsics are not available and its required by a shader");
1743 // Stub on non-Nvidia devices by simulating all threads voting the same as the active
1744 // one.
1745 return fmt::format("utof({} ? 0xFFFFFFFFU : 0U)", value);
1746 }
1747 return fmt::format("utof(ballotThreadNV({}))", value);
1748 }
1749
1750 std::string Vote(Operation operation, const char* func) {
1751 const std::string value = VisitOperand(operation, 0, Type::Bool);
1752 if (!device.HasWarpIntrinsics()) {
1753 LOG_ERROR(Render_OpenGL,
1754                      "Nvidia vote intrinsics are not available, but they are required by a shader");
1755 // Stub with a warp size of one.
1756 return value;
1757 }
1758 return fmt::format("{}({})", func, value);
1759 }
1760
1761 std::string VoteAll(Operation operation) {
1762 return Vote(operation, "allThreadsNV");
1763 }
1764
1765 std::string VoteAny(Operation operation) {
1766 return Vote(operation, "anyThreadNV");
1767 }
1768
1769 std::string VoteEqual(Operation operation) {
1770 if (!device.HasWarpIntrinsics()) {
1771 LOG_ERROR(Render_OpenGL,
1772                      "Nvidia vote intrinsics are not available, but they are required by a shader");
1773 // We must return true here since a stub for a theoretical warp size of 1 will always
1774 // return an equal result for all its votes.
1775 return "true";
1776 }
1777 return Vote(operation, "allThreadsEqualNV");
1778 }
1779
1723 static constexpr std::array operation_decompilers = { 1780 static constexpr std::array operation_decompilers = {
1724 &GLSLDecompiler::Assign, 1781 &GLSLDecompiler::Assign,
1725 1782
@@ -1732,6 +1789,8 @@ private:
1732 &GLSLDecompiler::Negate<Type::Float>, 1789 &GLSLDecompiler::Negate<Type::Float>,
1733 &GLSLDecompiler::Absolute<Type::Float>, 1790 &GLSLDecompiler::Absolute<Type::Float>,
1734 &GLSLDecompiler::FClamp, 1791 &GLSLDecompiler::FClamp,
1792 &GLSLDecompiler::FCastHalf0,
1793 &GLSLDecompiler::FCastHalf1,
1735 &GLSLDecompiler::Min<Type::Float>, 1794 &GLSLDecompiler::Min<Type::Float>,
1736 &GLSLDecompiler::Max<Type::Float>, 1795 &GLSLDecompiler::Max<Type::Float>,
1737 &GLSLDecompiler::FCos, 1796 &GLSLDecompiler::FCos,
@@ -1792,6 +1851,7 @@ private:
1792 &GLSLDecompiler::Absolute<Type::HalfFloat>, 1851 &GLSLDecompiler::Absolute<Type::HalfFloat>,
1793 &GLSLDecompiler::HNegate, 1852 &GLSLDecompiler::HNegate,
1794 &GLSLDecompiler::HClamp, 1853 &GLSLDecompiler::HClamp,
1854 &GLSLDecompiler::HCastFloat,
1795 &GLSLDecompiler::HUnpack, 1855 &GLSLDecompiler::HUnpack,
1796 &GLSLDecompiler::HMergeF32, 1856 &GLSLDecompiler::HMergeF32,
1797 &GLSLDecompiler::HMergeH0, 1857 &GLSLDecompiler::HMergeH0,
@@ -1867,6 +1927,11 @@ private:
1867 &GLSLDecompiler::WorkGroupId<0>, 1927 &GLSLDecompiler::WorkGroupId<0>,
1868 &GLSLDecompiler::WorkGroupId<1>, 1928 &GLSLDecompiler::WorkGroupId<1>,
1869 &GLSLDecompiler::WorkGroupId<2>, 1929 &GLSLDecompiler::WorkGroupId<2>,
1930
1931 &GLSLDecompiler::BallotThread,
1932 &GLSLDecompiler::VoteAll,
1933 &GLSLDecompiler::VoteAny,
1934 &GLSLDecompiler::VoteEqual,
1870 }; 1935 };
1871 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); 1936 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
1872 1937
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 408332f90..4f135fe03 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -184,6 +184,9 @@ GLint GetSwizzleSource(SwizzleSource source) {
184} 184}
185 185
186void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { 186void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) {
187 if (params.IsBuffer()) {
188 return;
189 }
187 glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR); 190 glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
188 glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR); 191 glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
189 glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); 192 glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
@@ -208,6 +211,7 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
208 glNamedBufferStorage(texture_buffer.handle, params.width * params.GetBytesPerPixel(), 211 glNamedBufferStorage(texture_buffer.handle, params.width * params.GetBytesPerPixel(),
209 nullptr, GL_DYNAMIC_STORAGE_BIT); 212 nullptr, GL_DYNAMIC_STORAGE_BIT);
210 glTextureBuffer(texture.handle, internal_format, texture_buffer.handle); 213 glTextureBuffer(texture.handle, internal_format, texture_buffer.handle);
214 break;
211 case SurfaceTarget::Texture2D: 215 case SurfaceTarget::Texture2D:
212 case SurfaceTarget::TextureCubemap: 216 case SurfaceTarget::TextureCubemap:
213 glTextureStorage2D(texture.handle, params.emulated_levels, internal_format, params.width, 217 glTextureStorage2D(texture.handle, params.emulated_levels, internal_format, params.width,
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index ff6ab6988..21324488a 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -51,7 +51,7 @@ public:
51 } 51 }
52 52
53protected: 53protected:
54 void DecorateSurfaceName(); 54 void DecorateSurfaceName() override;
55 55
56 View CreateView(const ViewParams& view_key) override; 56 View CreateView(const ViewParams& view_key) override;
57 View CreateViewInner(const ViewParams& view_key, bool is_proxy); 57 View CreateViewInner(const ViewParams& view_key, bool is_proxy);
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index d267712c9..a35b45c9c 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -735,6 +735,16 @@ private:
735 return {}; 735 return {};
736 } 736 }
737 737
738 Id FCastHalf0(Operation operation) {
739 UNIMPLEMENTED();
740 return {};
741 }
742
743 Id FCastHalf1(Operation operation) {
744 UNIMPLEMENTED();
745 return {};
746 }
747
738 Id HNegate(Operation operation) { 748 Id HNegate(Operation operation) {
739 UNIMPLEMENTED(); 749 UNIMPLEMENTED();
740 return {}; 750 return {};
@@ -745,6 +755,11 @@ private:
745 return {}; 755 return {};
746 } 756 }
747 757
758 Id HCastFloat(Operation operation) {
759 UNIMPLEMENTED();
760 return {};
761 }
762
748 Id HUnpack(Operation operation) { 763 Id HUnpack(Operation operation) {
749 UNIMPLEMENTED(); 764 UNIMPLEMENTED();
750 return {}; 765 return {};
@@ -1057,6 +1072,26 @@ private:
1057 return {}; 1072 return {};
1058 } 1073 }
1059 1074
1075 Id BallotThread(Operation) {
1076 UNIMPLEMENTED();
1077 return {};
1078 }
1079
1080 Id VoteAll(Operation) {
1081 UNIMPLEMENTED();
1082 return {};
1083 }
1084
1085 Id VoteAny(Operation) {
1086 UNIMPLEMENTED();
1087 return {};
1088 }
1089
1090 Id VoteEqual(Operation) {
1091 UNIMPLEMENTED();
1092 return {};
1093 }
1094
1060 Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type, 1095 Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,
1061 const std::string& name) { 1096 const std::string& name) {
1062 const Id id = OpVariable(type, storage); 1097 const Id id = OpVariable(type, storage);
@@ -1210,6 +1245,8 @@ private:
1210 &SPIRVDecompiler::Unary<&Module::OpFNegate, Type::Float>, 1245 &SPIRVDecompiler::Unary<&Module::OpFNegate, Type::Float>,
1211 &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::Float>, 1246 &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::Float>,
1212 &SPIRVDecompiler::Ternary<&Module::OpFClamp, Type::Float>, 1247 &SPIRVDecompiler::Ternary<&Module::OpFClamp, Type::Float>,
1248 &SPIRVDecompiler::FCastHalf0,
1249 &SPIRVDecompiler::FCastHalf1,
1213 &SPIRVDecompiler::Binary<&Module::OpFMin, Type::Float>, 1250 &SPIRVDecompiler::Binary<&Module::OpFMin, Type::Float>,
1214 &SPIRVDecompiler::Binary<&Module::OpFMax, Type::Float>, 1251 &SPIRVDecompiler::Binary<&Module::OpFMax, Type::Float>,
1215 &SPIRVDecompiler::Unary<&Module::OpCos, Type::Float>, 1252 &SPIRVDecompiler::Unary<&Module::OpCos, Type::Float>,
@@ -1270,6 +1307,7 @@ private:
1270 &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>, 1307 &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>,
1271 &SPIRVDecompiler::HNegate, 1308 &SPIRVDecompiler::HNegate,
1272 &SPIRVDecompiler::HClamp, 1309 &SPIRVDecompiler::HClamp,
1310 &SPIRVDecompiler::HCastFloat,
1273 &SPIRVDecompiler::HUnpack, 1311 &SPIRVDecompiler::HUnpack,
1274 &SPIRVDecompiler::HMergeF32, 1312 &SPIRVDecompiler::HMergeF32,
1275 &SPIRVDecompiler::HMergeH0, 1313 &SPIRVDecompiler::HMergeH0,
@@ -1346,6 +1384,11 @@ private:
1346 &SPIRVDecompiler::WorkGroupId<0>, 1384 &SPIRVDecompiler::WorkGroupId<0>,
1347 &SPIRVDecompiler::WorkGroupId<1>, 1385 &SPIRVDecompiler::WorkGroupId<1>,
1348 &SPIRVDecompiler::WorkGroupId<2>, 1386 &SPIRVDecompiler::WorkGroupId<2>,
1387
1388 &SPIRVDecompiler::BallotThread,
1389 &SPIRVDecompiler::VoteAll,
1390 &SPIRVDecompiler::VoteAny,
1391 &SPIRVDecompiler::VoteEqual,
1349 }; 1392 };
1350 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); 1393 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
1351 1394
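
Editor's note: the Vulkan decompiler only gains UNIMPLEMENTED stubs here (FCastHalf0/1, HCastFloat, BallotThread, Vote*), but the stubs are still mandatory: operation_decompilers is indexed by OperationCode and the trailing static_assert pins its size to OperationCode::Amount. A reduced sketch of that coupling, with the member-pointer table simplified to free functions:

    #include <array>
    #include <cstddef>

    enum class OperationCode { FAdd, FCastHalf0, FCastHalf1, VoteAll, Amount };

    using DecompileFn = void (*)();
    void Stub() {} // stands in for SPIRVDecompiler::FCastHalf0 and friends

    constexpr std::array<DecompileFn, static_cast<std::size_t>(OperationCode::Amount)>
        operation_decompilers{Stub, Stub, Stub, Stub};

    // Adding an OperationCode in node.h without a matching entry (even a stub)
    // changes Amount and makes this compile-time check fail.
    static_assert(operation_decompilers.size() ==
                  static_cast<std::size_t>(OperationCode::Amount));
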
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index b547d8323..47a9fd961 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -176,6 +176,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
176 {OpCode::Type::Ffma, &ShaderIR::DecodeFfma}, 176 {OpCode::Type::Ffma, &ShaderIR::DecodeFfma},
177 {OpCode::Type::Hfma2, &ShaderIR::DecodeHfma2}, 177 {OpCode::Type::Hfma2, &ShaderIR::DecodeHfma2},
178 {OpCode::Type::Conversion, &ShaderIR::DecodeConversion}, 178 {OpCode::Type::Conversion, &ShaderIR::DecodeConversion},
179 {OpCode::Type::Warp, &ShaderIR::DecodeWarp},
179 {OpCode::Type::Memory, &ShaderIR::DecodeMemory}, 180 {OpCode::Type::Memory, &ShaderIR::DecodeMemory},
180 {OpCode::Type::Texture, &ShaderIR::DecodeTexture}, 181 {OpCode::Type::Texture, &ShaderIR::DecodeTexture},
181 {OpCode::Type::Image, &ShaderIR::DecodeImage}, 182 {OpCode::Type::Image, &ShaderIR::DecodeImage},
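
Editor's note: registering {OpCode::Type::Warp, &ShaderIR::DecodeWarp} is all the dispatcher needs; DecodeInstr looks up the instruction's type and calls the matching decoder through a pointer-to-member. A stripped-down sketch of that pattern (the types here are assumptions, not the real ShaderIR):

    #include <cstdint>
    #include <unordered_map>

    struct MiniIR {
        std::uint32_t DecodeConversion(std::uint32_t pc) { return pc + 1; }
        std::uint32_t DecodeWarp(std::uint32_t pc) { return pc + 1; }
    };

    enum class Type { Conversion, Warp };
    using DecoderFn = std::uint32_t (MiniIR::*)(std::uint32_t);

    std::uint32_t Decode(MiniIR& ir, Type type, std::uint32_t pc) {
        static const std::unordered_map<Type, DecoderFn> table{
            {Type::Conversion, &MiniIR::DecodeConversion},
            {Type::Warp, &MiniIR::DecodeWarp}, // the entry added by this commit
        };
        return (ir.*table.at(type))(pc);
    }
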
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index 4221f0c58..32facd6ba 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -14,6 +14,12 @@ using Tegra::Shader::Instruction;
14using Tegra::Shader::OpCode; 14using Tegra::Shader::OpCode;
15using Tegra::Shader::Register; 15using Tegra::Shader::Register;
16 16
17namespace {
18constexpr OperationCode GetFloatSelector(u64 selector) {
19 return selector == 0 ? OperationCode::FCastHalf0 : OperationCode::FCastHalf1;
20}
21} // Anonymous namespace
22
17u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { 23u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
18 const Instruction instr = {program_code[pc]}; 24 const Instruction instr = {program_code[pc]};
19 const auto opcode = OpCode::Decode(instr); 25 const auto opcode = OpCode::Decode(instr);
@@ -22,7 +28,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
22 case OpCode::Id::I2I_R: 28 case OpCode::Id::I2I_R:
23 case OpCode::Id::I2I_C: 29 case OpCode::Id::I2I_C:
24 case OpCode::Id::I2I_IMM: { 30 case OpCode::Id::I2I_IMM: {
25 UNIMPLEMENTED_IF(instr.conversion.selector); 31 UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
26 UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word); 32 UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
27 UNIMPLEMENTED_IF(instr.alu.saturate_d); 33 UNIMPLEMENTED_IF(instr.alu.saturate_d);
28 34
@@ -57,8 +63,8 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
57 case OpCode::Id::I2F_R: 63 case OpCode::Id::I2F_R:
58 case OpCode::Id::I2F_C: 64 case OpCode::Id::I2F_C:
59 case OpCode::Id::I2F_IMM: { 65 case OpCode::Id::I2F_IMM: {
60 UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word); 66 UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
61 UNIMPLEMENTED_IF(instr.conversion.selector); 67 UNIMPLEMENTED_IF(instr.conversion.dst_size == Register::Size::Long);
62 UNIMPLEMENTED_IF_MSG(instr.generates_cc, 68 UNIMPLEMENTED_IF_MSG(instr.generates_cc,
63 "Condition codes generation in I2F is not implemented"); 69 "Condition codes generation in I2F is not implemented");
64 70
@@ -82,14 +88,19 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
82 value = GetOperandAbsNegFloat(value, false, instr.conversion.negate_a); 88 value = GetOperandAbsNegFloat(value, false, instr.conversion.negate_a);
83 89
84 SetInternalFlagsFromFloat(bb, value, instr.generates_cc); 90 SetInternalFlagsFromFloat(bb, value, instr.generates_cc);
91
92 if (instr.conversion.dst_size == Register::Size::Short) {
93 value = Operation(OperationCode::HCastFloat, PRECISE, value);
94 }
95
85 SetRegister(bb, instr.gpr0, value); 96 SetRegister(bb, instr.gpr0, value);
86 break; 97 break;
87 } 98 }
88 case OpCode::Id::F2F_R: 99 case OpCode::Id::F2F_R:
89 case OpCode::Id::F2F_C: 100 case OpCode::Id::F2F_C:
90 case OpCode::Id::F2F_IMM: { 101 case OpCode::Id::F2F_IMM: {
91 UNIMPLEMENTED_IF(instr.conversion.f2f.dst_size != Register::Size::Word); 102 UNIMPLEMENTED_IF(instr.conversion.dst_size == Register::Size::Long);
92 UNIMPLEMENTED_IF(instr.conversion.f2f.src_size != Register::Size::Word); 103 UNIMPLEMENTED_IF(instr.conversion.src_size == Register::Size::Long);
93 UNIMPLEMENTED_IF_MSG(instr.generates_cc, 104 UNIMPLEMENTED_IF_MSG(instr.generates_cc,
94 "Condition codes generation in F2F is not implemented"); 105 "Condition codes generation in F2F is not implemented");
95 106
@@ -107,6 +118,13 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
107 } 118 }
108 }(); 119 }();
109 120
121 if (instr.conversion.src_size == Register::Size::Short) {
122 value = Operation(GetFloatSelector(instr.conversion.float_src.selector), NO_PRECISE,
123 std::move(value));
124 } else {
125 ASSERT(instr.conversion.float_src.selector == 0);
126 }
127
110 value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a); 128 value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
111 129
112 value = [&]() { 130 value = [&]() {
@@ -124,19 +142,24 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
124 default: 142 default:
125 UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}", 143 UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}",
126 static_cast<u32>(instr.conversion.f2f.rounding.Value())); 144 static_cast<u32>(instr.conversion.f2f.rounding.Value()));
127 return Immediate(0); 145 return value;
128 } 146 }
129 }(); 147 }();
130 value = GetSaturatedFloat(value, instr.alu.saturate_d); 148 value = GetSaturatedFloat(value, instr.alu.saturate_d);
131 149
132 SetInternalFlagsFromFloat(bb, value, instr.generates_cc); 150 SetInternalFlagsFromFloat(bb, value, instr.generates_cc);
151
152 if (instr.conversion.dst_size == Register::Size::Short) {
153 value = Operation(OperationCode::HCastFloat, PRECISE, value);
154 }
155
133 SetRegister(bb, instr.gpr0, value); 156 SetRegister(bb, instr.gpr0, value);
134 break; 157 break;
135 } 158 }
136 case OpCode::Id::F2I_R: 159 case OpCode::Id::F2I_R:
137 case OpCode::Id::F2I_C: 160 case OpCode::Id::F2I_C:
138 case OpCode::Id::F2I_IMM: { 161 case OpCode::Id::F2I_IMM: {
139 UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word); 162 UNIMPLEMENTED_IF(instr.conversion.src_size == Register::Size::Long);
140 UNIMPLEMENTED_IF_MSG(instr.generates_cc, 163 UNIMPLEMENTED_IF_MSG(instr.generates_cc,
141 "Condition codes generation in F2I is not implemented"); 164 "Condition codes generation in F2I is not implemented");
142 Node value = [&]() { 165 Node value = [&]() {
@@ -153,6 +176,13 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
153 } 176 }
154 }(); 177 }();
155 178
179 if (instr.conversion.src_size == Register::Size::Short) {
180 value = Operation(GetFloatSelector(instr.conversion.float_src.selector), NO_PRECISE,
181 std::move(value));
182 } else {
183 ASSERT(instr.conversion.float_src.selector == 0);
184 }
185
156 value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a); 186 value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
157 187
158 value = [&]() { 188 value = [&]() {
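
Editor's note: with Register::Size::Short now accepted, a 32-bit GPR carries two packed f16 values. GetFloatSelector maps instr.conversion.float_src.selector to FCastHalf0 (low lane) or FCastHalf1 (high lane) before the float path runs, and HCastFloat narrows the f32 result back when dst_size is Short. A plain-integer sketch of the lane selection these opcodes stand for; the f16 to f32 widening itself is backend work and is elided:

    #include <cstdint>

    // Lane picked by GetFloatSelector: selector 0 -> FCastHalf0 (low 16 bits),
    // anything else -> FCastHalf1 (high 16 bits).
    std::uint16_t SelectHalfLane(std::uint32_t packed_reg, std::uint64_t selector) {
        return selector == 0 ? static_cast<std::uint16_t>(packed_reg & 0xFFFFu)
                             : static_cast<std::uint16_t>(packed_reg >> 16);
    }
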
diff --git a/src/video_core/shader/decode/float_set.cpp b/src/video_core/shader/decode/float_set.cpp
index f5013e44a..5614e8a0d 100644
--- a/src/video_core/shader/decode/float_set.cpp
+++ b/src/video_core/shader/decode/float_set.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;
15 15
16u32 ShaderIR::DecodeFloatSet(NodeBlock& bb, u32 pc) { 16u32 ShaderIR::DecodeFloatSet(NodeBlock& bb, u32 pc) {
17 const Instruction instr = {program_code[pc]}; 17 const Instruction instr = {program_code[pc]};
18 const auto opcode = OpCode::Decode(instr);
19 18
20 const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fset.abs_a != 0, 19 const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fset.abs_a != 0,
21 instr.fset.neg_a != 0); 20 instr.fset.neg_a != 0);
diff --git a/src/video_core/shader/decode/float_set_predicate.cpp b/src/video_core/shader/decode/float_set_predicate.cpp
index 2323052b0..200c2c983 100644
--- a/src/video_core/shader/decode/float_set_predicate.cpp
+++ b/src/video_core/shader/decode/float_set_predicate.cpp
@@ -16,10 +16,9 @@ using Tegra::Shader::Pred;
16 16
17u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) { 17u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {
18 const Instruction instr = {program_code[pc]}; 18 const Instruction instr = {program_code[pc]};
19 const auto opcode = OpCode::Decode(instr);
20 19
21 const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0, 20 Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0,
22 instr.fsetp.neg_a != 0); 21 instr.fsetp.neg_a != 0);
23 Node op_b = [&]() { 22 Node op_b = [&]() {
24 if (instr.is_b_imm) { 23 if (instr.is_b_imm) {
25 return GetImmediate19(instr); 24 return GetImmediate19(instr);
@@ -29,12 +28,13 @@ u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {
29 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); 28 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
30 } 29 }
31 }(); 30 }();
32 op_b = GetOperandAbsNegFloat(op_b, instr.fsetp.abs_b, false); 31 op_b = GetOperandAbsNegFloat(std::move(op_b), instr.fsetp.abs_b, instr.fsetp.neg_b);
33 32
34 // We can't use the constant predicate as destination. 33 // We can't use the constant predicate as destination.
35 ASSERT(instr.fsetp.pred3 != static_cast<u64>(Pred::UnusedIndex)); 34 ASSERT(instr.fsetp.pred3 != static_cast<u64>(Pred::UnusedIndex));
36 35
37 const Node predicate = GetPredicateComparisonFloat(instr.fsetp.cond, op_a, op_b); 36 const Node predicate =
37 GetPredicateComparisonFloat(instr.fsetp.cond, std::move(op_a), std::move(op_b));
38 const Node second_pred = GetPredicate(instr.fsetp.pred39, instr.fsetp.neg_pred != 0); 38 const Node second_pred = GetPredicate(instr.fsetp.pred39, instr.fsetp.neg_pred != 0);
39 39
40 const OperationCode combiner = GetPredicateCombiner(instr.fsetp.op); 40 const OperationCode combiner = GetPredicateCombiner(instr.fsetp.op);
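
Editor's note: the behavioural fix in FSETP is that operand B now honours its negate bit; the old call hard-coded false. What GetOperandAbsNegFloat amounts to on a concrete value, as a hedged sketch (the IR composes FAbsolute first, then FNegate):

    #include <cmath>

    // Approximation of GetOperandAbsNegFloat: absolute-value modifier first,
    // then the negation modifier, so both set yields -|x|.
    float ApplyAbsNeg(float value, bool abs, bool neg) {
        if (abs) {
            value = std::fabs(value);
        }
        if (neg) {
            value = -value;
        }
        return value;
    }
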
diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp
index a6c082cc9..afea33e5f 100644
--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -30,7 +30,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
30 case OpCode::Id::HSETP2_C: 30 case OpCode::Id::HSETP2_C:
31 cond = instr.hsetp2.cbuf_and_imm.cond; 31 cond = instr.hsetp2.cbuf_and_imm.cond;
32 h_and = instr.hsetp2.cbuf_and_imm.h_and; 32 h_and = instr.hsetp2.cbuf_and_imm.h_and;
33 op_b = GetOperandAbsNegHalf(GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset), 33 op_b = GetOperandAbsNegHalf(GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()),
34 instr.hsetp2.cbuf.abs_b, instr.hsetp2.cbuf.negate_b); 34 instr.hsetp2.cbuf.abs_b, instr.hsetp2.cbuf.negate_b);
35 break; 35 break;
36 case OpCode::Id::HSETP2_IMM: 36 case OpCode::Id::HSETP2_IMM:
diff --git a/src/video_core/shader/decode/integer_set.cpp b/src/video_core/shader/decode/integer_set.cpp
index 46e3d5905..59809bcd8 100644
--- a/src/video_core/shader/decode/integer_set.cpp
+++ b/src/video_core/shader/decode/integer_set.cpp
@@ -14,7 +14,6 @@ using Tegra::Shader::OpCode;
14 14
15u32 ShaderIR::DecodeIntegerSet(NodeBlock& bb, u32 pc) { 15u32 ShaderIR::DecodeIntegerSet(NodeBlock& bb, u32 pc) {
16 const Instruction instr = {program_code[pc]}; 16 const Instruction instr = {program_code[pc]};
17 const auto opcode = OpCode::Decode(instr);
18 17
19 const Node op_a = GetRegister(instr.gpr8); 18 const Node op_a = GetRegister(instr.gpr8);
20 const Node op_b = [&]() { 19 const Node op_b = [&]() {
diff --git a/src/video_core/shader/decode/integer_set_predicate.cpp b/src/video_core/shader/decode/integer_set_predicate.cpp
index dd20775d7..25e48fef8 100644
--- a/src/video_core/shader/decode/integer_set_predicate.cpp
+++ b/src/video_core/shader/decode/integer_set_predicate.cpp
@@ -16,7 +16,6 @@ using Tegra::Shader::Pred;
16 16
17u32 ShaderIR::DecodeIntegerSetPredicate(NodeBlock& bb, u32 pc) { 17u32 ShaderIR::DecodeIntegerSetPredicate(NodeBlock& bb, u32 pc) {
18 const Instruction instr = {program_code[pc]}; 18 const Instruction instr = {program_code[pc]};
19 const auto opcode = OpCode::Decode(instr);
20 19
21 const Node op_a = GetRegister(instr.gpr8); 20 const Node op_a = GetRegister(instr.gpr8);
22 21
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index ac0e764d6..d46e0f823 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -74,6 +74,13 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
74 case SystemVariable::InvocationInfo: 74 case SystemVariable::InvocationInfo:
75 LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete"); 75 LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete");
76 return Immediate(0u); 76 return Immediate(0u);
77 case SystemVariable::Tid: {
78 Node value = Immediate(0);
79 value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdX), 0, 9);
80 value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdY), 16, 9);
81 value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdZ), 26, 5);
82 return value;
83 }
77 case SystemVariable::TidX: 84 case SystemVariable::TidX:
78 return Operation(OperationCode::LocalInvocationIdX); 85 return Operation(OperationCode::LocalInvocationIdX);
79 case SystemVariable::TidY: 86 case SystemVariable::TidY:
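
Editor's note: SystemVariable::Tid packs all three local invocation indices into one 32-bit register: X in bits [0,9), Y in bits [16,9) and Z in bits [26,5), which is exactly what the three BitfieldInsert calls build up. The equivalent plain-integer packing:

    #include <cstdint>

    // Packs local invocation IDs the way the MOV_SYS Tid path composes them.
    std::uint32_t PackTid(std::uint32_t x, std::uint32_t y, std::uint32_t z) {
        std::uint32_t value = 0;
        value |= (x & 0x1FFu);        // bits  0..8  (9 bits)
        value |= (y & 0x1FFu) << 16;  // bits 16..24 (9 bits)
        value |= (z & 0x1Fu) << 26;   // bits 26..30 (5 bits)
        return value;
    }
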
diff --git a/src/video_core/shader/decode/predicate_set_register.cpp b/src/video_core/shader/decode/predicate_set_register.cpp
index febbfeb50..84dbc50fe 100644
--- a/src/video_core/shader/decode/predicate_set_register.cpp
+++ b/src/video_core/shader/decode/predicate_set_register.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;
15 15
16u32 ShaderIR::DecodePredicateSetRegister(NodeBlock& bb, u32 pc) { 16u32 ShaderIR::DecodePredicateSetRegister(NodeBlock& bb, u32 pc) {
17 const Instruction instr = {program_code[pc]}; 17 const Instruction instr = {program_code[pc]};
18 const auto opcode = OpCode::Decode(instr);
19 18
20 UNIMPLEMENTED_IF_MSG(instr.generates_cc, 19 UNIMPLEMENTED_IF_MSG(instr.generates_cc,
21 "Condition codes generation in PSET is not implemented"); 20 "Condition codes generation in PSET is not implemented");
diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp
new file mode 100644
index 000000000..04ca74f46
--- /dev/null
+++ b/src/video_core/shader/decode/warp.cpp
@@ -0,0 +1,55 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "common/assert.h"
6#include "common/common_types.h"
7#include "video_core/engines/shader_bytecode.h"
8#include "video_core/shader/node_helper.h"
9#include "video_core/shader/shader_ir.h"
10
11namespace VideoCommon::Shader {
12
13using Tegra::Shader::Instruction;
14using Tegra::Shader::OpCode;
15using Tegra::Shader::Pred;
16using Tegra::Shader::VoteOperation;
17
18namespace {
19OperationCode GetOperationCode(VoteOperation vote_op) {
20 switch (vote_op) {
21 case VoteOperation::All:
22 return OperationCode::VoteAll;
23 case VoteOperation::Any:
24 return OperationCode::VoteAny;
25 case VoteOperation::Eq:
26 return OperationCode::VoteEqual;
27 default:
28 UNREACHABLE_MSG("Invalid vote operation={}", static_cast<u64>(vote_op));
29 return OperationCode::VoteAll;
30 }
31}
32} // Anonymous namespace
33
34u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) {
35 const Instruction instr = {program_code[pc]};
36 const auto opcode = OpCode::Decode(instr);
37
38 switch (opcode->get().GetId()) {
39 case OpCode::Id::VOTE: {
40 const Node value = GetPredicate(instr.vote.value, instr.vote.negate_value != 0);
41 const Node active = Operation(OperationCode::BallotThread, value);
42 const Node vote = Operation(GetOperationCode(instr.vote.operation), value);
43 SetRegister(bb, instr.gpr0, active);
44 SetPredicate(bb, instr.vote.dest_pred, vote);
45 break;
46 }
47 default:
48 UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName());
49 break;
50 }
51
52 return pc;
53}
54
55} // namespace VideoCommon::Shader
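
Editor's note: VOTE produces two results, which is why the decoder emits two nodes: gpr0 receives the ballot (which active threads passed the predicate) and the destination predicate receives the warp-wide vote reduction. A scalar model of the three reductions over a snapshot of per-lane predicate values (illustrative only, the real evaluation is done by the host GPU backend):

    #include <vector>

    // VOTE.ALL: every active lane's predicate is true.
    bool VoteAll(const std::vector<bool>& lanes) {
        for (bool p : lanes) {
            if (!p) return false;
        }
        return true;
    }

    // VOTE.ANY: at least one active lane's predicate is true.
    bool VoteAny(const std::vector<bool>& lanes) {
        for (bool p : lanes) {
            if (p) return true;
        }
        return false;
    }

    // VOTE.EQ: all active lanes agree, regardless of the agreed value.
    bool VoteEqual(const std::vector<bool>& lanes) {
        return VoteAll(lanes) || !VoteAny(lanes);
    }
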
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 715184d67..5db9313c4 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -30,6 +30,8 @@ enum class OperationCode {
30 FNegate, /// (MetaArithmetic, float a) -> float 30 FNegate, /// (MetaArithmetic, float a) -> float
31 FAbsolute, /// (MetaArithmetic, float a) -> float 31 FAbsolute, /// (MetaArithmetic, float a) -> float
32 FClamp, /// (MetaArithmetic, float value, float min, float max) -> float 32 FClamp, /// (MetaArithmetic, float value, float min, float max) -> float
33 FCastHalf0, /// (MetaArithmetic, f16vec2 a) -> float
34 FCastHalf1, /// (MetaArithmetic, f16vec2 a) -> float
33 FMin, /// (MetaArithmetic, float a, float b) -> float 35 FMin, /// (MetaArithmetic, float a, float b) -> float
34 FMax, /// (MetaArithmetic, float a, float b) -> float 36 FMax, /// (MetaArithmetic, float a, float b) -> float
35 FCos, /// (MetaArithmetic, float a) -> float 37 FCos, /// (MetaArithmetic, float a) -> float
@@ -83,17 +85,18 @@ enum class OperationCode {
83 UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint 85 UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint
84 UBitCount, /// (MetaArithmetic, uint) -> uint 86 UBitCount, /// (MetaArithmetic, uint) -> uint
85 87
86 HAdd, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 88 HAdd, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
87 HMul, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 89 HMul, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
88 HFma, /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2 90 HFma, /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
89 HAbsolute, /// (f16vec2 a) -> f16vec2 91 HAbsolute, /// (f16vec2 a) -> f16vec2
90 HNegate, /// (f16vec2 a, bool first, bool second) -> f16vec2 92 HNegate, /// (f16vec2 a, bool first, bool second) -> f16vec2
91 HClamp, /// (f16vec2 src, float min, float max) -> f16vec2 93 HClamp, /// (f16vec2 src, float min, float max) -> f16vec2
92 HUnpack, /// (Tegra::Shader::HalfType, T value) -> f16vec2 94 HCastFloat, /// (MetaArithmetic, float a) -> f16vec2
93 HMergeF32, /// (f16vec2 src) -> float 95 HUnpack, /// (Tegra::Shader::HalfType, T value) -> f16vec2
94 HMergeH0, /// (f16vec2 dest, f16vec2 src) -> f16vec2 96 HMergeF32, /// (f16vec2 src) -> float
95 HMergeH1, /// (f16vec2 dest, f16vec2 src) -> f16vec2 97 HMergeH0, /// (f16vec2 dest, f16vec2 src) -> f16vec2
96 HPack2, /// (float a, float b) -> f16vec2 98 HMergeH1, /// (f16vec2 dest, f16vec2 src) -> f16vec2
99 HPack2, /// (float a, float b) -> f16vec2
97 100
98 LogicalAssign, /// (bool& dst, bool src) -> void 101 LogicalAssign, /// (bool& dst, bool src) -> void
99 LogicalAnd, /// (bool a, bool b) -> bool 102 LogicalAnd, /// (bool a, bool b) -> bool
@@ -165,6 +168,11 @@ enum class OperationCode {
165 WorkGroupIdY, /// () -> uint 168 WorkGroupIdY, /// () -> uint
166 WorkGroupIdZ, /// () -> uint 169 WorkGroupIdZ, /// () -> uint
167 170
171 BallotThread, /// (bool) -> uint
172 VoteAll, /// (bool) -> bool
173 VoteAny, /// (bool) -> bool
174 VoteEqual, /// (bool) -> bool
175
168 Amount, 176 Amount,
169}; 177};
170 178
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 5e91fe129..1e5c7f660 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -405,4 +405,9 @@ Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) {
405 Immediate(offset), Immediate(bits)); 405 Immediate(offset), Immediate(bits));
406} 406}
407 407
408Node ShaderIR::BitfieldInsert(Node base, Node insert, u32 offset, u32 bits) {
409 return Operation(OperationCode::UBitfieldInsert, NO_PRECISE, base, insert, Immediate(offset),
410 Immediate(bits));
411}
412
408} // namespace VideoCommon::Shader 413} // namespace VideoCommon::Shader
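
Editor's note: the new BitfieldInsert helper mirrors BitfieldExtract, wrapping OperationCode::UBitfieldInsert with immediate offset and bit-count operands. On plain integers the operation reduces to the usual mask-and-merge (sketch assumes offset + bits <= 32):

    #include <cstdint>

    // Integer equivalent of UBitfieldInsert(base, insert, offset, bits):
    // replace `bits` bits of `base` starting at `offset` with the low bits of `insert`.
    std::uint32_t BitfieldInsert(std::uint32_t base, std::uint32_t insert,
                                 std::uint32_t offset, std::uint32_t bits) {
        const std::uint32_t mask = (bits >= 32) ? 0xFFFFFFFFu : ((1u << bits) - 1u);
        return (base & ~(mask << offset)) | ((insert & mask) << offset);
    }
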
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 59a083d90..bcc9b79b6 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -167,6 +167,7 @@ private:
167 u32 DecodeFfma(NodeBlock& bb, u32 pc); 167 u32 DecodeFfma(NodeBlock& bb, u32 pc);
168 u32 DecodeHfma2(NodeBlock& bb, u32 pc); 168 u32 DecodeHfma2(NodeBlock& bb, u32 pc);
169 u32 DecodeConversion(NodeBlock& bb, u32 pc); 169 u32 DecodeConversion(NodeBlock& bb, u32 pc);
170 u32 DecodeWarp(NodeBlock& bb, u32 pc);
170 u32 DecodeMemory(NodeBlock& bb, u32 pc); 171 u32 DecodeMemory(NodeBlock& bb, u32 pc);
171 u32 DecodeTexture(NodeBlock& bb, u32 pc); 172 u32 DecodeTexture(NodeBlock& bb, u32 pc);
172 u32 DecodeImage(NodeBlock& bb, u32 pc); 173 u32 DecodeImage(NodeBlock& bb, u32 pc);
@@ -279,6 +280,9 @@ private:
279 /// Extracts a sequence of bits from a node 280 /// Extracts a sequence of bits from a node
280 Node BitfieldExtract(Node value, u32 offset, u32 bits); 281 Node BitfieldExtract(Node value, u32 offset, u32 bits);
281 282
283 /// Inserts a sequence of bits from a node
284 Node BitfieldInsert(Node base, Node insert, u32 offset, u32 bits);
285
282 void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr, 286 void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
283 const Node4& components); 287 const Node4& components);
284 288
diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h
index 358d6757c..e7ef66ee2 100644
--- a/src/video_core/texture_cache/surface_params.h
+++ b/src/video_core/texture_cache/surface_params.h
@@ -58,7 +58,6 @@ public:
58 std::size_t GetHostSizeInBytes() const { 58 std::size_t GetHostSizeInBytes() const {
59 std::size_t host_size_in_bytes; 59 std::size_t host_size_in_bytes;
60 if (GetCompressionType() == SurfaceCompression::Converted) { 60 if (GetCompressionType() == SurfaceCompression::Converted) {
61 constexpr std::size_t rgb8_bpp = 4ULL;
62 // ASTC is uncompressed in software, in emulated as RGBA8 61 // ASTC is uncompressed in software, in emulated as RGBA8
63 host_size_in_bytes = 0; 62 host_size_in_bytes = 0;
64 for (u32 level = 0; level < num_levels; ++level) { 63 for (u32 level = 0; level < num_levels; ++level) {
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index a3a3770a7..2ec0203d1 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -308,8 +308,6 @@ protected:
308 if (!guard_render_targets && surface->IsRenderTarget()) { 308 if (!guard_render_targets && surface->IsRenderTarget()) {
309 ManageRenderTargetUnregister(surface); 309 ManageRenderTargetUnregister(surface);
310 } 310 }
311 const GPUVAddr gpu_addr = surface->GetGpuAddr();
312 const CacheAddr cache_ptr = surface->GetCacheAddr();
313 const std::size_t size = surface->GetSizeInBytes(); 311 const std::size_t size = surface->GetSizeInBytes();
314 const VAddr cpu_addr = surface->GetCpuAddr(); 312 const VAddr cpu_addr = surface->GetCpuAddr();
315 rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1); 313 rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1);
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 7e8295944..7df5f1452 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -257,19 +257,21 @@ std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y,
257 257
258void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, 258void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
259 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, 259 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data,
260 u32 block_height_bit) { 260 u32 block_height_bit, u32 offset_x, u32 offset_y) {
261 const u32 block_height = 1U << block_height_bit; 261 const u32 block_height = 1U << block_height_bit;
262 const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) / 262 const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) /
263 gob_size_x}; 263 gob_size_x};
264 for (u32 line = 0; line < subrect_height; ++line) { 264 for (u32 line = 0; line < subrect_height; ++line) {
265 const u32 dst_y = line + offset_y;
265 const u32 gob_address_y = 266 const u32 gob_address_y =
266 (line / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + 267 (dst_y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
267 ((line % (gob_size_y * block_height)) / gob_size_y) * gob_size; 268 ((dst_y % (gob_size_y * block_height)) / gob_size_y) * gob_size;
268 const auto& table = legacy_swizzle_table[line % gob_size_y]; 269 const auto& table = legacy_swizzle_table[dst_y % gob_size_y];
269 for (u32 x = 0; x < subrect_width; ++x) { 270 for (u32 x = 0; x < subrect_width; ++x) {
271 const u32 dst_x = x + offset_x;
270 const u32 gob_address = 272 const u32 gob_address =
271 gob_address_y + (x * bytes_per_pixel / gob_size_x) * gob_size * block_height; 273 gob_address_y + (dst_x * bytes_per_pixel / gob_size_x) * gob_size * block_height;
272 const u32 swizzled_offset = gob_address + table[(x * bytes_per_pixel) % gob_size_x]; 274 const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % gob_size_x];
273 u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel; 275 u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel;
274 u8* dest_addr = swizzled_data + swizzled_offset; 276 u8* dest_addr = swizzled_data + swizzled_offset;
275 277
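
Editor's note: the new offset_x/offset_y parameters let SwizzleSubrect place the copied subrectangle anywhere inside the tiled destination instead of always at its origin: the GOB/swizzle address is derived from dst_x = x + offset_x and dst_y = line + offset_y, while the pitch-linear source is still read at (x, line). The coordinate mapping in isolation:

    // Source stays relative to the subrectangle; the destination is shifted
    // into the larger swizzled surface before the GOB address is computed.
    struct SubrectCopyCoords {
        unsigned src_x, src_y; // index into unswizzled_data (pitch-linear)
        unsigned dst_x, dst_y; // index used for the swizzled GOB address
    };

    SubrectCopyCoords MapTexel(unsigned x, unsigned line,
                               unsigned offset_x, unsigned offset_y) {
        return {x, line, x + offset_x, line + offset_y};
    }
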
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index eaec9b5a5..f1e3952bc 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -44,7 +44,8 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height
44 44
45/// Copies an untiled subrectangle into a tiled surface. 45/// Copies an untiled subrectangle into a tiled surface.
46void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, 46void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
47 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height); 47 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
48 u32 offset_x, u32 offset_y);
48 49
49/// Copies a tiled subrectangle into a linear surface. 50/// Copies a tiled subrectangle into a linear surface.
50void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width, 51void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width,
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index e3be018b9..e36bc2c04 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -213,7 +213,7 @@ struct TICEntry {
213 if (header_version != TICHeaderVersion::OneDBuffer) { 213 if (header_version != TICHeaderVersion::OneDBuffer) {
214 return width_minus_1 + 1; 214 return width_minus_1 + 1;
215 } 215 }
216 return (buffer_high_width_minus_one << 16) | buffer_low_width_minus_one; 216 return ((buffer_high_width_minus_one << 16) | buffer_low_width_minus_one) + 1;
217 } 217 }
218 218
219 u32 Height() const { 219 u32 Height() const {
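
Editor's note: the Width() fix for OneDBuffer headers is an off-by-one correction. The buffer width is stored minus one, split across a high and a low bitfield, so the +1 must be applied after the halves are recombined; the old code returned a width one texel short. Reduced to plain integers:

    #include <cstdint>

    // Buffer texture width as encoded in a OneDBuffer TIC entry: the stored
    // value is (width - 1), split into low and high bitfields.
    std::uint32_t BufferWidth(std::uint32_t high_width_minus_one,
                              std::uint32_t low_width_minus_one) {
        return ((high_width_minus_one << 16) | low_width_minus_one) + 1;
    }
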
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index 5d0fb3f9f..0456248ac 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -516,6 +516,7 @@ void Config::ReadPathValues() {
516 516
517 UISettings::values.roms_path = ReadSetting(QStringLiteral("romsPath")).toString(); 517 UISettings::values.roms_path = ReadSetting(QStringLiteral("romsPath")).toString();
518 UISettings::values.symbols_path = ReadSetting(QStringLiteral("symbolsPath")).toString(); 518 UISettings::values.symbols_path = ReadSetting(QStringLiteral("symbolsPath")).toString();
519 UISettings::values.screenshot_path = ReadSetting(QStringLiteral("screenshotPath")).toString();
519 UISettings::values.game_directory_path = 520 UISettings::values.game_directory_path =
520 ReadSetting(QStringLiteral("gameListRootDir"), QStringLiteral(".")).toString(); 521 ReadSetting(QStringLiteral("gameListRootDir"), QStringLiteral(".")).toString();
521 UISettings::values.game_directory_deepscan = 522 UISettings::values.game_directory_deepscan =
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index a7c656fdb..ac57229d5 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -119,6 +119,7 @@ Q_IMPORT_PLUGIN(QWindowsIntegrationPlugin);
119#endif 119#endif
120 120
121#ifdef _WIN32 121#ifdef _WIN32
122#include <windows.h>
122extern "C" { 123extern "C" {
123// tells Nvidia and AMD drivers to use the dedicated GPU by default on laptops with switchable 124// tells Nvidia and AMD drivers to use the dedicated GPU by default on laptops with switchable
124// graphics 125// graphics
@@ -747,6 +748,18 @@ void GMainWindow::OnDisplayTitleBars(bool show) {
747 } 748 }
748} 749}
749 750
751void GMainWindow::PreventOSSleep() {
752#ifdef _WIN32
753 SetThreadExecutionState(ES_CONTINUOUS | ES_SYSTEM_REQUIRED | ES_DISPLAY_REQUIRED);
754#endif
755}
756
757void GMainWindow::AllowOSSleep() {
758#ifdef _WIN32
759 SetThreadExecutionState(ES_CONTINUOUS);
760#endif
761}
762
750QStringList GMainWindow::GetUnsupportedGLExtensions() { 763QStringList GMainWindow::GetUnsupportedGLExtensions() {
751 QStringList unsupported_ext; 764 QStringList unsupported_ext;
752 765
@@ -966,6 +979,8 @@ void GMainWindow::BootGame(const QString& filename) {
966} 979}
967 980
968void GMainWindow::ShutdownGame() { 981void GMainWindow::ShutdownGame() {
982 AllowOSSleep();
983
969 discord_rpc->Pause(); 984 discord_rpc->Pause();
970 emu_thread->RequestStop(); 985 emu_thread->RequestStop();
971 986
@@ -1567,6 +1582,8 @@ void GMainWindow::OnMenuRecentFile() {
1567} 1582}
1568 1583
1569void GMainWindow::OnStartGame() { 1584void GMainWindow::OnStartGame() {
1585 PreventOSSleep();
1586
1570 emu_thread->SetRunning(true); 1587 emu_thread->SetRunning(true);
1571 1588
1572 qRegisterMetaType<Core::Frontend::SoftwareKeyboardParameters>( 1589 qRegisterMetaType<Core::Frontend::SoftwareKeyboardParameters>(
@@ -1598,6 +1615,8 @@ void GMainWindow::OnPauseGame() {
1598 ui.action_Pause->setEnabled(false); 1615 ui.action_Pause->setEnabled(false);
1599 ui.action_Stop->setEnabled(true); 1616 ui.action_Stop->setEnabled(true);
1600 ui.action_Capture_Screenshot->setEnabled(false); 1617 ui.action_Capture_Screenshot->setEnabled(false);
1618
1619 AllowOSSleep();
1601} 1620}
1602 1621
1603void GMainWindow::OnStopGame() { 1622void GMainWindow::OnStopGame() {
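
Editor's note: PreventOSSleep/AllowOSSleep only act on Windows. SetThreadExecutionState with ES_CONTINUOUS | ES_SYSTEM_REQUIRED | ES_DISPLAY_REQUIRED keeps both the system and the display awake until the state is reset with ES_CONTINUOUS alone, which is why ShutdownGame and OnPauseGame restore it. An equivalent RAII wrapper (a hypothetical helper, not part of this change) makes the pairing explicit:

    #ifdef _WIN32
    #include <windows.h>
    #endif

    // Keeps the machine awake for as long as an instance is alive.
    class ScopedAwake {
    public:
        ScopedAwake() {
    #ifdef _WIN32
            // ES_CONTINUOUS persists the requirement; the other flags block
            // system sleep and display power-off while emulation runs.
            SetThreadExecutionState(ES_CONTINUOUS | ES_SYSTEM_REQUIRED | ES_DISPLAY_REQUIRED);
    #endif
        }
        ~ScopedAwake() {
    #ifdef _WIN32
            // Dropping back to ES_CONTINUOUS alone clears both requirements.
            SetThreadExecutionState(ES_CONTINUOUS);
    #endif
        }
    };
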
diff --git a/src/yuzu/main.h b/src/yuzu/main.h
index 1137bbc7a..501608ddc 100644
--- a/src/yuzu/main.h
+++ b/src/yuzu/main.h
@@ -130,6 +130,9 @@ private:
130 void ConnectWidgetEvents(); 130 void ConnectWidgetEvents();
131 void ConnectMenuEvents(); 131 void ConnectMenuEvents();
132 132
133 void PreventOSSleep();
134 void AllowOSSleep();
135
133 QStringList GetUnsupportedGLExtensions(); 136 QStringList GetUnsupportedGLExtensions();
134 bool LoadROM(const QString& filename); 137 bool LoadROM(const QString& filename);
135 void BootGame(const QString& filename); 138 void BootGame(const QString& filename);
diff --git a/src/yuzu_tester/yuzu.cpp b/src/yuzu_tester/yuzu.cpp
index b589c3de3..0ee97aa54 100644
--- a/src/yuzu_tester/yuzu.cpp
+++ b/src/yuzu_tester/yuzu.cpp
@@ -92,7 +92,6 @@ int main(int argc, char** argv) {
92 92
93 int option_index = 0; 93 int option_index = 0;
94 94
95 char* endarg;
96#ifdef _WIN32 95#ifdef _WIN32
97 int argc_w; 96 int argc_w;
98 auto argv_w = CommandLineToArgvW(GetCommandLineW(), &argc_w); 97 auto argv_w = CommandLineToArgvW(GetCommandLineW(), &argc_w);
@@ -226,7 +225,7 @@ int main(int argc, char** argv) {
226 225
227 switch (load_result) { 226 switch (load_result) {
228 case Core::System::ResultStatus::ErrorGetLoader: 227 case Core::System::ResultStatus::ErrorGetLoader:
229 LOG_CRITICAL(Frontend, "Failed to obtain loader for %s!", filepath.c_str()); 228 LOG_CRITICAL(Frontend, "Failed to obtain loader for {}!", filepath);
230 return -1; 229 return -1;
231 case Core::System::ResultStatus::ErrorLoader: 230 case Core::System::ResultStatus::ErrorLoader:
232 LOG_CRITICAL(Frontend, "Failed to load ROM!"); 231 LOG_CRITICAL(Frontend, "Failed to load ROM!");