summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeModules/GenerateSCMRev.cmake1
-rw-r--r--src/common/CMakeLists.txt1
-rw-r--r--src/video_core/CMakeLists.txt5
-rw-r--r--src/video_core/buffer_cache.h299
-rw-r--r--src/video_core/buffer_cache/buffer_block.h76
-rw-r--r--src/video_core/buffer_cache/buffer_cache.h447
-rw-r--r--src/video_core/buffer_cache/map_interval.h89
-rw-r--r--src/video_core/engines/fermi_2d.cpp3
-rw-r--r--src/video_core/engines/fermi_2d.h3
-rw-r--r--src/video_core/engines/kepler_memory.cpp2
-rw-r--r--src/video_core/engines/kepler_memory.h1
-rw-r--r--src/video_core/engines/maxwell_3d.cpp22
-rw-r--r--src/video_core/engines/maxwell_3d.h2
-rw-r--r--src/video_core/engines/maxwell_dma.cpp48
-rw-r--r--src/video_core/engines/maxwell_dma.h9
-rw-r--r--src/video_core/engines/shader_bytecode.h17
-rw-r--r--src/video_core/gpu.cpp4
-rw-r--r--src/video_core/gpu.h6
-rw-r--r--src/video_core/rasterizer_interface.h2
-rw-r--r--src/video_core/renderer_opengl/gl_buffer_cache.cpp52
-rw-r--r--src/video_core/renderer_opengl/gl_buffer_cache.h39
-rw-r--r--src/video_core/renderer_opengl/gl_device.cpp3
-rw-r--r--src/video_core/renderer_opengl/gl_device.h5
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp4
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.cpp14
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.h1
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.cpp49
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.cpp4
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.h2
-rw-r--r--src/video_core/renderer_vulkan/vk_shader_decompiler.cpp25
-rw-r--r--src/video_core/shader/decode.cpp1
-rw-r--r--src/video_core/shader/decode/float_set.cpp1
-rw-r--r--src/video_core/shader/decode/float_set_predicate.cpp10
-rw-r--r--src/video_core/shader/decode/integer_set.cpp1
-rw-r--r--src/video_core/shader/decode/integer_set_predicate.cpp1
-rw-r--r--src/video_core/shader/decode/other.cpp7
-rw-r--r--src/video_core/shader/decode/predicate_set_register.cpp1
-rw-r--r--src/video_core/shader/decode/warp.cpp55
-rw-r--r--src/video_core/shader/node.h5
-rw-r--r--src/video_core/shader/shader_ir.cpp5
-rw-r--r--src/video_core/shader/shader_ir.h4
-rw-r--r--src/video_core/texture_cache/surface_params.h1
-rw-r--r--src/video_core/texture_cache/texture_cache.h2
-rw-r--r--src/video_core/textures/decoders.cpp14
-rw-r--r--src/video_core/textures/decoders.h3
-rw-r--r--src/video_core/textures/texture.h2
-rw-r--r--src/yuzu/configuration/config.cpp1
-rw-r--r--src/yuzu/main.cpp19
-rw-r--r--src/yuzu/main.h3
49 files changed, 959 insertions, 412 deletions
diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake
index abdc74428..a1ace89cb 100644
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -81,6 +81,7 @@ set(HASH_FILES
81 "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp" 81 "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"
82 "${VIDEO_CORE}/shader/decode/shift.cpp" 82 "${VIDEO_CORE}/shader/decode/shift.cpp"
83 "${VIDEO_CORE}/shader/decode/video.cpp" 83 "${VIDEO_CORE}/shader/decode/video.cpp"
84 "${VIDEO_CORE}/shader/decode/warp.cpp"
84 "${VIDEO_CORE}/shader/decode/xmad.cpp" 85 "${VIDEO_CORE}/shader/decode/xmad.cpp"
85 "${VIDEO_CORE}/shader/control_flow.cpp" 86 "${VIDEO_CORE}/shader/control_flow.cpp"
86 "${VIDEO_CORE}/shader/control_flow.h" 87 "${VIDEO_CORE}/shader/control_flow.h"
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 2b4266f29..01abdb3bb 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -55,6 +55,7 @@ add_custom_command(OUTPUT scm_rev.cpp
55 "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp" 55 "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"
56 "${VIDEO_CORE}/shader/decode/shift.cpp" 56 "${VIDEO_CORE}/shader/decode/shift.cpp"
57 "${VIDEO_CORE}/shader/decode/video.cpp" 57 "${VIDEO_CORE}/shader/decode/video.cpp"
58 "${VIDEO_CORE}/shader/decode/warp.cpp"
58 "${VIDEO_CORE}/shader/decode/xmad.cpp" 59 "${VIDEO_CORE}/shader/decode/xmad.cpp"
59 "${VIDEO_CORE}/shader/control_flow.cpp" 60 "${VIDEO_CORE}/shader/control_flow.cpp"
60 "${VIDEO_CORE}/shader/control_flow.h" 61 "${VIDEO_CORE}/shader/control_flow.h"
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 7c18c27b3..e2f85c5f1 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,5 +1,7 @@
1add_library(video_core STATIC 1add_library(video_core STATIC
2 buffer_cache.h 2 buffer_cache/buffer_block.h
3 buffer_cache/buffer_cache.h
4 buffer_cache/map_interval.h
3 dma_pusher.cpp 5 dma_pusher.cpp
4 dma_pusher.h 6 dma_pusher.h
5 debug_utils/debug_utils.cpp 7 debug_utils/debug_utils.cpp
@@ -100,6 +102,7 @@ add_library(video_core STATIC
100 shader/decode/integer_set.cpp 102 shader/decode/integer_set.cpp
101 shader/decode/half_set.cpp 103 shader/decode/half_set.cpp
102 shader/decode/video.cpp 104 shader/decode/video.cpp
105 shader/decode/warp.cpp
103 shader/decode/xmad.cpp 106 shader/decode/xmad.cpp
104 shader/decode/other.cpp 107 shader/decode/other.cpp
105 shader/control_flow.cpp 108 shader/control_flow.cpp
diff --git a/src/video_core/buffer_cache.h b/src/video_core/buffer_cache.h
deleted file mode 100644
index 6f868b8b4..000000000
--- a/src/video_core/buffer_cache.h
+++ /dev/null
@@ -1,299 +0,0 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <memory>
9#include <mutex>
10#include <unordered_map>
11#include <unordered_set>
12#include <utility>
13#include <vector>
14
15#include "common/alignment.h"
16#include "common/common_types.h"
17#include "core/core.h"
18#include "video_core/memory_manager.h"
19#include "video_core/rasterizer_cache.h"
20
21namespace VideoCore {
22class RasterizerInterface;
23}
24
25namespace VideoCommon {
26
27template <typename BufferStorageType>
28class CachedBuffer final : public RasterizerCacheObject {
29public:
30 explicit CachedBuffer(VAddr cpu_addr, u8* host_ptr)
31 : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr} {}
32 ~CachedBuffer() override = default;
33
34 VAddr GetCpuAddr() const override {
35 return cpu_addr;
36 }
37
38 std::size_t GetSizeInBytes() const override {
39 return size;
40 }
41
42 u8* GetWritableHostPtr() const {
43 return host_ptr;
44 }
45
46 std::size_t GetSize() const {
47 return size;
48 }
49
50 std::size_t GetCapacity() const {
51 return capacity;
52 }
53
54 bool IsInternalized() const {
55 return is_internal;
56 }
57
58 const BufferStorageType& GetBuffer() const {
59 return buffer;
60 }
61
62 void SetSize(std::size_t new_size) {
63 size = new_size;
64 }
65
66 void SetInternalState(bool is_internal_) {
67 is_internal = is_internal_;
68 }
69
70 BufferStorageType ExchangeBuffer(BufferStorageType buffer_, std::size_t new_capacity) {
71 capacity = new_capacity;
72 std::swap(buffer, buffer_);
73 return buffer_;
74 }
75
76private:
77 u8* host_ptr{};
78 VAddr cpu_addr{};
79 std::size_t size{};
80 std::size_t capacity{};
81 bool is_internal{};
82 BufferStorageType buffer;
83};
84
85template <typename BufferStorageType, typename BufferType, typename StreamBuffer>
86class BufferCache : public RasterizerCache<std::shared_ptr<CachedBuffer<BufferStorageType>>> {
87public:
88 using Buffer = std::shared_ptr<CachedBuffer<BufferStorageType>>;
89 using BufferInfo = std::pair<const BufferType*, u64>;
90
91 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
92 std::unique_ptr<StreamBuffer> stream_buffer)
93 : RasterizerCache<Buffer>{rasterizer}, system{system},
94 stream_buffer{std::move(stream_buffer)}, stream_buffer_handle{
95 this->stream_buffer->GetHandle()} {}
96 ~BufferCache() = default;
97
98 void Unregister(const Buffer& entry) override {
99 std::lock_guard lock{RasterizerCache<Buffer>::mutex};
100 if (entry->IsInternalized()) {
101 internalized_entries.erase(entry->GetCacheAddr());
102 }
103 ReserveBuffer(entry);
104 RasterizerCache<Buffer>::Unregister(entry);
105 }
106
107 void TickFrame() {
108 marked_for_destruction_index =
109 (marked_for_destruction_index + 1) % marked_for_destruction_ring_buffer.size();
110 MarkedForDestruction().clear();
111 }
112
113 BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
114 bool internalize = false, bool is_written = false) {
115 std::lock_guard lock{RasterizerCache<Buffer>::mutex};
116
117 auto& memory_manager = system.GPU().MemoryManager();
118 const auto host_ptr = memory_manager.GetPointer(gpu_addr);
119 if (!host_ptr) {
120 return {GetEmptyBuffer(size), 0};
121 }
122 const auto cache_addr = ToCacheAddr(host_ptr);
123
124 // Cache management is a big overhead, so only cache entries with a given size.
125 // TODO: Figure out which size is the best for given games.
126 constexpr std::size_t max_stream_size = 0x800;
127 if (!internalize && size < max_stream_size &&
128 internalized_entries.find(cache_addr) == internalized_entries.end()) {
129 return StreamBufferUpload(host_ptr, size, alignment);
130 }
131
132 auto entry = RasterizerCache<Buffer>::TryGet(cache_addr);
133 if (!entry) {
134 return FixedBufferUpload(gpu_addr, host_ptr, size, internalize, is_written);
135 }
136
137 if (entry->GetSize() < size) {
138 IncreaseBufferSize(entry, size);
139 }
140 if (is_written) {
141 entry->MarkAsModified(true, *this);
142 }
143 return {ToHandle(entry->GetBuffer()), 0};
144 }
145
146 /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
147 BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
148 std::size_t alignment = 4) {
149 std::lock_guard lock{RasterizerCache<Buffer>::mutex};
150 return StreamBufferUpload(raw_pointer, size, alignment);
151 }
152
153 void Map(std::size_t max_size) {
154 std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
155 buffer_offset = buffer_offset_base;
156 }
157
158 /// Finishes the upload stream, returns true on bindings invalidation.
159 bool Unmap() {
160 stream_buffer->Unmap(buffer_offset - buffer_offset_base);
161 return std::exchange(invalidated, false);
162 }
163
164 virtual const BufferType* GetEmptyBuffer(std::size_t size) = 0;
165
166protected:
167 void FlushObjectInner(const Buffer& entry) override {
168 DownloadBufferData(entry->GetBuffer(), 0, entry->GetSize(), entry->GetWritableHostPtr());
169 }
170
171 virtual BufferStorageType CreateBuffer(std::size_t size) = 0;
172
173 virtual const BufferType* ToHandle(const BufferStorageType& storage) = 0;
174
175 virtual void UploadBufferData(const BufferStorageType& buffer, std::size_t offset,
176 std::size_t size, const u8* data) = 0;
177
178 virtual void DownloadBufferData(const BufferStorageType& buffer, std::size_t offset,
179 std::size_t size, u8* data) = 0;
180
181 virtual void CopyBufferData(const BufferStorageType& src, const BufferStorageType& dst,
182 std::size_t src_offset, std::size_t dst_offset,
183 std::size_t size) = 0;
184
185private:
186 BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
187 std::size_t alignment) {
188 AlignBuffer(alignment);
189 const std::size_t uploaded_offset = buffer_offset;
190 std::memcpy(buffer_ptr, raw_pointer, size);
191
192 buffer_ptr += size;
193 buffer_offset += size;
194 return {&stream_buffer_handle, uploaded_offset};
195 }
196
197 BufferInfo FixedBufferUpload(GPUVAddr gpu_addr, u8* host_ptr, std::size_t size,
198 bool internalize, bool is_written) {
199 auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
200 const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
201 ASSERT(cpu_addr);
202
203 auto entry = GetUncachedBuffer(*cpu_addr, host_ptr);
204 entry->SetSize(size);
205 entry->SetInternalState(internalize);
206 RasterizerCache<Buffer>::Register(entry);
207
208 if (internalize) {
209 internalized_entries.emplace(ToCacheAddr(host_ptr));
210 }
211 if (is_written) {
212 entry->MarkAsModified(true, *this);
213 }
214
215 if (entry->GetCapacity() < size) {
216 MarkedForDestruction().push_back(entry->ExchangeBuffer(CreateBuffer(size), size));
217 }
218
219 UploadBufferData(entry->GetBuffer(), 0, size, host_ptr);
220 return {ToHandle(entry->GetBuffer()), 0};
221 }
222
223 void IncreaseBufferSize(Buffer& entry, std::size_t new_size) {
224 const std::size_t old_size = entry->GetSize();
225 if (entry->GetCapacity() < new_size) {
226 const auto& old_buffer = entry->GetBuffer();
227 auto new_buffer = CreateBuffer(new_size);
228
229 // Copy bits from the old buffer to the new buffer.
230 CopyBufferData(old_buffer, new_buffer, 0, 0, old_size);
231 MarkedForDestruction().push_back(
232 entry->ExchangeBuffer(std::move(new_buffer), new_size));
233
234 // This buffer could have been used
235 invalidated = true;
236 }
237 // Upload the new bits.
238 const std::size_t size_diff = new_size - old_size;
239 UploadBufferData(entry->GetBuffer(), old_size, size_diff, entry->GetHostPtr() + old_size);
240
241 // Update entry's size in the object and in the cache.
242 Unregister(entry);
243
244 entry->SetSize(new_size);
245 RasterizerCache<Buffer>::Register(entry);
246 }
247
248 Buffer GetUncachedBuffer(VAddr cpu_addr, u8* host_ptr) {
249 if (auto entry = TryGetReservedBuffer(host_ptr)) {
250 return entry;
251 }
252 return std::make_shared<CachedBuffer<BufferStorageType>>(cpu_addr, host_ptr);
253 }
254
255 Buffer TryGetReservedBuffer(u8* host_ptr) {
256 const auto it = buffer_reserve.find(ToCacheAddr(host_ptr));
257 if (it == buffer_reserve.end()) {
258 return {};
259 }
260 auto& reserve = it->second;
261 auto entry = reserve.back();
262 reserve.pop_back();
263 return entry;
264 }
265
266 void ReserveBuffer(Buffer entry) {
267 buffer_reserve[entry->GetCacheAddr()].push_back(std::move(entry));
268 }
269
270 void AlignBuffer(std::size_t alignment) {
271 // Align the offset, not the mapped pointer
272 const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
273 buffer_ptr += offset_aligned - buffer_offset;
274 buffer_offset = offset_aligned;
275 }
276
277 std::vector<BufferStorageType>& MarkedForDestruction() {
278 return marked_for_destruction_ring_buffer[marked_for_destruction_index];
279 }
280
281 Core::System& system;
282
283 std::unique_ptr<StreamBuffer> stream_buffer;
284 BufferType stream_buffer_handle{};
285
286 bool invalidated = false;
287
288 u8* buffer_ptr = nullptr;
289 u64 buffer_offset = 0;
290 u64 buffer_offset_base = 0;
291
292 std::size_t marked_for_destruction_index = 0;
293 std::array<std::vector<BufferStorageType>, 4> marked_for_destruction_ring_buffer;
294
295 std::unordered_set<CacheAddr> internalized_entries;
296 std::unordered_map<CacheAddr, std::vector<Buffer>> buffer_reserve;
297};
298
299} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
new file mode 100644
index 000000000..4b9193182
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -0,0 +1,76 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <unordered_set>
8#include <utility>
9
10#include "common/alignment.h"
11#include "common/common_types.h"
12#include "video_core/gpu.h"
13
14namespace VideoCommon {
15
16class BufferBlock {
17public:
18 bool Overlaps(const CacheAddr start, const CacheAddr end) const {
19 return (cache_addr < end) && (cache_addr_end > start);
20 }
21
22 bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const {
23 return cache_addr <= other_start && other_end <= cache_addr_end;
24 }
25
26 u8* GetWritableHostPtr() const {
27 return FromCacheAddr(cache_addr);
28 }
29
30 u8* GetWritableHostPtr(std::size_t offset) const {
31 return FromCacheAddr(cache_addr + offset);
32 }
33
34 std::size_t GetOffset(const CacheAddr in_addr) {
35 return static_cast<std::size_t>(in_addr - cache_addr);
36 }
37
38 CacheAddr GetCacheAddr() const {
39 return cache_addr;
40 }
41
42 CacheAddr GetCacheAddrEnd() const {
43 return cache_addr_end;
44 }
45
46 void SetCacheAddr(const CacheAddr new_addr) {
47 cache_addr = new_addr;
48 cache_addr_end = new_addr + size;
49 }
50
51 std::size_t GetSize() const {
52 return size;
53 }
54
55 void SetEpoch(u64 new_epoch) {
56 epoch = new_epoch;
57 }
58
59 u64 GetEpoch() {
60 return epoch;
61 }
62
63protected:
64 explicit BufferBlock(CacheAddr cache_addr, const std::size_t size) : size{size} {
65 SetCacheAddr(cache_addr);
66 }
67 ~BufferBlock() = default;
68
69private:
70 CacheAddr cache_addr{};
71 CacheAddr cache_addr_end{};
72 std::size_t size{};
73 u64 epoch{};
74};
75
76} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
new file mode 100644
index 000000000..2442ddfd6
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -0,0 +1,447 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <memory>
9#include <mutex>
10#include <unordered_map>
11#include <unordered_set>
12#include <utility>
13#include <vector>
14
15#include "common/alignment.h"
16#include "common/common_types.h"
17#include "core/core.h"
18#include "video_core/buffer_cache/buffer_block.h"
19#include "video_core/buffer_cache/map_interval.h"
20#include "video_core/memory_manager.h"
21#include "video_core/rasterizer_interface.h"
22
23namespace VideoCommon {
24
25using MapInterval = std::shared_ptr<MapIntervalBase>;
26
27template <typename TBuffer, typename TBufferType, typename StreamBuffer>
28class BufferCache {
29public:
30 using BufferInfo = std::pair<const TBufferType*, u64>;
31
32 BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
33 bool is_written = false) {
34 std::lock_guard lock{mutex};
35
36 auto& memory_manager = system.GPU().MemoryManager();
37 const auto host_ptr = memory_manager.GetPointer(gpu_addr);
38 if (!host_ptr) {
39 return {GetEmptyBuffer(size), 0};
40 }
41 const auto cache_addr = ToCacheAddr(host_ptr);
42
43 // Cache management is a big overhead, so only cache entries with a given size.
44 // TODO: Figure out which size is the best for given games.
45 constexpr std::size_t max_stream_size = 0x800;
46 if (size < max_stream_size) {
47 if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) {
48 return StreamBufferUpload(host_ptr, size, alignment);
49 }
50 }
51
52 auto block = GetBlock(cache_addr, size);
53 auto map = MapAddress(block, gpu_addr, cache_addr, size);
54 if (is_written) {
55 map->MarkAsModified(true, GetModifiedTicks());
56 if (!map->IsWritten()) {
57 map->MarkAsWritten(true);
58 MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
59 }
60 } else {
61 if (map->IsWritten()) {
62 WriteBarrier();
63 }
64 }
65
66 const u64 offset = static_cast<u64>(block->GetOffset(cache_addr));
67
68 return {ToHandle(block), offset};
69 }
70
71 /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
72 BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
73 std::size_t alignment = 4) {
74 std::lock_guard lock{mutex};
75 return StreamBufferUpload(raw_pointer, size, alignment);
76 }
77
78 void Map(std::size_t max_size) {
79 std::lock_guard lock{mutex};
80
81 std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
82 buffer_offset = buffer_offset_base;
83 }
84
85 /// Finishes the upload stream, returns true on bindings invalidation.
86 bool Unmap() {
87 std::lock_guard lock{mutex};
88
89 stream_buffer->Unmap(buffer_offset - buffer_offset_base);
90 return std::exchange(invalidated, false);
91 }
92
93 void TickFrame() {
94 ++epoch;
95 while (!pending_destruction.empty()) {
96 if (pending_destruction.front()->GetEpoch() + 1 > epoch) {
97 break;
98 }
99 pending_destruction.pop_front();
100 }
101 }
102
103 /// Write any cached resources overlapping the specified region back to memory
104 void FlushRegion(CacheAddr addr, std::size_t size) {
105 std::lock_guard lock{mutex};
106
107 std::vector<MapInterval> objects = GetMapsInRange(addr, size);
108 std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) {
109 return a->GetModificationTick() < b->GetModificationTick();
110 });
111 for (auto& object : objects) {
112 if (object->IsModified() && object->IsRegistered()) {
113 FlushMap(object);
114 }
115 }
116 }
117
118 /// Mark the specified region as being invalidated
119 void InvalidateRegion(CacheAddr addr, u64 size) {
120 std::lock_guard lock{mutex};
121
122 std::vector<MapInterval> objects = GetMapsInRange(addr, size);
123 for (auto& object : objects) {
124 if (object->IsRegistered()) {
125 Unregister(object);
126 }
127 }
128 }
129
130 virtual const TBufferType* GetEmptyBuffer(std::size_t size) = 0;
131
132protected:
133 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
134 std::unique_ptr<StreamBuffer> stream_buffer)
135 : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
136 stream_buffer_handle{this->stream_buffer->GetHandle()} {}
137
138 ~BufferCache() = default;
139
140 virtual const TBufferType* ToHandle(const TBuffer& storage) = 0;
141
142 virtual void WriteBarrier() = 0;
143
144 virtual TBuffer CreateBlock(CacheAddr cache_addr, std::size_t size) = 0;
145
146 virtual void UploadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size,
147 const u8* data) = 0;
148
149 virtual void DownloadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size,
150 u8* data) = 0;
151
152 virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset,
153 std::size_t dst_offset, std::size_t size) = 0;
154
155 /// Register an object into the cache
156 void Register(const MapInterval& new_map, bool inherit_written = false) {
157 const CacheAddr cache_ptr = new_map->GetStart();
158 const std::optional<VAddr> cpu_addr =
159 system.GPU().MemoryManager().GpuToCpuAddress(new_map->GetGpuAddress());
160 if (!cache_ptr || !cpu_addr) {
161 LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
162 new_map->GetGpuAddress());
163 return;
164 }
165 const std::size_t size = new_map->GetEnd() - new_map->GetStart();
166 new_map->SetCpuAddress(*cpu_addr);
167 new_map->MarkAsRegistered(true);
168 const IntervalType interval{new_map->GetStart(), new_map->GetEnd()};
169 mapped_addresses.insert({interval, new_map});
170 rasterizer.UpdatePagesCachedCount(*cpu_addr, size, 1);
171 if (inherit_written) {
172 MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1);
173 new_map->MarkAsWritten(true);
174 }
175 }
176
177 /// Unregisters an object from the cache
178 void Unregister(MapInterval& map) {
179 const std::size_t size = map->GetEnd() - map->GetStart();
180 rasterizer.UpdatePagesCachedCount(map->GetCpuAddress(), size, -1);
181 map->MarkAsRegistered(false);
182 if (map->IsWritten()) {
183 UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
184 }
185 const IntervalType delete_interval{map->GetStart(), map->GetEnd()};
186 mapped_addresses.erase(delete_interval);
187 }
188
189private:
190 MapInterval CreateMap(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr) {
191 return std::make_shared<MapIntervalBase>(start, end, gpu_addr);
192 }
193
194 MapInterval MapAddress(const TBuffer& block, const GPUVAddr gpu_addr,
195 const CacheAddr cache_addr, const std::size_t size) {
196
197 std::vector<MapInterval> overlaps = GetMapsInRange(cache_addr, size);
198 if (overlaps.empty()) {
199 const CacheAddr cache_addr_end = cache_addr + size;
200 MapInterval new_map = CreateMap(cache_addr, cache_addr_end, gpu_addr);
201 u8* host_ptr = FromCacheAddr(cache_addr);
202 UploadBlockData(block, block->GetOffset(cache_addr), size, host_ptr);
203 Register(new_map);
204 return new_map;
205 }
206
207 const CacheAddr cache_addr_end = cache_addr + size;
208 if (overlaps.size() == 1) {
209 MapInterval& current_map = overlaps[0];
210 if (current_map->IsInside(cache_addr, cache_addr_end)) {
211 return current_map;
212 }
213 }
214 CacheAddr new_start = cache_addr;
215 CacheAddr new_end = cache_addr_end;
216 bool write_inheritance = false;
217 bool modified_inheritance = false;
218 // Calculate new buffer parameters
219 for (auto& overlap : overlaps) {
220 new_start = std::min(overlap->GetStart(), new_start);
221 new_end = std::max(overlap->GetEnd(), new_end);
222 write_inheritance |= overlap->IsWritten();
223 modified_inheritance |= overlap->IsModified();
224 }
225 GPUVAddr new_gpu_addr = gpu_addr + new_start - cache_addr;
226 for (auto& overlap : overlaps) {
227 Unregister(overlap);
228 }
229 UpdateBlock(block, new_start, new_end, overlaps);
230 MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr);
231 if (modified_inheritance) {
232 new_map->MarkAsModified(true, GetModifiedTicks());
233 }
234 Register(new_map, write_inheritance);
235 return new_map;
236 }
237
238 void UpdateBlock(const TBuffer& block, CacheAddr start, CacheAddr end,
239 std::vector<MapInterval>& overlaps) {
240 const IntervalType base_interval{start, end};
241 IntervalSet interval_set{};
242 interval_set.add(base_interval);
243 for (auto& overlap : overlaps) {
244 const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()};
245 interval_set.subtract(subtract);
246 }
247 for (auto& interval : interval_set) {
248 std::size_t size = interval.upper() - interval.lower();
249 if (size > 0) {
250 u8* host_ptr = FromCacheAddr(interval.lower());
251 UploadBlockData(block, block->GetOffset(interval.lower()), size, host_ptr);
252 }
253 }
254 }
255
256 std::vector<MapInterval> GetMapsInRange(CacheAddr addr, std::size_t size) {
257 if (size == 0) {
258 return {};
259 }
260
261 std::vector<MapInterval> objects{};
262 const IntervalType interval{addr, addr + size};
263 for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) {
264 objects.push_back(pair.second);
265 }
266
267 return objects;
268 }
269
270 /// Returns a ticks counter used for tracking when cached objects were last modified
271 u64 GetModifiedTicks() {
272 return ++modified_ticks;
273 }
274
275 void FlushMap(MapInterval map) {
276 std::size_t size = map->GetEnd() - map->GetStart();
277 TBuffer block = blocks[map->GetStart() >> block_page_bits];
278 u8* host_ptr = FromCacheAddr(map->GetStart());
279 DownloadBlockData(block, block->GetOffset(map->GetStart()), size, host_ptr);
280 map->MarkAsModified(false, 0);
281 }
282
283 BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
284 std::size_t alignment) {
285 AlignBuffer(alignment);
286 const std::size_t uploaded_offset = buffer_offset;
287 std::memcpy(buffer_ptr, raw_pointer, size);
288
289 buffer_ptr += size;
290 buffer_offset += size;
291 return {&stream_buffer_handle, uploaded_offset};
292 }
293
294 void AlignBuffer(std::size_t alignment) {
295 // Align the offset, not the mapped pointer
296 const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
297 buffer_ptr += offset_aligned - buffer_offset;
298 buffer_offset = offset_aligned;
299 }
300
301 TBuffer EnlargeBlock(TBuffer buffer) {
302 const std::size_t old_size = buffer->GetSize();
303 const std::size_t new_size = old_size + block_page_size;
304 const CacheAddr cache_addr = buffer->GetCacheAddr();
305 TBuffer new_buffer = CreateBlock(cache_addr, new_size);
306 CopyBlock(buffer, new_buffer, 0, 0, old_size);
307 buffer->SetEpoch(epoch);
308 pending_destruction.push_back(buffer);
309 const CacheAddr cache_addr_end = cache_addr + new_size - 1;
310 u64 page_start = cache_addr >> block_page_bits;
311 const u64 page_end = cache_addr_end >> block_page_bits;
312 while (page_start <= page_end) {
313 blocks[page_start] = new_buffer;
314 ++page_start;
315 }
316 return new_buffer;
317 }
318
319 TBuffer MergeBlocks(TBuffer first, TBuffer second) {
320 const std::size_t size_1 = first->GetSize();
321 const std::size_t size_2 = second->GetSize();
322 const CacheAddr first_addr = first->GetCacheAddr();
323 const CacheAddr second_addr = second->GetCacheAddr();
324 const CacheAddr new_addr = std::min(first_addr, second_addr);
325 const std::size_t new_size = size_1 + size_2;
326 TBuffer new_buffer = CreateBlock(new_addr, new_size);
327 CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1);
328 CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2);
329 first->SetEpoch(epoch);
330 second->SetEpoch(epoch);
331 pending_destruction.push_back(first);
332 pending_destruction.push_back(second);
333 const CacheAddr cache_addr_end = new_addr + new_size - 1;
334 u64 page_start = new_addr >> block_page_bits;
335 const u64 page_end = cache_addr_end >> block_page_bits;
336 while (page_start <= page_end) {
337 blocks[page_start] = new_buffer;
338 ++page_start;
339 }
340 return new_buffer;
341 }
342
343 TBuffer GetBlock(const CacheAddr cache_addr, const std::size_t size) {
344 TBuffer found{};
345 const CacheAddr cache_addr_end = cache_addr + size - 1;
346 u64 page_start = cache_addr >> block_page_bits;
347 const u64 page_end = cache_addr_end >> block_page_bits;
348 while (page_start <= page_end) {
349 auto it = blocks.find(page_start);
350 if (it == blocks.end()) {
351 if (found) {
352 found = EnlargeBlock(found);
353 } else {
354 const CacheAddr start_addr = (page_start << block_page_bits);
355 found = CreateBlock(start_addr, block_page_size);
356 blocks[page_start] = found;
357 }
358 } else {
359 if (found) {
360 if (found == it->second) {
361 ++page_start;
362 continue;
363 }
364 found = MergeBlocks(found, it->second);
365 } else {
366 found = it->second;
367 }
368 }
369 ++page_start;
370 }
371 return found;
372 }
373
374 void MarkRegionAsWritten(const CacheAddr start, const CacheAddr end) {
375 u64 page_start = start >> write_page_bit;
376 const u64 page_end = end >> write_page_bit;
377 while (page_start <= page_end) {
378 auto it = written_pages.find(page_start);
379 if (it != written_pages.end()) {
380 it->second = it->second + 1;
381 } else {
382 written_pages[page_start] = 1;
383 }
384 page_start++;
385 }
386 }
387
388 void UnmarkRegionAsWritten(const CacheAddr start, const CacheAddr end) {
389 u64 page_start = start >> write_page_bit;
390 const u64 page_end = end >> write_page_bit;
391 while (page_start <= page_end) {
392 auto it = written_pages.find(page_start);
393 if (it != written_pages.end()) {
394 if (it->second > 1) {
395 it->second = it->second - 1;
396 } else {
397 written_pages.erase(it);
398 }
399 }
400 page_start++;
401 }
402 }
403
404 bool IsRegionWritten(const CacheAddr start, const CacheAddr end) const {
405 u64 page_start = start >> write_page_bit;
406 const u64 page_end = end >> write_page_bit;
407 while (page_start <= page_end) {
408 if (written_pages.count(page_start) > 0) {
409 return true;
410 }
411 page_start++;
412 }
413 return false;
414 }
415
416 VideoCore::RasterizerInterface& rasterizer;
417 Core::System& system;
418 std::unique_ptr<StreamBuffer> stream_buffer;
419
420 TBufferType stream_buffer_handle{};
421
422 bool invalidated = false;
423
424 u8* buffer_ptr = nullptr;
425 u64 buffer_offset = 0;
426 u64 buffer_offset_base = 0;
427
428 using IntervalSet = boost::icl::interval_set<CacheAddr>;
429 using IntervalCache = boost::icl::interval_map<CacheAddr, MapInterval>;
430 using IntervalType = typename IntervalCache::interval_type;
431 IntervalCache mapped_addresses{};
432
433 static constexpr u64 write_page_bit{11};
434 std::unordered_map<u64, u32> written_pages{};
435
436 static constexpr u64 block_page_bits{21};
437 static constexpr u64 block_page_size{1 << block_page_bits};
438 std::unordered_map<u64, TBuffer> blocks{};
439
440 std::list<TBuffer> pending_destruction{};
441 u64 epoch{};
442 u64 modified_ticks{};
443
444 std::recursive_mutex mutex;
445};
446
447} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
new file mode 100644
index 000000000..3a104d5cd
--- /dev/null
+++ b/src/video_core/buffer_cache/map_interval.h
@@ -0,0 +1,89 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include "common/common_types.h"
8#include "video_core/gpu.h"
9
10namespace VideoCommon {
11
12class MapIntervalBase {
13public:
14 MapIntervalBase(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr)
15 : start{start}, end{end}, gpu_addr{gpu_addr} {}
16
17 void SetCpuAddress(VAddr new_cpu_addr) {
18 cpu_addr = new_cpu_addr;
19 }
20
21 VAddr GetCpuAddress() const {
22 return cpu_addr;
23 }
24
25 GPUVAddr GetGpuAddress() const {
26 return gpu_addr;
27 }
28
29 bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const {
30 return (start <= other_start && other_end <= end);
31 }
32
33 bool operator==(const MapIntervalBase& rhs) const {
34 return std::tie(start, end) == std::tie(rhs.start, rhs.end);
35 }
36
37 bool operator!=(const MapIntervalBase& rhs) const {
38 return !operator==(rhs);
39 }
40
41 void MarkAsRegistered(const bool registered) {
42 is_registered = registered;
43 }
44
45 bool IsRegistered() const {
46 return is_registered;
47 }
48
49 CacheAddr GetStart() const {
50 return start;
51 }
52
53 CacheAddr GetEnd() const {
54 return end;
55 }
56
57 void MarkAsModified(const bool is_modified_, const u64 tick) {
58 is_modified = is_modified_;
59 ticks = tick;
60 }
61
62 bool IsModified() const {
63 return is_modified;
64 }
65
66 u64 GetModificationTick() const {
67 return ticks;
68 }
69
70 void MarkAsWritten(const bool is_written_) {
71 is_written = is_written_;
72 }
73
74 bool IsWritten() const {
75 return is_written;
76 }
77
78private:
79 CacheAddr start;
80 CacheAddr end;
81 GPUVAddr gpu_addr;
82 VAddr cpu_addr{};
83 bool is_written{};
84 bool is_modified{};
85 bool is_registered{};
86 u64 ticks{};
87};
88
89} // namespace VideoCommon
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 0ee228e28..98a8b5337 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -10,8 +10,7 @@
10 10
11namespace Tegra::Engines { 11namespace Tegra::Engines {
12 12
13Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) 13Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
14 : rasterizer{rasterizer}, memory_manager{memory_manager} {}
15 14
16void Fermi2D::CallMethod(const GPU::MethodCall& method_call) { 15void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
17 ASSERT_MSG(method_call.method < Regs::NUM_REGS, 16 ASSERT_MSG(method_call.method < Regs::NUM_REGS,
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 05421d185..0901cf2fa 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -33,7 +33,7 @@ namespace Tegra::Engines {
33 33
34class Fermi2D final { 34class Fermi2D final {
35public: 35public:
36 explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager); 36 explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer);
37 ~Fermi2D() = default; 37 ~Fermi2D() = default;
38 38
39 /// Write the value to the register identified by method. 39 /// Write the value to the register identified by method.
@@ -145,7 +145,6 @@ public:
145 145
146private: 146private:
147 VideoCore::RasterizerInterface& rasterizer; 147 VideoCore::RasterizerInterface& rasterizer;
148 MemoryManager& memory_manager;
149 148
150 /// Performs the copy from the source surface to the destination surface as configured in the 149 /// Performs the copy from the source surface to the destination surface as configured in the
151 /// registers. 150 /// registers.
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 44279de00..fa4a7c5c1 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -15,7 +15,7 @@
15namespace Tegra::Engines { 15namespace Tegra::Engines {
16 16
17KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager) 17KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager)
18 : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {} 18 : system{system}, upload_state{memory_manager, regs.upload} {}
19 19
20KeplerMemory::~KeplerMemory() = default; 20KeplerMemory::~KeplerMemory() = default;
21 21
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index f3bc675a9..e0e25c321 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -65,7 +65,6 @@ public:
65 65
66private: 66private:
67 Core::System& system; 67 Core::System& system;
68 MemoryManager& memory_manager;
69 Upload::State upload_state; 68 Upload::State upload_state;
70}; 69};
71 70
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 125c53360..f5158d219 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -249,16 +249,10 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
249 executing_macro = 0; 249 executing_macro = 0;
250 250
251 // Lookup the macro offset 251 // Lookup the macro offset
252 const u32 entry{(method - MacroRegistersStart) >> 1}; 252 const u32 entry = ((method - MacroRegistersStart) >> 1) % macro_positions.size();
253 const auto& search{macro_offsets.find(entry)};
254 if (search == macro_offsets.end()) {
255 LOG_CRITICAL(HW_GPU, "macro not found for method 0x{:X}!", method);
256 UNREACHABLE();
257 return;
258 }
259 253
260 // Execute the current macro. 254 // Execute the current macro.
261 macro_interpreter.Execute(search->second, std::move(parameters)); 255 macro_interpreter.Execute(macro_positions[entry], std::move(parameters));
262} 256}
263 257
264void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { 258void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
@@ -421,7 +415,7 @@ void Maxwell3D::ProcessMacroUpload(u32 data) {
421} 415}
422 416
423void Maxwell3D::ProcessMacroBind(u32 data) { 417void Maxwell3D::ProcessMacroBind(u32 data) {
424 macro_offsets[regs.macros.entry] = data; 418 macro_positions[regs.macros.entry++] = data;
425} 419}
426 420
427void Maxwell3D::ProcessQueryGet() { 421void Maxwell3D::ProcessQueryGet() {
@@ -524,7 +518,7 @@ void Maxwell3D::ProcessQueryCondition() {
524void Maxwell3D::ProcessSyncPoint() { 518void Maxwell3D::ProcessSyncPoint() {
525 const u32 sync_point = regs.sync_info.sync_point.Value(); 519 const u32 sync_point = regs.sync_info.sync_point.Value();
526 const u32 increment = regs.sync_info.increment.Value(); 520 const u32 increment = regs.sync_info.increment.Value();
527 const u32 cache_flush = regs.sync_info.unknown.Value(); 521 [[maybe_unused]] const u32 cache_flush = regs.sync_info.unknown.Value();
528 if (increment) { 522 if (increment) {
529 system.GPU().IncrementSyncPoint(sync_point); 523 system.GPU().IncrementSyncPoint(sync_point);
530 } 524 }
@@ -626,10 +620,10 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
626 Texture::TICEntry tic_entry; 620 Texture::TICEntry tic_entry;
627 memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); 621 memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
628 622
629 const auto r_type{tic_entry.r_type.Value()}; 623 [[maybe_unused]] const auto r_type{tic_entry.r_type.Value()};
630 const auto g_type{tic_entry.g_type.Value()}; 624 [[maybe_unused]] const auto g_type{tic_entry.g_type.Value()};
631 const auto b_type{tic_entry.b_type.Value()}; 625 [[maybe_unused]] const auto b_type{tic_entry.b_type.Value()};
632 const auto a_type{tic_entry.a_type.Value()}; 626 [[maybe_unused]] const auto a_type{tic_entry.a_type.Value()};
633 627
634 // TODO(Subv): Different data types for separate components are not supported 628 // TODO(Subv): Different data types for separate components are not supported
635 DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type); 629 DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 1ee982b76..0184342a0 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1270,7 +1270,7 @@ private:
1270 MemoryManager& memory_manager; 1270 MemoryManager& memory_manager;
1271 1271
1272 /// Start offsets of each macro in macro_memory 1272 /// Start offsets of each macro in macro_memory
1273 std::unordered_map<u32, u32> macro_offsets; 1273 std::array<u32, 0x80> macro_positions = {};
1274 1274
1275 /// Memory for macro code 1275 /// Memory for macro code
1276 MacroMemory macro_memory; 1276 MacroMemory macro_memory;
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index a28c04473..ad8453c5f 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -5,18 +5,17 @@
5#include "common/assert.h" 5#include "common/assert.h"
6#include "common/logging/log.h" 6#include "common/logging/log.h"
7#include "core/core.h" 7#include "core/core.h"
8#include "core/settings.h"
8#include "video_core/engines/maxwell_3d.h" 9#include "video_core/engines/maxwell_3d.h"
9#include "video_core/engines/maxwell_dma.h" 10#include "video_core/engines/maxwell_dma.h"
10#include "video_core/memory_manager.h" 11#include "video_core/memory_manager.h"
11#include "video_core/rasterizer_interface.h"
12#include "video_core/renderer_base.h" 12#include "video_core/renderer_base.h"
13#include "video_core/textures/decoders.h" 13#include "video_core/textures/decoders.h"
14 14
15namespace Tegra::Engines { 15namespace Tegra::Engines {
16 16
17MaxwellDMA::MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer, 17MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager)
18 MemoryManager& memory_manager) 18 : system{system}, memory_manager{memory_manager} {}
19 : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
20 19
21void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) { 20void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
22 ASSERT_MSG(method_call.method < Regs::NUM_REGS, 21 ASSERT_MSG(method_call.method < Regs::NUM_REGS,
@@ -84,13 +83,17 @@ void MaxwellDMA::HandleCopy() {
84 ASSERT(regs.exec.enable_2d == 1); 83 ASSERT(regs.exec.enable_2d == 1);
85 84
86 if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { 85 if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
87 ASSERT(regs.src_params.size_z == 1); 86 ASSERT(regs.src_params.BlockDepth() == 0);
88 // If the input is tiled and the output is linear, deswizzle the input and copy it over. 87 // If the input is tiled and the output is linear, deswizzle the input and copy it over.
89 const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x; 88 const u32 bytes_per_pixel = regs.dst_pitch / regs.x_count;
90 const std::size_t src_size = Texture::CalculateSize( 89 const std::size_t src_size = Texture::CalculateSize(
91 true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 90 true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
92 regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); 91 regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
93 92
93 const std::size_t src_layer_size = Texture::CalculateSize(
94 true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 1,
95 regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
96
94 const std::size_t dst_size = regs.dst_pitch * regs.y_count; 97 const std::size_t dst_size = regs.dst_pitch * regs.y_count;
95 98
96 if (read_buffer.size() < src_size) { 99 if (read_buffer.size() < src_size) {
@@ -104,23 +107,23 @@ void MaxwellDMA::HandleCopy() {
104 memory_manager.ReadBlock(source, read_buffer.data(), src_size); 107 memory_manager.ReadBlock(source, read_buffer.data(), src_size);
105 memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); 108 memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
106 109
107 Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch, 110 Texture::UnswizzleSubrect(
108 regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(), 111 regs.x_count, regs.y_count, regs.dst_pitch, regs.src_params.size_x, bytes_per_pixel,
109 write_buffer.data(), regs.src_params.BlockHeight(), 112 read_buffer.data() + src_layer_size * regs.src_params.pos_z, write_buffer.data(),
110 regs.src_params.pos_x, regs.src_params.pos_y); 113 regs.src_params.BlockHeight(), regs.src_params.pos_x, regs.src_params.pos_y);
111 114
112 memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); 115 memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
113 } else { 116 } else {
114 ASSERT(regs.dst_params.BlockDepth() == 0); 117 ASSERT(regs.dst_params.BlockDepth() == 0);
115 118
116 const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count; 119 const u32 bytes_per_pixel = regs.src_pitch / regs.x_count;
117 120
118 const std::size_t dst_size = Texture::CalculateSize( 121 const std::size_t dst_size = Texture::CalculateSize(
119 true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 122 true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
120 regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); 123 regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
121 124
122 const std::size_t dst_layer_size = Texture::CalculateSize( 125 const std::size_t dst_layer_size = Texture::CalculateSize(
123 true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1, 126 true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
124 regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); 127 regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
125 128
126 const std::size_t src_size = regs.src_pitch * regs.y_count; 129 const std::size_t src_size = regs.src_pitch * regs.y_count;
@@ -133,14 +136,19 @@ void MaxwellDMA::HandleCopy() {
133 write_buffer.resize(dst_size); 136 write_buffer.resize(dst_size);
134 } 137 }
135 138
136 memory_manager.ReadBlock(source, read_buffer.data(), src_size); 139 if (Settings::values.use_accurate_gpu_emulation) {
137 memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); 140 memory_manager.ReadBlock(source, read_buffer.data(), src_size);
141 memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
142 } else {
143 memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size);
144 memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size);
145 }
138 146
139 // If the input is linear and the output is tiled, swizzle the input and copy it over. 147 // If the input is linear and the output is tiled, swizzle the input and copy it over.
140 Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, 148 Texture::SwizzleSubrect(
141 src_bytes_per_pixel, 149 regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, bytes_per_pixel,
142 write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, 150 write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, read_buffer.data(),
143 read_buffer.data(), regs.dst_params.BlockHeight()); 151 regs.dst_params.BlockHeight(), regs.dst_params.pos_x, regs.dst_params.pos_y);
144 152
145 memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); 153 memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
146 } 154 }
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 17b015ca7..93808a9bb 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -20,10 +20,6 @@ namespace Tegra {
20class MemoryManager; 20class MemoryManager;
21} 21}
22 22
23namespace VideoCore {
24class RasterizerInterface;
25}
26
27namespace Tegra::Engines { 23namespace Tegra::Engines {
28 24
29/** 25/**
@@ -33,8 +29,7 @@ namespace Tegra::Engines {
33 29
34class MaxwellDMA final { 30class MaxwellDMA final {
35public: 31public:
36 explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer, 32 explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager);
37 MemoryManager& memory_manager);
38 ~MaxwellDMA() = default; 33 ~MaxwellDMA() = default;
39 34
40 /// Write the value to the register identified by method. 35 /// Write the value to the register identified by method.
@@ -180,8 +175,6 @@ public:
180private: 175private:
181 Core::System& system; 176 Core::System& system;
182 177
183 VideoCore::RasterizerInterface& rasterizer;
184
185 MemoryManager& memory_manager; 178 MemoryManager& memory_manager;
186 179
187 std::vector<u8> read_buffer; 180 std::vector<u8> read_buffer;
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 747284700..c3678b9ea 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -538,6 +538,12 @@ enum class PhysicalAttributeDirection : u64 {
538 Output = 1, 538 Output = 1,
539}; 539};
540 540
/// Operation selector of the VOTE instruction; decoded from the 2-bit field at bit 48
/// of the instruction word (Instruction::vote.operation). The trailing comments name
/// the GLSL function each value corresponds to.
541enum class VoteOperation : u64 {
542 All = 0, // allThreadsNV
543 Any = 1, // anyThreadNV
544 Eq = 2, // allThreadsEqualNV
545};
546
541union Instruction { 547union Instruction {
542 Instruction& operator=(const Instruction& instr) { 548 Instruction& operator=(const Instruction& instr) {
543 value = instr.value; 549 value = instr.value;
@@ -565,6 +571,13 @@ union Instruction {
565 } nop; 571 } nop;
566 572
567 union { 573 union {
574 BitField<48, 2, VoteOperation> operation;
575 BitField<45, 3, u64> dest_pred;
576 BitField<39, 3, u64> value;
577 BitField<42, 1, u64> negate_value;
578 } vote;
579
580 union {
568 BitField<8, 8, Register> gpr; 581 BitField<8, 8, Register> gpr;
569 BitField<20, 24, s64> offset; 582 BitField<20, 24, s64> offset;
570 } gmem; 583 } gmem;
@@ -873,6 +886,7 @@ union Instruction {
873 union { 886 union {
874 BitField<0, 3, u64> pred0; 887 BitField<0, 3, u64> pred0;
875 BitField<3, 3, u64> pred3; 888 BitField<3, 3, u64> pred3;
889 BitField<6, 1, u64> neg_b;
876 BitField<7, 1, u64> abs_a; 890 BitField<7, 1, u64> abs_a;
877 BitField<39, 3, u64> pred39; 891 BitField<39, 3, u64> pred39;
878 BitField<42, 1, u64> neg_pred; 892 BitField<42, 1, u64> neg_pred;
@@ -1493,6 +1507,7 @@ public:
1493 SYNC, 1507 SYNC,
1494 BRK, 1508 BRK,
1495 DEPBAR, 1509 DEPBAR,
1510 VOTE,
1496 BFE_C, 1511 BFE_C,
1497 BFE_R, 1512 BFE_R,
1498 BFE_IMM, 1513 BFE_IMM,
@@ -1655,6 +1670,7 @@ public:
1655 Hfma2, 1670 Hfma2,
1656 Flow, 1671 Flow,
1657 Synch, 1672 Synch,
1673 Warp,
1658 Memory, 1674 Memory,
1659 Texture, 1675 Texture,
1660 Image, 1676 Image,
@@ -1781,6 +1797,7 @@ private:
1781 INST("111000110100---", Id::BRK, Type::Flow, "BRK"), 1797 INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
1782 INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"), 1798 INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
1783 INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"), 1799 INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
1800 INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),
1784 INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"), 1801 INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
1785 INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"), 1802 INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
1786 INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"), 1803 INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index c409af194..8d9db45f5 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -35,9 +35,9 @@ GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
35 memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer); 35 memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
36 dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); 36 dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
37 maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager); 37 maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
38 fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager); 38 fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer);
39 kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager); 39 kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager);
40 maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager); 40 maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager);
41 kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager); 41 kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);
42} 42}
43 43
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 11857ff99..544340ecd 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -19,6 +19,10 @@ inline CacheAddr ToCacheAddr(const void* host_ptr) {
19 return reinterpret_cast<CacheAddr>(host_ptr); 19 return reinterpret_cast<CacheAddr>(host_ptr);
20} 20}
21 21
22inline u8* FromCacheAddr(CacheAddr cache_addr) {
23 return reinterpret_cast<u8*>(cache_addr);
24}
25
22namespace Core { 26namespace Core {
23class System; 27class System;
24} 28}
@@ -281,8 +285,8 @@ private:
281 285
282protected: 286protected:
283 std::unique_ptr<Tegra::DmaPusher> dma_pusher; 287 std::unique_ptr<Tegra::DmaPusher> dma_pusher;
284 VideoCore::RendererBase& renderer;
285 Core::System& system; 288 Core::System& system;
289 VideoCore::RendererBase& renderer;
286 290
287private: 291private:
288 std::unique_ptr<Tegra::MemoryManager> memory_manager; 292 std::unique_ptr<Tegra::MemoryManager> memory_manager;
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 6e44d51cf..6b3f2d50a 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -50,7 +50,7 @@ public:
50 /// and invalidated 50 /// and invalidated
51 virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0; 51 virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
52 52
53 // Notify the rasterizer to send all written commands to the host GPU. 53 /// Notify the rasterizer to send all written commands to the host GPU.
54 virtual void FlushCommands() = 0; 54 virtual void FlushCommands() = 0;
55 55
56 /// Notify rasterizer that a frame is about to finish 56 /// Notify rasterizer that a frame is about to finish
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 2a9b523f5..f8a807c84 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -7,28 +7,41 @@
7#include <glad/glad.h> 7#include <glad/glad.h>
8 8
9#include "common/assert.h" 9#include "common/assert.h"
10#include "common/microprofile.h"
11#include "video_core/rasterizer_interface.h"
10#include "video_core/renderer_opengl/gl_buffer_cache.h" 12#include "video_core/renderer_opengl/gl_buffer_cache.h"
11#include "video_core/renderer_opengl/gl_rasterizer.h" 13#include "video_core/renderer_opengl/gl_rasterizer.h"
12#include "video_core/renderer_opengl/gl_resource_manager.h" 14#include "video_core/renderer_opengl/gl_resource_manager.h"
13 15
14namespace OpenGL { 16namespace OpenGL {
15 17
18MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
19
20CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size)
21 : VideoCommon::BufferBlock{cache_addr, size} {
22 gl_buffer.Create();
23 glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
24}
25
26CachedBufferBlock::~CachedBufferBlock() = default;
27
16OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, 28OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
17 std::size_t stream_size) 29 std::size_t stream_size)
18 : VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer>{ 30 : VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{
19 rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {} 31 rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}
20 32
21OGLBufferCache::~OGLBufferCache() = default; 33OGLBufferCache::~OGLBufferCache() = default;
22 34
23OGLBuffer OGLBufferCache::CreateBuffer(std::size_t size) { 35Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) {
24 OGLBuffer buffer; 36 return std::make_shared<CachedBufferBlock>(cache_addr, size);
25 buffer.Create(); 37}
26 glNamedBufferData(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); 38
27 return buffer; 39void OGLBufferCache::WriteBarrier() {
40 glMemoryBarrier(GL_ALL_BARRIER_BITS);
28} 41}
29 42
30const GLuint* OGLBufferCache::ToHandle(const OGLBuffer& buffer) { 43const GLuint* OGLBufferCache::ToHandle(const Buffer& buffer) {
31 return &buffer.handle; 44 return buffer->GetHandle();
32} 45}
33 46
34const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) { 47const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
@@ -36,23 +49,24 @@ const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
36 return &null_buffer; 49 return &null_buffer;
37} 50}
38 51
39void OGLBufferCache::UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, 52void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
40 const u8* data) { 53 const u8* data) {
41 glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), 54 glNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
42 static_cast<GLsizeiptr>(size), data); 55 static_cast<GLsizeiptr>(size), data);
43} 56}
44 57
45void OGLBufferCache::DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, 58void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
46 std::size_t size, u8* data) { 59 u8* data) {
47 glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), 60 MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
61 glGetNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
48 static_cast<GLsizeiptr>(size), data); 62 static_cast<GLsizeiptr>(size), data);
49} 63}
50 64
51void OGLBufferCache::CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, 65void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
52 std::size_t src_offset, std::size_t dst_offset, 66 std::size_t dst_offset, std::size_t size) {
53 std::size_t size) { 67 glCopyNamedBufferSubData(*src->GetHandle(), *dst->GetHandle(),
54 glCopyNamedBufferSubData(src.handle, dst.handle, static_cast<GLintptr>(src_offset), 68 static_cast<GLintptr>(src_offset), static_cast<GLintptr>(dst_offset),
55 static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); 69 static_cast<GLsizeiptr>(size));
56} 70}
57 71
58} // namespace OpenGL 72} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 8c8ac4038..022e7bfa9 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -7,7 +7,7 @@
7#include <memory> 7#include <memory>
8 8
9#include "common/common_types.h" 9#include "common/common_types.h"
10#include "video_core/buffer_cache.h" 10#include "video_core/buffer_cache/buffer_cache.h"
11#include "video_core/rasterizer_cache.h" 11#include "video_core/rasterizer_cache.h"
12#include "video_core/renderer_opengl/gl_resource_manager.h" 12#include "video_core/renderer_opengl/gl_resource_manager.h"
13#include "video_core/renderer_opengl/gl_stream_buffer.h" 13#include "video_core/renderer_opengl/gl_stream_buffer.h"
@@ -21,7 +21,24 @@ namespace OpenGL {
21class OGLStreamBuffer; 21class OGLStreamBuffer;
22class RasterizerOpenGL; 22class RasterizerOpenGL;
23 23
24class OGLBufferCache final : public VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer> { 24class CachedBufferBlock;
25
26using Buffer = std::shared_ptr<CachedBufferBlock>;
27
/// OpenGL buffer-cache block: a VideoCommon::BufferBlock that also owns an OGLBuffer.
28class CachedBufferBlock : public VideoCommon::BufferBlock {
29public:
 /// @param cache_addr Cache address forwarded to the BufferBlock base.
 /// @param size       Size forwarded to the BufferBlock base.
30 explicit CachedBufferBlock(CacheAddr cache_addr, const std::size_t size);
31 ~CachedBufferBlock();
32
 /// Returns the address of the handle stored inside the owned OGLBuffer. The pointer
 /// refers to a member of this object, so it must not be used after the block is destroyed.
33 const GLuint* GetHandle() const {
34 return &gl_buffer.handle;
35 }
36
37private:
38 OGLBuffer gl_buffer{};
39};
40
41class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> {
25public: 42public:
26 explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, 43 explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
27 std::size_t stream_size); 44 std::size_t stream_size);
@@ -30,18 +47,20 @@ public:
30 const GLuint* GetEmptyBuffer(std::size_t) override; 47 const GLuint* GetEmptyBuffer(std::size_t) override;
31 48
32protected: 49protected:
33 OGLBuffer CreateBuffer(std::size_t size) override; 50 Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override;
51
52 void WriteBarrier() override;
34 53
35 const GLuint* ToHandle(const OGLBuffer& buffer) override; 54 const GLuint* ToHandle(const Buffer& buffer) override;
36 55
37 void UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, 56 void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
38 const u8* data) override; 57 const u8* data) override;
39 58
40 void DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, 59 void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
41 u8* data) override; 60 u8* data) override;
42 61
43 void CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, std::size_t src_offset, 62 void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
44 std::size_t dst_offset, std::size_t size) override; 63 std::size_t dst_offset, std::size_t size) override;
45}; 64};
46 65
47} // namespace OpenGL 66} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 85424a4c9..03d434b28 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -27,6 +27,8 @@ Device::Device() {
27 shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); 27 shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
28 max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); 28 max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
29 max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); 29 max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
30 has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group &&
31 GLAD_GL_NV_shader_thread_shuffle;
30 has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; 32 has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
31 has_variable_aoffi = TestVariableAoffi(); 33 has_variable_aoffi = TestVariableAoffi();
32 has_component_indexing_bug = TestComponentIndexingBug(); 34 has_component_indexing_bug = TestComponentIndexingBug();
@@ -36,6 +38,7 @@ Device::Device(std::nullptr_t) {
36 uniform_buffer_alignment = 0; 38 uniform_buffer_alignment = 0;
37 max_vertex_attributes = 16; 39 max_vertex_attributes = 16;
38 max_varyings = 15; 40 max_varyings = 15;
41 has_warp_intrinsics = true;
39 has_vertex_viewport_layer = true; 42 has_vertex_viewport_layer = true;
40 has_variable_aoffi = true; 43 has_variable_aoffi = true;
41 has_component_indexing_bug = false; 44 has_component_indexing_bug = false;
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index dc883722d..3ef7c6dd8 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -30,6 +30,10 @@ public:
30 return max_varyings; 30 return max_varyings;
31 } 31 }
32 32
33 bool HasWarpIntrinsics() const {
34 return has_warp_intrinsics;
35 }
36
33 bool HasVertexViewportLayer() const { 37 bool HasVertexViewportLayer() const {
34 return has_vertex_viewport_layer; 38 return has_vertex_viewport_layer;
35 } 39 }
@@ -50,6 +54,7 @@ private:
50 std::size_t shader_storage_alignment{}; 54 std::size_t shader_storage_alignment{};
51 u32 max_vertex_attributes{}; 55 u32 max_vertex_attributes{};
52 u32 max_varyings{}; 56 u32 max_varyings{};
57 bool has_warp_intrinsics{};
53 bool has_vertex_viewport_layer{}; 58 bool has_vertex_viewport_layer{};
54 bool has_variable_aoffi{}; 59 bool has_variable_aoffi{};
55 bool has_component_indexing_bug{}; 60 bool has_component_indexing_bug{};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 80cfda7e4..bb09ecd52 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -708,8 +708,6 @@ void RasterizerOpenGL::DrawArrays() {
708 return; 708 return;
709 } 709 }
710 710
711 const auto& regs = gpu.regs;
712
713 SyncColorMask(); 711 SyncColorMask();
714 SyncFragmentColorClampState(); 712 SyncFragmentColorClampState();
715 SyncMultiSampleState(); 713 SyncMultiSampleState();
@@ -980,7 +978,7 @@ void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entr
980 GPUVAddr gpu_addr, std::size_t size) { 978 GPUVAddr gpu_addr, std::size_t size) {
981 const auto alignment{device.GetShaderStorageBufferAlignment()}; 979 const auto alignment{device.GetShaderStorageBufferAlignment()};
982 const auto [ssbo, buffer_offset] = 980 const auto [ssbo, buffer_offset] =
983 buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten()); 981 buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten());
984 bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); 982 bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
985} 983}
986 984
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 1c90facc3..cf6a5cddf 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -212,7 +212,9 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
212 const auto texture_buffer_usage{variant.texture_buffer_usage}; 212 const auto texture_buffer_usage{variant.texture_buffer_usage};
213 213
214 std::string source = "#version 430 core\n" 214 std::string source = "#version 430 core\n"
215 "#extension GL_ARB_separate_shader_objects : enable\n"; 215 "#extension GL_ARB_separate_shader_objects : enable\n"
216 "#extension GL_NV_gpu_shader5 : enable\n"
217 "#extension GL_NV_shader_thread_group : enable\n";
216 if (entries.shader_viewport_layer_array) { 218 if (entries.shader_viewport_layer_array) {
217 source += "#extension GL_ARB_shader_viewport_layer_array : enable\n"; 219 source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
218 } 220 }
@@ -247,20 +249,24 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
247 if (!texture_buffer_usage.test(i)) { 249 if (!texture_buffer_usage.test(i)) {
248 continue; 250 continue;
249 } 251 }
250 source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i); 252 source += fmt::format("#define SAMPLER_{}_IS_BUFFER\n", i);
253 }
254 if (texture_buffer_usage.any()) {
255 source += '\n';
251 } 256 }
252 257
253 if (program_type == ProgramType::Geometry) { 258 if (program_type == ProgramType::Geometry) {
254 const auto [glsl_topology, debug_name, max_vertices] = 259 const auto [glsl_topology, debug_name, max_vertices] =
255 GetPrimitiveDescription(primitive_mode); 260 GetPrimitiveDescription(primitive_mode);
256 261
257 source += "layout (" + std::string(glsl_topology) + ") in;\n"; 262 source += "layout (" + std::string(glsl_topology) + ") in;\n\n";
258 source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n'; 263 source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
259 } 264 }
260 if (program_type == ProgramType::Compute) { 265 if (program_type == ProgramType::Compute) {
261 source += "layout (local_size_variable) in;\n"; 266 source += "layout (local_size_variable) in;\n";
262 } 267 }
263 268
269 source += '\n';
264 source += code; 270 source += code;
265 271
266 OGLShader shader; 272 OGLShader shader;
@@ -289,7 +295,7 @@ std::set<GLenum> GetSupportedFormats() {
289 295
290CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type, 296CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type,
291 GLShader::ProgramResult result) 297 GLShader::ProgramResult result)
292 : RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr}, 298 : RasterizerCacheObject{params.host_ptr}, cpu_addr{params.cpu_addr},
293 unique_identifier{params.unique_identifier}, program_type{program_type}, 299 unique_identifier{params.unique_identifier}, program_type{program_type},
294 disk_cache{params.disk_cache}, precompiled_programs{params.precompiled_programs}, 300 disk_cache{params.disk_cache}, precompiled_programs{params.precompiled_programs},
295 entries{result.second}, code{std::move(result.first)}, shader_length{entries.shader_length} {} 301 entries{result.second}, code{std::move(result.first)}, shader_length{entries.shader_length} {}
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index a3106a0ff..2c8faf855 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -106,7 +106,6 @@ private:
106 106
107 ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant) const; 107 ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant) const;
108 108
109 u8* host_ptr{};
110 VAddr cpu_addr{}; 109 VAddr cpu_addr{};
111 u64 unique_identifier{}; 110 u64 unique_identifier{};
112 ProgramType program_type{}; 111 ProgramType program_type{};
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index d8f722c26..359d58cbe 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -565,7 +565,7 @@ private:
565 case Tegra::Shader::ImageType::Texture1D: 565 case Tegra::Shader::ImageType::Texture1D:
566 return "image1D"; 566 return "image1D";
567 case Tegra::Shader::ImageType::TextureBuffer: 567 case Tegra::Shader::ImageType::TextureBuffer:
568 return "bufferImage"; 568 return "imageBuffer";
569 case Tegra::Shader::ImageType::Texture1DArray: 569 case Tegra::Shader::ImageType::Texture1DArray:
570 return "image1DArray"; 570 return "image1DArray";
571 case Tegra::Shader::ImageType::Texture2D: 571 case Tegra::Shader::ImageType::Texture2D:
@@ -1735,6 +1735,48 @@ private:
1735 return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')'; 1735 return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';
1736 } 1736 }
1737 1737
1738 std::string BallotThread(Operation operation) {
1739 const std::string value = VisitOperand(operation, 0, Type::Bool);
1740 if (!device.HasWarpIntrinsics()) {
1741 LOG_ERROR(Render_OpenGL,
1742 "Nvidia warp intrinsics are not available and its required by a shader");
1743 // Stub on non-Nvidia devices by simulating all threads voting the same as the active
1744 // one.
1745 return fmt::format("utof({} ? 0xFFFFFFFFU : 0U)", value);
1746 }
1747 return fmt::format("utof(ballotThreadNV({}))", value);
1748 }
1749
1750 std::string Vote(Operation operation, const char* func) {
1751 const std::string value = VisitOperand(operation, 0, Type::Bool);
1752 if (!device.HasWarpIntrinsics()) {
1753 LOG_ERROR(Render_OpenGL,
1754 "Nvidia vote intrinsics are not available and its required by a shader");
1755 // Stub with a warp size of one.
1756 return value;
1757 }
1758 return fmt::format("{}({})", func, value);
1759 }
1760
1761 std::string VoteAll(Operation operation) {
1762 return Vote(operation, "allThreadsNV");
1763 }
1764
1765 std::string VoteAny(Operation operation) {
1766 return Vote(operation, "anyThreadNV");
1767 }
1768
1769 std::string VoteEqual(Operation operation) {
1770 if (!device.HasWarpIntrinsics()) {
1771 LOG_ERROR(Render_OpenGL,
1772 "Nvidia vote intrinsics are not available and its required by a shader");
1773 // We must return true here since a stub for a theoretical warp size of 1 will always
1774 // return an equal result for all its votes.
1775 return "true";
1776 }
1777 return Vote(operation, "allThreadsEqualNV");
1778 }
1779
1738 static constexpr std::array operation_decompilers = { 1780 static constexpr std::array operation_decompilers = {
1739 &GLSLDecompiler::Assign, 1781 &GLSLDecompiler::Assign,
1740 1782
@@ -1885,6 +1927,11 @@ private:
1885 &GLSLDecompiler::WorkGroupId<0>, 1927 &GLSLDecompiler::WorkGroupId<0>,
1886 &GLSLDecompiler::WorkGroupId<1>, 1928 &GLSLDecompiler::WorkGroupId<1>,
1887 &GLSLDecompiler::WorkGroupId<2>, 1929 &GLSLDecompiler::WorkGroupId<2>,
1930
1931 &GLSLDecompiler::BallotThread,
1932 &GLSLDecompiler::VoteAll,
1933 &GLSLDecompiler::VoteAny,
1934 &GLSLDecompiler::VoteEqual,
1888 }; 1935 };
1889 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); 1936 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
1890 1937
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 408332f90..4f135fe03 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -184,6 +184,9 @@ GLint GetSwizzleSource(SwizzleSource source) {
184} 184}
185 185
186void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { 186void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) {
187 if (params.IsBuffer()) {
188 return;
189 }
187 glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR); 190 glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
188 glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR); 191 glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
189 glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); 192 glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
@@ -208,6 +211,7 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
208 glNamedBufferStorage(texture_buffer.handle, params.width * params.GetBytesPerPixel(), 211 glNamedBufferStorage(texture_buffer.handle, params.width * params.GetBytesPerPixel(),
209 nullptr, GL_DYNAMIC_STORAGE_BIT); 212 nullptr, GL_DYNAMIC_STORAGE_BIT);
210 glTextureBuffer(texture.handle, internal_format, texture_buffer.handle); 213 glTextureBuffer(texture.handle, internal_format, texture_buffer.handle);
214 break;
211 case SurfaceTarget::Texture2D: 215 case SurfaceTarget::Texture2D:
212 case SurfaceTarget::TextureCubemap: 216 case SurfaceTarget::TextureCubemap:
213 glTextureStorage2D(texture.handle, params.emulated_levels, internal_format, params.width, 217 glTextureStorage2D(texture.handle, params.emulated_levels, internal_format, params.width,
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index ff6ab6988..21324488a 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -51,7 +51,7 @@ public:
51 } 51 }
52 52
53protected: 53protected:
54 void DecorateSurfaceName(); 54 void DecorateSurfaceName() override;
55 55
56 View CreateView(const ViewParams& view_key) override; 56 View CreateView(const ViewParams& view_key) override;
57 View CreateViewInner(const ViewParams& view_key, bool is_proxy); 57 View CreateViewInner(const ViewParams& view_key, bool is_proxy);
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 24a591797..a35b45c9c 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -1072,6 +1072,26 @@ private:
1072 return {}; 1072 return {};
1073 } 1073 }
1074 1074
1075 Id BallotThread(Operation) {
1076 UNIMPLEMENTED();
1077 return {};
1078 }
1079
1080 Id VoteAll(Operation) {
1081 UNIMPLEMENTED();
1082 return {};
1083 }
1084
1085 Id VoteAny(Operation) {
1086 UNIMPLEMENTED();
1087 return {};
1088 }
1089
1090 Id VoteEqual(Operation) {
1091 UNIMPLEMENTED();
1092 return {};
1093 }
1094
1075 Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type, 1095 Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,
1076 const std::string& name) { 1096 const std::string& name) {
1077 const Id id = OpVariable(type, storage); 1097 const Id id = OpVariable(type, storage);
@@ -1364,6 +1384,11 @@ private:
1364 &SPIRVDecompiler::WorkGroupId<0>, 1384 &SPIRVDecompiler::WorkGroupId<0>,
1365 &SPIRVDecompiler::WorkGroupId<1>, 1385 &SPIRVDecompiler::WorkGroupId<1>,
1366 &SPIRVDecompiler::WorkGroupId<2>, 1386 &SPIRVDecompiler::WorkGroupId<2>,
1387
1388 &SPIRVDecompiler::BallotThread,
1389 &SPIRVDecompiler::VoteAll,
1390 &SPIRVDecompiler::VoteAny,
1391 &SPIRVDecompiler::VoteEqual,
1367 }; 1392 };
1368 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); 1393 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
1369 1394
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index b547d8323..47a9fd961 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -176,6 +176,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
176 {OpCode::Type::Ffma, &ShaderIR::DecodeFfma}, 176 {OpCode::Type::Ffma, &ShaderIR::DecodeFfma},
177 {OpCode::Type::Hfma2, &ShaderIR::DecodeHfma2}, 177 {OpCode::Type::Hfma2, &ShaderIR::DecodeHfma2},
178 {OpCode::Type::Conversion, &ShaderIR::DecodeConversion}, 178 {OpCode::Type::Conversion, &ShaderIR::DecodeConversion},
179 {OpCode::Type::Warp, &ShaderIR::DecodeWarp},
179 {OpCode::Type::Memory, &ShaderIR::DecodeMemory}, 180 {OpCode::Type::Memory, &ShaderIR::DecodeMemory},
180 {OpCode::Type::Texture, &ShaderIR::DecodeTexture}, 181 {OpCode::Type::Texture, &ShaderIR::DecodeTexture},
181 {OpCode::Type::Image, &ShaderIR::DecodeImage}, 182 {OpCode::Type::Image, &ShaderIR::DecodeImage},
diff --git a/src/video_core/shader/decode/float_set.cpp b/src/video_core/shader/decode/float_set.cpp
index f5013e44a..5614e8a0d 100644
--- a/src/video_core/shader/decode/float_set.cpp
+++ b/src/video_core/shader/decode/float_set.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;
15 15
16u32 ShaderIR::DecodeFloatSet(NodeBlock& bb, u32 pc) { 16u32 ShaderIR::DecodeFloatSet(NodeBlock& bb, u32 pc) {
17 const Instruction instr = {program_code[pc]}; 17 const Instruction instr = {program_code[pc]};
18 const auto opcode = OpCode::Decode(instr);
19 18
20 const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fset.abs_a != 0, 19 const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fset.abs_a != 0,
21 instr.fset.neg_a != 0); 20 instr.fset.neg_a != 0);
diff --git a/src/video_core/shader/decode/float_set_predicate.cpp b/src/video_core/shader/decode/float_set_predicate.cpp
index 2323052b0..200c2c983 100644
--- a/src/video_core/shader/decode/float_set_predicate.cpp
+++ b/src/video_core/shader/decode/float_set_predicate.cpp
@@ -16,10 +16,9 @@ using Tegra::Shader::Pred;
16 16
17u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) { 17u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {
18 const Instruction instr = {program_code[pc]}; 18 const Instruction instr = {program_code[pc]};
19 const auto opcode = OpCode::Decode(instr);
20 19
21 const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0, 20 Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0,
22 instr.fsetp.neg_a != 0); 21 instr.fsetp.neg_a != 0);
23 Node op_b = [&]() { 22 Node op_b = [&]() {
24 if (instr.is_b_imm) { 23 if (instr.is_b_imm) {
25 return GetImmediate19(instr); 24 return GetImmediate19(instr);
@@ -29,12 +28,13 @@ u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {
29 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); 28 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
30 } 29 }
31 }(); 30 }();
32 op_b = GetOperandAbsNegFloat(op_b, instr.fsetp.abs_b, false); 31 op_b = GetOperandAbsNegFloat(std::move(op_b), instr.fsetp.abs_b, instr.fsetp.neg_b);
33 32
34 // We can't use the constant predicate as destination. 33 // We can't use the constant predicate as destination.
35 ASSERT(instr.fsetp.pred3 != static_cast<u64>(Pred::UnusedIndex)); 34 ASSERT(instr.fsetp.pred3 != static_cast<u64>(Pred::UnusedIndex));
36 35
37 const Node predicate = GetPredicateComparisonFloat(instr.fsetp.cond, op_a, op_b); 36 const Node predicate =
37 GetPredicateComparisonFloat(instr.fsetp.cond, std::move(op_a), std::move(op_b));
38 const Node second_pred = GetPredicate(instr.fsetp.pred39, instr.fsetp.neg_pred != 0); 38 const Node second_pred = GetPredicate(instr.fsetp.pred39, instr.fsetp.neg_pred != 0);
39 39
40 const OperationCode combiner = GetPredicateCombiner(instr.fsetp.op); 40 const OperationCode combiner = GetPredicateCombiner(instr.fsetp.op);
diff --git a/src/video_core/shader/decode/integer_set.cpp b/src/video_core/shader/decode/integer_set.cpp
index 46e3d5905..59809bcd8 100644
--- a/src/video_core/shader/decode/integer_set.cpp
+++ b/src/video_core/shader/decode/integer_set.cpp
@@ -14,7 +14,6 @@ using Tegra::Shader::OpCode;
14 14
15u32 ShaderIR::DecodeIntegerSet(NodeBlock& bb, u32 pc) { 15u32 ShaderIR::DecodeIntegerSet(NodeBlock& bb, u32 pc) {
16 const Instruction instr = {program_code[pc]}; 16 const Instruction instr = {program_code[pc]};
17 const auto opcode = OpCode::Decode(instr);
18 17
19 const Node op_a = GetRegister(instr.gpr8); 18 const Node op_a = GetRegister(instr.gpr8);
20 const Node op_b = [&]() { 19 const Node op_b = [&]() {
diff --git a/src/video_core/shader/decode/integer_set_predicate.cpp b/src/video_core/shader/decode/integer_set_predicate.cpp
index dd20775d7..25e48fef8 100644
--- a/src/video_core/shader/decode/integer_set_predicate.cpp
+++ b/src/video_core/shader/decode/integer_set_predicate.cpp
@@ -16,7 +16,6 @@ using Tegra::Shader::Pred;
16 16
17u32 ShaderIR::DecodeIntegerSetPredicate(NodeBlock& bb, u32 pc) { 17u32 ShaderIR::DecodeIntegerSetPredicate(NodeBlock& bb, u32 pc) {
18 const Instruction instr = {program_code[pc]}; 18 const Instruction instr = {program_code[pc]};
19 const auto opcode = OpCode::Decode(instr);
20 19
21 const Node op_a = GetRegister(instr.gpr8); 20 const Node op_a = GetRegister(instr.gpr8);
22 21
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index ac0e764d6..d46e0f823 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -74,6 +74,13 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
74 case SystemVariable::InvocationInfo: 74 case SystemVariable::InvocationInfo:
75 LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete"); 75 LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete");
76 return Immediate(0u); 76 return Immediate(0u);
77 case SystemVariable::Tid: {
78 Node value = Immediate(0);
79 value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdX), 0, 9);
80 value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdY), 16, 9);
81 value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdZ), 26, 5);
82 return value;
83 }
77 case SystemVariable::TidX: 84 case SystemVariable::TidX:
78 return Operation(OperationCode::LocalInvocationIdX); 85 return Operation(OperationCode::LocalInvocationIdX);
79 case SystemVariable::TidY: 86 case SystemVariable::TidY:
diff --git a/src/video_core/shader/decode/predicate_set_register.cpp b/src/video_core/shader/decode/predicate_set_register.cpp
index febbfeb50..84dbc50fe 100644
--- a/src/video_core/shader/decode/predicate_set_register.cpp
+++ b/src/video_core/shader/decode/predicate_set_register.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;
15 15
16u32 ShaderIR::DecodePredicateSetRegister(NodeBlock& bb, u32 pc) { 16u32 ShaderIR::DecodePredicateSetRegister(NodeBlock& bb, u32 pc) {
17 const Instruction instr = {program_code[pc]}; 17 const Instruction instr = {program_code[pc]};
18 const auto opcode = OpCode::Decode(instr);
19 18
20 UNIMPLEMENTED_IF_MSG(instr.generates_cc, 19 UNIMPLEMENTED_IF_MSG(instr.generates_cc,
21 "Condition codes generation in PSET is not implemented"); 20 "Condition codes generation in PSET is not implemented");
diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp
new file mode 100644
index 000000000..04ca74f46
--- /dev/null
+++ b/src/video_core/shader/decode/warp.cpp
@@ -0,0 +1,55 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "common/assert.h"
6#include "common/common_types.h"
7#include "video_core/engines/shader_bytecode.h"
8#include "video_core/shader/node_helper.h"
9#include "video_core/shader/shader_ir.h"
10
11namespace VideoCommon::Shader {
12
13using Tegra::Shader::Instruction;
14using Tegra::Shader::OpCode;
15using Tegra::Shader::Pred;
16using Tegra::Shader::VoteOperation;
17
18namespace {
19OperationCode GetOperationCode(VoteOperation vote_op) {
20 switch (vote_op) {
21 case VoteOperation::All:
22 return OperationCode::VoteAll;
23 case VoteOperation::Any:
24 return OperationCode::VoteAny;
25 case VoteOperation::Eq:
26 return OperationCode::VoteEqual;
27 default:
28 UNREACHABLE_MSG("Invalid vote operation={}", static_cast<u64>(vote_op));
29 return OperationCode::VoteAll;
30 }
31}
32} // Anonymous namespace
33
34u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) {
35 const Instruction instr = {program_code[pc]};
36 const auto opcode = OpCode::Decode(instr);
37
38 switch (opcode->get().GetId()) {
39 case OpCode::Id::VOTE: {
40 const Node value = GetPredicate(instr.vote.value, instr.vote.negate_value != 0);
41 const Node active = Operation(OperationCode::BallotThread, value);
42 const Node vote = Operation(GetOperationCode(instr.vote.operation), value);
43 SetRegister(bb, instr.gpr0, active);
44 SetPredicate(bb, instr.vote.dest_pred, vote);
45 break;
46 }
47 default:
48 UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName());
49 break;
50 }
51
52 return pc;
53}
54
55} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 5f0852364..5db9313c4 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -168,6 +168,11 @@ enum class OperationCode {
168 WorkGroupIdY, /// () -> uint 168 WorkGroupIdY, /// () -> uint
169 WorkGroupIdZ, /// () -> uint 169 WorkGroupIdZ, /// () -> uint
170 170
171 BallotThread, /// (bool) -> uint
172 VoteAll, /// (bool) -> bool
173 VoteAny, /// (bool) -> bool
174 VoteEqual, /// (bool) -> bool
175
171 Amount, 176 Amount,
172}; 177};
173 178
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 5e91fe129..1e5c7f660 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -405,4 +405,9 @@ Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) {
405 Immediate(offset), Immediate(bits)); 405 Immediate(offset), Immediate(bits));
406} 406}
407 407
408Node ShaderIR::BitfieldInsert(Node base, Node insert, u32 offset, u32 bits) {
409 return Operation(OperationCode::UBitfieldInsert, NO_PRECISE, base, insert, Immediate(offset),
410 Immediate(bits));
411}
412
408} // namespace VideoCommon::Shader 413} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 59a083d90..bcc9b79b6 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -167,6 +167,7 @@ private:
167 u32 DecodeFfma(NodeBlock& bb, u32 pc); 167 u32 DecodeFfma(NodeBlock& bb, u32 pc);
168 u32 DecodeHfma2(NodeBlock& bb, u32 pc); 168 u32 DecodeHfma2(NodeBlock& bb, u32 pc);
169 u32 DecodeConversion(NodeBlock& bb, u32 pc); 169 u32 DecodeConversion(NodeBlock& bb, u32 pc);
170 u32 DecodeWarp(NodeBlock& bb, u32 pc);
170 u32 DecodeMemory(NodeBlock& bb, u32 pc); 171 u32 DecodeMemory(NodeBlock& bb, u32 pc);
171 u32 DecodeTexture(NodeBlock& bb, u32 pc); 172 u32 DecodeTexture(NodeBlock& bb, u32 pc);
172 u32 DecodeImage(NodeBlock& bb, u32 pc); 173 u32 DecodeImage(NodeBlock& bb, u32 pc);
@@ -279,6 +280,9 @@ private:
279 /// Extracts a sequence of bits from a node 280 /// Extracts a sequence of bits from a node
280 Node BitfieldExtract(Node value, u32 offset, u32 bits); 281 Node BitfieldExtract(Node value, u32 offset, u32 bits);
281 282
283 /// Inserts a sequence of bits from a node
284 Node BitfieldInsert(Node base, Node insert, u32 offset, u32 bits);
285
282 void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr, 286 void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
283 const Node4& components); 287 const Node4& components);
284 288
diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h
index 358d6757c..e7ef66ee2 100644
--- a/src/video_core/texture_cache/surface_params.h
+++ b/src/video_core/texture_cache/surface_params.h
@@ -58,7 +58,6 @@ public:
58 std::size_t GetHostSizeInBytes() const { 58 std::size_t GetHostSizeInBytes() const {
59 std::size_t host_size_in_bytes; 59 std::size_t host_size_in_bytes;
60 if (GetCompressionType() == SurfaceCompression::Converted) { 60 if (GetCompressionType() == SurfaceCompression::Converted) {
61 constexpr std::size_t rgb8_bpp = 4ULL;
62 // ASTC is uncompressed in software, in emulated as RGBA8 61 // ASTC is uncompressed in software, in emulated as RGBA8
63 host_size_in_bytes = 0; 62 host_size_in_bytes = 0;
64 for (u32 level = 0; level < num_levels; ++level) { 63 for (u32 level = 0; level < num_levels; ++level) {
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index a3a3770a7..2ec0203d1 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -308,8 +308,6 @@ protected:
308 if (!guard_render_targets && surface->IsRenderTarget()) { 308 if (!guard_render_targets && surface->IsRenderTarget()) {
309 ManageRenderTargetUnregister(surface); 309 ManageRenderTargetUnregister(surface);
310 } 310 }
311 const GPUVAddr gpu_addr = surface->GetGpuAddr();
312 const CacheAddr cache_ptr = surface->GetCacheAddr();
313 const std::size_t size = surface->GetSizeInBytes(); 311 const std::size_t size = surface->GetSizeInBytes();
314 const VAddr cpu_addr = surface->GetCpuAddr(); 312 const VAddr cpu_addr = surface->GetCpuAddr();
315 rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1); 313 rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1);
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 7e8295944..7df5f1452 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -257,19 +257,21 @@ std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y,
257 257
258void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, 258void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
259 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, 259 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data,
260 u32 block_height_bit) { 260 u32 block_height_bit, u32 offset_x, u32 offset_y) {
261 const u32 block_height = 1U << block_height_bit; 261 const u32 block_height = 1U << block_height_bit;
262 const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) / 262 const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) /
263 gob_size_x}; 263 gob_size_x};
264 for (u32 line = 0; line < subrect_height; ++line) { 264 for (u32 line = 0; line < subrect_height; ++line) {
265 const u32 dst_y = line + offset_y;
265 const u32 gob_address_y = 266 const u32 gob_address_y =
266 (line / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + 267 (dst_y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
267 ((line % (gob_size_y * block_height)) / gob_size_y) * gob_size; 268 ((dst_y % (gob_size_y * block_height)) / gob_size_y) * gob_size;
268 const auto& table = legacy_swizzle_table[line % gob_size_y]; 269 const auto& table = legacy_swizzle_table[dst_y % gob_size_y];
269 for (u32 x = 0; x < subrect_width; ++x) { 270 for (u32 x = 0; x < subrect_width; ++x) {
271 const u32 dst_x = x + offset_x;
270 const u32 gob_address = 272 const u32 gob_address =
271 gob_address_y + (x * bytes_per_pixel / gob_size_x) * gob_size * block_height; 273 gob_address_y + (dst_x * bytes_per_pixel / gob_size_x) * gob_size * block_height;
272 const u32 swizzled_offset = gob_address + table[(x * bytes_per_pixel) % gob_size_x]; 274 const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % gob_size_x];
273 u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel; 275 u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel;
274 u8* dest_addr = swizzled_data + swizzled_offset; 276 u8* dest_addr = swizzled_data + swizzled_offset;
275 277
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index eaec9b5a5..f1e3952bc 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -44,7 +44,8 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height
44 44
45/// Copies an untiled subrectangle into a tiled surface. 45/// Copies an untiled subrectangle into a tiled surface.
46void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, 46void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
47 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height); 47 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
48 u32 offset_x, u32 offset_y);
48 49
49/// Copies a tiled subrectangle into a linear surface. 50/// Copies a tiled subrectangle into a linear surface.
50void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width, 51void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width,
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index e3be018b9..e36bc2c04 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -213,7 +213,7 @@ struct TICEntry {
213 if (header_version != TICHeaderVersion::OneDBuffer) { 213 if (header_version != TICHeaderVersion::OneDBuffer) {
214 return width_minus_1 + 1; 214 return width_minus_1 + 1;
215 } 215 }
216 return (buffer_high_width_minus_one << 16) | buffer_low_width_minus_one; 216 return ((buffer_high_width_minus_one << 16) | buffer_low_width_minus_one) + 1;
217 } 217 }
218 218
219 u32 Height() const { 219 u32 Height() const {
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index 5d0fb3f9f..0456248ac 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -516,6 +516,7 @@ void Config::ReadPathValues() {
516 516
517 UISettings::values.roms_path = ReadSetting(QStringLiteral("romsPath")).toString(); 517 UISettings::values.roms_path = ReadSetting(QStringLiteral("romsPath")).toString();
518 UISettings::values.symbols_path = ReadSetting(QStringLiteral("symbolsPath")).toString(); 518 UISettings::values.symbols_path = ReadSetting(QStringLiteral("symbolsPath")).toString();
519 UISettings::values.screenshot_path = ReadSetting(QStringLiteral("screenshotPath")).toString();
519 UISettings::values.game_directory_path = 520 UISettings::values.game_directory_path =
520 ReadSetting(QStringLiteral("gameListRootDir"), QStringLiteral(".")).toString(); 521 ReadSetting(QStringLiteral("gameListRootDir"), QStringLiteral(".")).toString();
521 UISettings::values.game_directory_deepscan = 522 UISettings::values.game_directory_deepscan =
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index a7c656fdb..ac57229d5 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -119,6 +119,7 @@ Q_IMPORT_PLUGIN(QWindowsIntegrationPlugin);
119#endif 119#endif
120 120
121#ifdef _WIN32 121#ifdef _WIN32
122#include <windows.h>
122extern "C" { 123extern "C" {
123// tells Nvidia and AMD drivers to use the dedicated GPU by default on laptops with switchable 124// tells Nvidia and AMD drivers to use the dedicated GPU by default on laptops with switchable
124// graphics 125// graphics
@@ -747,6 +748,18 @@ void GMainWindow::OnDisplayTitleBars(bool show) {
747 } 748 }
748} 749}
749 750
751void GMainWindow::PreventOSSleep() {
752#ifdef _WIN32
753 SetThreadExecutionState(ES_CONTINUOUS | ES_SYSTEM_REQUIRED | ES_DISPLAY_REQUIRED);
754#endif
755}
756
757void GMainWindow::AllowOSSleep() {
758#ifdef _WIN32
759 SetThreadExecutionState(ES_CONTINUOUS);
760#endif
761}
762
750QStringList GMainWindow::GetUnsupportedGLExtensions() { 763QStringList GMainWindow::GetUnsupportedGLExtensions() {
751 QStringList unsupported_ext; 764 QStringList unsupported_ext;
752 765
@@ -966,6 +979,8 @@ void GMainWindow::BootGame(const QString& filename) {
966} 979}
967 980
968void GMainWindow::ShutdownGame() { 981void GMainWindow::ShutdownGame() {
982 AllowOSSleep();
983
969 discord_rpc->Pause(); 984 discord_rpc->Pause();
970 emu_thread->RequestStop(); 985 emu_thread->RequestStop();
971 986
@@ -1567,6 +1582,8 @@ void GMainWindow::OnMenuRecentFile() {
1567} 1582}
1568 1583
1569void GMainWindow::OnStartGame() { 1584void GMainWindow::OnStartGame() {
1585 PreventOSSleep();
1586
1570 emu_thread->SetRunning(true); 1587 emu_thread->SetRunning(true);
1571 1588
1572 qRegisterMetaType<Core::Frontend::SoftwareKeyboardParameters>( 1589 qRegisterMetaType<Core::Frontend::SoftwareKeyboardParameters>(
@@ -1598,6 +1615,8 @@ void GMainWindow::OnPauseGame() {
1598 ui.action_Pause->setEnabled(false); 1615 ui.action_Pause->setEnabled(false);
1599 ui.action_Stop->setEnabled(true); 1616 ui.action_Stop->setEnabled(true);
1600 ui.action_Capture_Screenshot->setEnabled(false); 1617 ui.action_Capture_Screenshot->setEnabled(false);
1618
1619 AllowOSSleep();
1601} 1620}
1602 1621
1603void GMainWindow::OnStopGame() { 1622void GMainWindow::OnStopGame() {
diff --git a/src/yuzu/main.h b/src/yuzu/main.h
index 1137bbc7a..501608ddc 100644
--- a/src/yuzu/main.h
+++ b/src/yuzu/main.h
@@ -130,6 +130,9 @@ private:
130 void ConnectWidgetEvents(); 130 void ConnectWidgetEvents();
131 void ConnectMenuEvents(); 131 void ConnectMenuEvents();
132 132
133 void PreventOSSleep();
134 void AllowOSSleep();
135
133 QStringList GetUnsupportedGLExtensions(); 136 QStringList GetUnsupportedGLExtensions();
134 bool LoadROM(const QString& filename); 137 bool LoadROM(const QString& filename);
135 void BootGame(const QString& filename); 138 void BootGame(const QString& filename);