summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--externals/FidelityFX-FSR/ffx-fsr/ffx_a.h2656
-rw-r--r--externals/FidelityFX-FSR/ffx-fsr/ffx_fsr1.h1199
-rw-r--r--externals/FidelityFX-FSR/license.txt19
-rw-r--r--src/common/math_util.h4
-rw-r--r--src/common/settings.cpp53
-rw-r--r--src/common/settings.h59
-rw-r--r--src/core/frontend/framebuffer_layout.cpp15
-rw-r--r--src/core/frontend/framebuffer_layout.h2
-rw-r--r--src/core/hle/service/am/am.cpp12
-rw-r--r--src/core/hle/service/vi/vi.cpp27
-rw-r--r--src/core/telemetry_session.cpp2
-rw-r--r--src/shader_recompiler/CMakeLists.txt1
-rw-r--r--src/shader_recompiler/backend/bindings.h2
-rw-r--r--src/shader_recompiler/backend/glasm/emit_context.cpp4
-rw-r--r--src/shader_recompiler/backend/glasm/emit_glasm.cpp3
-rw-r--r--src/shader_recompiler/backend/glasm/emit_glasm.h2
-rw-r--r--src/shader_recompiler/backend/glasm/emit_glasm_image.cpp18
-rw-r--r--src/shader_recompiler/backend/glasm/emit_glasm_instructions.h5
-rw-r--r--src/shader_recompiler/backend/glasm/emit_glasm_integer.cpp8
-rw-r--r--src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp4
-rw-r--r--src/shader_recompiler/backend/glsl/emit_context.cpp3
-rw-r--r--src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp4
-rw-r--r--src/shader_recompiler/backend/glsl/emit_glsl_image.cpp16
-rw-r--r--src/shader_recompiler/backend/glsl/emit_glsl_instructions.h5
-rw-r--r--src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp8
-rw-r--r--src/shader_recompiler/backend/spirv/emit_context.cpp86
-rw-r--r--src/shader_recompiler/backend/spirv/emit_context.h17
-rw-r--r--src/shader_recompiler/backend/spirv/emit_spirv.h13
-rw-r--r--src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp12
-rw-r--r--src/shader_recompiler/backend/spirv/emit_spirv_image.cpp54
-rw-r--r--src/shader_recompiler/backend/spirv/emit_spirv_instructions.h5
-rw-r--r--src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp8
-rw-r--r--src/shader_recompiler/frontend/ir/basic_block.cpp5
-rw-r--r--src/shader_recompiler/frontend/ir/basic_block.h3
-rw-r--r--src/shader_recompiler/frontend/ir/ir_emitter.cpp16
-rw-r--r--src/shader_recompiler/frontend/ir/ir_emitter.h7
-rw-r--r--src/shader_recompiler/frontend/ir/microinstruction.cpp11
-rw-r--r--src/shader_recompiler/frontend/ir/opcodes.inc6
-rw-r--r--src/shader_recompiler/frontend/ir/value.h2
-rw-r--r--src/shader_recompiler/frontend/maxwell/translate_program.cpp4
-rw-r--r--src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp5
-rw-r--r--src/shader_recompiler/ir_opt/passes.h1
-rw-r--r--src/shader_recompiler/ir_opt/rescaling_pass.cpp327
-rw-r--r--src/shader_recompiler/shader_info.h10
-rw-r--r--src/video_core/CMakeLists.txt2
-rw-r--r--src/video_core/buffer_cache/buffer_cache.h4
-rw-r--r--src/video_core/dirty_flags.h2
-rw-r--r--src/video_core/host_shaders/CMakeLists.txt26
-rw-r--r--src/video_core/host_shaders/fidelityfx_fsr.comp116
-rw-r--r--src/video_core/host_shaders/fxaa.frag76
-rw-r--r--src/video_core/host_shaders/fxaa.vert38
-rw-r--r--src/video_core/host_shaders/opengl_present_scaleforce.frag130
-rw-r--r--src/video_core/host_shaders/present_bicubic.frag67
-rw-r--r--src/video_core/host_shaders/present_gaussian.frag70
-rw-r--r--src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.comp11
-rw-r--r--src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.comp10
-rw-r--r--src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.comp11
-rw-r--r--src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.comp10
-rw-r--r--src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag7
-rw-r--r--src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag5
-rw-r--r--src/video_core/renderer_opengl/gl_buffer_cache.cpp13
-rw-r--r--src/video_core/renderer_opengl/gl_compute_pipeline.cpp80
-rw-r--r--src/video_core/renderer_opengl/gl_graphics_pipeline.cpp106
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp94
-rw-r--r--src/video_core/renderer_opengl/gl_resource_manager.cpp5
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.cpp15
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.cpp322
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.h50
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.cpp185
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.h11
-rw-r--r--src/video_core/renderer_vulkan/blit_image.cpp61
-rw-r--r--src/video_core/renderer_vulkan/blit_image.h23
-rw-r--r--src/video_core/renderer_vulkan/pipeline_helper.h73
-rw-r--r--src/video_core/renderer_vulkan/vk_blit_screen.cpp698
-rw-r--r--src/video_core/renderer_vulkan/vk_blit_screen.h38
-rw-r--r--src/video_core/renderer_vulkan/vk_buffer_cache.cpp38
-rw-r--r--src/video_core/renderer_vulkan/vk_buffer_cache.h6
-rw-r--r--src/video_core/renderer_vulkan/vk_compute_pipeline.cpp49
-rw-r--r--src/video_core/renderer_vulkan/vk_fsr.cpp553
-rw-r--r--src/video_core/renderer_vulkan/vk_fsr.h54
-rw-r--r--src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp83
-rw-r--r--src/video_core/renderer_vulkan/vk_graphics_pipeline.h4
-rw-r--r--src/video_core/renderer_vulkan/vk_master_semaphore.h4
-rw-r--r--src/video_core/renderer_vulkan/vk_rasterizer.cpp134
-rw-r--r--src/video_core/renderer_vulkan/vk_scheduler.cpp10
-rw-r--r--src/video_core/renderer_vulkan/vk_scheduler.h5
-rw-r--r--src/video_core/renderer_vulkan/vk_state_tracker.h8
-rw-r--r--src/video_core/renderer_vulkan/vk_texture_cache.cpp391
-rw-r--r--src/video_core/renderer_vulkan/vk_texture_cache.h86
-rw-r--r--src/video_core/surface.cpp74
-rw-r--r--src/video_core/surface.h6
-rw-r--r--src/video_core/texture_cache/image_base.cpp10
-rw-r--r--src/video_core/texture_cache/image_base.h15
-rw-r--r--src/video_core/texture_cache/image_info.cpp22
-rw-r--r--src/video_core/texture_cache/image_info.h4
-rw-r--r--src/video_core/texture_cache/image_view_base.cpp13
-rw-r--r--src/video_core/texture_cache/image_view_base.h4
-rw-r--r--src/video_core/texture_cache/texture_cache.h495
-rw-r--r--src/video_core/texture_cache/texture_cache_base.h43
-rw-r--r--src/video_core/texture_cache/types.h7
-rw-r--r--src/video_core/texture_cache/util.cpp12
-rw-r--r--src/video_core/texture_cache/util.h3
-rw-r--r--src/video_core/textures/texture.cpp29
-rw-r--r--src/video_core/video_core.cpp11
-rw-r--r--src/video_core/video_core.h2
-rw-r--r--src/video_core/vulkan_common/vulkan_device.h8
-rw-r--r--src/yuzu/bootmanager.cpp8
-rw-r--r--src/yuzu/bootmanager.h2
-rw-r--r--src/yuzu/configuration/config.cpp15
-rw-r--r--src/yuzu/configuration/config.h3
-rw-r--r--src/yuzu/configuration/configure_graphics.cpp74
-rw-r--r--src/yuzu/configuration/configure_graphics.ui167
-rw-r--r--src/yuzu/configuration/configure_graphics_advanced.ui13
-rw-r--r--src/yuzu/debugger/profiler.cpp2
-rw-r--r--src/yuzu/game_list.cpp2
-rw-r--r--src/yuzu/main.cpp114
-rw-r--r--src/yuzu/main.h5
-rw-r--r--src/yuzu/uisettings.h1
-rw-r--r--src/yuzu_cmd/config.cpp3
-rw-r--r--src/yuzu_cmd/default_ini.h23
120 files changed, 9051 insertions, 646 deletions
diff --git a/externals/FidelityFX-FSR/ffx-fsr/ffx_a.h b/externals/FidelityFX-FSR/ffx-fsr/ffx_a.h
new file mode 100644
index 000000000..d04bff55c
--- /dev/null
+++ b/externals/FidelityFX-FSR/ffx-fsr/ffx_a.h
@@ -0,0 +1,2656 @@
1//==============================================================================================================================
2//
3// [A] SHADER PORTABILITY 1.20210629
4//
5//==============================================================================================================================
6// FidelityFX Super Resolution Sample
7//
8// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
9// Permission is hereby granted, free of charge, to any person obtaining a copy
10// of this software and associated documentation files(the "Software"), to deal
11// in the Software without restriction, including without limitation the rights
12// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
13// copies of the Software, and to permit persons to whom the Software is
14// furnished to do so, subject to the following conditions :
15// The above copyright notice and this permission notice shall be included in
16// all copies or substantial portions of the Software.
17// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
20// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23// THE SOFTWARE.
24//------------------------------------------------------------------------------------------------------------------------------
25// MIT LICENSE
26// ===========
27// Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS").
28// -----------
29// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
30// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
31// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
32// Software is furnished to do so, subject to the following conditions:
33// -----------
34// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
35// Software.
36// -----------
37// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
38// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
39// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
40// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
41//------------------------------------------------------------------------------------------------------------------------------
42// ABOUT
43// =====
44// Common central point for high-level shading language and C portability for various shader headers.
45//------------------------------------------------------------------------------------------------------------------------------
46// DEFINES
47// =======
48// A_CPU ..... Include the CPU related code.
49// A_GPU ..... Include the GPU related code.
50// A_GLSL .... Using GLSL.
51// A_HLSL .... Using HLSL.
52// A_HLSL_6_2 Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types').
53// A_NO_16_BIT_CAST Don't use instructions that are not availabe in SPIR-V (needed for running A_HLSL_6_2 on Vulkan)
54// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default).
55// =======
56// A_BYTE .... Support 8-bit integer.
57// A_HALF .... Support 16-bit integer and floating point.
58// A_LONG .... Support 64-bit integer.
59// A_DUBL .... Support 64-bit floating point.
60// =======
61// A_WAVE .... Support wave-wide operations.
62//------------------------------------------------------------------------------------------------------------------------------
63// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'.
64//------------------------------------------------------------------------------------------------------------------------------
65// SIMPLIFIED TYPE SYSTEM
66// ======================
67// - All ints will be unsigned with exception of when signed is required.
68// - Type naming simplified and shortened "A<type><#components>",
69// - H = 16-bit float (half)
70// - F = 32-bit float (float)
71// - D = 64-bit float (double)
72// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte)
73// - B = 8-bit integer (byte)
74// - W = 16-bit integer (word)
75// - U = 32-bit integer (unsigned)
76// - L = 64-bit integer (long)
77// - Using "AS<type><#components>" for signed when required.
78//------------------------------------------------------------------------------------------------------------------------------
79// TODO
80// ====
81// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops).
82//------------------------------------------------------------------------------------------------------------------------------
83// CHANGE LOG
84// ==========
85// 20200914 - Expanded wave ops and prx code.
86// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc.
87//==============================================================================================================================
88////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
89////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
90//_____________________________________________________________/\_______________________________________________________________
91//==============================================================================================================================
92// COMMON
93//==============================================================================================================================
94#define A_2PI 6.28318530718
95////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
96////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
97////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
98////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
99////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
100////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
101//_____________________________________________________________/\_______________________________________________________________
102//==============================================================================================================================
103//
104//
105// CPU
106//
107//
108//==============================================================================================================================
109#ifdef A_CPU
110 // Supporting user defined overrides.
111 #ifndef A_RESTRICT
112 #define A_RESTRICT __restrict
113 #endif
114//------------------------------------------------------------------------------------------------------------------------------
115 #ifndef A_STATIC
116 #define A_STATIC static
117 #endif
118//------------------------------------------------------------------------------------------------------------------------------
119 // Same types across CPU and GPU.
120 // Predicate uses 32-bit integer (C friendly bool).
121 typedef uint32_t AP1;
122 typedef float AF1;
123 typedef double AD1;
124 typedef uint8_t AB1;
125 typedef uint16_t AW1;
126 typedef uint32_t AU1;
127 typedef uint64_t AL1;
128 typedef int8_t ASB1;
129 typedef int16_t ASW1;
130 typedef int32_t ASU1;
131 typedef int64_t ASL1;
132//------------------------------------------------------------------------------------------------------------------------------
133 #define AD1_(a) ((AD1)(a))
134 #define AF1_(a) ((AF1)(a))
135 #define AL1_(a) ((AL1)(a))
136 #define AU1_(a) ((AU1)(a))
137//------------------------------------------------------------------------------------------------------------------------------
138 #define ASL1_(a) ((ASL1)(a))
139 #define ASU1_(a) ((ASU1)(a))
140//------------------------------------------------------------------------------------------------------------------------------
141 A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;}
142//------------------------------------------------------------------------------------------------------------------------------
143 #define A_TRUE 1
144 #define A_FALSE 0
145////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
146////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
147////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
148////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
149//_____________________________________________________________/\_______________________________________________________________
150//==============================================================================================================================
151//
152// CPU/GPU PORTING
153//
154//------------------------------------------------------------------------------------------------------------------------------
155// Get CPU and GPU to share all setup code, without duplicate code paths.
156// This uses a lower-case prefix for special vector constructs.
157// - In C restrict pointers are used.
158// - In the shading language, in/inout/out arguments are used.
159// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]).
160//==============================================================================================================================
161////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
162////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
163//_____________________________________________________________/\_______________________________________________________________
164//==============================================================================================================================
165// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
166//==============================================================================================================================
167 #define retAD2 AD1 *A_RESTRICT
168 #define retAD3 AD1 *A_RESTRICT
169 #define retAD4 AD1 *A_RESTRICT
170 #define retAF2 AF1 *A_RESTRICT
171 #define retAF3 AF1 *A_RESTRICT
172 #define retAF4 AF1 *A_RESTRICT
173 #define retAL2 AL1 *A_RESTRICT
174 #define retAL3 AL1 *A_RESTRICT
175 #define retAL4 AL1 *A_RESTRICT
176 #define retAU2 AU1 *A_RESTRICT
177 #define retAU3 AU1 *A_RESTRICT
178 #define retAU4 AU1 *A_RESTRICT
179//------------------------------------------------------------------------------------------------------------------------------
180 #define inAD2 AD1 *A_RESTRICT
181 #define inAD3 AD1 *A_RESTRICT
182 #define inAD4 AD1 *A_RESTRICT
183 #define inAF2 AF1 *A_RESTRICT
184 #define inAF3 AF1 *A_RESTRICT
185 #define inAF4 AF1 *A_RESTRICT
186 #define inAL2 AL1 *A_RESTRICT
187 #define inAL3 AL1 *A_RESTRICT
188 #define inAL4 AL1 *A_RESTRICT
189 #define inAU2 AU1 *A_RESTRICT
190 #define inAU3 AU1 *A_RESTRICT
191 #define inAU4 AU1 *A_RESTRICT
192//------------------------------------------------------------------------------------------------------------------------------
193 #define inoutAD2 AD1 *A_RESTRICT
194 #define inoutAD3 AD1 *A_RESTRICT
195 #define inoutAD4 AD1 *A_RESTRICT
196 #define inoutAF2 AF1 *A_RESTRICT
197 #define inoutAF3 AF1 *A_RESTRICT
198 #define inoutAF4 AF1 *A_RESTRICT
199 #define inoutAL2 AL1 *A_RESTRICT
200 #define inoutAL3 AL1 *A_RESTRICT
201 #define inoutAL4 AL1 *A_RESTRICT
202 #define inoutAU2 AU1 *A_RESTRICT
203 #define inoutAU3 AU1 *A_RESTRICT
204 #define inoutAU4 AU1 *A_RESTRICT
205//------------------------------------------------------------------------------------------------------------------------------
206 #define outAD2 AD1 *A_RESTRICT
207 #define outAD3 AD1 *A_RESTRICT
208 #define outAD4 AD1 *A_RESTRICT
209 #define outAF2 AF1 *A_RESTRICT
210 #define outAF3 AF1 *A_RESTRICT
211 #define outAF4 AF1 *A_RESTRICT
212 #define outAL2 AL1 *A_RESTRICT
213 #define outAL3 AL1 *A_RESTRICT
214 #define outAL4 AL1 *A_RESTRICT
215 #define outAU2 AU1 *A_RESTRICT
216 #define outAU3 AU1 *A_RESTRICT
217 #define outAU4 AU1 *A_RESTRICT
218//------------------------------------------------------------------------------------------------------------------------------
219 #define varAD2(x) AD1 x[2]
220 #define varAD3(x) AD1 x[3]
221 #define varAD4(x) AD1 x[4]
222 #define varAF2(x) AF1 x[2]
223 #define varAF3(x) AF1 x[3]
224 #define varAF4(x) AF1 x[4]
225 #define varAL2(x) AL1 x[2]
226 #define varAL3(x) AL1 x[3]
227 #define varAL4(x) AL1 x[4]
228 #define varAU2(x) AU1 x[2]
229 #define varAU3(x) AU1 x[3]
230 #define varAU4(x) AU1 x[4]
231//------------------------------------------------------------------------------------------------------------------------------
232 #define initAD2(x,y) {x,y}
233 #define initAD3(x,y,z) {x,y,z}
234 #define initAD4(x,y,z,w) {x,y,z,w}
235 #define initAF2(x,y) {x,y}
236 #define initAF3(x,y,z) {x,y,z}
237 #define initAF4(x,y,z,w) {x,y,z,w}
238 #define initAL2(x,y) {x,y}
239 #define initAL3(x,y,z) {x,y,z}
240 #define initAL4(x,y,z,w) {x,y,z,w}
241 #define initAU2(x,y) {x,y}
242 #define initAU3(x,y,z) {x,y,z}
243 #define initAU4(x,y,z,w) {x,y,z,w}
244////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
245////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
246//_____________________________________________________________/\_______________________________________________________________
247//==============================================================================================================================
248// SCALAR RETURN OPS
249//------------------------------------------------------------------------------------------------------------------------------
250// TODO
251// ====
252// - Replace transcendentals with manual versions.
253//==============================================================================================================================
254 #ifdef A_GCC
255 A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);}
256 A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);}
257 A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));}
258 A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));}
259 #else
260 A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);}
261 A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);}
262 A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));}
263 A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(labs((long)ASL1_(a)));}
264 #endif
265//------------------------------------------------------------------------------------------------------------------------------
266 #ifdef A_GCC
267 A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);}
268 A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);}
269 #else
270 A_STATIC AD1 ACosD1(AD1 a){return cos(a);}
271 A_STATIC AF1 ACosF1(AF1 a){return cosf(a);}
272 #endif
273//------------------------------------------------------------------------------------------------------------------------------
274 A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];}
275 A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
276 A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
277 A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];}
278 A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
279 A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
280//------------------------------------------------------------------------------------------------------------------------------
281 #ifdef A_GCC
282 A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);}
283 A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);}
284 #else
285 A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);}
286 A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);}
287 #endif
288//------------------------------------------------------------------------------------------------------------------------------
289 #ifdef A_GCC
290 A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);}
291 A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);}
292 #else
293 A_STATIC AD1 AFloorD1(AD1 a){return floor(a);}
294 A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);}
295 #endif
296//------------------------------------------------------------------------------------------------------------------------------
297 A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);}
298 A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);}
299//------------------------------------------------------------------------------------------------------------------------------
300 #ifdef A_GCC
301 A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);}
302 A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);}
303 #else
304 A_STATIC AD1 ALog2D1(AD1 a){return log2(a);}
305 A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);}
306 #endif
307//------------------------------------------------------------------------------------------------------------------------------
308 A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;}
309 A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;}
310 A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;}
311 A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;}
312//------------------------------------------------------------------------------------------------------------------------------
313 // These follow the convention that A integer types don't have signage, until they are operated on.
314 A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;}
315 A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;}
316//------------------------------------------------------------------------------------------------------------------------------
317 A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;}
318 A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;}
319 A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;}
320 A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;}
321//------------------------------------------------------------------------------------------------------------------------------
322 A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;}
323 A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;}
324//------------------------------------------------------------------------------------------------------------------------------
325 A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;}
326 A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;}
327//------------------------------------------------------------------------------------------------------------------------------
328 A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));}
329 A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));}
330//------------------------------------------------------------------------------------------------------------------------------
331 #ifdef A_GCC
332 A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);}
333 A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);}
334 #else
335 A_STATIC AD1 ASinD1(AD1 a){return sin(a);}
336 A_STATIC AF1 ASinF1(AF1 a){return sinf(a);}
337 #endif
338//------------------------------------------------------------------------------------------------------------------------------
339 #ifdef A_GCC
340 A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);}
341 A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);}
342 #else
343 A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);}
344 A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);}
345 #endif
346////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
347////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
348//_____________________________________________________________/\_______________________________________________________________
349//==============================================================================================================================
350// SCALAR RETURN OPS - DEPENDENT
351//==============================================================================================================================
352 A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));}
353 A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));}
354//------------------------------------------------------------------------------------------------------------------------------
355 A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);}
356 A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);}
357//------------------------------------------------------------------------------------------------------------------------------
358 A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));}
359 A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));}
360//------------------------------------------------------------------------------------------------------------------------------
361 A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));}
362 A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));}
363//------------------------------------------------------------------------------------------------------------------------------
364 A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));}
365 A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));}
366////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
367////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
368//_____________________________________________________________/\_______________________________________________________________
369//==============================================================================================================================
370// VECTOR OPS
371//------------------------------------------------------------------------------------------------------------------------------
372// These are added as needed for production or prototyping, so not necessarily a complete set.
373// They follow a convention of taking in a destination and also returning the destination value to increase utility.
374//==============================================================================================================================
 // Component-wise CPU vector helpers. Per the section header above, each op
 // writes into destination 'd' and also returns 'd' so calls can be chained.
 // 'D'/'F' suffixes follow the scalar helpers used (AAbsD1/AAbsF1 — presumably
 // double/float elements; the in*/out*/ret* macros are defined earlier in this
 // header, outside this chunk).
 //
 // d = |a|, per component.
 A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;}
 A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;}
 A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;}
 A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;}
 A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;}
//==============================================================================================================================
 // d = a + b, per component.
 A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
 A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
 A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
 A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
 A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
//==============================================================================================================================
 // d = a + b, with scalar 'b' broadcast to every component.
 A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
 A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
 A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
 A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
 A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
398//==============================================================================================================================
 // d = a (component-wise copy).
 A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;}
 A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
 A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;}
 A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
 A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
//==============================================================================================================================
 // d = lerp(a,b,c), with a per-component interpolation factor 'c'.
 A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;}
 A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;}
 A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;}
 A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;}
 A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;}
//==============================================================================================================================
 // d = lerp(a,b,c), with a single scalar interpolation factor 'c'.
 A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;}
 A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;}
 A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;}
 A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;}
 A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;}
422//==============================================================================================================================
 // d = max(a,b), per component.
 A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;}
 A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;}
 A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;}
 A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;}
 A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;}
//==============================================================================================================================
 // d = min(a,b), per component.
 A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;}
 A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;}
 A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;}
 A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;}
 A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;}
//==============================================================================================================================
 // d = a * b, per component.
 A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
 A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
 A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
 A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
 A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
446//==============================================================================================================================
 // d = a * b, with scalar 'b' broadcast to every component.
 A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
 A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
 A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
 A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
 A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
//==============================================================================================================================
 // d = -a, per component.
 A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;}
 A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
 A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;}
 A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
 A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
//==============================================================================================================================
 // d = 1/a, per component (via the scalar reciprocal helpers).
 A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;}
 A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;}
 A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;}
 A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;}
 A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;}
470////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
471////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
472//_____________________________________________________________/\_______________________________________________________________
473//==============================================================================================================================
474// HALF FLOAT PACKING
475//==============================================================================================================================
476 // Convert float to half (in lower 16-bits of output).
477 // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
478 // Supports denormals.
479 // Conversion rules are to make computations possibly "safer" on the GPU,
480 // -INF & -NaN -> -65504
481 // +INF & +NaN -> +65504
482 A_STATIC AU1 AU1_AH1_AF1(AF1 f){
483 static AW1 base[512]={
484 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
485 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
486 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
487 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
488 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
489 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
490 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
491 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
492 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
493 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
494 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
495 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
496 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
497 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
498 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
499 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
500 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
501 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
502 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
503 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
504 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
505 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
506 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
507 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
508 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
509 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
510 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
511 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
512 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
513 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
514 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
515 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff};
516 static AB1 shift[512]={
517 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
518 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
519 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
520 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
521 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
522 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
523 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
524 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
525 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
526 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
527 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
528 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
529 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
530 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
531 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
532 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
533 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
534 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
535 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
536 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
537 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
538 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
539 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
540 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
541 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
542 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
543 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
544 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
545 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
546 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
547 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
548 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18};
549 union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);}
550//------------------------------------------------------------------------------------------------------------------------------
551 // Used to output packed constant.
552 A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);}
553#endif
554////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
555////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
556////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
557////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
558////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
559////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
560//_____________________________________________________________/\_______________________________________________________________
561//==============================================================================================================================
562//
563//
564// GLSL
565//
566//
567//==============================================================================================================================
#if defined(A_GLSL) && defined(A_GPU)
 // Declare the GLSL extensions each optional feature set needs.
 // Define A_SKIP_EXT when the including shader declares its extensions itself.
 #ifndef A_SKIP_EXT
  // 16-bit storage and explicit arithmetic types for the A_HALF path.
  #ifdef A_HALF
   #extension GL_EXT_shader_16bit_storage:require
   #extension GL_EXT_shader_explicit_arithmetic_types:require
  #endif
//------------------------------------------------------------------------------------------------------------------------------
  // 64-bit integers and 64-bit atomics for the A_LONG path.
  #ifdef A_LONG
   #extension GL_ARB_gpu_shader_int64:require
   #extension GL_NV_shader_atomic_int64:require
  #endif
//------------------------------------------------------------------------------------------------------------------------------
  // Subgroup (wave) operations for the A_WAVE path.
  #ifdef A_WAVE
   #extension GL_KHR_shader_subgroup_arithmetic:require
   #extension GL_KHR_shader_subgroup_ballot:require
   #extension GL_KHR_shader_subgroup_quad:require
   #extension GL_KHR_shader_subgroup_shuffle:require
  #endif
 #endif
587//==============================================================================================================================
 // Portability type aliases, GLSL spellings. AP* = bool, AF* = 32-bit float,
 // AU* = unsigned int, ASU* = signed int; the digit is the component count.
 #define AP1 bool
 #define AP2 bvec2
 #define AP3 bvec3
 #define AP4 bvec4
//------------------------------------------------------------------------------------------------------------------------------
 #define AF1 float
 #define AF2 vec2
 #define AF3 vec3
 #define AF4 vec4
//------------------------------------------------------------------------------------------------------------------------------
 #define AU1 uint
 #define AU2 uvec2
 #define AU3 uvec3
 #define AU4 uvec4
//------------------------------------------------------------------------------------------------------------------------------
 #define ASU1 int
 #define ASU2 ivec2
 #define ASU3 ivec3
 #define ASU4 ivec4
//==============================================================================================================================
 // Bit-pattern casts (reinterpretation, no value conversion): uint bits -> float.
 #define AF1_AU1(x) uintBitsToFloat(AU1(x))
 #define AF2_AU2(x) uintBitsToFloat(AU2(x))
 #define AF3_AU3(x) uintBitsToFloat(AU3(x))
 #define AF4_AU4(x) uintBitsToFloat(AU4(x))
//------------------------------------------------------------------------------------------------------------------------------
 // Bit-pattern casts: float bits -> uint.
 #define AU1_AF1(x) floatBitsToUint(AF1(x))
 #define AU2_AF2(x) floatBitsToUint(AF2(x))
 #define AU3_AF3(x) floatBitsToUint(AF3(x))
 #define AU4_AF4(x) floatBitsToUint(AF4(x))
//------------------------------------------------------------------------------------------------------------------------------
 // Convert one float to half, result in the low 16 bits (high lane packs 0.0).
 AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));}
 #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 // Pack: two halfs / two unorm16 / four unorm8 into one 32-bit word.
 #define AU1_AH2_AF2 packHalf2x16
 #define AU1_AW2Unorm_AF2 packUnorm2x16
 #define AU1_AB4Unorm_AF4 packUnorm4x8
//------------------------------------------------------------------------------------------------------------------------------
 // Matching unpack operations.
 #define AF2_AH2_AU1 unpackHalf2x16
 #define AF2_AW2Unorm_AU1 unpackUnorm2x16
 #define AF4_AB4Unorm_AU1 unpackUnorm4x8
628//==============================================================================================================================
 // Splat constructors: the AF*_()/AU*_() macros replicate one scalar into all
 // lanes of the vector type.
 AF1 AF1_x(AF1 a){return AF1(a);}
 AF2 AF2_x(AF1 a){return AF2(a,a);}
 AF3 AF3_x(AF1 a){return AF3(a,a,a);}
 AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
 #define AF1_(a) AF1_x(AF1(a))
 #define AF2_(a) AF2_x(AF1(a))
 #define AF3_(a) AF3_x(AF1(a))
 #define AF4_(a) AF4_x(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 AU1 AU1_x(AU1 a){return AU1(a);}
 AU2 AU2_x(AU1 a){return AU2(a,a);}
 AU3 AU3_x(AU1 a){return AU3(a,a,a);}
 AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
 #define AU1_(a) AU1_x(AU1(a))
 #define AU2_(a) AU2_x(AU1(a))
 #define AU3_(a) AU3_x(AU1(a))
 #define AU4_(a) AU4_x(AU1(a))
//==============================================================================================================================
 // Signed absolute value on unsigned storage: cast to int, abs, cast back.
 AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));}
 AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
 AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
 AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
//------------------------------------------------------------------------------------------------------------------------------
 // Bitfield extract: 'bits' wide field starting at bit 'off' of 'src'.
 AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));}
 // Bitfield insert with an explicit mask: masked bits come from 'ins', the rest from 'src'.
 AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));}
 // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
 AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){return bitfieldInsert(src,ins,0,ASU1(bits));}
656//------------------------------------------------------------------------------------------------------------------------------
 // Clamp x to [n,m].
 // V_MED3_F32.
 AF1 AClampF1(AF1 x,AF1 n,AF1 m){return clamp(x,n,m);}
 AF2 AClampF2(AF2 x,AF2 n,AF2 m){return clamp(x,n,m);}
 AF3 AClampF3(AF3 x,AF3 n,AF3 m){return clamp(x,n,m);}
 AF4 AClampF4(AF4 x,AF4 n,AF4 m){return clamp(x,n,m);}
//------------------------------------------------------------------------------------------------------------------------------
 // Fractional part.
 // V_FRACT_F32 (note DX frac() is different).
 AF1 AFractF1(AF1 x){return fract(x);}
 AF2 AFractF2(AF2 x){return fract(x);}
 AF3 AFractF3(AF3 x){return fract(x);}
 AF4 AFractF4(AF4 x){return fract(x);}
//------------------------------------------------------------------------------------------------------------------------------
 // Linear interpolation between x and y by factor a (GLSL mix()).
 AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return mix(x,y,a);}
 AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return mix(x,y,a);}
 AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return mix(x,y,a);}
 AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return mix(x,y,a);}
673//------------------------------------------------------------------------------------------------------------------------------
 // Maximum of three values.
 // V_MAX3_F32.
 AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
 AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
 AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
 AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
//------------------------------------------------------------------------------------------------------------------------------
 // Signed max-of-three on unsigned storage (cast to int, compare, cast back).
 AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));}
 AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
 AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
 AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
//------------------------------------------------------------------------------------------------------------------------------
 // Unsigned max-of-three.
 AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));}
 AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
 AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
 AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
//------------------------------------------------------------------------------------------------------------------------------
 // Signed max on unsigned storage.
 AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));}
 AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
 AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
 AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
//------------------------------------------------------------------------------------------------------------------------------
 // Clamp has an easier pattern match for med3 when some ordering is known.
 // V_MED3_F32.
 AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
 AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
 AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
 AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
//------------------------------------------------------------------------------------------------------------------------------
 // Minimum of three values.
 // V_MIN3_F32.
 AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
 AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
 AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
 AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
//------------------------------------------------------------------------------------------------------------------------------
 // Signed min-of-three on unsigned storage.
 AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));}
 AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
 AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
 AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
//------------------------------------------------------------------------------------------------------------------------------
 // Unsigned min-of-three.
 AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));}
 AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
 AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
 AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
//------------------------------------------------------------------------------------------------------------------------------
 // Signed min on unsigned storage.
 AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));}
 AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
 AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
 AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
722//------------------------------------------------------------------------------------------------------------------------------
 // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
 // V_COS_F32.
 AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
 AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
 AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
 AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
//------------------------------------------------------------------------------------------------------------------------------
 // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
 // V_SIN_F32.
 AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
 AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
 AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
 AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
//------------------------------------------------------------------------------------------------------------------------------
 // Reciprocal, 1/x (full-precision divide).
 AF1 ARcpF1(AF1 x){return AF1_(1.0)/x;}
 AF2 ARcpF2(AF2 x){return AF2_(1.0)/x;}
 AF3 ARcpF3(AF3 x){return AF3_(1.0)/x;}
 AF4 ARcpF4(AF4 x){return AF4_(1.0)/x;}
//------------------------------------------------------------------------------------------------------------------------------
 // Reciprocal square root, 1/sqrt(x).
 AF1 ARsqF1(AF1 x){return AF1_(1.0)/sqrt(x);}
 AF2 ARsqF2(AF2 x){return AF2_(1.0)/sqrt(x);}
 AF3 ARsqF3(AF3 x){return AF3_(1.0)/sqrt(x);}
 AF4 ARsqF4(AF4 x){return AF4_(1.0)/sqrt(x);}
//------------------------------------------------------------------------------------------------------------------------------
 // Saturate: clamp to [0,1].
 AF1 ASatF1(AF1 x){return clamp(x,AF1_(0.0),AF1_(1.0));}
 AF2 ASatF2(AF2 x){return clamp(x,AF2_(0.0),AF2_(1.0));}
 AF3 ASatF3(AF3 x){return clamp(x,AF3_(0.0),AF3_(1.0));}
 AF4 ASatF4(AF4 x){return clamp(x,AF4_(0.0),AF4_(1.0));}
//------------------------------------------------------------------------------------------------------------------------------
 // Arithmetic (sign-extending) right shift performed on unsigned storage.
 AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));}
 AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
 AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
 AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
756////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
757////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
758//_____________________________________________________________/\_______________________________________________________________
759//==============================================================================================================================
760// GLSL BYTE
761//==============================================================================================================================
 #ifdef A_BYTE
 // 8-bit unsigned (AB*) and signed (ASB*) scalar/vector type aliases.
 #define AB1 uint8_t
 #define AB2 u8vec2
 #define AB3 u8vec3
 #define AB4 u8vec4
//------------------------------------------------------------------------------------------------------------------------------
 #define ASB1 int8_t
 #define ASB2 i8vec2
 #define ASB3 i8vec3
 #define ASB4 i8vec4
//------------------------------------------------------------------------------------------------------------------------------
 // Splat a scalar byte across 1..4 components; the AB*_() macros accept any expression convertible to AB1.
 AB1 AB1_x(AB1 a){return AB1(a);}
 AB2 AB2_x(AB1 a){return AB2(a,a);}
 AB3 AB3_x(AB1 a){return AB3(a,a,a);}
 AB4 AB4_x(AB1 a){return AB4(a,a,a,a);}
 #define AB1_(a) AB1_x(AB1(a))
 #define AB2_(a) AB2_x(AB1(a))
 #define AB3_(a) AB3_x(AB1(a))
 #define AB4_(a) AB4_x(AB1(a))
 #endif
782////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
783////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
784//_____________________________________________________________/\_______________________________________________________________
785//==============================================================================================================================
786// GLSL HALF
787//==============================================================================================================================
 #ifdef A_HALF
 // 16-bit float (AH*), unsigned (AW*), and signed (ASW*) scalar/vector type aliases.
 // NOTE(review): these presumably require the 16-bit arithmetic type extensions — confirm at the shader's extension block.
 #define AH1 float16_t
 #define AH2 f16vec2
 #define AH3 f16vec3
 #define AH4 f16vec4
//------------------------------------------------------------------------------------------------------------------------------
 #define AW1 uint16_t
 #define AW2 u16vec2
 #define AW3 u16vec3
 #define AW4 u16vec4
//------------------------------------------------------------------------------------------------------------------------------
 #define ASW1 int16_t
 #define ASW2 i16vec2
 #define ASW3 i16vec3
 #define ASW4 i16vec4
//==============================================================================================================================
 // Unpack 16-bit pairs out of 32-bit words.
 #define AH2_AU1(x) unpackFloat2x16(AU1(x))
 AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));}
 #define AH4_AU2(x) AH4_AU2_x(AU2(x))
 #define AW2_AU1(x) unpackUint2x16(AU1(x))
 #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
//------------------------------------------------------------------------------------------------------------------------------
 // Pack 16-bit pairs into 32-bit words.
 #define AU1_AH2(x) packFloat2x16(AH2(x))
 AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));}
 #define AU2_AH4(x) AU2_AH4_x(AH4(x))
 #define AU1_AW2(x) packUint2x16(AW2(x))
 #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
//==============================================================================================================================
 // Bit-preserving casts: fp16 bits <-> 16-bit uint.
 #define AW1_AH1(x) halfBitsToUint16(AH1(x))
 #define AW2_AH2(x) halfBitsToUint16(AH2(x))
 #define AW3_AH3(x) halfBitsToUint16(AH3(x))
 #define AW4_AH4(x) halfBitsToUint16(AH4(x))
//------------------------------------------------------------------------------------------------------------------------------
 #define AH1_AW1(x) uint16BitsToHalf(AW1(x))
 #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
 #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
 #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
//==============================================================================================================================
 // Splat constructors.
 AH1 AH1_x(AH1 a){return AH1(a);}
 AH2 AH2_x(AH1 a){return AH2(a,a);}
 AH3 AH3_x(AH1 a){return AH3(a,a,a);}
 AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
 #define AH1_(a) AH1_x(AH1(a))
 #define AH2_(a) AH2_x(AH1(a))
 #define AH3_(a) AH3_x(AH1(a))
 #define AH4_(a) AH4_x(AH1(a))
//------------------------------------------------------------------------------------------------------------------------------
 AW1 AW1_x(AW1 a){return AW1(a);}
 AW2 AW2_x(AW1 a){return AW2(a,a);}
 AW3 AW3_x(AW1 a){return AW3(a,a,a);}
 AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
 #define AW1_(a) AW1_x(AW1(a))
 #define AW2_(a) AW2_x(AW1(a))
 #define AW3_(a) AW3_x(AW1(a))
 #define AW4_(a) AW4_x(AW1(a))
//==============================================================================================================================
 // Absolute value of a signed 16-bit word stored in unsigned bits (casts through ASW*).
 AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
 AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
 AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
 AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);}
 AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);}
 AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);}
 AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AFractH1(AH1 x){return fract(x);}
 AH2 AFractH2(AH2 x){return fract(x);}
 AH3 AFractH3(AH3 x){return fract(x);}
 AH4 AFractH4(AH4 x){return fract(x);}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);}
 AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);}
 AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);}
 AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);}
//------------------------------------------------------------------------------------------------------------------------------
 // No packed version of max3.
 AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
 AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
 AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
 AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
//------------------------------------------------------------------------------------------------------------------------------
870 AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));}
871 AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));}
872 AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));}
873 AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));}
874//------------------------------------------------------------------------------------------------------------------------------
875 // No packed version of min3.
876 AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
877 AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
878 AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
879 AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
880//------------------------------------------------------------------------------------------------------------------------------
881 AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));}
882 AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));}
883 AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));}
884 AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));}
885//------------------------------------------------------------------------------------------------------------------------------
886 AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;}
887 AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;}
888 AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;}
889 AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;}
890//------------------------------------------------------------------------------------------------------------------------------
891 AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);}
892 AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);}
893 AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);}
894 AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);}
895//------------------------------------------------------------------------------------------------------------------------------
896 AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));}
897 AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));}
898 AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));}
899 AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));}
900//------------------------------------------------------------------------------------------------------------------------------
901 AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
902 AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
903 AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
904 AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
905 #endif
906////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
907////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
908//_____________________________________________________________/\_______________________________________________________________
909//==============================================================================================================================
910// GLSL DOUBLE
911//==============================================================================================================================
 #ifdef A_DUBL
 // Double-precision (fp64) scalar/vector type aliases and helpers.
 #define AD1 double
 #define AD2 dvec2
 #define AD3 dvec3
 #define AD4 dvec4
//------------------------------------------------------------------------------------------------------------------------------
 // Splat constructors.
 AD1 AD1_x(AD1 a){return AD1(a);}
 AD2 AD2_x(AD1 a){return AD2(a,a);}
 AD3 AD3_x(AD1 a){return AD3(a,a,a);}
 AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
 #define AD1_(a) AD1_x(AD1(a))
 #define AD2_(a) AD2_x(AD1(a))
 #define AD3_(a) AD3_x(AD1(a))
 #define AD4_(a) AD4_x(AD1(a))
//==============================================================================================================================
 AD1 AFractD1(AD1 x){return fract(x);}
 AD2 AFractD2(AD2 x){return fract(x);}
 AD3 AFractD3(AD3 x){return fract(x);}
 AD4 AFractD4(AD4 x){return fract(x);}
//------------------------------------------------------------------------------------------------------------------------------
 AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);}
 AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);}
 AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);}
 AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);}
//------------------------------------------------------------------------------------------------------------------------------
 // Reciprocal (1/x). No guard against x==0.
 AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;}
 AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;}
 AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;}
 AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;}
//------------------------------------------------------------------------------------------------------------------------------
 // Reciprocal square root (1/sqrt(x)).
 AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);}
 AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);}
 AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);}
 AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);}
//------------------------------------------------------------------------------------------------------------------------------
 // Saturate: clamp into [0,1].
 AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));}
 AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));}
 AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));}
 AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));}
 #endif
952////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
953////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
954//_____________________________________________________________/\_______________________________________________________________
955//==============================================================================================================================
956// GLSL LONG
957//==============================================================================================================================
 #ifdef A_LONG
 // 64-bit unsigned (AL*) and signed (ASL*) scalar/vector type aliases.
 // NOTE(review): these presumably require the 64-bit integer type extensions — confirm at the shader's extension block.
 #define AL1 uint64_t
 #define AL2 u64vec2
 #define AL3 u64vec3
 #define AL4 u64vec4
//------------------------------------------------------------------------------------------------------------------------------
 #define ASL1 int64_t
 #define ASL2 i64vec2
 #define ASL3 i64vec3
 #define ASL4 i64vec4
//------------------------------------------------------------------------------------------------------------------------------
 // Pack/unpack a pair of 32-bit words into/out of one 64-bit word.
 #define AL1_AU2(x) packUint2x32(AU2(x))
 #define AU2_AL1(x) unpackUint2x32(AL1(x))
//------------------------------------------------------------------------------------------------------------------------------
 // Splat constructors.
 AL1 AL1_x(AL1 a){return AL1(a);}
 AL2 AL2_x(AL1 a){return AL2(a,a);}
 AL3 AL3_x(AL1 a){return AL3(a,a,a);}
 AL4 AL4_x(AL1 a){return AL4(a,a,a,a);}
 #define AL1_(a) AL1_x(AL1(a))
 #define AL2_(a) AL2_x(AL1(a))
 #define AL3_(a) AL3_x(AL1(a))
 #define AL4_(a) AL4_x(AL1(a))
//==============================================================================================================================
 // Absolute value of a signed 64-bit value stored in unsigned bits (casts through ASL*).
 AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));}
 AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));}
 AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));}
 AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));}
//------------------------------------------------------------------------------------------------------------------------------
986 AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASU1(a),ASU1(b)));}
987 AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASU2(a),ASU2(b)));}
988 AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASU3(a),ASU3(b)));}
989 AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASU4(a),ASU4(b)));}
990//------------------------------------------------------------------------------------------------------------------------------
991 AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASU1(a),ASU1(b)));}
992 AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASU2(a),ASU2(b)));}
993 AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASU3(a),ASU3(b)));}
994 AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASU4(a),ASU4(b)));}
995 #endif
996////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
997////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
998//_____________________________________________________________/\_______________________________________________________________
999//==============================================================================================================================
1000// WAVE OPERATIONS
1001//==============================================================================================================================
 #ifdef A_WAVE
 // Cross-lane XOR shuffles built on subgroupShuffleXor.
 // Where 'x' must be a compile time literal.
 AF1 AWaveXorF1(AF1 v,AU1 x){return subgroupShuffleXor(v,x);}
 AF2 AWaveXorF2(AF2 v,AU1 x){return subgroupShuffleXor(v,x);}
 AF3 AWaveXorF3(AF3 v,AU1 x){return subgroupShuffleXor(v,x);}
 AF4 AWaveXorF4(AF4 v,AU1 x){return subgroupShuffleXor(v,x);}
 AU1 AWaveXorU1(AU1 v,AU1 x){return subgroupShuffleXor(v,x);}
 AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);}
 AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);}
 AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);}
//------------------------------------------------------------------------------------------------------------------------------
 // Packed 16-bit variants: values are shuffled as their packed 32-bit bit patterns, then unpacked.
 #ifdef A_HALF
 AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));}
 AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));}
 AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));}
 AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));}
 #endif
 #endif
1020//==============================================================================================================================
1021#endif
1022////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1023////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1024////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1025////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1026////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1027////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1028//_____________________________________________________________/\_______________________________________________________________
1029//==============================================================================================================================
1030//
1031//
1032// HLSL
1033//
1034//
1035//==============================================================================================================================
1036#if defined(A_HLSL) && defined(A_GPU)
 #ifdef A_HLSL_6_2
 // Shader Model 6.2+: explicit fixed-width type names.
 #define AP1 bool
 #define AP2 bool2
 #define AP3 bool3
 #define AP4 bool4
//------------------------------------------------------------------------------------------------------------------------------
 #define AF1 float32_t
 #define AF2 float32_t2
 #define AF3 float32_t3
 #define AF4 float32_t4
//------------------------------------------------------------------------------------------------------------------------------
 #define AU1 uint32_t
 #define AU2 uint32_t2
 #define AU3 uint32_t3
 #define AU4 uint32_t4
//------------------------------------------------------------------------------------------------------------------------------
 #define ASU1 int32_t
 #define ASU2 int32_t2
 #define ASU3 int32_t3
 #define ASU4 int32_t4
 #else
 // Pre-6.2: classic HLSL scalar/vector names (same widths, different spellings).
 #define AP1 bool
 #define AP2 bool2
 #define AP3 bool3
 #define AP4 bool4
//------------------------------------------------------------------------------------------------------------------------------
 #define AF1 float
 #define AF2 float2
 #define AF3 float3
 #define AF4 float4
//------------------------------------------------------------------------------------------------------------------------------
 #define AU1 uint
 #define AU2 uint2
 #define AU3 uint3
 #define AU4 uint4
//------------------------------------------------------------------------------------------------------------------------------
 #define ASU1 int
 #define ASU2 int2
 #define ASU3 int3
 #define ASU4 int4
 #endif
1078//==============================================================================================================================
1079 #define AF1_AU1(x) asfloat(AU1(x))
1080 #define AF2_AU2(x) asfloat(AU2(x))
1081 #define AF3_AU3(x) asfloat(AU3(x))
1082 #define AF4_AU4(x) asfloat(AU4(x))
1083//------------------------------------------------------------------------------------------------------------------------------
1084 #define AU1_AF1(x) asuint(AF1(x))
1085 #define AU2_AF2(x) asuint(AF2(x))
1086 #define AU3_AF3(x) asuint(AF3(x))
1087 #define AU4_AF4(x) asuint(AF4(x))
1088//------------------------------------------------------------------------------------------------------------------------------
1089 AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);}
1090 #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
1091//------------------------------------------------------------------------------------------------------------------------------
1092 AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);}
1093 #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a))
1094 #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x))
1095//------------------------------------------------------------------------------------------------------------------------------
1096 AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));}
1097 #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x))
1098//==============================================================================================================================
1099 AF1 AF1_x(AF1 a){return AF1(a);}
1100 AF2 AF2_x(AF1 a){return AF2(a,a);}
1101 AF3 AF3_x(AF1 a){return AF3(a,a,a);}
1102 AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
1103 #define AF1_(a) AF1_x(AF1(a))
1104 #define AF2_(a) AF2_x(AF1(a))
1105 #define AF3_(a) AF3_x(AF1(a))
1106 #define AF4_(a) AF4_x(AF1(a))
1107//------------------------------------------------------------------------------------------------------------------------------
1108 AU1 AU1_x(AU1 a){return AU1(a);}
1109 AU2 AU2_x(AU1 a){return AU2(a,a);}
1110 AU3 AU3_x(AU1 a){return AU3(a,a,a);}
1111 AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
1112 #define AU1_(a) AU1_x(AU1(a))
1113 #define AU2_(a) AU2_x(AU1(a))
1114 #define AU3_(a) AU3_x(AU1(a))
1115 #define AU4_(a) AU4_x(AU1(a))
1116//==============================================================================================================================
1117 AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));}
1118 AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
1119 AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
1120 AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
1121//------------------------------------------------------------------------------------------------------------------------------
1122 AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<<bits)-1;return (src>>off)&mask;}
1123 AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));}
1124 AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<<bits)-1;return (ins&mask)|(src&(~mask));}
1125//------------------------------------------------------------------------------------------------------------------------------
1126 AF1 AClampF1(AF1 x,AF1 n,AF1 m){return max(n,min(x,m));}
1127 AF2 AClampF2(AF2 x,AF2 n,AF2 m){return max(n,min(x,m));}
1128 AF3 AClampF3(AF3 x,AF3 n,AF3 m){return max(n,min(x,m));}
1129 AF4 AClampF4(AF4 x,AF4 n,AF4 m){return max(n,min(x,m));}
1130//------------------------------------------------------------------------------------------------------------------------------
1131 AF1 AFractF1(AF1 x){return x-floor(x);}
1132 AF2 AFractF2(AF2 x){return x-floor(x);}
1133 AF3 AFractF3(AF3 x){return x-floor(x);}
1134 AF4 AFractF4(AF4 x){return x-floor(x);}
1135//------------------------------------------------------------------------------------------------------------------------------
1136 AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return lerp(x,y,a);}
1137 AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return lerp(x,y,a);}
1138 AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return lerp(x,y,a);}
1139 AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return lerp(x,y,a);}
1140//------------------------------------------------------------------------------------------------------------------------------
1141 AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
1142 AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
1143 AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
1144 AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
1145//------------------------------------------------------------------------------------------------------------------------------
1146 AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));}
1147 AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
1148 AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
1149 AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
1150//------------------------------------------------------------------------------------------------------------------------------
1151 AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));}
1152 AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
1153 AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
1154 AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
1155//------------------------------------------------------------------------------------------------------------------------------
1156 AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));}
1157 AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
1158 AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
1159 AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
1160//------------------------------------------------------------------------------------------------------------------------------
1161 AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
1162 AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
1163 AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
1164 AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
1165//------------------------------------------------------------------------------------------------------------------------------
1166 AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
1167 AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
1168 AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
1169 AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
1170//------------------------------------------------------------------------------------------------------------------------------
1171 AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));}
1172 AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
1173 AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
1174 AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
1175//------------------------------------------------------------------------------------------------------------------------------
1176 AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));}
1177 AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
1178 AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
1179 AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
1180//------------------------------------------------------------------------------------------------------------------------------
1181 AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));}
1182 AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
1183 AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
1184 AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
1185//------------------------------------------------------------------------------------------------------------------------------
1186 AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
1187 AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
1188 AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
1189 AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
1190//------------------------------------------------------------------------------------------------------------------------------
1191 AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
1192 AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
1193 AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
1194 AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
1195//------------------------------------------------------------------------------------------------------------------------------
1196 AF1 ARcpF1(AF1 x){return rcp(x);}
1197 AF2 ARcpF2(AF2 x){return rcp(x);}
1198 AF3 ARcpF3(AF3 x){return rcp(x);}
1199 AF4 ARcpF4(AF4 x){return rcp(x);}
1200//------------------------------------------------------------------------------------------------------------------------------
1201 AF1 ARsqF1(AF1 x){return rsqrt(x);}
1202 AF2 ARsqF2(AF2 x){return rsqrt(x);}
1203 AF3 ARsqF3(AF3 x){return rsqrt(x);}
1204 AF4 ARsqF4(AF4 x){return rsqrt(x);}
1205//------------------------------------------------------------------------------------------------------------------------------
1206 AF1 ASatF1(AF1 x){return saturate(x);}
1207 AF2 ASatF2(AF2 x){return saturate(x);}
1208 AF3 ASatF3(AF3 x){return saturate(x);}
1209 AF4 ASatF4(AF4 x){return saturate(x);}
1210//------------------------------------------------------------------------------------------------------------------------------
1211 AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));}
1212 AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
1213 AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
1214 AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
1215////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1216////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1217//_____________________________________________________________/\_______________________________________________________________
1218//==============================================================================================================================
1219// HLSL BYTE
1220//==============================================================================================================================
1221 #ifdef A_BYTE
1222 #endif
1223////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1224////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1225//_____________________________________________________________/\_______________________________________________________________
1226//==============================================================================================================================
1227// HLSL HALF
1228//==============================================================================================================================
1229 #ifdef A_HALF
 #ifdef A_HLSL_6_2
 // Shader Model 6.2+: true 16-bit types.
 #define AH1 float16_t
 #define AH2 float16_t2
 #define AH3 float16_t3
 #define AH4 float16_t4
//------------------------------------------------------------------------------------------------------------------------------
 #define AW1 uint16_t
 #define AW2 uint16_t2
 #define AW3 uint16_t3
 #define AW4 uint16_t4
//------------------------------------------------------------------------------------------------------------------------------
 #define ASW1 int16_t
 #define ASW2 int16_t2
 #define ASW3 int16_t3
 #define ASW4 int16_t4
 #else
 // Pre-6.2 fallback: minimum-precision types (at least 16 bits; storage may still be 32-bit).
 #define AH1 min16float
 #define AH2 min16float2
 #define AH3 min16float3
 #define AH4 min16float4
//------------------------------------------------------------------------------------------------------------------------------
 #define AW1 min16uint
 #define AW2 min16uint2
 #define AW3 min16uint3
 #define AW4 min16uint4
//------------------------------------------------------------------------------------------------------------------------------
 #define ASW1 min16int
 #define ASW2 min16int2
 #define ASW3 min16int3
 #define ASW4 min16int4
 #endif
1261//==============================================================================================================================
1262 // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly).
1263 // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/
1264 AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);}
1265 AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));}
1266 AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);}
1267 AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));}
1268 #define AH2_AU1(x) AH2_AU1_x(AU1(x))
1269 #define AH4_AU2(x) AH4_AU2_x(AU2(x))
1270 #define AW2_AU1(x) AW2_AU1_x(AU1(x))
1271 #define AW4_AU2(x) AW4_AU2_x(AU2(x))
1272//------------------------------------------------------------------------------------------------------------------------------
1273 AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);}
1274 AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));}
1275 AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);}
1276 AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));}
1277 #define AU1_AH2(x) AU1_AH2_x(AH2(x))
1278 #define AU2_AH4(x) AU2_AH4_x(AH4(x))
1279 #define AU1_AW2(x) AU1_AW2_x(AW2(x))
1280 #define AU2_AW4(x) AU2_AW4_x(AW4(x))
//==============================================================================================================================
  // Per-component bit-cast between 16-bit float (AH#) and 16-bit uint (AW#).
  // With SM 6.2 these are true reinterpret casts (asuint16/asfloat16); the
  // fallback routes each component through the 32-bit f32tof16/f16tof32
  // intrinsics instead.
  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
   #define AW1_AH1(x) asuint16(x)
   #define AW2_AH2(x) asuint16(x)
   #define AW3_AH3(x) asuint16(x)
   #define AW4_AH4(x) asuint16(x)
  #else
   #define AW1_AH1(a) AW1(f32tof16(AF1(a)))
   #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y))
   #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z))
   #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w))
  #endif
//------------------------------------------------------------------------------------------------------------------------------
  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
   #define AH1_AW1(x) asfloat16(x)
   #define AH2_AW2(x) asfloat16(x)
   #define AH3_AW3(x) asfloat16(x)
   #define AH4_AW4(x) asfloat16(x)
  #else
   #define AH1_AW1(a) AH1(f16tof32(AU1(a)))
   #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y))
   #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z))
   #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w))
  #endif
//==============================================================================================================================
  // Broadcast ("splat") constructors: AH#_(x) / AW#_(x) replicate one scalar
  // into every component of the corresponding half / 16-bit uint vector.
  AH1 AH1_x(AH1 a){return AH1(a);}
  AH2 AH2_x(AH1 a){return AH2(a,a);}
  AH3 AH3_x(AH1 a){return AH3(a,a,a);}
  AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
  #define AH1_(a) AH1_x(AH1(a))
  #define AH2_(a) AH2_x(AH1(a))
  #define AH3_(a) AH3_x(AH1(a))
  #define AH4_(a) AH4_x(AH1(a))
//------------------------------------------------------------------------------------------------------------------------------
  AW1 AW1_x(AW1 a){return AW1(a);}
  AW2 AW2_x(AW1 a){return AW2(a,a);}
  AW3 AW3_x(AW1 a){return AW3(a,a,a);}
  AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
  #define AW1_(a) AW1_x(AW1(a))
  #define AW2_(a) AW2_x(AW1(a))
  #define AW3_(a) AW3_x(AW1(a))
  #define AW4_(a) AW4_x(AW1(a))
//==============================================================================================================================
  // Math helpers on 16-bit types. The ...SW variants implement signed
  // operations on values carried in unsigned (AW) storage by casting through
  // the signed 16-bit type ASW#.
  AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
  AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
  AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
  AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
//------------------------------------------------------------------------------------------------------------------------------
  // Clamp x into [n,m].
  AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));}
  AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));}
  AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));}
  AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));}
//------------------------------------------------------------------------------------------------------------------------------
  // V_FRACT_F16 (note DX frac() is different).
  AH1 AFractH1(AH1 x){return x-floor(x);}
  AH2 AFractH2(AH2 x){return x-floor(x);}
  AH3 AFractH3(AH3 x){return x-floor(x);}
  AH4 AFractH4(AH4 x){return x-floor(x);}
//------------------------------------------------------------------------------------------------------------------------------
  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);}
  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);}
  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);}
  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);}
//------------------------------------------------------------------------------------------------------------------------------
  // Three-operand max.
  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
//------------------------------------------------------------------------------------------------------------------------------
  // NOTE(review): these cast through the 32-bit signed type ASU# rather than
  // the 16-bit ASW# used by AAbsSW/AShrSW; for inputs with the top bit set the
  // zero-extended compare differs from a true signed-16 max/min. Assumed
  // intentional (matches upstream) — confirm before changing.
  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));}
  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));}
  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));}
  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));}
//------------------------------------------------------------------------------------------------------------------------------
  // Three-operand min.
  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
//------------------------------------------------------------------------------------------------------------------------------
  // NOTE(review): same ASU# (32-bit) cast caveat as AMaxSW above.
  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));}
  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));}
  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));}
  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));}
//------------------------------------------------------------------------------------------------------------------------------
  // Approximate reciprocal.
  AH1 ARcpH1(AH1 x){return rcp(x);}
  AH2 ARcpH2(AH2 x){return rcp(x);}
  AH3 ARcpH3(AH3 x){return rcp(x);}
  AH4 ARcpH4(AH4 x){return rcp(x);}
//------------------------------------------------------------------------------------------------------------------------------
  // Approximate reciprocal square root.
  AH1 ARsqH1(AH1 x){return rsqrt(x);}
  AH2 ARsqH2(AH2 x){return rsqrt(x);}
  AH3 ARsqH3(AH3 x){return rsqrt(x);}
  AH4 ARsqH4(AH4 x){return rsqrt(x);}
//------------------------------------------------------------------------------------------------------------------------------
  // Clamp to [0,1].
  AH1 ASatH1(AH1 x){return saturate(x);}
  AH2 ASatH2(AH2 x){return saturate(x);}
  AH3 ASatH3(AH3 x){return saturate(x);}
  AH4 ASatH4(AH4 x){return saturate(x);}
//------------------------------------------------------------------------------------------------------------------------------
  // Arithmetic (sign-propagating) shift right on AW storage.
  AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
  AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
  AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
  AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
 #endif
1385////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1386////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1387//_____________________________________________________________/\_______________________________________________________________
1388//==============================================================================================================================
1389// HLSL DOUBLE
1390//==============================================================================================================================
 #ifdef A_DUBL
  // 64-bit (double) types: true float64_t under SM 6.2, plain 'double' otherwise.
  #ifdef A_HLSL_6_2
   #define AD1 float64_t
   #define AD2 float64_t2
   #define AD3 float64_t3
   #define AD4 float64_t4
  #else
   #define AD1 double
   #define AD2 double2
   #define AD3 double3
   #define AD4 double4
  #endif
//------------------------------------------------------------------------------------------------------------------------------
  // Broadcast ("splat") constructors.
  AD1 AD1_x(AD1 a){return AD1(a);}
  AD2 AD2_x(AD1 a){return AD2(a,a);}
  AD3 AD3_x(AD1 a){return AD3(a,a,a);}
  AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
  #define AD1_(a) AD1_x(AD1(a))
  #define AD2_(a) AD2_x(AD1(a))
  #define AD3_(a) AD3_x(AD1(a))
  #define AD4_(a) AD4_x(AD1(a))
//==============================================================================================================================
  // Fractional part via a-floor(a) (same convention as the half AFract above).
  AD1 AFractD1(AD1 a){return a-floor(a);}
  AD2 AFractD2(AD2 a){return a-floor(a);}
  AD3 AFractD3(AD3 a){return a-floor(a);}
  AD4 AFractD4(AD4 a){return a-floor(a);}
//------------------------------------------------------------------------------------------------------------------------------
  AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);}
  AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);}
  AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);}
  AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);}
//------------------------------------------------------------------------------------------------------------------------------
  // NOTE(review): rcp()/rsqrt() are approximate intrinsics; their precision on
  // double operands is implementation dependent — confirm before relying on it.
  AD1 ARcpD1(AD1 x){return rcp(x);}
  AD2 ARcpD2(AD2 x){return rcp(x);}
  AD3 ARcpD3(AD3 x){return rcp(x);}
  AD4 ARcpD4(AD4 x){return rcp(x);}
//------------------------------------------------------------------------------------------------------------------------------
  AD1 ARsqD1(AD1 x){return rsqrt(x);}
  AD2 ARsqD2(AD2 x){return rsqrt(x);}
  AD3 ARsqD3(AD3 x){return rsqrt(x);}
  AD4 ARsqD4(AD4 x){return rsqrt(x);}
//------------------------------------------------------------------------------------------------------------------------------
  AD1 ASatD1(AD1 x){return saturate(x);}
  AD2 ASatD2(AD2 x){return saturate(x);}
  AD3 ASatD3(AD3 x){return saturate(x);}
  AD4 ASatD4(AD4 x){return saturate(x);}
 #endif
1438//==============================================================================================================================
1439// HLSL WAVE
1440//==============================================================================================================================
 #ifdef A_WAVE
  // Where 'x' must be a compile time literal.
  // Butterfly swizzle across a wave: each lane reads the value held by the
  // lane whose index is (its own lane index XOR x).
  AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
  AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
  AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
  AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
  // NOTE(review): the three vector variants below reuse the name AWaveXorU1
  // (instead of AWaveXorU2/U3/U4). HLSL function overloading makes this legal,
  // and renaming would break any existing callers, so it is left as-is.
  AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
  AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
  AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
  AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
//------------------------------------------------------------------------------------------------------------------------------
  #ifdef A_HALF
   // Packed-half variants: pack to 32-bit word(s), swizzle, then unpack.
   AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));}
   AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));}
   AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));}
1456 AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU1(WaveReadLaneAt(AU1_AW4(v),WaveGetLaneIndex()^x));}
1457 #endif
1458 #endif
1459//==============================================================================================================================
1460#endif
1461////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1462////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1463////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1464////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1465////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1466////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1467//_____________________________________________________________/\_______________________________________________________________
1468//==============================================================================================================================
1469//
1470//
1471// GPU COMMON
1472//
1473//
1474//==============================================================================================================================
#ifdef A_GPU
 // Negative and positive infinity.
 #define A_INFP_F AF1_AU1(0x7f800000u)
 #define A_INFN_F AF1_AU1(0xff800000u)
//------------------------------------------------------------------------------------------------------------------------------
 // Copy sign from 's' to positive 'd'.
 // ORs the sign bit of 's' into the bit pattern of 'd'; 'd' must already be
 // positive (sign bit clear) for the result to be exact.
 AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));}
 AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));}
 AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));}
 AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));}
//------------------------------------------------------------------------------------------------------------------------------
 // Single operation to return (useful to create a mask to use in lerp for branch free logic),
 //  m=NaN := 0
 //  m>=0  := 0
 //  m<0   := 1
 // Uses the following useful floating point logic,
 //  saturate(+a*(-INF)==-INF) := 0
 //  saturate( 0*(-INF)== NaN) := 0
 //  saturate(-a*(-INF)==+INF) := 1
 AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));}
 AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));}
 AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));}
 AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));}
//------------------------------------------------------------------------------------------------------------------------------
 // Complement of ASigned: 1 for m>0, else 0 (multiplies by +INF instead).
 AF1 AGtZeroF1(AF1 m){return ASatF1(m*AF1_(A_INFP_F));}
 AF2 AGtZeroF2(AF2 m){return ASatF2(m*AF2_(A_INFP_F));}
 AF3 AGtZeroF3(AF3 m){return ASatF3(m*AF3_(A_INFP_F));}
 AF4 AGtZeroF4(AF4 m){return ASatF4(m*AF4_(A_INFP_F));}
1503//==============================================================================================================================
 #ifdef A_HALF
  // Half-precision infinities (fp16 bit patterns 0x7c00 / 0xfc00).
  #ifdef A_HLSL_6_2
   #define A_INFP_H AH1_AW1((uint16_t)0x7c00u)
   #define A_INFN_H AH1_AW1((uint16_t)0xfc00u)
  #else
   #define A_INFP_H AH1_AW1(0x7c00u)
   #define A_INFN_H AH1_AW1(0xfc00u)
  #endif

//------------------------------------------------------------------------------------------------------------------------------
  // Copy sign from 's' to positive 'd' (fp16 sign bit is 0x8000).
  AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));}
  AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));}
  AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));}
  AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));}
//------------------------------------------------------------------------------------------------------------------------------
  // Half variants of ASigned / AGtZero (see the 32-bit versions above for the
  // saturate(m*INF) masking trick).
  AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));}
  AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));}
  AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));}
  AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));}
//------------------------------------------------------------------------------------------------------------------------------
  AH1 AGtZeroH1(AH1 m){return ASatH1(m*AH1_(A_INFP_H));}
  AH2 AGtZeroH2(AH2 m){return ASatH2(m*AH2_(A_INFP_H));}
  AH3 AGtZeroH3(AH3 m){return ASatH3(m*AH3_(A_INFP_H));}
  AH4 AGtZeroH4(AH4 m){return ASatH4(m*AH4_(A_INFP_H));}
 #endif
1529////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1530////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1531//_____________________________________________________________/\_______________________________________________________________
1532//==============================================================================================================================
1533// [FIS] FLOAT INTEGER SORTABLE
1534//------------------------------------------------------------------------------------------------------------------------------
1535// Float to integer sortable.
1536// - If sign bit=0, flip the sign bit (positives).
1537// - If sign bit=1, flip all bits (negatives).
1538// Integer sortable to float.
1539// - If sign bit=1, flip the sign bit (positives).
1540// - If sign bit=0, flip all bits (negatives).
1541// Has nice side effects.
1542// - Larger integers are more positive values.
1543// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
1544// Burns 3 ops for conversion {shift,or,xor}.
1545//==============================================================================================================================
 // Float bits -> sortable integer: positives get the sign bit set, negatives
 // get all bits flipped (per the [FIS] description above), so plain unsigned
 // integer ordering matches float ordering.
 AU1 AFisToU1(AU1 x){return x^(( AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
 AU1 AFisFromU1(AU1 x){return x^((~AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
//------------------------------------------------------------------------------------------------------------------------------
 // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value).
 AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
 AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
//------------------------------------------------------------------------------------------------------------------------------
 #ifdef A_HALF
  // 16-bit variants (fp16 sign bit is 0x8000).
  AW1 AFisToW1(AW1 x){return x^(( AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
  AW1 AFisFromW1(AW1 x){return x^((~AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
//------------------------------------------------------------------------------------------------------------------------------
  AW2 AFisToW2(AW2 x){return x^(( AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
  AW2 AFisFromW2(AW2 x){return x^((~AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
 #endif
1560////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1561////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1562//_____________________________________________________________/\_______________________________________________________________
1563//==============================================================================================================================
1564// [PERM] V_PERM_B32
1565//------------------------------------------------------------------------------------------------------------------------------
1566// Support for V_PERM_B32 started in the 3rd generation of GCN.
1567//------------------------------------------------------------------------------------------------------------------------------
1568// yyyyxxxx - The 'i' input.
1569// 76543210
1570// ========
1571// HGFEDCBA - Naming on permutation.
1572//------------------------------------------------------------------------------------------------------------------------------
1573// TODO
1574// ====
1575// - Make sure compiler optimizes this.
1576//==============================================================================================================================
 #ifdef A_HALF
  // Byte-permute helpers emulating V_PERM_B32 (see the naming diagram above):
  // each function name spells the four output bytes, selected from the eight
  // input bytes {A..H} of i.x (low) and i.y (high); '0' means a zeroed byte.
  AU1 APerm0E0A(AU2 i){return((i.x    )&0xffu)|((i.y<<16)&0xff0000u);}
  AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);}
  AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y    )&0xff0000u);}
  AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);}
//------------------------------------------------------------------------------------------------------------------------------
  // Insert one byte of i.x into a single byte lane of i.y.
  AU1 APermHGFA(AU2 i){return((i.x    )&0x000000ffu)|(i.y&0xffffff00u);}
  AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);}
  AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
  AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
  AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);}
  AU1 APermHCFE(AU2 i){return((i.x    )&0x00ff0000u)|(i.y&0xff00ffffu);}
  AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);}
  AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);}
//------------------------------------------------------------------------------------------------------------------------------
  // Interleave bytes of the two input words.
  AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);}
  AU1 APermGECA(AU2 i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));}
 #endif
1595////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1596////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1597//_____________________________________________________________/\_______________________________________________________________
1598//==============================================================================================================================
1599// [BUC] BYTE UNSIGNED CONVERSION
1600//------------------------------------------------------------------------------------------------------------------------------
1601// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation.
1602// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively.
1603//------------------------------------------------------------------------------------------------------------------------------
1604// OPCODE NOTES
1605// ============
1606// GCN does not do UNORM or SNORM for bytes in opcodes.
1607// - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float.
// - V_CVT_PKACCUM_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer).
1609// V_PERM_B32 does byte packing with ability to zero fill bytes as well.
1610// - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo.
1611//------------------------------------------------------------------------------------------------------------------------------
1612// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops.
1613// ==== =====
1614// 0 : 0
1615// 1 : 1
1616// ...
1617// 255 : 255
1618// : 256 (just outside the encoding range)
1619//------------------------------------------------------------------------------------------------------------------------------
1620// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
1621// ==== =====
1622// 0 : 0
1623// 1 : 1/512
1624// 2 : 1/256
1625// ...
1626// 64 : 1/8
1627// 128 : 1/4
1628// 255 : 255/512
1629// : 1/2 (just outside the encoding range)
1630//------------------------------------------------------------------------------------------------------------------------------
1631// OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES
1632// ============================================
1633// r=ABuc0FromU1(i)
1634// V_CVT_F32_UBYTE0 r,i
1635// --------------------------------------------
1636// r=ABuc0ToU1(d,i)
1637// V_CVT_PKACCUM_U8_F32 r,i,0,d
1638// --------------------------------------------
1639// d=ABuc0FromU2(i)
1640// Where 'k0' is an SGPR with 0x0E0A
1641// Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits
1642// V_PERM_B32 d,i.x,i.y,k0
1643// V_PK_FMA_F16 d,d,k1.x,0
1644// --------------------------------------------
1645// r=ABuc0ToU2(d,i)
1646// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits
1647// Where 'k1' is an SGPR with 0x????
1648// Where 'k2' is an SGPR with 0x????
1649// V_PK_FMA_F16 i,i,k0.x,0
1650// V_PERM_B32 r.x,i,i,k1
1651// V_PERM_B32 r.y,i,i,k2
1652//==============================================================================================================================
 // Peak range for 32-bit and 16-bit operations.
 #define A_BUC_32 (255.0)
 #define A_BUC_16 (255.0/512.0)
//==============================================================================================================================
 #if 1
  // Designed to be one V_CVT_PKACCUM_U8_F32.
  // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32.
  // ABuc{n}ToU1(d,i): store float i (range 0..255) as byte n of word d,
  // leaving the other three bytes of d untouched.
  AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u)    )&(0x000000ffu));}
  AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));}
  AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));}
  AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));}
//------------------------------------------------------------------------------------------------------------------------------
  // Designed to be one V_CVT_F32_UBYTE*.
  // ABuc{n}FromU1(i): extract byte n of i as a float in 0..255.
  AF1 ABuc0FromU1(AU1 i){return AF1((i    )&255u);}
  AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);}
  AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);}
  AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);}
 #endif
1671//==============================================================================================================================
 #ifdef A_HALF
  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
  // The 1/32768 scale drops the byte code into the fp16 denormal range (see
  // the [BUC] table above) so the low byte of each packed half carries it.
  AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0);
   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
//------------------------------------------------------------------------------------------------------------------------------
  // Designed for 3 ops to do SOA to AOS and conversion.
  AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
   return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
  AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
   return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
  AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
   return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
  AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
   return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
//------------------------------------------------------------------------------------------------------------------------------
  // Designed for 2 ops to do both AOS to SOA, and conversion.
  AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);}
  AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);}
  AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);}
  AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);}
 #endif
1693////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1694////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1695//_____________________________________________________________/\_______________________________________________________________
1696//==============================================================================================================================
1697// [BSC] BYTE SIGNED CONVERSION
1698//------------------------------------------------------------------------------------------------------------------------------
1699// Similar to [BUC].
1700// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively.
1701//------------------------------------------------------------------------------------------------------------------------------
1702// ENCODING (without zero-based encoding)
1703// ========
1704// 0 = unused (can be used to mean something else)
1705// 1 = lowest value
// 128 = exact zero center (zero-based encoding)
1707// 255 = highest value
1708//------------------------------------------------------------------------------------------------------------------------------
1709// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
1710// This is useful if there is a desire for cleared values to decode as zero.
1711//------------------------------------------------------------------------------------------------------------------------------
1712// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
1713// ==== =====
1714// 0 : -127/512 (unused)
1715// 1 : -126/512
1716// 2 : -125/512
1717// ...
1718// 128 : 0
1719// ...
1720// 255 : 127/512
1721// : 1/4 (just outside the encoding range)
1722//==============================================================================================================================
 // Peak range for 32-bit and 16-bit operations.
 #define A_BSC_32 (127.0)
 #define A_BSC_16 (127.0/512.0)
//==============================================================================================================================
 #if 1
  // ABsc{n}ToU1: signed counterpart of ABuc ToU1 — biases i by +128 so the
  // byte encodes values around a 128 center (see [BSC] table above).
  AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u)    )&(0x000000ffu));}
  AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));}
  AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));}
  AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));}
//------------------------------------------------------------------------------------------------------------------------------
  // Zero-based [Zb] forms additionally XOR the stored byte's MSB so an
  // all-zero word decodes to exact zero (see [BSC] notes above).
  AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u)    )&(0x000000ffu)))^0x00000080u;}
  AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;}
  AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;}
  AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;}
//------------------------------------------------------------------------------------------------------------------------------
  // Decode: extract byte n and remove the +128 bias.
  AF1 ABsc0FromU1(AU1 i){return AF1((i    )&255u)-128.0;}
  AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;}
  AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;}
  AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;}
//------------------------------------------------------------------------------------------------------------------------------
  // Zero-based decode: undo the MSB flip, then remove the bias.
  AF1 ABsc0FromZbU1(AU1 i){return AF1(((i    )&255u)^0x80u)-128.0;}
  AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;}
  AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;}
  AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;}
 #endif
1748//==============================================================================================================================
 #ifdef A_HALF
 // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
 // The *(1/32768)+0.25/32768 scale-and-bias maps the byte-scaled value into an FP16 bit pattern whose low byte
 // carries the encoding (the "16-bit denormal trick" of the ABsc table above — exact bit layout not re-derived
 // here; TODO(review) confirm against the encoding table). APerm*() then byte-selects lanes into place.
 AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
  return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
//------------------------------------------------------------------------------------------------------------------------------
 // Insert the byte-scaled pair 'i' into byte lane {0,1,2,3} of the two words of 'd' (other bytes kept by APerm*()).
 AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
  return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
 AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
  return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
 AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
  return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
 AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
  return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
//------------------------------------------------------------------------------------------------------------------------------
 // 'Zb' (zero-based) forms: XOR the top bit of both packed bytes (0x00800080) before insertion.
 AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
  return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
 AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
  return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
 AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
  return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
 AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
  return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
//------------------------------------------------------------------------------------------------------------------------------
 // Extract byte lane {0,1,2,3} of each word into the low byte of a 16-bit lane, then undo the scale-and-bias.
 AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);}
 AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);}
 AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);}
 AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);}
//------------------------------------------------------------------------------------------------------------------------------
 // Zero-based extract: flip the top bit of each packed byte before converting back.
 AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
 AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
 AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
 AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
 #endif
1782////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1783////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1784//_____________________________________________________________/\_______________________________________________________________
1785//==============================================================================================================================
1786// HALF APPROXIMATIONS
1787//------------------------------------------------------------------------------------------------------------------------------
1788// These support only positive inputs.
1789// Did not see value yet in specialization for range.
1790// Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
1791// With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
1792// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
1793// And co-execution would require a compiler interleaving a lot of independent work for packed usage.
1794//------------------------------------------------------------------------------------------------------------------------------
1795// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
1796// Same with sqrt(), as this could be x*rsq() (7 ops).
1797//==============================================================================================================================
 #ifdef A_HALF
 // Minimize squared error across full positive range, 2 ops.
 // The 0x1de2 based approximation maps {0 to 1} inputs to < 1 outputs.
 // Bit trick: shifting the FP16 bit pattern right by 1 halves the exponent (approximate sqrt);
 // adding 0x1de2 restores the exponent bias.
 AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));}
 AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));}
 AH3 APrxLoSqrtH3(AH3 a){return AH3_AW3((AW3_AH3(a)>>AW3_(1))+AW3_(0x1de2));}
 AH4 APrxLoSqrtH4(AH4 a){return AH4_AW4((AW4_AH4(a)>>AW4_(1))+AW4_(0x1de2));}
//------------------------------------------------------------------------------------------------------------------------------
 // Lower precision estimation, 1 op.
 // Minimize squared error across {smallest normal to 16384.0}.
 // Bit trick: subtracting the FP16 bit pattern from the magic constant negates the exponent (approximate 1/a).
 AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));}
 AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));}
 AH3 APrxLoRcpH3(AH3 a){return AH3_AW3(AW3_(0x7784)-AW3_AH3(a));}
 AH4 APrxLoRcpH4(AH4 a){return AH4_AW4(AW4_(0x7784)-AW4_AH4(a));}
//------------------------------------------------------------------------------------------------------------------------------
 // Medium precision estimation, one Newton Raphson iteration, 3 ops.
 // 'b' is the 1-op estimate above (different magic); b*(2-b*a) is the standard NR refinement for 1/a.
 AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));}
 AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));}
 AH3 APrxMedRcpH3(AH3 a){AH3 b=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));return b*(-b*a+AH3_(2.0));}
 AH4 APrxMedRcpH4(AH4 a){AH4 b=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));return b*(-b*a+AH4_(2.0));}
//------------------------------------------------------------------------------------------------------------------------------
 // Minimize squared error across {smallest normal to 16384.0}, 2 ops.
 // Bit trick: magic minus half the bit pattern scales the exponent by -1/2 (approximate 1/sqrt(a)).
 AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));}
 AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));}
 AH3 APrxLoRsqH3(AH3 a){return AH3_AW3(AW3_(0x59a3)-(AW3_AH3(a)>>AW3_(1)));}
 AH4 APrxLoRsqH4(AH4 a){return AH4_AW4(AW4_(0x59a3)-(AW4_AH4(a)>>AW4_(1)));}
 #endif
1825////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1826////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1827//_____________________________________________________________/\_______________________________________________________________
1828//==============================================================================================================================
1829// FLOAT APPROXIMATIONS
1830//------------------------------------------------------------------------------------------------------------------------------
1831// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
1832// - Idea dates back to SGI, then to Quake 3, etc.
1833// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
1834// - sqrt(x)=rsqrt(x)*x
1835// - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
1836// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
1837//------------------------------------------------------------------------------------------------------------------------------
1838// These below are from perhaps less complete searching for optimal.
1839// Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
1840// So these match up well with the half approximations.
1841//==============================================================================================================================
 // 32-bit analogs of the FP16 approximations above (see the FLOAT APPROXIMATIONS notes).
 // Sqrt: shift the bit pattern right by 1 (halves the exponent), re-bias with the magic constant.
 AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));}
 // Rcp: magic minus bit pattern negates the exponent.
 AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));}
 // Medium rcp: one Newton Raphson step on the 1-op estimate, b*(2-b*a).
 AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));}
 // Rsq: magic minus half the bit pattern (the classic fast inverse square root form).
 AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));}
//------------------------------------------------------------------------------------------------------------------------------
 // 2-component versions (same constants, per component).
 AF2 APrxLoSqrtF2(AF2 a){return AF2_AU2((AU2_AF2(a)>>AU2_(1))+AU2_(0x1fbc4639));}
 AF2 APrxLoRcpF2(AF2 a){return AF2_AU2(AU2_(0x7ef07ebb)-AU2_AF2(a));}
 AF2 APrxMedRcpF2(AF2 a){AF2 b=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));return b*(-b*a+AF2_(2.0));}
 AF2 APrxLoRsqF2(AF2 a){return AF2_AU2(AU2_(0x5f347d74)-(AU2_AF2(a)>>AU2_(1)));}
//------------------------------------------------------------------------------------------------------------------------------
 // 3-component versions.
 AF3 APrxLoSqrtF3(AF3 a){return AF3_AU3((AU3_AF3(a)>>AU3_(1))+AU3_(0x1fbc4639));}
 AF3 APrxLoRcpF3(AF3 a){return AF3_AU3(AU3_(0x7ef07ebb)-AU3_AF3(a));}
 AF3 APrxMedRcpF3(AF3 a){AF3 b=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));return b*(-b*a+AF3_(2.0));}
 AF3 APrxLoRsqF3(AF3 a){return AF3_AU3(AU3_(0x5f347d74)-(AU3_AF3(a)>>AU3_(1)));}
//------------------------------------------------------------------------------------------------------------------------------
 // 4-component versions.
 AF4 APrxLoSqrtF4(AF4 a){return AF4_AU4((AU4_AF4(a)>>AU4_(1))+AU4_(0x1fbc4639));}
 AF4 APrxLoRcpF4(AF4 a){return AF4_AU4(AU4_(0x7ef07ebb)-AU4_AF4(a));}
 AF4 APrxMedRcpF4(AF4 a){AF4 b=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));return b*(-b*a+AF4_(2.0));}
 AF4 APrxLoRsqF4(AF4 a){return AF4_AU4(AU4_(0x5f347d74)-(AU4_AF4(a)>>AU4_(1)));}
1861////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1862////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1863//_____________________________________________________________/\_______________________________________________________________
1864//==============================================================================================================================
1865// PQ APPROXIMATIONS
1866//------------------------------------------------------------------------------------------------------------------------------
// PQ is very close to x^(1/8). The functions below use the fast float approximation method to do
1868// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%.
1869//==============================================================================================================================
// Helpers
 // Quart(a) = a^4, Oct(a) = a^8 (by repeated squaring).
 AF1 Quart(AF1 a) { a = a * a; return a * a;}
 AF1 Oct(AF1 a) { a = a * a; a = a * a; return a * a; }
 AF2 Quart(AF2 a) { a = a * a; return a * a; }
 AF2 Oct(AF2 a) { a = a * a; a = a * a; return a * a; }
 AF3 Quart(AF3 a) { a = a * a; return a * a; }
 AF3 Oct(AF3 a) { a = a * a; a = a * a; return a * a; }
 AF4 Quart(AF4 a) { a = a * a; return a * a; }
 AF4 Oct(AF4 a) { a = a * a; a = a * a; return a * a; }
 //------------------------------------------------------------------------------------------------------------------------------
 // PQ ~ x^(1/8) (see PQ APPROXIMATIONS note above), so:
 //  - *ToGamma2/*ToLinear are exact 4th/8th powers.
 //  - Lo forms are bit-trick 4th/8th roots (exponent >>2 or >>3 plus magic re-bias).
 //  - Med forms refine the Lo root with one Newton Raphson step on b^4-a (or b^8-a).
 //  - High forms use exact sqrt chains: sqrt(sqrt(a)) = a^(1/4), sqrt^3 = a^(1/8).
 AF1 APrxPQToGamma2(AF1 a) { return Quart(a); }
 AF1 APrxPQToLinear(AF1 a) { return Oct(a); }
 AF1 APrxLoGamma2ToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); }
 AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); AF1 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
 AF1 APrxHighGamma2ToPQ(AF1 a) { return sqrt(sqrt(a)); }
 AF1 APrxLoLinearToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); }
 AF1 APrxMedLinearToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); AF1 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
 AF1 APrxHighLinearToPQ(AF1 a) { return sqrt(sqrt(sqrt(a))); }
 //------------------------------------------------------------------------------------------------------------------------------
 // 2-component versions (AF1_ scalar constants broadcast over the vector).
 AF2 APrxPQToGamma2(AF2 a) { return Quart(a); }
 AF2 APrxPQToLinear(AF2 a) { return Oct(a); }
 AF2 APrxLoGamma2ToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); }
 AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); AF2 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
 AF2 APrxHighGamma2ToPQ(AF2 a) { return sqrt(sqrt(a)); }
 AF2 APrxLoLinearToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); }
 AF2 APrxMedLinearToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); AF2 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
 AF2 APrxHighLinearToPQ(AF2 a) { return sqrt(sqrt(sqrt(a))); }
 //------------------------------------------------------------------------------------------------------------------------------
 // 3-component versions.
 AF3 APrxPQToGamma2(AF3 a) { return Quart(a); }
 AF3 APrxPQToLinear(AF3 a) { return Oct(a); }
 AF3 APrxLoGamma2ToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); }
 AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); AF3 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
 AF3 APrxHighGamma2ToPQ(AF3 a) { return sqrt(sqrt(a)); }
 AF3 APrxLoLinearToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); }
 AF3 APrxMedLinearToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); AF3 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
 AF3 APrxHighLinearToPQ(AF3 a) { return sqrt(sqrt(sqrt(a))); }
 //------------------------------------------------------------------------------------------------------------------------------
 // 4-component versions.
 AF4 APrxPQToGamma2(AF4 a) { return Quart(a); }
 AF4 APrxPQToLinear(AF4 a) { return Oct(a); }
 AF4 APrxLoGamma2ToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); }
 AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); AF4 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
 AF4 APrxHighGamma2ToPQ(AF4 a) { return sqrt(sqrt(a)); }
 AF4 APrxLoLinearToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); }
 AF4 APrxMedLinearToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); AF4 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
 AF4 APrxHighLinearToPQ(AF4 a) { return sqrt(sqrt(sqrt(a))); }
1915////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1916////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1917//_____________________________________________________________/\_______________________________________________________________
1918//==============================================================================================================================
1919// PARABOLIC SIN & COS
1920//------------------------------------------------------------------------------------------------------------------------------
1921// Approximate answers to transcendental questions.
1922//------------------------------------------------------------------------------------------------------------------------------
1923//==============================================================================================================================
 #if 1
 // Valid input range is {-1 to 1} representing {0 to 2 pi}.
 // Output range is {-1/4 to 1/4} representing {-1 to 1}.
 // Parabolic sine: x*|x|-x is a two-piece parabola matching sin at the zero crossings and extrema.
 AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD.
 AF2 APSinF2(AF2 x){return x*abs(x)-x;}
 // Cosine = sine phase-shifted a quarter turn: remap with fract(x*0.5+0.75) then rescale to {-1 to 1}.
 AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT
 AF2 APCosF2(AF2 x){x=AFractF2(x*AF2_(0.5)+AF2_(0.75));x=x*AF2_(2.0)-AF2_(1.0);return APSinF2(x);}
 // Returns {sin,cos}: evaluates the packed sine at {x, phase-shifted x}.
 AF2 APSinCosF1(AF1 x){AF1 y=AFractF1(x*AF1_(0.5)+AF1_(0.75));y=y*AF1_(2.0)-AF1_(1.0);return APSinF2(AF2(x,y));}
 #endif
//------------------------------------------------------------------------------------------------------------------------------
 #ifdef A_HALF
 // For a packed {sin,cos} pair,
 // - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
 // - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
 // FP16 versions of the parabolic forms above; same ranges and remapping.
 AH1 APSinH1(AH1 x){return x*abs(x)-x;}
 AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA
 AH1 APCosH1(AH1 x){x=AFractH1(x*AH1_(0.5)+AH1_(0.75));x=x*AH1_(2.0)-AH1_(1.0);return APSinH1(x);}
 AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND
 AH2 APSinCosH1(AH1 x){AH1 y=AFractH1(x*AH1_(0.5)+AH1_(0.75));y=y*AH1_(2.0)-AH1_(1.0);return APSinH2(AH2(x,y));}
 #endif
1944////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1945////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1946//_____________________________________________________________/\_______________________________________________________________
1947//==============================================================================================================================
1948// [ZOL] ZERO ONE LOGIC
1949//------------------------------------------------------------------------------------------------------------------------------
1950// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit.
1951//------------------------------------------------------------------------------------------------------------------------------
1952// 0 := false
1953// 1 := true
1954//------------------------------------------------------------------------------------------------------------------------------
1955// AndNot(x,y) -> !(x&y) .... One op.
1956// AndOr(x,y,z) -> (x&y)|z ... One op.
1957// GtZero(x) -> x>0.0 ..... One op.
1958// Sel(x,y,z) -> x?y:z ..... Two ops, has no precision loss.
1959// Signed(x) -> x<0.0 ..... One op.
1960// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer.
1961//------------------------------------------------------------------------------------------------------------------------------
1962// OPTIMIZATION NOTES
1963// ==================
1964// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'.
1965// For example 'a.xy*k.xx+k.yy'.
1966//==============================================================================================================================
 #if 1
 // Integer ZOL: logic on {0,1} stored in unsigned ints (see the ZOL table above).
 // AND via min(): result is 1 only when both inputs are 1.
 AU1 AZolAndU1(AU1 x,AU1 y){return min(x,y);}
 AU2 AZolAndU2(AU2 x,AU2 y){return min(x,y);}
 AU3 AZolAndU3(AU3 x,AU3 y){return min(x,y);}
 AU4 AZolAndU4(AU4 x,AU4 y){return min(x,y);}
//------------------------------------------------------------------------------------------------------------------------------
 // NOT via XOR of the low bit (inputs must be exactly 0 or 1).
 AU1 AZolNotU1(AU1 x){return x^AU1_(1);}
 AU2 AZolNotU2(AU2 x){return x^AU2_(1);}
 AU3 AZolNotU3(AU3 x){return x^AU3_(1);}
 AU4 AZolNotU4(AU4 x){return x^AU4_(1);}
//------------------------------------------------------------------------------------------------------------------------------
 // OR via max(): result is 1 when either input is 1.
 AU1 AZolOrU1(AU1 x,AU1 y){return max(x,y);}
 AU2 AZolOrU2(AU2 x,AU2 y){return max(x,y);}
 AU3 AZolOrU3(AU3 x,AU3 y){return max(x,y);}
 AU4 AZolOrU4(AU4 x,AU4 y){return max(x,y);}
//==============================================================================================================================
 // Float {0.0,1.0} -> integer {0,1} conversion (plain float-to-uint cast).
 AU1 AZolF1ToU1(AF1 x){return AU1(x);}
 AU2 AZolF2ToU2(AF2 x){return AU2(x);}
 AU3 AZolF3ToU3(AF3 x){return AU3(x);}
 AU4 AZolF4ToU4(AF4 x){return AU4(x);}
//------------------------------------------------------------------------------------------------------------------------------
 // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled).
 // Fused NOT during the conversion: 1.0-x maps {0->1, 1->0}.
 AU1 AZolNotF1ToU1(AF1 x){return AU1(AF1_(1.0)-x);}
 AU2 AZolNotF2ToU2(AF2 x){return AU2(AF2_(1.0)-x);}
 AU3 AZolNotF3ToU3(AF3 x){return AU3(AF3_(1.0)-x);}
 AU4 AZolNotF4ToU4(AF4 x){return AU4(AF4_(1.0)-x);}
//------------------------------------------------------------------------------------------------------------------------------
 // Integer {0,1} -> float {0.0,1.0} conversion.
 AF1 AZolU1ToF1(AU1 x){return AF1(x);}
 AF2 AZolU2ToF2(AU2 x){return AF2(x);}
 AF3 AZolU3ToF3(AU3 x){return AF3(x);}
 AF4 AZolU4ToF4(AU4 x){return AF4(x);}
//==============================================================================================================================
 // Float ZOL: logic on {0.0,1.0}.
 AF1 AZolAndF1(AF1 x,AF1 y){return min(x,y);}
 AF2 AZolAndF2(AF2 x,AF2 y){return min(x,y);}
 AF3 AZolAndF3(AF3 x,AF3 y){return min(x,y);}
 AF4 AZolAndF4(AF4 x,AF4 y){return min(x,y);}
//------------------------------------------------------------------------------------------------------------------------------
 // AndNot: !(x&y) as one FMA, 1-x*y.
 // NOTE(review): the 'ASol' prefix is presumably a typo for 'AZol' — kept as-is to preserve the upstream API.
 AF1 ASolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);}
 AF2 ASolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);}
 AF3 ASolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);}
 AF4 ASolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);}
//------------------------------------------------------------------------------------------------------------------------------
 // AndOr: (x&y)|z as one saturated FMA.
 AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){return ASatF1(x*y+z);}
 AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){return ASatF2(x*y+z);}
 AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){return ASatF3(x*y+z);}
 AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){return ASatF4(x*y+z);}
//------------------------------------------------------------------------------------------------------------------------------
 // GtZero: x>0.0 via multiply by +INF then saturate (positive -> 1.0; negative -> 0.0).
 // NOTE(review): 0*INF is NaN, so this relies on ASat* clamping NaN to 0 — confirm per target.
 AF1 AZolGtZeroF1(AF1 x){return ASatF1(x*AF1_(A_INFP_F));}
 AF2 AZolGtZeroF2(AF2 x){return ASatF2(x*AF2_(A_INFP_F));}
 AF3 AZolGtZeroF3(AF3 x){return ASatF3(x*AF3_(A_INFP_F));}
 AF4 AZolGtZeroF4(AF4 x){return ASatF4(x*AF4_(A_INFP_F));}
//------------------------------------------------------------------------------------------------------------------------------
 // NOT on {0.0,1.0}.
 AF1 AZolNotF1(AF1 x){return AF1_(1.0)-x;}
 AF2 AZolNotF2(AF2 x){return AF2_(1.0)-x;}
 AF3 AZolNotF3(AF3 x){return AF3_(1.0)-x;}
 AF4 AZolNotF4(AF4 x){return AF4_(1.0)-x;}
//------------------------------------------------------------------------------------------------------------------------------
 // OR via max().
 AF1 AZolOrF1(AF1 x,AF1 y){return max(x,y);}
 AF2 AZolOrF2(AF2 x,AF2 y){return max(x,y);}
 AF3 AZolOrF3(AF3 x,AF3 y){return max(x,y);}
 AF4 AZolOrF4(AF4 x,AF4 y){return max(x,y);}
//------------------------------------------------------------------------------------------------------------------------------
 // Sel: x?y:z as two FMAs with no precision loss: r=z*(1-x), then x*y+r (x must be exactly 0.0 or 1.0).
 AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 r=(-x)*z+z;return x*y+r;}
 AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 r=(-x)*z+z;return x*y+r;}
 AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 r=(-x)*z+z;return x*y+r;}
 AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 r=(-x)*z+z;return x*y+r;}
//------------------------------------------------------------------------------------------------------------------------------
 // Signed: x<0.0 via multiply by -INF then saturate.
 AF1 AZolSignedF1(AF1 x){return ASatF1(x*AF1_(A_INFN_F));}
 AF2 AZolSignedF2(AF2 x){return ASatF2(x*AF2_(A_INFN_F));}
 AF3 AZolSignedF3(AF3 x){return ASatF3(x*AF3_(A_INFN_F));}
 AF4 AZolSignedF4(AF4 x){return ASatF4(x*AF4_(A_INFN_F));}
//------------------------------------------------------------------------------------------------------------------------------
 // ZeroPass: x?0:y, passing 'y' through as raw bits (safe for integer aliasing).
 // NOTE(review): on vector types the != compare and ternary select act on the whole vector,
 // not per component (GLSL semantics) — confirm this is the intent at call sites.
 AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));}
 AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2_AU2((AU2_AF2(x)!=AU2_(0))?AU2_(0):AU2_AF2(y));}
 AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3_AU3((AU3_AF3(x)!=AU3_(0))?AU3_(0):AU3_AF3(y));}
 AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4_AU4((AU4_AF4(x)!=AU4_(0))?AU4_(0):AU4_AF4(y));}
 #endif
2044//==============================================================================================================================
 #ifdef A_HALF
 // 16-bit integer ZOL (same min/xor/max constructions as the 32-bit versions above).
 AW1 AZolAndW1(AW1 x,AW1 y){return min(x,y);}
 AW2 AZolAndW2(AW2 x,AW2 y){return min(x,y);}
 AW3 AZolAndW3(AW3 x,AW3 y){return min(x,y);}
 AW4 AZolAndW4(AW4 x,AW4 y){return min(x,y);}
//------------------------------------------------------------------------------------------------------------------------------
 AW1 AZolNotW1(AW1 x){return x^AW1_(1);}
 AW2 AZolNotW2(AW2 x){return x^AW2_(1);}
 AW3 AZolNotW3(AW3 x){return x^AW3_(1);}
 AW4 AZolNotW4(AW4 x){return x^AW4_(1);}
//------------------------------------------------------------------------------------------------------------------------------
 AW1 AZolOrW1(AW1 x,AW1 y){return max(x,y);}
 AW2 AZolOrW2(AW2 x,AW2 y){return max(x,y);}
 AW3 AZolOrW3(AW3 x,AW3 y){return max(x,y);}
 AW4 AZolOrW4(AW4 x,AW4 y){return max(x,y);}
//==============================================================================================================================
 // Uses denormal trick.
 // AH1_AW1(AW1_(1)) is bit pattern 1 viewed as FP16 (the smallest denormal); multiplying a {0.0,1.0}
 // half by it yields bit pattern {0,1}, which the reinterpret exposes as an integer {0,1}.
 AW1 AZolH1ToW1(AH1 x){return AW1_AH1(x*AH1_AW1(AW1_(1)));}
 AW2 AZolH2ToW2(AH2 x){return AW2_AH2(x*AH2_AW2(AW2_(1)));}
 AW3 AZolH3ToW3(AH3 x){return AW3_AH3(x*AH3_AW3(AW3_(1)));}
 AW4 AZolH4ToW4(AH4 x){return AW4_AH4(x*AH4_AW4(AW4_(1)));}
//------------------------------------------------------------------------------------------------------------------------------
 // AMD arch lacks a packed conversion opcode.
 // Reverse trick: integer {0,1} times the bit pattern of 1.0h gives the bit pattern of {0.0h,1.0h}.
 // NOTE(review): upstream names the W3/W4 variants 'W1ToH3'/'W2ToH4' — presumably typos for
 // AZolW3ToH3/AZolW4ToH4, kept unchanged to match the upstream FidelityFX API.
 AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));}
 AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));}
 AH3 AZolW1ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));}
 AH4 AZolW2ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));}
//==============================================================================================================================
 // FP16 ZOL on {0.0,1.0}: same constructions as the 32-bit float versions above.
 AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);}
 AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);}
 AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);}
 AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);}
//------------------------------------------------------------------------------------------------------------------------------
 // NOTE(review): 'ASol' prefix is presumably a typo for 'AZol' — kept as-is to preserve the upstream API.
 AH1 ASolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);}
 AH2 ASolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);}
 AH3 ASolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);}
 AH4 ASolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);}
 AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);}
 AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);}
 AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);}
//------------------------------------------------------------------------------------------------------------------------------
 // x>0.0 via multiply by +INF and saturate (see the 32-bit note regarding 0*INF -> NaN).
 AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));}
 AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));}
 AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));}
 AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;}
 AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;}
 AH3 AZolNotH3(AH3 x){return AH3_(1.0)-x;}
 AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);}
 AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);}
 AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);}
 AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);}
//------------------------------------------------------------------------------------------------------------------------------
 // x?y:z as two FMAs (x must be exactly 0.0 or 1.0).
 AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 r=(-x)*z+z;return x*y+r;}
 AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 r=(-x)*z+z;return x*y+r;}
 AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 r=(-x)*z+z;return x*y+r;}
 AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 r=(-x)*z+z;return x*y+r;}
//------------------------------------------------------------------------------------------------------------------------------
 // x<0.0 via multiply by -INF and saturate.
 AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));}
 AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));}
 AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));}
 AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));}
 #endif
2113////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2114////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2115//_____________________________________________________________/\_______________________________________________________________
2116//==============================================================================================================================
2117// COLOR CONVERSIONS
2118//------------------------------------------------------------------------------------------------------------------------------
2119// These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
2120// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
2121// These are branch free implementations.
2122// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
2123//------------------------------------------------------------------------------------------------------------------------------
2124// TRANSFER FUNCTIONS
2125// ==================
2126// 709 ..... Rec709 used for some HDTVs
2127// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
2128// Pq ...... PQ native for HDR10
2129// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
2130// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
2131// Three ... Gamma 3.0, less fast, but good for HDR.
2132//------------------------------------------------------------------------------------------------------------------------------
2133// KEEPING TO SPEC
2134// ===============
2135// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
2136// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
2137// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
2138// Also there is a slight step in the transition regions.
2139// Precision of the coefficients in the spec being the likely cause.
// Main usage case of the sRGB code is to do the linear->sRGB conversion in a compute shader before store.
2141// This is to work around lack of hardware (typically only ROP does the conversion for free).
2142// To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free).
2143// So this header keeps with the spec.
2144// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear.
2145// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
2146//------------------------------------------------------------------------------------------------------------------------------
2147// FOR PQ
2148// ======
2149// Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2.
2150// All constants are only specified to FP32 precision.
2151// External PQ source reference,
2152// - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
2153//------------------------------------------------------------------------------------------------------------------------------
2154// PACKED VERSIONS
2155// ===============
2156// These are the A*H2() functions.
// There are no PQ functions as FP16 seemed to not have enough precision for the conversion.
2158// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
2159// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
2160//------------------------------------------------------------------------------------------------------------------------------
2161// NOTES
2162// =====
2163// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
2164//==============================================================================================================================
 #if 1
 // Linear -> Rec.709 (float).  'j' packs {linear-segment bound (0.018*4.5),
 // linear slope 4.5, power 0.45}; 'k' packs the {1.099,-0.099} power-segment
 // scale/bias.  Per the header notes above, the segment select is branch free:
 // clamp() picks between the linear ramp and the power curve.
 AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
  return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );}
 AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
 AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
//------------------------------------------------------------------------------------------------------------------------------
 // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
 AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));}
 AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));}
 AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));}
//------------------------------------------------------------------------------------------------------------------------------
 // Linear -> PQ (HDR10 native, see "FOR PQ" notes above).  Input and output are
 // {0.0-1.0}; constants are only specified to FP32 precision.
 // NOTE(review): the AF2 and AF3 overloads below are also named AToPqF1 -- this
 // matches the upstream AMD header (likely a copy/paste typo for AToPqF2/F3).
 // GLSL/HLSL overload by argument type, so existing callers still resolve;
 // kept byte-identical to stay diffable against upstream.
 AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
  return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
 AF2 AToPqF1(AF2 x){AF2 p=pow(x,AF2_(0.159302));
  return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));}
 AF3 AToPqF1(AF3 x){AF3 p=pow(x,AF3_(0.159302));
  return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));}
//------------------------------------------------------------------------------------------------------------------------------
 // Linear -> sRGB.  Same branch-free clamp() select as ATo709*, with the sRGB
 // constants: knee 0.0031308, slope 12.92, exponent 1/2.4, scale/bias {1.055,-0.055}.
 AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
  return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );}
 AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
 AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
//------------------------------------------------------------------------------------------------------------------------------
 // Linear -> gamma 2.0 ("fastest conversion" per the notes above: one sqrt).
 AF1 AToTwoF1(AF1 c){return sqrt(c);}
 AF2 AToTwoF2(AF2 c){return sqrt(c);}
 AF3 AToTwoF3(AF3 c){return sqrt(c);}
//------------------------------------------------------------------------------------------------------------------------------
 // Linear -> gamma 3.0.
 AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));}
 AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));}
 AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));}
 #endif
2200//==============================================================================================================================
 #if 1
 // Unfortunately median won't work here.
 // Rec.709 -> linear.  AZolSigned*(c-j.x) yields 1.0 below the encoded knee
 // j.x and 0.0 above it; AZolSel* then picks the linear segment (c/4.5) or the
 // inverse power segment -- branch free, matching the header notes above.
 AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
  return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
 AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
  return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
 AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
  return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
//------------------------------------------------------------------------------------------------------------------------------
 // Gamma -> linear.  'x' is the display gamma itself (reciprocal of AToGamma's rcpX).
 AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));}
 AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));}
 AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));}
//------------------------------------------------------------------------------------------------------------------------------
 // PQ -> linear.  ASat* clamps the numerator at zero so inputs below the PQ
 // black offset cannot produce a negative base for pow().
 // NOTE(review): the AF2/AF3 overloads are named AFromPqF1 -- same upstream
 // naming typo as AToPqF1 above; overload resolution keeps callers working.
 AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
  return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
 AF2 AFromPqF1(AF2 x){AF2 p=pow(x,AF2_(0.0126833));
  return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));}
 AF3 AFromPqF1(AF3 x){AF3 p=pow(x,AF3_(0.0126833));
  return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));}
//------------------------------------------------------------------------------------------------------------------------------
 // Unfortunately median won't work here.
 // sRGB -> linear.  Same zero-or-one select structure as AFrom709*, with the
 // sRGB constants (encoded knee 0.04045, slope 1/12.92, exponent 2.4).
 AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
  return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
 AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
  return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
 AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
  return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
//------------------------------------------------------------------------------------------------------------------------------
 // Gamma 2.0 -> linear (square).
 AF1 AFromTwoF1(AF1 c){return c*c;}
 AF2 AFromTwoF2(AF2 c){return c*c;}
 AF3 AFromTwoF3(AF3 c){return c*c;}
//------------------------------------------------------------------------------------------------------------------------------
 // Gamma 3.0 -> linear (cube).
 AF1 AFromThreeF1(AF1 c){return c*c*c;}
 AF2 AFromThreeF2(AF2 c){return c*c*c;}
 AF3 AFromThreeF3(AF3 c){return c*c*c;}
 #endif
2237//==============================================================================================================================
 #ifdef A_HALF
 // Packed FP16 versions of the linear->709 transfer (same constants and
 // branch-free clamp() select as the float ATo709F* versions above).
 AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
  return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );}
 AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
 AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
//------------------------------------------------------------------------------------------------------------------------------
 // FP16 linear -> gamma; 'rcpX' is '1/x' as in AToGammaF*.
 AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));}
 AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));}
 AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));}
//------------------------------------------------------------------------------------------------------------------------------
 // FP16 linear -> sRGB.  (No FP16 PQ versions exist -- see the "PACKED
 // VERSIONS" notes above: FP16 lacks the precision for PQ.)
 AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
  return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );}
 AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
 AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
//------------------------------------------------------------------------------------------------------------------------------
 // FP16 linear -> gamma 2.0.
 AH1 AToTwoH1(AH1 c){return sqrt(c);}
 AH2 AToTwoH2(AH2 c){return sqrt(c);}
 AH3 AToTwoH3(AH3 c){return sqrt(c);}
//------------------------------------------------------------------------------------------------------------------------------
 // FP16 linear -> gamma 3.0.
 // NOTE(review): these take/return AH types but carry the F1/F2/F3 suffix --
 // presumably intended to be AToThreeH1/H2/H3.  This matches the upstream AMD
 // header; renaming would break any caller using the current spelling, so the
 // typo is kept and only flagged here.
 AH1 AToThreeF1(AH1 c){return pow(c,AH1_(1.0/3.0));}
 AH2 AToThreeF2(AH2 c){return pow(c,AH2_(1.0/3.0));}
 AH3 AToThreeF3(AH3 c){return pow(c,AH3_(1.0/3.0));}
 #endif
2265//==============================================================================================================================
 #ifdef A_HALF
 // Packed FP16 versions of the 709->linear transfer (same zero-or-one select
 // structure as the float AFrom709F* versions above).
 AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
  return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
 AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
  return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
 AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
  return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
//------------------------------------------------------------------------------------------------------------------------------
 // FP16 gamma -> linear; 'x' is the display gamma itself.
 AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));}
 AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));}
 AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));}
//------------------------------------------------------------------------------------------------------------------------------
 // FP16 sRGB -> linear.
 // NOTE(review): "AHromSrgbF*" is an upstream AMD typo -- presumably intended
 // AFromSrgbH1/H2/H3 (note both the 'AHrom' and the F-for-H suffix).  Kept
 // byte-identical: renaming would break callers of the current spelling and
 // diverge from the vendored upstream header.
 AH1 AHromSrgbF1(AH1 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
  return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
 AH2 AHromSrgbF2(AH2 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
  return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
 AH3 AHromSrgbF3(AH3 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
  return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
//------------------------------------------------------------------------------------------------------------------------------
 // FP16 gamma 2.0 -> linear (square).
 AH1 AFromTwoH1(AH1 c){return c*c;}
 AH2 AFromTwoH2(AH2 c){return c*c;}
 AH3 AFromTwoH3(AH3 c){return c*c;}
//------------------------------------------------------------------------------------------------------------------------------
 // FP16 gamma 3.0 -> linear (cube).
 AH1 AFromThreeH1(AH1 c){return c*c*c;}
 AH2 AFromThreeH2(AH2 c){return c*c*c;}
 AH3 AFromThreeH3(AH3 c){return c*c*c;}
 #endif
2293////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2294////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2295//_____________________________________________________________/\_______________________________________________________________
2296//==============================================================================================================================
2297// CS REMAP
2298//==============================================================================================================================
 // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear.
 //  543210
 //  ======
 //  ..xxx.
 //  yy...y
 // Maps a 1D lane index 'a' (0..63) to a 2D 8x8 coordinate using the bit
 // layout drawn above (ABfe: bit-field extract; ABfiM: masked bit-field
 // insert -- both defined earlier in this header).
 AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));}
//==============================================================================================================================
 // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions.
 //  543210
 //  ======
 //  .xx..x
 //  y..yy.
 // Details,
 //  LANE TO 8x8 MAPPING
 //  ===================
 //  00 01 08 09 10 11 18 19
 //  02 03 0a 0b 12 13 1a 1b
 //  04 05 0c 0d 14 15 1c 1d
 //  06 07 0e 0f 16 17 1e 1f
 //  20 21 28 29 30 31 38 39
 //  22 23 2a 2b 32 33 3a 3b
 //  24 25 2c 2d 34 35 3c 3d
 //  26 27 2e 2f 36 37 3e 3f
 AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));}
//==============================================================================================================================
 #ifdef A_HALF
 // 16-bit (AW2) variants of the two remaps above; same bit manipulation.
 AW2 ARmp8x8H(AU1 a){return AW2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));}
 AW2 ARmpRed8x8H(AU1 a){return AW2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u))
;}
 #endif
2328#endif
2329////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2330////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2331////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2332////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2333//_____________________________________________________________/\_______________________________________________________________
2334//==============================================================================================================================
2335//
2336// REFERENCE
2337//
2338//------------------------------------------------------------------------------------------------------------------------------
2339// IEEE FLOAT RULES
2340// ================
2341// - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1
2342// - {+/-}0 * {+/-}INF = NaN
2343// - -INF + (+INF) = NaN
2344// - {+/-}0 / {+/-}0 = NaN
2345// - {+/-}INF / {+/-}INF = NaN
2346// - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN)
2347// - 0 == -0
2348// - 4/0 = +INF
2349// - 4/-0 = -INF
2350// - 4+INF = +INF
2351// - 4-INF = -INF
2352// - 4*(+INF) = +INF
2353// - 4*(-INF) = -INF
2354// - -4*(+INF) = -INF
2355// - sqrt(+INF) = +INF
2356//------------------------------------------------------------------------------------------------------------------------------
2357// FP16 ENCODING
2358// =============
2359// fedcba9876543210
2360// ----------------
2361// ......mmmmmmmmmm 10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals)
2362// .eeeee.......... 5-bit exponent
2363// .00000.......... denormals
2364// .00001.......... -14 exponent
2365// .11110.......... 15 exponent
2366// .111110000000000 infinity
2367// .11111nnnnnnnnnn NaN with n!=0
2368// s............... sign
2369//------------------------------------------------------------------------------------------------------------------------------
2370// FP16/INT16 ALIASING DENORMAL
2371// ============================
2372// 11-bit unsigned integers alias with half float denormal/normal values,
2373// 1 = 2^(-24) = 1/16777216 ....................... first denormal value
2374// 2 = 2^(-23)
2375// ...
2376// 1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value
2377// 1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers
2378// 2047 .............................................. last normal value that still maps to integers
2379// Scaling limits,
2380// 2^15 = 32768 ...................................... largest power of 2 scaling
2381// Largest pow2 conversion mapping is at *32768,
2382// 1 : 2^(-9) = 1/512
2383// 2 : 1/256
2384// 4 : 1/128
2385// 8 : 1/64
2386// 16 : 1/32
2387// 32 : 1/16
2388// 64 : 1/8
2389// 128 : 1/4
2390// 256 : 1/2
2391// 512 : 1
2392// 1024 : 2
2393// 2047 : a little less than 4
2394//==============================================================================================================================
2395////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2396////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2397////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2398////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2399////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2400////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2401//_____________________________________________________________/\_______________________________________________________________
2402//==============================================================================================================================
2403//
2404//
2405// GPU/CPU PORTABILITY
2406//
2407//
2408//------------------------------------------------------------------------------------------------------------------------------
2409// This is the GPU implementation.
2410// See the CPU implementation for docs.
2411//==============================================================================================================================
#ifdef A_GPU
 // GPU side of the GPU/CPU portability layer (the CPU side carries the full
 // docs, per the section header above).  A_STATIC intentionally expands to
 // nothing on the GPU path.
 #define A_TRUE true
 #define A_FALSE false
 #define A_STATIC
2416////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2417////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2418//_____________________________________________________________/\_______________________________________________________________
2419//==============================================================================================================================
2420// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
2421//==============================================================================================================================
 // ret* : vector return-type aliases -- on the GPU path these are simply the
 // vector types themselves.
 #define retAD2 AD2
 #define retAD3 AD3
 #define retAD4 AD4
 #define retAF2 AF2
 #define retAF3 AF3
 #define retAF4 AF4
 #define retAL2 AL2
 #define retAL3 AL3
 #define retAL4 AL4
 #define retAU2 AU2
 #define retAU3 AU3
 #define retAU4 AU4
//------------------------------------------------------------------------------------------------------------------------------
 // in* : input parameter qualifiers ('in' on the GPU path).
 #define inAD2 in AD2
 #define inAD3 in AD3
 #define inAD4 in AD4
 #define inAF2 in AF2
 #define inAF3 in AF3
 #define inAF4 in AF4
 #define inAL2 in AL2
 #define inAL3 in AL3
 #define inAL4 in AL4
 #define inAU2 in AU2
 #define inAU3 in AU3
 #define inAU4 in AU4
//------------------------------------------------------------------------------------------------------------------------------
 // inout* : read-write parameter qualifiers.
 #define inoutAD2 inout AD2
 #define inoutAD3 inout AD3
 #define inoutAD4 inout AD4
 #define inoutAF2 inout AF2
 #define inoutAF3 inout AF3
 #define inoutAF4 inout AF4
 #define inoutAL2 inout AL2
 #define inoutAL3 inout AL3
 #define inoutAL4 inout AL4
 #define inoutAU2 inout AU2
 #define inoutAU3 inout AU3
 #define inoutAU4 inout AU4
//------------------------------------------------------------------------------------------------------------------------------
 // out* : output parameter qualifiers.
 #define outAD2 out AD2
 #define outAD3 out AD3
 #define outAD4 out AD4
 #define outAF2 out AF2
 #define outAF3 out AF3
 #define outAF4 out AF4
 #define outAL2 out AL2
 #define outAL3 out AL3
 #define outAL4 out AL4
 #define outAU2 out AU2
 #define outAU3 out AU3
 #define outAU4 out AU4
//------------------------------------------------------------------------------------------------------------------------------
 // var* : local variable declarations.
 #define varAD2(x) AD2 x
 #define varAD3(x) AD3 x
 #define varAD4(x) AD4 x
 #define varAF2(x) AF2 x
 #define varAF3(x) AF3 x
 #define varAF4(x) AF4 x
 #define varAL2(x) AL2 x
 #define varAL3(x) AL3 x
 #define varAL4(x) AL4 x
 #define varAU2(x) AU2 x
 #define varAU3(x) AU3 x
 #define varAU4(x) AU4 x
//------------------------------------------------------------------------------------------------------------------------------
 // init* : vector construction -- constructor syntax on the GPU path.
 #define initAD2(x,y) AD2(x,y)
 #define initAD3(x,y,z) AD3(x,y,z)
 #define initAD4(x,y,z,w) AD4(x,y,z,w)
 #define initAF2(x,y) AF2(x,y)
 #define initAF3(x,y,z) AF3(x,y,z)
 #define initAF4(x,y,z,w) AF4(x,y,z,w)
 #define initAL2(x,y) AL2(x,y)
 #define initAL3(x,y,z) AL3(x,y,z)
 #define initAL4(x,y,z,w) AL4(x,y,z,w)
 #define initAU2(x,y) AU2(x,y)
 #define initAU3(x,y,z) AU3(x,y,z)
 #define initAU4(x,y,z,w) AU4(x,y,z,w)
2499////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2500////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2501//_____________________________________________________________/\_______________________________________________________________
2502//==============================================================================================================================
2503// SCALAR RETURN OPS
2504//==============================================================================================================================
 // Scalar ops spelled as macros so the same code compiles on both CPU and GPU
 // paths.  Most wrap their argument in a constructor cast to the named
 // precision (D=double, F=float); min/max (and the L/U integer forms) forward
 // the arguments directly.
 #define AAbsD1(a) abs(AD1(a))
 #define AAbsF1(a) abs(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ACosD1(a) cos(AD1(a))
 #define ACosF1(a) cos(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ADotD2(a,b) dot(AD2(a),AD2(b))
 #define ADotD3(a,b) dot(AD3(a),AD3(b))
 #define ADotD4(a,b) dot(AD4(a),AD4(b))
 #define ADotF2(a,b) dot(AF2(a),AF2(b))
 #define ADotF3(a,b) dot(AF3(a),AF3(b))
 #define ADotF4(a,b) dot(AF4(a),AF4(b))
//------------------------------------------------------------------------------------------------------------------------------
 #define AExp2D1(a) exp2(AD1(a))
 #define AExp2F1(a) exp2(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define AFloorD1(a) floor(AD1(a))
 #define AFloorF1(a) floor(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ALog2D1(a) log2(AD1(a))
 #define ALog2F1(a) log2(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define AMaxD1(a,b) max(a,b)
 #define AMaxF1(a,b) max(a,b)
 #define AMaxL1(a,b) max(a,b)
 #define AMaxU1(a,b) max(a,b)
//------------------------------------------------------------------------------------------------------------------------------
 #define AMinD1(a,b) min(a,b)
 #define AMinF1(a,b) min(a,b)
 #define AMinL1(a,b) min(a,b)
 #define AMinU1(a,b) min(a,b)
//------------------------------------------------------------------------------------------------------------------------------
 #define ASinD1(a) sin(AD1(a))
 #define ASinF1(a) sin(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ASqrtD1(a) sqrt(AD1(a))
 #define ASqrtF1(a) sqrt(AF1(a))
2542////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2543////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2544//_____________________________________________________________/\_______________________________________________________________
2545//==============================================================================================================================
2546// SCALAR RETURN OPS - DEPENDENT
2547//==============================================================================================================================
 // NOTE(review): APowD1 casts the exponent with AF1 (float) rather than AD1
 // (double).  This matches the upstream AMD header, but looks like a possible
 // typo -- the double-precision path would silently truncate the exponent.
 // Confirm against upstream before changing; kept byte-identical here.
 #define APowD1(a,b) pow(AD1(a),AF1(b))
 #define APowF1(a,b) pow(AF1(a),AF1(b))
2550////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2551////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2552//_____________________________________________________________/\_______________________________________________________________
2553//==============================================================================================================================
2554// VECTOR OPS
2555//------------------------------------------------------------------------------------------------------------------------------
2556// These are added as needed for production or prototyping, so not necessarily a complete set.
2557// They follow a convention of taking in a destination and also returning the destination value to increase utility.
2558//==============================================================================================================================
 #ifdef A_DUBL
 // Double-precision vector ops.  Convention (per the section header above):
 // each op writes its result into destination 'd' AND returns it, so calls
 // can be nested or used as statements interchangeably.
 AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;}
 AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;}
 AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;}
 AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;}
 AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;}
//------------------------------------------------------------------------------------------------------------------------------
 // *One variants broadcast a scalar 'b' (or 'c') across the vector.
 AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d=a+AD2_(b);return d;}
 AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d=a+AD3_(b);return d;}
 AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d=a+AD4_(b);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;}
 AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;}
 AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;}
 AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;}
 AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;}
 AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;}
 AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;}
 AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;}
 AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;}
 AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;}
 AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;}
 AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;}
 AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;}
 AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;}
 AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;}
 AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;}
 AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;}
 AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;}
 AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;}
 #endif
2608//==============================================================================================================================
 // Single-precision vector ops; same destination-and-return convention as the
 // A_DUBL block above.  The *One variants broadcast a scalar across the vector.
 AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;}
 AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;}
 AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;}
 AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;}
 AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d=a+AF2_(b);return d;}
 AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d=a+AF3_(b);return d;}
 AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d=a+AF4_(b);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;}
 AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;}
 AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;}
 AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;}
 AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;}
 AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;}
 AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;}
 AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;}
 AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;}
 AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;}
 AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;}
 AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;}
 AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;}
 AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;}
 AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;}
 AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;}
 AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;}
//------------------------------------------------------------------------------------------------------------------------------
 AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;}
 AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;}
 AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;}
2656#endif
diff --git a/externals/FidelityFX-FSR/ffx-fsr/ffx_fsr1.h b/externals/FidelityFX-FSR/ffx-fsr/ffx_fsr1.h
new file mode 100644
index 000000000..15ecfde5c
--- /dev/null
+++ b/externals/FidelityFX-FSR/ffx-fsr/ffx_fsr1.h
@@ -0,0 +1,1199 @@
1//_____________________________________________________________/\_______________________________________________________________
2//==============================================================================================================================
3//
4//
5// AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629
6//
7//
8//------------------------------------------------------------------------------------------------------------------------------
9////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
10////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
11//------------------------------------------------------------------------------------------------------------------------------
12// FidelityFX Super Resolution Sample
13//
14// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
15// Permission is hereby granted, free of charge, to any person obtaining a copy
16// of this software and associated documentation files(the "Software"), to deal
17// in the Software without restriction, including without limitation the rights
18// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
19// copies of the Software, and to permit persons to whom the Software is
20// furnished to do so, subject to the following conditions :
21// The above copyright notice and this permission notice shall be included in
22// all copies or substantial portions of the Software.
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
26// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29// THE SOFTWARE.
30//------------------------------------------------------------------------------------------------------------------------------
31////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
32////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
33//------------------------------------------------------------------------------------------------------------------------------
34// ABOUT
35// =====
36// FSR is a collection of algorithms relating to generating a higher resolution image.
37// This specific header focuses on single-image non-temporal image scaling, and related tools.
38//
39// The core functions are EASU and RCAS:
40// [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter.
41// [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS.
42// RCAS needs to be applied after EASU as a separate pass.
43//
44// Optional utility functions are:
45// [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling.
46// [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back.
47// [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
48// See each individual sub-section for inline documentation.
49//------------------------------------------------------------------------------------------------------------------------------
50////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
51////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
52//------------------------------------------------------------------------------------------------------------------------------
53// FUNCTION PERMUTATIONS
54// =====================
55// *F() ..... Single item computation with 32-bit.
56// *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible.
57// *Hx2() ... Processing two items in parallel with 16-bit, easier packing.
58// Not all interfaces in this file have a *Hx2() form.
59//==============================================================================================================================
60////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
61////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
62////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
63////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
64//_____________________________________________________________/\_______________________________________________________________
65//==============================================================================================================================
66//
67// FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING
68//
69//------------------------------------------------------------------------------------------------------------------------------
70// EASU provides a high quality spatial-only scaling at relatively low cost.
71// Meaning EASU is appropriate for laptops and other low-end GPUs.
72// Quality from 1x to 4x area scaling is good.
73//------------------------------------------------------------------------------------------------------------------------------
74// The scalar uses a modified fast approximation to the standard lanczos(size=2) kernel.
75// EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos.
76// This is also kept as simple as possible to have minimum runtime.
77//------------------------------------------------------------------------------------------------------------------------------
78// The lanczos filter has negative lobes, so by itself it will introduce ringing.
79// To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood,
80// and limits output to the minimum and maximum of that neighborhood.
81//------------------------------------------------------------------------------------------------------------------------------
82// Input image requirements:
83//
84// Color needs to be encoded as 3 channel[red, green, blue](e.g.XYZ not supported)
85// Each channel needs to be in the range[0, 1]
86// Any color primaries are supported
87// Display / tonemapping curve needs to be as if presenting to sRGB display or similar(e.g.Gamma 2.0)
88// There should be no banding in the input
89// There should be no high amplitude noise in the input
90// There should be no noise in the input that is not at input pixel granularity
91// For performance purposes, use 32bpp formats
92//------------------------------------------------------------------------------------------------------------------------------
93// Best to apply EASU at the end of the frame after tonemapping
94// but before film grain or composite of the UI.
95//------------------------------------------------------------------------------------------------------------------------------
96// Example of including this header for D3D HLSL :
97//
98// #define A_GPU 1
99// #define A_HLSL 1
100// #define A_HALF 1
101// #include "ffx_a.h"
102// #define FSR_EASU_H 1
103// #define FSR_RCAS_H 1
104// //declare input callbacks
105// #include "ffx_fsr1.h"
106//
107// Example of including this header for Vulkan GLSL :
108//
109// #define A_GPU 1
110// #define A_GLSL 1
111// #define A_HALF 1
112// #include "ffx_a.h"
113// #define FSR_EASU_H 1
114// #define FSR_RCAS_H 1
115// //declare input callbacks
116// #include "ffx_fsr1.h"
117//
118// Example of including this header for Vulkan HLSL :
119//
120// #define A_GPU 1
121// #define A_HLSL 1
122// #define A_HLSL_6_2 1
123// #define A_NO_16_BIT_CAST 1
124// #define A_HALF 1
125// #include "ffx_a.h"
126// #define FSR_EASU_H 1
127// #define FSR_RCAS_H 1
128// //declare input callbacks
129// #include "ffx_fsr1.h"
130//
131// Example of declaring the required input callbacks for GLSL :
132// The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'.
133// EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion.
134//
135// AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));}
136// AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));}
137// AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));}
138// ...
139// The FsrEasuCon function needs to be called from the CPU or GPU to set up constants.
140// The difference in viewport and input image size is there to support Dynamic Resolution Scaling.
141// To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1.
142// Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer.
143// AU4 con0,con1,con2,con3;
144// FsrEasuCon(con0,con1,con2,con3,
145// 1920.0,1080.0, // Viewport size (top left aligned) in the input image which is to be scaled.
146// 3840.0,2160.0, // The size of the input image.
147// 2560.0,1440.0); // The output resolution.
148//==============================================================================================================================
149////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
150////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
151//_____________________________________________________________/\_______________________________________________________________
152//==============================================================================================================================
153// CONSTANT SETUP
154//==============================================================================================================================
155// Call to setup required constant values (works on CPU or GPU).
// Builds the four constant vectors consumed by FsrEasuF/FsrEasuH.
// con0 = output-pixel -> input-viewport mapping (xy scale, zw offset);
// con1..con3 = gather4 texel offsets in normalized input image space.
// Values are stored through AU1_AF1 into the AU4 outputs (presumably a
// float->uint bit cast defined in ffx_a.h — verify there), so they can be
// carried in an integer constant buffer.
156A_STATIC void FsrEasuCon(
157outAU4 con0,
158outAU4 con1,
159outAU4 con2,
160outAU4 con3,
161// This is the rendered image resolution being upscaled
162AF1 inputViewportInPixelsX,
163AF1 inputViewportInPixelsY,
164// This is the resolution of the resource containing the input image (useful for dynamic resolution)
165AF1 inputSizeInPixelsX,
166AF1 inputSizeInPixelsY,
167// This is the display resolution which the input image gets upscaled to
168AF1 outputSizeInPixelsX,
169AF1 outputSizeInPixelsY){
170 // Output integer position to a pixel position in viewport.
171 con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX));
172 con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY));
173 con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5));
174 con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5));
175 // Viewport pixel position to normalized image space.
176 // This is used to get upper-left of 'F' tap.
177 con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX));
178 con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY));
179 // Centers of gather4, first offset from upper-left of 'F'.
180 // +---+---+
181 // | | |
182 // +--(0)--+
183 // | b | c |
184 // +---F---+---+---+
185 // | e | f | g | h |
186 // +--(1)--+--(2)--+
187 // | i | j | k | l |
188 // +---+---+---+---+
189 // | n | o |
190 // +--(3)--+
191 // | | |
192 // +---+---+
193 con1[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX));
194 con1[3]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsY));
195 // These are from (0) instead of 'F'.
196 con2[0]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsX));
197 con2[1]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY));
198 con2[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX));
199 con2[3]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY));
200 con3[0]=AU1_AF1(AF1_( 0.0)*ARcpF1(inputSizeInPixelsX));
201 con3[1]=AU1_AF1(AF1_( 4.0)*ARcpF1(inputSizeInPixelsY));
202 // con3.zw are unused padding.
202 con3[2]=con3[3]=0;}
203
204// If there is an offset into the input image resource, use this variant instead of FsrEasuCon().
// Same as FsrEasuCon(), but additionally accounts for the input image being
// placed at an offset inside its containing resource (dynamic resolution).
// Only con0.zw (the output->input mapping offset) needs to change, so it is
// recomputed below with the pixel offset folded in.
205A_STATIC void FsrEasuConOffset(
206    outAU4 con0,
207    outAU4 con1,
208    outAU4 con2,
209    outAU4 con3,
210    // This is the rendered image resolution being upscaled
211    AF1 inputViewportInPixelsX,
212    AF1 inputViewportInPixelsY,
213    // This is the resolution of the resource containing the input image (useful for dynamic resolution)
214    AF1 inputSizeInPixelsX,
215    AF1 inputSizeInPixelsY,
216    // This is the display resolution which the input image gets upscaled to
217    AF1 outputSizeInPixelsX,
218    AF1 outputSizeInPixelsY,
219    // This is the input image offset into the resource containing it (useful for dynamic resolution)
220    AF1 inputOffsetInPixelsX,
221    AF1 inputOffsetInPixelsY) {
    // Delegate to the base setup, then overwrite con0.zw with the offset applied.
222    FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY);
223    con0[2] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5) + inputOffsetInPixelsX);
224    con0[3] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5) + inputOffsetInPixelsY);
225}
226////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
227////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
228//_____________________________________________________________/\_______________________________________________________________
229//==============================================================================================================================
230// NON-PACKED 32-BIT VERSION
231//==============================================================================================================================
232#if defined(A_GPU)&&defined(FSR_EASU_F)
233 // Input callback prototypes, need to be implemented by calling shader
234 AF4 FsrEasuRF(AF2 p);
235 AF4 FsrEasuGF(AF2 p);
236 AF4 FsrEasuBF(AF2 p);
237//------------------------------------------------------------------------------------------------------------------------------
238 // Filtering for a given tap for the scalar.
// Accumulates one kernel tap: rotates/scales the tap offset into the local
// edge frame, evaluates the approximate lanczos2 weight at distance^2, and
// adds the weighted color and weight into the running sums.
239 void FsrEasuTapF(
240 inout AF3 aC, // Accumulated color, with negative lobe.
241 inout AF1 aW, // Accumulated weight.
242 AF2 off, // Pixel offset from resolve position to tap.
243 AF2 dir, // Gradient direction.
244 AF2 len, // Length.
245 AF1 lob, // Negative lobe strength.
246 AF1 clp, // Clipping point.
247 AF3 c){ // Tap color.
248 // Rotate offset by direction.
249 AF2 v;
250 v.x=(off.x*( dir.x))+(off.y*dir.y);
251 v.y=(off.x*(-dir.y))+(off.y*dir.x);
252 // Anisotropy.
253 v*=len;
254 // Compute distance^2.
255 AF1 d2=v.x*v.x+v.y*v.y;
256 // Limit to the window as at corner, 2 taps can easily be outside.
257 d2=min(d2,clp);
258 // Approximation of lanczos2 without sin() or rcp(), or sqrt() to get x.
259 // (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2
260 // |_______________________________________| |_______________|
261 // base window
262 // The general form of the 'base' is,
263 // (a*(b*x^2-1)^2-(a-1))
264 // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe.
265 AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0);
266 AF1 wA=lob*d2+AF1_(-1.0);
267 wB*=wB;
268 wA*=wA;
269 wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0));
270 AF1 w=wB*wA;
271 // Do weighted average.
272 aC+=c*w;aW+=w;}
273//------------------------------------------------------------------------------------------------------------------------------
274 // Accumulate direction and length.
// Accumulates the local edge direction (from the '+' pattern of luma diffs)
// and gradient-length terms for one of the 4 nearest texels, weighted by the
// bilinear factor selected by exactly one of biS/biT/biU/biV.
275 void FsrEasuSetF(
276 inout AF2 dir,
277 inout AF1 len,
278 AF2 pp,
279 AP1 biS,AP1 biT,AP1 biU,AP1 biV,
280 AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){
281 // Compute bilinear weight, branches factor out as predicates are compile time immediates.
282 // s t
283 // u v
284 AF1 w = AF1_(0.0);
285 if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y);
286 if(biT)w= pp.x *(AF1_(1.0)-pp.y);
287 if(biU)w=(AF1_(1.0)-pp.x)* pp.y ;
288 if(biV)w= pp.x * pp.y ;
289 // Direction is the '+' diff.
290 // a
291 // b c d
292 // e
293 // Then takes magnitude from abs average of both sides of 'c'.
294 // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms.
295 AF1 dc=lD-lC;
296 AF1 cb=lC-lB;
297 AF1 lenX=max(abs(dc),abs(cb));
298 lenX=APrxLoRcpF1(lenX);
299 AF1 dirX=lD-lB;
300 dir.x+=dirX*w;
301 lenX=ASatF1(abs(dirX)*lenX);
302 lenX*=lenX;
303 len+=lenX*w;
304 // Repeat for the y axis.
305 AF1 ec=lE-lC;
306 AF1 ca=lC-lA;
307 AF1 lenY=max(abs(ec),abs(ca));
308 lenY=APrxLoRcpF1(lenY);
309 AF1 dirY=lE-lA;
310 dir.y+=dirY*w;
311 lenY=ASatF1(abs(dirY)*lenY);
312 lenY*=lenY;
313 len+=lenY*w;}
314//------------------------------------------------------------------------------------------------------------------------------
// FP32 EASU filter entry point: maps output pixel 'ip' into the source image,
// gathers the 12-tap neighborhood (via the FsrEasuRF/GF/BF callbacks),
// estimates local edge direction/length, evaluates the adaptive anisotropic
// kernel over the 12 taps, then dering-clamps the result to the min/max of
// the 4 nearest texels (f,g,j,k).
315 void FsrEasuF(
316 out AF3 pix,
317 AU2 ip, // Integer pixel position in output.
318 AU4 con0, // Constants generated by FsrEasuCon().
319 AU4 con1,
320 AU4 con2,
321 AU4 con3){
322//------------------------------------------------------------------------------------------------------------------------------
323 // Get position of 'f'.
324 AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw);
325 AF2 fp=floor(pp);
326 pp-=fp;
327//------------------------------------------------------------------------------------------------------------------------------
328 // 12-tap kernel.
329 // b c
330 // e f g h
331 // i j k l
332 // n o
333 // Gather 4 ordering.
334 // a b
335 // r g
336 // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions,
337 // a b <- unused (z)
338 // r g
339 // a b a b
340 // r g r g
341 // a b
342 // r g <- unused (z)
343 // Allowing dead-code removal to remove the 'z's.
344 AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
345 // These are from p0 to avoid pulling two constants on pre-Navi hardware.
346 AF2 p1=p0+AF2_AU2(con2.xy);
347 AF2 p2=p0+AF2_AU2(con2.zw);
348 AF2 p3=p0+AF2_AU2(con3.xy);
349 AF4 bczzR=FsrEasuRF(p0);
350 AF4 bczzG=FsrEasuGF(p0);
351 AF4 bczzB=FsrEasuBF(p0);
352 AF4 ijfeR=FsrEasuRF(p1);
353 AF4 ijfeG=FsrEasuGF(p1);
354 AF4 ijfeB=FsrEasuBF(p1);
355 AF4 klhgR=FsrEasuRF(p2);
356 AF4 klhgG=FsrEasuGF(p2);
357 AF4 klhgB=FsrEasuBF(p2);
358 AF4 zzonR=FsrEasuRF(p3);
359 AF4 zzonG=FsrEasuGF(p3);
360 AF4 zzonB=FsrEasuBF(p3);
361//------------------------------------------------------------------------------------------------------------------------------
362 // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD).
363 AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG);
364 AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG);
365 AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG);
366 AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG);
367 // Rename.
368 AF1 bL=bczzL.x;
369 AF1 cL=bczzL.y;
370 AF1 iL=ijfeL.x;
371 AF1 jL=ijfeL.y;
372 AF1 fL=ijfeL.z;
373 AF1 eL=ijfeL.w;
374 AF1 kL=klhgL.x;
375 AF1 lL=klhgL.y;
376 AF1 hL=klhgL.z;
377 AF1 gL=klhgL.w;
378 AF1 oL=zzonL.z;
379 AF1 nL=zzonL.w;
380 // Accumulate for bilinear interpolation.
381 AF2 dir=AF2_(0.0);
382 AF1 len=AF1_(0.0);
383 FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL);
384 FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL);
385 FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL);
386 FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL);
387//------------------------------------------------------------------------------------------------------------------------------
388 // Normalize with approximation, and cleanup close to zero.
389 AF2 dir2=dir*dir;
390 AF1 dirR=dir2.x+dir2.y;
391 AP1 zro=dirR<AF1_(1.0/32768.0);
392 dirR=APrxLoRsqF1(dirR);
393 dirR=zro?AF1_(1.0):dirR;
394 dir.x=zro?AF1_(1.0):dir.x;
395 dir*=AF2_(dirR);
396 // Transform from {0 to 2} to {0 to 1} range, and shape with square.
397 len=len*AF1_(0.5);
398 len*=len;
399 // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}.
400 AF1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(max(abs(dir.x),abs(dir.y)));
401 // Anisotropic length after rotation,
402 // x := 1.0 lerp to 'stretch' on edges
403 // y := 1.0 lerp to 2x on edges
404 AF2 len2=AF2(AF1_(1.0)+(stretch-AF1_(1.0))*len,AF1_(1.0)+AF1_(-0.5)*len);
405 // Based on the amount of 'edge',
406 // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}.
407 AF1 lob=AF1_(0.5)+AF1_((1.0/4.0-0.04)-0.5)*len;
408 // Set distance^2 clipping point to the end of the adjustable window.
409 AF1 clp=APrxLoRcpF1(lob);
410//------------------------------------------------------------------------------------------------------------------------------
411 // Accumulation mixed with min/max of 4 nearest.
412 // b c
413 // e f g h
414 // i j k l
415 // n o
416 AF3 min4=min(AMin3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
417 AF3(klhgR.x,klhgG.x,klhgB.x));
418 AF3 max4=max(AMax3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
419 AF3(klhgR.x,klhgG.x,klhgB.x));
420 // Accumulation.
421 AF3 aC=AF3_(0.0);
422 AF1 aW=AF1_(0.0);
423 FsrEasuTapF(aC,aW,AF2( 0.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.x,bczzG.x,bczzB.x)); // b
424 FsrEasuTapF(aC,aW,AF2( 1.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.y,bczzG.y,bczzB.y)); // c
425 FsrEasuTapF(aC,aW,AF2(-1.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.x,ijfeG.x,ijfeB.x)); // i
426 FsrEasuTapF(aC,aW,AF2( 0.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.y,ijfeG.y,ijfeB.y)); // j
427 FsrEasuTapF(aC,aW,AF2( 0.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.z,ijfeG.z,ijfeB.z)); // f
428 FsrEasuTapF(aC,aW,AF2(-1.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.w,ijfeG.w,ijfeB.w)); // e
429 FsrEasuTapF(aC,aW,AF2( 1.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.x,klhgG.x,klhgB.x)); // k
430 FsrEasuTapF(aC,aW,AF2( 2.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.y,klhgG.y,klhgB.y)); // l
431 FsrEasuTapF(aC,aW,AF2( 2.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.z,klhgG.z,klhgB.z)); // h
432 FsrEasuTapF(aC,aW,AF2( 1.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.w,klhgG.w,klhgB.w)); // g
433 FsrEasuTapF(aC,aW,AF2( 1.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.z,zzonG.z,zzonB.z)); // o
434 FsrEasuTapF(aC,aW,AF2( 0.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.w,zzonG.w,zzonB.w)); // n
435//------------------------------------------------------------------------------------------------------------------------------
436 // Normalize and dering.
437 pix=min(max4,max(min4,aC*AF3_(ARcpF1(aW))));}
438#endif
439////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
440////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
441//_____________________________________________________________/\_______________________________________________________________
442//==============================================================================================================================
443// PACKED 16-BIT VERSION
444//==============================================================================================================================
445#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_EASU_H)
446// Input callback prototypes, need to be implemented by calling shader
447 AH4 FsrEasuRH(AF2 p);
448 AH4 FsrEasuGH(AF2 p);
449 AH4 FsrEasuBH(AF2 p);
450//------------------------------------------------------------------------------------------------------------------------------
451 // This runs 2 taps in parallel.
// Packed-FP16 variant of FsrEasuTapF: evaluates two taps at once, one per
// lane of each AH2. Same math as the FP32 path — rotate the offsets into the
// edge frame, apply anisotropy, clamp distance^2 to the window, then apply
// the approximate lanczos2 weight and accumulate.
452 void FsrEasuTapH(
453 inout AH2 aCR,inout AH2 aCG,inout AH2 aCB,
454 inout AH2 aW,
455 AH2 offX,AH2 offY,
456 AH2 dir,
457 AH2 len,
458 AH1 lob,
459 AH1 clp,
460 AH2 cR,AH2 cG,AH2 cB){
 // Rotate offsets by direction (both taps in parallel).
461 AH2 vX,vY;
462 vX=offX* dir.xx +offY*dir.yy;
463 vY=offX*(-dir.yy)+offY*dir.xx;
 // Anisotropy, then distance^2 limited to the window.
464 vX*=len.x;vY*=len.y;
465 AH2 d2=vX*vX+vY*vY;
466 d2=min(d2,AH2_(clp));
 // Approximate lanczos2 weight (same polynomial as FsrEasuTapF).
467 AH2 wB=AH2_(2.0/5.0)*d2+AH2_(-1.0);
468 AH2 wA=AH2_(lob)*d2+AH2_(-1.0);
469 wB*=wB;
470 wA*=wA;
471 wB=AH2_(25.0/16.0)*wB+AH2_(-(25.0/16.0-1.0));
472 AH2 w=wB*wA;
 // Weighted accumulation of color and weight.
473 aCR+=cR*w;aCG+=cG*w;aCB+=cB*w;aW+=w;}
474//------------------------------------------------------------------------------------------------------------------------------
475 // This runs 2 taps in parallel.
// Packed-FP16 variant of FsrEasuSetF: accumulates edge direction and
// gradient-length terms for two texels at once (one per AH2 lane). biST
// selects the top pair and biUV the bottom pair of the 2x2 neighborhood,
// with the bilinear weights built as a packed pair.
476 void FsrEasuSetH(
477 inout AH2 dirPX,inout AH2 dirPY,
478 inout AH2 lenP,
479 AH2 pp,
480 AP1 biST,AP1 biUV,
481 AH2 lA,AH2 lB,AH2 lC,AH2 lD,AH2 lE){
482 AH2 w = AH2_(0.0);
483 if(biST)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(AH1_(1.0)-pp.y);
484 if(biUV)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_( pp.y);
485 // ABS is not free in the packed FP16 path.
 // Horizontal direction/length terms (mirrors FsrEasuSetF).
486 AH2 dc=lD-lC;
487 AH2 cb=lC-lB;
488 AH2 lenX=max(abs(dc),abs(cb));
489 lenX=ARcpH2(lenX);
490 AH2 dirX=lD-lB;
491 dirPX+=dirX*w;
492 lenX=ASatH2(abs(dirX)*lenX);
493 lenX*=lenX;
494 lenP+=lenX*w;
 // Vertical direction/length terms.
495 AH2 ec=lE-lC;
496 AH2 ca=lC-lA;
497 AH2 lenY=max(abs(ec),abs(ca));
498 lenY=ARcpH2(lenY);
499 AH2 dirY=lE-lA;
500 dirPY+=dirY*w;
501 lenY=ASatH2(abs(dirY)*lenY);
502 lenY*=lenY;
503 lenP+=lenY*w;}
504//------------------------------------------------------------------------------------------------------------------------------
// Packed-FP16 EASU entry point: same algorithm as FsrEasuF but tap pairs are
// processed in parallel with AH2 math, and the dering min/max bounds are
// computed together via the (-x,x) packed max trick noted below.
505 void FsrEasuH(
506 out AH3 pix,
507 AU2 ip,
508 AU4 con0,
509 AU4 con1,
510 AU4 con2,
511 AU4 con3){
512//------------------------------------------------------------------------------------------------------------------------------
 // Position of 'f' (FP32), with the fractional part also kept in FP16 as 'ppp'.
513 AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw);
514 AF2 fp=floor(pp);
515 pp-=fp;
516 AH2 ppp=AH2(pp);
517//------------------------------------------------------------------------------------------------------------------------------
 // Gather the 12-tap neighborhood (same layout as FsrEasuF).
518 AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
519 AF2 p1=p0+AF2_AU2(con2.xy);
520 AF2 p2=p0+AF2_AU2(con2.zw);
521 AF2 p3=p0+AF2_AU2(con3.xy);
522 AH4 bczzR=FsrEasuRH(p0);
523 AH4 bczzG=FsrEasuGH(p0);
524 AH4 bczzB=FsrEasuBH(p0);
525 AH4 ijfeR=FsrEasuRH(p1);
526 AH4 ijfeG=FsrEasuGH(p1);
527 AH4 ijfeB=FsrEasuBH(p1);
528 AH4 klhgR=FsrEasuRH(p2);
529 AH4 klhgG=FsrEasuGH(p2);
530 AH4 klhgB=FsrEasuBH(p2);
531 AH4 zzonR=FsrEasuRH(p3);
532 AH4 zzonG=FsrEasuGH(p3);
533 AH4 zzonB=FsrEasuBH(p3);
534//------------------------------------------------------------------------------------------------------------------------------
 // Approximate luma (times 2) per tap, then rename to kernel positions.
535 AH4 bczzL=bczzB*AH4_(0.5)+(bczzR*AH4_(0.5)+bczzG);
536 AH4 ijfeL=ijfeB*AH4_(0.5)+(ijfeR*AH4_(0.5)+ijfeG);
537 AH4 klhgL=klhgB*AH4_(0.5)+(klhgR*AH4_(0.5)+klhgG);
538 AH4 zzonL=zzonB*AH4_(0.5)+(zzonR*AH4_(0.5)+zzonG);
539 AH1 bL=bczzL.x;
540 AH1 cL=bczzL.y;
541 AH1 iL=ijfeL.x;
542 AH1 jL=ijfeL.y;
543 AH1 fL=ijfeL.z;
544 AH1 eL=ijfeL.w;
545 AH1 kL=klhgL.x;
546 AH1 lL=klhgL.y;
547 AH1 hL=klhgL.z;
548 AH1 gL=klhgL.w;
549 AH1 oL=zzonL.z;
550 AH1 nL=zzonL.w;
551 // This part is different, accumulating 2 taps in parallel.
552 AH2 dirPX=AH2_(0.0);
553 AH2 dirPY=AH2_(0.0);
554 AH2 lenP=AH2_(0.0);
555 FsrEasuSetH(dirPX,dirPY,lenP,ppp,true, false,AH2(bL,cL),AH2(eL,fL),AH2(fL,gL),AH2(gL,hL),AH2(jL,kL));
556 FsrEasuSetH(dirPX,dirPY,lenP,ppp,false,true ,AH2(fL,gL),AH2(iL,jL),AH2(jL,kL),AH2(kL,lL),AH2(nL,oL));
557 AH2 dir=AH2(dirPX.r+dirPX.g,dirPY.r+dirPY.g);
558 AH1 len=lenP.r+lenP.g;
559//------------------------------------------------------------------------------------------------------------------------------
 // Normalize direction, shape 'len', and derive kernel window (mirrors FsrEasuF).
560 AH2 dir2=dir*dir;
561 AH1 dirR=dir2.x+dir2.y;
562 AP1 zro=dirR<AH1_(1.0/32768.0);
563 dirR=APrxLoRsqH1(dirR);
564 dirR=zro?AH1_(1.0):dirR;
565 dir.x=zro?AH1_(1.0):dir.x;
566 dir*=AH2_(dirR);
567 len=len*AH1_(0.5);
568 len*=len;
569 AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpH1(max(abs(dir.x),abs(dir.y)));
570 AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len);
571 AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len;
572 AH1 clp=APrxLoRcpH1(lob);
573//------------------------------------------------------------------------------------------------------------------------------
574 // FP16 is different, using packed trick to do min and max in same operation.
575 AH2 bothR=max(max(AH2(-ijfeR.z,ijfeR.z),AH2(-klhgR.w,klhgR.w)),max(AH2(-ijfeR.y,ijfeR.y),AH2(-klhgR.x,klhgR.x)));
576 AH2 bothG=max(max(AH2(-ijfeG.z,ijfeG.z),AH2(-klhgG.w,klhgG.w)),max(AH2(-ijfeG.y,ijfeG.y),AH2(-klhgG.x,klhgG.x)));
577 AH2 bothB=max(max(AH2(-ijfeB.z,ijfeB.z),AH2(-klhgB.w,klhgB.w)),max(AH2(-ijfeB.y,ijfeB.y),AH2(-klhgB.x,klhgB.x)));
578 // This part is different for FP16, working pairs of taps at a time.
579 AH2 pR=AH2_(0.0);
580 AH2 pG=AH2_(0.0);
581 AH2 pB=AH2_(0.0);
582 AH2 pW=AH2_(0.0);
583 FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0, 1.0)-ppp.xx,AH2(-1.0,-1.0)-ppp.yy,dir,len2,lob,clp,bczzR.xy,bczzG.xy,bczzB.xy);
584 FsrEasuTapH(pR,pG,pB,pW,AH2(-1.0, 0.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy);
585 FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0,-1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw);
586 FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 2.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy);
587 FsrEasuTapH(pR,pG,pB,pW,AH2( 2.0, 1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw);
588 FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 0.0)-ppp.xx,AH2( 2.0, 2.0)-ppp.yy,dir,len2,lob,clp,zzonR.zw,zzonG.zw,zzonB.zw);
 // Reduce the packed pair accumulators to single sums.
589 AH3 aC=AH3(pR.x+pR.y,pG.x+pG.y,pB.x+pB.y);
590 AH1 aW=pW.x+pW.y;
591//------------------------------------------------------------------------------------------------------------------------------
592 // Slightly different for FP16 version due to combined min and max.
593 pix=min(AH3(bothR.y,bothG.y,bothB.y),max(-AH3(bothR.x,bothG.x,bothB.x),aC*AH3_(ARcpH1(aW))));}
594#endif
595////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
596////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
597////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
598////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
599//_____________________________________________________________/\_______________________________________________________________
600//==============================================================================================================================
601//
602// FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING
603//
604//------------------------------------------------------------------------------------------------------------------------------
605// CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness.
606// RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping.
607// RCAS also has a built in process to limit sharpening of what it detects as possible noise.
608// RCAS sharpening does not support scaling, as it should be applied after EASU scaling.
609// Pass EASU output straight into RCAS, no color conversions necessary.
610//------------------------------------------------------------------------------------------------------------------------------
611// RCAS is based on the following logic.
612// RCAS uses a 5 tap filter in a cross pattern (same as CAS),
613// w n
614// w 1 w for taps w m e
615// w s
616// Where 'w' is the negative lobe weight.
617// output = (w*(n+e+w+s)+m)/(4*w+1)
618// RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range,
619// 0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s)
620// 1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1)
621// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount.
622// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues.
623// So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps.
624// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation.
625// This stabilizes RCAS.
626// RCAS does a simple highpass which is normalized against the local contrast then shaped,
627// 0.25
628// 0.25 -1 0.25
629// 0.25
630// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges.
631//
632// GLSL example for the required callbacks :
633//
634// AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));}
635// void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b)
636// {
637// //do any simple input color conversions here or leave empty if none needed
638// }
639//
640// FsrRcasCon need to be called from the CPU or GPU to set up constants.
641// Including a GPU example here, the 'con' value would be stored out to a constant buffer.
642//
643// AU4 con;
644// FsrRcasCon(con,
645// 0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}.
646// ---------------
647// RCAS sharpening supports a CAS-like pass-through alpha via,
648// #define FSR_RCAS_PASSTHROUGH_ALPHA 1
649// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise.
650// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define,
651// #define FSR_RCAS_DENOISE 1
652//==============================================================================================================================
653// This is set at the limit of providing unnatural results for sharpening.
654#define FSR_RCAS_LIMIT (0.25-(1.0/16.0))
655////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
656////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
657//_____________________________________________________________/\_______________________________________________________________
658//==============================================================================================================================
659// CONSTANT SETUP
660//==============================================================================================================================
661// Call to setup required constant values (works on CPU or GPU).
// Packs the RCAS sharpness constant consumed by FsrRcasF/FsrRcasH/FsrRcasHx2.
662A_STATIC void FsrRcasCon(
// Output constant block (passed as 'con' to the filter kernels).
663outAU4 con,
664// The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}.
665AF1 sharpness){
666 // Transform from stops to linear value.
667 sharpness=AExp2F1(-sharpness);
 // Duplicate into both lanes for the packed FP16 paths.
668 varAF2(hSharp)=initAF2(sharpness,sharpness);
 // con[0] = sharpness as FP32 bits; con[1] = same value packed as 2xFP16.
669 con[0]=AU1_AF1(sharpness);
670 con[1]=AU1_AH2_AF2(hSharp);
 // con[2]/con[3] are not read by the RCAS kernels below; zeroed.
671 con[2]=0;
672 con[3]=0;}
673////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
674////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
675//_____________________________________________________________/\_______________________________________________________________
676//==============================================================================================================================
677// NON-PACKED 32-BIT VERSION
678//==============================================================================================================================
679#if defined(A_GPU)&&defined(FSR_RCAS_F)
680 // Input callback prototypes that need to be implemented by calling shader
681 AF4 FsrRcasLoadF(ASU2 p);
682 void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b);
683//------------------------------------------------------------------------------------------------------------------------------
 // Applies RCAS sharpening to one output pixel (FP32 path). Caller must
 // implement FsrRcasLoadF/FsrRcasInputF; constants come from FsrRcasCon.
684 void FsrRcasF(
685 out AF1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy.
686 out AF1 pixG,
687 out AF1 pixB,
688 #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
689 out AF1 pixA,
690 #endif
691 AU2 ip, // Integer pixel position in output.
692 AU4 con){ // Constant generated by RcasSetup().
693 // Algorithm uses minimal 3x3 pixel neighborhood.
694 // b
695 // d e f
696 // h
 // Fetch the 5-tap cross (b,d,e,f,h) centered on 'ip'.
697 ASU2 sp=ASU2(ip);
698 AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb;
699 AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb;
700 #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
701 AF4 ee=FsrRcasLoadF(sp);
702 AF3 e=ee.rgb;pixA=ee.a;
703 #else
704 AF3 e=FsrRcasLoadF(sp).rgb;
705 #endif
706 AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb;
707 AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb;
708 // Rename (32-bit) or regroup (16-bit).
709 AF1 bR=b.r;
710 AF1 bG=b.g;
711 AF1 bB=b.b;
712 AF1 dR=d.r;
713 AF1 dG=d.g;
714 AF1 dB=d.b;
715 AF1 eR=e.r;
716 AF1 eG=e.g;
717 AF1 eB=e.b;
718 AF1 fR=f.r;
719 AF1 fG=f.g;
720 AF1 fB=f.b;
721 AF1 hR=h.r;
722 AF1 hG=h.g;
723 AF1 hB=h.b;
724 // Run optional input transform.
725 FsrRcasInputF(bR,bG,bB);
726 FsrRcasInputF(dR,dG,dB);
727 FsrRcasInputF(eR,eG,eB);
728 FsrRcasInputF(fR,fG,fB);
729 FsrRcasInputF(hR,hG,hB);
730 // Luma times 2.
 // Weights {0.5,1.0,0.5} for {B,G,R+...} give a 2x-scaled luma approximation.
731 AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG);
732 AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG);
733 AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG);
734 AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG);
735 AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG);
736 // Noise detection.
737 AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL;
738 nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL)));
 // Remap so nz=1.0 means clean signal, nz=0.5 means full noise (used under FSR_RCAS_DENOISE).
739 nz=AF1_(-0.5)*nz+AF1_(1.0);
740 // Min and max of ring.
741 AF1 mn4R=min(AMin3F1(bR,dR,fR),hR);
742 AF1 mn4G=min(AMin3F1(bG,dG,fG),hG);
743 AF1 mn4B=min(AMin3F1(bB,dB,fB),hB);
744 AF1 mx4R=max(AMax3F1(bR,dR,fR),hR);
745 AF1 mx4G=max(AMax3F1(bG,dG,fG),hG);
746 AF1 mx4B=max(AMax3F1(bB,dB,fB),hB);
747 // Immediate constants for peak range.
748 AF2 peakC=AF2(1.0,-1.0*4.0);
749 // Limiters, these need to be high precision RCPs.
 // Solve the negative-lobe weight that just avoids clipping at 0 (hitMin) and at 1 (hitMax),
 // per the derivation in the RCAS header comment above.
750 AF1 hitMinR=mn4R*ARcpF1(AF1_(4.0)*mx4R);
751 AF1 hitMinG=mn4G*ARcpF1(AF1_(4.0)*mx4G);
752 AF1 hitMinB=mn4B*ARcpF1(AF1_(4.0)*mx4B);
753 AF1 hitMaxR=(peakC.x-mx4R)*ARcpF1(AF1_(4.0)*mn4R+peakC.y);
754 AF1 hitMaxG=(peakC.x-mx4G)*ARcpF1(AF1_(4.0)*mn4G+peakC.y);
755 AF1 hitMaxB=(peakC.x-mx4B)*ARcpF1(AF1_(4.0)*mn4B+peakC.y);
756 AF1 lobeR=max(-hitMinR,hitMaxR);
757 AF1 lobeG=max(-hitMinG,hitMaxG);
758 AF1 lobeB=max(-hitMinB,hitMaxB);
 // Clamp lobe to FSR_RCAS_LIMIT and scale by the sharpness constant (con.x).
759 AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x);
760 // Apply noise removal.
761 #ifdef FSR_RCAS_DENOISE
762 lobe*=nz;
763 #endif
764 // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
765 AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0));
766 pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL;
767 pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
768 pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;
769 return;}
770#endif
771////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
772////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
773//_____________________________________________________________/\_______________________________________________________________
774//==============================================================================================================================
775// NON-PACKED 16-BIT VERSION
776//==============================================================================================================================
777#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H)
778 // Input callback prototypes that need to be implemented by calling shader
779 AH4 FsrRcasLoadH(ASW2 p);
780 void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b);
781//------------------------------------------------------------------------------------------------------------------------------
 // FP16 (non-packed) variant of FsrRcasF; identical structure, half-precision math.
782 void FsrRcasH(
783 out AH1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy.
784 out AH1 pixG,
785 out AH1 pixB,
786 #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
787 out AH1 pixA,
788 #endif
789 AU2 ip, // Integer pixel position in output.
790 AU4 con){ // Constant generated by RcasSetup().
791 // Sharpening algorithm uses minimal 3x3 pixel neighborhood.
792 // b
793 // d e f
794 // h
 // Fetch the 5-tap cross (b,d,e,f,h) centered on 'ip'.
795 ASW2 sp=ASW2(ip);
796 AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb;
797 AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb;
798 #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
799 AH4 ee=FsrRcasLoadH(sp);
800 AH3 e=ee.rgb;pixA=ee.a;
801 #else
802 AH3 e=FsrRcasLoadH(sp).rgb;
803 #endif
804 AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb;
805 AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb;
806 // Rename (32-bit) or regroup (16-bit).
807 AH1 bR=b.r;
808 AH1 bG=b.g;
809 AH1 bB=b.b;
810 AH1 dR=d.r;
811 AH1 dG=d.g;
812 AH1 dB=d.b;
813 AH1 eR=e.r;
814 AH1 eG=e.g;
815 AH1 eB=e.b;
816 AH1 fR=f.r;
817 AH1 fG=f.g;
818 AH1 fB=f.b;
819 AH1 hR=h.r;
820 AH1 hG=h.g;
821 AH1 hB=h.b;
822 // Run optional input transform.
823 FsrRcasInputH(bR,bG,bB);
824 FsrRcasInputH(dR,dG,dB);
825 FsrRcasInputH(eR,eG,eB);
826 FsrRcasInputH(fR,fG,fB);
827 FsrRcasInputH(hR,hG,hB);
828 // Luma times 2.
829 AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG);
830 AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG);
831 AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG);
832 AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG);
833 AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG);
834 // Noise detection.
 // nz remaps to {1.0 = clean .. 0.5 = noise}; only applied under FSR_RCAS_DENOISE.
835 AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL;
836 nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL)));
837 nz=AH1_(-0.5)*nz+AH1_(1.0);
838 // Min and max of ring.
839 AH1 mn4R=min(AMin3H1(bR,dR,fR),hR);
840 AH1 mn4G=min(AMin3H1(bG,dG,fG),hG);
841 AH1 mn4B=min(AMin3H1(bB,dB,fB),hB);
842 AH1 mx4R=max(AMax3H1(bR,dR,fR),hR);
843 AH1 mx4G=max(AMax3H1(bG,dG,fG),hG);
844 AH1 mx4B=max(AMax3H1(bB,dB,fB),hB);
845 // Immediate constants for peak range.
846 AH2 peakC=AH2(1.0,-1.0*4.0);
847 // Limiters, these need to be high precision RCPs.
848 AH1 hitMinR=mn4R*ARcpH1(AH1_(4.0)*mx4R);
849 AH1 hitMinG=mn4G*ARcpH1(AH1_(4.0)*mx4G);
850 AH1 hitMinB=mn4B*ARcpH1(AH1_(4.0)*mx4B);
851 AH1 hitMaxR=(peakC.x-mx4R)*ARcpH1(AH1_(4.0)*mn4R+peakC.y);
852 AH1 hitMaxG=(peakC.x-mx4G)*ARcpH1(AH1_(4.0)*mn4G+peakC.y);
853 AH1 hitMaxB=(peakC.x-mx4B)*ARcpH1(AH1_(4.0)*mn4B+peakC.y);
854 AH1 lobeR=max(-hitMinR,hitMaxR);
855 AH1 lobeG=max(-hitMinG,hitMaxG);
856 AH1 lobeB=max(-hitMinB,hitMaxB);
 // FP16 path reads the packed sharpness from con.y (written by FsrRcasCon).
857 AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x;
858 // Apply noise removal.
859 #ifdef FSR_RCAS_DENOISE
860 lobe*=nz;
861 #endif
862 // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
863 AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0));
864 pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL;
865 pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
866 pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
867#endif
868////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
869////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
870//_____________________________________________________________/\_______________________________________________________________
871//==============================================================================================================================
872// PACKED 16-BIT VERSION
873//==============================================================================================================================
874#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2)
875 // Input callback prototypes that need to be implemented by the calling shader
876 AH4 FsrRcasLoadHx2(ASW2 p);
877 void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b);
878//------------------------------------------------------------------------------------------------------------------------------
879 // Can be used to convert from packed Structures of Arrays to Arrays of Structures for store.
 // SoA -> AoS: .x lanes become pix0 (left 8x8 tile), .y lanes become pix1 (right tile).
880 void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
881 #ifdef A_HLSL
882 // Invoke a slower path for DX only, since it won't allow uninitialized values.
883 pix0.a=pix1.a=0.0;
884 #endif
885 pix0.rgb=AH3(pixR.x,pixG.x,pixB.x);
886 pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);}
887//------------------------------------------------------------------------------------------------------------------------------
 // Packed FP16 RCAS: sharpens two pixels at once (one per AH2 lane), same math as FsrRcasH.
888 void FsrRcasHx2(
889 // Output values are for 2 8x8 tiles in a 16x8 region.
890 // pix<R,G,B>.x = left 8x8 tile
891 // pix<R,G,B>.y = right 8x8 tile
892 // This enables later processing to easily be packed as well.
893 out AH2 pixR,
894 out AH2 pixG,
895 out AH2 pixB,
896 #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
897 out AH2 pixA,
898 #endif
899 AU2 ip, // Integer pixel position in output.
900 AU4 con){ // Constant generated by RcasSetup().
901 // No scaling algorithm uses minimal 3x3 pixel neighborhood.
 // sp0/sp1 are the left and right tile positions (sp1 = sp0 + {8,0}); fetch both crosses.
902 ASW2 sp0=ASW2(ip);
903 AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb;
904 AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb;
905 #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
906 AH4 ee0=FsrRcasLoadHx2(sp0);
907 AH3 e0=ee0.rgb;pixA.r=ee0.a;
908 #else
909 AH3 e0=FsrRcasLoadHx2(sp0).rgb;
910 #endif
911 AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb;
912 AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb;
913 ASW2 sp1=sp0+ASW2(8,0);
914 AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb;
915 AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb;
916 #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
917 AH4 ee1=FsrRcasLoadHx2(sp1);
918 AH3 e1=ee1.rgb;pixA.g=ee1.a;
919 #else
920 AH3 e1=FsrRcasLoadHx2(sp1).rgb;
921 #endif
922 AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb;
923 AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb;
924 // Arrays of Structures to Structures of Arrays conversion.
925 AH2 bR=AH2(b0.r,b1.r);
926 AH2 bG=AH2(b0.g,b1.g);
927 AH2 bB=AH2(b0.b,b1.b);
928 AH2 dR=AH2(d0.r,d1.r);
929 AH2 dG=AH2(d0.g,d1.g);
930 AH2 dB=AH2(d0.b,d1.b);
931 AH2 eR=AH2(e0.r,e1.r);
932 AH2 eG=AH2(e0.g,e1.g);
933 AH2 eB=AH2(e0.b,e1.b);
934 AH2 fR=AH2(f0.r,f1.r);
935 AH2 fG=AH2(f0.g,f1.g);
936 AH2 fB=AH2(f0.b,f1.b);
937 AH2 hR=AH2(h0.r,h1.r);
938 AH2 hG=AH2(h0.g,h1.g);
939 AH2 hB=AH2(h0.b,h1.b);
940 // Run optional input transform.
941 FsrRcasInputHx2(bR,bG,bB);
942 FsrRcasInputHx2(dR,dG,dB);
943 FsrRcasInputHx2(eR,eG,eB);
944 FsrRcasInputHx2(fR,fG,fB);
945 FsrRcasInputHx2(hR,hG,hB);
946 // Luma times 2.
947 AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG);
948 AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG);
949 AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG);
950 AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG);
951 AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG);
952 // Noise detection.
 // nz remaps to {1.0 = clean .. 0.5 = noise}; only applied under FSR_RCAS_DENOISE.
953 AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL;
954 nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL)));
955 nz=AH2_(-0.5)*nz+AH2_(1.0);
956 // Min and max of ring.
957 AH2 mn4R=min(AMin3H2(bR,dR,fR),hR);
958 AH2 mn4G=min(AMin3H2(bG,dG,fG),hG);
959 AH2 mn4B=min(AMin3H2(bB,dB,fB),hB);
960 AH2 mx4R=max(AMax3H2(bR,dR,fR),hR);
961 AH2 mx4G=max(AMax3H2(bG,dG,fG),hG);
962 AH2 mx4B=max(AMax3H2(bB,dB,fB),hB);
963 // Immediate constants for peak range.
964 AH2 peakC=AH2(1.0,-1.0*4.0);
965 // Limiters, these need to be high precision RCPs.
966 AH2 hitMinR=mn4R*ARcpH2(AH2_(4.0)*mx4R);
967 AH2 hitMinG=mn4G*ARcpH2(AH2_(4.0)*mx4G);
968 AH2 hitMinB=mn4B*ARcpH2(AH2_(4.0)*mx4B);
969 AH2 hitMaxR=(peakC.x-mx4R)*ARcpH2(AH2_(4.0)*mn4R+peakC.y);
970 AH2 hitMaxG=(peakC.x-mx4G)*ARcpH2(AH2_(4.0)*mn4G+peakC.y);
971 AH2 hitMaxB=(peakC.x-mx4B)*ARcpH2(AH2_(4.0)*mn4B+peakC.y);
972 AH2 lobeR=max(-hitMinR,hitMaxR);
973 AH2 lobeG=max(-hitMinG,hitMaxG);
974 AH2 lobeB=max(-hitMinB,hitMaxB);
 // Packed path reads the FP16 sharpness from con.y (written by FsrRcasCon).
975 AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x);
976 // Apply noise removal.
977 #ifdef FSR_RCAS_DENOISE
978 lobe*=nz;
979 #endif
980 // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
981 AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0));
982 pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL;
983 pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
984 pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
985#endif
986////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
987////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
988////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
989////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
990//_____________________________________________________________/\_______________________________________________________________
991//==============================================================================================================================
992//
993// FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR
994//
995//------------------------------------------------------------------------------------------------------------------------------
996// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts.
997// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel.
998// The 'Lfga*()' functions provide a convenient way to introduce grain.
999// These functions limit grain based on distance to signal limits.
1000// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality.
1001// Grain application should be done in a linear colorspace.
1002// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased).
1003//------------------------------------------------------------------------------------------------------------------------------
1004// Usage,
1005// FsrLfga*(
1006// color, // In/out linear colorspace color {0 to 1} ranged.
1007// grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain.
1008// amount); // Amount of grain {0 to 1} ranged.
1009//------------------------------------------------------------------------------------------------------------------------------
1010// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)'
1011//==============================================================================================================================
1012#if defined(A_GPU)
1013 // Maximum grain is the minimum distance to the signal limit.
 // c += grain*amount, attenuated by min(1-c,c) so the result cannot leave {0 to 1}.
1014 void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){c+=(t*AF3_(a))*min(AF3_(1.0)-c,c);}
1015#endif
1016//==============================================================================================================================
1017#if defined(A_GPU)&&defined(A_HALF)
1018 // Half precision version (slower).
 // Same energy-limited grain as FsrLfgaF, in FP16.
1019 void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){c+=(t*AH3_(a))*min(AH3_(1.0)-c,c);}
1020//------------------------------------------------------------------------------------------------------------------------------
1021 // Packed half precision version (faster).
 // Two pixels per call (one per AH2 lane), grain applied independently per channel.
1022 void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){
1023 cR+=(tR*AH2_(a))*min(AH2_(1.0)-cR,cR);cG+=(tG*AH2_(a))*min(AH2_(1.0)-cG,cG);cB+=(tB*AH2_(a))*min(AH2_(1.0)-cB,cB);}
1024#endif
1025////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1026////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1027////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1028////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1029//_____________________________________________________________/\_______________________________________________________________
1030//==============================================================================================================================
1031//
1032// FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER
1033//
1034//------------------------------------------------------------------------------------------------------------------------------
1035// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear.
1036// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering.
1037//------------------------------------------------------------------------------------------------------------------------------
1038// Reversible tonemapper usage,
1039// FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}.
1040// FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}.
1041//==============================================================================================================================
1042#if defined(A_GPU)
 // Tonemap: c' = c/(max(c.rgb)+1); maps {0 to inf} into {0 to 1} while preserving the RGB ratio.
1043 void FsrSrtmF(inout AF3 c){c*=AF3_(ARcpF1(AMax3F1(c.r,c.g,c.b)+AF1_(1.0)));}
1044 // The extra max solves the c=1.0 case (which is a /0).
 // Inverse: c = c/(1-max(c.rgb)), clamped so the output peaks at 32768 (FP16 safe).
1045 void FsrSrtmInvF(inout AF3 c){c*=AF3_(ARcpF1(max(AF1_(1.0/32768.0),AF1_(1.0)-AMax3F1(c.r,c.g,c.b))));}
1046#endif
1047//==============================================================================================================================
1048#if defined(A_GPU)&&defined(A_HALF)
 // FP16 variants of FsrSrtmF/FsrSrtmInvF (see comments above for the math).
1049 void FsrSrtmH(inout AH3 c){c*=AH3_(ARcpH1(AMax3H1(c.r,c.g,c.b)+AH1_(1.0)));}
1050 void FsrSrtmInvH(inout AH3 c){c*=AH3_(ARcpH1(max(AH1_(1.0/32768.0),AH1_(1.0)-AMax3H1(c.r,c.g,c.b))));}
1051//------------------------------------------------------------------------------------------------------------------------------
 // Packed forms: one reciprocal shared across the two pixels' three channels.
1052 void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){
1053 AH2 rcp=ARcpH2(AMax3H2(cR,cG,cB)+AH2_(1.0));cR*=rcp;cG*=rcp;cB*=rcp;}
1054 void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){
1055 AH2 rcp=ARcpH2(max(AH2_(1.0/32768.0),AH2_(1.0)-AMax3H2(cR,cG,cB)));cR*=rcp;cG*=rcp;cB*=rcp;}
1056#endif
1057////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1058////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1059////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1060////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1061//_____________________________________________________________/\_______________________________________________________________
1062//==============================================================================================================================
1063//
1064// FSR - [TEPD] TEMPORAL ENERGY PRESERVING DITHER
1065//
1066//------------------------------------------------------------------------------------------------------------------------------
1067// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
1068// Gamma 2.0 is used so that the conversion back to linear is just to square the color.
1069// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively.
1070// Given good non-biased temporal blue noise as dither input,
1071// the output dither will temporally conserve energy.
1072// This is done by choosing the linear nearest step point instead of perceptual nearest.
1073// See code below for details.
1074//------------------------------------------------------------------------------------------------------------------------------
1075// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION
1076// ===============================================
1077// - Output is 'uint(floor(saturate(n)*255.0+0.5))'.
1078// - Thus rounding is to nearest.
1079// - NaN gets converted to zero.
1080// - INF is clamped to {0.0 to 1.0}.
1081//==============================================================================================================================
1082#if defined(A_GPU)
1083 // Hand tuned integer position to dither value, with more values than simple checkerboard.
1084 // Only 32-bit has enough precision for this computation.
1085 // Output is {0 to <1}.
 // Golden-ratio dither sequence; 'f' offsets x per invocation (presumably a frame
 // counter for the temporal variation described in the TEPD header — confirm at call site).
1086 AF1 FsrTepdDitF(AU2 p,AU1 f){
1087 AF1 x=AF1_(p.x+f);
1088 AF1 y=AF1_(p.y);
1089 // The 1.61803 golden ratio.
1090 AF1 a=AF1_((1.0+sqrt(5.0))/2.0);
1091 // Number designed to provide a good visual pattern.
1092 AF1 b=AF1_(1.0/3.69);
1093 x=x*a+(y*b);
1094 return AFractF1(x);}
1095//------------------------------------------------------------------------------------------------------------------------------
1096 // This version is 8-bit gamma 2.0.
1097 // The 'c' input is {0 to 1}.
1098 // Output is {0 to 1} ready for image store.
1099 void FsrTepdC8F(inout AF3 c,AF1 dit){
 // 'n' = nearest 8-bit gamma-2.0 code at/below c (sqrt is linear->gamma-2.0).
1100 AF3 n=sqrt(c);
1101 n=floor(n*AF3_(255.0))*AF3_(1.0/255.0);
 // 'a'/'b' = linear values of the two adjacent code steps (square is gamma->linear).
1102 AF3 a=n*n;
1103 AF3 b=n+AF3_(1.0/255.0);b=b*b;
1104 // Ratio of 'a' to 'b' required to produce 'c'.
1105 // APrxLoRcpF1() won't work here (at least for very high dynamic ranges).
1106 // APrxMedRcpF1() is an IADD,FMA,MUL.
1107 AF3 r=(c-b)*APrxMedRcpF3(a-b);
1108 // Use the ratio as a cutoff to choose 'a' or 'b'.
1109 // AGtZeroF1() is a MUL.
1110 c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/255.0));}
1111//------------------------------------------------------------------------------------------------------------------------------
1112 // This version is 10-bit gamma 2.0.
1113 // The 'c' input is {0 to 1}.
1114 // Output is {0 to 1} ready for image store.
 // Identical to FsrTepdC8F with 1023 steps instead of 255.
1115 void FsrTepdC10F(inout AF3 c,AF1 dit){
1116 AF3 n=sqrt(c);
1117 n=floor(n*AF3_(1023.0))*AF3_(1.0/1023.0);
1118 AF3 a=n*n;
1119 AF3 b=n+AF3_(1.0/1023.0);b=b*b;
1120 AF3 r=(c-b)*APrxMedRcpF3(a-b);
1121 c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/1023.0));}
1122#endif
1123//==============================================================================================================================
1124#if defined(A_GPU)&&defined(A_HALF)
 // FP16-output dither; math is still done in FP32 since only 32-bit has
 // enough precision for this computation (see note above FsrTepdDitF).
1125 AH1 FsrTepdDitH(AU2 p,AU1 f){
1126 AF1 x=AF1_(p.x+f);
1127 AF1 y=AF1_(p.y);
1128 AF1 a=AF1_((1.0+sqrt(5.0))/2.0);
1129 AF1 b=AF1_(1.0/3.69);
1130 x=x*a+(y*b);
1131 return AH1(AFractF1(x));}
1132//------------------------------------------------------------------------------------------------------------------------------
 // 8-bit gamma 2.0 dither, FP16 variant of FsrTepdC8F (see its comments).
1133 void FsrTepdC8H(inout AH3 c,AH1 dit){
1134 AH3 n=sqrt(c);
1135 n=floor(n*AH3_(255.0))*AH3_(1.0/255.0);
1136 AH3 a=n*n;
1137 AH3 b=n+AH3_(1.0/255.0);b=b*b;
1138 AH3 r=(c-b)*APrxMedRcpH3(a-b);
1139 c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/255.0));}
1140//------------------------------------------------------------------------------------------------------------------------------
 // 10-bit variant (1023 steps).
1141 void FsrTepdC10H(inout AH3 c,AH1 dit){
1142 AH3 n=sqrt(c);
1143 n=floor(n*AH3_(1023.0))*AH3_(1.0/1023.0);
1144 AH3 a=n*n;
1145 AH3 b=n+AH3_(1.0/1023.0);b=b*b;
1146 AH3 r=(c-b)*APrxMedRcpH3(a-b);
1147 c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/1023.0));}
1148//==============================================================================================================================
1149 // This computes dither for positions 'p' and 'p+{8,0}'.
1150 AH2 FsrTepdDitHx2(AU2 p,AU1 f){
1151 AF2 x;
1152 x.x=AF1_(p.x+f);
1153 x.y=x.x+AF1_(8.0);
1154 AF1 y=AF1_(p.y);
1155 AF1 a=AF1_((1.0+sqrt(5.0))/2.0);
1156 AF1 b=AF1_(1.0/3.69);
1157 x=x*AF2_(a)+AF2_(y*b);
1158 return AH2(AFractF2(x));}
1159//------------------------------------------------------------------------------------------------------------------------------
 // Packed 8-bit version: two pixels per call, one per AH2 lane; same stepping as FsrTepdC8F.
1160 void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){
1161 AH2 nR=sqrt(cR);
1162 AH2 nG=sqrt(cG);
1163 AH2 nB=sqrt(cB);
1164 nR=floor(nR*AH2_(255.0))*AH2_(1.0/255.0);
1165 nG=floor(nG*AH2_(255.0))*AH2_(1.0/255.0);
1166 nB=floor(nB*AH2_(255.0))*AH2_(1.0/255.0);
1167 AH2 aR=nR*nR;
1168 AH2 aG=nG*nG;
1169 AH2 aB=nB*nB;
1170 AH2 bR=nR+AH2_(1.0/255.0);bR=bR*bR;
1171 AH2 bG=nG+AH2_(1.0/255.0);bG=bG*bG;
1172 AH2 bB=nB+AH2_(1.0/255.0);bB=bB*bB;
1173 AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR);
1174 AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG);
1175 AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB);
1176 cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/255.0));
1177 cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/255.0));
1178 cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/255.0));}
1179//------------------------------------------------------------------------------------------------------------------------------
 // Packed 10-bit version (1023 steps).
1180 void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){
1181 AH2 nR=sqrt(cR);
1182 AH2 nG=sqrt(cG);
1183 AH2 nB=sqrt(cB);
1184 nR=floor(nR*AH2_(1023.0))*AH2_(1.0/1023.0);
1185 nG=floor(nG*AH2_(1023.0))*AH2_(1.0/1023.0);
1186 nB=floor(nB*AH2_(1023.0))*AH2_(1.0/1023.0);
1187 AH2 aR=nR*nR;
1188 AH2 aG=nG*nG;
1189 AH2 aB=nB*nB;
1190 AH2 bR=nR+AH2_(1.0/1023.0);bR=bR*bR;
1191 AH2 bG=nG+AH2_(1.0/1023.0);bG=bG*bG;
1192 AH2 bB=nB+AH2_(1.0/1023.0);bB=bB*bB;
1193 AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR);
1194 AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG);
1195 AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB);
1196 cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/1023.0));
1197 cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/1023.0));
1198 cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/1023.0));}
1199#endif
diff --git a/externals/FidelityFX-FSR/license.txt b/externals/FidelityFX-FSR/license.txt
new file mode 100644
index 000000000..324cba594
--- /dev/null
+++ b/externals/FidelityFX-FSR/license.txt
@@ -0,0 +1,19 @@
1Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
2
3Permission is hereby granted, free of charge, to any person obtaining a copy
4of this software and associated documentation files (the "Software"), to deal
5in the Software without restriction, including without limitation the rights
6to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7copies of the Software, and to permit persons to whom the Software is
8furnished to do so, subject to the following conditions:
9
10The above copyright notice and this permission notice shall be included in
11all copies or substantial portions of the Software.
12
13THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19THE SOFTWARE.
diff --git a/src/common/math_util.h b/src/common/math_util.h
index 4c38d8040..510c4e56d 100644
--- a/src/common/math_util.h
+++ b/src/common/math_util.h
@@ -48,8 +48,8 @@ struct Rectangle {
48 } 48 }
49 49
50 [[nodiscard]] Rectangle<T> Scale(const float s) const { 50 [[nodiscard]] Rectangle<T> Scale(const float s) const {
51 return Rectangle{left, top, static_cast<T>(left + GetWidth() * s), 51 return Rectangle{left, top, static_cast<T>(static_cast<float>(left + GetWidth()) * s),
52 static_cast<T>(top + GetHeight() * s)}; 52 static_cast<T>(static_cast<float>(top + GetHeight()) * s)};
53 } 53 }
54}; 54};
55 55
diff --git a/src/common/settings.cpp b/src/common/settings.cpp
index 9dd5e3efb..3bcaa072f 100644
--- a/src/common/settings.cpp
+++ b/src/common/settings.cpp
@@ -47,7 +47,9 @@ void LogSettings() {
47 log_setting("System_TimeZoneIndex", values.time_zone_index.GetValue()); 47 log_setting("System_TimeZoneIndex", values.time_zone_index.GetValue());
48 log_setting("Core_UseMultiCore", values.use_multi_core.GetValue()); 48 log_setting("Core_UseMultiCore", values.use_multi_core.GetValue());
49 log_setting("CPU_Accuracy", values.cpu_accuracy.GetValue()); 49 log_setting("CPU_Accuracy", values.cpu_accuracy.GetValue());
50 log_setting("Renderer_UseResolutionFactor", values.resolution_factor.GetValue()); 50 log_setting("Renderer_UseResolutionScaling", values.resolution_setup.GetValue());
51 log_setting("Renderer_ScalingFilter", values.scaling_filter.GetValue());
52 log_setting("Renderer_AntiAliasing", values.anti_aliasing.GetValue());
51 log_setting("Renderer_UseSpeedLimit", values.use_speed_limit.GetValue()); 53 log_setting("Renderer_UseSpeedLimit", values.use_speed_limit.GetValue());
52 log_setting("Renderer_SpeedLimit", values.speed_limit.GetValue()); 54 log_setting("Renderer_SpeedLimit", values.speed_limit.GetValue());
53 log_setting("Renderer_UseDiskShaderCache", values.use_disk_shader_cache.GetValue()); 55 log_setting("Renderer_UseDiskShaderCache", values.use_disk_shader_cache.GetValue());
@@ -105,6 +107,55 @@ float Volume() {
105 return values.volume.GetValue() / 100.0f; 107 return values.volume.GetValue() / 100.0f;
106} 108}
107 109
110void UpdateRescalingInfo() {
111 const auto setup = values.resolution_setup.GetValue();
112 auto& info = values.resolution_info;
113 info.downscale = false;
114 switch (setup) {
115 case ResolutionSetup::Res1_2X:
116 info.up_scale = 1;
117 info.down_shift = 1;
118 info.downscale = true;
119 break;
120 case ResolutionSetup::Res3_4X:
121 info.up_scale = 3;
122 info.down_shift = 2;
123 info.downscale = true;
124 break;
125 case ResolutionSetup::Res1X:
126 info.up_scale = 1;
127 info.down_shift = 0;
128 break;
129 case ResolutionSetup::Res2X:
130 info.up_scale = 2;
131 info.down_shift = 0;
132 break;
133 case ResolutionSetup::Res3X:
134 info.up_scale = 3;
135 info.down_shift = 0;
136 break;
137 case ResolutionSetup::Res4X:
138 info.up_scale = 4;
139 info.down_shift = 0;
140 break;
141 case ResolutionSetup::Res5X:
142 info.up_scale = 5;
143 info.down_shift = 0;
144 break;
145 case ResolutionSetup::Res6X:
146 info.up_scale = 6;
147 info.down_shift = 0;
148 break;
149 default:
150 UNREACHABLE();
151 info.up_scale = 1;
152 info.down_shift = 0;
153 }
154 info.up_factor = static_cast<f32>(info.up_scale) / (1U << info.down_shift);
155 info.down_factor = static_cast<f32>(1U << info.down_shift) / info.up_scale;
156 info.active = info.up_scale != 1 || info.down_shift != 0;
157}
158
108void RestoreGlobalState(bool is_powered_on) { 159void RestoreGlobalState(bool is_powered_on) {
109 // If a game is running, DO NOT restore the global settings state 160 // If a game is running, DO NOT restore the global settings state
110 if (is_powered_on) { 161 if (is_powered_on) {
diff --git a/src/common/settings.h b/src/common/settings.h
index 9ff4cf85d..42f8b4a7d 100644
--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -52,6 +52,56 @@ enum class NvdecEmulation : u32 {
52 GPU = 2, 52 GPU = 2,
53}; 53};
54 54
55enum class ResolutionSetup : u32 {
56 Res1_2X = 0,
57 Res3_4X = 1,
58 Res1X = 2,
59 Res2X = 3,
60 Res3X = 4,
61 Res4X = 5,
62 Res5X = 6,
63 Res6X = 7,
64};
65
66enum class ScalingFilter : u32 {
67 NearestNeighbor = 0,
68 Bilinear = 1,
69 Bicubic = 2,
70 Gaussian = 3,
71 ScaleForce = 4,
72 Fsr = 5,
73 LastFilter = Fsr,
74};
75
76enum class AntiAliasing : u32 {
77 None = 0,
78 Fxaa = 1,
79 LastAA = Fxaa,
80};
81
82struct ResolutionScalingInfo {
83 u32 up_scale{1};
84 u32 down_shift{0};
85 f32 up_factor{1.0f};
86 f32 down_factor{1.0f};
87 bool active{};
88 bool downscale{};
89
90 s32 ScaleUp(s32 value) const {
91 if (value == 0) {
92 return 0;
93 }
94 return std::max((value * static_cast<s32>(up_scale)) >> static_cast<s32>(down_shift), 1);
95 }
96
97 u32 ScaleUp(u32 value) const {
98 if (value == 0U) {
99 return 0U;
100 }
101 return std::max((value * up_scale) >> down_shift, 1U);
102 }
103};
104
55/** The BasicSetting class is a simple resource manager. It defines a label and default value 105/** The BasicSetting class is a simple resource manager. It defines a label and default value
56 * alongside the actual value of the setting for simpler and less-error prone use with frontend 106 * alongside the actual value of the setting for simpler and less-error prone use with frontend
57 * configurations. Setting a default value and label is required, though subclasses may deviate from 107 * configurations. Setting a default value and label is required, though subclasses may deviate from
@@ -451,7 +501,10 @@ struct Values {
451 "disable_shader_loop_safety_checks"}; 501 "disable_shader_loop_safety_checks"};
452 Setting<int> vulkan_device{0, "vulkan_device"}; 502 Setting<int> vulkan_device{0, "vulkan_device"};
453 503
454 Setting<u16> resolution_factor{1, "resolution_factor"}; 504 ResolutionScalingInfo resolution_info{};
505 Setting<ResolutionSetup> resolution_setup{ResolutionSetup::Res1X, "resolution_setup"};
506 Setting<ScalingFilter> scaling_filter{ScalingFilter::Bilinear, "scaling_filter"};
507 Setting<AntiAliasing> anti_aliasing{AntiAliasing::None, "anti_aliasing"};
455 // *nix platforms may have issues with the borderless windowed fullscreen mode. 508 // *nix platforms may have issues with the borderless windowed fullscreen mode.
456 // Default to exclusive fullscreen on these platforms for now. 509 // Default to exclusive fullscreen on these platforms for now.
457 RangedSetting<FullscreenMode> fullscreen_mode{ 510 RangedSetting<FullscreenMode> fullscreen_mode{
@@ -462,7 +515,7 @@ struct Values {
462#endif 515#endif
463 FullscreenMode::Borderless, FullscreenMode::Exclusive, "fullscreen_mode"}; 516 FullscreenMode::Borderless, FullscreenMode::Exclusive, "fullscreen_mode"};
464 RangedSetting<int> aspect_ratio{0, 0, 3, "aspect_ratio"}; 517 RangedSetting<int> aspect_ratio{0, 0, 3, "aspect_ratio"};
465 RangedSetting<int> max_anisotropy{0, 0, 4, "max_anisotropy"}; 518 RangedSetting<int> max_anisotropy{0, 0, 5, "max_anisotropy"};
466 Setting<bool> use_speed_limit{true, "use_speed_limit"}; 519 Setting<bool> use_speed_limit{true, "use_speed_limit"};
467 RangedSetting<u16> speed_limit{100, 0, 9999, "speed_limit"}; 520 RangedSetting<u16> speed_limit{100, 0, 9999, "speed_limit"};
468 Setting<bool> use_disk_shader_cache{true, "use_disk_shader_cache"}; 521 Setting<bool> use_disk_shader_cache{true, "use_disk_shader_cache"};
@@ -595,6 +648,8 @@ std::string GetTimeZoneString();
595 648
596void LogSettings(); 649void LogSettings();
597 650
651void UpdateRescalingInfo();
652
598// Restore the global state of all applicable settings in the Values struct 653// Restore the global state of all applicable settings in the Values struct
599void RestoreGlobalState(bool is_powered_on); 654void RestoreGlobalState(bool is_powered_on);
600 655
diff --git a/src/core/frontend/framebuffer_layout.cpp b/src/core/frontend/framebuffer_layout.cpp
index 0832463d6..4b58b672a 100644
--- a/src/core/frontend/framebuffer_layout.cpp
+++ b/src/core/frontend/framebuffer_layout.cpp
@@ -44,16 +44,13 @@ FramebufferLayout DefaultFrameLayout(u32 width, u32 height) {
44 return res; 44 return res;
45} 45}
46 46
47FramebufferLayout FrameLayoutFromResolutionScale(u32 res_scale) { 47FramebufferLayout FrameLayoutFromResolutionScale(f32 res_scale) {
48 u32 width, height; 48 const bool is_docked = Settings::values.use_docked_mode.GetValue();
49 const u32 screen_width = is_docked ? ScreenDocked::Width : ScreenUndocked::Width;
50 const u32 screen_height = is_docked ? ScreenDocked::Height : ScreenUndocked::Height;
49 51
50 if (Settings::values.use_docked_mode.GetValue()) { 52 const u32 width = static_cast<u32>(static_cast<f32>(screen_width) * res_scale);
51 width = ScreenDocked::Width * res_scale; 53 const u32 height = static_cast<u32>(static_cast<f32>(screen_height) * res_scale);
52 height = ScreenDocked::Height * res_scale;
53 } else {
54 width = ScreenUndocked::Width * res_scale;
55 height = ScreenUndocked::Height * res_scale;
56 }
57 54
58 return DefaultFrameLayout(width, height); 55 return DefaultFrameLayout(width, height);
59} 56}
diff --git a/src/core/frontend/framebuffer_layout.h b/src/core/frontend/framebuffer_layout.h
index e2e3bbbb3..2e36c0163 100644
--- a/src/core/frontend/framebuffer_layout.h
+++ b/src/core/frontend/framebuffer_layout.h
@@ -60,7 +60,7 @@ FramebufferLayout DefaultFrameLayout(u32 width, u32 height);
60 * Convenience method to get frame layout by resolution scale 60 * Convenience method to get frame layout by resolution scale
61 * @param res_scale resolution scale factor 61 * @param res_scale resolution scale factor
62 */ 62 */
63FramebufferLayout FrameLayoutFromResolutionScale(u32 res_scale); 63FramebufferLayout FrameLayoutFromResolutionScale(f32 res_scale);
64 64
65/** 65/**
66 * Convenience method to determine emulation aspect ratio 66 * Convenience method to determine emulation aspect ratio
diff --git a/src/core/hle/service/am/am.cpp b/src/core/hle/service/am/am.cpp
index 50c2ace93..aee8d4f93 100644
--- a/src/core/hle/service/am/am.cpp
+++ b/src/core/hle/service/am/am.cpp
@@ -797,15 +797,11 @@ void ICommonStateGetter::GetDefaultDisplayResolution(Kernel::HLERequestContext&
797 rb.Push(ResultSuccess); 797 rb.Push(ResultSuccess);
798 798
799 if (Settings::values.use_docked_mode.GetValue()) { 799 if (Settings::values.use_docked_mode.GetValue()) {
800 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedWidth) * 800 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedWidth));
801 static_cast<u32>(Settings::values.resolution_factor.GetValue())); 801 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedHeight));
802 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedHeight) *
803 static_cast<u32>(Settings::values.resolution_factor.GetValue()));
804 } else { 802 } else {
805 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedWidth) * 803 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedWidth));
806 static_cast<u32>(Settings::values.resolution_factor.GetValue())); 804 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedHeight));
807 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedHeight) *
808 static_cast<u32>(Settings::values.resolution_factor.GetValue()));
809 } 805 }
810} 806}
811 807
diff --git a/src/core/hle/service/vi/vi.cpp b/src/core/hle/service/vi/vi.cpp
index 63d5242c4..75ee3e5e4 100644
--- a/src/core/hle/service/vi/vi.cpp
+++ b/src/core/hle/service/vi/vi.cpp
@@ -541,11 +541,8 @@ private:
541 switch (transaction) { 541 switch (transaction) {
542 case TransactionId::Connect: { 542 case TransactionId::Connect: {
543 IGBPConnectRequestParcel request{ctx.ReadBuffer()}; 543 IGBPConnectRequestParcel request{ctx.ReadBuffer()};
544 IGBPConnectResponseParcel response{ 544 IGBPConnectResponseParcel response{static_cast<u32>(DisplayResolution::UndockedWidth),
545 static_cast<u32>(static_cast<u32>(DisplayResolution::UndockedWidth) * 545 static_cast<u32>(DisplayResolution::UndockedHeight)};
546 Settings::values.resolution_factor.GetValue()),
547 static_cast<u32>(static_cast<u32>(DisplayResolution::UndockedHeight) *
548 Settings::values.resolution_factor.GetValue())};
549 546
550 buffer_queue.Connect(); 547 buffer_queue.Connect();
551 548
@@ -775,15 +772,11 @@ private:
775 rb.Push(ResultSuccess); 772 rb.Push(ResultSuccess);
776 773
777 if (Settings::values.use_docked_mode.GetValue()) { 774 if (Settings::values.use_docked_mode.GetValue()) {
778 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedWidth) * 775 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedWidth));
779 static_cast<u32>(Settings::values.resolution_factor.GetValue())); 776 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedHeight));
780 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedHeight) *
781 static_cast<u32>(Settings::values.resolution_factor.GetValue()));
782 } else { 777 } else {
783 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedWidth) * 778 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedWidth));
784 static_cast<u32>(Settings::values.resolution_factor.GetValue())); 779 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedHeight));
785 rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedHeight) *
786 static_cast<u32>(Settings::values.resolution_factor.GetValue()));
787 } 780 }
788 781
789 rb.PushRaw<float>(60.0f); // This wouldn't seem to be correct for 30 fps games. 782 rb.PushRaw<float>(60.0f); // This wouldn't seem to be correct for 30 fps games.
@@ -1063,10 +1056,8 @@ private:
1063 // This only returns the fixed values of 1280x720 and makes no distinguishing 1056 // This only returns the fixed values of 1280x720 and makes no distinguishing
1064 // between docked and undocked dimensions. We take the liberty of applying 1057 // between docked and undocked dimensions. We take the liberty of applying
1065 // the resolution scaling factor here. 1058 // the resolution scaling factor here.
1066 rb.Push(static_cast<u64>(DisplayResolution::UndockedWidth) * 1059 rb.Push(static_cast<u64>(DisplayResolution::UndockedWidth));
1067 static_cast<u32>(Settings::values.resolution_factor.GetValue())); 1060 rb.Push(static_cast<u64>(DisplayResolution::UndockedHeight));
1068 rb.Push(static_cast<u64>(DisplayResolution::UndockedHeight) *
1069 static_cast<u32>(Settings::values.resolution_factor.GetValue()));
1070 } 1061 }
1071 1062
1072 void SetLayerScalingMode(Kernel::HLERequestContext& ctx) { 1063 void SetLayerScalingMode(Kernel::HLERequestContext& ctx) {
@@ -1099,8 +1090,6 @@ private:
1099 LOG_WARNING(Service_VI, "(STUBBED) called"); 1090 LOG_WARNING(Service_VI, "(STUBBED) called");
1100 1091
1101 DisplayInfo display_info; 1092 DisplayInfo display_info;
1102 display_info.width *= static_cast<u64>(Settings::values.resolution_factor.GetValue());
1103 display_info.height *= static_cast<u64>(Settings::values.resolution_factor.GetValue());
1104 ctx.WriteBuffer(&display_info, sizeof(DisplayInfo)); 1093 ctx.WriteBuffer(&display_info, sizeof(DisplayInfo));
1105 IPC::ResponseBuilder rb{ctx, 4}; 1094 IPC::ResponseBuilder rb{ctx, 4};
1106 rb.Push(ResultSuccess); 1095 rb.Push(ResultSuccess);
diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp
index 191475f71..654db0b52 100644
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -229,8 +229,6 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader,
229 AddField(field_type, "Core_UseMultiCore", Settings::values.use_multi_core.GetValue()); 229 AddField(field_type, "Core_UseMultiCore", Settings::values.use_multi_core.GetValue());
230 AddField(field_type, "Renderer_Backend", 230 AddField(field_type, "Renderer_Backend",
231 TranslateRenderer(Settings::values.renderer_backend.GetValue())); 231 TranslateRenderer(Settings::values.renderer_backend.GetValue()));
232 AddField(field_type, "Renderer_ResolutionFactor",
233 Settings::values.resolution_factor.GetValue());
234 AddField(field_type, "Renderer_UseSpeedLimit", Settings::values.use_speed_limit.GetValue()); 232 AddField(field_type, "Renderer_UseSpeedLimit", Settings::values.use_speed_limit.GetValue());
235 AddField(field_type, "Renderer_SpeedLimit", Settings::values.speed_limit.GetValue()); 233 AddField(field_type, "Renderer_SpeedLimit", Settings::values.speed_limit.GetValue());
236 AddField(field_type, "Renderer_UseDiskShaderCache", 234 AddField(field_type, "Renderer_UseDiskShaderCache",
diff --git a/src/shader_recompiler/CMakeLists.txt b/src/shader_recompiler/CMakeLists.txt
index b5b7e5e83..bc3df80c8 100644
--- a/src/shader_recompiler/CMakeLists.txt
+++ b/src/shader_recompiler/CMakeLists.txt
@@ -221,6 +221,7 @@ add_library(shader_recompiler STATIC
221 ir_opt/lower_fp16_to_fp32.cpp 221 ir_opt/lower_fp16_to_fp32.cpp
222 ir_opt/lower_int64_to_int32.cpp 222 ir_opt/lower_int64_to_int32.cpp
223 ir_opt/passes.h 223 ir_opt/passes.h
224 ir_opt/rescaling_pass.cpp
224 ir_opt/ssa_rewrite_pass.cpp 225 ir_opt/ssa_rewrite_pass.cpp
225 ir_opt/texture_pass.cpp 226 ir_opt/texture_pass.cpp
226 ir_opt/verification_pass.cpp 227 ir_opt/verification_pass.cpp
diff --git a/src/shader_recompiler/backend/bindings.h b/src/shader_recompiler/backend/bindings.h
index 35503000c..669702553 100644
--- a/src/shader_recompiler/backend/bindings.h
+++ b/src/shader_recompiler/backend/bindings.h
@@ -14,6 +14,8 @@ struct Bindings {
14 u32 storage_buffer{}; 14 u32 storage_buffer{};
15 u32 texture{}; 15 u32 texture{};
16 u32 image{}; 16 u32 image{};
17 u32 texture_scaling_index{};
18 u32 image_scaling_index{};
17}; 19};
18 20
19} // namespace Shader::Backend 21} // namespace Shader::Backend
diff --git a/src/shader_recompiler/backend/glasm/emit_context.cpp b/src/shader_recompiler/backend/glasm/emit_context.cpp
index 069c019ad..8fd459dfe 100644
--- a/src/shader_recompiler/backend/glasm/emit_context.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_context.cpp
@@ -6,6 +6,7 @@
6 6
7#include "shader_recompiler/backend/bindings.h" 7#include "shader_recompiler/backend/bindings.h"
8#include "shader_recompiler/backend/glasm/emit_context.h" 8#include "shader_recompiler/backend/glasm/emit_context.h"
9#include "shader_recompiler/backend/glasm/emit_glasm.h"
9#include "shader_recompiler/frontend/ir/program.h" 10#include "shader_recompiler/frontend/ir/program.h"
10#include "shader_recompiler/profile.h" 11#include "shader_recompiler/profile.h"
11#include "shader_recompiler/runtime_info.h" 12#include "shader_recompiler/runtime_info.h"
@@ -55,7 +56,8 @@ EmitContext::EmitContext(IR::Program& program, Bindings& bindings, const Profile
55 } 56 }
56 if (!runtime_info.glasm_use_storage_buffers) { 57 if (!runtime_info.glasm_use_storage_buffers) {
57 if (const size_t num = info.storage_buffers_descriptors.size(); num > 0) { 58 if (const size_t num = info.storage_buffers_descriptors.size(); num > 0) {
58 Add("PARAM c[{}]={{program.local[0..{}]}};", num, num - 1); 59 const size_t index{num + PROGRAM_LOCAL_PARAMETER_STORAGE_BUFFER_BASE};
60 Add("PARAM c[{}]={{program.local[0..{}]}};", index, index - 1);
59 } 61 }
60 } 62 }
61 stage = program.stage; 63 stage = program.stage;
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm.cpp b/src/shader_recompiler/backend/glasm/emit_glasm.cpp
index 4ce1c4f54..004658546 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm.cpp
@@ -448,6 +448,9 @@ std::string EmitGLASM(const Profile& profile, const RuntimeInfo& runtime_info, I
448 header += fmt::format("SHARED_MEMORY {};", program.shared_memory_size); 448 header += fmt::format("SHARED_MEMORY {};", program.shared_memory_size);
449 header += fmt::format("SHARED shared_mem[]={{program.sharedmem}};"); 449 header += fmt::format("SHARED shared_mem[]={{program.sharedmem}};");
450 } 450 }
451 if (program.info.uses_rescaling_uniform) {
452 header += "PARAM scaling[1]={program.local[0..0]};";
453 }
451 header += "TEMP "; 454 header += "TEMP ";
452 for (size_t index = 0; index < ctx.reg_alloc.NumUsedRegisters(); ++index) { 455 for (size_t index = 0; index < ctx.reg_alloc.NumUsedRegisters(); ++index) {
453 header += fmt::format("R{},", index); 456 header += fmt::format("R{},", index);
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm.h b/src/shader_recompiler/backend/glasm/emit_glasm.h
index bcb55f062..292655acb 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm.h
+++ b/src/shader_recompiler/backend/glasm/emit_glasm.h
@@ -13,6 +13,8 @@
13 13
14namespace Shader::Backend::GLASM { 14namespace Shader::Backend::GLASM {
15 15
16constexpr u32 PROGRAM_LOCAL_PARAMETER_STORAGE_BUFFER_BASE = 1;
17
16[[nodiscard]] std::string EmitGLASM(const Profile& profile, const RuntimeInfo& runtime_info, 18[[nodiscard]] std::string EmitGLASM(const Profile& profile, const RuntimeInfo& runtime_info,
17 IR::Program& program, Bindings& bindings); 19 IR::Program& program, Bindings& bindings);
18 20
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp
index 09e3a9b82..d325d31c7 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm_image.cpp
@@ -608,6 +608,24 @@ void EmitImageWrite(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Re
608 ctx.Add("STOREIM.{} {},{},{},{};", format, image, color, coord, type); 608 ctx.Add("STOREIM.{} {},{},{},{};", format, image, color, coord, type);
609} 609}
610 610
611void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index) {
612 if (!index.IsImmediate()) {
613 throw NotImplementedException("Non-constant texture rescaling");
614 }
615 ctx.Add("AND.U RC.x,scaling[0].x,{};"
616 "SNE.S {},RC.x,0;",
617 1u << index.U32(), ctx.reg_alloc.Define(inst));
618}
619
620void EmitIsImageScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index) {
621 if (!index.IsImmediate()) {
622 throw NotImplementedException("Non-constant texture rescaling");
623 }
624 ctx.Add("AND.U RC.x,scaling[0].y,{};"
625 "SNE.S {},RC.x,0;",
626 1u << index.U32(), ctx.reg_alloc.Define(inst));
627}
628
611void EmitImageAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord, 629void EmitImageAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord,
612 ScalarU32 value) { 630 ScalarU32 value) {
613 ImageAtomic(ctx, inst, index, coord, value, "ADD.U32"); 631 ImageAtomic(ctx, inst, index, coord, value, "ADD.U32");
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h
index 12afda43b..1f343bff5 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h
+++ b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h
@@ -72,6 +72,7 @@ void EmitInvocationId(EmitContext& ctx, IR::Inst& inst);
72void EmitSampleId(EmitContext& ctx, IR::Inst& inst); 72void EmitSampleId(EmitContext& ctx, IR::Inst& inst);
73void EmitIsHelperInvocation(EmitContext& ctx, IR::Inst& inst); 73void EmitIsHelperInvocation(EmitContext& ctx, IR::Inst& inst);
74void EmitYDirection(EmitContext& ctx, IR::Inst& inst); 74void EmitYDirection(EmitContext& ctx, IR::Inst& inst);
75void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst);
75void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, ScalarU32 word_offset); 76void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, ScalarU32 word_offset);
76void EmitWriteLocal(EmitContext& ctx, ScalarU32 word_offset, ScalarU32 value); 77void EmitWriteLocal(EmitContext& ctx, ScalarU32 word_offset, ScalarU32 value);
77void EmitUndefU1(EmitContext& ctx, IR::Inst& inst); 78void EmitUndefU1(EmitContext& ctx, IR::Inst& inst);
@@ -303,6 +304,8 @@ void EmitIAdd64(EmitContext& ctx, IR::Inst& inst, Register a, Register b);
303void EmitISub32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b); 304void EmitISub32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b);
304void EmitISub64(EmitContext& ctx, IR::Inst& inst, Register a, Register b); 305void EmitISub64(EmitContext& ctx, IR::Inst& inst, Register a, Register b);
305void EmitIMul32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b); 306void EmitIMul32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b);
307void EmitSDiv32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b);
308void EmitUDiv32(EmitContext& ctx, IR::Inst& inst, ScalarU32 a, ScalarU32 b);
306void EmitINeg32(EmitContext& ctx, IR::Inst& inst, ScalarS32 value); 309void EmitINeg32(EmitContext& ctx, IR::Inst& inst, ScalarS32 value);
307void EmitINeg64(EmitContext& ctx, IR::Inst& inst, Register value); 310void EmitINeg64(EmitContext& ctx, IR::Inst& inst, Register value);
308void EmitIAbs32(EmitContext& ctx, IR::Inst& inst, ScalarS32 value); 311void EmitIAbs32(EmitContext& ctx, IR::Inst& inst, ScalarS32 value);
@@ -553,6 +556,8 @@ void EmitImageGradient(EmitContext& ctx, IR::Inst& inst, const IR::Value& index,
553void EmitImageRead(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord); 556void EmitImageRead(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord);
554void EmitImageWrite(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord, 557void EmitImageWrite(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, Register coord,
555 Register color); 558 Register color);
559void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index);
560void EmitIsImageScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index);
556void EmitBindlessImageAtomicIAdd32(EmitContext&); 561void EmitBindlessImageAtomicIAdd32(EmitContext&);
557void EmitBindlessImageAtomicSMin32(EmitContext&); 562void EmitBindlessImageAtomicSMin32(EmitContext&);
558void EmitBindlessImageAtomicUMin32(EmitContext&); 563void EmitBindlessImageAtomicUMin32(EmitContext&);
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_integer.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_integer.cpp
index f55c26b76..8aa494a4d 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm_integer.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm_integer.cpp
@@ -90,6 +90,14 @@ void EmitIMul32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b) {
90 ctx.Add("MUL.S {}.x,{},{};", inst, a, b); 90 ctx.Add("MUL.S {}.x,{},{};", inst, a, b);
91} 91}
92 92
93void EmitSDiv32(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b) {
94 ctx.Add("DIV.S {}.x,{},{};", inst, a, b);
95}
96
97void EmitUDiv32(EmitContext& ctx, IR::Inst& inst, ScalarU32 a, ScalarU32 b) {
98 ctx.Add("DIV.U {}.x,{},{};", inst, a, b);
99}
100
93void EmitINeg32(EmitContext& ctx, IR::Inst& inst, ScalarS32 value) { 101void EmitINeg32(EmitContext& ctx, IR::Inst& inst, ScalarS32 value) {
94 if (value.type != Type::Register && static_cast<s32>(value.imm_u32) < 0) { 102 if (value.type != Type::Register && static_cast<s32>(value.imm_u32) < 0) {
95 ctx.Add("MOV.S {},{};", inst, -static_cast<s32>(value.imm_u32)); 103 ctx.Add("MOV.S {},{};", inst, -static_cast<s32>(value.imm_u32));
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp
index e537f6073..681aeda8d 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp
@@ -210,6 +210,10 @@ void EmitYDirection(EmitContext& ctx, IR::Inst& inst) {
210 ctx.Add("MOV.F {}.x,y_direction[0].w;", inst); 210 ctx.Add("MOV.F {}.x,y_direction[0].w;", inst);
211} 211}
212 212
213void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst) {
214 ctx.Add("MOV.F {}.x,scaling[0].z;", inst);
215}
216
213void EmitUndefU1(EmitContext& ctx, IR::Inst& inst) { 217void EmitUndefU1(EmitContext& ctx, IR::Inst& inst) {
214 ctx.Add("MOV.S {}.x,0;", inst); 218 ctx.Add("MOV.S {}.x,0;", inst);
215} 219}
diff --git a/src/shader_recompiler/backend/glsl/emit_context.cpp b/src/shader_recompiler/backend/glsl/emit_context.cpp
index 4e6f2c0fe..97bd59302 100644
--- a/src/shader_recompiler/backend/glsl/emit_context.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_context.cpp
@@ -393,6 +393,9 @@ EmitContext::EmitContext(IR::Program& program, Bindings& bindings, const Profile
393 DefineGenericOutput(index, program.invocations); 393 DefineGenericOutput(index, program.invocations);
394 } 394 }
395 } 395 }
396 if (info.uses_rescaling_uniform) {
397 header += "layout(location=0) uniform vec4 scaling;";
398 }
396 DefineConstantBuffers(bindings); 399 DefineConstantBuffers(bindings);
397 DefineStorageBuffers(bindings); 400 DefineStorageBuffers(bindings);
398 SetupImages(bindings); 401 SetupImages(bindings);
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
index 170db269a..4c26f3829 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
@@ -445,6 +445,10 @@ void EmitYDirection(EmitContext& ctx, IR::Inst& inst) {
445 ctx.AddF32("{}=gl_FrontMaterial.ambient.a;", inst); 445 ctx.AddF32("{}=gl_FrontMaterial.ambient.a;", inst);
446} 446}
447 447
448void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst) {
449 ctx.AddF32("{}=scaling.z;", inst);
450}
451
448void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, std::string_view word_offset) { 452void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, std::string_view word_offset) {
449 ctx.AddU32("{}=lmem[{}];", inst, word_offset); 453 ctx.AddU32("{}=lmem[{}];", inst, word_offset);
450} 454}
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp
index 447eb8e0a..2f78d0267 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp
@@ -612,6 +612,22 @@ void EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst& inst, const IR::Value
612 value); 612 value);
613} 613}
614 614
615void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index) {
616 if (!index.IsImmediate()) {
617 throw NotImplementedException("Non-constant texture rescaling");
618 }
619 const u32 image_index{index.U32()};
620 ctx.AddU1("{}=(ftou(scaling.x)&{})!=0;", inst, 1u << image_index);
621}
622
623void EmitIsImageScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index) {
624 if (!index.IsImmediate()) {
625 throw NotImplementedException("Non-constant texture rescaling");
626 }
627 const u32 image_index{index.U32()};
628 ctx.AddU1("{}=(ftou(scaling.y)&{})!=0;", inst, 1u << image_index);
629}
630
615void EmitBindlessImageSampleImplicitLod(EmitContext&) { 631void EmitBindlessImageSampleImplicitLod(EmitContext&) {
616 NotImplemented(); 632 NotImplemented();
617} 633}
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h b/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h
index 5936d086f..f86502e4c 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h
@@ -85,6 +85,7 @@ void EmitInvocationId(EmitContext& ctx, IR::Inst& inst);
85void EmitSampleId(EmitContext& ctx, IR::Inst& inst); 85void EmitSampleId(EmitContext& ctx, IR::Inst& inst);
86void EmitIsHelperInvocation(EmitContext& ctx, IR::Inst& inst); 86void EmitIsHelperInvocation(EmitContext& ctx, IR::Inst& inst);
87void EmitYDirection(EmitContext& ctx, IR::Inst& inst); 87void EmitYDirection(EmitContext& ctx, IR::Inst& inst);
88void EmitResolutionDownFactor(EmitContext& ctx, IR::Inst& inst);
88void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, std::string_view word_offset); 89void EmitLoadLocal(EmitContext& ctx, IR::Inst& inst, std::string_view word_offset);
89void EmitWriteLocal(EmitContext& ctx, std::string_view word_offset, std::string_view value); 90void EmitWriteLocal(EmitContext& ctx, std::string_view word_offset, std::string_view value);
90void EmitUndefU1(EmitContext& ctx, IR::Inst& inst); 91void EmitUndefU1(EmitContext& ctx, IR::Inst& inst);
@@ -362,6 +363,8 @@ void EmitIAdd64(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::strin
362void EmitISub32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b); 363void EmitISub32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b);
363void EmitISub64(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b); 364void EmitISub64(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b);
364void EmitIMul32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b); 365void EmitIMul32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b);
366void EmitSDiv32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b);
367void EmitUDiv32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b);
365void EmitINeg32(EmitContext& ctx, IR::Inst& inst, std::string_view value); 368void EmitINeg32(EmitContext& ctx, IR::Inst& inst, std::string_view value);
366void EmitINeg64(EmitContext& ctx, IR::Inst& inst, std::string_view value); 369void EmitINeg64(EmitContext& ctx, IR::Inst& inst, std::string_view value);
367void EmitIAbs32(EmitContext& ctx, IR::Inst& inst, std::string_view value); 370void EmitIAbs32(EmitContext& ctx, IR::Inst& inst, std::string_view value);
@@ -627,6 +630,8 @@ void EmitImageRead(EmitContext& ctx, IR::Inst& inst, const IR::Value& index,
627 std::string_view coords); 630 std::string_view coords);
628void EmitImageWrite(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, 631void EmitImageWrite(EmitContext& ctx, IR::Inst& inst, const IR::Value& index,
629 std::string_view coords, std::string_view color); 632 std::string_view coords, std::string_view color);
633void EmitIsTextureScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index);
634void EmitIsImageScaled(EmitContext& ctx, IR::Inst& inst, const IR::Value& index);
630void EmitBindlessImageAtomicIAdd32(EmitContext&); 635void EmitBindlessImageAtomicIAdd32(EmitContext&);
631void EmitBindlessImageAtomicSMin32(EmitContext&); 636void EmitBindlessImageAtomicSMin32(EmitContext&);
632void EmitBindlessImageAtomicUMin32(EmitContext&); 637void EmitBindlessImageAtomicUMin32(EmitContext&);
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp
index 38419f88f..88c1d4c5e 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp
@@ -78,6 +78,14 @@ void EmitIMul32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::strin
78 ctx.AddU32("{}=uint({}*{});", inst, a, b); 78 ctx.AddU32("{}=uint({}*{});", inst, a, b);
79} 79}
80 80
81void EmitSDiv32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b) {
82 ctx.AddU32("{}=uint(int({})/int({}));", inst, a, b);
83}
84
85void EmitUDiv32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b) {
86 ctx.AddU32("{}={}/{};", inst, a, b);
87}
88
81void EmitINeg32(EmitContext& ctx, IR::Inst& inst, std::string_view value) { 89void EmitINeg32(EmitContext& ctx, IR::Inst& inst, std::string_view value) {
82 ctx.AddU32("{}=uint(-({}));", inst, value); 90 ctx.AddU32("{}=uint(-({}));", inst, value);
83} 91}
diff --git a/src/shader_recompiler/backend/spirv/emit_context.cpp b/src/shader_recompiler/backend/spirv/emit_context.cpp
index 3c84e6466..723455462 100644
--- a/src/shader_recompiler/backend/spirv/emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_context.cpp
@@ -7,11 +7,14 @@
7#include <climits> 7#include <climits>
8#include <string_view> 8#include <string_view>
9 9
10#include <boost/container/static_vector.hpp>
11
10#include <fmt/format.h> 12#include <fmt/format.h>
11 13
12#include "common/common_types.h" 14#include "common/common_types.h"
13#include "common/div_ceil.h" 15#include "common/div_ceil.h"
14#include "shader_recompiler/backend/spirv/emit_context.h" 16#include "shader_recompiler/backend/spirv/emit_context.h"
17#include "shader_recompiler/backend/spirv/emit_spirv.h"
15 18
16namespace Shader::Backend::SPIRV { 19namespace Shader::Backend::SPIRV {
17namespace { 20namespace {
@@ -474,8 +477,9 @@ void VectorTypes::Define(Sirit::Module& sirit_ctx, Id base_type, std::string_vie
474 477
475EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_info_, 478EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_info_,
476 IR::Program& program, Bindings& bindings) 479 IR::Program& program, Bindings& bindings)
477 : Sirit::Module(profile_.supported_spirv), profile{profile_}, 480 : Sirit::Module(profile_.supported_spirv), profile{profile_}, runtime_info{runtime_info_},
478 runtime_info{runtime_info_}, stage{program.stage} { 481 stage{program.stage}, texture_rescaling_index{bindings.texture_scaling_index},
482 image_rescaling_index{bindings.image_scaling_index} {
479 const bool is_unified{profile.unified_descriptor_binding}; 483 const bool is_unified{profile.unified_descriptor_binding};
480 u32& uniform_binding{is_unified ? bindings.unified : bindings.uniform_buffer}; 484 u32& uniform_binding{is_unified ? bindings.unified : bindings.uniform_buffer};
481 u32& storage_binding{is_unified ? bindings.unified : bindings.storage_buffer}; 485 u32& storage_binding{is_unified ? bindings.unified : bindings.storage_buffer};
@@ -492,10 +496,11 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf
492 DefineStorageBuffers(program.info, storage_binding); 496 DefineStorageBuffers(program.info, storage_binding);
493 DefineTextureBuffers(program.info, texture_binding); 497 DefineTextureBuffers(program.info, texture_binding);
494 DefineImageBuffers(program.info, image_binding); 498 DefineImageBuffers(program.info, image_binding);
495 DefineTextures(program.info, texture_binding); 499 DefineTextures(program.info, texture_binding, bindings.texture_scaling_index);
496 DefineImages(program.info, image_binding); 500 DefineImages(program.info, image_binding, bindings.image_scaling_index);
497 DefineAttributeMemAccess(program.info); 501 DefineAttributeMemAccess(program.info);
498 DefineGlobalMemoryFunctions(program.info); 502 DefineGlobalMemoryFunctions(program.info);
503 DefineRescalingInput(program.info);
499} 504}
500 505
501EmitContext::~EmitContext() = default; 506EmitContext::~EmitContext() = default;
@@ -996,6 +1001,73 @@ void EmitContext::DefineGlobalMemoryFunctions(const Info& info) {
996 define(&StorageDefinitions::U32x4, storage_types.U32x4, U32[4], sizeof(u32[4])); 1001 define(&StorageDefinitions::U32x4, storage_types.U32x4, U32[4], sizeof(u32[4]));
997} 1002}
998 1003
1004void EmitContext::DefineRescalingInput(const Info& info) {
1005 if (!info.uses_rescaling_uniform) {
1006 return;
1007 }
1008 if (profile.unified_descriptor_binding) {
1009 DefineRescalingInputPushConstant();
1010 } else {
1011 DefineRescalingInputUniformConstant();
1012 }
1013}
1014
1015void EmitContext::DefineRescalingInputPushConstant() {
1016 boost::container::static_vector<Id, 3> members{};
1017 u32 member_index{0};
1018
1019 rescaling_textures_type = TypeArray(U32[1], Const(4u));
1020 Decorate(rescaling_textures_type, spv::Decoration::ArrayStride, 4u);
1021 members.push_back(rescaling_textures_type);
1022 rescaling_textures_member_index = member_index++;
1023
1024 rescaling_images_type = TypeArray(U32[1], Const(NUM_IMAGE_SCALING_WORDS));
1025 Decorate(rescaling_images_type, spv::Decoration::ArrayStride, 4u);
1026 members.push_back(rescaling_images_type);
1027 rescaling_images_member_index = member_index++;
1028
1029 if (stage != Stage::Compute) {
1030 members.push_back(F32[1]);
1031 rescaling_downfactor_member_index = member_index++;
1032 }
1033 const Id push_constant_struct{TypeStruct(std::span(members.data(), members.size()))};
1034 Decorate(push_constant_struct, spv::Decoration::Block);
1035 Name(push_constant_struct, "ResolutionInfo");
1036
1037 MemberDecorate(push_constant_struct, rescaling_textures_member_index, spv::Decoration::Offset,
1038 static_cast<u32>(offsetof(RescalingLayout, rescaling_textures)));
1039 MemberName(push_constant_struct, rescaling_textures_member_index, "rescaling_textures");
1040
1041 MemberDecorate(push_constant_struct, rescaling_images_member_index, spv::Decoration::Offset,
1042 static_cast<u32>(offsetof(RescalingLayout, rescaling_images)));
1043 MemberName(push_constant_struct, rescaling_images_member_index, "rescaling_images");
1044
1045 if (stage != Stage::Compute) {
1046 MemberDecorate(push_constant_struct, rescaling_downfactor_member_index,
1047 spv::Decoration::Offset,
1048 static_cast<u32>(offsetof(RescalingLayout, down_factor)));
1049 MemberName(push_constant_struct, rescaling_downfactor_member_index, "down_factor");
1050 }
1051 const Id pointer_type{TypePointer(spv::StorageClass::PushConstant, push_constant_struct)};
1052 rescaling_push_constants = AddGlobalVariable(pointer_type, spv::StorageClass::PushConstant);
1053 Name(rescaling_push_constants, "rescaling_push_constants");
1054
1055 if (profile.supported_spirv >= 0x00010400) {
1056 interfaces.push_back(rescaling_push_constants);
1057 }
1058}
1059
1060void EmitContext::DefineRescalingInputUniformConstant() {
1061 const Id pointer_type{TypePointer(spv::StorageClass::UniformConstant, F32[4])};
1062 rescaling_uniform_constant =
1063 AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant);
1064 Decorate(rescaling_uniform_constant, spv::Decoration::Location, 0u);
1065
1066 if (profile.supported_spirv >= 0x00010400) {
1067 interfaces.push_back(rescaling_uniform_constant);
1068 }
1069}
1070
999void EmitContext::DefineConstantBuffers(const Info& info, u32& binding) { 1071void EmitContext::DefineConstantBuffers(const Info& info, u32& binding) {
1000 if (info.constant_buffer_descriptors.empty()) { 1072 if (info.constant_buffer_descriptors.empty()) {
1001 return; 1073 return;
@@ -1184,7 +1256,7 @@ void EmitContext::DefineImageBuffers(const Info& info, u32& binding) {
1184 } 1256 }
1185} 1257}
1186 1258
1187void EmitContext::DefineTextures(const Info& info, u32& binding) { 1259void EmitContext::DefineTextures(const Info& info, u32& binding, u32& scaling_index) {
1188 textures.reserve(info.texture_descriptors.size()); 1260 textures.reserve(info.texture_descriptors.size());
1189 for (const TextureDescriptor& desc : info.texture_descriptors) { 1261 for (const TextureDescriptor& desc : info.texture_descriptors) {
1190 const Id image_type{ImageType(*this, desc)}; 1262 const Id image_type{ImageType(*this, desc)};
@@ -1206,13 +1278,14 @@ void EmitContext::DefineTextures(const Info& info, u32& binding) {
1206 interfaces.push_back(id); 1278 interfaces.push_back(id);
1207 } 1279 }
1208 ++binding; 1280 ++binding;
1281 ++scaling_index;
1209 } 1282 }
1210 if (info.uses_atomic_image_u32) { 1283 if (info.uses_atomic_image_u32) {
1211 image_u32 = TypePointer(spv::StorageClass::Image, U32[1]); 1284 image_u32 = TypePointer(spv::StorageClass::Image, U32[1]);
1212 } 1285 }
1213} 1286}
1214 1287
1215void EmitContext::DefineImages(const Info& info, u32& binding) { 1288void EmitContext::DefineImages(const Info& info, u32& binding, u32& scaling_index) {
1216 images.reserve(info.image_descriptors.size()); 1289 images.reserve(info.image_descriptors.size());
1217 for (const ImageDescriptor& desc : info.image_descriptors) { 1290 for (const ImageDescriptor& desc : info.image_descriptors) {
1218 if (desc.count != 1) { 1291 if (desc.count != 1) {
@@ -1233,6 +1306,7 @@ void EmitContext::DefineImages(const Info& info, u32& binding) {
1233 interfaces.push_back(id); 1306 interfaces.push_back(id);
1234 } 1307 }
1235 ++binding; 1308 ++binding;
1309 ++scaling_index;
1236 } 1310 }
1237} 1311}
1238 1312
diff --git a/src/shader_recompiler/backend/spirv/emit_context.h b/src/shader_recompiler/backend/spirv/emit_context.h
index 112c52382..63f8185d9 100644
--- a/src/shader_recompiler/backend/spirv/emit_context.h
+++ b/src/shader_recompiler/backend/spirv/emit_context.h
@@ -238,6 +238,16 @@ public:
238 Id indexed_load_func{}; 238 Id indexed_load_func{};
239 Id indexed_store_func{}; 239 Id indexed_store_func{};
240 240
241 Id rescaling_uniform_constant{};
242 Id rescaling_push_constants{};
243 Id rescaling_textures_type{};
244 Id rescaling_images_type{};
245 u32 rescaling_textures_member_index{};
246 u32 rescaling_images_member_index{};
247 u32 rescaling_downfactor_member_index{};
248 u32 texture_rescaling_index{};
249 u32 image_rescaling_index{};
250
241 Id local_memory{}; 251 Id local_memory{};
242 252
243 Id shared_memory_u8{}; 253 Id shared_memory_u8{};
@@ -310,10 +320,13 @@ private:
310 void DefineStorageBuffers(const Info& info, u32& binding); 320 void DefineStorageBuffers(const Info& info, u32& binding);
311 void DefineTextureBuffers(const Info& info, u32& binding); 321 void DefineTextureBuffers(const Info& info, u32& binding);
312 void DefineImageBuffers(const Info& info, u32& binding); 322 void DefineImageBuffers(const Info& info, u32& binding);
313 void DefineTextures(const Info& info, u32& binding); 323 void DefineTextures(const Info& info, u32& binding, u32& scaling_index);
314 void DefineImages(const Info& info, u32& binding); 324 void DefineImages(const Info& info, u32& binding, u32& scaling_index);
315 void DefineAttributeMemAccess(const Info& info); 325 void DefineAttributeMemAccess(const Info& info);
316 void DefineGlobalMemoryFunctions(const Info& info); 326 void DefineGlobalMemoryFunctions(const Info& info);
327 void DefineRescalingInput(const Info& info);
328 void DefineRescalingInputPushConstant();
329 void DefineRescalingInputUniformConstant();
317 330
318 void DefineInputs(const IR::Program& program); 331 void DefineInputs(const IR::Program& program);
319 void DefineOutputs(const IR::Program& program); 332 void DefineOutputs(const IR::Program& program);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.h b/src/shader_recompiler/backend/spirv/emit_spirv.h
index db0c935fe..4b25534ce 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.h
@@ -16,6 +16,19 @@
16 16
17namespace Shader::Backend::SPIRV { 17namespace Shader::Backend::SPIRV {
18 18
19constexpr u32 NUM_TEXTURE_SCALING_WORDS = 4;
20constexpr u32 NUM_IMAGE_SCALING_WORDS = 2;
21constexpr u32 NUM_TEXTURE_AND_IMAGE_SCALING_WORDS =
22 NUM_TEXTURE_SCALING_WORDS + NUM_IMAGE_SCALING_WORDS;
23
24struct RescalingLayout {
25 alignas(16) std::array<u32, NUM_TEXTURE_SCALING_WORDS> rescaling_textures;
26 alignas(16) std::array<u32, NUM_IMAGE_SCALING_WORDS> rescaling_images;
27 alignas(16) u32 down_factor;
28};
29constexpr u32 RESCALING_LAYOUT_WORDS_OFFSET = offsetof(RescalingLayout, rescaling_textures);
30constexpr u32 RESCALING_LAYOUT_DOWN_FACTOR_OFFSET = offsetof(RescalingLayout, down_factor);
31
19[[nodiscard]] std::vector<u32> EmitSPIRV(const Profile& profile, const RuntimeInfo& runtime_info, 32[[nodiscard]] std::vector<u32> EmitSPIRV(const Profile& profile, const RuntimeInfo& runtime_info,
20 IR::Program& program, Bindings& bindings); 33 IR::Program& program, Bindings& bindings);
21 34
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index d3a93d5f4..bac683ae1 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -526,6 +526,18 @@ Id EmitYDirection(EmitContext& ctx) {
526 return ctx.Const(ctx.runtime_info.y_negate ? -1.0f : 1.0f); 526 return ctx.Const(ctx.runtime_info.y_negate ? -1.0f : 1.0f);
527} 527}
528 528
529Id EmitResolutionDownFactor(EmitContext& ctx) {
530 if (ctx.profile.unified_descriptor_binding) {
531 const Id pointer_type{ctx.TypePointer(spv::StorageClass::PushConstant, ctx.F32[1])};
532 const Id index{ctx.Const(ctx.rescaling_downfactor_member_index)};
533 const Id pointer{ctx.OpAccessChain(pointer_type, ctx.rescaling_push_constants, index)};
534 return ctx.OpLoad(ctx.F32[1], pointer);
535 } else {
536 const Id composite{ctx.OpLoad(ctx.F32[4], ctx.rescaling_uniform_constant)};
537 return ctx.OpCompositeExtract(ctx.F32[1], composite, 2u);
538 }
539}
540
529Id EmitLoadLocal(EmitContext& ctx, Id word_offset) { 541Id EmitLoadLocal(EmitContext& ctx, Id word_offset) {
530 const Id pointer{ctx.OpAccessChain(ctx.private_u32, ctx.local_memory, word_offset)}; 542 const Id pointer{ctx.OpAccessChain(ctx.private_u32, ctx.local_memory, word_offset)};
531 return ctx.OpLoad(ctx.U32[1], pointer); 543 return ctx.OpLoad(ctx.U32[1], pointer);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp
index 1d5364309..4d168a96d 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp
@@ -224,6 +224,36 @@ Id Emit(MethodPtrType sparse_ptr, MethodPtrType non_sparse_ptr, EmitContext& ctx
224 Decorate(ctx, inst, sample); 224 Decorate(ctx, inst, sample);
225 return ctx.OpCompositeExtract(result_type, sample, 1U); 225 return ctx.OpCompositeExtract(result_type, sample, 1U);
226} 226}
227
228Id IsScaled(EmitContext& ctx, const IR::Value& index, Id member_index, u32 base_index) {
229 const Id push_constant_u32{ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1])};
230 Id bit{};
231 if (index.IsImmediate()) {
232 // Use BitwiseAnd instead of BitfieldExtract for better codegen on Nvidia OpenGL.
233 // LOP32I.NZ is used to set the predicate rather than BFE+ISETP.
234 const u32 index_value{index.U32() + base_index};
235 const Id word_index{ctx.Const(index_value / 32)};
236 const Id bit_index_mask{ctx.Const(1u << (index_value % 32))};
237 const Id pointer{ctx.OpAccessChain(push_constant_u32, ctx.rescaling_push_constants,
238 member_index, word_index)};
239 const Id word{ctx.OpLoad(ctx.U32[1], pointer)};
240 bit = ctx.OpBitwiseAnd(ctx.U32[1], word, bit_index_mask);
241 } else {
242 Id index_value{ctx.Def(index)};
243 if (base_index != 0) {
244 index_value = ctx.OpIAdd(ctx.U32[1], index_value, ctx.Const(base_index));
245 }
246 const Id bit_index{ctx.OpBitwiseAnd(ctx.U32[1], index_value, ctx.Const(31u))};
247 bit = ctx.OpBitFieldUExtract(ctx.U32[1], index_value, bit_index, ctx.Const(1u));
248 }
249 return ctx.OpINotEqual(ctx.U1, bit, ctx.u32_zero_value);
250}
251
252Id BitTest(EmitContext& ctx, Id mask, Id bit) {
253 const Id shifted{ctx.OpShiftRightLogical(ctx.U32[1], mask, bit)};
254 const Id bit_value{ctx.OpBitwiseAnd(ctx.U32[1], shifted, ctx.Const(1u))};
255 return ctx.OpINotEqual(ctx.U1, bit_value, ctx.u32_zero_value);
256}
227} // Anonymous namespace 257} // Anonymous namespace
228 258
229Id EmitBindlessImageSampleImplicitLod(EmitContext&) { 259Id EmitBindlessImageSampleImplicitLod(EmitContext&) {
@@ -470,4 +500,28 @@ void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id
470 ctx.OpImageWrite(Image(ctx, index, info), coords, color); 500 ctx.OpImageWrite(Image(ctx, index, info), coords, color);
471} 501}
472 502
503Id EmitIsTextureScaled(EmitContext& ctx, const IR::Value& index) {
504 if (ctx.profile.unified_descriptor_binding) {
505 const Id member_index{ctx.Const(ctx.rescaling_textures_member_index)};
506 return IsScaled(ctx, index, member_index, ctx.texture_rescaling_index);
507 } else {
508 const Id composite{ctx.OpLoad(ctx.F32[4], ctx.rescaling_uniform_constant)};
509 const Id mask_f32{ctx.OpCompositeExtract(ctx.F32[1], composite, 0u)};
510 const Id mask{ctx.OpBitcast(ctx.U32[1], mask_f32)};
511 return BitTest(ctx, mask, ctx.Def(index));
512 }
513}
514
515Id EmitIsImageScaled(EmitContext& ctx, const IR::Value& index) {
516 if (ctx.profile.unified_descriptor_binding) {
517 const Id member_index{ctx.Const(ctx.rescaling_images_member_index)};
518 return IsScaled(ctx, index, member_index, ctx.image_rescaling_index);
519 } else {
520 const Id composite{ctx.OpLoad(ctx.F32[4], ctx.rescaling_uniform_constant)};
521 const Id mask_f32{ctx.OpCompositeExtract(ctx.F32[1], composite, 1u)};
522 const Id mask{ctx.OpBitcast(ctx.U32[1], mask_f32)};
523 return BitTest(ctx, mask, ctx.Def(index));
524 }
525}
526
473} // namespace Shader::Backend::SPIRV 527} // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
index c9db1c164..6cd22dd3e 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
@@ -75,6 +75,7 @@ Id EmitInvocationId(EmitContext& ctx);
75Id EmitSampleId(EmitContext& ctx); 75Id EmitSampleId(EmitContext& ctx);
76Id EmitIsHelperInvocation(EmitContext& ctx); 76Id EmitIsHelperInvocation(EmitContext& ctx);
77Id EmitYDirection(EmitContext& ctx); 77Id EmitYDirection(EmitContext& ctx);
78Id EmitResolutionDownFactor(EmitContext& ctx);
78Id EmitLoadLocal(EmitContext& ctx, Id word_offset); 79Id EmitLoadLocal(EmitContext& ctx, Id word_offset);
79void EmitWriteLocal(EmitContext& ctx, Id word_offset, Id value); 80void EmitWriteLocal(EmitContext& ctx, Id word_offset, Id value);
80Id EmitUndefU1(EmitContext& ctx); 81Id EmitUndefU1(EmitContext& ctx);
@@ -283,6 +284,8 @@ Id EmitIAdd64(EmitContext& ctx, Id a, Id b);
283Id EmitISub32(EmitContext& ctx, Id a, Id b); 284Id EmitISub32(EmitContext& ctx, Id a, Id b);
284Id EmitISub64(EmitContext& ctx, Id a, Id b); 285Id EmitISub64(EmitContext& ctx, Id a, Id b);
285Id EmitIMul32(EmitContext& ctx, Id a, Id b); 286Id EmitIMul32(EmitContext& ctx, Id a, Id b);
287Id EmitSDiv32(EmitContext& ctx, Id a, Id b);
288Id EmitUDiv32(EmitContext& ctx, Id a, Id b);
286Id EmitINeg32(EmitContext& ctx, Id value); 289Id EmitINeg32(EmitContext& ctx, Id value);
287Id EmitINeg64(EmitContext& ctx, Id value); 290Id EmitINeg64(EmitContext& ctx, Id value);
288Id EmitIAbs32(EmitContext& ctx, Id value); 291Id EmitIAbs32(EmitContext& ctx, Id value);
@@ -510,6 +513,8 @@ Id EmitImageGradient(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, I
510 Id derivates, Id offset, Id lod_clamp); 513 Id derivates, Id offset, Id lod_clamp);
511Id EmitImageRead(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords); 514Id EmitImageRead(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords);
512void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, Id color); 515void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, Id color);
516Id EmitIsTextureScaled(EmitContext& ctx, const IR::Value& index);
517Id EmitIsImageScaled(EmitContext& ctx, const IR::Value& index);
513Id EmitBindlessImageAtomicIAdd32(EmitContext&); 518Id EmitBindlessImageAtomicIAdd32(EmitContext&);
514Id EmitBindlessImageAtomicSMin32(EmitContext&); 519Id EmitBindlessImageAtomicSMin32(EmitContext&);
515Id EmitBindlessImageAtomicUMin32(EmitContext&); 520Id EmitBindlessImageAtomicUMin32(EmitContext&);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp
index 3501d7495..50277eec3 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp
@@ -72,6 +72,14 @@ Id EmitIMul32(EmitContext& ctx, Id a, Id b) {
72 return ctx.OpIMul(ctx.U32[1], a, b); 72 return ctx.OpIMul(ctx.U32[1], a, b);
73} 73}
74 74
75Id EmitSDiv32(EmitContext& ctx, Id a, Id b) {
76 return ctx.OpSDiv(ctx.U32[1], a, b);
77}
78
79Id EmitUDiv32(EmitContext& ctx, Id a, Id b) {
80 return ctx.OpUDiv(ctx.U32[1], a, b);
81}
82
75Id EmitINeg32(EmitContext& ctx, Id value) { 83Id EmitINeg32(EmitContext& ctx, Id value) {
76 return ctx.OpSNegate(ctx.U32[1], value); 84 return ctx.OpSNegate(ctx.U32[1], value);
77} 85}
diff --git a/src/shader_recompiler/frontend/ir/basic_block.cpp b/src/shader_recompiler/frontend/ir/basic_block.cpp
index 7c08b25ce..974efa4a0 100644
--- a/src/shader_recompiler/frontend/ir/basic_block.cpp
+++ b/src/shader_recompiler/frontend/ir/basic_block.cpp
@@ -22,6 +22,11 @@ void Block::AppendNewInst(Opcode op, std::initializer_list<Value> args) {
22 PrependNewInst(end(), op, args); 22 PrependNewInst(end(), op, args);
23} 23}
24 24
25Block::iterator Block::PrependNewInst(iterator insertion_point, const Inst& base_inst) {
26 Inst* const inst{inst_pool->Create(base_inst)};
27 return instructions.insert(insertion_point, *inst);
28}
29
25Block::iterator Block::PrependNewInst(iterator insertion_point, Opcode op, 30Block::iterator Block::PrependNewInst(iterator insertion_point, Opcode op,
26 std::initializer_list<Value> args, u32 flags) { 31 std::initializer_list<Value> args, u32 flags) {
27 Inst* const inst{inst_pool->Create(op, flags)}; 32 Inst* const inst{inst_pool->Create(op, flags)};
diff --git a/src/shader_recompiler/frontend/ir/basic_block.h b/src/shader_recompiler/frontend/ir/basic_block.h
index 9ce1ed07e..fbfe98266 100644
--- a/src/shader_recompiler/frontend/ir/basic_block.h
+++ b/src/shader_recompiler/frontend/ir/basic_block.h
@@ -40,6 +40,9 @@ public:
40 /// Appends a new instruction to the end of this basic block. 40 /// Appends a new instruction to the end of this basic block.
41 void AppendNewInst(Opcode op, std::initializer_list<Value> args); 41 void AppendNewInst(Opcode op, std::initializer_list<Value> args);
42 42
43 /// Prepends a copy of an instruction to this basic block before the insertion point.
44 iterator PrependNewInst(iterator insertion_point, const Inst& base_inst);
45
43 /// Prepends a new instruction to this basic block before the insertion point. 46 /// Prepends a new instruction to this basic block before the insertion point.
44 iterator PrependNewInst(iterator insertion_point, Opcode op, 47 iterator PrependNewInst(iterator insertion_point, Opcode op,
45 std::initializer_list<Value> args = {}, u32 flags = 0); 48 std::initializer_list<Value> args = {}, u32 flags = 0);
diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.cpp b/src/shader_recompiler/frontend/ir/ir_emitter.cpp
index 13159a68d..356f889ac 100644
--- a/src/shader_recompiler/frontend/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/frontend/ir/ir_emitter.cpp
@@ -375,6 +375,10 @@ F32 IREmitter::YDirection() {
375 return Inst<F32>(Opcode::YDirection); 375 return Inst<F32>(Opcode::YDirection);
376} 376}
377 377
378F32 IREmitter::ResolutionDownFactor() {
379 return Inst<F32>(Opcode::ResolutionDownFactor);
380}
381
378U32 IREmitter::LaneId() { 382U32 IREmitter::LaneId() {
379 return Inst<U32>(Opcode::LaneId); 383 return Inst<U32>(Opcode::LaneId);
380} 384}
@@ -1141,6 +1145,10 @@ U32 IREmitter::IMul(const U32& a, const U32& b) {
1141 return Inst<U32>(Opcode::IMul32, a, b); 1145 return Inst<U32>(Opcode::IMul32, a, b);
1142} 1146}
1143 1147
1148U32 IREmitter::IDiv(const U32& a, const U32& b, bool is_signed) {
1149 return Inst<U32>(is_signed ? Opcode::SDiv32 : Opcode::UDiv32, a, b);
1150}
1151
1144U32U64 IREmitter::INeg(const U32U64& value) { 1152U32U64 IREmitter::INeg(const U32U64& value) {
1145 switch (value.Type()) { 1153 switch (value.Type()) {
1146 case Type::U32: 1154 case Type::U32:
@@ -1938,6 +1946,14 @@ Value IREmitter::ImageAtomicExchange(const Value& handle, const Value& coords, c
1938 return Inst(op, Flags{info}, handle, coords, value); 1946 return Inst(op, Flags{info}, handle, coords, value);
1939} 1947}
1940 1948
1949U1 IREmitter::IsTextureScaled(const U32& index) {
1950 return Inst<U1>(Opcode::IsTextureScaled, index);
1951}
1952
1953U1 IREmitter::IsImageScaled(const U32& index) {
1954 return Inst<U1>(Opcode::IsImageScaled, index);
1955}
1956
1941U1 IREmitter::VoteAll(const U1& value) { 1957U1 IREmitter::VoteAll(const U1& value) {
1942 return Inst<U1>(Opcode::VoteAll, value); 1958 return Inst<U1>(Opcode::VoteAll, value);
1943} 1959}
diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.h b/src/shader_recompiler/frontend/ir/ir_emitter.h
index 1b89ca5a0..13eefa88b 100644
--- a/src/shader_recompiler/frontend/ir/ir_emitter.h
+++ b/src/shader_recompiler/frontend/ir/ir_emitter.h
@@ -102,6 +102,8 @@ public:
102 [[nodiscard]] U1 IsHelperInvocation(); 102 [[nodiscard]] U1 IsHelperInvocation();
103 [[nodiscard]] F32 YDirection(); 103 [[nodiscard]] F32 YDirection();
104 104
105 [[nodiscard]] F32 ResolutionDownFactor();
106
105 [[nodiscard]] U32 LaneId(); 107 [[nodiscard]] U32 LaneId();
106 108
107 [[nodiscard]] U32 LoadGlobalU8(const U64& address); 109 [[nodiscard]] U32 LoadGlobalU8(const U64& address);
@@ -207,6 +209,7 @@ public:
207 [[nodiscard]] U32U64 IAdd(const U32U64& a, const U32U64& b); 209 [[nodiscard]] U32U64 IAdd(const U32U64& a, const U32U64& b);
208 [[nodiscard]] U32U64 ISub(const U32U64& a, const U32U64& b); 210 [[nodiscard]] U32U64 ISub(const U32U64& a, const U32U64& b);
209 [[nodiscard]] U32 IMul(const U32& a, const U32& b); 211 [[nodiscard]] U32 IMul(const U32& a, const U32& b);
212 [[nodiscard]] U32 IDiv(const U32& a, const U32& b, bool is_signed = false);
210 [[nodiscard]] U32U64 INeg(const U32U64& value); 213 [[nodiscard]] U32U64 INeg(const U32U64& value);
211 [[nodiscard]] U32 IAbs(const U32& value); 214 [[nodiscard]] U32 IAbs(const U32& value);
212 [[nodiscard]] U32U64 ShiftLeftLogical(const U32U64& base, const U32& shift); 215 [[nodiscard]] U32U64 ShiftLeftLogical(const U32U64& base, const U32& shift);
@@ -356,6 +359,10 @@ public:
356 TextureInstInfo info); 359 TextureInstInfo info);
357 [[nodiscard]] Value ImageAtomicExchange(const Value& handle, const Value& coords, 360 [[nodiscard]] Value ImageAtomicExchange(const Value& handle, const Value& coords,
358 const Value& value, TextureInstInfo info); 361 const Value& value, TextureInstInfo info);
362
363 [[nodiscard]] U1 IsTextureScaled(const U32& index);
364 [[nodiscard]] U1 IsImageScaled(const U32& index);
365
359 [[nodiscard]] U1 VoteAll(const U1& value); 366 [[nodiscard]] U1 VoteAll(const U1& value);
360 [[nodiscard]] U1 VoteAny(const U1& value); 367 [[nodiscard]] U1 VoteAny(const U1& value);
361 [[nodiscard]] U1 VoteEqual(const U1& value); 368 [[nodiscard]] U1 VoteEqual(const U1& value);
diff --git a/src/shader_recompiler/frontend/ir/microinstruction.cpp b/src/shader_recompiler/frontend/ir/microinstruction.cpp
index 30b470bdd..97e2bf6af 100644
--- a/src/shader_recompiler/frontend/ir/microinstruction.cpp
+++ b/src/shader_recompiler/frontend/ir/microinstruction.cpp
@@ -47,6 +47,17 @@ Inst::Inst(IR::Opcode op_, u32 flags_) noexcept : op{op_}, flags{flags_} {
47 } 47 }
48} 48}
49 49
50Inst::Inst(const Inst& base) : op{base.op}, flags{base.flags} {
51 if (base.op == Opcode::Phi) {
52 throw NotImplementedException("Copying phi node");
53 }
54 std::construct_at(&args);
55 const size_t num_args{base.NumArgs()};
56 for (size_t index = 0; index < num_args; ++index) {
57 SetArg(index, base.Arg(index));
58 }
59}
60
50Inst::~Inst() { 61Inst::~Inst() {
51 if (op == Opcode::Phi) { 62 if (op == Opcode::Phi) {
52 std::destroy_at(&phi_args); 63 std::destroy_at(&phi_args);
diff --git a/src/shader_recompiler/frontend/ir/opcodes.inc b/src/shader_recompiler/frontend/ir/opcodes.inc
index d91098c80..6929919df 100644
--- a/src/shader_recompiler/frontend/ir/opcodes.inc
+++ b/src/shader_recompiler/frontend/ir/opcodes.inc
@@ -62,6 +62,7 @@ OPCODE(InvocationId, U32,
62OPCODE(SampleId, U32, ) 62OPCODE(SampleId, U32, )
63OPCODE(IsHelperInvocation, U1, ) 63OPCODE(IsHelperInvocation, U1, )
64OPCODE(YDirection, F32, ) 64OPCODE(YDirection, F32, )
65OPCODE(ResolutionDownFactor, F32, )
65 66
66// Undefined 67// Undefined
67OPCODE(UndefU1, U1, ) 68OPCODE(UndefU1, U1, )
@@ -286,6 +287,8 @@ OPCODE(IAdd64, U64, U64,
286OPCODE(ISub32, U32, U32, U32, ) 287OPCODE(ISub32, U32, U32, U32, )
287OPCODE(ISub64, U64, U64, U64, ) 288OPCODE(ISub64, U64, U64, U64, )
288OPCODE(IMul32, U32, U32, U32, ) 289OPCODE(IMul32, U32, U32, U32, )
290OPCODE(SDiv32, U32, U32, U32, )
291OPCODE(UDiv32, U32, U32, U32, )
289OPCODE(INeg32, U32, U32, ) 292OPCODE(INeg32, U32, U32, )
290OPCODE(INeg64, U64, U64, ) 293OPCODE(INeg64, U64, U64, )
291OPCODE(IAbs32, U32, U32, ) 294OPCODE(IAbs32, U32, U32, )
@@ -490,6 +493,9 @@ OPCODE(ImageGradient, F32x4, Opaq
490OPCODE(ImageRead, U32x4, Opaque, Opaque, ) 493OPCODE(ImageRead, U32x4, Opaque, Opaque, )
491OPCODE(ImageWrite, Void, Opaque, Opaque, U32x4, ) 494OPCODE(ImageWrite, Void, Opaque, Opaque, U32x4, )
492 495
496OPCODE(IsTextureScaled, U1, U32, )
497OPCODE(IsImageScaled, U1, U32, )
498
493// Atomic Image operations 499// Atomic Image operations
494 500
495OPCODE(BindlessImageAtomicIAdd32, U32, U32, Opaque, U32, ) 501OPCODE(BindlessImageAtomicIAdd32, U32, U32, Opaque, U32, )
diff --git a/src/shader_recompiler/frontend/ir/value.h b/src/shader_recompiler/frontend/ir/value.h
index 6c9ef6bdd..947579852 100644
--- a/src/shader_recompiler/frontend/ir/value.h
+++ b/src/shader_recompiler/frontend/ir/value.h
@@ -116,10 +116,10 @@ public:
116class Inst : public boost::intrusive::list_base_hook<> { 116class Inst : public boost::intrusive::list_base_hook<> {
117public: 117public:
118 explicit Inst(IR::Opcode op_, u32 flags_) noexcept; 118 explicit Inst(IR::Opcode op_, u32 flags_) noexcept;
119 explicit Inst(const Inst& base);
119 ~Inst(); 120 ~Inst();
120 121
121 Inst& operator=(const Inst&) = delete; 122 Inst& operator=(const Inst&) = delete;
122 Inst(const Inst&) = delete;
123 123
124 Inst& operator=(Inst&&) = delete; 124 Inst& operator=(Inst&&) = delete;
125 Inst(Inst&&) = delete; 125 Inst(Inst&&) = delete;
diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
index 2fc542f0e..267ebe4af 100644
--- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
@@ -179,6 +179,10 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
179 Optimization::TexturePass(env, program); 179 Optimization::TexturePass(env, program);
180 180
181 Optimization::ConstantPropagationPass(program); 181 Optimization::ConstantPropagationPass(program);
182
183 if (Settings::values.resolution_info.active) {
184 Optimization::RescalingPass(program);
185 }
182 Optimization::DeadCodeEliminationPass(program); 186 Optimization::DeadCodeEliminationPass(program);
183 if (Settings::values.renderer_debug) { 187 if (Settings::values.renderer_debug) {
184 Optimization::VerificationPass(program); 188 Optimization::VerificationPass(program);
diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
index f69e1c9cc..1e476d83d 100644
--- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
+++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
@@ -430,6 +430,11 @@ void VisitUsages(Info& info, IR::Inst& inst) {
430 case IR::Opcode::IsHelperInvocation: 430 case IR::Opcode::IsHelperInvocation:
431 info.uses_is_helper_invocation = true; 431 info.uses_is_helper_invocation = true;
432 break; 432 break;
433 case IR::Opcode::ResolutionDownFactor:
434 case IR::Opcode::IsTextureScaled:
435 case IR::Opcode::IsImageScaled:
436 info.uses_rescaling_uniform = true;
437 break;
433 case IR::Opcode::LaneId: 438 case IR::Opcode::LaneId:
434 info.uses_subgroup_invocation_id = true; 439 info.uses_subgroup_invocation_id = true;
435 break; 440 break;
diff --git a/src/shader_recompiler/ir_opt/passes.h b/src/shader_recompiler/ir_opt/passes.h
index 2f89b1ea0..f877c7ba0 100644
--- a/src/shader_recompiler/ir_opt/passes.h
+++ b/src/shader_recompiler/ir_opt/passes.h
@@ -19,6 +19,7 @@ void GlobalMemoryToStorageBufferPass(IR::Program& program);
19void IdentityRemovalPass(IR::Program& program); 19void IdentityRemovalPass(IR::Program& program);
20void LowerFp16ToFp32(IR::Program& program); 20void LowerFp16ToFp32(IR::Program& program);
21void LowerInt64ToInt32(IR::Program& program); 21void LowerInt64ToInt32(IR::Program& program);
22void RescalingPass(IR::Program& program);
22void SsaRewritePass(IR::Program& program); 23void SsaRewritePass(IR::Program& program);
23void TexturePass(Environment& env, IR::Program& program); 24void TexturePass(Environment& env, IR::Program& program);
24void VerificationPass(const IR::Program& program); 25void VerificationPass(const IR::Program& program);
diff --git a/src/shader_recompiler/ir_opt/rescaling_pass.cpp b/src/shader_recompiler/ir_opt/rescaling_pass.cpp
new file mode 100644
index 000000000..c28500dd1
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/rescaling_pass.cpp
@@ -0,0 +1,327 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "common/alignment.h"
6#include "common/settings.h"
7#include "shader_recompiler/environment.h"
8#include "shader_recompiler/frontend/ir/ir_emitter.h"
9#include "shader_recompiler/frontend/ir/modifiers.h"
10#include "shader_recompiler/frontend/ir/program.h"
11#include "shader_recompiler/frontend/ir/value.h"
12#include "shader_recompiler/ir_opt/passes.h"
13#include "shader_recompiler/shader_info.h"
14
15namespace Shader::Optimization {
16namespace {
17[[nodiscard]] bool IsTextureTypeRescalable(TextureType type) {
18 switch (type) {
19 case TextureType::Color2D:
20 case TextureType::ColorArray2D:
21 return true;
22 case TextureType::Color1D:
23 case TextureType::ColorArray1D:
24 case TextureType::Color3D:
25 case TextureType::ColorCube:
26 case TextureType::ColorArrayCube:
27 case TextureType::Buffer:
28 break;
29 }
30 return false;
31}
32
33void VisitMark(IR::Block& block, IR::Inst& inst) {
34 switch (inst.GetOpcode()) {
35 case IR::Opcode::ShuffleIndex:
36 case IR::Opcode::ShuffleUp:
37 case IR::Opcode::ShuffleDown:
38 case IR::Opcode::ShuffleButterfly: {
39 const IR::Value shfl_arg{inst.Arg(0)};
40 if (shfl_arg.IsImmediate()) {
41 break;
42 }
43 const IR::Inst* const arg_inst{shfl_arg.InstRecursive()};
44 if (arg_inst->GetOpcode() != IR::Opcode::BitCastU32F32) {
45 break;
46 }
47 const IR::Value bitcast_arg{arg_inst->Arg(0)};
48 if (bitcast_arg.IsImmediate()) {
49 break;
50 }
51 IR::Inst* const bitcast_inst{bitcast_arg.InstRecursive()};
52 bool must_patch_outside = false;
53 if (bitcast_inst->GetOpcode() == IR::Opcode::GetAttribute) {
54 const IR::Attribute attr{bitcast_inst->Arg(0).Attribute()};
55 switch (attr) {
56 case IR::Attribute::PositionX:
57 case IR::Attribute::PositionY:
58 bitcast_inst->SetFlags<u32>(0xDEADBEEF);
59 must_patch_outside = true;
60 break;
61 default:
62 break;
63 }
64 }
65 if (must_patch_outside) {
66 const auto it{IR::Block::InstructionList::s_iterator_to(inst)};
67 IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
68 const IR::F32 new_inst{&*block.PrependNewInst(it, inst)};
69 const IR::F32 up_factor{ir.FPRecip(ir.ResolutionDownFactor())};
70 const IR::Value converted{ir.FPMul(new_inst, up_factor)};
71 inst.ReplaceUsesWith(converted);
72 }
73 break;
74 }
75
76 default:
77 break;
78 }
79}
80
81void PatchFragCoord(IR::Block& block, IR::Inst& inst) {
82 IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
83 const IR::F32 down_factor{ir.ResolutionDownFactor()};
84 const IR::F32 frag_coord{ir.GetAttribute(inst.Arg(0).Attribute())};
85 const IR::F32 downscaled_frag_coord{ir.FPMul(frag_coord, down_factor)};
86 inst.ReplaceUsesWith(downscaled_frag_coord);
87}
88
89void PatchPointSize(IR::Block& block, IR::Inst& inst) {
90 IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
91 const IR::F32 point_value{inst.Arg(1)};
92 const IR::F32 up_factor{ir.FPRecip(ir.ResolutionDownFactor())};
93 const IR::F32 upscaled_point_value{ir.FPMul(point_value, up_factor)};
94 inst.SetArg(1, upscaled_point_value);
95}
96
97[[nodiscard]] IR::U32 Scale(IR::IREmitter& ir, const IR::U1& is_scaled, const IR::U32& value) {
98 IR::U32 scaled_value{value};
99 if (const u32 up_scale = Settings::values.resolution_info.up_scale; up_scale != 1) {
100 scaled_value = ir.IMul(scaled_value, ir.Imm32(up_scale));
101 }
102 if (const u32 down_shift = Settings::values.resolution_info.down_shift; down_shift != 0) {
103 scaled_value = ir.ShiftRightArithmetic(scaled_value, ir.Imm32(down_shift));
104 }
105 return IR::U32{ir.Select(is_scaled, scaled_value, value)};
106}
107
108[[nodiscard]] IR::U32 SubScale(IR::IREmitter& ir, const IR::U1& is_scaled, const IR::U32& value,
109 const IR::Attribute attrib) {
110 const IR::F32 up_factor{ir.Imm32(Settings::values.resolution_info.up_factor)};
111 const IR::F32 base{ir.FPMul(ir.ConvertUToF(32, 32, value), up_factor)};
112 const IR::F32 frag_coord{ir.GetAttribute(attrib)};
113 const IR::F32 down_factor{ir.Imm32(Settings::values.resolution_info.down_factor)};
114 const IR::F32 floor{ir.FPMul(up_factor, ir.FPFloor(ir.FPMul(frag_coord, down_factor)))};
115 const IR::F16F32F64 deviation{ir.FPAdd(base, ir.FPAdd(frag_coord, ir.FPNeg(floor)))};
116 return IR::U32{ir.Select(is_scaled, ir.ConvertFToU(32, deviation), value)};
117}
118
119[[nodiscard]] IR::U32 DownScale(IR::IREmitter& ir, const IR::U1& is_scaled, const IR::U32& value) {
120 IR::U32 scaled_value{value};
121 if (const u32 down_shift = Settings::values.resolution_info.down_shift; down_shift != 0) {
122 scaled_value = ir.ShiftLeftLogical(scaled_value, ir.Imm32(down_shift));
123 }
124 if (const u32 up_scale = Settings::values.resolution_info.up_scale; up_scale != 1) {
125 scaled_value = ir.IDiv(scaled_value, ir.Imm32(up_scale));
126 }
127 return IR::U32{ir.Select(is_scaled, scaled_value, value)};
128}
129
130void PatchImageQueryDimensions(IR::Block& block, IR::Inst& inst) {
131 const auto it{IR::Block::InstructionList::s_iterator_to(inst)};
132 IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
133 const auto info{inst.Flags<IR::TextureInstInfo>()};
134 const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))};
135 switch (info.type) {
136 case TextureType::Color2D:
137 case TextureType::ColorArray2D: {
138 const IR::Value new_inst{&*block.PrependNewInst(it, inst)};
139 const IR::U32 width{DownScale(ir, is_scaled, IR::U32{ir.CompositeExtract(new_inst, 0)})};
140 const IR::U32 height{DownScale(ir, is_scaled, IR::U32{ir.CompositeExtract(new_inst, 1)})};
141 const IR::Value replacement{ir.CompositeConstruct(
142 width, height, ir.CompositeExtract(new_inst, 2), ir.CompositeExtract(new_inst, 3))};
143 inst.ReplaceUsesWith(replacement);
144 break;
145 }
146 case TextureType::Color1D:
147 case TextureType::ColorArray1D:
148 case TextureType::Color3D:
149 case TextureType::ColorCube:
150 case TextureType::ColorArrayCube:
151 case TextureType::Buffer:
152 // Nothing to patch here
153 break;
154 }
155}
156
157void ScaleIntegerComposite(IR::IREmitter& ir, IR::Inst& inst, const IR::U1& is_scaled,
158 size_t index) {
159 const IR::Value composite{inst.Arg(index)};
160 if (composite.IsEmpty()) {
161 return;
162 }
163 const auto info{inst.Flags<IR::TextureInstInfo>()};
164 const IR::U32 x{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(composite, 0)})};
165 const IR::U32 y{Scale(ir, is_scaled, IR::U32{ir.CompositeExtract(composite, 1)})};
166 switch (info.type) {
167 case TextureType::Color2D:
168 inst.SetArg(index, ir.CompositeConstruct(x, y));
169 break;
170 case TextureType::ColorArray2D: {
171 const IR::U32 z{ir.CompositeExtract(composite, 2)};
172 inst.SetArg(index, ir.CompositeConstruct(x, y, z));
173 break;
174 }
175 case TextureType::Color1D:
176 case TextureType::ColorArray1D:
177 case TextureType::Color3D:
178 case TextureType::ColorCube:
179 case TextureType::ColorArrayCube:
180 case TextureType::Buffer:
181 // Nothing to patch here
182 break;
183 }
184}
185
186void SubScaleCoord(IR::IREmitter& ir, IR::Inst& inst, const IR::U1& is_scaled) {
187 const auto info{inst.Flags<IR::TextureInstInfo>()};
188 const IR::Value coord{inst.Arg(1)};
189 const IR::U32 coord_x{ir.CompositeExtract(coord, 0)};
190 const IR::U32 coord_y{ir.CompositeExtract(coord, 1)};
191
192 const IR::U32 scaled_x{SubScale(ir, is_scaled, coord_x, IR::Attribute::PositionX)};
193 const IR::U32 scaled_y{SubScale(ir, is_scaled, coord_y, IR::Attribute::PositionY)};
194 switch (info.type) {
195 case TextureType::Color2D:
196 inst.SetArg(1, ir.CompositeConstruct(scaled_x, scaled_y));
197 break;
198 case TextureType::ColorArray2D: {
199 const IR::U32 z{ir.CompositeExtract(coord, 2)};
200 inst.SetArg(1, ir.CompositeConstruct(scaled_x, scaled_y, z));
201 break;
202 }
203 case TextureType::Color1D:
204 case TextureType::ColorArray1D:
205 case TextureType::Color3D:
206 case TextureType::ColorCube:
207 case TextureType::ColorArrayCube:
208 case TextureType::Buffer:
209 // Nothing to patch here
210 break;
211 }
212}
213
214void SubScaleImageFetch(IR::Block& block, IR::Inst& inst) {
215 IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
216 const auto info{inst.Flags<IR::TextureInstInfo>()};
217 if (!IsTextureTypeRescalable(info.type)) {
218 return;
219 }
220 const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))};
221 SubScaleCoord(ir, inst, is_scaled);
222 // Scale ImageFetch offset
223 ScaleIntegerComposite(ir, inst, is_scaled, 2);
224}
225
226void SubScaleImageRead(IR::Block& block, IR::Inst& inst) {
227 IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
228 const auto info{inst.Flags<IR::TextureInstInfo>()};
229 if (!IsTextureTypeRescalable(info.type)) {
230 return;
231 }
232 const IR::U1 is_scaled{ir.IsImageScaled(ir.Imm32(info.descriptor_index))};
233 SubScaleCoord(ir, inst, is_scaled);
234}
235
236void PatchImageFetch(IR::Block& block, IR::Inst& inst) {
237 IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
238 const auto info{inst.Flags<IR::TextureInstInfo>()};
239 if (!IsTextureTypeRescalable(info.type)) {
240 return;
241 }
242 const IR::U1 is_scaled{ir.IsTextureScaled(ir.Imm32(info.descriptor_index))};
243 ScaleIntegerComposite(ir, inst, is_scaled, 1);
244 // Scale ImageFetch offset
245 ScaleIntegerComposite(ir, inst, is_scaled, 2);
246}
247
248void PatchImageRead(IR::Block& block, IR::Inst& inst) {
249 IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
250 const auto info{inst.Flags<IR::TextureInstInfo>()};
251 if (!IsTextureTypeRescalable(info.type)) {
252 return;
253 }
254 const IR::U1 is_scaled{ir.IsImageScaled(ir.Imm32(info.descriptor_index))};
255 ScaleIntegerComposite(ir, inst, is_scaled, 1);
256}
257
258void Visit(const IR::Program& program, IR::Block& block, IR::Inst& inst) {
259 const bool is_fragment_shader{program.stage == Stage::Fragment};
260 switch (inst.GetOpcode()) {
261 case IR::Opcode::GetAttribute: {
262 const IR::Attribute attr{inst.Arg(0).Attribute()};
263 switch (attr) {
264 case IR::Attribute::PositionX:
265 case IR::Attribute::PositionY:
266 if (is_fragment_shader && inst.Flags<u32>() != 0xDEADBEEF) {
267 PatchFragCoord(block, inst);
268 }
269 break;
270 default:
271 break;
272 }
273 break;
274 }
275 case IR::Opcode::SetAttribute: {
276 const IR::Attribute attr{inst.Arg(0).Attribute()};
277 switch (attr) {
278 case IR::Attribute::PointSize:
279 if (inst.Flags<u32>() != 0xDEADBEEF) {
280 PatchPointSize(block, inst);
281 }
282 break;
283 default:
284 break;
285 }
286 break;
287 }
288 case IR::Opcode::ImageQueryDimensions:
289 PatchImageQueryDimensions(block, inst);
290 break;
291 case IR::Opcode::ImageFetch:
292 if (is_fragment_shader) {
293 SubScaleImageFetch(block, inst);
294 } else {
295 PatchImageFetch(block, inst);
296 }
297 break;
298 case IR::Opcode::ImageRead:
299 if (is_fragment_shader) {
300 SubScaleImageRead(block, inst);
301 } else {
302 PatchImageRead(block, inst);
303 }
304 break;
305 default:
306 break;
307 }
308}
309} // Anonymous namespace
310
311void RescalingPass(IR::Program& program) {
312 const bool is_fragment_shader{program.stage == Stage::Fragment};
313 if (is_fragment_shader) {
314 for (IR::Block* const block : program.post_order_blocks) {
315 for (IR::Inst& inst : block->Instructions()) {
316 VisitMark(*block, inst);
317 }
318 }
319 }
320 for (IR::Block* const block : program.post_order_blocks) {
321 for (IR::Inst& inst : block->Instructions()) {
322 Visit(program, *block, inst);
323 }
324 }
325}
326
327} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h
index 4ef4dbd40..9f375c30e 100644
--- a/src/shader_recompiler/shader_info.h
+++ b/src/shader_recompiler/shader_info.h
@@ -172,6 +172,7 @@ struct Info {
172 bool uses_global_memory{}; 172 bool uses_global_memory{};
173 bool uses_atomic_image_u32{}; 173 bool uses_atomic_image_u32{};
174 bool uses_shadow_lod{}; 174 bool uses_shadow_lod{};
175 bool uses_rescaling_uniform{};
175 176
176 IR::Type used_constant_buffer_types{}; 177 IR::Type used_constant_buffer_types{};
177 IR::Type used_storage_buffer_types{}; 178 IR::Type used_storage_buffer_types{};
@@ -190,4 +191,13 @@ struct Info {
190 ImageDescriptors image_descriptors; 191 ImageDescriptors image_descriptors;
191}; 192};
192 193
194template <typename Descriptors>
195u32 NumDescriptors(const Descriptors& descriptors) {
196 u32 num{};
197 for (const auto& desc : descriptors) {
198 num += desc.count;
199 }
200 return num;
201}
202
193} // namespace Shader 203} // namespace Shader
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 6aac7f305..91a30fef7 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -132,6 +132,8 @@ add_library(video_core STATIC
132 renderer_vulkan/vk_descriptor_pool.h 132 renderer_vulkan/vk_descriptor_pool.h
133 renderer_vulkan/vk_fence_manager.cpp 133 renderer_vulkan/vk_fence_manager.cpp
134 renderer_vulkan/vk_fence_manager.h 134 renderer_vulkan/vk_fence_manager.h
135 renderer_vulkan/vk_fsr.cpp
136 renderer_vulkan/vk_fsr.h
135 renderer_vulkan/vk_graphics_pipeline.cpp 137 renderer_vulkan/vk_graphics_pipeline.cpp
136 renderer_vulkan/vk_graphics_pipeline.h 138 renderer_vulkan/vk_graphics_pipeline.h
137 renderer_vulkan/vk_master_semaphore.cpp 139 renderer_vulkan/vk_master_semaphore.cpp
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index d350c9b36..43bed63ac 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -853,12 +853,14 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
853 } 853 }
854 if constexpr (USE_MEMORY_MAPS) { 854 if constexpr (USE_MEMORY_MAPS) {
855 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); 855 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
856 runtime.PreCopyBarrier();
856 for (auto& [copy, buffer_id] : downloads) { 857 for (auto& [copy, buffer_id] : downloads) {
857 // Have in mind the staging buffer offset for the copy 858 // Have in mind the staging buffer offset for the copy
858 copy.dst_offset += download_staging.offset; 859 copy.dst_offset += download_staging.offset;
859 const std::array copies{copy}; 860 const std::array copies{copy};
860 runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies); 861 runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false);
861 } 862 }
863 runtime.PostCopyBarrier();
862 runtime.Finish(); 864 runtime.Finish();
863 for (const auto& [copy, buffer_id] : downloads) { 865 for (const auto& [copy, buffer_id] : downloads) {
864 const Buffer& buffer = slot_buffers[buffer_id]; 866 const Buffer& buffer = slot_buffers[buffer_id];
diff --git a/src/video_core/dirty_flags.h b/src/video_core/dirty_flags.h
index f0d545f90..d63ad5a35 100644
--- a/src/video_core/dirty_flags.h
+++ b/src/video_core/dirty_flags.h
@@ -29,6 +29,8 @@ enum : u8 {
29 ColorBuffer6, 29 ColorBuffer6,
30 ColorBuffer7, 30 ColorBuffer7,
31 ZetaBuffer, 31 ZetaBuffer,
32 RescaleViewports,
33 RescaleScissors,
32 34
33 VertexBuffers, 35 VertexBuffers,
34 VertexBuffer0, 36 VertexBuffer0,
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 20d748c12..d779a967a 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -1,3 +1,11 @@
1set(FIDELITYFX_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/externals/FidelityFX-FSR/ffx-fsr)
2
3set(GLSL_INCLUDES
4 fidelityfx_fsr.comp
5 ${FIDELITYFX_INCLUDE_DIR}/ffx_a.h
6 ${FIDELITYFX_INCLUDE_DIR}/ffx_fsr1.h
7)
8
1set(SHADER_FILES 9set(SHADER_FILES
2 astc_decoder.comp 10 astc_decoder.comp
3 block_linear_unswizzle_2d.comp 11 block_linear_unswizzle_2d.comp
@@ -5,14 +13,25 @@ set(SHADER_FILES
5 convert_depth_to_float.frag 13 convert_depth_to_float.frag
6 convert_float_to_depth.frag 14 convert_float_to_depth.frag
7 full_screen_triangle.vert 15 full_screen_triangle.vert
16 fxaa.frag
17 fxaa.vert
8 opengl_copy_bc4.comp 18 opengl_copy_bc4.comp
9 opengl_present.frag 19 opengl_present.frag
10 opengl_present.vert 20 opengl_present.vert
21 opengl_present_scaleforce.frag
11 pitch_unswizzle.comp 22 pitch_unswizzle.comp
23 present_bicubic.frag
24 present_gaussian.frag
12 vulkan_blit_color_float.frag 25 vulkan_blit_color_float.frag
13 vulkan_blit_depth_stencil.frag 26 vulkan_blit_depth_stencil.frag
27 vulkan_fidelityfx_fsr_easu_fp16.comp
28 vulkan_fidelityfx_fsr_easu_fp32.comp
29 vulkan_fidelityfx_fsr_rcas_fp16.comp
30 vulkan_fidelityfx_fsr_rcas_fp32.comp
14 vulkan_present.frag 31 vulkan_present.frag
15 vulkan_present.vert 32 vulkan_present.vert
33 vulkan_present_scaleforce_fp16.frag
34 vulkan_present_scaleforce_fp32.frag
16 vulkan_quad_indexed.comp 35 vulkan_quad_indexed.comp
17 vulkan_uint8.comp 36 vulkan_uint8.comp
18) 37)
@@ -76,7 +95,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES})
76 OUTPUT 95 OUTPUT
77 ${SPIRV_HEADER_FILE} 96 ${SPIRV_HEADER_FILE}
78 COMMAND 97 COMMAND
79 ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} 98 ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE}
80 MAIN_DEPENDENCY 99 MAIN_DEPENDENCY
81 ${SOURCE_FILE} 100 ${SOURCE_FILE}
82 ) 101 )
@@ -84,9 +103,12 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES})
84 endif() 103 endif()
85endforeach() 104endforeach()
86 105
106set(SHADER_SOURCES ${SHADER_FILES})
107list(APPEND SHADER_SOURCES ${GLSL_INCLUDES})
108
87add_custom_target(host_shaders 109add_custom_target(host_shaders
88 DEPENDS 110 DEPENDS
89 ${SHADER_HEADERS} 111 ${SHADER_HEADERS}
90 SOURCES 112 SOURCES
91 ${SHADER_FILES} 113 ${SHADER_SOURCES}
92) 114)
diff --git a/src/video_core/host_shaders/fidelityfx_fsr.comp b/src/video_core/host_shaders/fidelityfx_fsr.comp
new file mode 100644
index 000000000..6b97f789d
--- /dev/null
+++ b/src/video_core/host_shaders/fidelityfx_fsr.comp
@@ -0,0 +1,116 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5//!#version 460 core
6#extension GL_ARB_separate_shader_objects : enable
7#extension GL_ARB_shading_language_420pack : enable
8#extension GL_GOOGLE_include_directive : enable
9#extension GL_EXT_shader_explicit_arithmetic_types : require
10
11// FidelityFX Super Resolution Sample
12//
13// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
14// Permission is hereby granted, free of charge, to any person obtaining a copy
15// of this software and associated documentation files(the "Software"), to deal
16// in the Software without restriction, including without limitation the rights
17// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
18// copies of the Software, and to permit persons to whom the Software is
19// furnished to do so, subject to the following conditions :
20// The above copyright notice and this permission notice shall be included in
21// all copies or substantial portions of the Software.
22// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
25// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
28// THE SOFTWARE.
29
30layout( push_constant ) uniform constants {
31 uvec4 Const0;
32 uvec4 Const1;
33 uvec4 Const2;
34 uvec4 Const3;
35};
36
37layout(set=0,binding=0) uniform sampler2D InputTexture;
38layout(set=0,binding=1,rgba16f) uniform image2D OutputTexture;
39
40#define A_GPU 1
41#define A_GLSL 1
42
43#ifndef YUZU_USE_FP16
44 #include "ffx_a.h"
45
46 #if USE_EASU
47 #define FSR_EASU_F 1
48 AF4 FsrEasuRF(AF2 p) { AF4 res = textureGather(InputTexture, p, 0); return res; }
49 AF4 FsrEasuGF(AF2 p) { AF4 res = textureGather(InputTexture, p, 1); return res; }
50 AF4 FsrEasuBF(AF2 p) { AF4 res = textureGather(InputTexture, p, 2); return res; }
51 #endif
52 #if USE_RCAS
53 #define FSR_RCAS_F 1
54 AF4 FsrRcasLoadF(ASU2 p) { return texelFetch(InputTexture, ASU2(p), 0); }
55 void FsrRcasInputF(inout AF1 r, inout AF1 g, inout AF1 b) {}
56 #endif
57#else
58 #define A_HALF
59 #include "ffx_a.h"
60
61 #if USE_EASU
62 #define FSR_EASU_H 1
63 AH4 FsrEasuRH(AF2 p) { AH4 res = AH4(textureGather(InputTexture, p, 0)); return res; }
64 AH4 FsrEasuGH(AF2 p) { AH4 res = AH4(textureGather(InputTexture, p, 1)); return res; }
65 AH4 FsrEasuBH(AF2 p) { AH4 res = AH4(textureGather(InputTexture, p, 2)); return res; }
66 #endif
67 #if USE_RCAS
68 #define FSR_RCAS_H 1
69 AH4 FsrRcasLoadH(ASW2 p) { return AH4(texelFetch(InputTexture, ASU2(p), 0)); }
70 void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b){}
71 #endif
72#endif
73
74#include "ffx_fsr1.h"
75
76void CurrFilter(AU2 pos) {
77#if USE_BILINEAR
78 AF2 pp = (AF2(pos) * AF2_AU2(Const0.xy) + AF2_AU2(Const0.zw)) * AF2_AU2(Const1.xy) + AF2(0.5, -0.5) * AF2_AU2(Const1.zw);
79 imageStore(OutputTexture, ASU2(pos), textureLod(InputTexture, pp, 0.0));
80#endif
81#if USE_EASU
82 #ifndef YUZU_USE_FP16
83 AF3 c;
84 FsrEasuF(c, pos, Const0, Const1, Const2, Const3);
85 imageStore(OutputTexture, ASU2(pos), AF4(c, 1));
86 #else
87 AH3 c;
88 FsrEasuH(c, pos, Const0, Const1, Const2, Const3);
89 imageStore(OutputTexture, ASU2(pos), AH4(c, 1));
90 #endif
91#endif
92#if USE_RCAS
93 #ifndef YUZU_USE_FP16
94 AF3 c;
95 FsrRcasF(c.r, c.g, c.b, pos, Const0);
96 imageStore(OutputTexture, ASU2(pos), AF4(c, 1));
97 #else
98 AH3 c;
99 FsrRcasH(c.r, c.g, c.b, pos, Const0);
100 imageStore(OutputTexture, ASU2(pos), AH4(c, 1));
101 #endif
102#endif
103}
104
105layout(local_size_x=64) in;
106void main() {
107 // Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
108 AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u);
109 CurrFilter(gxy);
110 gxy.x += 8u;
111 CurrFilter(gxy);
112 gxy.y += 8u;
113 CurrFilter(gxy);
114 gxy.x -= 8u;
115 CurrFilter(gxy);
116}
diff --git a/src/video_core/host_shaders/fxaa.frag b/src/video_core/host_shaders/fxaa.frag
new file mode 100644
index 000000000..02f4068d1
--- /dev/null
+++ b/src/video_core/host_shaders/fxaa.frag
@@ -0,0 +1,76 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5// Source code is adapted from
6// https://www.geeks3d.com/20110405/fxaa-fast-approximate-anti-aliasing-demo-glsl-opengl-test-radeon-geforce/3/
7
8#version 460
9
10#ifdef VULKAN
11
12#define BINDING_COLOR_TEXTURE 1
13
14#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
15
16#define BINDING_COLOR_TEXTURE 0
17
18#endif
19
20layout (location = 0) in vec4 posPos;
21
22layout (location = 0) out vec4 frag_color;
23
24layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D input_texture;
25
26const float FXAA_SPAN_MAX = 8.0;
27const float FXAA_REDUCE_MUL = 1.0 / 8.0;
28const float FXAA_REDUCE_MIN = 1.0 / 128.0;
29
30#define FxaaTexLod0(t, p) textureLod(t, p, 0.0)
31#define FxaaTexOff(t, p, o) textureLodOffset(t, p, 0.0, o)
32
33vec3 FxaaPixelShader(vec4 posPos, sampler2D tex) {
34
35 vec3 rgbNW = FxaaTexLod0(tex, posPos.zw).xyz;
36 vec3 rgbNE = FxaaTexOff(tex, posPos.zw, ivec2(1,0)).xyz;
37 vec3 rgbSW = FxaaTexOff(tex, posPos.zw, ivec2(0,1)).xyz;
38 vec3 rgbSE = FxaaTexOff(tex, posPos.zw, ivec2(1,1)).xyz;
39 vec3 rgbM = FxaaTexLod0(tex, posPos.xy).xyz;
40/*---------------------------------------------------------*/
41 vec3 luma = vec3(0.299, 0.587, 0.114);
42 float lumaNW = dot(rgbNW, luma);
43 float lumaNE = dot(rgbNE, luma);
44 float lumaSW = dot(rgbSW, luma);
45 float lumaSE = dot(rgbSE, luma);
46 float lumaM = dot(rgbM, luma);
47/*---------------------------------------------------------*/
48 float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE)));
49 float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE)));
50/*---------------------------------------------------------*/
51 vec2 dir;
52 dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE));
53 dir.y = ((lumaNW + lumaSW) - (lumaNE + lumaSE));
54/*---------------------------------------------------------*/
55 float dirReduce = max(
56 (lumaNW + lumaNE + lumaSW + lumaSE) * (0.25 * FXAA_REDUCE_MUL),
57 FXAA_REDUCE_MIN);
58 float rcpDirMin = 1.0/(min(abs(dir.x), abs(dir.y)) + dirReduce);
59 dir = min(vec2( FXAA_SPAN_MAX, FXAA_SPAN_MAX),
60 max(vec2(-FXAA_SPAN_MAX, -FXAA_SPAN_MAX),
61 dir * rcpDirMin)) / textureSize(tex, 0);
62/*--------------------------------------------------------*/
63 vec3 rgbA = (1.0 / 2.0) * (
64 FxaaTexLod0(tex, posPos.xy + dir * (1.0 / 3.0 - 0.5)).xyz +
65 FxaaTexLod0(tex, posPos.xy + dir * (2.0 / 3.0 - 0.5)).xyz);
66 vec3 rgbB = rgbA * (1.0 / 2.0) + (1.0 / 4.0) * (
67 FxaaTexLod0(tex, posPos.xy + dir * (0.0 / 3.0 - 0.5)).xyz +
68 FxaaTexLod0(tex, posPos.xy + dir * (3.0 / 3.0 - 0.5)).xyz);
69 float lumaB = dot(rgbB, luma);
70 if((lumaB < lumaMin) || (lumaB > lumaMax)) return rgbA;
71 return rgbB;
72}
73
74void main() {
75 frag_color = vec4(FxaaPixelShader(posPos, input_texture), 1.0);
76}
diff --git a/src/video_core/host_shaders/fxaa.vert b/src/video_core/host_shaders/fxaa.vert
new file mode 100644
index 000000000..ac20c04e9
--- /dev/null
+++ b/src/video_core/host_shaders/fxaa.vert
@@ -0,0 +1,38 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#version 460
6
7out gl_PerVertex {
8 vec4 gl_Position;
9};
10
11const vec2 vertices[4] =
12 vec2[4](vec2(-1.0, 1.0), vec2(1.0, 1.0), vec2(-1.0, -1.0), vec2(1.0, -1.0));
13
14layout (location = 0) out vec4 posPos;
15
16#ifdef VULKAN
17
18#define BINDING_COLOR_TEXTURE 0
19#define VERTEX_ID gl_VertexIndex
20
21#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
22
23#define BINDING_COLOR_TEXTURE 0
24#define VERTEX_ID gl_VertexID
25
26#endif
27
28layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D input_texture;
29
30const float FXAA_SUBPIX_SHIFT = 0;
31
32void main() {
33 vec2 vertex = vertices[VERTEX_ID];
34 gl_Position = vec4(vertex, 0.0, 1.0);
35 vec2 vert_tex_coord = (vertex + 1.0) / 2.0;
36 posPos.xy = vert_tex_coord;
37 posPos.zw = vert_tex_coord - (0.5 + FXAA_SUBPIX_SHIFT) / textureSize(input_texture, 0);
38}
diff --git a/src/video_core/host_shaders/opengl_present_scaleforce.frag b/src/video_core/host_shaders/opengl_present_scaleforce.frag
new file mode 100644
index 000000000..71ff9e1e3
--- /dev/null
+++ b/src/video_core/host_shaders/opengl_present_scaleforce.frag
@@ -0,0 +1,130 @@
1// MIT License
2//
3// Copyright (c) 2020 BreadFish64
4//
5// Permission is hereby granted, free of charge, to any person obtaining a copy
6// of this software and associated documentation files (the "Software"), to deal
7// in the Software without restriction, including without limitation the rights
8// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9// copies of the Software, and to permit persons to whom the Software is
10// furnished to do so, subject to the following conditions:
11//
12// The above copyright notice and this permission notice shall be included in all
13// copies or substantial portions of the Software.
14//
15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21// SOFTWARE.
22
23// Adapted from https://github.com/BreadFish64/ScaleFish/tree/master/scaleforce
24
25//! #version 460
26
27#extension GL_ARB_separate_shader_objects : enable
28
29#ifdef YUZU_USE_FP16
30
31#extension GL_AMD_gpu_shader_half_float : enable
32#extension GL_NV_gpu_shader5 : enable
33
34#define lfloat float16_t
35#define lvec2 f16vec2
36#define lvec3 f16vec3
37#define lvec4 f16vec4
38
39#else
40
41#define lfloat float
42#define lvec2 vec2
43#define lvec3 vec3
44#define lvec4 vec4
45
46#endif
47
48#ifdef VULKAN
49
50#define BINDING_COLOR_TEXTURE 1
51
52#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
53
54#define BINDING_COLOR_TEXTURE 0
55
56#endif
57
58layout (location = 0) in vec2 tex_coord;
59
60layout (location = 0) out vec4 frag_color;
61
62layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D input_texture;
63
64const bool ignore_alpha = true;
65
66lfloat ColorDist1(lvec4 a, lvec4 b) {
67 // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.2020_conversion
68 const lvec3 K = lvec3(0.2627, 0.6780, 0.0593);
69 const lfloat scaleB = lfloat(0.5) / (lfloat(1.0) - K.b);
70 const lfloat scaleR = lfloat(0.5) / (lfloat(1.0) - K.r);
71 lvec4 diff = a - b;
72 lfloat Y = dot(diff.rgb, K);
73 lfloat Cb = scaleB * (diff.b - Y);
74 lfloat Cr = scaleR * (diff.r - Y);
75 lvec3 YCbCr = lvec3(Y, Cb, Cr);
76 lfloat d = length(YCbCr);
77 if (ignore_alpha) {
78 return d;
79 }
80 return sqrt(a.a * b.a * d * d + diff.a * diff.a);
81}
82
83lvec4 ColorDist(lvec4 ref, lvec4 A, lvec4 B, lvec4 C, lvec4 D) {
84 return lvec4(
85 ColorDist1(ref, A),
86 ColorDist1(ref, B),
87 ColorDist1(ref, C),
88 ColorDist1(ref, D)
89 );
90}
91
92vec4 Scaleforce(sampler2D tex, vec2 tex_coord) {
93 lvec4 bl = lvec4(textureOffset(tex, tex_coord, ivec2(-1, -1)));
94 lvec4 bc = lvec4(textureOffset(tex, tex_coord, ivec2(0, -1)));
95 lvec4 br = lvec4(textureOffset(tex, tex_coord, ivec2(1, -1)));
96 lvec4 cl = lvec4(textureOffset(tex, tex_coord, ivec2(-1, 0)));
97 lvec4 cc = lvec4(texture(tex, tex_coord));
98 lvec4 cr = lvec4(textureOffset(tex, tex_coord, ivec2(1, 0)));
99 lvec4 tl = lvec4(textureOffset(tex, tex_coord, ivec2(-1, 1)));
100 lvec4 tc = lvec4(textureOffset(tex, tex_coord, ivec2(0, 1)));
101 lvec4 tr = lvec4(textureOffset(tex, tex_coord, ivec2(1, 1)));
102
103 lvec4 offset_tl = ColorDist(cc, tl, tc, tr, cr);
104 lvec4 offset_br = ColorDist(cc, br, bc, bl, cl);
105
106 // Calculate how different cc is from the texels around it
107 const lfloat plus_weight = lfloat(1.5);
108 const lfloat cross_weight = lfloat(1.5);
109 lfloat total_dist = dot(offset_tl + offset_br, lvec4(cross_weight, plus_weight, cross_weight, plus_weight));
110
111 if (total_dist == lfloat(0.0)) {
112 return cc;
113 } else {
114 // Add together all the distances with direction taken into account
115 lvec4 tmp = offset_tl - offset_br;
116 lvec2 total_offset = tmp.wy * plus_weight + (tmp.zz + lvec2(-tmp.x, tmp.x)) * cross_weight;
117
118 // When the image has thin points, they tend to split apart.
119 // This is because the texels all around are different and total_offset reaches into clear areas.
120 // This works pretty well to keep the offset in bounds for these cases.
121 lfloat clamp_val = length(total_offset) / total_dist;
122 vec2 final_offset = vec2(clamp(total_offset, -clamp_val, clamp_val)) / textureSize(tex, 0);
123
124 return texture(tex, tex_coord - final_offset);
125 }
126}
127
128void main() {
129 frag_color = Scaleforce(input_texture, tex_coord);
130}
diff --git a/src/video_core/host_shaders/present_bicubic.frag b/src/video_core/host_shaders/present_bicubic.frag
new file mode 100644
index 000000000..902b70c2b
--- /dev/null
+++ b/src/video_core/host_shaders/present_bicubic.frag
@@ -0,0 +1,67 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#version 460 core
6
7#ifdef VULKAN
8
9#define BINDING_COLOR_TEXTURE 1
10
11#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
12
13#define BINDING_COLOR_TEXTURE 0
14
15#endif
16
17
18layout (location = 0) in vec2 frag_tex_coord;
19
20layout (location = 0) out vec4 color;
21
22layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D color_texture;
23
24vec4 cubic(float v) {
25 vec4 n = vec4(1.0, 2.0, 3.0, 4.0) - v;
26 vec4 s = n * n * n;
27 float x = s.x;
28 float y = s.y - 4.0 * s.x;
29 float z = s.z - 4.0 * s.y + 6.0 * s.x;
30 float w = 6.0 - x - y - z;
31 return vec4(x, y, z, w) * (1.0 / 6.0);
32}
33
34vec4 textureBicubic( sampler2D textureSampler, vec2 texCoords ) {
35
36 vec2 texSize = textureSize(textureSampler, 0);
37 vec2 invTexSize = 1.0 / texSize;
38
39 texCoords = texCoords * texSize - 0.5;
40
41 vec2 fxy = fract(texCoords);
42 texCoords -= fxy;
43
44 vec4 xcubic = cubic(fxy.x);
45 vec4 ycubic = cubic(fxy.y);
46
47 vec4 c = texCoords.xxyy + vec2(-0.5, +1.5).xyxy;
48
49 vec4 s = vec4(xcubic.xz + xcubic.yw, ycubic.xz + ycubic.yw);
50 vec4 offset = c + vec4(xcubic.yw, ycubic.yw) / s;
51
52 offset *= invTexSize.xxyy;
53
54 vec4 sample0 = texture(textureSampler, offset.xz);
55 vec4 sample1 = texture(textureSampler, offset.yz);
56 vec4 sample2 = texture(textureSampler, offset.xw);
57 vec4 sample3 = texture(textureSampler, offset.yw);
58
59 float sx = s.x / (s.x + s.y);
60 float sy = s.z / (s.z + s.w);
61
62 return mix(mix(sample3, sample2, sx), mix(sample1, sample0, sx), sy);
63}
64
65void main() {
66 color = vec4(textureBicubic(color_texture, frag_tex_coord).rgb, 1.0f);
67}
diff --git a/src/video_core/host_shaders/present_gaussian.frag b/src/video_core/host_shaders/present_gaussian.frag
new file mode 100644
index 000000000..66fed3238
--- /dev/null
+++ b/src/video_core/host_shaders/present_gaussian.frag
@@ -0,0 +1,70 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5// Code adapted from the following sources:
6// - https://learnopengl.com/Advanced-Lighting/Bloom
7// - https://www.rastergrid.com/blog/2010/09/efficient-gaussian-blur-with-linear-sampling/
8
9#version 460 core
10
11#ifdef VULKAN
12
13#define BINDING_COLOR_TEXTURE 1
14
15#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
16
17#define BINDING_COLOR_TEXTURE 0
18
19#endif
20
21layout(location = 0) in vec2 frag_tex_coord;
22
23layout(location = 0) out vec4 color;
24
25layout(binding = BINDING_COLOR_TEXTURE) uniform sampler2D color_texture;
26
27const float offset[3] = float[](0.0, 1.3846153846, 3.2307692308);
28const float weight[3] = float[](0.2270270270, 0.3162162162, 0.0702702703);
29
30vec4 blurVertical(sampler2D textureSampler, vec2 coord, vec2 norm) {
31 vec4 result = vec4(0.0f);
32 for (int i = 1; i < 3; i++) {
33 result += texture(textureSampler, vec2(coord) + (vec2(0.0, offset[i]) * norm)) * weight[i];
34 result += texture(textureSampler, vec2(coord) - (vec2(0.0, offset[i]) * norm)) * weight[i];
35 }
36 return result;
37}
38
39vec4 blurHorizontal(sampler2D textureSampler, vec2 coord, vec2 norm) {
40 vec4 result = vec4(0.0f);
41 for (int i = 1; i < 3; i++) {
42 result += texture(textureSampler, vec2(coord) + (vec2(offset[i], 0.0) * norm)) * weight[i];
43 result += texture(textureSampler, vec2(coord) - (vec2(offset[i], 0.0) * norm)) * weight[i];
44 }
45 return result;
46}
47
48vec4 blurDiagonal(sampler2D textureSampler, vec2 coord, vec2 norm) {
49 vec4 result = vec4(0.0f);
50 for (int i = 1; i < 3; i++) {
51 result +=
52 texture(textureSampler, vec2(coord) + (vec2(offset[i], offset[i]) * norm)) * weight[i];
53 result +=
54 texture(textureSampler, vec2(coord) - (vec2(offset[i], offset[i]) * norm)) * weight[i];
55 }
56 return result;
57}
58
59void main() {
60 vec3 base = texture(color_texture, vec2(frag_tex_coord)).rgb * weight[0];
61 vec2 tex_offset = 1.0f / textureSize(color_texture, 0);
62
63 // TODO(Blinkhawk): This code can be optimized through shader group instructions.
64 vec3 horizontal = blurHorizontal(color_texture, frag_tex_coord, tex_offset).rgb;
65 vec3 vertical = blurVertical(color_texture, frag_tex_coord, tex_offset).rgb;
66 vec3 diagonalA = blurDiagonal(color_texture, frag_tex_coord, tex_offset).rgb;
67 vec3 diagonalB = blurDiagonal(color_texture, frag_tex_coord, tex_offset * vec2(1.0, -1.0)).rgb;
68 vec3 combination = mix(mix(horizontal, vertical, 0.5f), mix(diagonalA, diagonalB, 0.5f), 0.5f);
69 color = vec4(combination + base, 1.0f);
70}
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.comp b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.comp
new file mode 100644
index 000000000..1c96a7905
--- /dev/null
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.comp
@@ -0,0 +1,11 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#version 460 core
6#extension GL_GOOGLE_include_directive : enable
7
8#define YUZU_USE_FP16
9#define USE_EASU 1
10
11#include "fidelityfx_fsr.comp"
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.comp b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.comp
new file mode 100644
index 000000000..f4daff739
--- /dev/null
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.comp
@@ -0,0 +1,10 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#version 460 core
6#extension GL_GOOGLE_include_directive : enable
7
8#define USE_EASU 1
9
10#include "fidelityfx_fsr.comp"
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.comp b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.comp
new file mode 100644
index 000000000..6b6796dd1
--- /dev/null
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.comp
@@ -0,0 +1,11 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#version 460 core
6#extension GL_GOOGLE_include_directive : enable
7
8#define YUZU_USE_FP16
9#define USE_RCAS 1
10
11#include "fidelityfx_fsr.comp"
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.comp b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.comp
new file mode 100644
index 000000000..f785eebf3
--- /dev/null
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.comp
@@ -0,0 +1,10 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#version 460 core
6#extension GL_GOOGLE_include_directive : enable
7
8#define USE_RCAS 1
9
10#include "fidelityfx_fsr.comp"
diff --git a/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag b/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag
new file mode 100644
index 000000000..924c03060
--- /dev/null
+++ b/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag
@@ -0,0 +1,7 @@
1#version 460
2
3#extension GL_GOOGLE_include_directive : enable
4
5#define YUZU_USE_FP16
6
7#include "opengl_present_scaleforce.frag"
diff --git a/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag b/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag
new file mode 100644
index 000000000..a594b83ca
--- /dev/null
+++ b/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag
@@ -0,0 +1,5 @@
1#version 460
2
3#extension GL_GOOGLE_include_directive : enable
4
5#include "opengl_present_scaleforce.frag"
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 187a28e4d..d4dd10bb6 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -5,6 +5,7 @@
5#include <algorithm> 5#include <algorithm>
6#include <span> 6#include <span>
7 7
8#include "shader_recompiler/backend/glasm/emit_glasm.h"
8#include "video_core/buffer_cache/buffer_cache.h" 9#include "video_core/buffer_cache/buffer_cache.h"
9#include "video_core/renderer_opengl/gl_buffer_cache.h" 10#include "video_core/renderer_opengl/gl_buffer_cache.h"
10#include "video_core/renderer_opengl/gl_device.h" 11#include "video_core/renderer_opengl/gl_device.h"
@@ -229,8 +230,10 @@ void BufferCacheRuntime::BindStorageBuffer(size_t stage, u32 binding_index, Buff
229 .padding = 0, 230 .padding = 0,
230 }; 231 };
231 buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY); 232 buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY);
232 glProgramLocalParametersI4uivNV(PROGRAM_LUT[stage], binding_index, 1, 233 glProgramLocalParametersI4uivNV(
233 reinterpret_cast<const GLuint*>(&ssbo)); 234 PROGRAM_LUT[stage],
235 Shader::Backend::GLASM::PROGRAM_LOCAL_PARAMETER_STORAGE_BUFFER_BASE + binding_index, 1,
236 reinterpret_cast<const GLuint*>(&ssbo));
234 } 237 }
235} 238}
236 239
@@ -250,8 +253,10 @@ void BufferCacheRuntime::BindComputeStorageBuffer(u32 binding_index, Buffer& buf
250 .padding = 0, 253 .padding = 0,
251 }; 254 };
252 buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY); 255 buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY);
253 glProgramLocalParametersI4uivNV(GL_COMPUTE_PROGRAM_NV, binding_index, 1, 256 glProgramLocalParametersI4uivNV(
254 reinterpret_cast<const GLuint*>(&ssbo)); 257 GL_COMPUTE_PROGRAM_NV,
258 Shader::Backend::GLASM::PROGRAM_LOCAL_PARAMETER_STORAGE_BUFFER_BASE + binding_index, 1,
259 reinterpret_cast<const GLuint*>(&ssbo));
255 } 260 }
256} 261}
257 262
diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
index aa1cc592f..5c1f21c65 100644
--- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
+++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
@@ -19,15 +19,6 @@ using VideoCommon::ImageId;
19constexpr u32 MAX_TEXTURES = 64; 19constexpr u32 MAX_TEXTURES = 64;
20constexpr u32 MAX_IMAGES = 16; 20constexpr u32 MAX_IMAGES = 16;
21 21
22template <typename Range>
23u32 AccumulateCount(const Range& range) {
24 u32 num{};
25 for (const auto& desc : range) {
26 num += desc.count;
27 }
28 return num;
29}
30
31size_t ComputePipelineKey::Hash() const noexcept { 22size_t ComputePipelineKey::Hash() const noexcept {
32 return static_cast<size_t>( 23 return static_cast<size_t>(
33 Common::CityHash64(reinterpret_cast<const char*>(this), sizeof *this)); 24 Common::CityHash64(reinterpret_cast<const char*>(this), sizeof *this));
@@ -58,17 +49,17 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac
58 std::copy_n(info.constant_buffer_used_sizes.begin(), uniform_buffer_sizes.size(), 49 std::copy_n(info.constant_buffer_used_sizes.begin(), uniform_buffer_sizes.size(),
59 uniform_buffer_sizes.begin()); 50 uniform_buffer_sizes.begin());
60 51
61 num_texture_buffers = AccumulateCount(info.texture_buffer_descriptors); 52 num_texture_buffers = Shader::NumDescriptors(info.texture_buffer_descriptors);
62 num_image_buffers = AccumulateCount(info.image_buffer_descriptors); 53 num_image_buffers = Shader::NumDescriptors(info.image_buffer_descriptors);
63 54
64 const u32 num_textures{num_texture_buffers + AccumulateCount(info.texture_descriptors)}; 55 const u32 num_textures{num_texture_buffers + Shader::NumDescriptors(info.texture_descriptors)};
65 ASSERT(num_textures <= MAX_TEXTURES); 56 ASSERT(num_textures <= MAX_TEXTURES);
66 57
67 const u32 num_images{num_image_buffers + AccumulateCount(info.image_descriptors)}; 58 const u32 num_images{num_image_buffers + Shader::NumDescriptors(info.image_descriptors)};
68 ASSERT(num_images <= MAX_IMAGES); 59 ASSERT(num_images <= MAX_IMAGES);
69 60
70 const bool is_glasm{assembly_program.handle != 0}; 61 const bool is_glasm{assembly_program.handle != 0};
71 const u32 num_storage_buffers{AccumulateCount(info.storage_buffers_descriptors)}; 62 const u32 num_storage_buffers{Shader::NumDescriptors(info.storage_buffers_descriptors)};
72 use_storage_buffers = 63 use_storage_buffers =
73 !is_glasm || num_storage_buffers < device.GetMaxGLASMStorageBufferBlocks(); 64 !is_glasm || num_storage_buffers < device.GetMaxGLASMStorageBufferBlocks();
74 writes_global_memory = !use_storage_buffers && 65 writes_global_memory = !use_storage_buffers &&
@@ -88,8 +79,7 @@ void ComputePipeline::Configure() {
88 } 79 }
89 texture_cache.SynchronizeComputeDescriptors(); 80 texture_cache.SynchronizeComputeDescriptors();
90 81
91 std::array<ImageViewId, MAX_TEXTURES + MAX_IMAGES> image_view_ids; 82 boost::container::static_vector<VideoCommon::ImageViewInOut, MAX_TEXTURES + MAX_IMAGES> views;
92 boost::container::static_vector<u32, MAX_TEXTURES + MAX_IMAGES> image_view_indices;
93 std::array<GLuint, MAX_TEXTURES> samplers; 83 std::array<GLuint, MAX_TEXTURES> samplers;
94 std::array<GLuint, MAX_TEXTURES> textures; 84 std::array<GLuint, MAX_TEXTURES> textures;
95 std::array<GLuint, MAX_IMAGES> images; 85 std::array<GLuint, MAX_IMAGES> images;
@@ -119,33 +109,39 @@ void ComputePipeline::Configure() {
119 } 109 }
120 return TexturePair(gpu_memory.Read<u32>(addr), via_header_index); 110 return TexturePair(gpu_memory.Read<u32>(addr), via_header_index);
121 }}; 111 }};
122 const auto add_image{[&](const auto& desc) { 112 const auto add_image{[&](const auto& desc, bool blacklist) {
123 for (u32 index = 0; index < desc.count; ++index) { 113 for (u32 index = 0; index < desc.count; ++index) {
124 const auto handle{read_handle(desc, index)}; 114 const auto handle{read_handle(desc, index)};
125 image_view_indices.push_back(handle.first); 115 views.push_back({
116 .index = handle.first,
117 .blacklist = blacklist,
118 .id = {},
119 });
126 } 120 }
127 }}; 121 }};
128 for (const auto& desc : info.texture_buffer_descriptors) { 122 for (const auto& desc : info.texture_buffer_descriptors) {
129 for (u32 index = 0; index < desc.count; ++index) { 123 for (u32 index = 0; index < desc.count; ++index) {
130 const auto handle{read_handle(desc, index)}; 124 const auto handle{read_handle(desc, index)};
131 image_view_indices.push_back(handle.first); 125 views.push_back({handle.first});
132 samplers[sampler_binding++] = 0; 126 samplers[sampler_binding++] = 0;
133 } 127 }
134 } 128 }
135 std::ranges::for_each(info.image_buffer_descriptors, add_image); 129 for (const auto& desc : info.image_buffer_descriptors) {
130 add_image(desc, false);
131 }
136 for (const auto& desc : info.texture_descriptors) { 132 for (const auto& desc : info.texture_descriptors) {
137 for (u32 index = 0; index < desc.count; ++index) { 133 for (u32 index = 0; index < desc.count; ++index) {
138 const auto handle{read_handle(desc, index)}; 134 const auto handle{read_handle(desc, index)};
139 image_view_indices.push_back(handle.first); 135 views.push_back({handle.first});
140 136
141 Sampler* const sampler = texture_cache.GetComputeSampler(handle.second); 137 Sampler* const sampler = texture_cache.GetComputeSampler(handle.second);
142 samplers[sampler_binding++] = sampler->Handle(); 138 samplers[sampler_binding++] = sampler->Handle();
143 } 139 }
144 } 140 }
145 std::ranges::for_each(info.image_descriptors, add_image); 141 for (const auto& desc : info.image_descriptors) {
146 142 add_image(desc, desc.is_written);
147 const std::span indices_span(image_view_indices.data(), image_view_indices.size()); 143 }
148 texture_cache.FillComputeImageViews(indices_span, image_view_ids); 144 texture_cache.FillComputeImageViews(std::span(views.data(), views.size()));
149 145
150 if (assembly_program.handle != 0) { 146 if (assembly_program.handle != 0) {
151 program_manager.BindComputeAssemblyProgram(assembly_program.handle); 147 program_manager.BindComputeAssemblyProgram(assembly_program.handle);
@@ -161,7 +157,7 @@ void ComputePipeline::Configure() {
161 if constexpr (is_image) { 157 if constexpr (is_image) {
162 is_written = desc.is_written; 158 is_written = desc.is_written;
163 } 159 }
164 ImageView& image_view{texture_cache.GetImageView(image_view_ids[texbuf_index])}; 160 ImageView& image_view{texture_cache.GetImageView(views[texbuf_index].id)};
165 buffer_cache.BindComputeTextureBuffer(texbuf_index, image_view.GpuAddr(), 161 buffer_cache.BindComputeTextureBuffer(texbuf_index, image_view.GpuAddr(),
166 image_view.BufferSize(), image_view.format, 162 image_view.BufferSize(), image_view.format,
167 is_written, is_image); 163 is_written, is_image);
@@ -177,23 +173,45 @@ void ComputePipeline::Configure() {
177 buffer_cache.runtime.SetImagePointers(textures.data(), images.data()); 173 buffer_cache.runtime.SetImagePointers(textures.data(), images.data());
178 buffer_cache.BindHostComputeBuffers(); 174 buffer_cache.BindHostComputeBuffers();
179 175
180 const ImageId* views_it{image_view_ids.data() + num_texture_buffers + num_image_buffers}; 176 const VideoCommon::ImageViewInOut* views_it{views.data() + num_texture_buffers +
177 num_image_buffers};
181 texture_binding += num_texture_buffers; 178 texture_binding += num_texture_buffers;
182 image_binding += num_image_buffers; 179 image_binding += num_image_buffers;
183 180
181 u32 texture_scaling_mask{};
184 for (const auto& desc : info.texture_descriptors) { 182 for (const auto& desc : info.texture_descriptors) {
185 for (u32 index = 0; index < desc.count; ++index) { 183 for (u32 index = 0; index < desc.count; ++index) {
186 ImageView& image_view{texture_cache.GetImageView(*(views_it++))}; 184 ImageView& image_view{texture_cache.GetImageView((views_it++)->id)};
187 textures[texture_binding++] = image_view.Handle(desc.type); 185 textures[texture_binding] = image_view.Handle(desc.type);
186 if (texture_cache.IsRescaling(image_view)) {
187 texture_scaling_mask |= 1u << texture_binding;
188 }
189 ++texture_binding;
188 } 190 }
189 } 191 }
192 u32 image_scaling_mask{};
190 for (const auto& desc : info.image_descriptors) { 193 for (const auto& desc : info.image_descriptors) {
191 for (u32 index = 0; index < desc.count; ++index) { 194 for (u32 index = 0; index < desc.count; ++index) {
192 ImageView& image_view{texture_cache.GetImageView(*(views_it++))}; 195 ImageView& image_view{texture_cache.GetImageView((views_it++)->id)};
193 if (desc.is_written) { 196 if (desc.is_written) {
194 texture_cache.MarkModification(image_view.image_id); 197 texture_cache.MarkModification(image_view.image_id);
195 } 198 }
196 images[image_binding++] = image_view.StorageView(desc.type, desc.format); 199 images[image_binding] = image_view.StorageView(desc.type, desc.format);
200 if (texture_cache.IsRescaling(image_view)) {
201 image_scaling_mask |= 1u << image_binding;
202 }
203 ++image_binding;
204 }
205 }
206 if (info.uses_rescaling_uniform) {
207 const f32 float_texture_scaling_mask{Common::BitCast<f32>(texture_scaling_mask)};
208 const f32 float_image_scaling_mask{Common::BitCast<f32>(image_scaling_mask)};
209 if (assembly_program.handle != 0) {
210 glProgramLocalParameter4fARB(GL_COMPUTE_PROGRAM_NV, 0, float_texture_scaling_mask,
211 float_image_scaling_mask, 0.0f, 0.0f);
212 } else {
213 glProgramUniform4f(source_program.handle, 0, float_texture_scaling_mask,
214 float_image_scaling_mask, 0.0f, 0.0f);
197 } 215 }
198 } 216 }
199 if (texture_binding != 0) { 217 if (texture_binding != 0) {
diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
index bccb37a58..f8495896c 100644
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
@@ -15,7 +15,7 @@
15#include "video_core/renderer_opengl/gl_shader_util.h" 15#include "video_core/renderer_opengl/gl_shader_util.h"
16#include "video_core/renderer_opengl/gl_state_tracker.h" 16#include "video_core/renderer_opengl/gl_state_tracker.h"
17#include "video_core/shader_notify.h" 17#include "video_core/shader_notify.h"
18#include "video_core/texture_cache/texture_cache_base.h" 18#include "video_core/texture_cache/texture_cache.h"
19 19
20#if defined(_MSC_VER) && defined(NDEBUG) 20#if defined(_MSC_VER) && defined(NDEBUG)
21#define LAMBDA_FORCEINLINE [[msvc::forceinline]] 21#define LAMBDA_FORCEINLINE [[msvc::forceinline]]
@@ -27,6 +27,7 @@ namespace OpenGL {
27namespace { 27namespace {
28using Shader::ImageBufferDescriptor; 28using Shader::ImageBufferDescriptor;
29using Shader::ImageDescriptor; 29using Shader::ImageDescriptor;
30using Shader::NumDescriptors;
30using Shader::TextureBufferDescriptor; 31using Shader::TextureBufferDescriptor;
31using Shader::TextureDescriptor; 32using Shader::TextureDescriptor;
32using Tegra::Texture::TexturePair; 33using Tegra::Texture::TexturePair;
@@ -35,15 +36,6 @@ using VideoCommon::ImageId;
35constexpr u32 MAX_TEXTURES = 64; 36constexpr u32 MAX_TEXTURES = 64;
36constexpr u32 MAX_IMAGES = 8; 37constexpr u32 MAX_IMAGES = 8;
37 38
38template <typename Range>
39u32 AccumulateCount(const Range& range) {
40 u32 num{};
41 for (const auto& desc : range) {
42 num += desc.count;
43 }
44 return num;
45}
46
47GLenum Stage(size_t stage_index) { 39GLenum Stage(size_t stage_index) {
48 switch (stage_index) { 40 switch (stage_index) {
49 case 0: 41 case 0:
@@ -204,23 +196,23 @@ GraphicsPipeline::GraphicsPipeline(
204 base_uniform_bindings[stage + 1] = base_uniform_bindings[stage]; 196 base_uniform_bindings[stage + 1] = base_uniform_bindings[stage];
205 base_storage_bindings[stage + 1] = base_storage_bindings[stage]; 197 base_storage_bindings[stage + 1] = base_storage_bindings[stage];
206 198
207 base_uniform_bindings[stage + 1] += AccumulateCount(info.constant_buffer_descriptors); 199 base_uniform_bindings[stage + 1] += NumDescriptors(info.constant_buffer_descriptors);
208 base_storage_bindings[stage + 1] += AccumulateCount(info.storage_buffers_descriptors); 200 base_storage_bindings[stage + 1] += NumDescriptors(info.storage_buffers_descriptors);
209 } 201 }
210 enabled_uniform_buffer_masks[stage] = info.constant_buffer_mask; 202 enabled_uniform_buffer_masks[stage] = info.constant_buffer_mask;
211 std::ranges::copy(info.constant_buffer_used_sizes, uniform_buffer_sizes[stage].begin()); 203 std::ranges::copy(info.constant_buffer_used_sizes, uniform_buffer_sizes[stage].begin());
212 204
213 const u32 num_tex_buffer_bindings{AccumulateCount(info.texture_buffer_descriptors)}; 205 const u32 num_tex_buffer_bindings{NumDescriptors(info.texture_buffer_descriptors)};
214 num_texture_buffers[stage] += num_tex_buffer_bindings; 206 num_texture_buffers[stage] += num_tex_buffer_bindings;
215 num_textures += num_tex_buffer_bindings; 207 num_textures += num_tex_buffer_bindings;
216 208
217 const u32 num_img_buffers_bindings{AccumulateCount(info.image_buffer_descriptors)}; 209 const u32 num_img_buffers_bindings{NumDescriptors(info.image_buffer_descriptors)};
218 num_image_buffers[stage] += num_img_buffers_bindings; 210 num_image_buffers[stage] += num_img_buffers_bindings;
219 num_images += num_img_buffers_bindings; 211 num_images += num_img_buffers_bindings;
220 212
221 num_textures += AccumulateCount(info.texture_descriptors); 213 num_textures += NumDescriptors(info.texture_descriptors);
222 num_images += AccumulateCount(info.image_descriptors); 214 num_images += NumDescriptors(info.image_descriptors);
223 num_storage_buffers += AccumulateCount(info.storage_buffers_descriptors); 215 num_storage_buffers += NumDescriptors(info.storage_buffers_descriptors);
224 216
225 writes_global_memory |= std::ranges::any_of( 217 writes_global_memory |= std::ranges::any_of(
226 info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); 218 info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; });
@@ -288,10 +280,9 @@ GraphicsPipeline::GraphicsPipeline(
288 280
289template <typename Spec> 281template <typename Spec>
290void GraphicsPipeline::ConfigureImpl(bool is_indexed) { 282void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
291 std::array<ImageId, MAX_TEXTURES + MAX_IMAGES> image_view_ids; 283 std::array<VideoCommon::ImageViewInOut, MAX_TEXTURES + MAX_IMAGES> views;
292 std::array<u32, MAX_TEXTURES + MAX_IMAGES> image_view_indices;
293 std::array<GLuint, MAX_TEXTURES> samplers; 284 std::array<GLuint, MAX_TEXTURES> samplers;
294 size_t image_view_index{}; 285 size_t views_index{};
295 GLsizei sampler_binding{}; 286 GLsizei sampler_binding{};
296 287
297 texture_cache.SynchronizeGraphicsDescriptors(); 288 texture_cache.SynchronizeGraphicsDescriptors();
@@ -336,30 +327,34 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
336 } 327 }
337 return TexturePair(gpu_memory.Read<u32>(addr), via_header_index); 328 return TexturePair(gpu_memory.Read<u32>(addr), via_header_index);
338 }}; 329 }};
339 const auto add_image{[&](const auto& desc) { 330 const auto add_image{[&](const auto& desc, bool blacklist) LAMBDA_FORCEINLINE {
340 for (u32 index = 0; index < desc.count; ++index) { 331 for (u32 index = 0; index < desc.count; ++index) {
341 const auto handle{read_handle(desc, index)}; 332 const auto handle{read_handle(desc, index)};
342 image_view_indices[image_view_index++] = handle.first; 333 views[views_index++] = {
334 .index = handle.first,
335 .blacklist = blacklist,
336 .id = {},
337 };
343 } 338 }
344 }}; 339 }};
345 if constexpr (Spec::has_texture_buffers) { 340 if constexpr (Spec::has_texture_buffers) {
346 for (const auto& desc : info.texture_buffer_descriptors) { 341 for (const auto& desc : info.texture_buffer_descriptors) {
347 for (u32 index = 0; index < desc.count; ++index) { 342 for (u32 index = 0; index < desc.count; ++index) {
348 const auto handle{read_handle(desc, index)}; 343 const auto handle{read_handle(desc, index)};
349 image_view_indices[image_view_index++] = handle.first; 344 views[views_index++] = {handle.first};
350 samplers[sampler_binding++] = 0; 345 samplers[sampler_binding++] = 0;
351 } 346 }
352 } 347 }
353 } 348 }
354 if constexpr (Spec::has_image_buffers) { 349 if constexpr (Spec::has_image_buffers) {
355 for (const auto& desc : info.image_buffer_descriptors) { 350 for (const auto& desc : info.image_buffer_descriptors) {
356 add_image(desc); 351 add_image(desc, false);
357 } 352 }
358 } 353 }
359 for (const auto& desc : info.texture_descriptors) { 354 for (const auto& desc : info.texture_descriptors) {
360 for (u32 index = 0; index < desc.count; ++index) { 355 for (u32 index = 0; index < desc.count; ++index) {
361 const auto handle{read_handle(desc, index)}; 356 const auto handle{read_handle(desc, index)};
362 image_view_indices[image_view_index++] = handle.first; 357 views[views_index++] = {handle.first};
363 358
364 Sampler* const sampler{texture_cache.GetGraphicsSampler(handle.second)}; 359 Sampler* const sampler{texture_cache.GetGraphicsSampler(handle.second)};
365 samplers[sampler_binding++] = sampler->Handle(); 360 samplers[sampler_binding++] = sampler->Handle();
@@ -367,7 +362,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
367 } 362 }
368 if constexpr (Spec::has_images) { 363 if constexpr (Spec::has_images) {
369 for (const auto& desc : info.image_descriptors) { 364 for (const auto& desc : info.image_descriptors) {
370 add_image(desc); 365 add_image(desc, desc.is_written);
371 } 366 }
372 } 367 }
373 }}; 368 }};
@@ -386,13 +381,12 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
386 if constexpr (Spec::enabled_stages[4]) { 381 if constexpr (Spec::enabled_stages[4]) {
387 config_stage(4); 382 config_stage(4);
388 } 383 }
389 const std::span indices_span(image_view_indices.data(), image_view_index); 384 texture_cache.FillGraphicsImageViews<Spec::has_images>(std::span(views.data(), views_index));
390 texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);
391 385
392 texture_cache.UpdateRenderTargets(false); 386 texture_cache.UpdateRenderTargets(false);
393 state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); 387 state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
394 388
395 ImageId* texture_buffer_index{image_view_ids.data()}; 389 VideoCommon::ImageViewInOut* texture_buffer_it{views.data()};
396 const auto bind_stage_info{[&](size_t stage) LAMBDA_FORCEINLINE { 390 const auto bind_stage_info{[&](size_t stage) LAMBDA_FORCEINLINE {
397 size_t index{}; 391 size_t index{};
398 const auto add_buffer{[&](const auto& desc) { 392 const auto add_buffer{[&](const auto& desc) {
@@ -402,12 +396,12 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
402 if constexpr (is_image) { 396 if constexpr (is_image) {
403 is_written = desc.is_written; 397 is_written = desc.is_written;
404 } 398 }
405 ImageView& image_view{texture_cache.GetImageView(*texture_buffer_index)}; 399 ImageView& image_view{texture_cache.GetImageView(texture_buffer_it->id)};
406 buffer_cache.BindGraphicsTextureBuffer(stage, index, image_view.GpuAddr(), 400 buffer_cache.BindGraphicsTextureBuffer(stage, index, image_view.GpuAddr(),
407 image_view.BufferSize(), image_view.format, 401 image_view.BufferSize(), image_view.format,
408 is_written, is_image); 402 is_written, is_image);
409 ++index; 403 ++index;
410 ++texture_buffer_index; 404 ++texture_buffer_it;
411 } 405 }
412 }}; 406 }};
413 const Shader::Info& info{stage_infos[stage]}; 407 const Shader::Info& info{stage_infos[stage]};
@@ -423,13 +417,9 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
423 add_buffer(desc); 417 add_buffer(desc);
424 } 418 }
425 } 419 }
426 for (const auto& desc : info.texture_descriptors) { 420 texture_buffer_it += Shader::NumDescriptors(info.texture_descriptors);
427 texture_buffer_index += desc.count;
428 }
429 if constexpr (Spec::has_images) { 421 if constexpr (Spec::has_images) {
430 for (const auto& desc : info.image_descriptors) { 422 texture_buffer_it += Shader::NumDescriptors(info.image_descriptors);
431 texture_buffer_index += desc.count;
432 }
433 } 423 }
434 }}; 424 }};
435 if constexpr (Spec::enabled_stages[0]) { 425 if constexpr (Spec::enabled_stages[0]) {
@@ -453,12 +443,13 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
453 if (!is_built.load(std::memory_order::relaxed)) { 443 if (!is_built.load(std::memory_order::relaxed)) {
454 WaitForBuild(); 444 WaitForBuild();
455 } 445 }
456 if (assembly_programs[0].handle != 0) { 446 const bool use_assembly{assembly_programs[0].handle != 0};
447 if (use_assembly) {
457 program_manager.BindAssemblyPrograms(assembly_programs, enabled_stages_mask); 448 program_manager.BindAssemblyPrograms(assembly_programs, enabled_stages_mask);
458 } else { 449 } else {
459 program_manager.BindSourcePrograms(source_programs); 450 program_manager.BindSourcePrograms(source_programs);
460 } 451 }
461 const ImageId* views_it{image_view_ids.data()}; 452 const VideoCommon::ImageViewInOut* views_it{views.data()};
462 GLsizei texture_binding = 0; 453 GLsizei texture_binding = 0;
463 GLsizei image_binding = 0; 454 GLsizei image_binding = 0;
464 std::array<GLuint, MAX_TEXTURES> textures; 455 std::array<GLuint, MAX_TEXTURES> textures;
@@ -473,20 +464,49 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
473 views_it += num_texture_buffers[stage]; 464 views_it += num_texture_buffers[stage];
474 views_it += num_image_buffers[stage]; 465 views_it += num_image_buffers[stage];
475 466
467 u32 texture_scaling_mask{};
468 u32 image_scaling_mask{};
469 u32 stage_texture_binding{};
470 u32 stage_image_binding{};
471
476 const auto& info{stage_infos[stage]}; 472 const auto& info{stage_infos[stage]};
477 for (const auto& desc : info.texture_descriptors) { 473 for (const auto& desc : info.texture_descriptors) {
478 for (u32 index = 0; index < desc.count; ++index) { 474 for (u32 index = 0; index < desc.count; ++index) {
479 ImageView& image_view{texture_cache.GetImageView(*(views_it++))}; 475 ImageView& image_view{texture_cache.GetImageView((views_it++)->id)};
480 textures[texture_binding++] = image_view.Handle(desc.type); 476 textures[texture_binding] = image_view.Handle(desc.type);
477 if (texture_cache.IsRescaling(image_view)) {
478 texture_scaling_mask |= 1u << stage_texture_binding;
479 }
480 ++texture_binding;
481 ++stage_texture_binding;
481 } 482 }
482 } 483 }
483 for (const auto& desc : info.image_descriptors) { 484 for (const auto& desc : info.image_descriptors) {
484 for (u32 index = 0; index < desc.count; ++index) { 485 for (u32 index = 0; index < desc.count; ++index) {
485 ImageView& image_view{texture_cache.GetImageView(*(views_it++))}; 486 ImageView& image_view{texture_cache.GetImageView((views_it++)->id)};
486 if (desc.is_written) { 487 if (desc.is_written) {
487 texture_cache.MarkModification(image_view.image_id); 488 texture_cache.MarkModification(image_view.image_id);
488 } 489 }
489 images[image_binding++] = image_view.StorageView(desc.type, desc.format); 490 images[image_binding] = image_view.StorageView(desc.type, desc.format);
491 if (texture_cache.IsRescaling(image_view)) {
492 image_scaling_mask |= 1u << stage_image_binding;
493 }
494 ++image_binding;
495 ++stage_image_binding;
496 }
497 }
498 if (info.uses_rescaling_uniform) {
499 const f32 float_texture_scaling_mask{Common::BitCast<f32>(texture_scaling_mask)};
500 const f32 float_image_scaling_mask{Common::BitCast<f32>(image_scaling_mask)};
501 const bool is_rescaling{texture_cache.IsRescaling()};
502 const f32 config_down_factor{Settings::values.resolution_info.down_factor};
503 const f32 down_factor{is_rescaling ? config_down_factor : 1.0f};
504 if (use_assembly) {
505 glProgramLocalParameter4fARB(AssemblyStage(stage), 0, float_texture_scaling_mask,
506 float_image_scaling_mask, down_factor, 0.0f);
507 } else {
508 glProgramUniform4f(source_programs[stage].handle, 0, float_texture_scaling_mask,
509 float_image_scaling_mask, down_factor, 0.0f);
490 } 510 }
491 } 511 }
492 }}; 512 }};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index a6d9f7c43..9b516c64f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -184,6 +184,10 @@ void RasterizerOpenGL::Clear() {
184 SyncRasterizeEnable(); 184 SyncRasterizeEnable();
185 SyncStencilTestState(); 185 SyncStencilTestState();
186 186
187 std::scoped_lock lock{texture_cache.mutex};
188 texture_cache.UpdateRenderTargets(true);
189 state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
190 SyncViewport();
187 if (regs.clear_flags.scissor) { 191 if (regs.clear_flags.scissor) {
188 SyncScissorTest(); 192 SyncScissorTest();
189 } else { 193 } else {
@@ -192,10 +196,6 @@ void RasterizerOpenGL::Clear() {
192 } 196 }
193 UNIMPLEMENTED_IF(regs.clear_flags.viewport); 197 UNIMPLEMENTED_IF(regs.clear_flags.viewport);
194 198
195 std::scoped_lock lock{texture_cache.mutex};
196 texture_cache.UpdateRenderTargets(true);
197 state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
198
199 if (use_color) { 199 if (use_color) {
200 glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); 200 glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
201 } 201 }
@@ -214,8 +214,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
214 214
215 query_cache.UpdateCounters(); 215 query_cache.UpdateCounters();
216 216
217 SyncState();
218
219 GraphicsPipeline* const pipeline{shader_cache.CurrentGraphicsPipeline()}; 217 GraphicsPipeline* const pipeline{shader_cache.CurrentGraphicsPipeline()};
220 if (!pipeline) { 218 if (!pipeline) {
221 return; 219 return;
@@ -223,6 +221,8 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
223 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; 221 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
224 pipeline->Configure(is_indexed); 222 pipeline->Configure(is_indexed);
225 223
224 SyncState();
225
226 const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology); 226 const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology);
227 BeginTransformFeedback(pipeline, primitive_mode); 227 BeginTransformFeedback(pipeline, primitive_mode);
228 228
@@ -533,7 +533,8 @@ void RasterizerOpenGL::SyncViewport() {
533 auto& flags = maxwell3d.dirty.flags; 533 auto& flags = maxwell3d.dirty.flags;
534 const auto& regs = maxwell3d.regs; 534 const auto& regs = maxwell3d.regs;
535 535
536 const bool dirty_viewport = flags[Dirty::Viewports]; 536 const bool rescale_viewports = flags[VideoCommon::Dirty::RescaleViewports];
537 const bool dirty_viewport = flags[Dirty::Viewports] || rescale_viewports;
537 const bool dirty_clip_control = flags[Dirty::ClipControl]; 538 const bool dirty_clip_control = flags[Dirty::ClipControl];
538 539
539 if (dirty_clip_control || flags[Dirty::FrontFace]) { 540 if (dirty_clip_control || flags[Dirty::FrontFace]) {
@@ -553,8 +554,7 @@ void RasterizerOpenGL::SyncViewport() {
553 } 554 }
554 glFrontFace(mode); 555 glFrontFace(mode);
555 } 556 }
556 557 if (dirty_viewport || dirty_clip_control) {
557 if (dirty_viewport || flags[Dirty::ClipControl]) {
558 flags[Dirty::ClipControl] = false; 558 flags[Dirty::ClipControl] = false;
559 559
560 bool flip_y = false; 560 bool flip_y = false;
@@ -570,37 +570,58 @@ void RasterizerOpenGL::SyncViewport() {
570 state_tracker.ClipControl(origin, depth); 570 state_tracker.ClipControl(origin, depth);
571 state_tracker.SetYNegate(regs.screen_y_control.y_negate != 0); 571 state_tracker.SetYNegate(regs.screen_y_control.y_negate != 0);
572 } 572 }
573 const bool is_rescaling{texture_cache.IsRescaling()};
574 const float scale = is_rescaling ? Settings::values.resolution_info.up_factor : 1.0f;
575 const auto conv = [scale](float value) -> GLfloat {
576 float new_value = value * scale;
577 if (scale < 1.0f) {
578 const bool sign = std::signbit(value);
579 new_value = std::round(std::abs(new_value));
580 new_value = sign ? -new_value : new_value;
581 }
582 return static_cast<GLfloat>(new_value);
583 };
573 584
574 if (dirty_viewport) { 585 if (dirty_viewport) {
575 flags[Dirty::Viewports] = false; 586 flags[Dirty::Viewports] = false;
576 587
577 const bool force = flags[Dirty::ViewportTransform]; 588 const bool force = flags[Dirty::ViewportTransform] || rescale_viewports;
578 flags[Dirty::ViewportTransform] = false; 589 flags[Dirty::ViewportTransform] = false;
590 flags[VideoCommon::Dirty::RescaleViewports] = false;
579 591
580 for (std::size_t i = 0; i < Maxwell::NumViewports; ++i) { 592 for (size_t index = 0; index < Maxwell::NumViewports; ++index) {
581 if (!force && !flags[Dirty::Viewport0 + i]) { 593 if (!force && !flags[Dirty::Viewport0 + index]) {
582 continue; 594 continue;
583 } 595 }
584 flags[Dirty::Viewport0 + i] = false; 596 flags[Dirty::Viewport0 + index] = false;
585 597
586 const auto& src = regs.viewport_transform[i]; 598 const auto& src = regs.viewport_transform[index];
587 const Common::Rectangle<f32> rect{src.GetRect()}; 599 GLfloat x = conv(src.translate_x - src.scale_x);
588 glViewportIndexedf(static_cast<GLuint>(i), rect.left, rect.bottom, rect.GetWidth(), 600 GLfloat y = conv(src.translate_y - src.scale_y);
589 rect.GetHeight()); 601 GLfloat width = conv(src.scale_x * 2.0f);
602 GLfloat height = conv(src.scale_y * 2.0f);
603
604 if (height < 0) {
605 y += height;
606 height = -height;
607 }
608 glViewportIndexedf(static_cast<GLuint>(index), x, y, width != 0.0f ? width : 1.0f,
609 height != 0.0f ? height : 1.0f);
590 610
591 const GLdouble reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne; 611 const GLdouble reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne;
592 const GLdouble near_depth = src.translate_z - src.scale_z * reduce_z; 612 const GLdouble near_depth = src.translate_z - src.scale_z * reduce_z;
593 const GLdouble far_depth = src.translate_z + src.scale_z; 613 const GLdouble far_depth = src.translate_z + src.scale_z;
594 if (device.HasDepthBufferFloat()) { 614 if (device.HasDepthBufferFloat()) {
595 glDepthRangeIndexeddNV(static_cast<GLuint>(i), near_depth, far_depth); 615 glDepthRangeIndexeddNV(static_cast<GLuint>(index), near_depth, far_depth);
596 } else { 616 } else {
597 glDepthRangeIndexed(static_cast<GLuint>(i), near_depth, far_depth); 617 glDepthRangeIndexed(static_cast<GLuint>(index), near_depth, far_depth);
598 } 618 }
599 619
600 if (!GLAD_GL_NV_viewport_swizzle) { 620 if (!GLAD_GL_NV_viewport_swizzle) {
601 continue; 621 continue;
602 } 622 }
603 glViewportSwizzleNV(static_cast<GLuint>(i), MaxwellToGL::ViewportSwizzle(src.swizzle.x), 623 glViewportSwizzleNV(static_cast<GLuint>(index),
624 MaxwellToGL::ViewportSwizzle(src.swizzle.x),
604 MaxwellToGL::ViewportSwizzle(src.swizzle.y), 625 MaxwellToGL::ViewportSwizzle(src.swizzle.y),
605 MaxwellToGL::ViewportSwizzle(src.swizzle.z), 626 MaxwellToGL::ViewportSwizzle(src.swizzle.z),
606 MaxwellToGL::ViewportSwizzle(src.swizzle.w)); 627 MaxwellToGL::ViewportSwizzle(src.swizzle.w));
@@ -903,14 +924,34 @@ void RasterizerOpenGL::SyncLogicOpState() {
903 924
904void RasterizerOpenGL::SyncScissorTest() { 925void RasterizerOpenGL::SyncScissorTest() {
905 auto& flags = maxwell3d.dirty.flags; 926 auto& flags = maxwell3d.dirty.flags;
906 if (!flags[Dirty::Scissors]) { 927 if (!flags[Dirty::Scissors] && !flags[VideoCommon::Dirty::RescaleScissors]) {
907 return; 928 return;
908 } 929 }
909 flags[Dirty::Scissors] = false; 930 flags[Dirty::Scissors] = false;
910 931
932 const bool force = flags[VideoCommon::Dirty::RescaleScissors];
933 flags[VideoCommon::Dirty::RescaleScissors] = false;
934
911 const auto& regs = maxwell3d.regs; 935 const auto& regs = maxwell3d.regs;
936
937 const auto& resolution = Settings::values.resolution_info;
938 const bool is_rescaling{texture_cache.IsRescaling()};
939 const u32 up_scale = is_rescaling ? resolution.up_scale : 1U;
940 const u32 down_shift = is_rescaling ? resolution.down_shift : 0U;
941 const auto scale_up = [up_scale, down_shift](u32 value) -> u32 {
942 if (value == 0) {
943 return 0U;
944 }
945 const u32 upset = value * up_scale;
946 u32 acumm{};
947 if ((up_scale >> down_shift) == 0) {
948 acumm = upset % 2;
949 }
950 const u32 converted_value = upset >> down_shift;
951 return std::max<u32>(converted_value + acumm, 1U);
952 };
912 for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { 953 for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) {
913 if (!flags[Dirty::Scissor0 + index]) { 954 if (!force && !flags[Dirty::Scissor0 + index]) {
914 continue; 955 continue;
915 } 956 }
916 flags[Dirty::Scissor0 + index] = false; 957 flags[Dirty::Scissor0 + index] = false;
@@ -918,8 +959,8 @@ void RasterizerOpenGL::SyncScissorTest() {
918 const auto& src = regs.scissor_test[index]; 959 const auto& src = regs.scissor_test[index];
919 if (src.enable) { 960 if (src.enable) {
920 glEnablei(GL_SCISSOR_TEST, static_cast<GLuint>(index)); 961 glEnablei(GL_SCISSOR_TEST, static_cast<GLuint>(index));
921 glScissorIndexed(static_cast<GLuint>(index), src.min_x, src.min_y, 962 glScissorIndexed(static_cast<GLuint>(index), scale_up(src.min_x), scale_up(src.min_y),
922 src.max_x - src.min_x, src.max_y - src.min_y); 963 scale_up(src.max_x - src.min_x), scale_up(src.max_y - src.min_y));
923 } else { 964 } else {
924 glDisablei(GL_SCISSOR_TEST, static_cast<GLuint>(index)); 965 glDisablei(GL_SCISSOR_TEST, static_cast<GLuint>(index));
925 } 966 }
@@ -935,8 +976,9 @@ void RasterizerOpenGL::SyncPointState() {
935 976
936 oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable); 977 oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable);
937 oglEnable(GL_PROGRAM_POINT_SIZE, maxwell3d.regs.vp_point_size.enable); 978 oglEnable(GL_PROGRAM_POINT_SIZE, maxwell3d.regs.vp_point_size.enable);
938 979 const bool is_rescaling{texture_cache.IsRescaling()};
939 glPointSize(std::max(1.0f, maxwell3d.regs.point_size)); 980 const float scale = is_rescaling ? Settings::values.resolution_info.up_factor : 1.0f;
981 glPointSize(std::max(1.0f, maxwell3d.regs.point_size * scale));
940} 982}
941 983
942void RasterizerOpenGL::SyncLineState() { 984void RasterizerOpenGL::SyncLineState() {
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
index 8695c29e3..5e7101d28 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -166,7 +166,12 @@ void OGLFramebuffer::Create() {
166 return; 166 return;
167 167
168 MICROPROFILE_SCOPE(OpenGL_ResourceCreation); 168 MICROPROFILE_SCOPE(OpenGL_ResourceCreation);
169 // Bind to READ_FRAMEBUFFER to stop Nvidia's driver from creating an EXT_framebuffer instead of
170 // a core framebuffer. EXT framebuffer attachments have to match in size and can be shared
171 // across contexts. yuzu doesn't share framebuffers across contexts and we need attachments with
172 // mismatching size, this is why core framebuffers are preferred.
169 glGenFramebuffers(1, &handle); 173 glGenFramebuffers(1, &handle);
174 glBindFramebuffer(GL_READ_FRAMEBUFFER, handle);
170} 175}
171 176
172void OGLFramebuffer::Release() { 177void OGLFramebuffer::Release() {
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 02682bd76..42ef67628 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -426,16 +426,14 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
426 // Normal path 426 // Normal path
427 programs[index] = TranslateProgram(pools.inst, pools.block, env, cfg, host_info); 427 programs[index] = TranslateProgram(pools.inst, pools.block, env, cfg, host_info);
428 428
429 for (const auto& desc : programs[index].info.storage_buffers_descriptors) { 429 total_storage_buffers +=
430 total_storage_buffers += desc.count; 430 Shader::NumDescriptors(programs[index].info.storage_buffers_descriptors);
431 }
432 } else { 431 } else {
433 // VertexB path when VertexA is present. 432 // VertexB path when VertexA is present.
434 auto& program_va{programs[0]}; 433 auto& program_va{programs[0]};
435 auto program_vb{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)}; 434 auto program_vb{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)};
436 for (const auto& desc : program_vb.info.storage_buffers_descriptors) { 435 total_storage_buffers +=
437 total_storage_buffers += desc.count; 436 Shader::NumDescriptors(program_vb.info.storage_buffers_descriptors);
438 }
439 programs[index] = MergeDualVertexPrograms(program_va, program_vb, env); 437 programs[index] = MergeDualVertexPrograms(program_va, program_vb, env);
440 } 438 }
441 } 439 }
@@ -510,10 +508,7 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline(
510 Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()}; 508 Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()};
511 auto program{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)}; 509 auto program{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)};
512 510
513 u32 num_storage_buffers{}; 511 const u32 num_storage_buffers{Shader::NumDescriptors(program.info.storage_buffers_descriptors)};
514 for (const auto& desc : program.info.storage_buffers_descriptors) {
515 num_storage_buffers += desc.count;
516 }
517 Shader::RuntimeInfo info; 512 Shader::RuntimeInfo info;
518 info.glasm_use_storage_buffers = num_storage_buffers <= device.GetMaxGLASMStorageBufferBlocks(); 513 info.glasm_use_storage_buffers = num_storage_buffers <= device.GetMaxGLASMStorageBufferBlocks();
519 514
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 8c3ca3d82..2f7d98d8b 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -9,8 +9,8 @@
9 9
10#include <glad/glad.h> 10#include <glad/glad.h>
11 11
12#include "common/literals.h"
12#include "common/settings.h" 13#include "common/settings.h"
13
14#include "video_core/renderer_opengl/gl_device.h" 14#include "video_core/renderer_opengl/gl_device.h"
15#include "video_core/renderer_opengl/gl_shader_manager.h" 15#include "video_core/renderer_opengl/gl_shader_manager.h"
16#include "video_core/renderer_opengl/gl_state_tracker.h" 16#include "video_core/renderer_opengl/gl_state_tracker.h"
@@ -42,6 +42,7 @@ using VideoCore::Surface::IsPixelFormatSRGB;
42using VideoCore::Surface::MaxPixelFormat; 42using VideoCore::Surface::MaxPixelFormat;
43using VideoCore::Surface::PixelFormat; 43using VideoCore::Surface::PixelFormat;
44using VideoCore::Surface::SurfaceType; 44using VideoCore::Surface::SurfaceType;
45using namespace Common::Literals;
45 46
46struct CopyOrigin { 47struct CopyOrigin {
47 GLint level; 48 GLint level;
@@ -316,6 +317,52 @@ void AttachTexture(GLuint fbo, GLenum attachment, const ImageView* image_view) {
316 } 317 }
317} 318}
318 319
320OGLTexture MakeImage(const VideoCommon::ImageInfo& info, GLenum gl_internal_format) {
321 const GLenum target = ImageTarget(info);
322 const GLsizei width = info.size.width;
323 const GLsizei height = info.size.height;
324 const GLsizei depth = info.size.depth;
325 const int max_host_mip_levels = std::bit_width(info.size.width);
326 const GLsizei num_levels = std::min(info.resources.levels, max_host_mip_levels);
327 const GLsizei num_layers = info.resources.layers;
328 const GLsizei num_samples = info.num_samples;
329
330 GLuint handle = 0;
331 OGLTexture texture;
332 if (target != GL_TEXTURE_BUFFER) {
333 texture.Create(target);
334 handle = texture.handle;
335 }
336 switch (target) {
337 case GL_TEXTURE_1D_ARRAY:
338 glTextureStorage2D(handle, num_levels, gl_internal_format, width, num_layers);
339 break;
340 case GL_TEXTURE_2D_ARRAY:
341 glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, num_layers);
342 break;
343 case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: {
344 // TODO: Where should 'fixedsamplelocations' come from?
345 const auto [samples_x, samples_y] = SamplesLog2(info.num_samples);
346 glTextureStorage3DMultisample(handle, num_samples, gl_internal_format, width >> samples_x,
347 height >> samples_y, num_layers, GL_FALSE);
348 break;
349 }
350 case GL_TEXTURE_RECTANGLE:
351 glTextureStorage2D(handle, num_levels, gl_internal_format, width, height);
352 break;
353 case GL_TEXTURE_3D:
354 glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, depth);
355 break;
356 case GL_TEXTURE_BUFFER:
357 UNREACHABLE();
358 break;
359 default:
360 UNREACHABLE_MSG("Invalid target=0x{:x}", target);
361 break;
362 }
363 return texture;
364}
365
319[[nodiscard]] bool IsPixelFormatBGR(PixelFormat format) { 366[[nodiscard]] bool IsPixelFormatBGR(PixelFormat format) {
320 switch (format) { 367 switch (format) {
321 case PixelFormat::B5G6R5_UNORM: 368 case PixelFormat::B5G6R5_UNORM:
@@ -359,7 +406,8 @@ ImageBufferMap::~ImageBufferMap() {
359 406
360TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& program_manager, 407TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& program_manager,
361 StateTracker& state_tracker_) 408 StateTracker& state_tracker_)
362 : device{device_}, state_tracker{state_tracker_}, util_shaders(program_manager) { 409 : device{device_}, state_tracker{state_tracker_},
410 util_shaders(program_manager), resolution{Settings::values.resolution_info} {
363 static constexpr std::array TARGETS{GL_TEXTURE_1D_ARRAY, GL_TEXTURE_2D_ARRAY, GL_TEXTURE_3D}; 411 static constexpr std::array TARGETS{GL_TEXTURE_1D_ARRAY, GL_TEXTURE_2D_ARRAY, GL_TEXTURE_3D};
364 for (size_t i = 0; i < TARGETS.size(); ++i) { 412 for (size_t i = 0; i < TARGETS.size(); ++i) {
365 const GLenum target = TARGETS[i]; 413 const GLenum target = TARGETS[i];
@@ -426,6 +474,13 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager&
426 set_view(Shader::TextureType::ColorArray1D, null_image_1d_array.handle); 474 set_view(Shader::TextureType::ColorArray1D, null_image_1d_array.handle);
427 set_view(Shader::TextureType::ColorArray2D, null_image_view_2d_array.handle); 475 set_view(Shader::TextureType::ColorArray2D, null_image_view_2d_array.handle);
428 set_view(Shader::TextureType::ColorArrayCube, null_image_cube_array.handle); 476 set_view(Shader::TextureType::ColorArrayCube, null_image_cube_array.handle);
477
478 if (resolution.active) {
479 for (size_t i = 0; i < rescale_draw_fbos.size(); ++i) {
480 rescale_draw_fbos[i].Create();
481 rescale_read_fbos[i].Create();
482 }
483 }
429} 484}
430 485
431TextureCacheRuntime::~TextureCacheRuntime() = default; 486TextureCacheRuntime::~TextureCacheRuntime() = default;
@@ -442,6 +497,15 @@ ImageBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) {
442 return download_buffers.RequestMap(size, false); 497 return download_buffers.RequestMap(size, false);
443} 498}
444 499
500u64 TextureCacheRuntime::GetDeviceLocalMemory() const {
501 if (GLAD_GL_NVX_gpu_memory_info) {
502 GLint cur_avail_mem_kb = 0;
503 glGetIntegerv(GL_GPU_MEMORY_INFO_CURRENT_AVAILABLE_VIDMEM_NVX, &cur_avail_mem_kb);
504 return static_cast<u64>(cur_avail_mem_kb) * 1_KiB;
505 }
506 return 2_GiB; // Return minimum requirements
507}
508
445void TextureCacheRuntime::CopyImage(Image& dst_image, Image& src_image, 509void TextureCacheRuntime::CopyImage(Image& dst_image, Image& src_image,
446 std::span<const ImageCopy> copies) { 510 std::span<const ImageCopy> copies) {
447 const GLuint dst_name = dst_image.Handle(); 511 const GLuint dst_name = dst_image.Handle();
@@ -605,13 +669,13 @@ std::optional<size_t> TextureCacheRuntime::StagingBuffers::FindBuffer(size_t req
605 return found; 669 return found;
606} 670}
607 671
608Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_, 672Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_,
609 VAddr cpu_addr_) 673 VAddr cpu_addr_)
610 : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_) { 674 : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), runtime{&runtime_} {
611 if (CanBeAccelerated(runtime, info)) { 675 if (CanBeAccelerated(*runtime, info)) {
612 flags |= ImageFlagBits::AcceleratedUpload; 676 flags |= ImageFlagBits::AcceleratedUpload;
613 } 677 }
614 if (IsConverted(runtime.device, info.format, info.type)) { 678 if (IsConverted(runtime->device, info.format, info.type)) {
615 flags |= ImageFlagBits::Converted; 679 flags |= ImageFlagBits::Converted;
616 gl_internal_format = IsPixelFormatSRGB(info.format) ? GL_SRGB8_ALPHA8 : GL_RGBA8; 680 gl_internal_format = IsPixelFormatSRGB(info.format) ? GL_SRGB8_ALPHA8 : GL_RGBA8;
617 gl_format = GL_RGBA; 681 gl_format = GL_RGBA;
@@ -622,58 +686,25 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_,
622 gl_format = tuple.format; 686 gl_format = tuple.format;
623 gl_type = tuple.type; 687 gl_type = tuple.type;
624 } 688 }
625 const GLenum target = ImageTarget(info); 689 texture = MakeImage(info, gl_internal_format);
626 const GLsizei width = info.size.width; 690 current_texture = texture.handle;
627 const GLsizei height = info.size.height; 691 if (runtime->device.HasDebuggingToolAttached()) {
628 const GLsizei depth = info.size.depth;
629 const int max_host_mip_levels = std::bit_width(info.size.width);
630 const GLsizei num_levels = std::min(info.resources.levels, max_host_mip_levels);
631 const GLsizei num_layers = info.resources.layers;
632 const GLsizei num_samples = info.num_samples;
633
634 GLuint handle = 0;
635 if (target != GL_TEXTURE_BUFFER) {
636 texture.Create(target);
637 handle = texture.handle;
638 }
639 switch (target) {
640 case GL_TEXTURE_1D_ARRAY:
641 glTextureStorage2D(handle, num_levels, gl_internal_format, width, num_layers);
642 break;
643 case GL_TEXTURE_2D_ARRAY:
644 glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, num_layers);
645 break;
646 case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: {
647 // TODO: Where should 'fixedsamplelocations' come from?
648 const auto [samples_x, samples_y] = SamplesLog2(info.num_samples);
649 glTextureStorage3DMultisample(handle, num_samples, gl_internal_format, width >> samples_x,
650 height >> samples_y, num_layers, GL_FALSE);
651 break;
652 }
653 case GL_TEXTURE_RECTANGLE:
654 glTextureStorage2D(handle, num_levels, gl_internal_format, width, height);
655 break;
656 case GL_TEXTURE_3D:
657 glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, depth);
658 break;
659 case GL_TEXTURE_BUFFER:
660 UNREACHABLE();
661 break;
662 default:
663 UNREACHABLE_MSG("Invalid target=0x{:x}", target);
664 break;
665 }
666 if (runtime.device.HasDebuggingToolAttached()) {
667 const std::string name = VideoCommon::Name(*this); 692 const std::string name = VideoCommon::Name(*this);
668 glObjectLabel(target == GL_TEXTURE_BUFFER ? GL_BUFFER : GL_TEXTURE, handle, 693 glObjectLabel(ImageTarget(info) == GL_TEXTURE_BUFFER ? GL_BUFFER : GL_TEXTURE,
669 static_cast<GLsizei>(name.size()), name.data()); 694 texture.handle, static_cast<GLsizei>(name.size()), name.data());
670 } 695 }
671} 696}
672 697
698Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBase{params} {}
699
673Image::~Image() = default; 700Image::~Image() = default;
674 701
675void Image::UploadMemory(const ImageBufferMap& map, 702void Image::UploadMemory(const ImageBufferMap& map,
676 std::span<const VideoCommon::BufferImageCopy> copies) { 703 std::span<const VideoCommon::BufferImageCopy> copies) {
704 const bool is_rescaled = True(flags & ImageFlagBits::Rescaled);
705 if (is_rescaled) {
706 ScaleDown(true);
707 }
677 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); 708 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer);
678 glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes); 709 glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes);
679 710
@@ -693,12 +724,18 @@ void Image::UploadMemory(const ImageBufferMap& map,
693 } 724 }
694 CopyBufferToImage(copy, map.offset); 725 CopyBufferToImage(copy, map.offset);
695 } 726 }
727 if (is_rescaled) {
728 ScaleUp();
729 }
696} 730}
697 731
698void Image::DownloadMemory(ImageBufferMap& map, 732void Image::DownloadMemory(ImageBufferMap& map,
699 std::span<const VideoCommon::BufferImageCopy> copies) { 733 std::span<const VideoCommon::BufferImageCopy> copies) {
734 const bool is_rescaled = True(flags & ImageFlagBits::Rescaled);
735 if (is_rescaled) {
736 ScaleDown();
737 }
700 glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API 738 glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API
701
702 glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); 739 glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer);
703 glPixelStorei(GL_PACK_ALIGNMENT, 1); 740 glPixelStorei(GL_PACK_ALIGNMENT, 1);
704 741
@@ -716,6 +753,9 @@ void Image::DownloadMemory(ImageBufferMap& map,
716 } 753 }
717 CopyImageToBuffer(copy, map.offset); 754 CopyImageToBuffer(copy, map.offset);
718 } 755 }
756 if (is_rescaled) {
757 ScaleUp(true);
758 }
719} 759}
720 760
721GLuint Image::StorageHandle() noexcept { 761GLuint Image::StorageHandle() noexcept {
@@ -741,11 +781,11 @@ GLuint Image::StorageHandle() noexcept {
741 return store_view.handle; 781 return store_view.handle;
742 } 782 }
743 store_view.Create(); 783 store_view.Create();
744 glTextureView(store_view.handle, ImageTarget(info), texture.handle, GL_RGBA8, 0, 784 glTextureView(store_view.handle, ImageTarget(info), current_texture, GL_RGBA8, 0,
745 info.resources.levels, 0, info.resources.layers); 785 info.resources.levels, 0, info.resources.layers);
746 return store_view.handle; 786 return store_view.handle;
747 default: 787 default:
748 return texture.handle; 788 return current_texture;
749 } 789 }
750} 790}
751 791
@@ -849,6 +889,140 @@ void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t b
849 } 889 }
850} 890}
851 891
892void Image::Scale(bool up_scale) {
893 const auto format_type = GetFormatType(info.format);
894 const GLenum attachment = [format_type] {
895 switch (format_type) {
896 case SurfaceType::ColorTexture:
897 return GL_COLOR_ATTACHMENT0;
898 case SurfaceType::Depth:
899 return GL_DEPTH_ATTACHMENT;
900 case SurfaceType::DepthStencil:
901 return GL_DEPTH_STENCIL_ATTACHMENT;
902 default:
903 UNREACHABLE();
904 return GL_COLOR_ATTACHMENT0;
905 }
906 }();
907 const GLenum mask = [format_type] {
908 switch (format_type) {
909 case SurfaceType::ColorTexture:
910 return GL_COLOR_BUFFER_BIT;
911 case SurfaceType::Depth:
912 return GL_DEPTH_BUFFER_BIT;
913 case SurfaceType::DepthStencil:
914 return GL_STENCIL_BUFFER_BIT | GL_DEPTH_BUFFER_BIT;
915 default:
916 UNREACHABLE();
917 return GL_COLOR_BUFFER_BIT;
918 }
919 }();
920 const size_t fbo_index = [format_type] {
921 switch (format_type) {
922 case SurfaceType::ColorTexture:
923 return 0;
924 case SurfaceType::Depth:
925 return 1;
926 case SurfaceType::DepthStencil:
927 return 2;
928 default:
929 UNREACHABLE();
930 return 0;
931 }
932 }();
933 const bool is_2d = info.type == ImageType::e2D;
934 const bool is_color{(mask & GL_COLOR_BUFFER_BIT) != 0};
935 // Integer formats must use NEAREST filter
936 const bool linear_color_format{is_color && !IsPixelFormatInteger(info.format)};
937 const GLenum filter = linear_color_format ? GL_LINEAR : GL_NEAREST;
938
939 const auto& resolution = runtime->resolution;
940 const u32 scaled_width = resolution.ScaleUp(info.size.width);
941 const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height;
942 const u32 original_width = info.size.width;
943 const u32 original_height = info.size.height;
944
945 if (!upscaled_backup.handle) {
946 auto dst_info = info;
947 dst_info.size.width = scaled_width;
948 dst_info.size.height = scaled_height;
949 upscaled_backup = MakeImage(dst_info, gl_internal_format);
950 }
951 const u32 src_width = up_scale ? original_width : scaled_width;
952 const u32 src_height = up_scale ? original_height : scaled_height;
953 const u32 dst_width = up_scale ? scaled_width : original_width;
954 const u32 dst_height = up_scale ? scaled_height : original_height;
955 const auto src_handle = up_scale ? texture.handle : upscaled_backup.handle;
956 const auto dst_handle = up_scale ? upscaled_backup.handle : texture.handle;
957
958 // TODO (ameerj): Investigate other GL states that affect blitting.
959 glDisablei(GL_SCISSOR_TEST, 0);
960 glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(dst_width),
961 static_cast<GLfloat>(dst_height));
962
963 const GLuint read_fbo = runtime->rescale_read_fbos[fbo_index].handle;
964 const GLuint draw_fbo = runtime->rescale_draw_fbos[fbo_index].handle;
965 for (s32 layer = 0; layer < info.resources.layers; ++layer) {
966 for (s32 level = 0; level < info.resources.levels; ++level) {
967 const u32 src_level_width = std::max(1u, src_width >> level);
968 const u32 src_level_height = std::max(1u, src_height >> level);
969 const u32 dst_level_width = std::max(1u, dst_width >> level);
970 const u32 dst_level_height = std::max(1u, dst_height >> level);
971
972 glNamedFramebufferTextureLayer(read_fbo, attachment, src_handle, level, layer);
973 glNamedFramebufferTextureLayer(draw_fbo, attachment, dst_handle, level, layer);
974
975 glBlitNamedFramebuffer(read_fbo, draw_fbo, 0, 0, src_level_width, src_level_height, 0,
976 0, dst_level_width, dst_level_height, mask, filter);
977 }
978 }
979 current_texture = dst_handle;
980 auto& state_tracker = runtime->GetStateTracker();
981 state_tracker.NotifyViewport0();
982 state_tracker.NotifyScissor0();
983}
984
985bool Image::ScaleUp(bool ignore) {
986 if (True(flags & ImageFlagBits::Rescaled)) {
987 return false;
988 }
989 if (gl_format == 0 && gl_type == 0) {
990 // compressed textures
991 return false;
992 }
993 if (info.type == ImageType::Linear) {
994 UNREACHABLE();
995 return false;
996 }
997 flags |= ImageFlagBits::Rescaled;
998 if (!runtime->resolution.active) {
999 return false;
1000 }
1001 has_scaled = true;
1002 if (ignore) {
1003 current_texture = upscaled_backup.handle;
1004 return true;
1005 }
1006 Scale(true);
1007 return true;
1008}
1009
1010bool Image::ScaleDown(bool ignore) {
1011 if (False(flags & ImageFlagBits::Rescaled)) {
1012 return false;
1013 }
1014 flags &= ~ImageFlagBits::Rescaled;
1015 if (!runtime->resolution.active) {
1016 return false;
1017 }
1018 if (ignore) {
1019 current_texture = texture.handle;
1020 return true;
1021 }
1022 Scale(false);
1023 return true;
1024}
1025
852ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, 1026ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info,
853 ImageId image_id_, Image& image) 1027 ImageId image_id_, Image& image)
854 : VideoCommon::ImageViewBase{info, image.info, image_id_}, views{runtime.null_image_views} { 1028 : VideoCommon::ImageViewBase{info, image.info, image_id_}, views{runtime.null_image_views} {
@@ -862,7 +1036,7 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI
862 flat_range = info.range; 1036 flat_range = info.range;
863 set_object_label = device.HasDebuggingToolAttached(); 1037 set_object_label = device.HasDebuggingToolAttached();
864 is_render_target = info.IsRenderTarget(); 1038 is_render_target = info.IsRenderTarget();
865 original_texture = image.texture.handle; 1039 original_texture = image.Handle();
866 num_samples = image.info.num_samples; 1040 num_samples = image.info.num_samples;
867 if (!is_render_target) { 1041 if (!is_render_target) {
868 swizzle[0] = info.x_source; 1042 swizzle[0] = info.x_source;
@@ -950,9 +1124,11 @@ ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info,
950 const VideoCommon::ImageViewInfo& view_info) 1124 const VideoCommon::ImageViewInfo& view_info)
951 : VideoCommon::ImageViewBase{info, view_info} {} 1125 : VideoCommon::ImageViewBase{info, view_info} {}
952 1126
953ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::NullImageParams& params) 1127ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::NullImageViewParams& params)
954 : VideoCommon::ImageViewBase{params}, views{runtime.null_image_views} {} 1128 : VideoCommon::ImageViewBase{params}, views{runtime.null_image_views} {}
955 1129
1130ImageView::~ImageView() = default;
1131
956GLuint ImageView::StorageView(Shader::TextureType texture_type, Shader::ImageFormat image_format) { 1132GLuint ImageView::StorageView(Shader::TextureType texture_type, Shader::ImageFormat image_format) {
957 if (image_format == Shader::ImageFormat::Typeless) { 1133 if (image_format == Shader::ImageFormat::Typeless) {
958 return Handle(texture_type); 1134 return Handle(texture_type);
@@ -1037,7 +1213,8 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const TSCEntry& config) {
1037 glSamplerParameterfv(handle, GL_TEXTURE_BORDER_COLOR, config.BorderColor().data()); 1213 glSamplerParameterfv(handle, GL_TEXTURE_BORDER_COLOR, config.BorderColor().data());
1038 1214
1039 if (GLAD_GL_ARB_texture_filter_anisotropic || GLAD_GL_EXT_texture_filter_anisotropic) { 1215 if (GLAD_GL_ARB_texture_filter_anisotropic || GLAD_GL_EXT_texture_filter_anisotropic) {
1040 glSamplerParameterf(handle, GL_TEXTURE_MAX_ANISOTROPY, config.MaxAnisotropy()); 1216 const f32 max_anisotropy = std::clamp(config.MaxAnisotropy(), 1.0f, 16.0f);
1217 glSamplerParameterf(handle, GL_TEXTURE_MAX_ANISOTROPY, max_anisotropy);
1041 } else { 1218 } else {
1042 LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_anisotropic is required"); 1219 LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_anisotropic is required");
1043 } 1220 }
@@ -1056,13 +1233,8 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const TSCEntry& config) {
1056 1233
1057Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers, 1234Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers,
1058 ImageView* depth_buffer, const VideoCommon::RenderTargets& key) { 1235 ImageView* depth_buffer, const VideoCommon::RenderTargets& key) {
1059 // Bind to READ_FRAMEBUFFER to stop Nvidia's driver from creating an EXT_framebuffer instead of 1236 framebuffer.Create();
1060 // a core framebuffer. EXT framebuffer attachments have to match in size and can be shared 1237 GLuint handle = framebuffer.handle;
1061 // across contexts. yuzu doesn't share framebuffers across contexts and we need attachments with
1062 // mismatching size, this is why core framebuffers are preferred.
1063 GLuint handle;
1064 glGenFramebuffers(1, &handle);
1065 glBindFramebuffer(GL_READ_FRAMEBUFFER, handle);
1066 1238
1067 GLsizei num_buffers = 0; 1239 GLsizei num_buffers = 0;
1068 std::array<GLenum, NUM_RT> gl_draw_buffers; 1240 std::array<GLenum, NUM_RT> gl_draw_buffers;
@@ -1110,31 +1282,31 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
1110 const std::string name = VideoCommon::Name(key); 1282 const std::string name = VideoCommon::Name(key);
1111 glObjectLabel(GL_FRAMEBUFFER, handle, static_cast<GLsizei>(name.size()), name.data()); 1283 glObjectLabel(GL_FRAMEBUFFER, handle, static_cast<GLsizei>(name.size()), name.data());
1112 } 1284 }
1113 framebuffer.handle = handle;
1114} 1285}
1115 1286
1287Framebuffer::~Framebuffer() = default;
1288
1116void BGRCopyPass::CopyBGR(Image& dst_image, Image& src_image, 1289void BGRCopyPass::CopyBGR(Image& dst_image, Image& src_image,
1117 std::span<const VideoCommon::ImageCopy> copies) { 1290 std::span<const VideoCommon::ImageCopy> copies) {
1118 static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0}; 1291 static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0};
1119 const u32 requested_pbo_size = 1292 const u32 img_bpp = BytesPerBlock(src_image.info.format);
1120 std::max(src_image.unswizzled_size_bytes, dst_image.unswizzled_size_bytes);
1121
1122 if (bgr_pbo_size < requested_pbo_size) {
1123 bgr_pbo.Create();
1124 bgr_pbo_size = requested_pbo_size;
1125 glNamedBufferData(bgr_pbo.handle, bgr_pbo_size, nullptr, GL_STREAM_COPY);
1126 }
1127 for (const ImageCopy& copy : copies) { 1293 for (const ImageCopy& copy : copies) {
1128 ASSERT(copy.src_offset == zero_offset); 1294 ASSERT(copy.src_offset == zero_offset);
1129 ASSERT(copy.dst_offset == zero_offset); 1295 ASSERT(copy.dst_offset == zero_offset);
1130 1296 const u32 num_src_layers = static_cast<u32>(copy.src_subresource.num_layers);
1297 const u32 copy_size = copy.extent.width * copy.extent.height * num_src_layers * img_bpp;
1298 if (bgr_pbo_size < copy_size) {
1299 bgr_pbo.Create();
1300 bgr_pbo_size = copy_size;
1301 glNamedBufferData(bgr_pbo.handle, bgr_pbo_size, nullptr, GL_STREAM_COPY);
1302 }
1131 // Copy from source to PBO 1303 // Copy from source to PBO
1132 glPixelStorei(GL_PACK_ALIGNMENT, 1); 1304 glPixelStorei(GL_PACK_ALIGNMENT, 1);
1133 glPixelStorei(GL_PACK_ROW_LENGTH, copy.extent.width); 1305 glPixelStorei(GL_PACK_ROW_LENGTH, copy.extent.width);
1134 glBindBuffer(GL_PIXEL_PACK_BUFFER, bgr_pbo.handle); 1306 glBindBuffer(GL_PIXEL_PACK_BUFFER, bgr_pbo.handle);
1135 glGetTextureSubImage(src_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height, 1307 glGetTextureSubImage(src_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
1136 copy.src_subresource.num_layers, src_image.GlFormat(), 1308 num_src_layers, src_image.GlFormat(), src_image.GlType(),
1137 src_image.GlType(), static_cast<GLsizei>(bgr_pbo_size), nullptr); 1309 static_cast<GLsizei>(bgr_pbo_size), nullptr);
1138 1310
1139 // Copy from PBO to destination in desired GL format 1311 // Copy from PBO to destination in desired GL format
1140 glPixelStorei(GL_UNPACK_ALIGNMENT, 1); 1312 glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 1ca2c90be..1bb762568 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -15,6 +15,10 @@
15#include "video_core/texture_cache/image_view_base.h" 15#include "video_core/texture_cache/image_view_base.h"
16#include "video_core/texture_cache/texture_cache_base.h" 16#include "video_core/texture_cache/texture_cache_base.h"
17 17
18namespace Settings {
19struct ResolutionScalingInfo;
20}
21
18namespace OpenGL { 22namespace OpenGL {
19 23
20class Device; 24class Device;
@@ -78,9 +82,11 @@ public:
78 82
79 ImageBufferMap DownloadStagingBuffer(size_t size); 83 ImageBufferMap DownloadStagingBuffer(size_t size);
80 84
85 u64 GetDeviceLocalMemory() const;
86
81 void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); 87 void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
82 88
83 void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) { 89 void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view, bool rescaled) {
84 UNIMPLEMENTED(); 90 UNIMPLEMENTED();
85 } 91 }
86 92
@@ -110,6 +116,12 @@ public:
110 116
111 bool HasNativeASTC() const noexcept; 117 bool HasNativeASTC() const noexcept;
112 118
119 void TickFrame() {}
120
121 StateTracker& GetStateTracker() {
122 return state_tracker;
123 }
124
113private: 125private:
114 struct StagingBuffers { 126 struct StagingBuffers {
115 explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_); 127 explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_);
@@ -149,6 +161,10 @@ private:
149 OGLTextureView null_image_view_cube; 161 OGLTextureView null_image_view_cube;
150 162
151 std::array<GLuint, Shader::NUM_TEXTURE_TYPES> null_image_views{}; 163 std::array<GLuint, Shader::NUM_TEXTURE_TYPES> null_image_views{};
164
165 std::array<OGLFramebuffer, 3> rescale_draw_fbos;
166 std::array<OGLFramebuffer, 3> rescale_read_fbos;
167 const Settings::ResolutionScalingInfo& resolution;
152}; 168};
153 169
154class Image : public VideoCommon::ImageBase { 170class Image : public VideoCommon::ImageBase {
@@ -157,6 +173,7 @@ class Image : public VideoCommon::ImageBase {
157public: 173public:
158 explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, 174 explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
159 VAddr cpu_addr); 175 VAddr cpu_addr);
176 explicit Image(const VideoCommon::NullImageParams&);
160 177
161 ~Image(); 178 ~Image();
162 179
@@ -174,7 +191,7 @@ public:
174 GLuint StorageHandle() noexcept; 191 GLuint StorageHandle() noexcept;
175 192
176 GLuint Handle() const noexcept { 193 GLuint Handle() const noexcept {
177 return texture.handle; 194 return current_texture;
178 } 195 }
179 196
180 GLuint GlFormat() const noexcept { 197 GLuint GlFormat() const noexcept {
@@ -185,16 +202,25 @@ public:
185 return gl_type; 202 return gl_type;
186 } 203 }
187 204
205 bool ScaleUp(bool ignore = false);
206
207 bool ScaleDown(bool ignore = false);
208
188private: 209private:
189 void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); 210 void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);
190 211
191 void CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); 212 void CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);
192 213
214 void Scale(bool up_scale);
215
193 OGLTexture texture; 216 OGLTexture texture;
217 OGLTexture upscaled_backup;
194 OGLTextureView store_view; 218 OGLTextureView store_view;
195 GLenum gl_internal_format = GL_NONE; 219 GLenum gl_internal_format = GL_NONE;
196 GLenum gl_format = GL_NONE; 220 GLenum gl_format = GL_NONE;
197 GLenum gl_type = GL_NONE; 221 GLenum gl_type = GL_NONE;
222 TextureCacheRuntime* runtime{};
223 GLuint current_texture{};
198}; 224};
199 225
200class ImageView : public VideoCommon::ImageViewBase { 226class ImageView : public VideoCommon::ImageViewBase {
@@ -206,7 +232,15 @@ public:
206 const VideoCommon::ImageViewInfo&, GPUVAddr); 232 const VideoCommon::ImageViewInfo&, GPUVAddr);
207 explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, 233 explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info,
208 const VideoCommon::ImageViewInfo& view_info); 234 const VideoCommon::ImageViewInfo& view_info);
209 explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams&); 235 explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageViewParams&);
236
237 ~ImageView();
238
239 ImageView(const ImageView&) = delete;
240 ImageView& operator=(const ImageView&) = delete;
241
242 ImageView(ImageView&&) = default;
243 ImageView& operator=(ImageView&&) = default;
210 244
211 [[nodiscard]] GLuint StorageView(Shader::TextureType texture_type, 245 [[nodiscard]] GLuint StorageView(Shader::TextureType texture_type,
212 Shader::ImageFormat image_format); 246 Shader::ImageFormat image_format);
@@ -276,6 +310,14 @@ public:
276 explicit Framebuffer(TextureCacheRuntime&, std::span<ImageView*, NUM_RT> color_buffers, 310 explicit Framebuffer(TextureCacheRuntime&, std::span<ImageView*, NUM_RT> color_buffers,
277 ImageView* depth_buffer, const VideoCommon::RenderTargets& key); 311 ImageView* depth_buffer, const VideoCommon::RenderTargets& key);
278 312
313 ~Framebuffer();
314
315 Framebuffer(const Framebuffer&) = delete;
316 Framebuffer& operator=(const Framebuffer&) = delete;
317
318 Framebuffer(Framebuffer&&) = default;
319 Framebuffer& operator=(Framebuffer&&) = default;
320
279 [[nodiscard]] GLuint Handle() const noexcept { 321 [[nodiscard]] GLuint Handle() const noexcept {
280 return framebuffer.handle; 322 return framebuffer.handle;
281 } 323 }
@@ -293,7 +335,7 @@ struct TextureCacheParams {
293 static constexpr bool ENABLE_VALIDATION = true; 335 static constexpr bool ENABLE_VALIDATION = true;
294 static constexpr bool FRAMEBUFFER_BLITS = true; 336 static constexpr bool FRAMEBUFFER_BLITS = true;
295 static constexpr bool HAS_EMULATED_COPIES = true; 337 static constexpr bool HAS_EMULATED_COPIES = true;
296 static constexpr bool HAS_DEVICE_MEMORY_INFO = false; 338 static constexpr bool HAS_DEVICE_MEMORY_INFO = true;
297 339
298 using Runtime = OpenGL::TextureCacheRuntime; 340 using Runtime = OpenGL::TextureCacheRuntime;
299 using Image = OpenGL::Image; 341 using Image = OpenGL::Image;
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 7d7cba69c..28daacd82 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -21,8 +21,13 @@
21#include "core/memory.h" 21#include "core/memory.h"
22#include "core/perf_stats.h" 22#include "core/perf_stats.h"
23#include "core/telemetry_session.h" 23#include "core/telemetry_session.h"
24#include "video_core/host_shaders/fxaa_frag.h"
25#include "video_core/host_shaders/fxaa_vert.h"
24#include "video_core/host_shaders/opengl_present_frag.h" 26#include "video_core/host_shaders/opengl_present_frag.h"
27#include "video_core/host_shaders/opengl_present_scaleforce_frag.h"
25#include "video_core/host_shaders/opengl_present_vert.h" 28#include "video_core/host_shaders/opengl_present_vert.h"
29#include "video_core/host_shaders/present_bicubic_frag.h"
30#include "video_core/host_shaders/present_gaussian_frag.h"
26#include "video_core/renderer_opengl/gl_rasterizer.h" 31#include "video_core/renderer_opengl/gl_rasterizer.h"
27#include "video_core/renderer_opengl/gl_shader_manager.h" 32#include "video_core/renderer_opengl/gl_shader_manager.h"
28#include "video_core/renderer_opengl/gl_shader_util.h" 33#include "video_core/renderer_opengl/gl_shader_util.h"
@@ -208,7 +213,9 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
208 framebuffer_crop_rect = framebuffer.crop_rect; 213 framebuffer_crop_rect = framebuffer.crop_rect;
209 214
210 const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset}; 215 const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset};
211 if (rasterizer.AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) { 216 screen_info.was_accelerated =
217 rasterizer.AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride);
218 if (screen_info.was_accelerated) {
212 return; 219 return;
213 } 220 }
214 221
@@ -251,12 +258,25 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color
251 258
252void RendererOpenGL::InitOpenGLObjects() { 259void RendererOpenGL::InitOpenGLObjects() {
253 // Create shader programs 260 // Create shader programs
261 fxaa_vertex = CreateProgram(HostShaders::FXAA_VERT, GL_VERTEX_SHADER);
262 fxaa_fragment = CreateProgram(HostShaders::FXAA_FRAG, GL_FRAGMENT_SHADER);
254 present_vertex = CreateProgram(HostShaders::OPENGL_PRESENT_VERT, GL_VERTEX_SHADER); 263 present_vertex = CreateProgram(HostShaders::OPENGL_PRESENT_VERT, GL_VERTEX_SHADER);
255 present_fragment = CreateProgram(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER); 264 present_bilinear_fragment = CreateProgram(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER);
265 present_bicubic_fragment = CreateProgram(HostShaders::PRESENT_BICUBIC_FRAG, GL_FRAGMENT_SHADER);
266 present_gaussian_fragment =
267 CreateProgram(HostShaders::PRESENT_GAUSSIAN_FRAG, GL_FRAGMENT_SHADER);
268 present_scaleforce_fragment =
269 CreateProgram(fmt::format("#version 460\n{}", HostShaders::OPENGL_PRESENT_SCALEFORCE_FRAG),
270 GL_FRAGMENT_SHADER);
256 271
257 // Generate presentation sampler 272 // Generate presentation sampler
258 present_sampler.Create(); 273 present_sampler.Create();
259 glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR); 274 glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
275 glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
276
277 present_sampler_nn.Create();
278 glSamplerParameteri(present_sampler_nn.handle, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
279 glSamplerParameteri(present_sampler_nn.handle, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
260 280
261 // Generate VBO handle for drawing 281 // Generate VBO handle for drawing
262 vertex_buffer.Create(); 282 vertex_buffer.Create();
@@ -274,6 +294,8 @@ void RendererOpenGL::InitOpenGLObjects() {
274 294
275 // Clear screen to black 295 // Clear screen to black
276 LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture); 296 LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture);
297
298 fxaa_framebuffer.Create();
277} 299}
278 300
279void RendererOpenGL::AddTelemetryFields() { 301void RendererOpenGL::AddTelemetryFields() {
@@ -325,18 +347,130 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
325 texture.resource.Release(); 347 texture.resource.Release();
326 texture.resource.Create(GL_TEXTURE_2D); 348 texture.resource.Create(GL_TEXTURE_2D);
327 glTextureStorage2D(texture.resource.handle, 1, internal_format, texture.width, texture.height); 349 glTextureStorage2D(texture.resource.handle, 1, internal_format, texture.width, texture.height);
350 fxaa_texture.Release();
351 fxaa_texture.Create(GL_TEXTURE_2D);
352 glTextureStorage2D(fxaa_texture.handle, 1, GL_RGBA16F,
353 Settings::values.resolution_info.ScaleUp(screen_info.texture.width),
354 Settings::values.resolution_info.ScaleUp(screen_info.texture.height));
355 glNamedFramebufferTexture(fxaa_framebuffer.handle, GL_COLOR_ATTACHMENT0, fxaa_texture.handle,
356 0);
328} 357}
329 358
330void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { 359void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
360 // TODO: Signal state tracker about these changes
361 state_tracker.NotifyScreenDrawVertexArray();
362 state_tracker.NotifyPolygonModes();
363 state_tracker.NotifyViewport0();
364 state_tracker.NotifyScissor0();
365 state_tracker.NotifyColorMask(0);
366 state_tracker.NotifyBlend0();
367 state_tracker.NotifyFramebuffer();
368 state_tracker.NotifyFrontFace();
369 state_tracker.NotifyCullTest();
370 state_tracker.NotifyDepthTest();
371 state_tracker.NotifyStencilTest();
372 state_tracker.NotifyPolygonOffset();
373 state_tracker.NotifyRasterizeEnable();
374 state_tracker.NotifyFramebufferSRGB();
375 state_tracker.NotifyLogicOp();
376 state_tracker.NotifyClipControl();
377 state_tracker.NotifyAlphaTest();
378
379 state_tracker.ClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
380
331 // Update background color before drawing 381 // Update background color before drawing
332 glClearColor(Settings::values.bg_red.GetValue() / 255.0f, 382 glClearColor(Settings::values.bg_red.GetValue() / 255.0f,
333 Settings::values.bg_green.GetValue() / 255.0f, 383 Settings::values.bg_green.GetValue() / 255.0f,
334 Settings::values.bg_blue.GetValue() / 255.0f, 1.0f); 384 Settings::values.bg_blue.GetValue() / 255.0f, 1.0f);
335 385
386 glEnable(GL_CULL_FACE);
387 glDisable(GL_COLOR_LOGIC_OP);
388 glDisable(GL_DEPTH_TEST);
389 glDisable(GL_STENCIL_TEST);
390 glDisable(GL_POLYGON_OFFSET_FILL);
391 glDisable(GL_RASTERIZER_DISCARD);
392 glDisable(GL_ALPHA_TEST);
393 glDisablei(GL_BLEND, 0);
394 glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
395 glCullFace(GL_BACK);
396 glFrontFace(GL_CW);
397 glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
398
399 glBindTextureUnit(0, screen_info.display_texture);
400
401 if (Settings::values.anti_aliasing.GetValue() == Settings::AntiAliasing::Fxaa) {
402 program_manager.BindPresentPrograms(fxaa_vertex.handle, fxaa_fragment.handle);
403
404 glEnablei(GL_SCISSOR_TEST, 0);
405 auto viewport_width = screen_info.texture.width;
406 auto scissor_width = framebuffer_crop_rect.GetWidth();
407 if (scissor_width <= 0) {
408 scissor_width = viewport_width;
409 }
410 auto viewport_height = screen_info.texture.height;
411 auto scissor_height = framebuffer_crop_rect.GetHeight();
412 if (scissor_height <= 0) {
413 scissor_height = viewport_height;
414 }
415 if (screen_info.was_accelerated) {
416 viewport_width = Settings::values.resolution_info.ScaleUp(viewport_width);
417 scissor_width = Settings::values.resolution_info.ScaleUp(scissor_width);
418 viewport_height = Settings::values.resolution_info.ScaleUp(viewport_height);
419 scissor_height = Settings::values.resolution_info.ScaleUp(scissor_height);
420 }
421 glScissorIndexed(0, 0, 0, scissor_width, scissor_height);
422 glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(viewport_width),
423 static_cast<GLfloat>(viewport_height));
424 glDepthRangeIndexed(0, 0.0, 0.0);
425
426 glBindSampler(0, present_sampler.handle);
427 GLint old_read_fb;
428 GLint old_draw_fb;
429 glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb);
430 glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &old_draw_fb);
431 glBindFramebuffer(GL_DRAW_FRAMEBUFFER, fxaa_framebuffer.handle);
432
433 glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
434
435 glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb);
436 glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb);
437
438 glBindTextureUnit(0, fxaa_texture.handle);
439 }
440
336 // Set projection matrix 441 // Set projection matrix
337 const std::array ortho_matrix = 442 const std::array ortho_matrix =
338 MakeOrthographicMatrix(static_cast<float>(layout.width), static_cast<float>(layout.height)); 443 MakeOrthographicMatrix(static_cast<float>(layout.width), static_cast<float>(layout.height));
339 program_manager.BindPresentPrograms(present_vertex.handle, present_fragment.handle); 444
445 GLuint fragment_handle;
446 const auto filter = Settings::values.scaling_filter.GetValue();
447 switch (filter) {
448 case Settings::ScalingFilter::NearestNeighbor:
449 fragment_handle = present_bilinear_fragment.handle;
450 break;
451 case Settings::ScalingFilter::Bilinear:
452 fragment_handle = present_bilinear_fragment.handle;
453 break;
454 case Settings::ScalingFilter::Bicubic:
455 fragment_handle = present_bicubic_fragment.handle;
456 break;
457 case Settings::ScalingFilter::Gaussian:
458 fragment_handle = present_gaussian_fragment.handle;
459 break;
460 case Settings::ScalingFilter::ScaleForce:
461 fragment_handle = present_scaleforce_fragment.handle;
462 break;
463 case Settings::ScalingFilter::Fsr:
464 LOG_WARNING(
465 Render_OpenGL,
466 "FidelityFX FSR Super Sampling is not supported in OpenGL, changing to ScaleForce");
467 fragment_handle = present_scaleforce_fragment.handle;
468 break;
469 default:
470 fragment_handle = present_bilinear_fragment.handle;
471 break;
472 }
473 program_manager.BindPresentPrograms(present_vertex.handle, fragment_handle);
340 glProgramUniformMatrix3x2fv(present_vertex.handle, ModelViewMatrixLocation, 1, GL_FALSE, 474 glProgramUniformMatrix3x2fv(present_vertex.handle, ModelViewMatrixLocation, 1, GL_FALSE,
341 ortho_matrix.data()); 475 ortho_matrix.data());
342 476
@@ -370,6 +504,11 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
370 scale_v = static_cast<f32>(framebuffer_crop_rect.GetHeight()) / 504 scale_v = static_cast<f32>(framebuffer_crop_rect.GetHeight()) /
371 static_cast<f32>(screen_info.texture.height); 505 static_cast<f32>(screen_info.texture.height);
372 } 506 }
507 if (Settings::values.anti_aliasing.GetValue() == Settings::AntiAliasing::Fxaa &&
508 !screen_info.was_accelerated) {
509 scale_u /= Settings::values.resolution_info.up_factor;
510 scale_v /= Settings::values.resolution_info.up_factor;
511 }
373 512
374 const auto& screen = layout.screen; 513 const auto& screen = layout.screen;
375 const std::array vertices = { 514 const std::array vertices = {
@@ -380,47 +519,14 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
380 }; 519 };
381 glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), std::data(vertices)); 520 glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), std::data(vertices));
382 521
383 // TODO: Signal state tracker about these changes
384 state_tracker.NotifyScreenDrawVertexArray();
385 state_tracker.NotifyPolygonModes();
386 state_tracker.NotifyViewport0();
387 state_tracker.NotifyScissor0();
388 state_tracker.NotifyColorMask(0);
389 state_tracker.NotifyBlend0();
390 state_tracker.NotifyFramebuffer();
391 state_tracker.NotifyFrontFace();
392 state_tracker.NotifyCullTest();
393 state_tracker.NotifyDepthTest();
394 state_tracker.NotifyStencilTest();
395 state_tracker.NotifyPolygonOffset();
396 state_tracker.NotifyRasterizeEnable();
397 state_tracker.NotifyFramebufferSRGB();
398 state_tracker.NotifyLogicOp();
399 state_tracker.NotifyClipControl();
400 state_tracker.NotifyAlphaTest();
401
402 state_tracker.ClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
403 glEnable(GL_CULL_FACE);
404 if (screen_info.display_srgb) { 522 if (screen_info.display_srgb) {
405 glEnable(GL_FRAMEBUFFER_SRGB); 523 glEnable(GL_FRAMEBUFFER_SRGB);
406 } else { 524 } else {
407 glDisable(GL_FRAMEBUFFER_SRGB); 525 glDisable(GL_FRAMEBUFFER_SRGB);
408 } 526 }
409 glDisable(GL_COLOR_LOGIC_OP);
410 glDisable(GL_DEPTH_TEST);
411 glDisable(GL_STENCIL_TEST);
412 glDisable(GL_POLYGON_OFFSET_FILL);
413 glDisable(GL_RASTERIZER_DISCARD);
414 glDisable(GL_ALPHA_TEST);
415 glDisablei(GL_BLEND, 0);
416 glDisablei(GL_SCISSOR_TEST, 0); 527 glDisablei(GL_SCISSOR_TEST, 0);
417 glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
418 glCullFace(GL_BACK);
419 glFrontFace(GL_CW);
420 glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
421 glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width), 528 glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width),
422 static_cast<GLfloat>(layout.height)); 529 static_cast<GLfloat>(layout.height));
423 glDepthRangeIndexed(0, 0.0, 0.0);
424 530
425 glEnableVertexAttribArray(PositionLocation); 531 glEnableVertexAttribArray(PositionLocation);
426 glEnableVertexAttribArray(TexCoordLocation); 532 glEnableVertexAttribArray(TexCoordLocation);
@@ -440,8 +546,11 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
440 glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); 546 glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
441 } 547 }
442 548
443 glBindTextureUnit(0, screen_info.display_texture); 549 if (Settings::values.scaling_filter.GetValue() != Settings::ScalingFilter::NearestNeighbor) {
444 glBindSampler(0, present_sampler.handle); 550 glBindSampler(0, present_sampler.handle);
551 } else {
552 glBindSampler(0, present_sampler_nn.handle);
553 }
445 554
446 glClear(GL_COLOR_BUFFER_BIT); 555 glClear(GL_COLOR_BUFFER_BIT);
447 glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); 556 glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index d455f572f..cda333cad 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -50,6 +50,7 @@ struct TextureInfo {
50/// Structure used for storing information about the display target for the Switch screen 50/// Structure used for storing information about the display target for the Switch screen
51struct ScreenInfo { 51struct ScreenInfo {
52 GLuint display_texture{}; 52 GLuint display_texture{};
53 bool was_accelerated = false;
53 bool display_srgb{}; 54 bool display_srgb{};
54 const Common::Rectangle<float> display_texcoords{0.0f, 0.0f, 1.0f, 1.0f}; 55 const Common::Rectangle<float> display_texcoords{0.0f, 0.0f, 1.0f, 1.0f};
55 TextureInfo texture; 56 TextureInfo texture;
@@ -109,9 +110,15 @@ private:
109 110
110 // OpenGL object IDs 111 // OpenGL object IDs
111 OGLSampler present_sampler; 112 OGLSampler present_sampler;
113 OGLSampler present_sampler_nn;
112 OGLBuffer vertex_buffer; 114 OGLBuffer vertex_buffer;
115 OGLProgram fxaa_vertex;
116 OGLProgram fxaa_fragment;
113 OGLProgram present_vertex; 117 OGLProgram present_vertex;
114 OGLProgram present_fragment; 118 OGLProgram present_bilinear_fragment;
119 OGLProgram present_bicubic_fragment;
120 OGLProgram present_gaussian_fragment;
121 OGLProgram present_scaleforce_fragment;
115 OGLFramebuffer screenshot_framebuffer; 122 OGLFramebuffer screenshot_framebuffer;
116 123
117 // GPU address of the vertex buffer 124 // GPU address of the vertex buffer
@@ -119,6 +126,8 @@ private:
119 126
120 /// Display information for Switch screen 127 /// Display information for Switch screen
121 ScreenInfo screen_info; 128 ScreenInfo screen_info;
129 OGLTexture fxaa_texture;
130 OGLFramebuffer fxaa_framebuffer;
122 131
123 /// OpenGL framebuffer data 132 /// OpenGL framebuffer data
124 std::vector<u8> gl_framebuffer_data; 133 std::vector<u8> gl_framebuffer_data;
diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp
index 6c1b2f063..b3884a4f5 100644
--- a/src/video_core/renderer_vulkan/blit_image.cpp
+++ b/src/video_core/renderer_vulkan/blit_image.cpp
@@ -363,7 +363,7 @@ BlitImageHelper::BlitImageHelper(const Device& device_, VKScheduler& scheduler_,
363 363
364BlitImageHelper::~BlitImageHelper() = default; 364BlitImageHelper::~BlitImageHelper() = default;
365 365
366void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, 366void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, VkImageView src_view,
367 const Region2D& dst_region, const Region2D& src_region, 367 const Region2D& dst_region, const Region2D& src_region,
368 Tegra::Engines::Fermi2D::Filter filter, 368 Tegra::Engines::Fermi2D::Filter filter,
369 Tegra::Engines::Fermi2D::Operation operation) { 369 Tegra::Engines::Fermi2D::Operation operation) {
@@ -373,9 +373,8 @@ void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, const ImageV
373 .operation = operation, 373 .operation = operation,
374 }; 374 };
375 const VkPipelineLayout layout = *one_texture_pipeline_layout; 375 const VkPipelineLayout layout = *one_texture_pipeline_layout;
376 const VkImageView src_view = src_image_view.Handle(Shader::TextureType::Color2D);
377 const VkSampler sampler = is_linear ? *linear_sampler : *nearest_sampler; 376 const VkSampler sampler = is_linear ? *linear_sampler : *nearest_sampler;
378 const VkPipeline pipeline = FindOrEmplacePipeline(key); 377 const VkPipeline pipeline = FindOrEmplaceColorPipeline(key);
379 scheduler.RequestRenderpass(dst_framebuffer); 378 scheduler.RequestRenderpass(dst_framebuffer);
380 scheduler.Record([this, dst_region, src_region, pipeline, layout, sampler, 379 scheduler.Record([this, dst_region, src_region, pipeline, layout, sampler,
381 src_view](vk::CommandBuffer cmdbuf) { 380 src_view](vk::CommandBuffer cmdbuf) {
@@ -398,10 +397,13 @@ void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer,
398 Tegra::Engines::Fermi2D::Operation operation) { 397 Tegra::Engines::Fermi2D::Operation operation) {
399 ASSERT(filter == Tegra::Engines::Fermi2D::Filter::Point); 398 ASSERT(filter == Tegra::Engines::Fermi2D::Filter::Point);
400 ASSERT(operation == Tegra::Engines::Fermi2D::Operation::SrcCopy); 399 ASSERT(operation == Tegra::Engines::Fermi2D::Operation::SrcCopy);
401 400 const BlitImagePipelineKey key{
401 .renderpass = dst_framebuffer->RenderPass(),
402 .operation = operation,
403 };
402 const VkPipelineLayout layout = *two_textures_pipeline_layout; 404 const VkPipelineLayout layout = *two_textures_pipeline_layout;
403 const VkSampler sampler = *nearest_sampler; 405 const VkSampler sampler = *nearest_sampler;
404 const VkPipeline pipeline = BlitDepthStencilPipeline(dst_framebuffer->RenderPass()); 406 const VkPipeline pipeline = FindOrEmplaceDepthStencilPipeline(key);
405 scheduler.RequestRenderpass(dst_framebuffer); 407 scheduler.RequestRenderpass(dst_framebuffer);
406 scheduler.Record([dst_region, src_region, pipeline, layout, sampler, src_depth_view, 408 scheduler.Record([dst_region, src_region, pipeline, layout, sampler, src_depth_view,
407 src_stencil_view, this](vk::CommandBuffer cmdbuf) { 409 src_stencil_view, this](vk::CommandBuffer cmdbuf) {
@@ -419,40 +421,45 @@ void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer,
419} 421}
420 422
421void BlitImageHelper::ConvertD32ToR32(const Framebuffer* dst_framebuffer, 423void BlitImageHelper::ConvertD32ToR32(const Framebuffer* dst_framebuffer,
422 const ImageView& src_image_view) { 424 const ImageView& src_image_view, u32 up_scale,
425 u32 down_shift) {
423 ConvertDepthToColorPipeline(convert_d32_to_r32_pipeline, dst_framebuffer->RenderPass()); 426 ConvertDepthToColorPipeline(convert_d32_to_r32_pipeline, dst_framebuffer->RenderPass());
424 Convert(*convert_d32_to_r32_pipeline, dst_framebuffer, src_image_view); 427 Convert(*convert_d32_to_r32_pipeline, dst_framebuffer, src_image_view, up_scale, down_shift);
425} 428}
426 429
427void BlitImageHelper::ConvertR32ToD32(const Framebuffer* dst_framebuffer, 430void BlitImageHelper::ConvertR32ToD32(const Framebuffer* dst_framebuffer,
428 const ImageView& src_image_view) { 431 const ImageView& src_image_view, u32 up_scale,
432 u32 down_shift) {
429 ConvertColorToDepthPipeline(convert_r32_to_d32_pipeline, dst_framebuffer->RenderPass()); 433 ConvertColorToDepthPipeline(convert_r32_to_d32_pipeline, dst_framebuffer->RenderPass());
430 Convert(*convert_r32_to_d32_pipeline, dst_framebuffer, src_image_view); 434 Convert(*convert_r32_to_d32_pipeline, dst_framebuffer, src_image_view, up_scale, down_shift);
431} 435}
432 436
433void BlitImageHelper::ConvertD16ToR16(const Framebuffer* dst_framebuffer, 437void BlitImageHelper::ConvertD16ToR16(const Framebuffer* dst_framebuffer,
434 const ImageView& src_image_view) { 438 const ImageView& src_image_view, u32 up_scale,
439 u32 down_shift) {
435 ConvertDepthToColorPipeline(convert_d16_to_r16_pipeline, dst_framebuffer->RenderPass()); 440 ConvertDepthToColorPipeline(convert_d16_to_r16_pipeline, dst_framebuffer->RenderPass());
436 Convert(*convert_d16_to_r16_pipeline, dst_framebuffer, src_image_view); 441 Convert(*convert_d16_to_r16_pipeline, dst_framebuffer, src_image_view, up_scale, down_shift);
437} 442}
438 443
439void BlitImageHelper::ConvertR16ToD16(const Framebuffer* dst_framebuffer, 444void BlitImageHelper::ConvertR16ToD16(const Framebuffer* dst_framebuffer,
440 const ImageView& src_image_view) { 445 const ImageView& src_image_view, u32 up_scale,
446 u32 down_shift) {
441 ConvertColorToDepthPipeline(convert_r16_to_d16_pipeline, dst_framebuffer->RenderPass()); 447 ConvertColorToDepthPipeline(convert_r16_to_d16_pipeline, dst_framebuffer->RenderPass());
442 Convert(*convert_r16_to_d16_pipeline, dst_framebuffer, src_image_view); 448 Convert(*convert_r16_to_d16_pipeline, dst_framebuffer, src_image_view, up_scale, down_shift);
443} 449}
444 450
445void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, 451void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer,
446 const ImageView& src_image_view) { 452 const ImageView& src_image_view, u32 up_scale, u32 down_shift) {
447 const VkPipelineLayout layout = *one_texture_pipeline_layout; 453 const VkPipelineLayout layout = *one_texture_pipeline_layout;
448 const VkImageView src_view = src_image_view.Handle(Shader::TextureType::Color2D); 454 const VkImageView src_view = src_image_view.Handle(Shader::TextureType::Color2D);
449 const VkSampler sampler = *nearest_sampler; 455 const VkSampler sampler = *nearest_sampler;
450 const VkExtent2D extent{ 456 const VkExtent2D extent{
451 .width = src_image_view.size.width, 457 .width = std::max((src_image_view.size.width * up_scale) >> down_shift, 1U),
452 .height = src_image_view.size.height, 458 .height = std::max((src_image_view.size.height * up_scale) >> down_shift, 1U),
453 }; 459 };
454 scheduler.RequestRenderpass(dst_framebuffer); 460 scheduler.RequestRenderpass(dst_framebuffer);
455 scheduler.Record([pipeline, layout, sampler, src_view, extent, this](vk::CommandBuffer cmdbuf) { 461 scheduler.Record([pipeline, layout, sampler, src_view, extent, up_scale, down_shift,
462 this](vk::CommandBuffer cmdbuf) {
456 const VkOffset2D offset{ 463 const VkOffset2D offset{
457 .x = 0, 464 .x = 0,
458 .y = 0, 465 .y = 0,
@@ -488,7 +495,7 @@ void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_frameb
488 scheduler.InvalidateState(); 495 scheduler.InvalidateState();
489} 496}
490 497
491VkPipeline BlitImageHelper::FindOrEmplacePipeline(const BlitImagePipelineKey& key) { 498VkPipeline BlitImageHelper::FindOrEmplaceColorPipeline(const BlitImagePipelineKey& key) {
492 const auto it = std::ranges::find(blit_color_keys, key); 499 const auto it = std::ranges::find(blit_color_keys, key);
493 if (it != blit_color_keys.end()) { 500 if (it != blit_color_keys.end()) {
494 return *blit_color_pipelines[std::distance(blit_color_keys.begin(), it)]; 501 return *blit_color_pipelines[std::distance(blit_color_keys.begin(), it)];
@@ -542,12 +549,14 @@ VkPipeline BlitImageHelper::FindOrEmplacePipeline(const BlitImagePipelineKey& ke
542 return *blit_color_pipelines.back(); 549 return *blit_color_pipelines.back();
543} 550}
544 551
545VkPipeline BlitImageHelper::BlitDepthStencilPipeline(VkRenderPass renderpass) { 552VkPipeline BlitImageHelper::FindOrEmplaceDepthStencilPipeline(const BlitImagePipelineKey& key) {
546 if (blit_depth_stencil_pipeline) { 553 const auto it = std::ranges::find(blit_depth_stencil_keys, key);
547 return *blit_depth_stencil_pipeline; 554 if (it != blit_depth_stencil_keys.end()) {
555 return *blit_depth_stencil_pipelines[std::distance(blit_depth_stencil_keys.begin(), it)];
548 } 556 }
557 blit_depth_stencil_keys.push_back(key);
549 const std::array stages = MakeStages(*full_screen_vert, *blit_depth_stencil_frag); 558 const std::array stages = MakeStages(*full_screen_vert, *blit_depth_stencil_frag);
550 blit_depth_stencil_pipeline = device.GetLogical().CreateGraphicsPipeline({ 559 blit_depth_stencil_pipelines.push_back(device.GetLogical().CreateGraphicsPipeline({
551 .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, 560 .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
552 .pNext = nullptr, 561 .pNext = nullptr,
553 .flags = 0, 562 .flags = 0,
@@ -560,15 +569,15 @@ VkPipeline BlitImageHelper::BlitDepthStencilPipeline(VkRenderPass renderpass) {
560 .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO, 569 .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
561 .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, 570 .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
562 .pDepthStencilState = &PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, 571 .pDepthStencilState = &PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
563 .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_EMPTY_CREATE_INFO, 572 .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_GENERIC_CREATE_INFO,
564 .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO, 573 .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO,
565 .layout = *two_textures_pipeline_layout, 574 .layout = *two_textures_pipeline_layout,
566 .renderPass = renderpass, 575 .renderPass = key.renderpass,
567 .subpass = 0, 576 .subpass = 0,
568 .basePipelineHandle = VK_NULL_HANDLE, 577 .basePipelineHandle = VK_NULL_HANDLE,
569 .basePipelineIndex = 0, 578 .basePipelineIndex = 0,
570 }); 579 }));
571 return *blit_depth_stencil_pipeline; 580 return *blit_depth_stencil_pipelines.back();
572} 581}
573 582
574void BlitImageHelper::ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass) { 583void BlitImageHelper::ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass) {
diff --git a/src/video_core/renderer_vulkan/blit_image.h b/src/video_core/renderer_vulkan/blit_image.h
index 33ee095c1..d77f76678 100644
--- a/src/video_core/renderer_vulkan/blit_image.h
+++ b/src/video_core/renderer_vulkan/blit_image.h
@@ -34,7 +34,7 @@ public:
34 StateTracker& state_tracker, DescriptorPool& descriptor_pool); 34 StateTracker& state_tracker, DescriptorPool& descriptor_pool);
35 ~BlitImageHelper(); 35 ~BlitImageHelper();
36 36
37 void BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, 37 void BlitColor(const Framebuffer* dst_framebuffer, VkImageView src_image_view,
38 const Region2D& dst_region, const Region2D& src_region, 38 const Region2D& dst_region, const Region2D& src_region,
39 Tegra::Engines::Fermi2D::Filter filter, 39 Tegra::Engines::Fermi2D::Filter filter,
40 Tegra::Engines::Fermi2D::Operation operation); 40 Tegra::Engines::Fermi2D::Operation operation);
@@ -44,21 +44,25 @@ public:
44 const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, 44 const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter,
45 Tegra::Engines::Fermi2D::Operation operation); 45 Tegra::Engines::Fermi2D::Operation operation);
46 46
47 void ConvertD32ToR32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); 47 void ConvertD32ToR32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view,
48 u32 up_scale, u32 down_shift);
48 49
49 void ConvertR32ToD32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); 50 void ConvertR32ToD32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view,
51 u32 up_scale, u32 down_shift);
50 52
51 void ConvertD16ToR16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); 53 void ConvertD16ToR16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view,
54 u32 up_scale, u32 down_shift);
52 55
53 void ConvertR16ToD16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); 56 void ConvertR16ToD16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view,
57 u32 up_scale, u32 down_shift);
54 58
55private: 59private:
56 void Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, 60 void Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer,
57 const ImageView& src_image_view); 61 const ImageView& src_image_view, u32 up_scale, u32 down_shift);
58 62
59 [[nodiscard]] VkPipeline FindOrEmplacePipeline(const BlitImagePipelineKey& key); 63 [[nodiscard]] VkPipeline FindOrEmplaceColorPipeline(const BlitImagePipelineKey& key);
60 64
61 [[nodiscard]] VkPipeline BlitDepthStencilPipeline(VkRenderPass renderpass); 65 [[nodiscard]] VkPipeline FindOrEmplaceDepthStencilPipeline(const BlitImagePipelineKey& key);
62 66
63 void ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass); 67 void ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass);
64 68
@@ -84,7 +88,8 @@ private:
84 88
85 std::vector<BlitImagePipelineKey> blit_color_keys; 89 std::vector<BlitImagePipelineKey> blit_color_keys;
86 std::vector<vk::Pipeline> blit_color_pipelines; 90 std::vector<vk::Pipeline> blit_color_pipelines;
87 vk::Pipeline blit_depth_stencil_pipeline; 91 std::vector<BlitImagePipelineKey> blit_depth_stencil_keys;
92 std::vector<vk::Pipeline> blit_depth_stencil_pipelines;
88 vk::Pipeline convert_d32_to_r32_pipeline; 93 vk::Pipeline convert_d32_to_r32_pipeline;
89 vk::Pipeline convert_r32_to_d32_pipeline; 94 vk::Pipeline convert_r32_to_d32_pipeline;
90 vk::Pipeline convert_d16_to_r16_pipeline; 95 vk::Pipeline convert_d16_to_r16_pipeline;
diff --git a/src/video_core/renderer_vulkan/pipeline_helper.h b/src/video_core/renderer_vulkan/pipeline_helper.h
index 4847db6b6..11c160570 100644
--- a/src/video_core/renderer_vulkan/pipeline_helper.h
+++ b/src/video_core/renderer_vulkan/pipeline_helper.h
@@ -10,6 +10,7 @@
10 10
11#include "common/assert.h" 11#include "common/assert.h"
12#include "common/common_types.h" 12#include "common/common_types.h"
13#include "shader_recompiler/backend/spirv/emit_spirv.h"
13#include "shader_recompiler/shader_info.h" 14#include "shader_recompiler/shader_info.h"
14#include "video_core/renderer_vulkan/vk_texture_cache.h" 15#include "video_core/renderer_vulkan/vk_texture_cache.h"
15#include "video_core/renderer_vulkan/vk_update_descriptor.h" 16#include "video_core/renderer_vulkan/vk_update_descriptor.h"
@@ -20,6 +21,8 @@
20 21
21namespace Vulkan { 22namespace Vulkan {
22 23
24using Shader::Backend::SPIRV::NUM_TEXTURE_AND_IMAGE_SCALING_WORDS;
25
23class DescriptorLayoutBuilder { 26class DescriptorLayoutBuilder {
24public: 27public:
25 DescriptorLayoutBuilder(const Device& device_) : device{&device_} {} 28 DescriptorLayoutBuilder(const Device& device_) : device{&device_} {}
@@ -68,18 +71,28 @@ public:
68 } 71 }
69 72
70 vk::PipelineLayout CreatePipelineLayout(VkDescriptorSetLayout descriptor_set_layout) const { 73 vk::PipelineLayout CreatePipelineLayout(VkDescriptorSetLayout descriptor_set_layout) const {
74 using Shader::Backend::SPIRV::RescalingLayout;
75 const u32 size_offset = is_compute ? sizeof(RescalingLayout::down_factor) : 0u;
76 const VkPushConstantRange range{
77 .stageFlags = static_cast<VkShaderStageFlags>(
78 is_compute ? VK_SHADER_STAGE_COMPUTE_BIT : VK_SHADER_STAGE_ALL_GRAPHICS),
79 .offset = 0,
80 .size = static_cast<u32>(sizeof(RescalingLayout)) - size_offset,
81 };
71 return device->GetLogical().CreatePipelineLayout({ 82 return device->GetLogical().CreatePipelineLayout({
72 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, 83 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
73 .pNext = nullptr, 84 .pNext = nullptr,
74 .flags = 0, 85 .flags = 0,
75 .setLayoutCount = descriptor_set_layout ? 1U : 0U, 86 .setLayoutCount = descriptor_set_layout ? 1U : 0U,
76 .pSetLayouts = bindings.empty() ? nullptr : &descriptor_set_layout, 87 .pSetLayouts = bindings.empty() ? nullptr : &descriptor_set_layout,
77 .pushConstantRangeCount = 0, 88 .pushConstantRangeCount = 1,
78 .pPushConstantRanges = nullptr, 89 .pPushConstantRanges = &range,
79 }); 90 });
80 } 91 }
81 92
82 void Add(const Shader::Info& info, VkShaderStageFlags stage) { 93 void Add(const Shader::Info& info, VkShaderStageFlags stage) {
94 is_compute |= (stage & VK_SHADER_STAGE_COMPUTE_BIT) != 0;
95
83 Add(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, stage, info.constant_buffer_descriptors); 96 Add(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, stage, info.constant_buffer_descriptors);
84 Add(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, stage, info.storage_buffers_descriptors); 97 Add(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, stage, info.storage_buffers_descriptors);
85 Add(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, stage, info.texture_buffer_descriptors); 98 Add(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, stage, info.texture_buffer_descriptors);
@@ -115,6 +128,7 @@ private:
115 } 128 }
116 129
117 const Device* device{}; 130 const Device* device{};
131 bool is_compute{};
118 boost::container::small_vector<VkDescriptorSetLayoutBinding, 32> bindings; 132 boost::container::small_vector<VkDescriptorSetLayoutBinding, 32> bindings;
119 boost::container::small_vector<VkDescriptorUpdateTemplateEntryKHR, 32> entries; 133 boost::container::small_vector<VkDescriptorUpdateTemplateEntryKHR, 32> entries;
120 u32 binding{}; 134 u32 binding{};
@@ -122,31 +136,68 @@ private:
122 size_t offset{}; 136 size_t offset{};
123}; 137};
124 138
125inline void PushImageDescriptors(const Shader::Info& info, const VkSampler*& samplers, 139class RescalingPushConstant {
126 const ImageId*& image_view_ids, TextureCache& texture_cache, 140public:
127 VKUpdateDescriptorQueue& update_descriptor_queue) { 141 explicit RescalingPushConstant() noexcept {}
128 for (const auto& desc : info.texture_buffer_descriptors) { 142
129 image_view_ids += desc.count; 143 void PushTexture(bool is_rescaled) noexcept {
144 *texture_ptr |= is_rescaled ? texture_bit : 0u;
145 texture_bit <<= 1u;
146 if (texture_bit == 0u) {
147 texture_bit = 1u;
148 ++texture_ptr;
149 }
130 } 150 }
131 for (const auto& desc : info.image_buffer_descriptors) { 151
132 image_view_ids += desc.count; 152 void PushImage(bool is_rescaled) noexcept {
153 *image_ptr |= is_rescaled ? image_bit : 0u;
154 image_bit <<= 1u;
155 if (image_bit == 0u) {
156 image_bit = 1u;
157 ++image_ptr;
158 }
133 } 159 }
160
161 const std::array<u32, NUM_TEXTURE_AND_IMAGE_SCALING_WORDS>& Data() const noexcept {
162 return words;
163 }
164
165private:
166 std::array<u32, NUM_TEXTURE_AND_IMAGE_SCALING_WORDS> words{};
167 u32* texture_ptr{words.data()};
168 u32* image_ptr{words.data() + Shader::Backend::SPIRV::NUM_TEXTURE_SCALING_WORDS};
169 u32 texture_bit{1u};
170 u32 image_bit{1u};
171};
172
173inline void PushImageDescriptors(TextureCache& texture_cache,
174 VKUpdateDescriptorQueue& update_descriptor_queue,
175 const Shader::Info& info, RescalingPushConstant& rescaling,
176 const VkSampler*& samplers,
177 const VideoCommon::ImageViewInOut*& views) {
178 const u32 num_texture_buffers = Shader::NumDescriptors(info.texture_buffer_descriptors);
179 const u32 num_image_buffers = Shader::NumDescriptors(info.image_buffer_descriptors);
180 views += num_texture_buffers;
181 views += num_image_buffers;
134 for (const auto& desc : info.texture_descriptors) { 182 for (const auto& desc : info.texture_descriptors) {
135 for (u32 index = 0; index < desc.count; ++index) { 183 for (u32 index = 0; index < desc.count; ++index) {
184 const VideoCommon::ImageViewId image_view_id{(views++)->id};
136 const VkSampler sampler{*(samplers++)}; 185 const VkSampler sampler{*(samplers++)};
137 ImageView& image_view{texture_cache.GetImageView(*(image_view_ids++))}; 186 ImageView& image_view{texture_cache.GetImageView(image_view_id)};
138 const VkImageView vk_image_view{image_view.Handle(desc.type)}; 187 const VkImageView vk_image_view{image_view.Handle(desc.type)};
139 update_descriptor_queue.AddSampledImage(vk_image_view, sampler); 188 update_descriptor_queue.AddSampledImage(vk_image_view, sampler);
189 rescaling.PushTexture(texture_cache.IsRescaling(image_view));
140 } 190 }
141 } 191 }
142 for (const auto& desc : info.image_descriptors) { 192 for (const auto& desc : info.image_descriptors) {
143 for (u32 index = 0; index < desc.count; ++index) { 193 for (u32 index = 0; index < desc.count; ++index) {
144 ImageView& image_view{texture_cache.GetImageView(*(image_view_ids++))}; 194 ImageView& image_view{texture_cache.GetImageView((views++)->id)};
145 if (desc.is_written) { 195 if (desc.is_written) {
146 texture_cache.MarkModification(image_view.image_id); 196 texture_cache.MarkModification(image_view.image_id);
147 } 197 }
148 const VkImageView vk_image_view{image_view.StorageView(desc.type, desc.format)}; 198 const VkImageView vk_image_view{image_view.StorageView(desc.type, desc.format)};
149 update_descriptor_queue.AddImage(vk_image_view); 199 update_descriptor_queue.AddImage(vk_image_view);
200 rescaling.PushImage(texture_cache.IsRescaling(image_view));
150 } 201 }
151 } 202 }
152} 203}
diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
index 888bc7392..1e447e621 100644
--- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
@@ -12,14 +12,22 @@
12#include "common/assert.h" 12#include "common/assert.h"
13#include "common/common_types.h" 13#include "common/common_types.h"
14#include "common/math_util.h" 14#include "common/math_util.h"
15#include "common/settings.h"
15#include "core/core.h" 16#include "core/core.h"
16#include "core/frontend/emu_window.h" 17#include "core/frontend/emu_window.h"
17#include "core/memory.h" 18#include "core/memory.h"
18#include "video_core/gpu.h" 19#include "video_core/gpu.h"
20#include "video_core/host_shaders/fxaa_frag_spv.h"
21#include "video_core/host_shaders/fxaa_vert_spv.h"
22#include "video_core/host_shaders/present_bicubic_frag_spv.h"
23#include "video_core/host_shaders/present_gaussian_frag_spv.h"
19#include "video_core/host_shaders/vulkan_present_frag_spv.h" 24#include "video_core/host_shaders/vulkan_present_frag_spv.h"
25#include "video_core/host_shaders/vulkan_present_scaleforce_fp16_frag_spv.h"
26#include "video_core/host_shaders/vulkan_present_scaleforce_fp32_frag_spv.h"
20#include "video_core/host_shaders/vulkan_present_vert_spv.h" 27#include "video_core/host_shaders/vulkan_present_vert_spv.h"
21#include "video_core/renderer_vulkan/renderer_vulkan.h" 28#include "video_core/renderer_vulkan/renderer_vulkan.h"
22#include "video_core/renderer_vulkan/vk_blit_screen.h" 29#include "video_core/renderer_vulkan/vk_blit_screen.h"
30#include "video_core/renderer_vulkan/vk_fsr.h"
23#include "video_core/renderer_vulkan/vk_master_semaphore.h" 31#include "video_core/renderer_vulkan/vk_master_semaphore.h"
24#include "video_core/renderer_vulkan/vk_scheduler.h" 32#include "video_core/renderer_vulkan/vk_scheduler.h"
25#include "video_core/renderer_vulkan/vk_shader_util.h" 33#include "video_core/renderer_vulkan/vk_shader_util.h"
@@ -144,8 +152,8 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer,
144 scheduler.Wait(resource_ticks[image_index]); 152 scheduler.Wait(resource_ticks[image_index]);
145 resource_ticks[image_index] = scheduler.CurrentTick(); 153 resource_ticks[image_index] = scheduler.CurrentTick();
146 154
147 UpdateDescriptorSet(image_index, 155 VkImageView source_image_view =
148 use_accelerated ? screen_info.image_view : *raw_image_views[image_index]); 156 use_accelerated ? screen_info.image_view : *raw_image_views[image_index];
149 157
150 BufferData data; 158 BufferData data;
151 SetUniformData(data, layout); 159 SetUniformData(data, layout);
@@ -222,9 +230,134 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer,
222 read_barrier); 230 read_barrier);
223 cmdbuf.CopyBufferToImage(*buffer, image, VK_IMAGE_LAYOUT_GENERAL, copy); 231 cmdbuf.CopyBufferToImage(*buffer, image, VK_IMAGE_LAYOUT_GENERAL, copy);
224 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, 232 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
225 VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, write_barrier); 233 VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
234 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
235 0, write_barrier);
226 }); 236 });
227 } 237 }
238
239 const auto anti_alias_pass = Settings::values.anti_aliasing.GetValue();
240 if (use_accelerated && anti_alias_pass != Settings::AntiAliasing::None) {
241 UpdateAADescriptorSet(image_index, source_image_view, false);
242 const u32 up_scale = Settings::values.resolution_info.up_scale;
243 const u32 down_shift = Settings::values.resolution_info.down_shift;
244 VkExtent2D size{
245 .width = (up_scale * framebuffer.width) >> down_shift,
246 .height = (up_scale * framebuffer.height) >> down_shift,
247 };
248 scheduler.Record([this, image_index, size, anti_alias_pass](vk::CommandBuffer cmdbuf) {
249 const VkImageMemoryBarrier base_barrier{
250 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
251 .pNext = nullptr,
252 .srcAccessMask = 0,
253 .dstAccessMask = 0,
254 .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
255 .newLayout = VK_IMAGE_LAYOUT_GENERAL,
256 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
257 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
258 .image = {},
259 .subresourceRange =
260 {
261 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
262 .baseMipLevel = 0,
263 .levelCount = 1,
264 .baseArrayLayer = 0,
265 .layerCount = 1,
266 },
267 };
268
269 {
270 VkImageMemoryBarrier fsr_write_barrier = base_barrier;
271 fsr_write_barrier.image = *aa_image;
272 fsr_write_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
273 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
274 VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, fsr_write_barrier);
275 }
276
277 const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f;
278 const f32 bg_green = Settings::values.bg_green.GetValue() / 255.0f;
279 const f32 bg_blue = Settings::values.bg_blue.GetValue() / 255.0f;
280 const VkClearValue clear_color{
281 .color = {.float32 = {bg_red, bg_green, bg_blue, 1.0f}},
282 };
283 const VkRenderPassBeginInfo renderpass_bi{
284 .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
285 .pNext = nullptr,
286 .renderPass = *aa_renderpass,
287 .framebuffer = *aa_framebuffer,
288 .renderArea =
289 {
290 .offset = {0, 0},
291 .extent = size,
292 },
293 .clearValueCount = 1,
294 .pClearValues = &clear_color,
295 };
296 const VkViewport viewport{
297 .x = 0.0f,
298 .y = 0.0f,
299 .width = static_cast<float>(size.width),
300 .height = static_cast<float>(size.height),
301 .minDepth = 0.0f,
302 .maxDepth = 1.0f,
303 };
304 const VkRect2D scissor{
305 .offset = {0, 0},
306 .extent = size,
307 };
308 cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE);
309 switch (anti_alias_pass) {
310 case Settings::AntiAliasing::Fxaa:
311 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *aa_pipeline);
312 break;
313 default:
314 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *aa_pipeline);
315 break;
316 }
317 cmdbuf.SetViewport(0, viewport);
318 cmdbuf.SetScissor(0, scissor);
319
320 cmdbuf.BindVertexBuffer(0, *buffer, offsetof(BufferData, vertices));
321 cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, *aa_pipeline_layout, 0,
322 aa_descriptor_sets[image_index], {});
323 cmdbuf.Draw(4, 1, 0, 0);
324 cmdbuf.EndRenderPass();
325
326 {
327 VkImageMemoryBarrier blit_read_barrier = base_barrier;
328 blit_read_barrier.image = *aa_image;
329 blit_read_barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
330 blit_read_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
331
332 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
333 VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, blit_read_barrier);
334 }
335 });
336 source_image_view = *aa_image_view;
337 }
338
339 if (fsr) {
340 auto crop_rect = framebuffer.crop_rect;
341 if (crop_rect.GetWidth() == 0) {
342 crop_rect.right = framebuffer.width;
343 }
344 if (crop_rect.GetHeight() == 0) {
345 crop_rect.bottom = framebuffer.height;
346 }
347 crop_rect = crop_rect.Scale(Settings::values.resolution_info.up_factor);
348 VkExtent2D fsr_input_size{
349 .width = Settings::values.resolution_info.ScaleUp(framebuffer.width),
350 .height = Settings::values.resolution_info.ScaleUp(framebuffer.height),
351 };
352 VkImageView fsr_image_view =
353 fsr->Draw(scheduler, image_index, source_image_view, fsr_input_size, crop_rect);
354 UpdateDescriptorSet(image_index, fsr_image_view, true);
355 } else {
356 const bool is_nn =
357 Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::NearestNeighbor;
358 UpdateDescriptorSet(image_index, source_image_view, is_nn);
359 }
360
228 scheduler.Record( 361 scheduler.Record(
229 [this, host_framebuffer, image_index, size = render_area](vk::CommandBuffer cmdbuf) { 362 [this, host_framebuffer, image_index, size = render_area](vk::CommandBuffer cmdbuf) {
230 const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f; 363 const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f;
@@ -258,8 +391,28 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer,
258 .offset = {0, 0}, 391 .offset = {0, 0},
259 .extent = size, 392 .extent = size,
260 }; 393 };
394 const auto filter = Settings::values.scaling_filter.GetValue();
261 cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE); 395 cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE);
262 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline); 396 switch (filter) {
397 case Settings::ScalingFilter::NearestNeighbor:
398 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *bilinear_pipeline);
399 break;
400 case Settings::ScalingFilter::Bilinear:
401 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *bilinear_pipeline);
402 break;
403 case Settings::ScalingFilter::Bicubic:
404 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *bicubic_pipeline);
405 break;
406 case Settings::ScalingFilter::Gaussian:
407 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *gaussian_pipeline);
408 break;
409 case Settings::ScalingFilter::ScaleForce:
410 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *scaleforce_pipeline);
411 break;
412 default:
413 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *bilinear_pipeline);
414 break;
415 }
263 cmdbuf.SetViewport(0, viewport); 416 cmdbuf.SetViewport(0, viewport);
264 cmdbuf.SetScissor(0, scissor); 417 cmdbuf.SetScissor(0, scissor);
265 418
@@ -281,11 +434,16 @@ VkSemaphore VKBlitScreen::DrawToSwapchain(const Tegra::FramebufferConfig& frameb
281} 434}
282 435
283vk::Framebuffer VKBlitScreen::CreateFramebuffer(const VkImageView& image_view, VkExtent2D extent) { 436vk::Framebuffer VKBlitScreen::CreateFramebuffer(const VkImageView& image_view, VkExtent2D extent) {
437 return CreateFramebuffer(image_view, extent, renderpass);
438}
439
440vk::Framebuffer VKBlitScreen::CreateFramebuffer(const VkImageView& image_view, VkExtent2D extent,
441 vk::RenderPass& rd) {
284 return device.GetLogical().CreateFramebuffer(VkFramebufferCreateInfo{ 442 return device.GetLogical().CreateFramebuffer(VkFramebufferCreateInfo{
285 .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, 443 .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
286 .pNext = nullptr, 444 .pNext = nullptr,
287 .flags = 0, 445 .flags = 0,
288 .renderPass = *renderpass, 446 .renderPass = *rd,
289 .attachmentCount = 1, 447 .attachmentCount = 1,
290 .pAttachments = &image_view, 448 .pAttachments = &image_view,
291 .width = extent.width, 449 .width = extent.width,
@@ -308,9 +466,21 @@ void VKBlitScreen::CreateDynamicResources() {
308 CreateRenderPass(); 466 CreateRenderPass();
309 CreateFramebuffers(); 467 CreateFramebuffers();
310 CreateGraphicsPipeline(); 468 CreateGraphicsPipeline();
469 fsr.reset();
470 if (Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::Fsr) {
471 CreateFSR();
472 }
311} 473}
312 474
313void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer) { 475void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer) {
476 if (Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::Fsr) {
477 if (!fsr) {
478 CreateFSR();
479 }
480 } else {
481 fsr.reset();
482 }
483
314 if (framebuffer.width == raw_width && framebuffer.height == raw_height && !raw_images.empty()) { 484 if (framebuffer.width == raw_width && framebuffer.height == raw_height && !raw_images.empty()) {
315 return; 485 return;
316 } 486 }
@@ -324,7 +494,16 @@ void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer)
324 494
325void VKBlitScreen::CreateShaders() { 495void VKBlitScreen::CreateShaders() {
326 vertex_shader = BuildShader(device, VULKAN_PRESENT_VERT_SPV); 496 vertex_shader = BuildShader(device, VULKAN_PRESENT_VERT_SPV);
327 fragment_shader = BuildShader(device, VULKAN_PRESENT_FRAG_SPV); 497 fxaa_vertex_shader = BuildShader(device, FXAA_VERT_SPV);
498 fxaa_fragment_shader = BuildShader(device, FXAA_FRAG_SPV);
499 bilinear_fragment_shader = BuildShader(device, VULKAN_PRESENT_FRAG_SPV);
500 bicubic_fragment_shader = BuildShader(device, PRESENT_BICUBIC_FRAG_SPV);
501 gaussian_fragment_shader = BuildShader(device, PRESENT_GAUSSIAN_FRAG_SPV);
502 if (device.IsFloat16Supported()) {
503 scaleforce_fragment_shader = BuildShader(device, VULKAN_PRESENT_SCALEFORCE_FP16_FRAG_SPV);
504 } else {
505 scaleforce_fragment_shader = BuildShader(device, VULKAN_PRESENT_SCALEFORCE_FP32_FRAG_SPV);
506 }
328} 507}
329 508
330void VKBlitScreen::CreateSemaphores() { 509void VKBlitScreen::CreateSemaphores() {
@@ -344,6 +523,13 @@ void VKBlitScreen::CreateDescriptorPool() {
344 }, 523 },
345 }}; 524 }};
346 525
526 const std::array<VkDescriptorPoolSize, 1> pool_sizes_aa{{
527 {
528 .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
529 .descriptorCount = static_cast<u32>(image_count * 2),
530 },
531 }};
532
347 const VkDescriptorPoolCreateInfo ci{ 533 const VkDescriptorPoolCreateInfo ci{
348 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, 534 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
349 .pNext = nullptr, 535 .pNext = nullptr,
@@ -353,19 +539,33 @@ void VKBlitScreen::CreateDescriptorPool() {
353 .pPoolSizes = pool_sizes.data(), 539 .pPoolSizes = pool_sizes.data(),
354 }; 540 };
355 descriptor_pool = device.GetLogical().CreateDescriptorPool(ci); 541 descriptor_pool = device.GetLogical().CreateDescriptorPool(ci);
542
543 const VkDescriptorPoolCreateInfo ci_aa{
544 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
545 .pNext = nullptr,
546 .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
547 .maxSets = static_cast<u32>(image_count),
548 .poolSizeCount = static_cast<u32>(pool_sizes_aa.size()),
549 .pPoolSizes = pool_sizes_aa.data(),
550 };
551 aa_descriptor_pool = device.GetLogical().CreateDescriptorPool(ci_aa);
356} 552}
357 553
358void VKBlitScreen::CreateRenderPass() { 554void VKBlitScreen::CreateRenderPass() {
555 renderpass = CreateRenderPassImpl(swapchain.GetImageViewFormat());
556}
557
558vk::RenderPass VKBlitScreen::CreateRenderPassImpl(VkFormat format, bool is_present) {
359 const VkAttachmentDescription color_attachment{ 559 const VkAttachmentDescription color_attachment{
360 .flags = 0, 560 .flags = 0,
361 .format = swapchain.GetImageViewFormat(), 561 .format = format,
362 .samples = VK_SAMPLE_COUNT_1_BIT, 562 .samples = VK_SAMPLE_COUNT_1_BIT,
363 .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, 563 .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
364 .storeOp = VK_ATTACHMENT_STORE_OP_STORE, 564 .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
365 .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, 565 .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE,
366 .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE, 566 .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE,
367 .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, 567 .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
368 .finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, 568 .finalLayout = is_present ? VK_IMAGE_LAYOUT_PRESENT_SRC_KHR : VK_IMAGE_LAYOUT_GENERAL,
369 }; 569 };
370 570
371 const VkAttachmentReference color_attachment_ref{ 571 const VkAttachmentReference color_attachment_ref{
@@ -408,7 +608,7 @@ void VKBlitScreen::CreateRenderPass() {
408 .pDependencies = &dependency, 608 .pDependencies = &dependency,
409 }; 609 };
410 610
411 renderpass = device.GetLogical().CreateRenderPass(renderpass_ci); 611 return device.GetLogical().CreateRenderPass(renderpass_ci);
412} 612}
413 613
414void VKBlitScreen::CreateDescriptorSetLayout() { 614void VKBlitScreen::CreateDescriptorSetLayout() {
@@ -429,6 +629,23 @@ void VKBlitScreen::CreateDescriptorSetLayout() {
429 }, 629 },
430 }}; 630 }};
431 631
632 const std::array<VkDescriptorSetLayoutBinding, 2> layout_bindings_aa{{
633 {
634 .binding = 0,
635 .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
636 .descriptorCount = 1,
637 .stageFlags = VK_SHADER_STAGE_VERTEX_BIT,
638 .pImmutableSamplers = nullptr,
639 },
640 {
641 .binding = 1,
642 .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
643 .descriptorCount = 1,
644 .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
645 .pImmutableSamplers = nullptr,
646 },
647 }};
648
432 const VkDescriptorSetLayoutCreateInfo ci{ 649 const VkDescriptorSetLayoutCreateInfo ci{
433 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, 650 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
434 .pNext = nullptr, 651 .pNext = nullptr,
@@ -437,11 +654,21 @@ void VKBlitScreen::CreateDescriptorSetLayout() {
437 .pBindings = layout_bindings.data(), 654 .pBindings = layout_bindings.data(),
438 }; 655 };
439 656
657 const VkDescriptorSetLayoutCreateInfo ci_aa{
658 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
659 .pNext = nullptr,
660 .flags = 0,
661 .bindingCount = static_cast<u32>(layout_bindings_aa.size()),
662 .pBindings = layout_bindings_aa.data(),
663 };
664
440 descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout(ci); 665 descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout(ci);
666 aa_descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout(ci_aa);
441} 667}
442 668
443void VKBlitScreen::CreateDescriptorSets() { 669void VKBlitScreen::CreateDescriptorSets() {
444 const std::vector layouts(image_count, *descriptor_set_layout); 670 const std::vector layouts(image_count, *descriptor_set_layout);
671 const std::vector layouts_aa(image_count, *aa_descriptor_set_layout);
445 672
446 const VkDescriptorSetAllocateInfo ai{ 673 const VkDescriptorSetAllocateInfo ai{
447 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, 674 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
@@ -451,7 +678,16 @@ void VKBlitScreen::CreateDescriptorSets() {
451 .pSetLayouts = layouts.data(), 678 .pSetLayouts = layouts.data(),
452 }; 679 };
453 680
681 const VkDescriptorSetAllocateInfo ai_aa{
682 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
683 .pNext = nullptr,
684 .descriptorPool = *aa_descriptor_pool,
685 .descriptorSetCount = static_cast<u32>(image_count),
686 .pSetLayouts = layouts_aa.data(),
687 };
688
454 descriptor_sets = descriptor_pool.Allocate(ai); 689 descriptor_sets = descriptor_pool.Allocate(ai);
690 aa_descriptor_sets = aa_descriptor_pool.Allocate(ai_aa);
455} 691}
456 692
457void VKBlitScreen::CreatePipelineLayout() { 693void VKBlitScreen::CreatePipelineLayout() {
@@ -464,11 +700,63 @@ void VKBlitScreen::CreatePipelineLayout() {
464 .pushConstantRangeCount = 0, 700 .pushConstantRangeCount = 0,
465 .pPushConstantRanges = nullptr, 701 .pPushConstantRanges = nullptr,
466 }; 702 };
703 const VkPipelineLayoutCreateInfo ci_aa{
704 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
705 .pNext = nullptr,
706 .flags = 0,
707 .setLayoutCount = 1,
708 .pSetLayouts = aa_descriptor_set_layout.address(),
709 .pushConstantRangeCount = 0,
710 .pPushConstantRanges = nullptr,
711 };
467 pipeline_layout = device.GetLogical().CreatePipelineLayout(ci); 712 pipeline_layout = device.GetLogical().CreatePipelineLayout(ci);
713 aa_pipeline_layout = device.GetLogical().CreatePipelineLayout(ci_aa);
468} 714}
469 715
470void VKBlitScreen::CreateGraphicsPipeline() { 716void VKBlitScreen::CreateGraphicsPipeline() {
471 const std::array<VkPipelineShaderStageCreateInfo, 2> shader_stages{{ 717 const std::array<VkPipelineShaderStageCreateInfo, 2> bilinear_shader_stages{{
718 {
719 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
720 .pNext = nullptr,
721 .flags = 0,
722 .stage = VK_SHADER_STAGE_VERTEX_BIT,
723 .module = *vertex_shader,
724 .pName = "main",
725 .pSpecializationInfo = nullptr,
726 },
727 {
728 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
729 .pNext = nullptr,
730 .flags = 0,
731 .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
732 .module = *bilinear_fragment_shader,
733 .pName = "main",
734 .pSpecializationInfo = nullptr,
735 },
736 }};
737
738 const std::array<VkPipelineShaderStageCreateInfo, 2> bicubic_shader_stages{{
739 {
740 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
741 .pNext = nullptr,
742 .flags = 0,
743 .stage = VK_SHADER_STAGE_VERTEX_BIT,
744 .module = *vertex_shader,
745 .pName = "main",
746 .pSpecializationInfo = nullptr,
747 },
748 {
749 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
750 .pNext = nullptr,
751 .flags = 0,
752 .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
753 .module = *bicubic_fragment_shader,
754 .pName = "main",
755 .pSpecializationInfo = nullptr,
756 },
757 }};
758
759 const std::array<VkPipelineShaderStageCreateInfo, 2> gaussian_shader_stages{{
472 { 760 {
473 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 761 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
474 .pNext = nullptr, 762 .pNext = nullptr,
@@ -483,7 +771,28 @@ void VKBlitScreen::CreateGraphicsPipeline() {
483 .pNext = nullptr, 771 .pNext = nullptr,
484 .flags = 0, 772 .flags = 0,
485 .stage = VK_SHADER_STAGE_FRAGMENT_BIT, 773 .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
486 .module = *fragment_shader, 774 .module = *gaussian_fragment_shader,
775 .pName = "main",
776 .pSpecializationInfo = nullptr,
777 },
778 }};
779
780 const std::array<VkPipelineShaderStageCreateInfo, 2> scaleforce_shader_stages{{
781 {
782 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
783 .pNext = nullptr,
784 .flags = 0,
785 .stage = VK_SHADER_STAGE_VERTEX_BIT,
786 .module = *vertex_shader,
787 .pName = "main",
788 .pSpecializationInfo = nullptr,
789 },
790 {
791 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
792 .pNext = nullptr,
793 .flags = 0,
794 .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
795 .module = *scaleforce_fragment_shader,
487 .pName = "main", 796 .pName = "main",
488 .pSpecializationInfo = nullptr, 797 .pSpecializationInfo = nullptr,
489 }, 798 },
@@ -583,12 +892,12 @@ void VKBlitScreen::CreateGraphicsPipeline() {
583 .pDynamicStates = dynamic_states.data(), 892 .pDynamicStates = dynamic_states.data(),
584 }; 893 };
585 894
586 const VkGraphicsPipelineCreateInfo pipeline_ci{ 895 const VkGraphicsPipelineCreateInfo bilinear_pipeline_ci{
587 .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, 896 .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
588 .pNext = nullptr, 897 .pNext = nullptr,
589 .flags = 0, 898 .flags = 0,
590 .stageCount = static_cast<u32>(shader_stages.size()), 899 .stageCount = static_cast<u32>(bilinear_shader_stages.size()),
591 .pStages = shader_stages.data(), 900 .pStages = bilinear_shader_stages.data(),
592 .pVertexInputState = &vertex_input_ci, 901 .pVertexInputState = &vertex_input_ci,
593 .pInputAssemblyState = &input_assembly_ci, 902 .pInputAssemblyState = &input_assembly_ci,
594 .pTessellationState = nullptr, 903 .pTessellationState = nullptr,
@@ -605,7 +914,76 @@ void VKBlitScreen::CreateGraphicsPipeline() {
605 .basePipelineIndex = 0, 914 .basePipelineIndex = 0,
606 }; 915 };
607 916
608 pipeline = device.GetLogical().CreateGraphicsPipeline(pipeline_ci); 917 const VkGraphicsPipelineCreateInfo bicubic_pipeline_ci{
918 .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
919 .pNext = nullptr,
920 .flags = 0,
921 .stageCount = static_cast<u32>(bicubic_shader_stages.size()),
922 .pStages = bicubic_shader_stages.data(),
923 .pVertexInputState = &vertex_input_ci,
924 .pInputAssemblyState = &input_assembly_ci,
925 .pTessellationState = nullptr,
926 .pViewportState = &viewport_state_ci,
927 .pRasterizationState = &rasterization_ci,
928 .pMultisampleState = &multisampling_ci,
929 .pDepthStencilState = nullptr,
930 .pColorBlendState = &color_blend_ci,
931 .pDynamicState = &dynamic_state_ci,
932 .layout = *pipeline_layout,
933 .renderPass = *renderpass,
934 .subpass = 0,
935 .basePipelineHandle = 0,
936 .basePipelineIndex = 0,
937 };
938
939 const VkGraphicsPipelineCreateInfo gaussian_pipeline_ci{
940 .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
941 .pNext = nullptr,
942 .flags = 0,
943 .stageCount = static_cast<u32>(gaussian_shader_stages.size()),
944 .pStages = gaussian_shader_stages.data(),
945 .pVertexInputState = &vertex_input_ci,
946 .pInputAssemblyState = &input_assembly_ci,
947 .pTessellationState = nullptr,
948 .pViewportState = &viewport_state_ci,
949 .pRasterizationState = &rasterization_ci,
950 .pMultisampleState = &multisampling_ci,
951 .pDepthStencilState = nullptr,
952 .pColorBlendState = &color_blend_ci,
953 .pDynamicState = &dynamic_state_ci,
954 .layout = *pipeline_layout,
955 .renderPass = *renderpass,
956 .subpass = 0,
957 .basePipelineHandle = 0,
958 .basePipelineIndex = 0,
959 };
960
961 const VkGraphicsPipelineCreateInfo scaleforce_pipeline_ci{
962 .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
963 .pNext = nullptr,
964 .flags = 0,
965 .stageCount = static_cast<u32>(scaleforce_shader_stages.size()),
966 .pStages = scaleforce_shader_stages.data(),
967 .pVertexInputState = &vertex_input_ci,
968 .pInputAssemblyState = &input_assembly_ci,
969 .pTessellationState = nullptr,
970 .pViewportState = &viewport_state_ci,
971 .pRasterizationState = &rasterization_ci,
972 .pMultisampleState = &multisampling_ci,
973 .pDepthStencilState = nullptr,
974 .pColorBlendState = &color_blend_ci,
975 .pDynamicState = &dynamic_state_ci,
976 .layout = *pipeline_layout,
977 .renderPass = *renderpass,
978 .subpass = 0,
979 .basePipelineHandle = 0,
980 .basePipelineIndex = 0,
981 };
982
983 bilinear_pipeline = device.GetLogical().CreateGraphicsPipeline(bilinear_pipeline_ci);
984 bicubic_pipeline = device.GetLogical().CreateGraphicsPipeline(bicubic_pipeline_ci);
985 gaussian_pipeline = device.GetLogical().CreateGraphicsPipeline(gaussian_pipeline_ci);
986 scaleforce_pipeline = device.GetLogical().CreateGraphicsPipeline(scaleforce_pipeline_ci);
609} 987}
610 988
611void VKBlitScreen::CreateSampler() { 989void VKBlitScreen::CreateSampler() {
@@ -614,8 +992,29 @@ void VKBlitScreen::CreateSampler() {
614 .pNext = nullptr, 992 .pNext = nullptr,
615 .flags = 0, 993 .flags = 0,
616 .magFilter = VK_FILTER_LINEAR, 994 .magFilter = VK_FILTER_LINEAR,
995 .minFilter = VK_FILTER_LINEAR,
996 .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST,
997 .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
998 .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
999 .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
1000 .mipLodBias = 0.0f,
1001 .anisotropyEnable = VK_FALSE,
1002 .maxAnisotropy = 0.0f,
1003 .compareEnable = VK_FALSE,
1004 .compareOp = VK_COMPARE_OP_NEVER,
1005 .minLod = 0.0f,
1006 .maxLod = 0.0f,
1007 .borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK,
1008 .unnormalizedCoordinates = VK_FALSE,
1009 };
1010
1011 const VkSamplerCreateInfo ci_nn{
1012 .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
1013 .pNext = nullptr,
1014 .flags = 0,
1015 .magFilter = VK_FILTER_NEAREST,
617 .minFilter = VK_FILTER_NEAREST, 1016 .minFilter = VK_FILTER_NEAREST,
618 .mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR, 1017 .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST,
619 .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, 1018 .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
620 .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, 1019 .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
621 .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, 1020 .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
@@ -631,6 +1030,7 @@ void VKBlitScreen::CreateSampler() {
631 }; 1030 };
632 1031
633 sampler = device.GetLogical().CreateSampler(ci); 1032 sampler = device.GetLogical().CreateSampler(ci);
1033 nn_sampler = device.GetLogical().CreateSampler(ci_nn);
634} 1034}
635 1035
636void VKBlitScreen::CreateFramebuffers() { 1036void VKBlitScreen::CreateFramebuffers() {
@@ -639,7 +1039,7 @@ void VKBlitScreen::CreateFramebuffers() {
639 1039
640 for (std::size_t i = 0; i < image_count; ++i) { 1040 for (std::size_t i = 0; i < image_count; ++i) {
641 const VkImageView image_view{swapchain.GetImageViewIndex(i)}; 1041 const VkImageView image_view{swapchain.GetImageViewIndex(i)};
642 framebuffers[i] = CreateFramebuffer(image_view, size); 1042 framebuffers[i] = CreateFramebuffer(image_view, size, renderpass);
643 } 1043 }
644} 1044}
645 1045
@@ -649,6 +1049,11 @@ void VKBlitScreen::ReleaseRawImages() {
649 } 1049 }
650 raw_images.clear(); 1050 raw_images.clear();
651 raw_buffer_commits.clear(); 1051 raw_buffer_commits.clear();
1052
1053 aa_image_view.reset();
1054 aa_image.reset();
1055 aa_commit = MemoryCommit{};
1056
652 buffer.reset(); 1057 buffer.reset();
653 buffer_commit = MemoryCommit{}; 1058 buffer_commit = MemoryCommit{};
654} 1059}
@@ -675,8 +1080,11 @@ void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer)
675 raw_image_views.resize(image_count); 1080 raw_image_views.resize(image_count);
676 raw_buffer_commits.resize(image_count); 1081 raw_buffer_commits.resize(image_count);
677 1082
678 for (size_t i = 0; i < image_count; ++i) { 1083 const auto create_image = [&](bool used_on_framebuffer = false, u32 up_scale = 1,
679 raw_images[i] = device.GetLogical().CreateImage(VkImageCreateInfo{ 1084 u32 down_shift = 0) {
1085 u32 extra_usages = used_on_framebuffer ? VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT
1086 : VK_IMAGE_USAGE_TRANSFER_DST_BIT;
1087 return device.GetLogical().CreateImage(VkImageCreateInfo{
680 .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, 1088 .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
681 .pNext = nullptr, 1089 .pNext = nullptr,
682 .flags = 0, 1090 .flags = 0,
@@ -684,26 +1092,30 @@ void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer)
684 .format = GetFormat(framebuffer), 1092 .format = GetFormat(framebuffer),
685 .extent = 1093 .extent =
686 { 1094 {
687 .width = framebuffer.width, 1095 .width = (up_scale * framebuffer.width) >> down_shift,
688 .height = framebuffer.height, 1096 .height = (up_scale * framebuffer.height) >> down_shift,
689 .depth = 1, 1097 .depth = 1,
690 }, 1098 },
691 .mipLevels = 1, 1099 .mipLevels = 1,
692 .arrayLayers = 1, 1100 .arrayLayers = 1,
693 .samples = VK_SAMPLE_COUNT_1_BIT, 1101 .samples = VK_SAMPLE_COUNT_1_BIT,
694 .tiling = VK_IMAGE_TILING_LINEAR, 1102 .tiling = used_on_framebuffer ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR,
695 .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, 1103 .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | extra_usages,
696 .sharingMode = VK_SHARING_MODE_EXCLUSIVE, 1104 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
697 .queueFamilyIndexCount = 0, 1105 .queueFamilyIndexCount = 0,
698 .pQueueFamilyIndices = nullptr, 1106 .pQueueFamilyIndices = nullptr,
699 .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, 1107 .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
700 }); 1108 });
701 raw_buffer_commits[i] = memory_allocator.Commit(raw_images[i], MemoryUsage::DeviceLocal); 1109 };
702 raw_image_views[i] = device.GetLogical().CreateImageView(VkImageViewCreateInfo{ 1110 const auto create_commit = [&](vk::Image& image) {
1111 return memory_allocator.Commit(image, MemoryUsage::DeviceLocal);
1112 };
1113 const auto create_image_view = [&](vk::Image& image) {
1114 return device.GetLogical().CreateImageView(VkImageViewCreateInfo{
703 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, 1115 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
704 .pNext = nullptr, 1116 .pNext = nullptr,
705 .flags = 0, 1117 .flags = 0,
706 .image = *raw_images[i], 1118 .image = *image,
707 .viewType = VK_IMAGE_VIEW_TYPE_2D, 1119 .viewType = VK_IMAGE_VIEW_TYPE_2D,
708 .format = GetFormat(framebuffer), 1120 .format = GetFormat(framebuffer),
709 .components = 1121 .components =
@@ -722,10 +1134,211 @@ void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer)
722 .layerCount = 1, 1134 .layerCount = 1,
723 }, 1135 },
724 }); 1136 });
1137 };
1138
1139 for (size_t i = 0; i < image_count; ++i) {
1140 raw_images[i] = create_image();
1141 raw_buffer_commits[i] = create_commit(raw_images[i]);
1142 raw_image_views[i] = create_image_view(raw_images[i]);
725 } 1143 }
1144
1145 // AA Resources
1146 const u32 up_scale = Settings::values.resolution_info.up_scale;
1147 const u32 down_shift = Settings::values.resolution_info.down_shift;
1148 aa_image = create_image(true, up_scale, down_shift);
1149 aa_commit = create_commit(aa_image);
1150 aa_image_view = create_image_view(aa_image);
1151 VkExtent2D size{
1152 .width = (up_scale * framebuffer.width) >> down_shift,
1153 .height = (up_scale * framebuffer.height) >> down_shift,
1154 };
1155 if (aa_renderpass) {
1156 aa_framebuffer = CreateFramebuffer(*aa_image_view, size, aa_renderpass);
1157 return;
1158 }
1159 aa_renderpass = CreateRenderPassImpl(GetFormat(framebuffer), false);
1160 aa_framebuffer = CreateFramebuffer(*aa_image_view, size, aa_renderpass);
1161
1162 const std::array<VkPipelineShaderStageCreateInfo, 2> fxaa_shader_stages{{
1163 {
1164 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1165 .pNext = nullptr,
1166 .flags = 0,
1167 .stage = VK_SHADER_STAGE_VERTEX_BIT,
1168 .module = *fxaa_vertex_shader,
1169 .pName = "main",
1170 .pSpecializationInfo = nullptr,
1171 },
1172 {
1173 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1174 .pNext = nullptr,
1175 .flags = 0,
1176 .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
1177 .module = *fxaa_fragment_shader,
1178 .pName = "main",
1179 .pSpecializationInfo = nullptr,
1180 },
1181 }};
1182
1183 const auto vertex_binding_description = ScreenRectVertex::GetDescription();
1184 const auto vertex_attrs_description = ScreenRectVertex::GetAttributes();
1185
1186 const VkPipelineVertexInputStateCreateInfo vertex_input_ci{
1187 .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
1188 .pNext = nullptr,
1189 .flags = 0,
1190 .vertexBindingDescriptionCount = 1,
1191 .pVertexBindingDescriptions = &vertex_binding_description,
1192 .vertexAttributeDescriptionCount = u32{vertex_attrs_description.size()},
1193 .pVertexAttributeDescriptions = vertex_attrs_description.data(),
1194 };
1195
1196 const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{
1197 .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
1198 .pNext = nullptr,
1199 .flags = 0,
1200 .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
1201 .primitiveRestartEnable = VK_FALSE,
1202 };
1203
1204 const VkPipelineViewportStateCreateInfo viewport_state_ci{
1205 .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
1206 .pNext = nullptr,
1207 .flags = 0,
1208 .viewportCount = 1,
1209 .pViewports = nullptr,
1210 .scissorCount = 1,
1211 .pScissors = nullptr,
1212 };
1213
1214 const VkPipelineRasterizationStateCreateInfo rasterization_ci{
1215 .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
1216 .pNext = nullptr,
1217 .flags = 0,
1218 .depthClampEnable = VK_FALSE,
1219 .rasterizerDiscardEnable = VK_FALSE,
1220 .polygonMode = VK_POLYGON_MODE_FILL,
1221 .cullMode = VK_CULL_MODE_NONE,
1222 .frontFace = VK_FRONT_FACE_CLOCKWISE,
1223 .depthBiasEnable = VK_FALSE,
1224 .depthBiasConstantFactor = 0.0f,
1225 .depthBiasClamp = 0.0f,
1226 .depthBiasSlopeFactor = 0.0f,
1227 .lineWidth = 1.0f,
1228 };
1229
1230 const VkPipelineMultisampleStateCreateInfo multisampling_ci{
1231 .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
1232 .pNext = nullptr,
1233 .flags = 0,
1234 .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
1235 .sampleShadingEnable = VK_FALSE,
1236 .minSampleShading = 0.0f,
1237 .pSampleMask = nullptr,
1238 .alphaToCoverageEnable = VK_FALSE,
1239 .alphaToOneEnable = VK_FALSE,
1240 };
1241
1242 const VkPipelineColorBlendAttachmentState color_blend_attachment{
1243 .blendEnable = VK_FALSE,
1244 .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO,
1245 .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO,
1246 .colorBlendOp = VK_BLEND_OP_ADD,
1247 .srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO,
1248 .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO,
1249 .alphaBlendOp = VK_BLEND_OP_ADD,
1250 .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
1251 VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT,
1252 };
1253
1254 const VkPipelineColorBlendStateCreateInfo color_blend_ci{
1255 .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
1256 .pNext = nullptr,
1257 .flags = 0,
1258 .logicOpEnable = VK_FALSE,
1259 .logicOp = VK_LOGIC_OP_COPY,
1260 .attachmentCount = 1,
1261 .pAttachments = &color_blend_attachment,
1262 .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f},
1263 };
1264
1265 static constexpr std::array dynamic_states{
1266 VK_DYNAMIC_STATE_VIEWPORT,
1267 VK_DYNAMIC_STATE_SCISSOR,
1268 };
1269 const VkPipelineDynamicStateCreateInfo dynamic_state_ci{
1270 .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
1271 .pNext = nullptr,
1272 .flags = 0,
1273 .dynamicStateCount = static_cast<u32>(dynamic_states.size()),
1274 .pDynamicStates = dynamic_states.data(),
1275 };
1276
1277 const VkGraphicsPipelineCreateInfo fxaa_pipeline_ci{
1278 .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
1279 .pNext = nullptr,
1280 .flags = 0,
1281 .stageCount = static_cast<u32>(fxaa_shader_stages.size()),
1282 .pStages = fxaa_shader_stages.data(),
1283 .pVertexInputState = &vertex_input_ci,
1284 .pInputAssemblyState = &input_assembly_ci,
1285 .pTessellationState = nullptr,
1286 .pViewportState = &viewport_state_ci,
1287 .pRasterizationState = &rasterization_ci,
1288 .pMultisampleState = &multisampling_ci,
1289 .pDepthStencilState = nullptr,
1290 .pColorBlendState = &color_blend_ci,
1291 .pDynamicState = &dynamic_state_ci,
1292 .layout = *aa_pipeline_layout,
1293 .renderPass = *aa_renderpass,
1294 .subpass = 0,
1295 .basePipelineHandle = 0,
1296 .basePipelineIndex = 0,
1297 };
1298
1299 // AA
1300 aa_pipeline = device.GetLogical().CreateGraphicsPipeline(fxaa_pipeline_ci);
726} 1301}
727 1302
728void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) const { 1303void VKBlitScreen::UpdateAADescriptorSet(std::size_t image_index, VkImageView image_view,
1304 bool nn) const {
1305 const VkDescriptorImageInfo image_info{
1306 .sampler = nn ? *nn_sampler : *sampler,
1307 .imageView = image_view,
1308 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1309 };
1310
1311 const VkWriteDescriptorSet sampler_write{
1312 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1313 .pNext = nullptr,
1314 .dstSet = aa_descriptor_sets[image_index],
1315 .dstBinding = 0,
1316 .dstArrayElement = 0,
1317 .descriptorCount = 1,
1318 .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
1319 .pImageInfo = &image_info,
1320 .pBufferInfo = nullptr,
1321 .pTexelBufferView = nullptr,
1322 };
1323
1324 const VkWriteDescriptorSet sampler_write_2{
1325 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1326 .pNext = nullptr,
1327 .dstSet = aa_descriptor_sets[image_index],
1328 .dstBinding = 1,
1329 .dstArrayElement = 0,
1330 .descriptorCount = 1,
1331 .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
1332 .pImageInfo = &image_info,
1333 .pBufferInfo = nullptr,
1334 .pTexelBufferView = nullptr,
1335 };
1336
1337 device.GetLogical().UpdateDescriptorSets(std::array{sampler_write, sampler_write_2}, {});
1338}
1339
1340void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view,
1341 bool nn) const {
729 const VkDescriptorBufferInfo buffer_info{ 1342 const VkDescriptorBufferInfo buffer_info{
730 .buffer = *buffer, 1343 .buffer = *buffer,
731 .offset = offsetof(BufferData, uniform), 1344 .offset = offsetof(BufferData, uniform),
@@ -746,7 +1359,7 @@ void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView imag
746 }; 1359 };
747 1360
748 const VkDescriptorImageInfo image_info{ 1361 const VkDescriptorImageInfo image_info{
749 .sampler = *sampler, 1362 .sampler = nn ? *nn_sampler : *sampler,
750 .imageView = image_view, 1363 .imageView = image_view,
751 .imageLayout = VK_IMAGE_LAYOUT_GENERAL, 1364 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
752 }; 1365 };
@@ -798,17 +1411,19 @@ void VKBlitScreen::SetVertexData(BufferData& data, const Tegra::FramebufferConfi
798 UNIMPLEMENTED_IF(framebuffer_crop_rect.top != 0); 1411 UNIMPLEMENTED_IF(framebuffer_crop_rect.top != 0);
799 UNIMPLEMENTED_IF(framebuffer_crop_rect.left != 0); 1412 UNIMPLEMENTED_IF(framebuffer_crop_rect.left != 0);
800 1413
801 // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering
802 // (e.g. handheld mode) on a 1920x1080 framebuffer.
803 f32 scale_u = 1.0f; 1414 f32 scale_u = 1.0f;
804 f32 scale_v = 1.0f; 1415 f32 scale_v = 1.0f;
805 if (framebuffer_crop_rect.GetWidth() > 0) { 1416 // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering
806 scale_u = static_cast<f32>(framebuffer_crop_rect.GetWidth()) / 1417 // (e.g. handheld mode) on a 1920x1080 framebuffer.
807 static_cast<f32>(screen_info.width); 1418 if (!fsr) {
808 } 1419 if (framebuffer_crop_rect.GetWidth() > 0) {
809 if (framebuffer_crop_rect.GetHeight() > 0) { 1420 scale_u = static_cast<f32>(framebuffer_crop_rect.GetWidth()) /
810 scale_v = static_cast<f32>(framebuffer_crop_rect.GetHeight()) / 1421 static_cast<f32>(screen_info.width);
811 static_cast<f32>(screen_info.height); 1422 }
1423 if (framebuffer_crop_rect.GetHeight() > 0) {
1424 scale_v = static_cast<f32>(framebuffer_crop_rect.GetHeight()) /
1425 static_cast<f32>(screen_info.height);
1426 }
812 } 1427 }
813 1428
814 const auto& screen = layout.screen; 1429 const auto& screen = layout.screen;
@@ -822,6 +1437,15 @@ void VKBlitScreen::SetVertexData(BufferData& data, const Tegra::FramebufferConfi
822 data.vertices[3] = ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, right * scale_v); 1437 data.vertices[3] = ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, right * scale_v);
823} 1438}
824 1439
1440void VKBlitScreen::CreateFSR() {
1441 const auto& layout = render_window.GetFramebufferLayout();
1442 const VkExtent2D fsr_size{
1443 .width = layout.screen.GetWidth(),
1444 .height = layout.screen.GetHeight(),
1445 };
1446 fsr = std::make_unique<FSR>(device, memory_allocator, image_count, fsr_size);
1447}
1448
825u64 VKBlitScreen::CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) const { 1449u64 VKBlitScreen::CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) const {
826 return sizeof(BufferData) + GetSizeInBytes(framebuffer) * image_count; 1450 return sizeof(BufferData) + GetSizeInBytes(framebuffer) * image_count;
827} 1451}
diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h
index 430bcfbca..bbca71af3 100644
--- a/src/video_core/renderer_vulkan/vk_blit_screen.h
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.h
@@ -34,6 +34,7 @@ namespace Vulkan {
34struct ScreenInfo; 34struct ScreenInfo;
35 35
36class Device; 36class Device;
37class FSR;
37class RasterizerVulkan; 38class RasterizerVulkan;
38class VKScheduler; 39class VKScheduler;
39class VKSwapchain; 40class VKSwapchain;
@@ -66,6 +67,9 @@ public:
66 [[nodiscard]] vk::Framebuffer CreateFramebuffer(const VkImageView& image_view, 67 [[nodiscard]] vk::Framebuffer CreateFramebuffer(const VkImageView& image_view,
67 VkExtent2D extent); 68 VkExtent2D extent);
68 69
70 [[nodiscard]] vk::Framebuffer CreateFramebuffer(const VkImageView& image_view,
71 VkExtent2D extent, vk::RenderPass& rd);
72
69private: 73private:
70 struct BufferData; 74 struct BufferData;
71 75
@@ -74,6 +78,7 @@ private:
74 void CreateSemaphores(); 78 void CreateSemaphores();
75 void CreateDescriptorPool(); 79 void CreateDescriptorPool();
76 void CreateRenderPass(); 80 void CreateRenderPass();
81 vk::RenderPass CreateRenderPassImpl(VkFormat, bool is_present = true);
77 void CreateDescriptorSetLayout(); 82 void CreateDescriptorSetLayout();
78 void CreateDescriptorSets(); 83 void CreateDescriptorSets();
79 void CreatePipelineLayout(); 84 void CreatePipelineLayout();
@@ -88,11 +93,14 @@ private:
88 void CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer); 93 void CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer);
89 void CreateRawImages(const Tegra::FramebufferConfig& framebuffer); 94 void CreateRawImages(const Tegra::FramebufferConfig& framebuffer);
90 95
91 void UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) const; 96 void UpdateDescriptorSet(std::size_t image_index, VkImageView image_view, bool nn) const;
97 void UpdateAADescriptorSet(std::size_t image_index, VkImageView image_view, bool nn) const;
92 void SetUniformData(BufferData& data, const Layout::FramebufferLayout layout) const; 98 void SetUniformData(BufferData& data, const Layout::FramebufferLayout layout) const;
93 void SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer, 99 void SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer,
94 const Layout::FramebufferLayout layout) const; 100 const Layout::FramebufferLayout layout) const;
95 101
102 void CreateFSR();
103
96 u64 CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) const; 104 u64 CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) const;
97 u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, 105 u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer,
98 std::size_t image_index) const; 106 std::size_t image_index) const;
@@ -107,14 +115,24 @@ private:
107 const VKScreenInfo& screen_info; 115 const VKScreenInfo& screen_info;
108 116
109 vk::ShaderModule vertex_shader; 117 vk::ShaderModule vertex_shader;
110 vk::ShaderModule fragment_shader; 118 vk::ShaderModule fxaa_vertex_shader;
119 vk::ShaderModule fxaa_fragment_shader;
120 vk::ShaderModule bilinear_fragment_shader;
121 vk::ShaderModule bicubic_fragment_shader;
122 vk::ShaderModule gaussian_fragment_shader;
123 vk::ShaderModule scaleforce_fragment_shader;
111 vk::DescriptorPool descriptor_pool; 124 vk::DescriptorPool descriptor_pool;
112 vk::DescriptorSetLayout descriptor_set_layout; 125 vk::DescriptorSetLayout descriptor_set_layout;
113 vk::PipelineLayout pipeline_layout; 126 vk::PipelineLayout pipeline_layout;
114 vk::Pipeline pipeline; 127 vk::Pipeline nearest_neightbor_pipeline;
128 vk::Pipeline bilinear_pipeline;
129 vk::Pipeline bicubic_pipeline;
130 vk::Pipeline gaussian_pipeline;
131 vk::Pipeline scaleforce_pipeline;
115 vk::RenderPass renderpass; 132 vk::RenderPass renderpass;
116 std::vector<vk::Framebuffer> framebuffers; 133 std::vector<vk::Framebuffer> framebuffers;
117 vk::DescriptorSets descriptor_sets; 134 vk::DescriptorSets descriptor_sets;
135 vk::Sampler nn_sampler;
118 vk::Sampler sampler; 136 vk::Sampler sampler;
119 137
120 vk::Buffer buffer; 138 vk::Buffer buffer;
@@ -126,8 +144,22 @@ private:
126 std::vector<vk::Image> raw_images; 144 std::vector<vk::Image> raw_images;
127 std::vector<vk::ImageView> raw_image_views; 145 std::vector<vk::ImageView> raw_image_views;
128 std::vector<MemoryCommit> raw_buffer_commits; 146 std::vector<MemoryCommit> raw_buffer_commits;
147
148 vk::DescriptorPool aa_descriptor_pool;
149 vk::DescriptorSetLayout aa_descriptor_set_layout;
150 vk::PipelineLayout aa_pipeline_layout;
151 vk::Pipeline aa_pipeline;
152 vk::RenderPass aa_renderpass;
153 vk::Framebuffer aa_framebuffer;
154 vk::DescriptorSets aa_descriptor_sets;
155 vk::Image aa_image;
156 vk::ImageView aa_image_view;
157 MemoryCommit aa_commit;
158
129 u32 raw_width = 0; 159 u32 raw_width = 0;
130 u32 raw_height = 0; 160 u32 raw_height = 0;
161
162 std::unique_ptr<FSR> fsr;
131}; 163};
132 164
133} // namespace Vulkan 165} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 8ac58bc2f..5ffd93499 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -146,7 +146,7 @@ void BufferCacheRuntime::Finish() {
146} 146}
147 147
148void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, 148void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
149 std::span<const VideoCommon::BufferCopy> copies) { 149 std::span<const VideoCommon::BufferCopy> copies, bool barrier) {
150 static constexpr VkMemoryBarrier READ_BARRIER{ 150 static constexpr VkMemoryBarrier READ_BARRIER{
151 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, 151 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
152 .pNext = nullptr, 152 .pNext = nullptr,
@@ -163,10 +163,42 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
163 boost::container::small_vector<VkBufferCopy, 3> vk_copies(copies.size()); 163 boost::container::small_vector<VkBufferCopy, 3> vk_copies(copies.size());
164 std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy); 164 std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy);
165 scheduler.RequestOutsideRenderPassOperationContext(); 165 scheduler.RequestOutsideRenderPassOperationContext();
166 scheduler.Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) { 166 scheduler.Record([src_buffer, dst_buffer, vk_copies, barrier](vk::CommandBuffer cmdbuf) {
167 if (barrier) {
168 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
169 VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER);
170 }
171 cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies);
172 if (barrier) {
173 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
174 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER);
175 }
176 });
177}
178
179void BufferCacheRuntime::PreCopyBarrier() {
180 static constexpr VkMemoryBarrier READ_BARRIER{
181 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
182 .pNext = nullptr,
183 .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
184 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT,
185 };
186 scheduler.RequestOutsideRenderPassOperationContext();
187 scheduler.Record([](vk::CommandBuffer cmdbuf) {
167 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 188 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
168 0, READ_BARRIER); 189 0, READ_BARRIER);
169 cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies); 190 });
191}
192
193void BufferCacheRuntime::PostCopyBarrier() {
194 static constexpr VkMemoryBarrier WRITE_BARRIER{
195 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
196 .pNext = nullptr,
197 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
198 .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
199 };
200 scheduler.RequestOutsideRenderPassOperationContext();
201 scheduler.Record([](vk::CommandBuffer cmdbuf) {
170 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 202 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
171 0, WRITE_BARRIER); 203 0, WRITE_BARRIER);
172 }); 204 });
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index c27402ff0..1ee0d8420 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -69,8 +69,12 @@ public:
69 69
70 [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); 70 [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size);
71 71
72 void PreCopyBarrier();
73
72 void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer, 74 void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer,
73 std::span<const VideoCommon::BufferCopy> copies); 75 std::span<const VideoCommon::BufferCopy> copies, bool barrier = true);
76
77 void PostCopyBarrier();
74 78
75 void ClearBuffer(VkBuffer dest_buffer, u32 offset, size_t size, u32 value); 79 void ClearBuffer(VkBuffer dest_buffer, u32 offset, size_t size, u32 value);
76 80
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index 44faf626a..de36bcdb7 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -22,6 +22,7 @@
22namespace Vulkan { 22namespace Vulkan {
23 23
24using Shader::ImageBufferDescriptor; 24using Shader::ImageBufferDescriptor;
25using Shader::Backend::SPIRV::RESCALING_LAYOUT_WORDS_OFFSET;
25using Tegra::Texture::TexturePair; 26using Tegra::Texture::TexturePair;
26 27
27ComputePipeline::ComputePipeline(const Device& device_, DescriptorPool& descriptor_pool, 28ComputePipeline::ComputePipeline(const Device& device_, DescriptorPool& descriptor_pool,
@@ -108,8 +109,7 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute,
108 texture_cache.SynchronizeComputeDescriptors(); 109 texture_cache.SynchronizeComputeDescriptors();
109 110
110 static constexpr size_t max_elements = 64; 111 static constexpr size_t max_elements = 64;
111 std::array<ImageId, max_elements> image_view_ids; 112 boost::container::static_vector<VideoCommon::ImageViewInOut, max_elements> views;
112 boost::container::static_vector<u32, max_elements> image_view_indices;
113 boost::container::static_vector<VkSampler, max_elements> samplers; 113 boost::container::static_vector<VkSampler, max_elements> samplers;
114 114
115 const auto& qmd{kepler_compute.launch_description}; 115 const auto& qmd{kepler_compute.launch_description};
@@ -134,30 +134,37 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute,
134 } 134 }
135 return TexturePair(gpu_memory.Read<u32>(addr), via_header_index); 135 return TexturePair(gpu_memory.Read<u32>(addr), via_header_index);
136 }}; 136 }};
137 const auto add_image{[&](const auto& desc) { 137 const auto add_image{[&](const auto& desc, bool blacklist) {
138 for (u32 index = 0; index < desc.count; ++index) { 138 for (u32 index = 0; index < desc.count; ++index) {
139 const auto handle{read_handle(desc, index)}; 139 const auto handle{read_handle(desc, index)};
140 image_view_indices.push_back(handle.first); 140 views.push_back({
141 .index = handle.first,
142 .blacklist = blacklist,
143 .id = {},
144 });
141 } 145 }
142 }}; 146 }};
143 std::ranges::for_each(info.texture_buffer_descriptors, add_image); 147 for (const auto& desc : info.texture_buffer_descriptors) {
144 std::ranges::for_each(info.image_buffer_descriptors, add_image); 148 add_image(desc, false);
149 }
150 for (const auto& desc : info.image_buffer_descriptors) {
151 add_image(desc, false);
152 }
145 for (const auto& desc : info.texture_descriptors) { 153 for (const auto& desc : info.texture_descriptors) {
146 for (u32 index = 0; index < desc.count; ++index) { 154 for (u32 index = 0; index < desc.count; ++index) {
147 const auto handle{read_handle(desc, index)}; 155 const auto handle{read_handle(desc, index)};
148 image_view_indices.push_back(handle.first); 156 views.push_back({handle.first});
149 157
150 Sampler* const sampler = texture_cache.GetComputeSampler(handle.second); 158 Sampler* const sampler = texture_cache.GetComputeSampler(handle.second);
151 samplers.push_back(sampler->Handle()); 159 samplers.push_back(sampler->Handle());
152 } 160 }
153 } 161 }
154 std::ranges::for_each(info.image_descriptors, add_image); 162 for (const auto& desc : info.image_descriptors) {
155 163 add_image(desc, desc.is_written);
156 const std::span indices_span(image_view_indices.data(), image_view_indices.size()); 164 }
157 texture_cache.FillComputeImageViews(indices_span, image_view_ids); 165 texture_cache.FillComputeImageViews(std::span(views.data(), views.size()));
158 166
159 buffer_cache.UnbindComputeTextureBuffers(); 167 buffer_cache.UnbindComputeTextureBuffers();
160 ImageId* texture_buffer_ids{image_view_ids.data()};
161 size_t index{}; 168 size_t index{};
162 const auto add_buffer{[&](const auto& desc) { 169 const auto add_buffer{[&](const auto& desc) {
163 constexpr bool is_image = std::is_same_v<decltype(desc), const ImageBufferDescriptor&>; 170 constexpr bool is_image = std::is_same_v<decltype(desc), const ImageBufferDescriptor&>;
@@ -166,11 +173,10 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute,
166 if constexpr (is_image) { 173 if constexpr (is_image) {
167 is_written = desc.is_written; 174 is_written = desc.is_written;
168 } 175 }
169 ImageView& image_view = texture_cache.GetImageView(*texture_buffer_ids); 176 ImageView& image_view = texture_cache.GetImageView(views[index].id);
170 buffer_cache.BindComputeTextureBuffer(index, image_view.GpuAddr(), 177 buffer_cache.BindComputeTextureBuffer(index, image_view.GpuAddr(),
171 image_view.BufferSize(), image_view.format, 178 image_view.BufferSize(), image_view.format,
172 is_written, is_image); 179 is_written, is_image);
173 ++texture_buffer_ids;
174 ++index; 180 ++index;
175 } 181 }
176 }}; 182 }};
@@ -180,9 +186,11 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute,
180 buffer_cache.UpdateComputeBuffers(); 186 buffer_cache.UpdateComputeBuffers();
181 buffer_cache.BindHostComputeBuffers(); 187 buffer_cache.BindHostComputeBuffers();
182 188
189 RescalingPushConstant rescaling;
183 const VkSampler* samplers_it{samplers.data()}; 190 const VkSampler* samplers_it{samplers.data()};
184 const ImageId* views_it{image_view_ids.data()}; 191 const VideoCommon::ImageViewInOut* views_it{views.data()};
185 PushImageDescriptors(info, samplers_it, views_it, texture_cache, update_descriptor_queue); 192 PushImageDescriptors(texture_cache, update_descriptor_queue, info, rescaling, samplers_it,
193 views_it);
186 194
187 if (!is_built.load(std::memory_order::relaxed)) { 195 if (!is_built.load(std::memory_order::relaxed)) {
188 // Wait for the pipeline to be built 196 // Wait for the pipeline to be built
@@ -192,11 +200,18 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute,
192 }); 200 });
193 } 201 }
194 const void* const descriptor_data{update_descriptor_queue.UpdateData()}; 202 const void* const descriptor_data{update_descriptor_queue.UpdateData()};
195 scheduler.Record([this, descriptor_data](vk::CommandBuffer cmdbuf) { 203 const bool is_rescaling = !info.texture_descriptors.empty() || !info.image_descriptors.empty();
204 scheduler.Record([this, descriptor_data, is_rescaling,
205 rescaling_data = rescaling.Data()](vk::CommandBuffer cmdbuf) {
196 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); 206 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
197 if (!descriptor_set_layout) { 207 if (!descriptor_set_layout) {
198 return; 208 return;
199 } 209 }
210 if (is_rescaling) {
211 cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT,
212 RESCALING_LAYOUT_WORDS_OFFSET, sizeof(rescaling_data),
213 rescaling_data.data());
214 }
200 const VkDescriptorSet descriptor_set{descriptor_allocator.Commit()}; 215 const VkDescriptorSet descriptor_set{descriptor_allocator.Commit()};
201 const vk::Device& dev{device.GetLogical()}; 216 const vk::Device& dev{device.GetLogical()};
202 dev.UpdateDescriptorSet(descriptor_set, *descriptor_update_template, descriptor_data); 217 dev.UpdateDescriptorSet(descriptor_set, *descriptor_update_template, descriptor_data);
diff --git a/src/video_core/renderer_vulkan/vk_fsr.cpp b/src/video_core/renderer_vulkan/vk_fsr.cpp
new file mode 100644
index 000000000..73629d229
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_fsr.cpp
@@ -0,0 +1,553 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <cmath>
6#include "common/bit_cast.h"
7#include "common/common_types.h"
8#include "common/div_ceil.h"
9
10#include "video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16_comp_spv.h"
11#include "video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32_comp_spv.h"
12#include "video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16_comp_spv.h"
13#include "video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32_comp_spv.h"
14#include "video_core/renderer_vulkan/vk_fsr.h"
15#include "video_core/renderer_vulkan/vk_scheduler.h"
16#include "video_core/renderer_vulkan/vk_shader_util.h"
17#include "video_core/vulkan_common/vulkan_device.h"
18
19namespace Vulkan {
20namespace {
21// Reimplementations of the constant generating functions in ffx_fsr1.h
22// GCC generated a lot of warnings when using the official header.
23u32 AU1_AH1_AF1(f32 f) {
24 static constexpr u32 base[512]{
25 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
26 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
27 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
28 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
29 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
30 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
31 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
32 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
33 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
34 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040,
35 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000,
36 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00,
37 0x5000, 0x5400, 0x5800, 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800,
38 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
39 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
40 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
41 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
42 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
43 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
44 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
45 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
46 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
47 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
48 0x7bff, 0x7bff, 0x7bff, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
49 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
50 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
51 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
52 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
53 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
54 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
55 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
56 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
57 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008,
58 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400,
59 0x9800, 0x9c00, 0xa000, 0xa400, 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000,
60 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00,
61 0xf000, 0xf400, 0xf800, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
62 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
63 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
64 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
65 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
66 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
67 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
68 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
69 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
70 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
71 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
72 };
73 static constexpr s8 shift[512]{
74 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
75 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
76 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
77 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
78 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
79 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
80 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16,
81 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
82 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
83 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
84 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
85 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
86 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
87 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
88 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
89 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
90 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
91 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
92 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
93 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
94 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
95 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
96 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
97 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17,
98 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
99 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
100 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
101 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
102 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
103 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
104 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
105 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
106 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
107 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
108 0x18, 0x18,
109 };
110 const u32 u = Common::BitCast<u32>(f);
111 const u32 i = u >> 23;
112 return base[i] + ((u & 0x7fffff) >> shift[i]);
113}
114
115u32 AU1_AH2_AF2(f32 a[2]) {
116 return AU1_AH1_AF1(a[0]) + (AU1_AH1_AF1(a[1]) << 16);
117}
118
119void FsrEasuCon(u32 con0[4], u32 con1[4], u32 con2[4], u32 con3[4], f32 inputViewportInPixelsX,
120 f32 inputViewportInPixelsY, f32 inputSizeInPixelsX, f32 inputSizeInPixelsY,
121 f32 outputSizeInPixelsX, f32 outputSizeInPixelsY) {
122 con0[0] = Common::BitCast<u32>(inputViewportInPixelsX / outputSizeInPixelsX);
123 con0[1] = Common::BitCast<u32>(inputViewportInPixelsY / outputSizeInPixelsY);
124 con0[2] = Common::BitCast<u32>(0.5f * inputViewportInPixelsX / outputSizeInPixelsX - 0.5f);
125 con0[3] = Common::BitCast<u32>(0.5f * inputViewportInPixelsY / outputSizeInPixelsY - 0.5f);
126 con1[0] = Common::BitCast<u32>(1.0f / inputSizeInPixelsX);
127 con1[1] = Common::BitCast<u32>(1.0f / inputSizeInPixelsY);
128 con1[2] = Common::BitCast<u32>(1.0f / inputSizeInPixelsX);
129 con1[3] = Common::BitCast<u32>(-1.0f / inputSizeInPixelsY);
130 con2[0] = Common::BitCast<u32>(-1.0f / inputSizeInPixelsX);
131 con2[1] = Common::BitCast<u32>(2.0f / inputSizeInPixelsY);
132 con2[2] = Common::BitCast<u32>(1.0f / inputSizeInPixelsX);
133 con2[3] = Common::BitCast<u32>(2.0f / inputSizeInPixelsY);
134 con3[0] = Common::BitCast<u32>(0.0f / inputSizeInPixelsX);
135 con3[1] = Common::BitCast<u32>(4.0f / inputSizeInPixelsY);
136 con3[2] = con3[3] = 0;
137}
138
139void FsrEasuConOffset(u32 con0[4], u32 con1[4], u32 con2[4], u32 con3[4],
140 f32 inputViewportInPixelsX, f32 inputViewportInPixelsY,
141 f32 inputSizeInPixelsX, f32 inputSizeInPixelsY, f32 outputSizeInPixelsX,
142 f32 outputSizeInPixelsY, f32 inputOffsetInPixelsX, f32 inputOffsetInPixelsY) {
143 FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY,
144 inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY);
145 con0[2] = Common::BitCast<u32>(0.5f * inputViewportInPixelsX / outputSizeInPixelsX - 0.5f +
146 inputOffsetInPixelsX);
147 con0[3] = Common::BitCast<u32>(0.5f * inputViewportInPixelsY / outputSizeInPixelsY - 0.5f +
148 inputOffsetInPixelsY);
149}
150
151void FsrRcasCon(u32* con, f32 sharpness) {
152 sharpness = std::exp2f(-sharpness);
153 f32 hSharp[2]{sharpness, sharpness};
154 con[0] = Common::BitCast<u32>(sharpness);
155 con[1] = AU1_AH2_AF2(hSharp);
156 con[2] = 0;
157 con[3] = 0;
158}
159} // Anonymous namespace
160
161FSR::FSR(const Device& device_, MemoryAllocator& memory_allocator_, size_t image_count_,
162 VkExtent2D output_size_)
163 : device{device_}, memory_allocator{memory_allocator_}, image_count{image_count_},
164 output_size{output_size_} {
165
166 CreateImages();
167 CreateSampler();
168 CreateShaders();
169 CreateDescriptorPool();
170 CreateDescriptorSetLayout();
171 CreateDescriptorSets();
172 CreatePipelineLayout();
173 CreatePipeline();
174}
175
176VkImageView FSR::Draw(VKScheduler& scheduler, size_t image_index, VkImageView image_view,
177 VkExtent2D input_image_extent, const Common::Rectangle<int>& crop_rect) {
178
179 UpdateDescriptorSet(image_index, image_view);
180
181 scheduler.Record([this, image_index, input_image_extent, crop_rect](vk::CommandBuffer cmdbuf) {
182 const VkImageMemoryBarrier base_barrier{
183 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
184 .pNext = nullptr,
185 .srcAccessMask = 0,
186 .dstAccessMask = 0,
187 .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
188 .newLayout = VK_IMAGE_LAYOUT_GENERAL,
189 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
190 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
191 .image = {},
192 .subresourceRange =
193 {
194 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
195 .baseMipLevel = 0,
196 .levelCount = 1,
197 .baseArrayLayer = 0,
198 .layerCount = 1,
199 },
200 };
201
202 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *easu_pipeline);
203
204 std::array<u32, 4 * 4> push_constants;
205 FsrEasuConOffset(
206 push_constants.data() + 0, push_constants.data() + 4, push_constants.data() + 8,
207 push_constants.data() + 12,
208
209 static_cast<f32>(crop_rect.GetWidth()), static_cast<f32>(crop_rect.GetHeight()),
210 static_cast<f32>(input_image_extent.width), static_cast<f32>(input_image_extent.height),
211 static_cast<f32>(output_size.width), static_cast<f32>(output_size.height),
212 static_cast<f32>(crop_rect.left), static_cast<f32>(crop_rect.top));
213 cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, push_constants);
214
215 {
216 VkImageMemoryBarrier fsr_write_barrier = base_barrier;
217 fsr_write_barrier.image = *images[image_index],
218 fsr_write_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
219
220 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
221 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, fsr_write_barrier);
222 }
223
224 cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline_layout, 0,
225 descriptor_sets[image_index * 2], {});
226 cmdbuf.Dispatch(Common::DivCeil(output_size.width, 16u),
227 Common::DivCeil(output_size.height, 16u), 1);
228
229 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *rcas_pipeline);
230
231 FsrRcasCon(push_constants.data(), 0.25f);
232 cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, push_constants);
233
234 {
235 std::array<VkImageMemoryBarrier, 2> barriers;
236 auto& fsr_read_barrier = barriers[0];
237 auto& blit_write_barrier = barriers[1];
238
239 fsr_read_barrier = base_barrier;
240 fsr_read_barrier.image = *images[image_index];
241 fsr_read_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
242 fsr_read_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
243
244 blit_write_barrier = base_barrier;
245 blit_write_barrier.image = *images[image_count + image_index];
246 blit_write_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
247 blit_write_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
248
249 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
250 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, {}, {}, barriers);
251 }
252
253 cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline_layout, 0,
254 descriptor_sets[image_index * 2 + 1], {});
255 cmdbuf.Dispatch(Common::DivCeil(output_size.width, 16u),
256 Common::DivCeil(output_size.height, 16u), 1);
257
258 {
259 std::array<VkImageMemoryBarrier, 1> barriers;
260 auto& blit_read_barrier = barriers[0];
261
262 blit_read_barrier = base_barrier;
263 blit_read_barrier.image = *images[image_count + image_index];
264 blit_read_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
265 blit_read_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
266
267 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
268 VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, {}, {}, barriers);
269 }
270 });
271
272 return *image_views[image_count + image_index];
273}
274
275void FSR::CreateDescriptorPool() {
276 const std::array<VkDescriptorPoolSize, 2> pool_sizes{{
277 {
278 .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
279 .descriptorCount = static_cast<u32>(image_count * 2),
280 },
281 {
282 .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
283 .descriptorCount = static_cast<u32>(image_count * 2),
284 },
285 }};
286
287 const VkDescriptorPoolCreateInfo ci{
288 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
289 .pNext = nullptr,
290 .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
291 .maxSets = static_cast<u32>(image_count * 2),
292 .poolSizeCount = static_cast<u32>(pool_sizes.size()),
293 .pPoolSizes = pool_sizes.data(),
294 };
295 descriptor_pool = device.GetLogical().CreateDescriptorPool(ci);
296}
297
298void FSR::CreateDescriptorSetLayout() {
299 const std::array<VkDescriptorSetLayoutBinding, 2> layout_bindings{{
300 {
301 .binding = 0,
302 .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
303 .descriptorCount = 1,
304 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
305 .pImmutableSamplers = sampler.address(),
306 },
307 {
308 .binding = 1,
309 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
310 .descriptorCount = 1,
311 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
312 .pImmutableSamplers = sampler.address(),
313 },
314 }};
315
316 const VkDescriptorSetLayoutCreateInfo ci{
317 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
318 .pNext = nullptr,
319 .flags = 0,
320 .bindingCount = static_cast<u32>(layout_bindings.size()),
321 .pBindings = layout_bindings.data(),
322 };
323
324 descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout(ci);
325}
326
327void FSR::CreateDescriptorSets() {
328 const u32 sets = static_cast<u32>(image_count * 2);
329 const std::vector layouts(sets, *descriptor_set_layout);
330
331 const VkDescriptorSetAllocateInfo ai{
332 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
333 .pNext = nullptr,
334 .descriptorPool = *descriptor_pool,
335 .descriptorSetCount = sets,
336 .pSetLayouts = layouts.data(),
337 };
338
339 descriptor_sets = descriptor_pool.Allocate(ai);
340}
341
342void FSR::CreateImages() {
343 images.resize(image_count * 2);
344 image_views.resize(image_count * 2);
345 buffer_commits.resize(image_count * 2);
346
347 for (size_t i = 0; i < image_count * 2; ++i) {
348 images[i] = device.GetLogical().CreateImage(VkImageCreateInfo{
349 .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
350 .pNext = nullptr,
351 .flags = 0,
352 .imageType = VK_IMAGE_TYPE_2D,
353 .format = VK_FORMAT_R16G16B16A16_SFLOAT,
354 .extent =
355 {
356 .width = output_size.width,
357 .height = output_size.height,
358 .depth = 1,
359 },
360 .mipLevels = 1,
361 .arrayLayers = 1,
362 .samples = VK_SAMPLE_COUNT_1_BIT,
363 .tiling = VK_IMAGE_TILING_OPTIMAL,
364 .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT |
365 VK_IMAGE_USAGE_SAMPLED_BIT,
366 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
367 .queueFamilyIndexCount = 0,
368 .pQueueFamilyIndices = nullptr,
369 .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
370 });
371 buffer_commits[i] = memory_allocator.Commit(images[i], MemoryUsage::DeviceLocal);
372 image_views[i] = device.GetLogical().CreateImageView(VkImageViewCreateInfo{
373 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
374 .pNext = nullptr,
375 .flags = 0,
376 .image = *images[i],
377 .viewType = VK_IMAGE_VIEW_TYPE_2D,
378 .format = VK_FORMAT_R16G16B16A16_SFLOAT,
379 .components =
380 {
381 .r = VK_COMPONENT_SWIZZLE_IDENTITY,
382 .g = VK_COMPONENT_SWIZZLE_IDENTITY,
383 .b = VK_COMPONENT_SWIZZLE_IDENTITY,
384 .a = VK_COMPONENT_SWIZZLE_IDENTITY,
385 },
386 .subresourceRange =
387 {
388 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
389 .baseMipLevel = 0,
390 .levelCount = 1,
391 .baseArrayLayer = 0,
392 .layerCount = 1,
393 },
394 });
395 }
396}
397
398void FSR::CreatePipelineLayout() {
399 VkPushConstantRange push_const{
400 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
401 .offset = 0,
402 .size = sizeof(std::array<u32, 4 * 4>),
403 };
404 VkPipelineLayoutCreateInfo ci{
405 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
406 .pNext = nullptr,
407 .flags = 0,
408 .setLayoutCount = 1,
409 .pSetLayouts = descriptor_set_layout.address(),
410 .pushConstantRangeCount = 1,
411 .pPushConstantRanges = &push_const,
412 };
413
414 pipeline_layout = device.GetLogical().CreatePipelineLayout(ci);
415}
416
417void FSR::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) const {
418 const auto fsr_image_view = *image_views[image_index];
419 const auto blit_image_view = *image_views[image_count + image_index];
420
421 const VkDescriptorImageInfo image_info{
422 .sampler = VK_NULL_HANDLE,
423 .imageView = image_view,
424 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
425 };
426 const VkDescriptorImageInfo fsr_image_info{
427 .sampler = VK_NULL_HANDLE,
428 .imageView = fsr_image_view,
429 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
430 };
431 const VkDescriptorImageInfo blit_image_info{
432 .sampler = VK_NULL_HANDLE,
433 .imageView = blit_image_view,
434 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
435 };
436
437 VkWriteDescriptorSet sampler_write{
438 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
439 .pNext = nullptr,
440 .dstSet = descriptor_sets[image_index * 2],
441 .dstBinding = 0,
442 .dstArrayElement = 0,
443 .descriptorCount = 1,
444 .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
445 .pImageInfo = &image_info,
446 .pBufferInfo = nullptr,
447 .pTexelBufferView = nullptr,
448 };
449
450 VkWriteDescriptorSet output_write{
451 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
452 .pNext = nullptr,
453 .dstSet = descriptor_sets[image_index * 2],
454 .dstBinding = 1,
455 .dstArrayElement = 0,
456 .descriptorCount = 1,
457 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
458 .pImageInfo = &fsr_image_info,
459 .pBufferInfo = nullptr,
460 .pTexelBufferView = nullptr,
461 };
462
463 device.GetLogical().UpdateDescriptorSets(std::array{sampler_write, output_write}, {});
464
465 sampler_write.dstSet = descriptor_sets[image_index * 2 + 1];
466 sampler_write.pImageInfo = &fsr_image_info;
467 output_write.dstSet = descriptor_sets[image_index * 2 + 1];
468 output_write.pImageInfo = &blit_image_info;
469
470 device.GetLogical().UpdateDescriptorSets(std::array{sampler_write, output_write}, {});
471}
472
473void FSR::CreateSampler() {
474 const VkSamplerCreateInfo ci{
475 .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
476 .pNext = nullptr,
477 .flags = 0,
478 .magFilter = VK_FILTER_LINEAR,
479 .minFilter = VK_FILTER_LINEAR,
480 .mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR,
481 .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
482 .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
483 .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
484 .mipLodBias = 0.0f,
485 .anisotropyEnable = VK_FALSE,
486 .maxAnisotropy = 0.0f,
487 .compareEnable = VK_FALSE,
488 .compareOp = VK_COMPARE_OP_NEVER,
489 .minLod = 0.0f,
490 .maxLod = 0.0f,
491 .borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK,
492 .unnormalizedCoordinates = VK_FALSE,
493 };
494
495 sampler = device.GetLogical().CreateSampler(ci);
496}
497
498void FSR::CreateShaders() {
499 if (device.IsFloat16Supported()) {
500 easu_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_EASU_FP16_COMP_SPV);
501 rcas_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_RCAS_FP16_COMP_SPV);
502 } else {
503 easu_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_EASU_FP32_COMP_SPV);
504 rcas_shader = BuildShader(device, VULKAN_FIDELITYFX_FSR_RCAS_FP32_COMP_SPV);
505 }
506}
507
508void FSR::CreatePipeline() {
509 VkPipelineShaderStageCreateInfo shader_stage_easu{
510 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
511 .pNext = nullptr,
512 .flags = 0,
513 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
514 .module = *easu_shader,
515 .pName = "main",
516 .pSpecializationInfo = nullptr,
517 };
518
519 VkPipelineShaderStageCreateInfo shader_stage_rcas{
520 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
521 .pNext = nullptr,
522 .flags = 0,
523 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
524 .module = *rcas_shader,
525 .pName = "main",
526 .pSpecializationInfo = nullptr,
527 };
528
529 VkComputePipelineCreateInfo pipeline_ci_easu{
530 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
531 .pNext = nullptr,
532 .flags = 0,
533 .stage = shader_stage_easu,
534 .layout = *pipeline_layout,
535 .basePipelineHandle = VK_NULL_HANDLE,
536 .basePipelineIndex = 0,
537 };
538
539 VkComputePipelineCreateInfo pipeline_ci_rcas{
540 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
541 .pNext = nullptr,
542 .flags = 0,
543 .stage = shader_stage_rcas,
544 .layout = *pipeline_layout,
545 .basePipelineHandle = VK_NULL_HANDLE,
546 .basePipelineIndex = 0,
547 };
548
549 easu_pipeline = device.GetLogical().CreateComputePipeline(pipeline_ci_easu);
550 rcas_pipeline = device.GetLogical().CreateComputePipeline(pipeline_ci_rcas);
551}
552
553} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_fsr.h b/src/video_core/renderer_vulkan/vk_fsr.h
new file mode 100644
index 000000000..6bbec3d36
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_fsr.h
@@ -0,0 +1,54 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include "common/math_util.h"
8#include "video_core/vulkan_common/vulkan_memory_allocator.h"
9#include "video_core/vulkan_common/vulkan_wrapper.h"
10
11namespace Vulkan {
12
13class Device;
14class VKScheduler;
15
16class FSR {
17public:
18 explicit FSR(const Device& device, MemoryAllocator& memory_allocator, size_t image_count,
19 VkExtent2D output_size);
20 VkImageView Draw(VKScheduler& scheduler, size_t image_index, VkImageView image_view,
21 VkExtent2D input_image_extent, const Common::Rectangle<int>& crop_rect);
22
23private:
24 void CreateDescriptorPool();
25 void CreateDescriptorSetLayout();
26 void CreateDescriptorSets();
27 void CreateImages();
28 void CreateSampler();
29 void CreateShaders();
30 void CreatePipeline();
31 void CreatePipelineLayout();
32
33 void UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) const;
34
35 const Device& device;
36 MemoryAllocator& memory_allocator;
37 size_t image_count;
38 VkExtent2D output_size;
39
40 vk::DescriptorPool descriptor_pool;
41 vk::DescriptorSetLayout descriptor_set_layout;
42 vk::DescriptorSets descriptor_sets;
43 vk::PipelineLayout pipeline_layout;
44 vk::ShaderModule easu_shader;
45 vk::ShaderModule rcas_shader;
46 vk::Pipeline easu_pipeline;
47 vk::Pipeline rcas_pipeline;
48 vk::Sampler sampler;
49 std::vector<vk::Image> images;
50 std::vector<vk::ImageView> image_views;
51 std::vector<MemoryCommit> buffer_commits;
52};
53
54} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 8634c3316..616a7b457 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -32,6 +32,8 @@ namespace {
32using boost::container::small_vector; 32using boost::container::small_vector;
33using boost::container::static_vector; 33using boost::container::static_vector;
34using Shader::ImageBufferDescriptor; 34using Shader::ImageBufferDescriptor;
35using Shader::Backend::SPIRV::RESCALING_LAYOUT_DOWN_FACTOR_OFFSET;
36using Shader::Backend::SPIRV::RESCALING_LAYOUT_WORDS_OFFSET;
35using Tegra::Texture::TexturePair; 37using Tegra::Texture::TexturePair;
36using VideoCore::Surface::PixelFormat; 38using VideoCore::Surface::PixelFormat;
37using VideoCore::Surface::PixelFormatFromDepthFormat; 39using VideoCore::Surface::PixelFormatFromDepthFormat;
@@ -235,6 +237,7 @@ GraphicsPipeline::GraphicsPipeline(
235 stage_infos[stage] = *info; 237 stage_infos[stage] = *info;
236 enabled_uniform_buffer_masks[stage] = info->constant_buffer_mask; 238 enabled_uniform_buffer_masks[stage] = info->constant_buffer_mask;
237 std::ranges::copy(info->constant_buffer_used_sizes, uniform_buffer_sizes[stage].begin()); 239 std::ranges::copy(info->constant_buffer_used_sizes, uniform_buffer_sizes[stage].begin());
240 num_textures += Shader::NumDescriptors(info->texture_descriptors);
238 } 241 }
239 auto func{[this, shader_notify, &render_pass_cache, &descriptor_pool, pipeline_statistics] { 242 auto func{[this, shader_notify, &render_pass_cache, &descriptor_pool, pipeline_statistics] {
240 DescriptorLayoutBuilder builder{MakeBuilder(device, stage_infos)}; 243 DescriptorLayoutBuilder builder{MakeBuilder(device, stage_infos)};
@@ -277,11 +280,10 @@ void GraphicsPipeline::AddTransition(GraphicsPipeline* transition) {
277 280
278template <typename Spec> 281template <typename Spec>
279void GraphicsPipeline::ConfigureImpl(bool is_indexed) { 282void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
280 std::array<ImageId, MAX_IMAGE_ELEMENTS> image_view_ids; 283 std::array<VideoCommon::ImageViewInOut, MAX_IMAGE_ELEMENTS> views;
281 std::array<u32, MAX_IMAGE_ELEMENTS> image_view_indices;
282 std::array<VkSampler, MAX_IMAGE_ELEMENTS> samplers; 284 std::array<VkSampler, MAX_IMAGE_ELEMENTS> samplers;
283 size_t sampler_index{}; 285 size_t sampler_index{};
284 size_t image_index{}; 286 size_t view_index{};
285 287
286 texture_cache.SynchronizeGraphicsDescriptors(); 288 texture_cache.SynchronizeGraphicsDescriptors();
287 289
@@ -322,26 +324,30 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
322 } 324 }
323 return TexturePair(gpu_memory.Read<u32>(addr), via_header_index); 325 return TexturePair(gpu_memory.Read<u32>(addr), via_header_index);
324 }}; 326 }};
325 const auto add_image{[&](const auto& desc) { 327 const auto add_image{[&](const auto& desc, bool blacklist) LAMBDA_FORCEINLINE {
326 for (u32 index = 0; index < desc.count; ++index) { 328 for (u32 index = 0; index < desc.count; ++index) {
327 const auto handle{read_handle(desc, index)}; 329 const auto handle{read_handle(desc, index)};
328 image_view_indices[image_index++] = handle.first; 330 views[view_index++] = {
331 .index = handle.first,
332 .blacklist = blacklist,
333 .id = {},
334 };
329 } 335 }
330 }}; 336 }};
331 if constexpr (Spec::has_texture_buffers) { 337 if constexpr (Spec::has_texture_buffers) {
332 for (const auto& desc : info.texture_buffer_descriptors) { 338 for (const auto& desc : info.texture_buffer_descriptors) {
333 add_image(desc); 339 add_image(desc, false);
334 } 340 }
335 } 341 }
336 if constexpr (Spec::has_image_buffers) { 342 if constexpr (Spec::has_image_buffers) {
337 for (const auto& desc : info.image_buffer_descriptors) { 343 for (const auto& desc : info.image_buffer_descriptors) {
338 add_image(desc); 344 add_image(desc, false);
339 } 345 }
340 } 346 }
341 for (const auto& desc : info.texture_descriptors) { 347 for (const auto& desc : info.texture_descriptors) {
342 for (u32 index = 0; index < desc.count; ++index) { 348 for (u32 index = 0; index < desc.count; ++index) {
343 const auto handle{read_handle(desc, index)}; 349 const auto handle{read_handle(desc, index)};
344 image_view_indices[image_index++] = handle.first; 350 views[view_index++] = {handle.first};
345 351
346 Sampler* const sampler{texture_cache.GetGraphicsSampler(handle.second)}; 352 Sampler* const sampler{texture_cache.GetGraphicsSampler(handle.second)};
347 samplers[sampler_index++] = sampler->Handle(); 353 samplers[sampler_index++] = sampler->Handle();
@@ -349,7 +355,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
349 } 355 }
350 if constexpr (Spec::has_images) { 356 if constexpr (Spec::has_images) {
351 for (const auto& desc : info.image_descriptors) { 357 for (const auto& desc : info.image_descriptors) {
352 add_image(desc); 358 add_image(desc, desc.is_written);
353 } 359 }
354 } 360 }
355 }}; 361 }};
@@ -368,10 +374,9 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
368 if constexpr (Spec::enabled_stages[4]) { 374 if constexpr (Spec::enabled_stages[4]) {
369 config_stage(4); 375 config_stage(4);
370 } 376 }
371 const std::span indices_span(image_view_indices.data(), image_index); 377 texture_cache.FillGraphicsImageViews<Spec::has_images>(std::span(views.data(), view_index));
372 texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);
373 378
374 ImageId* texture_buffer_index{image_view_ids.data()}; 379 VideoCommon::ImageViewInOut* texture_buffer_it{views.data()};
375 const auto bind_stage_info{[&](size_t stage) LAMBDA_FORCEINLINE { 380 const auto bind_stage_info{[&](size_t stage) LAMBDA_FORCEINLINE {
376 size_t index{}; 381 size_t index{};
377 const auto add_buffer{[&](const auto& desc) { 382 const auto add_buffer{[&](const auto& desc) {
@@ -381,12 +386,12 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
381 if constexpr (is_image) { 386 if constexpr (is_image) {
382 is_written = desc.is_written; 387 is_written = desc.is_written;
383 } 388 }
384 ImageView& image_view{texture_cache.GetImageView(*texture_buffer_index)}; 389 ImageView& image_view{texture_cache.GetImageView(texture_buffer_it->id)};
385 buffer_cache.BindGraphicsTextureBuffer(stage, index, image_view.GpuAddr(), 390 buffer_cache.BindGraphicsTextureBuffer(stage, index, image_view.GpuAddr(),
386 image_view.BufferSize(), image_view.format, 391 image_view.BufferSize(), image_view.format,
387 is_written, is_image); 392 is_written, is_image);
388 ++index; 393 ++index;
389 ++texture_buffer_index; 394 ++texture_buffer_it;
390 } 395 }
391 }}; 396 }};
392 buffer_cache.UnbindGraphicsTextureBuffers(stage); 397 buffer_cache.UnbindGraphicsTextureBuffers(stage);
@@ -402,13 +407,9 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
402 add_buffer(desc); 407 add_buffer(desc);
403 } 408 }
404 } 409 }
405 for (const auto& desc : info.texture_descriptors) { 410 texture_buffer_it += Shader::NumDescriptors(info.texture_descriptors);
406 texture_buffer_index += desc.count;
407 }
408 if constexpr (Spec::has_images) { 411 if constexpr (Spec::has_images) {
409 for (const auto& desc : info.image_descriptors) { 412 texture_buffer_it += Shader::NumDescriptors(info.image_descriptors);
410 texture_buffer_index += desc.count;
411 }
412 } 413 }
413 }}; 414 }};
414 if constexpr (Spec::enabled_stages[0]) { 415 if constexpr (Spec::enabled_stages[0]) {
@@ -432,12 +433,13 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
432 433
433 update_descriptor_queue.Acquire(); 434 update_descriptor_queue.Acquire();
434 435
436 RescalingPushConstant rescaling;
435 const VkSampler* samplers_it{samplers.data()}; 437 const VkSampler* samplers_it{samplers.data()};
436 const ImageId* views_it{image_view_ids.data()}; 438 const VideoCommon::ImageViewInOut* views_it{views.data()};
437 const auto prepare_stage{[&](size_t stage) LAMBDA_FORCEINLINE { 439 const auto prepare_stage{[&](size_t stage) LAMBDA_FORCEINLINE {
438 buffer_cache.BindHostStageBuffers(stage); 440 buffer_cache.BindHostStageBuffers(stage);
439 PushImageDescriptors(stage_infos[stage], samplers_it, views_it, texture_cache, 441 PushImageDescriptors(texture_cache, update_descriptor_queue, stage_infos[stage], rescaling,
440 update_descriptor_queue); 442 samplers_it, views_it);
441 }}; 443 }};
442 if constexpr (Spec::enabled_stages[0]) { 444 if constexpr (Spec::enabled_stages[0]) {
443 prepare_stage(0); 445 prepare_stage(0);
@@ -454,10 +456,10 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
454 if constexpr (Spec::enabled_stages[4]) { 456 if constexpr (Spec::enabled_stages[4]) {
455 prepare_stage(4); 457 prepare_stage(4);
456 } 458 }
457 ConfigureDraw(); 459 ConfigureDraw(rescaling);
458} 460}
459 461
460void GraphicsPipeline::ConfigureDraw() { 462void GraphicsPipeline::ConfigureDraw(const RescalingPushConstant& rescaling) {
461 texture_cache.UpdateRenderTargets(false); 463 texture_cache.UpdateRenderTargets(false);
462 scheduler.RequestRenderpass(texture_cache.GetFramebuffer()); 464 scheduler.RequestRenderpass(texture_cache.GetFramebuffer());
463 465
@@ -468,12 +470,25 @@ void GraphicsPipeline::ConfigureDraw() {
468 build_condvar.wait(lock, [this] { return is_built.load(std::memory_order::relaxed); }); 470 build_condvar.wait(lock, [this] { return is_built.load(std::memory_order::relaxed); });
469 }); 471 });
470 } 472 }
473 const bool is_rescaling{texture_cache.IsRescaling()};
474 const bool update_rescaling{scheduler.UpdateRescaling(is_rescaling)};
471 const bool bind_pipeline{scheduler.UpdateGraphicsPipeline(this)}; 475 const bool bind_pipeline{scheduler.UpdateGraphicsPipeline(this)};
472 const void* const descriptor_data{update_descriptor_queue.UpdateData()}; 476 const void* const descriptor_data{update_descriptor_queue.UpdateData()};
473 scheduler.Record([this, descriptor_data, bind_pipeline](vk::CommandBuffer cmdbuf) { 477 scheduler.Record([this, descriptor_data, bind_pipeline, rescaling_data = rescaling.Data(),
478 is_rescaling, update_rescaling](vk::CommandBuffer cmdbuf) {
474 if (bind_pipeline) { 479 if (bind_pipeline) {
475 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline); 480 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline);
476 } 481 }
482 cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS,
483 RESCALING_LAYOUT_WORDS_OFFSET, sizeof(rescaling_data),
484 rescaling_data.data());
485 if (update_rescaling) {
486 const f32 config_down_factor{Settings::values.resolution_info.down_factor};
487 const f32 scale_down_factor{is_rescaling ? config_down_factor : 1.0f};
488 cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_ALL_GRAPHICS,
489 RESCALING_LAYOUT_DOWN_FACTOR_OFFSET, sizeof(scale_down_factor),
490 &scale_down_factor);
491 }
477 if (!descriptor_set_layout) { 492 if (!descriptor_set_layout) {
478 return; 493 return;
479 } 494 }
@@ -826,18 +841,10 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
826void GraphicsPipeline::Validate() { 841void GraphicsPipeline::Validate() {
827 size_t num_images{}; 842 size_t num_images{};
828 for (const auto& info : stage_infos) { 843 for (const auto& info : stage_infos) {
829 for (const auto& desc : info.texture_buffer_descriptors) { 844 num_images += Shader::NumDescriptors(info.texture_buffer_descriptors);
830 num_images += desc.count; 845 num_images += Shader::NumDescriptors(info.image_buffer_descriptors);
831 } 846 num_images += Shader::NumDescriptors(info.texture_descriptors);
832 for (const auto& desc : info.image_buffer_descriptors) { 847 num_images += Shader::NumDescriptors(info.image_descriptors);
833 num_images += desc.count;
834 }
835 for (const auto& desc : info.texture_descriptors) {
836 num_images += desc.count;
837 }
838 for (const auto& desc : info.image_descriptors) {
839 num_images += desc.count;
840 }
841 } 848 }
842 ASSERT(num_images <= MAX_IMAGE_ELEMENTS); 849 ASSERT(num_images <= MAX_IMAGE_ELEMENTS);
843} 850}
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
index 1c780e944..a0c1d8f07 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
@@ -62,6 +62,7 @@ namespace Vulkan {
62class Device; 62class Device;
63class PipelineStatistics; 63class PipelineStatistics;
64class RenderPassCache; 64class RenderPassCache;
65class RescalingPushConstant;
65class VKScheduler; 66class VKScheduler;
66class VKUpdateDescriptorQueue; 67class VKUpdateDescriptorQueue;
67 68
@@ -113,7 +114,7 @@ private:
113 template <typename Spec> 114 template <typename Spec>
114 void ConfigureImpl(bool is_indexed); 115 void ConfigureImpl(bool is_indexed);
115 116
116 void ConfigureDraw(); 117 void ConfigureDraw(const RescalingPushConstant& rescaling);
117 118
118 void MakePipeline(VkRenderPass render_pass); 119 void MakePipeline(VkRenderPass render_pass);
119 120
@@ -138,6 +139,7 @@ private:
138 std::array<Shader::Info, NUM_STAGES> stage_infos; 139 std::array<Shader::Info, NUM_STAGES> stage_infos;
139 std::array<u32, 5> enabled_uniform_buffer_masks{}; 140 std::array<u32, 5> enabled_uniform_buffer_masks{};
140 VideoCommon::UniformBufferSizes uniform_buffer_sizes{}; 141 VideoCommon::UniformBufferSizes uniform_buffer_sizes{};
142 u32 num_textures{};
141 143
142 vk::DescriptorSetLayout descriptor_set_layout; 144 vk::DescriptorSetLayout descriptor_set_layout;
143 DescriptorAllocator descriptor_allocator; 145 DescriptorAllocator descriptor_allocator;
diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h
index 0886b7da8..9be9c9bed 100644
--- a/src/video_core/renderer_vulkan/vk_master_semaphore.h
+++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h
@@ -70,7 +70,9 @@ public:
70 return; 70 return;
71 } 71 }
72 // If none of the above is hit, fallback to a regular wait 72 // If none of the above is hit, fallback to a regular wait
73 semaphore.Wait(tick); 73 while (!semaphore.Wait(tick)) {
74 }
75 Refresh();
74 } 76 }
75 77
76private: 78private:
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 30b47a7a0..fd334a146 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -58,18 +58,28 @@ struct DrawParams {
58 bool is_indexed; 58 bool is_indexed;
59}; 59};
60 60
61VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t index) { 61VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t index, float scale) {
62 const auto& src = regs.viewport_transform[index]; 62 const auto& src = regs.viewport_transform[index];
63 const float width = src.scale_x * 2.0f; 63 const auto conv = [scale](float value) {
64 float y = src.translate_y - src.scale_y; 64 float new_value = value * scale;
65 float height = src.scale_y * 2.0f; 65 if (scale < 1.0f) {
66 const bool sign = std::signbit(value);
67 new_value = std::round(std::abs(new_value));
68 new_value = sign ? -new_value : new_value;
69 }
70 return new_value;
71 };
72 const float x = conv(src.translate_x - src.scale_x);
73 const float width = conv(src.scale_x * 2.0f);
74 float y = conv(src.translate_y - src.scale_y);
75 float height = conv(src.scale_y * 2.0f);
66 if (regs.screen_y_control.y_negate) { 76 if (regs.screen_y_control.y_negate) {
67 y += height; 77 y += height;
68 height = -height; 78 height = -height;
69 } 79 }
70 const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f; 80 const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f;
71 VkViewport viewport{ 81 VkViewport viewport{
72 .x = src.translate_x - src.scale_x, 82 .x = x,
73 .y = y, 83 .y = y,
74 .width = width != 0.0f ? width : 1.0f, 84 .width = width != 0.0f ? width : 1.0f,
75 .height = height != 0.0f ? height : 1.0f, 85 .height = height != 0.0f ? height : 1.0f,
@@ -83,14 +93,27 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in
83 return viewport; 93 return viewport;
84} 94}
85 95
86VkRect2D GetScissorState(const Maxwell& regs, size_t index) { 96VkRect2D GetScissorState(const Maxwell& regs, size_t index, u32 up_scale = 1, u32 down_shift = 0) {
87 const auto& src = regs.scissor_test[index]; 97 const auto& src = regs.scissor_test[index];
88 VkRect2D scissor; 98 VkRect2D scissor;
99 const auto scale_up = [&](s32 value) -> s32 {
100 if (value == 0) {
101 return 0U;
102 }
103 const s32 upset = value * up_scale;
104 s32 acumm = 0;
105 if ((up_scale >> down_shift) == 0) {
106 acumm = upset % 2;
107 }
108 const s32 converted_value = (value * up_scale) >> down_shift;
109 return value < 0 ? std::min<s32>(converted_value - acumm, -1)
110 : std::max<s32>(converted_value + acumm, 1);
111 };
89 if (src.enable) { 112 if (src.enable) {
90 scissor.offset.x = static_cast<s32>(src.min_x); 113 scissor.offset.x = scale_up(static_cast<s32>(src.min_x));
91 scissor.offset.y = static_cast<s32>(src.min_y); 114 scissor.offset.y = scale_up(static_cast<s32>(src.min_y));
92 scissor.extent.width = src.max_x - src.min_x; 115 scissor.extent.width = scale_up(src.max_x - src.min_x);
93 scissor.extent.height = src.max_y - src.min_y; 116 scissor.extent.height = scale_up(src.max_y - src.min_y);
94 } else { 117 } else {
95 scissor.offset.x = 0; 118 scissor.offset.x = 0;
96 scissor.offset.y = 0; 119 scissor.offset.y = 0;
@@ -199,7 +222,7 @@ void RasterizerVulkan::Clear() {
199 222
200 query_cache.UpdateCounters(); 223 query_cache.UpdateCounters();
201 224
202 const auto& regs = maxwell3d.regs; 225 auto& regs = maxwell3d.regs;
203 const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || 226 const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B ||
204 regs.clear_buffers.A; 227 regs.clear_buffers.A;
205 const bool use_depth = regs.clear_buffers.Z; 228 const bool use_depth = regs.clear_buffers.Z;
@@ -214,8 +237,16 @@ void RasterizerVulkan::Clear() {
214 const VkExtent2D render_area = framebuffer->RenderArea(); 237 const VkExtent2D render_area = framebuffer->RenderArea();
215 scheduler.RequestRenderpass(framebuffer); 238 scheduler.RequestRenderpass(framebuffer);
216 239
240 u32 up_scale = 1;
241 u32 down_shift = 0;
242 if (texture_cache.IsRescaling()) {
243 up_scale = Settings::values.resolution_info.up_scale;
244 down_shift = Settings::values.resolution_info.down_shift;
245 }
246 UpdateViewportsState(regs);
247
217 VkClearRect clear_rect{ 248 VkClearRect clear_rect{
218 .rect = GetScissorState(regs, 0), 249 .rect = GetScissorState(regs, 0, up_scale, down_shift),
219 .baseArrayLayer = regs.clear_buffers.layer, 250 .baseArrayLayer = regs.clear_buffers.layer,
220 .layerCount = 1, 251 .layerCount = 1,
221 }; 252 };
@@ -230,7 +261,38 @@ void RasterizerVulkan::Clear() {
230 const u32 color_attachment = regs.clear_buffers.RT; 261 const u32 color_attachment = regs.clear_buffers.RT;
231 if (use_color && framebuffer->HasAspectColorBit(color_attachment)) { 262 if (use_color && framebuffer->HasAspectColorBit(color_attachment)) {
232 VkClearValue clear_value; 263 VkClearValue clear_value;
233 std::memcpy(clear_value.color.float32, regs.clear_color, sizeof(regs.clear_color)); 264 bool is_integer = false;
265 bool is_signed = false;
266 size_t int_size = 8;
267 for (std::size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++i) {
268 const auto& this_rt = regs.rt[i];
269 if (this_rt.Address() == 0) {
270 continue;
271 }
272 if (this_rt.format == Tegra::RenderTargetFormat::NONE) {
273 continue;
274 }
275 const auto format =
276 VideoCore::Surface::PixelFormatFromRenderTargetFormat(this_rt.format);
277 is_integer = IsPixelFormatInteger(format);
278 is_signed = IsPixelFormatSignedInteger(format);
279 int_size = PixelComponentSizeBitsInteger(format);
280 break;
281 }
282 if (!is_integer) {
283 std::memcpy(clear_value.color.float32, regs.clear_color, sizeof(regs.clear_color));
284 } else if (!is_signed) {
285 for (size_t i = 0; i < 4; i++) {
286 clear_value.color.uint32[i] = static_cast<u32>(
287 static_cast<f32>(static_cast<u64>(int_size) << 1U) * regs.clear_color[i]);
288 }
289 } else {
290 for (size_t i = 0; i < 4; i++) {
291 clear_value.color.int32[i] =
292 static_cast<s32>(static_cast<f32>(static_cast<s64>(int_size - 1) << 1) *
293 (regs.clear_color[i] - 0.5f));
294 }
295 }
234 296
235 scheduler.Record([color_attachment, clear_value, clear_rect](vk::CommandBuffer cmdbuf) { 297 scheduler.Record([color_attachment, clear_value, clear_rect](vk::CommandBuffer cmdbuf) {
236 const VkClearAttachment attachment{ 298 const VkClearAttachment attachment{
@@ -595,15 +657,17 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg
595 if (!state_tracker.TouchViewports()) { 657 if (!state_tracker.TouchViewports()) {
596 return; 658 return;
597 } 659 }
660 const bool is_rescaling{texture_cache.IsRescaling()};
661 const float scale = is_rescaling ? Settings::values.resolution_info.up_factor : 1.0f;
598 const std::array viewports{ 662 const std::array viewports{
599 GetViewportState(device, regs, 0), GetViewportState(device, regs, 1), 663 GetViewportState(device, regs, 0, scale), GetViewportState(device, regs, 1, scale),
600 GetViewportState(device, regs, 2), GetViewportState(device, regs, 3), 664 GetViewportState(device, regs, 2, scale), GetViewportState(device, regs, 3, scale),
601 GetViewportState(device, regs, 4), GetViewportState(device, regs, 5), 665 GetViewportState(device, regs, 4, scale), GetViewportState(device, regs, 5, scale),
602 GetViewportState(device, regs, 6), GetViewportState(device, regs, 7), 666 GetViewportState(device, regs, 6, scale), GetViewportState(device, regs, 7, scale),
603 GetViewportState(device, regs, 8), GetViewportState(device, regs, 9), 667 GetViewportState(device, regs, 8, scale), GetViewportState(device, regs, 9, scale),
604 GetViewportState(device, regs, 10), GetViewportState(device, regs, 11), 668 GetViewportState(device, regs, 10, scale), GetViewportState(device, regs, 11, scale),
605 GetViewportState(device, regs, 12), GetViewportState(device, regs, 13), 669 GetViewportState(device, regs, 12, scale), GetViewportState(device, regs, 13, scale),
606 GetViewportState(device, regs, 14), GetViewportState(device, regs, 15), 670 GetViewportState(device, regs, 14, scale), GetViewportState(device, regs, 15, scale),
607 }; 671 };
608 scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); }); 672 scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); });
609} 673}
@@ -612,13 +676,29 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs
612 if (!state_tracker.TouchScissors()) { 676 if (!state_tracker.TouchScissors()) {
613 return; 677 return;
614 } 678 }
679 u32 up_scale = 1;
680 u32 down_shift = 0;
681 if (texture_cache.IsRescaling()) {
682 up_scale = Settings::values.resolution_info.up_scale;
683 down_shift = Settings::values.resolution_info.down_shift;
684 }
615 const std::array scissors{ 685 const std::array scissors{
616 GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2), 686 GetScissorState(regs, 0, up_scale, down_shift),
617 GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5), 687 GetScissorState(regs, 1, up_scale, down_shift),
618 GetScissorState(regs, 6), GetScissorState(regs, 7), GetScissorState(regs, 8), 688 GetScissorState(regs, 2, up_scale, down_shift),
619 GetScissorState(regs, 9), GetScissorState(regs, 10), GetScissorState(regs, 11), 689 GetScissorState(regs, 3, up_scale, down_shift),
620 GetScissorState(regs, 12), GetScissorState(regs, 13), GetScissorState(regs, 14), 690 GetScissorState(regs, 4, up_scale, down_shift),
621 GetScissorState(regs, 15), 691 GetScissorState(regs, 5, up_scale, down_shift),
692 GetScissorState(regs, 6, up_scale, down_shift),
693 GetScissorState(regs, 7, up_scale, down_shift),
694 GetScissorState(regs, 8, up_scale, down_shift),
695 GetScissorState(regs, 9, up_scale, down_shift),
696 GetScissorState(regs, 10, up_scale, down_shift),
697 GetScissorState(regs, 11, up_scale, down_shift),
698 GetScissorState(regs, 12, up_scale, down_shift),
699 GetScissorState(regs, 13, up_scale, down_shift),
700 GetScissorState(regs, 14, up_scale, down_shift),
701 GetScissorState(regs, 15, up_scale, down_shift),
622 }; 702 };
623 scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); }); 703 scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); });
624} 704}
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 0c11c814f..3bfdf41ba 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -128,6 +128,15 @@ bool VKScheduler::UpdateGraphicsPipeline(GraphicsPipeline* pipeline) {
128 return true; 128 return true;
129} 129}
130 130
131bool VKScheduler::UpdateRescaling(bool is_rescaling) {
132 if (state.rescaling_defined && is_rescaling == state.is_rescaling) {
133 return false;
134 }
135 state.rescaling_defined = true;
136 state.is_rescaling = is_rescaling;
137 return true;
138}
139
131void VKScheduler::WorkerThread(std::stop_token stop_token) { 140void VKScheduler::WorkerThread(std::stop_token stop_token) {
132 Common::SetCurrentThreadName("yuzu:VulkanWorker"); 141 Common::SetCurrentThreadName("yuzu:VulkanWorker");
133 do { 142 do {
@@ -227,6 +236,7 @@ void VKScheduler::AllocateNewContext() {
227 236
228void VKScheduler::InvalidateState() { 237void VKScheduler::InvalidateState() {
229 state.graphics_pipeline = nullptr; 238 state.graphics_pipeline = nullptr;
239 state.rescaling_defined = false;
230 state_tracker.InvalidateCommandBufferState(); 240 state_tracker.InvalidateCommandBufferState();
231} 241}
232 242
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 85fc1712f..1b06c9296 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -56,6 +56,9 @@ public:
56 /// Update the pipeline to the current execution context. 56 /// Update the pipeline to the current execution context.
57 bool UpdateGraphicsPipeline(GraphicsPipeline* pipeline); 57 bool UpdateGraphicsPipeline(GraphicsPipeline* pipeline);
58 58
59 /// Update the rescaling state. Returns true if the state has to be updated.
60 bool UpdateRescaling(bool is_rescaling);
61
59 /// Invalidates current command buffer state except for render passes 62 /// Invalidates current command buffer state except for render passes
60 void InvalidateState(); 63 void InvalidateState();
61 64
@@ -185,6 +188,8 @@ private:
185 VkFramebuffer framebuffer = nullptr; 188 VkFramebuffer framebuffer = nullptr;
186 VkExtent2D render_area = {0, 0}; 189 VkExtent2D render_area = {0, 0};
187 GraphicsPipeline* graphics_pipeline = nullptr; 190 GraphicsPipeline* graphics_pipeline = nullptr;
191 bool is_rescaling = false;
192 bool rescaling_defined = false;
188 }; 193 };
189 194
190 void WorkerThread(std::stop_token stop_token); 195 void WorkerThread(std::stop_token stop_token);
diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h
index 2f2d6b31f..40a149832 100644
--- a/src/video_core/renderer_vulkan/vk_state_tracker.h
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.h
@@ -71,11 +71,15 @@ public:
71 } 71 }
72 72
73 bool TouchViewports() { 73 bool TouchViewports() {
74 return Exchange(Dirty::Viewports, false); 74 const bool dirty_viewports = Exchange(Dirty::Viewports, false);
75 const bool rescale_viewports = Exchange(VideoCommon::Dirty::RescaleViewports, false);
76 return dirty_viewports || rescale_viewports;
75 } 77 }
76 78
77 bool TouchScissors() { 79 bool TouchScissors() {
78 return Exchange(Dirty::Scissors, false); 80 const bool dirty_scissors = Exchange(Dirty::Scissors, false);
81 const bool rescale_scissors = Exchange(VideoCommon::Dirty::RescaleScissors, false);
82 return dirty_scissors || rescale_scissors;
79 } 83 }
80 84
81 bool TouchDepthBias() { 85 bool TouchDepthBias() {
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 06c5fb867..407fd2a15 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -32,10 +32,12 @@ using Tegra::Engines::Fermi2D;
32using Tegra::Texture::SwizzleSource; 32using Tegra::Texture::SwizzleSource;
33using Tegra::Texture::TextureMipmapFilter; 33using Tegra::Texture::TextureMipmapFilter;
34using VideoCommon::BufferImageCopy; 34using VideoCommon::BufferImageCopy;
35using VideoCommon::ImageFlagBits;
35using VideoCommon::ImageInfo; 36using VideoCommon::ImageInfo;
36using VideoCommon::ImageType; 37using VideoCommon::ImageType;
37using VideoCommon::SubresourceRange; 38using VideoCommon::SubresourceRange;
38using VideoCore::Surface::IsPixelFormatASTC; 39using VideoCore::Surface::IsPixelFormatASTC;
40using VideoCore::Surface::IsPixelFormatInteger;
39 41
40namespace { 42namespace {
41constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { 43constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
@@ -588,8 +590,158 @@ struct RangedBarrierRange {
588 UNREACHABLE_MSG("Invalid image format={}", format); 590 UNREACHABLE_MSG("Invalid image format={}", format);
589 return VK_FORMAT_R32_UINT; 591 return VK_FORMAT_R32_UINT;
590} 592}
593
594void BlitScale(VKScheduler& scheduler, VkImage src_image, VkImage dst_image, const ImageInfo& info,
595 VkImageAspectFlags aspect_mask, const Settings::ResolutionScalingInfo& resolution,
596 bool up_scaling = true) {
597 const bool is_2d = info.type == ImageType::e2D;
598 const auto resources = info.resources;
599 const VkExtent2D extent{
600 .width = info.size.width,
601 .height = info.size.height,
602 };
603 // Depth and integer formats must use NEAREST filter for blits.
604 const bool is_color{aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT};
605 const bool is_bilinear{is_color && !IsPixelFormatInteger(info.format)};
606 const VkFilter vk_filter = is_bilinear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST;
607
608 scheduler.RequestOutsideRenderPassOperationContext();
609 scheduler.Record([dst_image, src_image, extent, resources, aspect_mask, resolution, is_2d,
610 vk_filter, up_scaling](vk::CommandBuffer cmdbuf) {
611 const VkOffset2D src_size{
612 .x = static_cast<s32>(up_scaling ? extent.width : resolution.ScaleUp(extent.width)),
613 .y = static_cast<s32>(is_2d && up_scaling ? extent.height
614 : resolution.ScaleUp(extent.height)),
615 };
616 const VkOffset2D dst_size{
617 .x = static_cast<s32>(up_scaling ? resolution.ScaleUp(extent.width) : extent.width),
618 .y = static_cast<s32>(is_2d && up_scaling ? resolution.ScaleUp(extent.height)
619 : extent.height),
620 };
621 boost::container::small_vector<VkImageBlit, 4> regions;
622 regions.reserve(resources.levels);
623 for (s32 level = 0; level < resources.levels; level++) {
624 regions.push_back({
625 .srcSubresource{
626 .aspectMask = aspect_mask,
627 .mipLevel = static_cast<u32>(level),
628 .baseArrayLayer = 0,
629 .layerCount = static_cast<u32>(resources.layers),
630 },
631 .srcOffsets{
632 {
633 .x = 0,
634 .y = 0,
635 .z = 0,
636 },
637 {
638 .x = std::max(1, src_size.x >> level),
639 .y = std::max(1, src_size.y >> level),
640 .z = 1,
641 },
642 },
643 .dstSubresource{
644 .aspectMask = aspect_mask,
645 .mipLevel = static_cast<u32>(level),
646 .baseArrayLayer = 0,
647 .layerCount = static_cast<u32>(resources.layers),
648 },
649 .dstOffsets{
650 {
651 .x = 0,
652 .y = 0,
653 .z = 0,
654 },
655 {
656 .x = std::max(1, dst_size.x >> level),
657 .y = std::max(1, dst_size.y >> level),
658 .z = 1,
659 },
660 },
661 });
662 }
663 const VkImageSubresourceRange subresource_range{
664 .aspectMask = aspect_mask,
665 .baseMipLevel = 0,
666 .levelCount = VK_REMAINING_MIP_LEVELS,
667 .baseArrayLayer = 0,
668 .layerCount = VK_REMAINING_ARRAY_LAYERS,
669 };
670 const std::array read_barriers{
671 VkImageMemoryBarrier{
672 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
673 .pNext = nullptr,
674 .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
675 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
676 .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
677 .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
678 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
679 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
680 .image = src_image,
681 .subresourceRange = subresource_range,
682 },
683 VkImageMemoryBarrier{
684 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
685 .pNext = nullptr,
686 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT |
687 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
688 VK_ACCESS_TRANSFER_WRITE_BIT,
689 .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
690 .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED, // Discard contents
691 .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
692 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
693 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
694 .image = dst_image,
695 .subresourceRange = subresource_range,
696 },
697 };
698 const std::array write_barriers{
699 VkImageMemoryBarrier{
700 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
701 .pNext = nullptr,
702 .srcAccessMask = 0,
703 .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT | VK_ACCESS_MEMORY_READ_BIT,
704 .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
705 .newLayout = VK_IMAGE_LAYOUT_GENERAL,
706 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
707 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
708 .image = src_image,
709 .subresourceRange = subresource_range,
710 },
711 VkImageMemoryBarrier{
712 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
713 .pNext = nullptr,
714 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
715 .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT | VK_ACCESS_MEMORY_READ_BIT,
716 .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
717 .newLayout = VK_IMAGE_LAYOUT_GENERAL,
718 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
719 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
720 .image = dst_image,
721 .subresourceRange = subresource_range,
722 },
723 };
724 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
725 0, nullptr, nullptr, read_barriers);
726 cmdbuf.BlitImage(src_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst_image,
727 VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, regions, vk_filter);
728 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
729 0, nullptr, nullptr, write_barriers);
730 });
731}
591} // Anonymous namespace 732} // Anonymous namespace
592 733
734TextureCacheRuntime::TextureCacheRuntime(const Device& device_, VKScheduler& scheduler_,
735 MemoryAllocator& memory_allocator_,
736 StagingBufferPool& staging_buffer_pool_,
737 BlitImageHelper& blit_image_helper_,
738 ASTCDecoderPass& astc_decoder_pass_,
739 RenderPassCache& render_pass_cache_)
740 : device{device_}, scheduler{scheduler_}, memory_allocator{memory_allocator_},
741 staging_buffer_pool{staging_buffer_pool_}, blit_image_helper{blit_image_helper_},
742 astc_decoder_pass{astc_decoder_pass_}, render_pass_cache{render_pass_cache_},
743 resolution{Settings::values.resolution_info} {}
744
593void TextureCacheRuntime::Finish() { 745void TextureCacheRuntime::Finish() {
594 scheduler.Finish(); 746 scheduler.Finish();
595} 747}
@@ -614,8 +766,8 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst
614 return; 766 return;
615 } 767 }
616 if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT && !is_src_msaa && !is_dst_msaa) { 768 if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT && !is_src_msaa && !is_dst_msaa) {
617 blit_image_helper.BlitColor(dst_framebuffer, src, dst_region, src_region, filter, 769 blit_image_helper.BlitColor(dst_framebuffer, src.Handle(Shader::TextureType::Color2D),
618 operation); 770 dst_region, src_region, filter, operation);
619 return; 771 return;
620 } 772 }
621 if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { 773 if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
@@ -719,26 +871,29 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst
719 }); 871 });
720} 872}
721 873
722void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) { 874void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view,
875 bool rescaled) {
876 const u32 up_scale = rescaled ? resolution.up_scale : 1;
877 const u32 down_shift = rescaled ? resolution.down_shift : 0;
723 switch (dst_view.format) { 878 switch (dst_view.format) {
724 case PixelFormat::R16_UNORM: 879 case PixelFormat::R16_UNORM:
725 if (src_view.format == PixelFormat::D16_UNORM) { 880 if (src_view.format == PixelFormat::D16_UNORM) {
726 return blit_image_helper.ConvertD16ToR16(dst, src_view); 881 return blit_image_helper.ConvertD16ToR16(dst, src_view, up_scale, down_shift);
727 } 882 }
728 break; 883 break;
729 case PixelFormat::R32_FLOAT: 884 case PixelFormat::R32_FLOAT:
730 if (src_view.format == PixelFormat::D32_FLOAT) { 885 if (src_view.format == PixelFormat::D32_FLOAT) {
731 return blit_image_helper.ConvertD32ToR32(dst, src_view); 886 return blit_image_helper.ConvertD32ToR32(dst, src_view, up_scale, down_shift);
732 } 887 }
733 break; 888 break;
734 case PixelFormat::D16_UNORM: 889 case PixelFormat::D16_UNORM:
735 if (src_view.format == PixelFormat::R16_UNORM) { 890 if (src_view.format == PixelFormat::R16_UNORM) {
736 return blit_image_helper.ConvertR16ToD16(dst, src_view); 891 return blit_image_helper.ConvertR16ToD16(dst, src_view, up_scale, down_shift);
737 } 892 }
738 break; 893 break;
739 case PixelFormat::D32_FLOAT: 894 case PixelFormat::D32_FLOAT:
740 if (src_view.format == PixelFormat::R32_FLOAT) { 895 if (src_view.format == PixelFormat::R32_FLOAT) {
741 return blit_image_helper.ConvertR32ToD32(dst, src_view); 896 return blit_image_helper.ConvertR32ToD32(dst, src_view, up_scale, down_shift);
742 } 897 }
743 break; 898 break;
744 default: 899 default:
@@ -840,36 +995,39 @@ u64 TextureCacheRuntime::GetDeviceLocalMemory() const {
840 return device.GetDeviceLocalMemory(); 995 return device.GetDeviceLocalMemory();
841} 996}
842 997
843Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_addr_, 998void TextureCacheRuntime::TickFrame() {}
999
1000Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu_addr_,
844 VAddr cpu_addr_) 1001 VAddr cpu_addr_)
845 : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime.scheduler}, 1002 : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime_.scheduler},
846 image(MakeImage(runtime.device, info)), 1003 runtime{&runtime_}, original_image(MakeImage(runtime_.device, info)),
847 commit(runtime.memory_allocator.Commit(image, MemoryUsage::DeviceLocal)), 1004 commit(runtime_.memory_allocator.Commit(original_image, MemoryUsage::DeviceLocal)),
848 aspect_mask(ImageAspectMask(info.format)) { 1005 aspect_mask(ImageAspectMask(info.format)) {
849 if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { 1006 if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) {
850 if (Settings::values.accelerate_astc.GetValue()) { 1007 if (Settings::values.accelerate_astc.GetValue()) {
851 flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; 1008 flags |= VideoCommon::ImageFlagBits::AcceleratedUpload;
852 } else { 1009 } else {
853 flags |= VideoCommon::ImageFlagBits::Converted; 1010 flags |= VideoCommon::ImageFlagBits::Converted;
854 } 1011 }
855 } 1012 }
856 if (runtime.device.HasDebuggingToolAttached()) { 1013 if (runtime->device.HasDebuggingToolAttached()) {
857 image.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); 1014 original_image.SetObjectNameEXT(VideoCommon::Name(*this).c_str());
858 } 1015 }
859 static constexpr VkImageViewUsageCreateInfo storage_image_view_usage_create_info{ 1016 static constexpr VkImageViewUsageCreateInfo storage_image_view_usage_create_info{
860 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, 1017 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO,
861 .pNext = nullptr, 1018 .pNext = nullptr,
862 .usage = VK_IMAGE_USAGE_STORAGE_BIT, 1019 .usage = VK_IMAGE_USAGE_STORAGE_BIT,
863 }; 1020 };
864 if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { 1021 current_image = *original_image;
865 const auto& device = runtime.device.GetLogical(); 1022 if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) {
1023 const auto& device = runtime->device.GetLogical();
866 storage_image_views.reserve(info.resources.levels); 1024 storage_image_views.reserve(info.resources.levels);
867 for (s32 level = 0; level < info.resources.levels; ++level) { 1025 for (s32 level = 0; level < info.resources.levels; ++level) {
868 storage_image_views.push_back(device.CreateImageView(VkImageViewCreateInfo{ 1026 storage_image_views.push_back(device.CreateImageView(VkImageViewCreateInfo{
869 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, 1027 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
870 .pNext = &storage_image_view_usage_create_info, 1028 .pNext = &storage_image_view_usage_create_info,
871 .flags = 0, 1029 .flags = 0,
872 .image = *image, 1030 .image = *original_image,
873 .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, 1031 .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY,
874 .format = VK_FORMAT_A8B8G8R8_UNORM_PACK32, 1032 .format = VK_FORMAT_A8B8G8R8_UNORM_PACK32,
875 .components{ 1033 .components{
@@ -890,26 +1048,39 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
890 } 1048 }
891} 1049}
892 1050
1051Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBase{params} {}
1052
893Image::~Image() = default; 1053Image::~Image() = default;
894 1054
895void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { 1055void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
896 // TODO: Move this to another API 1056 // TODO: Move this to another API
1057 const bool is_rescaled = True(flags & ImageFlagBits::Rescaled);
1058 if (is_rescaled) {
1059 ScaleDown(true);
1060 }
897 scheduler->RequestOutsideRenderPassOperationContext(); 1061 scheduler->RequestOutsideRenderPassOperationContext();
898 std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); 1062 std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
899 const VkBuffer src_buffer = map.buffer; 1063 const VkBuffer src_buffer = map.buffer;
900 const VkImage vk_image = *image; 1064 const VkImage vk_image = *original_image;
901 const VkImageAspectFlags vk_aspect_mask = aspect_mask; 1065 const VkImageAspectFlags vk_aspect_mask = aspect_mask;
902 const bool is_initialized = std::exchange(initialized, true); 1066 const bool is_initialized = std::exchange(initialized, true);
903 scheduler->Record([src_buffer, vk_image, vk_aspect_mask, is_initialized, 1067 scheduler->Record([src_buffer, vk_image, vk_aspect_mask, is_initialized,
904 vk_copies](vk::CommandBuffer cmdbuf) { 1068 vk_copies](vk::CommandBuffer cmdbuf) {
905 CopyBufferToImage(cmdbuf, src_buffer, vk_image, vk_aspect_mask, is_initialized, vk_copies); 1069 CopyBufferToImage(cmdbuf, src_buffer, vk_image, vk_aspect_mask, is_initialized, vk_copies);
906 }); 1070 });
1071 if (is_rescaled) {
1072 ScaleUp();
1073 }
907} 1074}
908 1075
909void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { 1076void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
1077 const bool is_rescaled = True(flags & ImageFlagBits::Rescaled);
1078 if (is_rescaled) {
1079 ScaleDown();
1080 }
910 std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); 1081 std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
911 scheduler->RequestOutsideRenderPassOperationContext(); 1082 scheduler->RequestOutsideRenderPassOperationContext();
912 scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask, 1083 scheduler->Record([buffer = map.buffer, image = *original_image, aspect_mask = aspect_mask,
913 vk_copies](vk::CommandBuffer cmdbuf) { 1084 vk_copies](vk::CommandBuffer cmdbuf) {
914 const VkImageMemoryBarrier read_barrier{ 1085 const VkImageMemoryBarrier read_barrier{
915 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, 1086 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
@@ -959,6 +1130,146 @@ void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferIm
959 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 1130 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
960 0, memory_write_barrier, nullptr, image_write_barrier); 1131 0, memory_write_barrier, nullptr, image_write_barrier);
961 }); 1132 });
1133 if (is_rescaled) {
1134 ScaleUp(true);
1135 }
1136}
1137
1138bool Image::ScaleUp(bool ignore) {
1139 if (True(flags & ImageFlagBits::Rescaled)) {
1140 return false;
1141 }
1142 ASSERT(info.type != ImageType::Linear);
1143 flags |= ImageFlagBits::Rescaled;
1144 const auto& resolution = runtime->resolution;
1145 if (!resolution.active) {
1146 return false;
1147 }
1148 has_scaled = true;
1149 const auto& device = runtime->device;
1150 if (!scaled_image) {
1151 const bool is_2d = info.type == ImageType::e2D;
1152 const u32 scaled_width = resolution.ScaleUp(info.size.width);
1153 const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height;
1154 auto scaled_info = info;
1155 scaled_info.size.width = scaled_width;
1156 scaled_info.size.height = scaled_height;
1157 scaled_image = MakeImage(device, scaled_info);
1158 auto& allocator = runtime->memory_allocator;
1159 scaled_commit = MemoryCommit(allocator.Commit(scaled_image, MemoryUsage::DeviceLocal));
1160 ignore = false;
1161 }
1162 current_image = *scaled_image;
1163 if (ignore) {
1164 return true;
1165 }
1166
1167 if (aspect_mask == 0) {
1168 aspect_mask = ImageAspectMask(info.format);
1169 }
1170 static constexpr auto OPTIMAL_FORMAT = FormatType::Optimal;
1171 const PixelFormat format = StorageFormat(info.format);
1172 const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format;
1173 const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT;
1174 if (device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT)) {
1175 BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution);
1176 } else {
1177 return BlitScaleHelper(true);
1178 }
1179 return true;
1180}
1181
1182bool Image::ScaleDown(bool ignore) {
1183 if (False(flags & ImageFlagBits::Rescaled)) {
1184 return false;
1185 }
1186 ASSERT(info.type != ImageType::Linear);
1187 flags &= ~ImageFlagBits::Rescaled;
1188 const auto& resolution = runtime->resolution;
1189 if (!resolution.active) {
1190 return false;
1191 }
1192 current_image = *original_image;
1193 if (ignore) {
1194 return true;
1195 }
1196 if (aspect_mask == 0) {
1197 aspect_mask = ImageAspectMask(info.format);
1198 }
1199 static constexpr auto OPTIMAL_FORMAT = FormatType::Optimal;
1200 const PixelFormat format = StorageFormat(info.format);
1201 const auto& device = runtime->device;
1202 const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format;
1203 const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT;
1204 if (device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT)) {
1205 BlitScale(*scheduler, *scaled_image, *original_image, info, aspect_mask, resolution, false);
1206 } else {
1207 return BlitScaleHelper(false);
1208 }
1209 return true;
1210}
1211
1212bool Image::BlitScaleHelper(bool scale_up) {
1213 using namespace VideoCommon;
1214 static constexpr auto BLIT_OPERATION = Tegra::Engines::Fermi2D::Operation::SrcCopy;
1215 const bool is_color{aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT};
1216 const bool is_bilinear{is_color && !IsPixelFormatInteger(info.format)};
1217 const auto operation = is_bilinear ? Tegra::Engines::Fermi2D::Filter::Bilinear
1218 : Tegra::Engines::Fermi2D::Filter::Point;
1219
1220 const bool is_2d = info.type == ImageType::e2D;
1221 const auto& resolution = runtime->resolution;
1222 const u32 scaled_width = resolution.ScaleUp(info.size.width);
1223 const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height;
1224 std::unique_ptr<ImageView>& blit_view = scale_up ? scale_view : normal_view;
1225 std::unique_ptr<Framebuffer>& blit_framebuffer =
1226 scale_up ? scale_framebuffer : normal_framebuffer;
1227 if (!blit_view) {
1228 const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format);
1229 blit_view = std::make_unique<ImageView>(*runtime, view_info, NULL_IMAGE_ID, *this);
1230 }
1231
1232 const u32 src_width = scale_up ? info.size.width : scaled_width;
1233 const u32 src_height = scale_up ? info.size.height : scaled_height;
1234 const u32 dst_width = scale_up ? scaled_width : info.size.width;
1235 const u32 dst_height = scale_up ? scaled_height : info.size.height;
1236 const Region2D src_region{
1237 .start = {0, 0},
1238 .end = {static_cast<s32>(src_width), static_cast<s32>(src_height)},
1239 };
1240 const Region2D dst_region{
1241 .start = {0, 0},
1242 .end = {static_cast<s32>(dst_width), static_cast<s32>(dst_height)},
1243 };
1244 const VkExtent2D extent{
1245 .width = std::max(scaled_width, info.size.width),
1246 .height = std::max(scaled_height, info.size.width),
1247 };
1248
1249 auto* view_ptr = blit_view.get();
1250 if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) {
1251 if (!blit_framebuffer) {
1252 blit_framebuffer = std::make_unique<Framebuffer>(*runtime, view_ptr, nullptr, extent);
1253 }
1254 const auto color_view = blit_view->Handle(Shader::TextureType::Color2D);
1255
1256 runtime->blit_image_helper.BlitColor(blit_framebuffer.get(), color_view, dst_region,
1257 src_region, operation, BLIT_OPERATION);
1258 } else if (!runtime->device.IsBlitDepthStencilSupported() &&
1259 aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
1260 if (!blit_framebuffer) {
1261 blit_framebuffer = std::make_unique<Framebuffer>(*runtime, nullptr, view_ptr, extent);
1262 }
1263 runtime->blit_image_helper.BlitDepthStencil(blit_framebuffer.get(), blit_view->DepthView(),
1264 blit_view->StencilView(), dst_region,
1265 src_region, operation, BLIT_OPERATION);
1266 } else {
1267 // TODO: Use helper blits where applicable
1268 flags &= ~ImageFlagBits::Rescaled;
1269 LOG_ERROR(Render_Vulkan, "Device does not support scaling format {}", info.format);
1270 return false;
1271 }
1272 return true;
962} 1273}
963 1274
964ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, 1275ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info,
@@ -1052,9 +1363,11 @@ ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info,
1052 : VideoCommon::ImageViewBase{info, view_info}, gpu_addr{gpu_addr_}, 1363 : VideoCommon::ImageViewBase{info, view_info}, gpu_addr{gpu_addr_},
1053 buffer_size{VideoCommon::CalculateGuestSizeInBytes(info)} {} 1364 buffer_size{VideoCommon::CalculateGuestSizeInBytes(info)} {}
1054 1365
1055ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams& params) 1366ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::NullImageViewParams& params)
1056 : VideoCommon::ImageViewBase{params} {} 1367 : VideoCommon::ImageViewBase{params} {}
1057 1368
1369ImageView::~ImageView() = default;
1370
1058VkImageView ImageView::DepthView() { 1371VkImageView ImageView::DepthView() {
1059 if (depth_view) { 1372 if (depth_view) {
1060 return *depth_view; 1373 return *depth_view;
@@ -1137,7 +1450,8 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const Tegra::Texture::TSCEntry& t
1137 LOG_WARNING(Render_Vulkan, "VK_EXT_sampler_filter_minmax is required"); 1450 LOG_WARNING(Render_Vulkan, "VK_EXT_sampler_filter_minmax is required");
1138 } 1451 }
1139 // Some games have samplers with garbage. Sanitize them here. 1452 // Some games have samplers with garbage. Sanitize them here.
1140 const float max_anisotropy = std::clamp(tsc.MaxAnisotropy(), 1.0f, 16.0f); 1453 const f32 max_anisotropy = std::clamp(tsc.MaxAnisotropy(), 1.0f, 16.0f);
1454
1141 sampler = device.GetLogical().CreateSampler(VkSamplerCreateInfo{ 1455 sampler = device.GetLogical().CreateSampler(VkSamplerCreateInfo{
1142 .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, 1456 .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
1143 .pNext = pnext, 1457 .pNext = pnext,
@@ -1162,7 +1476,29 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const Tegra::Texture::TSCEntry& t
1162} 1476}
1163 1477
1164Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers, 1478Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers,
1165 ImageView* depth_buffer, const VideoCommon::RenderTargets& key) { 1479 ImageView* depth_buffer, const VideoCommon::RenderTargets& key)
1480 : render_area{VkExtent2D{
1481 .width = key.size.width,
1482 .height = key.size.height,
1483 }} {
1484 CreateFramebuffer(runtime, color_buffers, depth_buffer);
1485 if (runtime.device.HasDebuggingToolAttached()) {
1486 framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str());
1487 }
1488}
1489
1490Framebuffer::Framebuffer(TextureCacheRuntime& runtime, ImageView* color_buffer,
1491 ImageView* depth_buffer, VkExtent2D extent)
1492 : render_area{extent} {
1493 std::array<ImageView*, NUM_RT> color_buffers{color_buffer};
1494 CreateFramebuffer(runtime, color_buffers, depth_buffer);
1495}
1496
1497Framebuffer::~Framebuffer() = default;
1498
1499void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
1500 std::span<ImageView*, NUM_RT> color_buffers,
1501 ImageView* depth_buffer) {
1166 std::vector<VkImageView> attachments; 1502 std::vector<VkImageView> attachments;
1167 RenderPassKey renderpass_key{}; 1503 RenderPassKey renderpass_key{};
1168 s32 num_layers = 1; 1504 s32 num_layers = 1;
@@ -1200,10 +1536,6 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
1200 1536
1201 renderpass = runtime.render_pass_cache.Get(renderpass_key); 1537 renderpass = runtime.render_pass_cache.Get(renderpass_key);
1202 1538
1203 render_area = VkExtent2D{
1204 .width = key.size.width,
1205 .height = key.size.height,
1206 };
1207 num_color_buffers = static_cast<u32>(num_colors); 1539 num_color_buffers = static_cast<u32>(num_colors);
1208 framebuffer = runtime.device.GetLogical().CreateFramebuffer({ 1540 framebuffer = runtime.device.GetLogical().CreateFramebuffer({
1209 .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, 1541 .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
@@ -1212,13 +1544,10 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
1212 .renderPass = renderpass, 1544 .renderPass = renderpass,
1213 .attachmentCount = static_cast<u32>(attachments.size()), 1545 .attachmentCount = static_cast<u32>(attachments.size()),
1214 .pAttachments = attachments.data(), 1546 .pAttachments = attachments.data(),
1215 .width = key.size.width, 1547 .width = render_area.width,
1216 .height = key.size.height, 1548 .height = render_area.height,
1217 .layers = static_cast<u32>(std::max(num_layers, 1)), 1549 .layers = static_cast<u32>(std::max(num_layers, 1)),
1218 }); 1550 });
1219 if (runtime.device.HasDebuggingToolAttached()) {
1220 framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str());
1221 }
1222} 1551}
1223 1552
1224void TextureCacheRuntime::AccelerateImageUpload( 1553void TextureCacheRuntime::AccelerateImageUpload(
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index b09c468e4..ff28b4e96 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -13,6 +13,10 @@
13#include "video_core/vulkan_common/vulkan_memory_allocator.h" 13#include "video_core/vulkan_common/vulkan_memory_allocator.h"
14#include "video_core/vulkan_common/vulkan_wrapper.h" 14#include "video_core/vulkan_common/vulkan_wrapper.h"
15 15
16namespace Settings {
17struct ResolutionScalingInfo;
18}
19
16namespace Vulkan { 20namespace Vulkan {
17 21
18using VideoCommon::ImageId; 22using VideoCommon::ImageId;
@@ -31,14 +35,14 @@ class RenderPassCache;
31class StagingBufferPool; 35class StagingBufferPool;
32class VKScheduler; 36class VKScheduler;
33 37
34struct TextureCacheRuntime { 38class TextureCacheRuntime {
35 const Device& device; 39public:
36 VKScheduler& scheduler; 40 explicit TextureCacheRuntime(const Device& device_, VKScheduler& scheduler_,
37 MemoryAllocator& memory_allocator; 41 MemoryAllocator& memory_allocator_,
38 StagingBufferPool& staging_buffer_pool; 42 StagingBufferPool& staging_buffer_pool_,
39 BlitImageHelper& blit_image_helper; 43 BlitImageHelper& blit_image_helper_,
40 ASTCDecoderPass& astc_decoder_pass; 44 ASTCDecoderPass& astc_decoder_pass_,
41 RenderPassCache& render_pass_cache; 45 RenderPassCache& render_pass_cache_);
42 46
43 void Finish(); 47 void Finish();
44 48
@@ -46,6 +50,10 @@ struct TextureCacheRuntime {
46 50
47 StagingBufferRef DownloadStagingBuffer(size_t size); 51 StagingBufferRef DownloadStagingBuffer(size_t size);
48 52
53 void TickFrame();
54
55 u64 GetDeviceLocalMemory() const;
56
49 void BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src, 57 void BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src,
50 const Region2D& dst_region, const Region2D& src_region, 58 const Region2D& dst_region, const Region2D& src_region,
51 Tegra::Engines::Fermi2D::Filter filter, 59 Tegra::Engines::Fermi2D::Filter filter,
@@ -53,7 +61,7 @@ struct TextureCacheRuntime {
53 61
54 void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); 62 void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
55 63
56 void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view); 64 void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view, bool rescaled);
57 65
58 bool CanAccelerateImageUpload(Image&) const noexcept { 66 bool CanAccelerateImageUpload(Image&) const noexcept {
59 return false; 67 return false;
@@ -74,13 +82,21 @@ struct TextureCacheRuntime {
74 return true; 82 return true;
75 } 83 }
76 84
77 u64 GetDeviceLocalMemory() const; 85 const Device& device;
86 VKScheduler& scheduler;
87 MemoryAllocator& memory_allocator;
88 StagingBufferPool& staging_buffer_pool;
89 BlitImageHelper& blit_image_helper;
90 ASTCDecoderPass& astc_decoder_pass;
91 RenderPassCache& render_pass_cache;
92 const Settings::ResolutionScalingInfo& resolution;
78}; 93};
79 94
80class Image : public VideoCommon::ImageBase { 95class Image : public VideoCommon::ImageBase {
81public: 96public:
82 explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, 97 explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
83 VAddr cpu_addr); 98 VAddr cpu_addr);
99 explicit Image(const VideoCommon::NullImageParams&);
84 100
85 ~Image(); 101 ~Image();
86 102
@@ -97,7 +113,7 @@ public:
97 std::span<const VideoCommon::BufferImageCopy> copies); 113 std::span<const VideoCommon::BufferImageCopy> copies);
98 114
99 [[nodiscard]] VkImage Handle() const noexcept { 115 [[nodiscard]] VkImage Handle() const noexcept {
100 return *image; 116 return current_image;
101 } 117 }
102 118
103 [[nodiscard]] VkImageAspectFlags AspectMask() const noexcept { 119 [[nodiscard]] VkImageAspectFlags AspectMask() const noexcept {
@@ -113,14 +129,30 @@ public:
113 return std::exchange(initialized, true); 129 return std::exchange(initialized, true);
114 } 130 }
115 131
132 bool ScaleUp(bool ignore = false);
133
134 bool ScaleDown(bool ignore = false);
135
116private: 136private:
117 VKScheduler* scheduler; 137 bool BlitScaleHelper(bool scale_up);
118 vk::Image image; 138
139 VKScheduler* scheduler{};
140 TextureCacheRuntime* runtime{};
141
142 vk::Image original_image;
119 MemoryCommit commit; 143 MemoryCommit commit;
120 vk::ImageView image_view;
121 std::vector<vk::ImageView> storage_image_views; 144 std::vector<vk::ImageView> storage_image_views;
122 VkImageAspectFlags aspect_mask = 0; 145 VkImageAspectFlags aspect_mask = 0;
123 bool initialized = false; 146 bool initialized = false;
147 vk::Image scaled_image{};
148 MemoryCommit scaled_commit{};
149 VkImage current_image{};
150
151 std::unique_ptr<Framebuffer> scale_framebuffer;
152 std::unique_ptr<ImageView> scale_view;
153
154 std::unique_ptr<Framebuffer> normal_framebuffer;
155 std::unique_ptr<ImageView> normal_view;
124}; 156};
125 157
126class ImageView : public VideoCommon::ImageViewBase { 158class ImageView : public VideoCommon::ImageViewBase {
@@ -128,7 +160,15 @@ public:
128 explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageViewInfo&, ImageId, Image&); 160 explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageViewInfo&, ImageId, Image&);
129 explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo&, 161 explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo&,
130 const VideoCommon::ImageViewInfo&, GPUVAddr); 162 const VideoCommon::ImageViewInfo&, GPUVAddr);
131 explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams&); 163 explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageViewParams&);
164
165 ~ImageView();
166
167 ImageView(const ImageView&) = delete;
168 ImageView& operator=(const ImageView&) = delete;
169
170 ImageView(ImageView&&) = default;
171 ImageView& operator=(ImageView&&) = default;
132 172
133 [[nodiscard]] VkImageView DepthView(); 173 [[nodiscard]] VkImageView DepthView();
134 174
@@ -197,9 +237,23 @@ private:
197 237
198class Framebuffer { 238class Framebuffer {
199public: 239public:
200 explicit Framebuffer(TextureCacheRuntime&, std::span<ImageView*, NUM_RT> color_buffers, 240 explicit Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers,
201 ImageView* depth_buffer, const VideoCommon::RenderTargets& key); 241 ImageView* depth_buffer, const VideoCommon::RenderTargets& key);
202 242
243 explicit Framebuffer(TextureCacheRuntime& runtime, ImageView* color_buffer,
244 ImageView* depth_buffer, VkExtent2D extent);
245
246 ~Framebuffer();
247
248 Framebuffer(const Framebuffer&) = delete;
249 Framebuffer& operator=(const Framebuffer&) = delete;
250
251 Framebuffer(Framebuffer&&) = default;
252 Framebuffer& operator=(Framebuffer&&) = default;
253
254 void CreateFramebuffer(TextureCacheRuntime& runtime,
255 std::span<ImageView*, NUM_RT> color_buffers, ImageView* depth_buffer);
256
203 [[nodiscard]] VkFramebuffer Handle() const noexcept { 257 [[nodiscard]] VkFramebuffer Handle() const noexcept {
204 return *framebuffer; 258 return *framebuffer;
205 } 259 }
diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp
index eb1746265..58d262446 100644
--- a/src/video_core/surface.cpp
+++ b/src/video_core/surface.cpp
@@ -279,6 +279,80 @@ bool IsPixelFormatSRGB(PixelFormat format) {
279 } 279 }
280} 280}
281 281
282bool IsPixelFormatInteger(PixelFormat format) {
283 switch (format) {
284 case PixelFormat::A8B8G8R8_SINT:
285 case PixelFormat::A8B8G8R8_UINT:
286 case PixelFormat::A2B10G10R10_UINT:
287 case PixelFormat::R8_SINT:
288 case PixelFormat::R8_UINT:
289 case PixelFormat::R16G16B16A16_SINT:
290 case PixelFormat::R16G16B16A16_UINT:
291 case PixelFormat::R32G32B32A32_UINT:
292 case PixelFormat::R32G32B32A32_SINT:
293 case PixelFormat::R32G32_SINT:
294 case PixelFormat::R16_UINT:
295 case PixelFormat::R16_SINT:
296 case PixelFormat::R16G16_UINT:
297 case PixelFormat::R16G16_SINT:
298 case PixelFormat::R8G8_SINT:
299 case PixelFormat::R8G8_UINT:
300 case PixelFormat::R32G32_UINT:
301 case PixelFormat::R32_UINT:
302 case PixelFormat::R32_SINT:
303 return true;
304 default:
305 return false;
306 }
307}
308
309bool IsPixelFormatSignedInteger(PixelFormat format) {
310 switch (format) {
311 case PixelFormat::A8B8G8R8_SINT:
312 case PixelFormat::R8_SINT:
313 case PixelFormat::R16G16B16A16_SINT:
314 case PixelFormat::R32G32B32A32_SINT:
315 case PixelFormat::R32G32_SINT:
316 case PixelFormat::R16_SINT:
317 case PixelFormat::R16G16_SINT:
318 case PixelFormat::R8G8_SINT:
319 case PixelFormat::R32_SINT:
320 return true;
321 default:
322 return false;
323 }
324}
325
326size_t PixelComponentSizeBitsInteger(PixelFormat format) {
327 switch (format) {
328 case PixelFormat::A8B8G8R8_SINT:
329 case PixelFormat::A8B8G8R8_UINT:
330 case PixelFormat::R8_SINT:
331 case PixelFormat::R8_UINT:
332 case PixelFormat::R8G8_SINT:
333 case PixelFormat::R8G8_UINT:
334 return 8;
335 case PixelFormat::A2B10G10R10_UINT:
336 return 10;
337 case PixelFormat::R16G16B16A16_SINT:
338 case PixelFormat::R16G16B16A16_UINT:
339 case PixelFormat::R16_UINT:
340 case PixelFormat::R16_SINT:
341 case PixelFormat::R16G16_UINT:
342 case PixelFormat::R16G16_SINT:
343 return 16;
344 case PixelFormat::R32G32B32A32_UINT:
345 case PixelFormat::R32G32B32A32_SINT:
346 case PixelFormat::R32G32_SINT:
347 case PixelFormat::R32G32_UINT:
348 case PixelFormat::R32_UINT:
349 case PixelFormat::R32_SINT:
350 return 32;
351 default:
352 return 0;
353 }
354}
355
282std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { 356std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) {
283 return {DefaultBlockWidth(format), DefaultBlockHeight(format)}; 357 return {DefaultBlockWidth(format), DefaultBlockHeight(format)};
284} 358}
diff --git a/src/video_core/surface.h b/src/video_core/surface.h
index 1503db81f..2ce7c7d33 100644
--- a/src/video_core/surface.h
+++ b/src/video_core/surface.h
@@ -460,6 +460,12 @@ bool IsPixelFormatASTC(PixelFormat format);
460 460
461bool IsPixelFormatSRGB(PixelFormat format); 461bool IsPixelFormatSRGB(PixelFormat format);
462 462
463bool IsPixelFormatInteger(PixelFormat format);
464
465bool IsPixelFormatSignedInteger(PixelFormat format);
466
467size_t PixelComponentSizeBitsInteger(PixelFormat format);
468
463std::pair<u32, u32> GetASTCBlockSize(PixelFormat format); 469std::pair<u32, u32> GetASTCBlockSize(PixelFormat format);
464 470
465u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format); 471u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format);
diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp
index 6052d148a..3db2fdf34 100644
--- a/src/video_core/texture_cache/image_base.cpp
+++ b/src/video_core/texture_cache/image_base.cpp
@@ -60,15 +60,17 @@ namespace {
60ImageBase::ImageBase(const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) 60ImageBase::ImageBase(const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_)
61 : info{info_}, guest_size_bytes{CalculateGuestSizeInBytes(info)}, 61 : info{info_}, guest_size_bytes{CalculateGuestSizeInBytes(info)},
62 unswizzled_size_bytes{CalculateUnswizzledSizeBytes(info)}, 62 unswizzled_size_bytes{CalculateUnswizzledSizeBytes(info)},
63 converted_size_bytes{CalculateConvertedSizeBytes(info)}, gpu_addr{gpu_addr_}, 63 converted_size_bytes{CalculateConvertedSizeBytes(info)}, scale_rating{}, scale_tick{},
64 cpu_addr{cpu_addr_}, cpu_addr_end{cpu_addr + guest_size_bytes}, 64 has_scaled{}, gpu_addr{gpu_addr_}, cpu_addr{cpu_addr_},
65 mip_level_offsets{CalculateMipLevelOffsets(info)} { 65 cpu_addr_end{cpu_addr + guest_size_bytes}, mip_level_offsets{CalculateMipLevelOffsets(info)} {
66 if (info.type == ImageType::e3D) { 66 if (info.type == ImageType::e3D) {
67 slice_offsets = CalculateSliceOffsets(info); 67 slice_offsets = CalculateSliceOffsets(info);
68 slice_subresources = CalculateSliceSubresources(info); 68 slice_subresources = CalculateSliceSubresources(info);
69 } 69 }
70} 70}
71 71
72ImageBase::ImageBase(const NullImageParams&) {}
73
72ImageMapView::ImageMapView(GPUVAddr gpu_addr_, VAddr cpu_addr_, size_t size_, ImageId image_id_) 74ImageMapView::ImageMapView(GPUVAddr gpu_addr_, VAddr cpu_addr_, size_t size_, ImageId image_id_)
73 : gpu_addr{gpu_addr_}, cpu_addr{cpu_addr_}, size{size_}, image_id{image_id_} {} 75 : gpu_addr{gpu_addr_}, cpu_addr{cpu_addr_}, size{size_}, image_id{image_id_} {}
74 76
@@ -254,6 +256,8 @@ void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_i
254 } 256 }
255 lhs.aliased_images.push_back(std::move(lhs_alias)); 257 lhs.aliased_images.push_back(std::move(lhs_alias));
256 rhs.aliased_images.push_back(std::move(rhs_alias)); 258 rhs.aliased_images.push_back(std::move(rhs_alias));
259 lhs.flags &= ~ImageFlagBits::IsRescalable;
260 rhs.flags &= ~ImageFlagBits::IsRescalable;
257} 261}
258 262
259} // namespace VideoCommon 263} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h
index 0c17a791b..89c111c00 100644
--- a/src/video_core/texture_cache/image_base.h
+++ b/src/video_core/texture_cache/image_base.h
@@ -33,6 +33,11 @@ enum class ImageFlagBits : u32 {
33 ///< garbage collection priority 33 ///< garbage collection priority
34 Alias = 1 << 11, ///< This image has aliases and has priority on garbage 34 Alias = 1 << 11, ///< This image has aliases and has priority on garbage
35 ///< collection 35 ///< collection
36
37 // Rescaler
38 Rescaled = 1 << 12,
39 CheckingRescalable = 1 << 13,
40 IsRescalable = 1 << 14,
36}; 41};
37DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) 42DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits)
38 43
@@ -43,8 +48,11 @@ struct AliasedImage {
43 ImageId id; 48 ImageId id;
44}; 49};
45 50
51struct NullImageParams {};
52
46struct ImageBase { 53struct ImageBase {
47 explicit ImageBase(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); 54 explicit ImageBase(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr);
55 explicit ImageBase(const NullImageParams&);
48 56
49 [[nodiscard]] std::optional<SubresourceBase> TryFindBase(GPUVAddr other_addr) const noexcept; 57 [[nodiscard]] std::optional<SubresourceBase> TryFindBase(GPUVAddr other_addr) const noexcept;
50 58
@@ -68,11 +76,18 @@ struct ImageBase {
68 void CheckBadOverlapState(); 76 void CheckBadOverlapState();
69 void CheckAliasState(); 77 void CheckAliasState();
70 78
79 bool HasScaled() const {
80 return has_scaled;
81 }
82
71 ImageInfo info; 83 ImageInfo info;
72 84
73 u32 guest_size_bytes = 0; 85 u32 guest_size_bytes = 0;
74 u32 unswizzled_size_bytes = 0; 86 u32 unswizzled_size_bytes = 0;
75 u32 converted_size_bytes = 0; 87 u32 converted_size_bytes = 0;
88 u32 scale_rating = 0;
89 u64 scale_tick = 0;
90 bool has_scaled = false;
76 ImageFlagBits flags = ImageFlagBits::CpuModified; 91 ImageFlagBits flags = ImageFlagBits::CpuModified;
77 92
78 GPUVAddr gpu_addr = 0; 93 GPUVAddr gpu_addr = 0;
diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp
index 64fd7010a..afb94082b 100644
--- a/src/video_core/texture_cache/image_info.cpp
+++ b/src/video_core/texture_cache/image_info.cpp
@@ -16,6 +16,7 @@ namespace VideoCommon {
16using Tegra::Texture::TextureType; 16using Tegra::Texture::TextureType;
17using Tegra::Texture::TICEntry; 17using Tegra::Texture::TICEntry;
18using VideoCore::Surface::PixelFormat; 18using VideoCore::Surface::PixelFormat;
19using VideoCore::Surface::SurfaceType;
19 20
20ImageInfo::ImageInfo(const TICEntry& config) noexcept { 21ImageInfo::ImageInfo(const TICEntry& config) noexcept {
21 format = PixelFormatFromTextureInfo(config.format, config.r_type, config.g_type, config.b_type, 22 format = PixelFormatFromTextureInfo(config.format, config.r_type, config.g_type, config.b_type,
@@ -31,6 +32,7 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept {
31 .depth = config.block_depth, 32 .depth = config.block_depth,
32 }; 33 };
33 } 34 }
35 rescaleable = false;
34 tile_width_spacing = config.tile_width_spacing; 36 tile_width_spacing = config.tile_width_spacing;
35 if (config.texture_type != TextureType::Texture2D && 37 if (config.texture_type != TextureType::Texture2D &&
36 config.texture_type != TextureType::Texture2DNoMipmap) { 38 config.texture_type != TextureType::Texture2DNoMipmap) {
@@ -41,6 +43,7 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept {
41 ASSERT(config.BaseLayer() == 0); 43 ASSERT(config.BaseLayer() == 0);
42 type = ImageType::e1D; 44 type = ImageType::e1D;
43 size.width = config.Width(); 45 size.width = config.Width();
46 resources.layers = 1;
44 break; 47 break;
45 case TextureType::Texture1DArray: 48 case TextureType::Texture1DArray:
46 UNIMPLEMENTED_IF(config.BaseLayer() != 0); 49 UNIMPLEMENTED_IF(config.BaseLayer() != 0);
@@ -52,12 +55,14 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept {
52 case TextureType::Texture2DNoMipmap: 55 case TextureType::Texture2DNoMipmap:
53 ASSERT(config.Depth() == 1); 56 ASSERT(config.Depth() == 1);
54 type = config.IsPitchLinear() ? ImageType::Linear : ImageType::e2D; 57 type = config.IsPitchLinear() ? ImageType::Linear : ImageType::e2D;
58 rescaleable = !config.IsPitchLinear();
55 size.width = config.Width(); 59 size.width = config.Width();
56 size.height = config.Height(); 60 size.height = config.Height();
57 resources.layers = config.BaseLayer() + 1; 61 resources.layers = config.BaseLayer() + 1;
58 break; 62 break;
59 case TextureType::Texture2DArray: 63 case TextureType::Texture2DArray:
60 type = ImageType::e2D; 64 type = ImageType::e2D;
65 rescaleable = true;
61 size.width = config.Width(); 66 size.width = config.Width();
62 size.height = config.Height(); 67 size.height = config.Height();
63 resources.layers = config.BaseLayer() + config.Depth(); 68 resources.layers = config.BaseLayer() + config.Depth();
@@ -82,10 +87,12 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept {
82 size.width = config.Width(); 87 size.width = config.Width();
83 size.height = config.Height(); 88 size.height = config.Height();
84 size.depth = config.Depth(); 89 size.depth = config.Depth();
90 resources.layers = 1;
85 break; 91 break;
86 case TextureType::Texture1DBuffer: 92 case TextureType::Texture1DBuffer:
87 type = ImageType::Buffer; 93 type = ImageType::Buffer;
88 size.width = config.Width(); 94 size.width = config.Width();
95 resources.layers = 1;
89 break; 96 break;
90 default: 97 default:
91 UNREACHABLE_MSG("Invalid texture_type={}", static_cast<int>(config.texture_type.Value())); 98 UNREACHABLE_MSG("Invalid texture_type={}", static_cast<int>(config.texture_type.Value()));
@@ -95,12 +102,16 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept {
95 // FIXME: Call this without passing *this 102 // FIXME: Call this without passing *this
96 layer_stride = CalculateLayerStride(*this); 103 layer_stride = CalculateLayerStride(*this);
97 maybe_unaligned_layer_stride = CalculateLayerSize(*this); 104 maybe_unaligned_layer_stride = CalculateLayerSize(*this);
105 rescaleable &= (block.depth == 0) && resources.levels == 1;
106 rescaleable &= size.height > 256 || GetFormatType(format) != SurfaceType::ColorTexture;
107 downscaleable = size.height > 512;
98 } 108 }
99} 109}
100 110
101ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept { 111ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept {
102 const auto& rt = regs.rt[index]; 112 const auto& rt = regs.rt[index];
103 format = VideoCore::Surface::PixelFormatFromRenderTargetFormat(rt.format); 113 format = VideoCore::Surface::PixelFormatFromRenderTargetFormat(rt.format);
114 rescaleable = false;
104 if (rt.tile_mode.is_pitch_linear) { 115 if (rt.tile_mode.is_pitch_linear) {
105 ASSERT(rt.tile_mode.is_3d == 0); 116 ASSERT(rt.tile_mode.is_3d == 0);
106 type = ImageType::Linear; 117 type = ImageType::Linear;
@@ -126,6 +137,9 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index)
126 type = ImageType::e3D; 137 type = ImageType::e3D;
127 size.depth = rt.depth; 138 size.depth = rt.depth;
128 } else { 139 } else {
140 rescaleable = block.depth == 0;
141 rescaleable &= size.height > 256;
142 downscaleable = size.height > 512;
129 type = ImageType::e2D; 143 type = ImageType::e2D;
130 resources.layers = rt.depth; 144 resources.layers = rt.depth;
131 } 145 }
@@ -135,6 +149,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept {
135 format = VideoCore::Surface::PixelFormatFromDepthFormat(regs.zeta.format); 149 format = VideoCore::Surface::PixelFormatFromDepthFormat(regs.zeta.format);
136 size.width = regs.zeta_width; 150 size.width = regs.zeta_width;
137 size.height = regs.zeta_height; 151 size.height = regs.zeta_height;
152 rescaleable = false;
138 resources.levels = 1; 153 resources.levels = 1;
139 layer_stride = regs.zeta.layer_stride * 4; 154 layer_stride = regs.zeta.layer_stride * 4;
140 maybe_unaligned_layer_stride = layer_stride; 155 maybe_unaligned_layer_stride = layer_stride;
@@ -153,6 +168,8 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept {
153 type = ImageType::e3D; 168 type = ImageType::e3D;
154 size.depth = regs.zeta_depth; 169 size.depth = regs.zeta_depth;
155 } else { 170 } else {
171 rescaleable = block.depth == 0;
172 downscaleable = size.height > 512;
156 type = ImageType::e2D; 173 type = ImageType::e2D;
157 resources.layers = regs.zeta_depth; 174 resources.layers = regs.zeta_depth;
158 } 175 }
@@ -161,6 +178,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept {
161ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept { 178ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept {
162 UNIMPLEMENTED_IF_MSG(config.layer != 0, "Surface layer is not zero"); 179 UNIMPLEMENTED_IF_MSG(config.layer != 0, "Surface layer is not zero");
163 format = VideoCore::Surface::PixelFormatFromRenderTargetFormat(config.format); 180 format = VideoCore::Surface::PixelFormatFromRenderTargetFormat(config.format);
181 rescaleable = false;
164 if (config.linear == Tegra::Engines::Fermi2D::MemoryLayout::Pitch) { 182 if (config.linear == Tegra::Engines::Fermi2D::MemoryLayout::Pitch) {
165 type = ImageType::Linear; 183 type = ImageType::Linear;
166 size = Extent3D{ 184 size = Extent3D{
@@ -171,6 +189,7 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept {
171 pitch = config.pitch; 189 pitch = config.pitch;
172 } else { 190 } else {
173 type = config.block_depth > 0 ? ImageType::e3D : ImageType::e2D; 191 type = config.block_depth > 0 ? ImageType::e3D : ImageType::e2D;
192
174 block = Extent3D{ 193 block = Extent3D{
175 .width = config.block_width, 194 .width = config.block_width,
176 .height = config.block_height, 195 .height = config.block_height,
@@ -183,6 +202,9 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept {
183 .height = config.height, 202 .height = config.height,
184 .depth = 1, 203 .depth = 1,
185 }; 204 };
205 rescaleable = block.depth == 0;
206 rescaleable &= size.height > 256;
207 downscaleable = size.height > 512;
186 } 208 }
187} 209}
188 210
diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h
index 5049fc36e..5932dcaba 100644
--- a/src/video_core/texture_cache/image_info.h
+++ b/src/video_core/texture_cache/image_info.h
@@ -15,7 +15,7 @@ using Tegra::Texture::TICEntry;
15using VideoCore::Surface::PixelFormat; 15using VideoCore::Surface::PixelFormat;
16 16
17struct ImageInfo { 17struct ImageInfo {
18 explicit ImageInfo() = default; 18 ImageInfo() = default;
19 explicit ImageInfo(const TICEntry& config) noexcept; 19 explicit ImageInfo(const TICEntry& config) noexcept;
20 explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept; 20 explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept;
21 explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept; 21 explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept;
@@ -33,6 +33,8 @@ struct ImageInfo {
33 u32 maybe_unaligned_layer_stride = 0; 33 u32 maybe_unaligned_layer_stride = 0;
34 u32 num_samples = 1; 34 u32 num_samples = 1;
35 u32 tile_width_spacing = 0; 35 u32 tile_width_spacing = 0;
36 bool rescaleable = false;
37 bool downscaleable = false;
36}; 38};
37 39
38} // namespace VideoCommon 40} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/image_view_base.cpp b/src/video_core/texture_cache/image_view_base.cpp
index 450becbeb..c7b4fc231 100644
--- a/src/video_core/texture_cache/image_view_base.cpp
+++ b/src/video_core/texture_cache/image_view_base.cpp
@@ -37,14 +37,15 @@ ImageViewBase::ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_i
37} 37}
38 38
39ImageViewBase::ImageViewBase(const ImageInfo& info, const ImageViewInfo& view_info) 39ImageViewBase::ImageViewBase(const ImageInfo& info, const ImageViewInfo& view_info)
40 : format{info.format}, type{ImageViewType::Buffer}, size{ 40 : image_id{NULL_IMAGE_ID}, format{info.format}, type{ImageViewType::Buffer},
41 .width = info.size.width, 41 size{
42 .height = 1, 42 .width = info.size.width,
43 .depth = 1, 43 .height = 1,
44 } { 44 .depth = 1,
45 } {
45 ASSERT_MSG(view_info.type == ImageViewType::Buffer, "Expected texture buffer"); 46 ASSERT_MSG(view_info.type == ImageViewType::Buffer, "Expected texture buffer");
46} 47}
47 48
48ImageViewBase::ImageViewBase(const NullImageParams&) {} 49ImageViewBase::ImageViewBase(const NullImageViewParams&) : image_id{NULL_IMAGE_ID} {}
49 50
50} // namespace VideoCommon 51} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/image_view_base.h b/src/video_core/texture_cache/image_view_base.h
index 903f715c5..9c24c5359 100644
--- a/src/video_core/texture_cache/image_view_base.h
+++ b/src/video_core/texture_cache/image_view_base.h
@@ -15,7 +15,7 @@ using VideoCore::Surface::PixelFormat;
15struct ImageViewInfo; 15struct ImageViewInfo;
16struct ImageInfo; 16struct ImageInfo;
17 17
18struct NullImageParams {}; 18struct NullImageViewParams {};
19 19
20enum class ImageViewFlagBits : u16 { 20enum class ImageViewFlagBits : u16 {
21 PreemtiveDownload = 1 << 0, 21 PreemtiveDownload = 1 << 0,
@@ -28,7 +28,7 @@ struct ImageViewBase {
28 explicit ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_info, 28 explicit ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_info,
29 ImageId image_id); 29 ImageId image_id);
30 explicit ImageViewBase(const ImageInfo& info, const ImageViewInfo& view_info); 30 explicit ImageViewBase(const ImageInfo& info, const ImageViewInfo& view_info);
31 explicit ImageViewBase(const NullImageParams&); 31 explicit ImageViewBase(const NullImageViewParams&);
32 32
33 [[nodiscard]] bool IsBuffer() const noexcept { 33 [[nodiscard]] bool IsBuffer() const noexcept {
34 return type == ImageViewType::Buffer; 34 return type == ImageViewType::Buffer;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index f70c1f764..4d2874bf2 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -7,6 +7,7 @@
7#include <unordered_set> 7#include <unordered_set>
8 8
9#include "common/alignment.h" 9#include "common/alignment.h"
10#include "common/settings.h"
10#include "video_core/dirty_flags.h" 11#include "video_core/dirty_flags.h"
11#include "video_core/engines/kepler_compute.h" 12#include "video_core/engines/kepler_compute.h"
12#include "video_core/texture_cache/image_view_base.h" 13#include "video_core/texture_cache/image_view_base.h"
@@ -44,21 +45,22 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface&
44 45
45 // Make sure the first index is reserved for the null resources 46 // Make sure the first index is reserved for the null resources
46 // This way the null resource becomes a compile time constant 47 // This way the null resource becomes a compile time constant
47 void(slot_image_views.insert(runtime, NullImageParams{})); 48 void(slot_images.insert(NullImageParams{}));
49 void(slot_image_views.insert(runtime, NullImageViewParams{}));
48 void(slot_samplers.insert(runtime, sampler_descriptor)); 50 void(slot_samplers.insert(runtime, sampler_descriptor));
49 51
50 if constexpr (HAS_DEVICE_MEMORY_INFO) { 52 if constexpr (HAS_DEVICE_MEMORY_INFO) {
51 const auto device_memory = runtime.GetDeviceLocalMemory(); 53 const auto device_memory = runtime.GetDeviceLocalMemory();
52 const u64 possible_expected_memory = (device_memory * 3) / 10; 54 const u64 possible_expected_memory = (device_memory * 4) / 10;
53 const u64 possible_critical_memory = (device_memory * 6) / 10; 55 const u64 possible_critical_memory = (device_memory * 7) / 10;
54 expected_memory = std::max(possible_expected_memory, DEFAULT_EXPECTED_MEMORY); 56 expected_memory = std::max(possible_expected_memory, DEFAULT_EXPECTED_MEMORY - 256_MiB);
55 critical_memory = std::max(possible_critical_memory, DEFAULT_CRITICAL_MEMORY); 57 critical_memory = std::max(possible_critical_memory, DEFAULT_CRITICAL_MEMORY - 512_MiB);
56 minimum_memory = 0; 58 minimum_memory = 0;
57 } else { 59 } else {
58 // on OGL we can be more conservatives as the driver takes care. 60 // On OpenGL we can be more conservatives as the driver takes care.
59 expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB; 61 expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
60 critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB; 62 critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
61 minimum_memory = expected_memory; 63 minimum_memory = 0;
62 } 64 }
63} 65}
64 66
@@ -67,7 +69,7 @@ void TextureCache<P>::RunGarbageCollector() {
67 const bool high_priority_mode = total_used_memory >= expected_memory; 69 const bool high_priority_mode = total_used_memory >= expected_memory;
68 const bool aggressive_mode = total_used_memory >= critical_memory; 70 const bool aggressive_mode = total_used_memory >= critical_memory;
69 const u64 ticks_to_destroy = aggressive_mode ? 10ULL : high_priority_mode ? 25ULL : 100ULL; 71 const u64 ticks_to_destroy = aggressive_mode ? 10ULL : high_priority_mode ? 25ULL : 100ULL;
70 size_t num_iterations = aggressive_mode ? 10000 : (high_priority_mode ? 100 : 5); 72 size_t num_iterations = aggressive_mode ? 300 : (high_priority_mode ? 50 : 10);
71 const auto clean_up = [this, &num_iterations, high_priority_mode](ImageId image_id) { 73 const auto clean_up = [this, &num_iterations, high_priority_mode](ImageId image_id) {
72 if (num_iterations == 0) { 74 if (num_iterations == 0) {
73 return true; 75 return true;
@@ -89,7 +91,7 @@ void TextureCache<P>::RunGarbageCollector() {
89 UntrackImage(image, image_id); 91 UntrackImage(image, image_id);
90 } 92 }
91 UnregisterImage(image_id); 93 UnregisterImage(image_id);
92 DeleteImage(image_id); 94 DeleteImage(image_id, image.scale_tick > frame_tick + 5);
93 return false; 95 return false;
94 }; 96 };
95 lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, clean_up); 97 lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, clean_up);
@@ -103,6 +105,7 @@ void TextureCache<P>::TickFrame() {
103 sentenced_images.Tick(); 105 sentenced_images.Tick();
104 sentenced_framebuffers.Tick(); 106 sentenced_framebuffers.Tick();
105 sentenced_image_view.Tick(); 107 sentenced_image_view.Tick();
108 runtime.TickFrame();
106 ++frame_tick; 109 ++frame_tick;
107} 110}
108 111
@@ -122,15 +125,14 @@ void TextureCache<P>::MarkModification(ImageId id) noexcept {
122} 125}
123 126
124template <class P> 127template <class P>
125void TextureCache<P>::FillGraphicsImageViews(std::span<const u32> indices, 128template <bool has_blacklists>
126 std::span<ImageViewId> image_view_ids) { 129void TextureCache<P>::FillGraphicsImageViews(std::span<ImageViewInOut> views) {
127 FillImageViews(graphics_image_table, graphics_image_view_ids, indices, image_view_ids); 130 FillImageViews<has_blacklists>(graphics_image_table, graphics_image_view_ids, views);
128} 131}
129 132
130template <class P> 133template <class P>
131void TextureCache<P>::FillComputeImageViews(std::span<const u32> indices, 134void TextureCache<P>::FillComputeImageViews(std::span<ImageViewInOut> views) {
132 std::span<ImageViewId> image_view_ids) { 135 FillImageViews<true>(compute_image_table, compute_image_view_ids, views);
133 FillImageViews(compute_image_table, compute_image_view_ids, indices, image_view_ids);
134} 136}
135 137
136template <class P> 138template <class P>
@@ -190,6 +192,102 @@ void TextureCache<P>::SynchronizeComputeDescriptors() {
190} 192}
191 193
192template <class P> 194template <class P>
195bool TextureCache<P>::RescaleRenderTargets(bool is_clear) {
196 auto& flags = maxwell3d.dirty.flags;
197 u32 scale_rating = 0;
198 bool rescaled = false;
199 std::array<ImageId, NUM_RT> tmp_color_images{};
200 ImageId tmp_depth_image{};
201 do {
202 flags[Dirty::RenderTargets] = false;
203
204 has_deleted_images = false;
205 // Render target control is used on all render targets, so force look ups when this one is
206 // up
207 const bool force = flags[Dirty::RenderTargetControl];
208 flags[Dirty::RenderTargetControl] = false;
209
210 scale_rating = 0;
211 bool any_rescaled = false;
212 bool can_rescale = true;
213 const auto check_rescale = [&](ImageViewId view_id, ImageId& id_save) {
214 if (view_id != NULL_IMAGE_VIEW_ID && view_id != ImageViewId{}) {
215 const auto& view = slot_image_views[view_id];
216 const auto image_id = view.image_id;
217 id_save = image_id;
218 auto& image = slot_images[image_id];
219 can_rescale &= ImageCanRescale(image);
220 any_rescaled |= True(image.flags & ImageFlagBits::Rescaled) ||
221 GetFormatType(image.info.format) != SurfaceType::ColorTexture;
222 scale_rating = std::max<u32>(scale_rating, image.scale_tick <= frame_tick
223 ? image.scale_rating + 1U
224 : image.scale_rating);
225 } else {
226 id_save = CORRUPT_ID;
227 }
228 };
229 for (size_t index = 0; index < NUM_RT; ++index) {
230 ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index];
231 if (flags[Dirty::ColorBuffer0 + index] || force) {
232 flags[Dirty::ColorBuffer0 + index] = false;
233 BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear));
234 }
235 check_rescale(color_buffer_id, tmp_color_images[index]);
236 }
237 if (flags[Dirty::ZetaBuffer] || force) {
238 flags[Dirty::ZetaBuffer] = false;
239 BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear));
240 }
241 check_rescale(render_targets.depth_buffer_id, tmp_depth_image);
242
243 if (can_rescale) {
244 rescaled = any_rescaled || scale_rating >= 2;
245 const auto scale_up = [this](ImageId image_id) {
246 if (image_id != CORRUPT_ID) {
247 Image& image = slot_images[image_id];
248 ScaleUp(image);
249 }
250 };
251 if (rescaled) {
252 for (size_t index = 0; index < NUM_RT; ++index) {
253 scale_up(tmp_color_images[index]);
254 }
255 scale_up(tmp_depth_image);
256 scale_rating = 2;
257 }
258 } else {
259 rescaled = false;
260 const auto scale_down = [this](ImageId image_id) {
261 if (image_id != CORRUPT_ID) {
262 Image& image = slot_images[image_id];
263 ScaleDown(image);
264 }
265 };
266 for (size_t index = 0; index < NUM_RT; ++index) {
267 scale_down(tmp_color_images[index]);
268 }
269 scale_down(tmp_depth_image);
270 scale_rating = 1;
271 }
272 } while (has_deleted_images);
273 const auto set_rating = [this, scale_rating](ImageId image_id) {
274 if (image_id != CORRUPT_ID) {
275 Image& image = slot_images[image_id];
276 image.scale_rating = scale_rating;
277 if (image.scale_tick <= frame_tick) {
278 image.scale_tick = frame_tick + 1;
279 }
280 }
281 };
282 for (size_t index = 0; index < NUM_RT; ++index) {
283 set_rating(tmp_color_images[index]);
284 }
285 set_rating(tmp_depth_image);
286
287 return rescaled;
288}
289
290template <class P>
193void TextureCache<P>::UpdateRenderTargets(bool is_clear) { 291void TextureCache<P>::UpdateRenderTargets(bool is_clear) {
194 using namespace VideoCommon::Dirty; 292 using namespace VideoCommon::Dirty;
195 auto& flags = maxwell3d.dirty.flags; 293 auto& flags = maxwell3d.dirty.flags;
@@ -202,24 +300,18 @@ void TextureCache<P>::UpdateRenderTargets(bool is_clear) {
202 PrepareImageView(depth_buffer_id, true, is_clear && IsFullClear(depth_buffer_id)); 300 PrepareImageView(depth_buffer_id, true, is_clear && IsFullClear(depth_buffer_id));
203 return; 301 return;
204 } 302 }
205 flags[Dirty::RenderTargets] = false;
206 303
207 // Render target control is used on all render targets, so force look ups when this one is up 304 const bool rescaled = RescaleRenderTargets(is_clear);
208 const bool force = flags[Dirty::RenderTargetControl]; 305 if (is_rescaling != rescaled) {
209 flags[Dirty::RenderTargetControl] = false; 306 flags[Dirty::RescaleViewports] = true;
307 flags[Dirty::RescaleScissors] = true;
308 is_rescaling = rescaled;
309 }
210 310
211 for (size_t index = 0; index < NUM_RT; ++index) { 311 for (size_t index = 0; index < NUM_RT; ++index) {
212 ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; 312 ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index];
213 if (flags[Dirty::ColorBuffer0 + index] || force) {
214 flags[Dirty::ColorBuffer0 + index] = false;
215 BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear));
216 }
217 PrepareImageView(color_buffer_id, true, is_clear && IsFullClear(color_buffer_id)); 313 PrepareImageView(color_buffer_id, true, is_clear && IsFullClear(color_buffer_id));
218 } 314 }
219 if (flags[Dirty::ZetaBuffer] || force) {
220 flags[Dirty::ZetaBuffer] = false;
221 BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear));
222 }
223 const ImageViewId depth_buffer_id = render_targets.depth_buffer_id; 315 const ImageViewId depth_buffer_id = render_targets.depth_buffer_id;
224 316
225 PrepareImageView(depth_buffer_id, true, is_clear && IsFullClear(depth_buffer_id)); 317 PrepareImageView(depth_buffer_id, true, is_clear && IsFullClear(depth_buffer_id));
@@ -227,9 +319,15 @@ void TextureCache<P>::UpdateRenderTargets(bool is_clear) {
227 for (size_t index = 0; index < NUM_RT; ++index) { 319 for (size_t index = 0; index < NUM_RT; ++index) {
228 render_targets.draw_buffers[index] = static_cast<u8>(maxwell3d.regs.rt_control.Map(index)); 320 render_targets.draw_buffers[index] = static_cast<u8>(maxwell3d.regs.rt_control.Map(index));
229 } 321 }
322 u32 up_scale = 1;
323 u32 down_shift = 0;
324 if (is_rescaling) {
325 up_scale = Settings::values.resolution_info.up_scale;
326 down_shift = Settings::values.resolution_info.down_shift;
327 }
230 render_targets.size = Extent2D{ 328 render_targets.size = Extent2D{
231 maxwell3d.regs.render_area.width, 329 (maxwell3d.regs.render_area.width * up_scale) >> down_shift,
232 maxwell3d.regs.render_area.height, 330 (maxwell3d.regs.render_area.height * up_scale) >> down_shift,
233 }; 331 };
234 332
235 flags[Dirty::DepthBiasGlobal] = true; 333 flags[Dirty::DepthBiasGlobal] = true;
@@ -241,17 +339,28 @@ typename P::Framebuffer* TextureCache<P>::GetFramebuffer() {
241} 339}
242 340
243template <class P> 341template <class P>
342template <bool has_blacklists>
244void TextureCache<P>::FillImageViews(DescriptorTable<TICEntry>& table, 343void TextureCache<P>::FillImageViews(DescriptorTable<TICEntry>& table,
245 std::span<ImageViewId> cached_image_view_ids, 344 std::span<ImageViewId> cached_image_view_ids,
246 std::span<const u32> indices, 345 std::span<ImageViewInOut> views) {
247 std::span<ImageViewId> image_view_ids) { 346 bool has_blacklisted;
248 ASSERT(indices.size() <= image_view_ids.size());
249 do { 347 do {
250 has_deleted_images = false; 348 has_deleted_images = false;
251 std::ranges::transform(indices, image_view_ids.begin(), [&](u32 index) { 349 if constexpr (has_blacklists) {
252 return VisitImageView(table, cached_image_view_ids, index); 350 has_blacklisted = false;
253 }); 351 }
254 } while (has_deleted_images); 352 for (ImageViewInOut& view : views) {
353 view.id = VisitImageView(table, cached_image_view_ids, view.index);
354 if constexpr (has_blacklists) {
355 if (view.blacklist && view.id != NULL_IMAGE_VIEW_ID) {
356 const ImageViewBase& image_view{slot_image_views[view.id]};
357 auto& image = slot_images[image_view.image_id];
358 has_blacklisted |= ScaleDown(image);
359 image.scale_rating = 0;
360 }
361 }
362 }
363 } while (has_deleted_images || (has_blacklists && has_blacklisted));
255} 364}
256 365
257template <class P> 366template <class P>
@@ -369,8 +478,43 @@ void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
369 PrepareImage(src_id, false, false); 478 PrepareImage(src_id, false, false);
370 PrepareImage(dst_id, true, false); 479 PrepareImage(dst_id, true, false);
371 480
372 ImageBase& dst_image = slot_images[dst_id]; 481 Image& dst_image = slot_images[dst_id];
373 const ImageBase& src_image = slot_images[src_id]; 482 Image& src_image = slot_images[src_id];
483 bool is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled);
484 bool is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled);
485
486 const bool is_resolve = src_image.info.num_samples != 1 && dst_image.info.num_samples == 1;
487 if (is_src_rescaled != is_dst_rescaled) {
488 if (ImageCanRescale(src_image)) {
489 ScaleUp(src_image);
490 is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled);
491 if (is_resolve) {
492 dst_image.info.rescaleable = true;
493 for (const auto& alias : dst_image.aliased_images) {
494 Image& other_image = slot_images[alias.id];
495 other_image.info.rescaleable = true;
496 }
497 }
498 }
499 if (ImageCanRescale(dst_image)) {
500 ScaleUp(dst_image);
501 is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled);
502 }
503 }
504 if (is_resolve && (is_src_rescaled != is_dst_rescaled)) {
505 // A resolve requires both images to be the same dimensions. Resize down if needed.
506 ScaleDown(src_image);
507 ScaleDown(dst_image);
508 is_src_rescaled = True(src_image.flags & ImageFlagBits::Rescaled);
509 is_dst_rescaled = True(dst_image.flags & ImageFlagBits::Rescaled);
510 }
511 const auto& resolution = Settings::values.resolution_info;
512 const auto scale_region = [&](Region2D& region) {
513 region.start.x = resolution.ScaleUp(region.start.x);
514 region.start.y = resolution.ScaleUp(region.start.y);
515 region.end.x = resolution.ScaleUp(region.end.x);
516 region.end.y = resolution.ScaleUp(region.end.y);
517 };
374 518
375 // TODO: Deduplicate 519 // TODO: Deduplicate
376 const std::optional src_base = src_image.TryFindBase(src.Address()); 520 const std::optional src_base = src_image.TryFindBase(src.Address());
@@ -378,20 +522,26 @@ void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
378 const ImageViewInfo src_view_info(ImageViewType::e2D, images.src_format, src_range); 522 const ImageViewInfo src_view_info(ImageViewType::e2D, images.src_format, src_range);
379 const auto [src_framebuffer_id, src_view_id] = RenderTargetFromImage(src_id, src_view_info); 523 const auto [src_framebuffer_id, src_view_id] = RenderTargetFromImage(src_id, src_view_info);
380 const auto [src_samples_x, src_samples_y] = SamplesLog2(src_image.info.num_samples); 524 const auto [src_samples_x, src_samples_y] = SamplesLog2(src_image.info.num_samples);
381 const Region2D src_region{ 525 Region2D src_region{
382 Offset2D{.x = copy.src_x0 >> src_samples_x, .y = copy.src_y0 >> src_samples_y}, 526 Offset2D{.x = copy.src_x0 >> src_samples_x, .y = copy.src_y0 >> src_samples_y},
383 Offset2D{.x = copy.src_x1 >> src_samples_x, .y = copy.src_y1 >> src_samples_y}, 527 Offset2D{.x = copy.src_x1 >> src_samples_x, .y = copy.src_y1 >> src_samples_y},
384 }; 528 };
529 if (is_src_rescaled) {
530 scale_region(src_region);
531 }
385 532
386 const std::optional dst_base = dst_image.TryFindBase(dst.Address()); 533 const std::optional dst_base = dst_image.TryFindBase(dst.Address());
387 const SubresourceRange dst_range{.base = dst_base.value(), .extent = {1, 1}}; 534 const SubresourceRange dst_range{.base = dst_base.value(), .extent = {1, 1}};
388 const ImageViewInfo dst_view_info(ImageViewType::e2D, images.dst_format, dst_range); 535 const ImageViewInfo dst_view_info(ImageViewType::e2D, images.dst_format, dst_range);
389 const auto [dst_framebuffer_id, dst_view_id] = RenderTargetFromImage(dst_id, dst_view_info); 536 const auto [dst_framebuffer_id, dst_view_id] = RenderTargetFromImage(dst_id, dst_view_info);
390 const auto [dst_samples_x, dst_samples_y] = SamplesLog2(dst_image.info.num_samples); 537 const auto [dst_samples_x, dst_samples_y] = SamplesLog2(dst_image.info.num_samples);
391 const Region2D dst_region{ 538 Region2D dst_region{
392 Offset2D{.x = copy.dst_x0 >> dst_samples_x, .y = copy.dst_y0 >> dst_samples_y}, 539 Offset2D{.x = copy.dst_x0 >> dst_samples_x, .y = copy.dst_y0 >> dst_samples_y},
393 Offset2D{.x = copy.dst_x1 >> dst_samples_x, .y = copy.dst_y1 >> dst_samples_y}, 540 Offset2D{.x = copy.dst_x1 >> dst_samples_x, .y = copy.dst_y1 >> dst_samples_y},
394 }; 541 };
542 if (is_dst_rescaled) {
543 scale_region(dst_region);
544 }
395 545
396 // Always call this after src_framebuffer_id was queried, as the address might be invalidated. 546 // Always call this after src_framebuffer_id was queried, as the address might be invalidated.
397 Framebuffer* const dst_framebuffer = &slot_framebuffers[dst_framebuffer_id]; 547 Framebuffer* const dst_framebuffer = &slot_framebuffers[dst_framebuffer_id];
@@ -487,6 +637,20 @@ void TextureCache<P>::PopAsyncFlushes() {
487} 637}
488 638
489template <class P> 639template <class P>
640bool TextureCache<P>::IsRescaling() const noexcept {
641 return is_rescaling;
642}
643
644template <class P>
645bool TextureCache<P>::IsRescaling(const ImageViewBase& image_view) const noexcept {
646 if (image_view.type == ImageViewType::Buffer) {
647 return false;
648 }
649 const ImageBase& image = slot_images[image_view.image_id];
650 return True(image.flags & ImageFlagBits::Rescaled);
651}
652
653template <class P>
490bool TextureCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { 654bool TextureCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
491 bool is_modified = false; 655 bool is_modified = false;
492 ForEachImageInRegion(addr, size, [&is_modified](ImageId, ImageBase& image) { 656 ForEachImageInRegion(addr, size, [&is_modified](ImageId, ImageBase& image) {
@@ -624,6 +788,105 @@ ImageId TextureCache<P>::FindImage(const ImageInfo& info, GPUVAddr gpu_addr,
624} 788}
625 789
626template <class P> 790template <class P>
791bool TextureCache<P>::ImageCanRescale(ImageBase& image) {
792 if (!image.info.rescaleable) {
793 return false;
794 }
795 if (Settings::values.resolution_info.downscale && !image.info.downscaleable) {
796 return false;
797 }
798 if (True(image.flags & (ImageFlagBits::Rescaled | ImageFlagBits::CheckingRescalable))) {
799 return true;
800 }
801 if (True(image.flags & ImageFlagBits::IsRescalable)) {
802 return true;
803 }
804 image.flags |= ImageFlagBits::CheckingRescalable;
805 for (const auto& alias : image.aliased_images) {
806 Image& other_image = slot_images[alias.id];
807 if (!ImageCanRescale(other_image)) {
808 image.flags &= ~ImageFlagBits::CheckingRescalable;
809 return false;
810 }
811 }
812 image.flags &= ~ImageFlagBits::CheckingRescalable;
813 image.flags |= ImageFlagBits::IsRescalable;
814 return true;
815}
816
817template <class P>
818void TextureCache<P>::InvalidateScale(Image& image) {
819 if (image.scale_tick <= frame_tick) {
820 image.scale_tick = frame_tick + 1;
821 }
822 const std::span<const ImageViewId> image_view_ids = image.image_view_ids;
823 auto& dirty = maxwell3d.dirty.flags;
824 dirty[Dirty::RenderTargets] = true;
825 dirty[Dirty::ZetaBuffer] = true;
826 for (size_t rt = 0; rt < NUM_RT; ++rt) {
827 dirty[Dirty::ColorBuffer0 + rt] = true;
828 }
829 for (const ImageViewId image_view_id : image_view_ids) {
830 std::ranges::replace(render_targets.color_buffer_ids, image_view_id, ImageViewId{});
831 if (render_targets.depth_buffer_id == image_view_id) {
832 render_targets.depth_buffer_id = ImageViewId{};
833 }
834 }
835 RemoveImageViewReferences(image_view_ids);
836 RemoveFramebuffers(image_view_ids);
837 for (const ImageViewId image_view_id : image_view_ids) {
838 sentenced_image_view.Push(std::move(slot_image_views[image_view_id]));
839 slot_image_views.erase(image_view_id);
840 }
841 image.image_view_ids.clear();
842 image.image_view_infos.clear();
843 if constexpr (ENABLE_VALIDATION) {
844 std::ranges::fill(graphics_image_view_ids, CORRUPT_ID);
845 std::ranges::fill(compute_image_view_ids, CORRUPT_ID);
846 }
847 graphics_image_table.Invalidate();
848 compute_image_table.Invalidate();
849 has_deleted_images = true;
850}
851
852template <class P>
853u64 TextureCache<P>::GetScaledImageSizeBytes(ImageBase& image) {
854 const u64 scale_up = static_cast<u64>(Settings::values.resolution_info.up_scale *
855 Settings::values.resolution_info.up_scale);
856 const u64 down_shift = static_cast<u64>(Settings::values.resolution_info.down_shift +
857 Settings::values.resolution_info.down_shift);
858 const u64 image_size_bytes =
859 static_cast<u64>(std::max(image.guest_size_bytes, image.unswizzled_size_bytes));
860 const u64 tentative_size = (image_size_bytes * scale_up) >> down_shift;
861 const u64 fitted_size = Common::AlignUp(tentative_size, 1024);
862 return fitted_size;
863}
864
865template <class P>
866bool TextureCache<P>::ScaleUp(Image& image) {
867 const bool has_copy = image.HasScaled();
868 const bool rescaled = image.ScaleUp();
869 if (!rescaled) {
870 return false;
871 }
872 if (!has_copy) {
873 total_used_memory += GetScaledImageSizeBytes(image);
874 }
875 InvalidateScale(image);
876 return true;
877}
878
879template <class P>
880bool TextureCache<P>::ScaleDown(Image& image) {
881 const bool rescaled = image.ScaleDown();
882 if (!rescaled) {
883 return false;
884 }
885 InvalidateScale(image);
886 return true;
887}
888
889template <class P>
627ImageId TextureCache<P>::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr, 890ImageId TextureCache<P>::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr,
628 RelaxedOptions options) { 891 RelaxedOptions options) {
629 std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); 892 std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
@@ -660,12 +923,18 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
660 std::vector<ImageId> right_aliased_ids; 923 std::vector<ImageId> right_aliased_ids;
661 std::unordered_set<ImageId> ignore_textures; 924 std::unordered_set<ImageId> ignore_textures;
662 std::vector<ImageId> bad_overlap_ids; 925 std::vector<ImageId> bad_overlap_ids;
926 std::vector<ImageId> all_siblings;
927 const bool this_is_linear = info.type == ImageType::Linear;
663 const auto region_check = [&](ImageId overlap_id, ImageBase& overlap) { 928 const auto region_check = [&](ImageId overlap_id, ImageBase& overlap) {
664 if (True(overlap.flags & ImageFlagBits::Remapped)) { 929 if (True(overlap.flags & ImageFlagBits::Remapped)) {
665 ignore_textures.insert(overlap_id); 930 ignore_textures.insert(overlap_id);
666 return; 931 return;
667 } 932 }
668 if (info.type == ImageType::Linear) { 933 const bool overlap_is_linear = overlap.info.type == ImageType::Linear;
934 if (this_is_linear != overlap_is_linear) {
935 return;
936 }
937 if (this_is_linear && overlap_is_linear) {
669 if (info.pitch == overlap.info.pitch && gpu_addr == overlap.gpu_addr) { 938 if (info.pitch == overlap.info.pitch && gpu_addr == overlap.gpu_addr) {
670 // Alias linear images with the same pitch 939 // Alias linear images with the same pitch
671 left_aliased_ids.push_back(overlap_id); 940 left_aliased_ids.push_back(overlap_id);
@@ -681,6 +950,7 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
681 cpu_addr = solution->cpu_addr; 950 cpu_addr = solution->cpu_addr;
682 new_info.resources = solution->resources; 951 new_info.resources = solution->resources;
683 overlap_ids.push_back(overlap_id); 952 overlap_ids.push_back(overlap_id);
953 all_siblings.push_back(overlap_id);
684 return; 954 return;
685 } 955 }
686 static constexpr auto options = RelaxedOptions::Size | RelaxedOptions::Format; 956 static constexpr auto options = RelaxedOptions::Size | RelaxedOptions::Format;
@@ -688,10 +958,12 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
688 if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views, native_bgr)) { 958 if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views, native_bgr)) {
689 left_aliased_ids.push_back(overlap_id); 959 left_aliased_ids.push_back(overlap_id);
690 overlap.flags |= ImageFlagBits::Alias; 960 overlap.flags |= ImageFlagBits::Alias;
961 all_siblings.push_back(overlap_id);
691 } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options, 962 } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options,
692 broken_views, native_bgr)) { 963 broken_views, native_bgr)) {
693 right_aliased_ids.push_back(overlap_id); 964 right_aliased_ids.push_back(overlap_id);
694 overlap.flags |= ImageFlagBits::Alias; 965 overlap.flags |= ImageFlagBits::Alias;
966 all_siblings.push_back(overlap_id);
695 } else { 967 } else {
696 bad_overlap_ids.push_back(overlap_id); 968 bad_overlap_ids.push_back(overlap_id);
697 overlap.flags |= ImageFlagBits::BadOverlap; 969 overlap.flags |= ImageFlagBits::BadOverlap;
@@ -709,6 +981,32 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
709 } 981 }
710 }; 982 };
711 ForEachSparseImageInRegion(gpu_addr, size_bytes, region_check_gpu); 983 ForEachSparseImageInRegion(gpu_addr, size_bytes, region_check_gpu);
984
985 bool can_rescale = info.rescaleable;
986 bool any_rescaled = false;
987 for (const ImageId sibling_id : all_siblings) {
988 if (!can_rescale) {
989 break;
990 }
991 Image& sibling = slot_images[sibling_id];
992 can_rescale &= ImageCanRescale(sibling);
993 any_rescaled |= True(sibling.flags & ImageFlagBits::Rescaled);
994 }
995
996 can_rescale &= any_rescaled;
997
998 if (can_rescale) {
999 for (const ImageId sibling_id : all_siblings) {
1000 Image& sibling = slot_images[sibling_id];
1001 ScaleUp(sibling);
1002 }
1003 } else {
1004 for (const ImageId sibling_id : all_siblings) {
1005 Image& sibling = slot_images[sibling_id];
1006 ScaleDown(sibling);
1007 }
1008 }
1009
712 const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); 1010 const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
713 Image& new_image = slot_images[new_image_id]; 1011 Image& new_image = slot_images[new_image_id];
714 1012
@@ -731,14 +1029,23 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
731 // TODO: Only upload what we need 1029 // TODO: Only upload what we need
732 RefreshContents(new_image, new_image_id); 1030 RefreshContents(new_image, new_image_id);
733 1031
1032 if (can_rescale) {
1033 ScaleUp(new_image);
1034 } else {
1035 ScaleDown(new_image);
1036 }
1037
734 for (const ImageId overlap_id : overlap_ids) { 1038 for (const ImageId overlap_id : overlap_ids) {
735 Image& overlap = slot_images[overlap_id]; 1039 Image& overlap = slot_images[overlap_id];
736 if (overlap.info.num_samples != new_image.info.num_samples) { 1040 if (overlap.info.num_samples != new_image.info.num_samples) {
737 LOG_WARNING(HW_GPU, "Copying between images with different samples is not implemented"); 1041 LOG_WARNING(HW_GPU, "Copying between images with different samples is not implemented");
738 } else { 1042 } else {
1043 const auto& resolution = Settings::values.resolution_info;
739 const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value(); 1044 const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value();
740 const auto copies = MakeShrinkImageCopies(new_info, overlap.info, base); 1045 const u32 up_scale = can_rescale ? resolution.up_scale : 1;
741 runtime.CopyImage(new_image, overlap, copies); 1046 const u32 down_shift = can_rescale ? resolution.down_shift : 0;
1047 auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift);
1048 runtime.CopyImage(new_image, overlap, std::move(copies));
742 } 1049 }
743 if (True(overlap.flags & ImageFlagBits::Tracked)) { 1050 if (True(overlap.flags & ImageFlagBits::Tracked)) {
744 UntrackImage(overlap, overlap_id); 1051 UntrackImage(overlap, overlap_id);
@@ -1083,13 +1390,6 @@ void TextureCache<P>::UnregisterImage(ImageId image_id) {
1083 "Trying to unregister an already registered image"); 1390 "Trying to unregister an already registered image");
1084 image.flags &= ~ImageFlagBits::Registered; 1391 image.flags &= ~ImageFlagBits::Registered;
1085 image.flags &= ~ImageFlagBits::BadOverlap; 1392 image.flags &= ~ImageFlagBits::BadOverlap;
1086 u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes);
1087 if ((IsPixelFormatASTC(image.info.format) &&
1088 True(image.flags & ImageFlagBits::AcceleratedUpload)) ||
1089 True(image.flags & ImageFlagBits::Converted)) {
1090 tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
1091 }
1092 total_used_memory -= Common::AlignUp(tentative_size, 1024);
1093 lru_cache.Free(image.lru_index); 1393 lru_cache.Free(image.lru_index);
1094 const auto& clear_page_table = 1394 const auto& clear_page_table =
1095 [this, image_id]( 1395 [this, image_id](
@@ -1213,8 +1513,18 @@ void TextureCache<P>::UntrackImage(ImageBase& image, ImageId image_id) {
1213} 1513}
1214 1514
1215template <class P> 1515template <class P>
1216void TextureCache<P>::DeleteImage(ImageId image_id) { 1516void TextureCache<P>::DeleteImage(ImageId image_id, bool immediate_delete) {
1217 ImageBase& image = slot_images[image_id]; 1517 ImageBase& image = slot_images[image_id];
1518 if (image.HasScaled()) {
1519 total_used_memory -= GetScaledImageSizeBytes(image);
1520 }
1521 u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes);
1522 if ((IsPixelFormatASTC(image.info.format) &&
1523 True(image.flags & ImageFlagBits::AcceleratedUpload)) ||
1524 True(image.flags & ImageFlagBits::Converted)) {
1525 tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
1526 }
1527 total_used_memory -= Common::AlignUp(tentative_size, 1024);
1218 const GPUVAddr gpu_addr = image.gpu_addr; 1528 const GPUVAddr gpu_addr = image.gpu_addr;
1219 const auto alloc_it = image_allocs_table.find(gpu_addr); 1529 const auto alloc_it = image_allocs_table.find(gpu_addr);
1220 if (alloc_it == image_allocs_table.end()) { 1530 if (alloc_it == image_allocs_table.end()) {
@@ -1269,10 +1579,14 @@ void TextureCache<P>::DeleteImage(ImageId image_id) {
1269 num_removed_overlaps); 1579 num_removed_overlaps);
1270 } 1580 }
1271 for (const ImageViewId image_view_id : image_view_ids) { 1581 for (const ImageViewId image_view_id : image_view_ids) {
1272 sentenced_image_view.Push(std::move(slot_image_views[image_view_id])); 1582 if (!immediate_delete) {
1583 sentenced_image_view.Push(std::move(slot_image_views[image_view_id]));
1584 }
1273 slot_image_views.erase(image_view_id); 1585 slot_image_views.erase(image_view_id);
1274 } 1586 }
1275 sentenced_images.Push(std::move(slot_images[image_id])); 1587 if (!immediate_delete) {
1588 sentenced_images.Push(std::move(slot_images[image_id]));
1589 }
1276 slot_images.erase(image_id); 1590 slot_images.erase(image_id);
1277 1591
1278 alloc_images.erase(alloc_image_it); 1592 alloc_images.erase(alloc_image_it);
@@ -1306,6 +1620,9 @@ void TextureCache<P>::RemoveFramebuffers(std::span<const ImageViewId> removed_vi
1306 auto it = framebuffers.begin(); 1620 auto it = framebuffers.begin();
1307 while (it != framebuffers.end()) { 1621 while (it != framebuffers.end()) {
1308 if (it->first.Contains(removed_views)) { 1622 if (it->first.Contains(removed_views)) {
1623 auto framebuffer_id = it->second;
1624 ASSERT(framebuffer_id);
1625 sentenced_framebuffers.Push(std::move(slot_framebuffers[framebuffer_id]));
1309 it = framebuffers.erase(it); 1626 it = framebuffers.erase(it);
1310 } else { 1627 } else {
1311 ++it; 1628 ++it;
@@ -1322,26 +1639,60 @@ void TextureCache<P>::MarkModification(ImageBase& image) noexcept {
1322template <class P> 1639template <class P>
1323void TextureCache<P>::SynchronizeAliases(ImageId image_id) { 1640void TextureCache<P>::SynchronizeAliases(ImageId image_id) {
1324 boost::container::small_vector<const AliasedImage*, 1> aliased_images; 1641 boost::container::small_vector<const AliasedImage*, 1> aliased_images;
1325 ImageBase& image = slot_images[image_id]; 1642 Image& image = slot_images[image_id];
1643 bool any_rescaled = True(image.flags & ImageFlagBits::Rescaled);
1326 u64 most_recent_tick = image.modification_tick; 1644 u64 most_recent_tick = image.modification_tick;
1327 for (const AliasedImage& aliased : image.aliased_images) { 1645 for (const AliasedImage& aliased : image.aliased_images) {
1328 ImageBase& aliased_image = slot_images[aliased.id]; 1646 ImageBase& aliased_image = slot_images[aliased.id];
1329 if (image.modification_tick < aliased_image.modification_tick) { 1647 if (image.modification_tick < aliased_image.modification_tick) {
1330 most_recent_tick = std::max(most_recent_tick, aliased_image.modification_tick); 1648 most_recent_tick = std::max(most_recent_tick, aliased_image.modification_tick);
1331 aliased_images.push_back(&aliased); 1649 aliased_images.push_back(&aliased);
1650 any_rescaled |= True(aliased_image.flags & ImageFlagBits::Rescaled);
1332 } 1651 }
1333 } 1652 }
1334 if (aliased_images.empty()) { 1653 if (aliased_images.empty()) {
1335 return; 1654 return;
1336 } 1655 }
1656 const bool can_rescale = ImageCanRescale(image);
1657 if (any_rescaled) {
1658 if (can_rescale) {
1659 ScaleUp(image);
1660 } else {
1661 ScaleDown(image);
1662 }
1663 }
1337 image.modification_tick = most_recent_tick; 1664 image.modification_tick = most_recent_tick;
1338 std::ranges::sort(aliased_images, [this](const AliasedImage* lhs, const AliasedImage* rhs) { 1665 std::ranges::sort(aliased_images, [this](const AliasedImage* lhs, const AliasedImage* rhs) {
1339 const ImageBase& lhs_image = slot_images[lhs->id]; 1666 const ImageBase& lhs_image = slot_images[lhs->id];
1340 const ImageBase& rhs_image = slot_images[rhs->id]; 1667 const ImageBase& rhs_image = slot_images[rhs->id];
1341 return lhs_image.modification_tick < rhs_image.modification_tick; 1668 return lhs_image.modification_tick < rhs_image.modification_tick;
1342 }); 1669 });
1670 const auto& resolution = Settings::values.resolution_info;
1343 for (const AliasedImage* const aliased : aliased_images) { 1671 for (const AliasedImage* const aliased : aliased_images) {
1344 CopyImage(image_id, aliased->id, aliased->copies); 1672 if (!resolution.active | !any_rescaled) {
1673 CopyImage(image_id, aliased->id, aliased->copies);
1674 continue;
1675 }
1676 Image& aliased_image = slot_images[aliased->id];
1677 if (!can_rescale) {
1678 ScaleDown(aliased_image);
1679 CopyImage(image_id, aliased->id, aliased->copies);
1680 continue;
1681 }
1682 ScaleUp(aliased_image);
1683
1684 const bool both_2d{image.info.type == ImageType::e2D &&
1685 aliased_image.info.type == ImageType::e2D};
1686 auto copies = aliased->copies;
1687 for (auto copy : copies) {
1688 copy.extent.width = std::max<u32>(
1689 (copy.extent.width * resolution.up_scale) >> resolution.down_shift, 1);
1690 if (both_2d) {
1691 copy.extent.height = std::max<u32>(
1692 (copy.extent.height * resolution.up_scale) >> resolution.down_shift, 1);
1693 }
1694 }
1695 CopyImage(image_id, aliased->id, copies);
1345 } 1696 }
1346} 1697}
1347 1698
@@ -1377,9 +1728,25 @@ void TextureCache<P>::PrepareImageView(ImageViewId image_view_id, bool is_modifi
1377} 1728}
1378 1729
1379template <class P> 1730template <class P>
1380void TextureCache<P>::CopyImage(ImageId dst_id, ImageId src_id, std::span<const ImageCopy> copies) { 1731void TextureCache<P>::CopyImage(ImageId dst_id, ImageId src_id, std::vector<ImageCopy> copies) {
1381 Image& dst = slot_images[dst_id]; 1732 Image& dst = slot_images[dst_id];
1382 Image& src = slot_images[src_id]; 1733 Image& src = slot_images[src_id];
1734 const bool is_rescaled = True(src.flags & ImageFlagBits::Rescaled);
1735 if (is_rescaled) {
1736 ASSERT(True(dst.flags & ImageFlagBits::Rescaled));
1737 const bool both_2d{src.info.type == ImageType::e2D && dst.info.type == ImageType::e2D};
1738 const auto& resolution = Settings::values.resolution_info;
1739 for (auto& copy : copies) {
1740 copy.src_offset.x = resolution.ScaleUp(copy.src_offset.x);
1741 copy.dst_offset.x = resolution.ScaleUp(copy.dst_offset.x);
1742 copy.extent.width = resolution.ScaleUp(copy.extent.width);
1743 if (both_2d) {
1744 copy.src_offset.y = resolution.ScaleUp(copy.src_offset.y);
1745 copy.dst_offset.y = resolution.ScaleUp(copy.dst_offset.y);
1746 copy.extent.height = resolution.ScaleUp(copy.extent.height);
1747 }
1748 }
1749 }
1383 const auto dst_format_type = GetFormatType(dst.info.format); 1750 const auto dst_format_type = GetFormatType(dst.info.format);
1384 const auto src_format_type = GetFormatType(src.info.format); 1751 const auto src_format_type = GetFormatType(src.info.format);
1385 if (src_format_type == dst_format_type) { 1752 if (src_format_type == dst_format_type) {
@@ -1424,7 +1791,7 @@ void TextureCache<P>::CopyImage(ImageId dst_id, ImageId src_id, std::span<const
1424 }; 1791 };
1425 UNIMPLEMENTED_IF(copy.extent != expected_size); 1792 UNIMPLEMENTED_IF(copy.extent != expected_size);
1426 1793
1427 runtime.ConvertImage(dst_framebuffer, dst_view, src_view); 1794 runtime.ConvertImage(dst_framebuffer, dst_view, src_view, is_rescaled);
1428 } 1795 }
1429} 1796}
1430 1797
@@ -1433,8 +1800,8 @@ void TextureCache<P>::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id)
1433 if (*old_id == new_id) { 1800 if (*old_id == new_id) {
1434 return; 1801 return;
1435 } 1802 }
1436 if (*old_id) { 1803 if (new_id) {
1437 const ImageViewBase& old_view = slot_image_views[*old_id]; 1804 const ImageViewBase& old_view = slot_image_views[new_id];
1438 if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) { 1805 if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) {
1439 uncommitted_downloads.push_back(old_view.image_id); 1806 uncommitted_downloads.push_back(old_view.image_id);
1440 } 1807 }
@@ -1447,10 +1814,18 @@ std::pair<FramebufferId, ImageViewId> TextureCache<P>::RenderTargetFromImage(
1447 ImageId image_id, const ImageViewInfo& view_info) { 1814 ImageId image_id, const ImageViewInfo& view_info) {
1448 const ImageViewId view_id = FindOrEmplaceImageView(image_id, view_info); 1815 const ImageViewId view_id = FindOrEmplaceImageView(image_id, view_info);
1449 const ImageBase& image = slot_images[image_id]; 1816 const ImageBase& image = slot_images[image_id];
1817 const bool is_rescaled = True(image.flags & ImageFlagBits::Rescaled);
1450 const bool is_color = GetFormatType(image.info.format) == SurfaceType::ColorTexture; 1818 const bool is_color = GetFormatType(image.info.format) == SurfaceType::ColorTexture;
1451 const ImageViewId color_view_id = is_color ? view_id : ImageViewId{}; 1819 const ImageViewId color_view_id = is_color ? view_id : ImageViewId{};
1452 const ImageViewId depth_view_id = is_color ? ImageViewId{} : view_id; 1820 const ImageViewId depth_view_id = is_color ? ImageViewId{} : view_id;
1453 const Extent3D extent = MipSize(image.info.size, view_info.range.base.level); 1821 Extent3D extent = MipSize(image.info.size, view_info.range.base.level);
1822 if (is_rescaled) {
1823 const auto& resolution = Settings::values.resolution_info;
1824 extent.width = resolution.ScaleUp(extent.width);
1825 if (image.info.type == ImageType::e2D) {
1826 extent.height = resolution.ScaleUp(extent.height);
1827 }
1828 }
1454 const u32 num_samples = image.info.num_samples; 1829 const u32 num_samples = image.info.num_samples;
1455 const auto [samples_x, samples_y] = SamplesLog2(num_samples); 1830 const auto [samples_x, samples_y] = SamplesLog2(num_samples);
1456 const FramebufferId framebuffer_id = GetFramebufferId(RenderTargets{ 1831 const FramebufferId framebuffer_id = GetFramebufferId(RenderTargets{
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index 2d1893c1c..643ad811c 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -21,6 +21,7 @@
21#include "video_core/texture_cache/descriptor_table.h" 21#include "video_core/texture_cache/descriptor_table.h"
22#include "video_core/texture_cache/image_base.h" 22#include "video_core/texture_cache/image_base.h"
23#include "video_core/texture_cache/image_info.h" 23#include "video_core/texture_cache/image_info.h"
24#include "video_core/texture_cache/image_view_base.h"
24#include "video_core/texture_cache/image_view_info.h" 25#include "video_core/texture_cache/image_view_info.h"
25#include "video_core/texture_cache/render_targets.h" 26#include "video_core/texture_cache/render_targets.h"
26#include "video_core/texture_cache/slot_vector.h" 27#include "video_core/texture_cache/slot_vector.h"
@@ -39,6 +40,12 @@ using VideoCore::Surface::PixelFormatFromDepthFormat;
39using VideoCore::Surface::PixelFormatFromRenderTargetFormat; 40using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
40using namespace Common::Literals; 41using namespace Common::Literals;
41 42
43struct ImageViewInOut {
44 u32 index{};
45 bool blacklist{};
46 ImageViewId id{};
47};
48
42template <class P> 49template <class P>
43class TextureCache { 50class TextureCache {
44 /// Address shift for caching images into a hash table 51 /// Address shift for caching images into a hash table
@@ -53,11 +60,6 @@ class TextureCache {
53 /// True when the API can provide info about the memory of the device. 60 /// True when the API can provide info about the memory of the device.
54 static constexpr bool HAS_DEVICE_MEMORY_INFO = P::HAS_DEVICE_MEMORY_INFO; 61 static constexpr bool HAS_DEVICE_MEMORY_INFO = P::HAS_DEVICE_MEMORY_INFO;
55 62
56 /// Image view ID for null descriptors
57 static constexpr ImageViewId NULL_IMAGE_VIEW_ID{0};
58 /// Sampler ID for bugged sampler ids
59 static constexpr SamplerId NULL_SAMPLER_ID{0};
60
61 static constexpr u64 DEFAULT_EXPECTED_MEMORY = 1_GiB; 63 static constexpr u64 DEFAULT_EXPECTED_MEMORY = 1_GiB;
62 static constexpr u64 DEFAULT_CRITICAL_MEMORY = 2_GiB; 64 static constexpr u64 DEFAULT_CRITICAL_MEMORY = 2_GiB;
63 65
@@ -99,11 +101,11 @@ public:
99 void MarkModification(ImageId id) noexcept; 101 void MarkModification(ImageId id) noexcept;
100 102
101 /// Fill image_view_ids with the graphics images in indices 103 /// Fill image_view_ids with the graphics images in indices
102 void FillGraphicsImageViews(std::span<const u32> indices, 104 template <bool has_blacklists>
103 std::span<ImageViewId> image_view_ids); 105 void FillGraphicsImageViews(std::span<ImageViewInOut> views);
104 106
105 /// Fill image_view_ids with the compute images in indices 107 /// Fill image_view_ids with the compute images in indices
106 void FillComputeImageViews(std::span<const u32> indices, std::span<ImageViewId> image_view_ids); 108 void FillComputeImageViews(std::span<ImageViewInOut> views);
107 109
108 /// Get the sampler from the graphics descriptor table in the specified index 110 /// Get the sampler from the graphics descriptor table in the specified index
109 Sampler* GetGraphicsSampler(u32 index); 111 Sampler* GetGraphicsSampler(u32 index);
@@ -117,6 +119,11 @@ public:
117 /// Refresh the state for compute image view and sampler descriptors 119 /// Refresh the state for compute image view and sampler descriptors
118 void SynchronizeComputeDescriptors(); 120 void SynchronizeComputeDescriptors();
119 121
122 /// Updates the Render Targets if they can be rescaled
123 /// @param is_clear True when the render targets are being used for clears
124 /// @retval True if the Render Targets have been rescaled.
125 bool RescaleRenderTargets(bool is_clear);
126
120 /// Update bound render targets and upload memory if necessary 127 /// Update bound render targets and upload memory if necessary
121 /// @param is_clear True when the render targets are being used for clears 128 /// @param is_clear True when the render targets are being used for clears
122 void UpdateRenderTargets(bool is_clear); 129 void UpdateRenderTargets(bool is_clear);
@@ -160,6 +167,10 @@ public:
160 /// Return true when a CPU region is modified from the GPU 167 /// Return true when a CPU region is modified from the GPU
161 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); 168 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
162 169
170 [[nodiscard]] bool IsRescaling() const noexcept;
171
172 [[nodiscard]] bool IsRescaling(const ImageViewBase& image_view) const noexcept;
173
163 std::mutex mutex; 174 std::mutex mutex;
164 175
165private: 176private:
@@ -198,9 +209,10 @@ private:
198 void RunGarbageCollector(); 209 void RunGarbageCollector();
199 210
200 /// Fills image_view_ids in the image views in indices 211 /// Fills image_view_ids in the image views in indices
212 template <bool has_blacklists>
201 void FillImageViews(DescriptorTable<TICEntry>& table, 213 void FillImageViews(DescriptorTable<TICEntry>& table,
202 std::span<ImageViewId> cached_image_view_ids, std::span<const u32> indices, 214 std::span<ImageViewId> cached_image_view_ids,
203 std::span<ImageViewId> image_view_ids); 215 std::span<ImageViewInOut> views);
204 216
205 /// Find or create an image view in the guest descriptor table 217 /// Find or create an image view in the guest descriptor table
206 ImageViewId VisitImageView(DescriptorTable<TICEntry>& table, 218 ImageViewId VisitImageView(DescriptorTable<TICEntry>& table,
@@ -285,7 +297,7 @@ private:
285 void UntrackImage(ImageBase& image, ImageId image_id); 297 void UntrackImage(ImageBase& image, ImageId image_id);
286 298
287 /// Delete image from the cache 299 /// Delete image from the cache
288 void DeleteImage(ImageId image); 300 void DeleteImage(ImageId image, bool immediate_delete = false);
289 301
290 /// Remove image views references from the cache 302 /// Remove image views references from the cache
291 void RemoveImageViewReferences(std::span<const ImageViewId> removed_views); 303 void RemoveImageViewReferences(std::span<const ImageViewId> removed_views);
@@ -306,7 +318,7 @@ private:
306 void PrepareImageView(ImageViewId image_view_id, bool is_modification, bool invalidate); 318 void PrepareImageView(ImageViewId image_view_id, bool is_modification, bool invalidate);
307 319
308 /// Execute copies from one image to the other, even if they are incompatible 320 /// Execute copies from one image to the other, even if they are incompatible
309 void CopyImage(ImageId dst_id, ImageId src_id, std::span<const ImageCopy> copies); 321 void CopyImage(ImageId dst_id, ImageId src_id, std::vector<ImageCopy> copies);
310 322
311 /// Bind an image view as render target, downloading resources preemtively if needed 323 /// Bind an image view as render target, downloading resources preemtively if needed
312 void BindRenderTarget(ImageViewId* old_id, ImageViewId new_id); 324 void BindRenderTarget(ImageViewId* old_id, ImageViewId new_id);
@@ -318,6 +330,12 @@ private:
318 /// Returns true if the current clear parameters clear the whole image of a given image view 330 /// Returns true if the current clear parameters clear the whole image of a given image view
319 [[nodiscard]] bool IsFullClear(ImageViewId id); 331 [[nodiscard]] bool IsFullClear(ImageViewId id);
320 332
333 bool ImageCanRescale(ImageBase& image);
334 void InvalidateScale(Image& image);
335 bool ScaleUp(Image& image);
336 bool ScaleDown(Image& image);
337 u64 GetScaledImageSizeBytes(ImageBase& image);
338
321 Runtime& runtime; 339 Runtime& runtime;
322 VideoCore::RasterizerInterface& rasterizer; 340 VideoCore::RasterizerInterface& rasterizer;
323 Tegra::Engines::Maxwell3D& maxwell3d; 341 Tegra::Engines::Maxwell3D& maxwell3d;
@@ -349,6 +367,7 @@ private:
349 VAddr virtual_invalid_space{}; 367 VAddr virtual_invalid_space{};
350 368
351 bool has_deleted_images = false; 369 bool has_deleted_images = false;
370 bool is_rescaling = false;
352 u64 total_used_memory = 0; 371 u64 total_used_memory = 0;
353 u64 minimum_memory; 372 u64 minimum_memory;
354 u64 expected_memory; 373 u64 expected_memory;
diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h
index 47a11cb2f..5c274abdf 100644
--- a/src/video_core/texture_cache/types.h
+++ b/src/video_core/texture_cache/types.h
@@ -22,6 +22,13 @@ using ImageAllocId = SlotId;
22using SamplerId = SlotId; 22using SamplerId = SlotId;
23using FramebufferId = SlotId; 23using FramebufferId = SlotId;
24 24
25/// Fake image ID for null image views
26constexpr ImageId NULL_IMAGE_ID{0};
27/// Image view ID for null descriptors
28constexpr ImageViewId NULL_IMAGE_VIEW_ID{0};
29/// Sampler ID for bugged sampler ids
30constexpr SamplerId NULL_SAMPLER_ID{0};
31
25enum class ImageType : u32 { 32enum class ImageType : u32 {
26 e1D, 33 e1D,
27 e2D, 34 e2D,
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index 59cf2f561..ddc9fb13a 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -723,7 +723,7 @@ ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept {
723} 723}
724 724
725std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src, 725std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src,
726 SubresourceBase base) { 726 SubresourceBase base, u32 up_scale, u32 down_shift) {
727 ASSERT(dst.resources.levels >= src.resources.levels); 727 ASSERT(dst.resources.levels >= src.resources.levels);
728 ASSERT(dst.num_samples == src.num_samples); 728 ASSERT(dst.num_samples == src.num_samples);
729 729
@@ -732,7 +732,7 @@ std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn
732 ASSERT(src.type == ImageType::e3D); 732 ASSERT(src.type == ImageType::e3D);
733 ASSERT(src.resources.levels == 1); 733 ASSERT(src.resources.levels == 1);
734 } 734 }
735 735 const bool both_2d{src.type == ImageType::e2D && dst.type == ImageType::e2D};
736 std::vector<ImageCopy> copies; 736 std::vector<ImageCopy> copies;
737 copies.reserve(src.resources.levels); 737 copies.reserve(src.resources.levels);
738 for (s32 level = 0; level < src.resources.levels; ++level) { 738 for (s32 level = 0; level < src.resources.levels; ++level) {
@@ -762,6 +762,10 @@ std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn
762 if (is_dst_3d) { 762 if (is_dst_3d) {
763 copy.extent.depth = src.size.depth; 763 copy.extent.depth = src.size.depth;
764 } 764 }
765 copy.extent.width = std::max<u32>((copy.extent.width * up_scale) >> down_shift, 1);
766 if (both_2d) {
767 copy.extent.height = std::max<u32>((copy.extent.height * up_scale) >> down_shift, 1);
768 }
765 } 769 }
766 return copies; 770 return copies;
767} 771}
@@ -1153,10 +1157,10 @@ void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase*
1153 if (dst && GetFormatType(dst->info.format) != SurfaceType::ColorTexture) { 1157 if (dst && GetFormatType(dst->info.format) != SurfaceType::ColorTexture) {
1154 dst_info.format = dst->info.format; 1158 dst_info.format = dst->info.format;
1155 } 1159 }
1156 if (!dst && src && GetFormatType(src->info.format) != SurfaceType::ColorTexture) { 1160 if (src && GetFormatType(src->info.format) != SurfaceType::ColorTexture) {
1157 dst_info.format = src->info.format; 1161 dst_info.format = src->info.format;
1158 } 1162 }
1159 if (!src && dst && GetFormatType(dst->info.format) != SurfaceType::ColorTexture) { 1163 if (dst && GetFormatType(dst->info.format) != SurfaceType::ColorTexture) {
1160 src_info.format = dst->info.format; 1164 src_info.format = dst->info.format;
1161 } 1165 }
1162} 1166}
diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h
index 766502908..7af52de2e 100644
--- a/src/video_core/texture_cache/util.h
+++ b/src/video_core/texture_cache/util.h
@@ -55,7 +55,8 @@ struct OverlapResult {
55 55
56[[nodiscard]] std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, 56[[nodiscard]] std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst,
57 const ImageInfo& src, 57 const ImageInfo& src,
58 SubresourceBase base); 58 SubresourceBase base, u32 up_scale = 1,
59 u32 down_shift = 0);
59 60
60[[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config); 61[[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config);
61 62
diff --git a/src/video_core/textures/texture.cpp b/src/video_core/textures/texture.cpp
index a552543ed..06954963d 100644
--- a/src/video_core/textures/texture.cpp
+++ b/src/video_core/textures/texture.cpp
@@ -51,22 +51,6 @@ constexpr std::array<float, 256> SRGB_CONVERSION_LUT = {
51 0.917104f, 0.929242f, 0.941493f, 0.953859f, 0.966338f, 1.000000f, 1.000000f, 1.000000f, 51 0.917104f, 0.929242f, 0.941493f, 0.953859f, 0.966338f, 1.000000f, 1.000000f, 1.000000f,
52}; 52};
53 53
54unsigned SettingsMinimumAnisotropy() noexcept {
55 switch (static_cast<Anisotropy>(Settings::values.max_anisotropy.GetValue())) {
56 default:
57 case Anisotropy::Default:
58 return 1U;
59 case Anisotropy::Filter2x:
60 return 2U;
61 case Anisotropy::Filter4x:
62 return 4U;
63 case Anisotropy::Filter8x:
64 return 8U;
65 case Anisotropy::Filter16x:
66 return 16U;
67 }
68}
69
70} // Anonymous namespace 54} // Anonymous namespace
71 55
72std::array<float, 4> TSCEntry::BorderColor() const noexcept { 56std::array<float, 4> TSCEntry::BorderColor() const noexcept {
@@ -78,7 +62,18 @@ std::array<float, 4> TSCEntry::BorderColor() const noexcept {
78} 62}
79 63
80float TSCEntry::MaxAnisotropy() const noexcept { 64float TSCEntry::MaxAnisotropy() const noexcept {
81 return static_cast<float>(std::max(1U << max_anisotropy, SettingsMinimumAnisotropy())); 65 if (max_anisotropy == 0 && mipmap_filter != TextureMipmapFilter::Linear) {
66 return 1.0f;
67 }
68 const auto anisotropic_settings = Settings::values.max_anisotropy.GetValue();
69 u32 added_anisotropic{};
70 if (anisotropic_settings == 0) {
71 added_anisotropic = Settings::values.resolution_info.up_scale >>
72 Settings::values.resolution_info.down_shift;
73 } else {
74 added_anisotropic = Settings::values.max_anisotropy.GetValue() - 1U;
75 }
76 return static_cast<float>(1U << (max_anisotropy + added_anisotropic));
82} 77}
83 78
84} // namespace Tegra::Texture 79} // namespace Tegra::Texture
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index cae543a51..e852c817e 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -37,6 +37,8 @@ std::unique_ptr<VideoCore::RendererBase> CreateRenderer(
37namespace VideoCore { 37namespace VideoCore {
38 38
39std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) { 39std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) {
40 Settings::UpdateRescalingInfo();
41
40 const auto nvdec_value = Settings::values.nvdec_emulation.GetValue(); 42 const auto nvdec_value = Settings::values.nvdec_emulation.GetValue();
41 const bool use_nvdec = nvdec_value != Settings::NvdecEmulation::Off; 43 const bool use_nvdec = nvdec_value != Settings::NvdecEmulation::Off;
42 const bool use_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); 44 const bool use_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
@@ -53,11 +55,10 @@ std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Cor
53 } 55 }
54} 56}
55 57
56u16 GetResolutionScaleFactor(const RendererBase& renderer) { 58float GetResolutionScaleFactor(const RendererBase& renderer) {
57 return static_cast<u16>( 59 return Settings::values.resolution_info.active
58 Settings::values.resolution_factor.GetValue() != 0 60 ? Settings::values.resolution_info.up_factor
59 ? Settings::values.resolution_factor.GetValue() 61 : renderer.GetRenderWindow().GetFramebufferLayout().GetScalingRatio();
60 : renderer.GetRenderWindow().GetFramebufferLayout().GetScalingRatio());
61} 62}
62 63
63} // namespace VideoCore 64} // namespace VideoCore
diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h
index f5c27125d..f86877e86 100644
--- a/src/video_core/video_core.h
+++ b/src/video_core/video_core.h
@@ -25,6 +25,6 @@ class RendererBase;
25/// Creates an emulated GPU instance using the given system context. 25/// Creates an emulated GPU instance using the given system context.
26std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system); 26std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system);
27 27
28u16 GetResolutionScaleFactor(const RendererBase& renderer); 28float GetResolutionScaleFactor(const RendererBase& renderer);
29 29
30} // namespace VideoCore 30} // namespace VideoCore
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 2d5daf6cd..10653ac6b 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -40,6 +40,10 @@ public:
40 VkFormat GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, 40 VkFormat GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage,
41 FormatType format_type) const; 41 FormatType format_type) const;
42 42
43 /// Returns true if a format is supported.
44 bool IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage,
45 FormatType format_type) const;
46
43 /// Reports a device loss. 47 /// Reports a device loss.
44 void ReportLoss() const; 48 void ReportLoss() const;
45 49
@@ -370,10 +374,6 @@ private:
370 /// Returns true if the device natively supports blitting depth stencil images. 374 /// Returns true if the device natively supports blitting depth stencil images.
371 bool TestDepthStencilBlits() const; 375 bool TestDepthStencilBlits() const;
372 376
373 /// Returns true if a format is supported.
374 bool IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage,
375 FormatType format_type) const;
376
377 VkInstance instance; ///< Vulkan instance. 377 VkInstance instance; ///< Vulkan instance.
378 vk::DeviceDispatch dld; ///< Device function pointers. 378 vk::DeviceDispatch dld; ///< Device function pointers.
379 vk::PhysicalDevice physical; ///< Physical device. 379 vk::PhysicalDevice physical; ///< Physical device.
diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp
index 46ab0603d..976acd176 100644
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -628,11 +628,9 @@ void GRenderWindow::ReleaseRenderTarget() {
628 main_context.reset(); 628 main_context.reset();
629} 629}
630 630
631void GRenderWindow::CaptureScreenshot(u32 res_scale, const QString& screenshot_path) { 631void GRenderWindow::CaptureScreenshot(const QString& screenshot_path) {
632 VideoCore::RendererBase& renderer = system.Renderer(); 632 auto& renderer = system.Renderer();
633 if (res_scale == 0) { 633 const f32 res_scale = VideoCore::GetResolutionScaleFactor(renderer);
634 res_scale = VideoCore::GetResolutionScaleFactor(renderer);
635 }
636 634
637 const Layout::FramebufferLayout layout{Layout::FrameLayoutFromResolutionScale(res_scale)}; 635 const Layout::FramebufferLayout layout{Layout::FrameLayoutFromResolutionScale(res_scale)};
638 screenshot_image = QImage(QSize(layout.width, layout.height), QImage::Format_RGB32); 636 screenshot_image = QImage(QSize(layout.width, layout.height), QImage::Format_RGB32);
diff --git a/src/yuzu/bootmanager.h b/src/yuzu/bootmanager.h
index e6a0666e9..40fd4a9d6 100644
--- a/src/yuzu/bootmanager.h
+++ b/src/yuzu/bootmanager.h
@@ -178,7 +178,7 @@ public:
178 178
179 bool IsLoadingComplete() const; 179 bool IsLoadingComplete() const;
180 180
181 void CaptureScreenshot(u32 res_scale, const QString& screenshot_path); 181 void CaptureScreenshot(const QString& screenshot_path);
182 182
183 std::pair<u32, u32> ScaleTouch(const QPointF& pos) const; 183 std::pair<u32, u32> ScaleTouch(const QPointF& pos) const;
184 184
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index faea5dda1..8227d06bc 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -824,6 +824,9 @@ void Config::ReadRendererValues() {
824 ReadGlobalSetting(Settings::values.vulkan_device); 824 ReadGlobalSetting(Settings::values.vulkan_device);
825 ReadGlobalSetting(Settings::values.fullscreen_mode); 825 ReadGlobalSetting(Settings::values.fullscreen_mode);
826 ReadGlobalSetting(Settings::values.aspect_ratio); 826 ReadGlobalSetting(Settings::values.aspect_ratio);
827 ReadGlobalSetting(Settings::values.resolution_setup);
828 ReadGlobalSetting(Settings::values.scaling_filter);
829 ReadGlobalSetting(Settings::values.anti_aliasing);
827 ReadGlobalSetting(Settings::values.max_anisotropy); 830 ReadGlobalSetting(Settings::values.max_anisotropy);
828 ReadGlobalSetting(Settings::values.use_speed_limit); 831 ReadGlobalSetting(Settings::values.use_speed_limit);
829 ReadGlobalSetting(Settings::values.speed_limit); 832 ReadGlobalSetting(Settings::values.speed_limit);
@@ -1364,6 +1367,18 @@ void Config::SaveRendererValues() {
1364 static_cast<u32>(Settings::values.fullscreen_mode.GetDefault()), 1367 static_cast<u32>(Settings::values.fullscreen_mode.GetDefault()),
1365 Settings::values.fullscreen_mode.UsingGlobal()); 1368 Settings::values.fullscreen_mode.UsingGlobal());
1366 WriteGlobalSetting(Settings::values.aspect_ratio); 1369 WriteGlobalSetting(Settings::values.aspect_ratio);
1370 WriteSetting(QString::fromStdString(Settings::values.resolution_setup.GetLabel()),
1371 static_cast<u32>(Settings::values.resolution_setup.GetValue(global)),
1372 static_cast<u32>(Settings::values.resolution_setup.GetDefault()),
1373 Settings::values.resolution_setup.UsingGlobal());
1374 WriteSetting(QString::fromStdString(Settings::values.scaling_filter.GetLabel()),
1375 static_cast<u32>(Settings::values.scaling_filter.GetValue(global)),
1376 static_cast<u32>(Settings::values.scaling_filter.GetDefault()),
1377 Settings::values.scaling_filter.UsingGlobal());
1378 WriteSetting(QString::fromStdString(Settings::values.anti_aliasing.GetLabel()),
1379 static_cast<u32>(Settings::values.anti_aliasing.GetValue(global)),
1380 static_cast<u32>(Settings::values.anti_aliasing.GetDefault()),
1381 Settings::values.anti_aliasing.UsingGlobal());
1367 WriteGlobalSetting(Settings::values.max_anisotropy); 1382 WriteGlobalSetting(Settings::values.max_anisotropy);
1368 WriteGlobalSetting(Settings::values.use_speed_limit); 1383 WriteGlobalSetting(Settings::values.use_speed_limit);
1369 WriteGlobalSetting(Settings::values.speed_limit); 1384 WriteGlobalSetting(Settings::values.speed_limit);
diff --git a/src/yuzu/configuration/config.h b/src/yuzu/configuration/config.h
index a7f4a6720..d673c1cdc 100644
--- a/src/yuzu/configuration/config.h
+++ b/src/yuzu/configuration/config.h
@@ -189,5 +189,8 @@ Q_DECLARE_METATYPE(Settings::CPUAccuracy);
189Q_DECLARE_METATYPE(Settings::GPUAccuracy); 189Q_DECLARE_METATYPE(Settings::GPUAccuracy);
190Q_DECLARE_METATYPE(Settings::FullscreenMode); 190Q_DECLARE_METATYPE(Settings::FullscreenMode);
191Q_DECLARE_METATYPE(Settings::NvdecEmulation); 191Q_DECLARE_METATYPE(Settings::NvdecEmulation);
192Q_DECLARE_METATYPE(Settings::ResolutionSetup);
193Q_DECLARE_METATYPE(Settings::ScalingFilter);
194Q_DECLARE_METATYPE(Settings::AntiAliasing);
192Q_DECLARE_METATYPE(Settings::RendererBackend); 195Q_DECLARE_METATYPE(Settings::RendererBackend);
193Q_DECLARE_METATYPE(Settings::ShaderBackend); 196Q_DECLARE_METATYPE(Settings::ShaderBackend);
diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp
index 8e20cc6f3..59f975a6e 100644
--- a/src/yuzu/configuration/configure_graphics.cpp
+++ b/src/yuzu/configuration/configure_graphics.cpp
@@ -89,6 +89,7 @@ void ConfigureGraphics::SetConfiguration() {
89 ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock); 89 ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock);
90 ui->use_disk_shader_cache->setEnabled(runtime_lock); 90 ui->use_disk_shader_cache->setEnabled(runtime_lock);
91 ui->nvdec_emulation_widget->setEnabled(runtime_lock); 91 ui->nvdec_emulation_widget->setEnabled(runtime_lock);
92 ui->resolution_combobox->setEnabled(runtime_lock);
92 ui->accelerate_astc->setEnabled(runtime_lock); 93 ui->accelerate_astc->setEnabled(runtime_lock);
93 ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache.GetValue()); 94 ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache.GetValue());
94 ui->use_asynchronous_gpu_emulation->setChecked( 95 ui->use_asynchronous_gpu_emulation->setChecked(
@@ -102,6 +103,12 @@ void ConfigureGraphics::SetConfiguration() {
102 ui->nvdec_emulation->setCurrentIndex( 103 ui->nvdec_emulation->setCurrentIndex(
103 static_cast<int>(Settings::values.nvdec_emulation.GetValue())); 104 static_cast<int>(Settings::values.nvdec_emulation.GetValue()));
104 ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio.GetValue()); 105 ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio.GetValue());
106 ui->resolution_combobox->setCurrentIndex(
107 static_cast<int>(Settings::values.resolution_setup.GetValue()));
108 ui->scaling_filter_combobox->setCurrentIndex(
109 static_cast<int>(Settings::values.scaling_filter.GetValue()));
110 ui->anti_aliasing_combobox->setCurrentIndex(
111 static_cast<int>(Settings::values.anti_aliasing.GetValue()));
105 } else { 112 } else {
106 ConfigurationShared::SetPerGameSetting(ui->api, &Settings::values.renderer_backend); 113 ConfigurationShared::SetPerGameSetting(ui->api, &Settings::values.renderer_backend);
107 ConfigurationShared::SetHighlight(ui->api_widget, 114 ConfigurationShared::SetHighlight(ui->api_widget,
@@ -122,6 +129,21 @@ void ConfigureGraphics::SetConfiguration() {
122 ConfigurationShared::SetHighlight(ui->ar_label, 129 ConfigurationShared::SetHighlight(ui->ar_label,
123 !Settings::values.aspect_ratio.UsingGlobal()); 130 !Settings::values.aspect_ratio.UsingGlobal());
124 131
132 ConfigurationShared::SetPerGameSetting(ui->resolution_combobox,
133 &Settings::values.resolution_setup);
134 ConfigurationShared::SetHighlight(ui->resolution_label,
135 !Settings::values.resolution_setup.UsingGlobal());
136
137 ConfigurationShared::SetPerGameSetting(ui->scaling_filter_combobox,
138 &Settings::values.scaling_filter);
139 ConfigurationShared::SetHighlight(ui->scaling_filter_label,
140 !Settings::values.scaling_filter.UsingGlobal());
141
142 ConfigurationShared::SetPerGameSetting(ui->anti_aliasing_combobox,
143 &Settings::values.anti_aliasing);
144 ConfigurationShared::SetHighlight(ui->anti_aliasing_label,
145 !Settings::values.anti_aliasing.UsingGlobal());
146
125 ui->bg_combobox->setCurrentIndex(Settings::values.bg_red.UsingGlobal() ? 0 : 1); 147 ui->bg_combobox->setCurrentIndex(Settings::values.bg_red.UsingGlobal() ? 0 : 1);
126 ui->bg_button->setEnabled(!Settings::values.bg_red.UsingGlobal()); 148 ui->bg_button->setEnabled(!Settings::values.bg_red.UsingGlobal());
127 ConfigurationShared::SetHighlight(ui->bg_layout, !Settings::values.bg_red.UsingGlobal()); 149 ConfigurationShared::SetHighlight(ui->bg_layout, !Settings::values.bg_red.UsingGlobal());
@@ -133,11 +155,22 @@ void ConfigureGraphics::SetConfiguration() {
133} 155}
134 156
135void ConfigureGraphics::ApplyConfiguration() { 157void ConfigureGraphics::ApplyConfiguration() {
158 const auto resolution_setup = static_cast<Settings::ResolutionSetup>(
159 ui->resolution_combobox->currentIndex() -
160 ((Settings::IsConfiguringGlobal()) ? 0 : ConfigurationShared::USE_GLOBAL_OFFSET));
161
162 const auto scaling_filter = static_cast<Settings::ScalingFilter>(
163 ui->scaling_filter_combobox->currentIndex() -
164 ((Settings::IsConfiguringGlobal()) ? 0 : ConfigurationShared::USE_GLOBAL_OFFSET));
165
166 const auto anti_aliasing = static_cast<Settings::AntiAliasing>(
167 ui->anti_aliasing_combobox->currentIndex() -
168 ((Settings::IsConfiguringGlobal()) ? 0 : ConfigurationShared::USE_GLOBAL_OFFSET));
169
136 ConfigurationShared::ApplyPerGameSetting(&Settings::values.fullscreen_mode, 170 ConfigurationShared::ApplyPerGameSetting(&Settings::values.fullscreen_mode,
137 ui->fullscreen_mode_combobox); 171 ui->fullscreen_mode_combobox);
138 ConfigurationShared::ApplyPerGameSetting(&Settings::values.aspect_ratio, 172 ConfigurationShared::ApplyPerGameSetting(&Settings::values.aspect_ratio,
139 ui->aspect_ratio_combobox); 173 ui->aspect_ratio_combobox);
140
141 ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_disk_shader_cache, 174 ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_disk_shader_cache,
142 ui->use_disk_shader_cache, use_disk_shader_cache); 175 ui->use_disk_shader_cache, use_disk_shader_cache);
143 ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_gpu_emulation, 176 ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_gpu_emulation,
@@ -165,7 +198,34 @@ void ConfigureGraphics::ApplyConfiguration() {
165 Settings::values.bg_green.SetValue(static_cast<u8>(bg_color.green())); 198 Settings::values.bg_green.SetValue(static_cast<u8>(bg_color.green()));
166 Settings::values.bg_blue.SetValue(static_cast<u8>(bg_color.blue())); 199 Settings::values.bg_blue.SetValue(static_cast<u8>(bg_color.blue()));
167 } 200 }
201 if (Settings::values.resolution_setup.UsingGlobal()) {
202 Settings::values.resolution_setup.SetValue(resolution_setup);
203 }
204 if (Settings::values.scaling_filter.UsingGlobal()) {
205 Settings::values.scaling_filter.SetValue(scaling_filter);
206 }
207 if (Settings::values.anti_aliasing.UsingGlobal()) {
208 Settings::values.anti_aliasing.SetValue(anti_aliasing);
209 }
168 } else { 210 } else {
211 if (ui->resolution_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
212 Settings::values.resolution_setup.SetGlobal(true);
213 } else {
214 Settings::values.resolution_setup.SetGlobal(false);
215 Settings::values.resolution_setup.SetValue(resolution_setup);
216 }
217 if (ui->scaling_filter_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
218 Settings::values.scaling_filter.SetGlobal(true);
219 } else {
220 Settings::values.scaling_filter.SetGlobal(false);
221 Settings::values.scaling_filter.SetValue(scaling_filter);
222 }
223 if (ui->anti_aliasing_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
224 Settings::values.anti_aliasing.SetGlobal(true);
225 } else {
226 Settings::values.anti_aliasing.SetGlobal(false);
227 Settings::values.anti_aliasing.SetValue(anti_aliasing);
228 }
169 if (ui->api->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) { 229 if (ui->api->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
170 Settings::values.renderer_backend.SetGlobal(true); 230 Settings::values.renderer_backend.SetGlobal(true);
171 Settings::values.shader_backend.SetGlobal(true); 231 Settings::values.shader_backend.SetGlobal(true);
@@ -312,6 +372,9 @@ void ConfigureGraphics::SetupPerGameUI() {
312 ui->device->setEnabled(Settings::values.renderer_backend.UsingGlobal()); 372 ui->device->setEnabled(Settings::values.renderer_backend.UsingGlobal());
313 ui->fullscreen_mode_combobox->setEnabled(Settings::values.fullscreen_mode.UsingGlobal()); 373 ui->fullscreen_mode_combobox->setEnabled(Settings::values.fullscreen_mode.UsingGlobal());
314 ui->aspect_ratio_combobox->setEnabled(Settings::values.aspect_ratio.UsingGlobal()); 374 ui->aspect_ratio_combobox->setEnabled(Settings::values.aspect_ratio.UsingGlobal());
375 ui->resolution_combobox->setEnabled(Settings::values.resolution_setup.UsingGlobal());
376 ui->scaling_filter_combobox->setEnabled(Settings::values.scaling_filter.UsingGlobal());
377 ui->anti_aliasing_combobox->setEnabled(Settings::values.anti_aliasing.UsingGlobal());
315 ui->use_asynchronous_gpu_emulation->setEnabled( 378 ui->use_asynchronous_gpu_emulation->setEnabled(
316 Settings::values.use_asynchronous_gpu_emulation.UsingGlobal()); 379 Settings::values.use_asynchronous_gpu_emulation.UsingGlobal());
317 ui->nvdec_emulation->setEnabled(Settings::values.nvdec_emulation.UsingGlobal()); 380 ui->nvdec_emulation->setEnabled(Settings::values.nvdec_emulation.UsingGlobal());
@@ -340,6 +403,15 @@ void ConfigureGraphics::SetupPerGameUI() {
340 ConfigurationShared::SetColoredComboBox( 403 ConfigurationShared::SetColoredComboBox(
341 ui->fullscreen_mode_combobox, ui->fullscreen_mode_label, 404 ui->fullscreen_mode_combobox, ui->fullscreen_mode_label,
342 static_cast<int>(Settings::values.fullscreen_mode.GetValue(true))); 405 static_cast<int>(Settings::values.fullscreen_mode.GetValue(true)));
406 ConfigurationShared::SetColoredComboBox(
407 ui->resolution_combobox, ui->resolution_label,
408 static_cast<int>(Settings::values.resolution_setup.GetValue(true)));
409 ConfigurationShared::SetColoredComboBox(
410 ui->scaling_filter_combobox, ui->scaling_filter_label,
411 static_cast<int>(Settings::values.scaling_filter.GetValue(true)));
412 ConfigurationShared::SetColoredComboBox(
413 ui->anti_aliasing_combobox, ui->anti_aliasing_label,
414 static_cast<int>(Settings::values.anti_aliasing.GetValue(true)));
343 ConfigurationShared::InsertGlobalItem( 415 ConfigurationShared::InsertGlobalItem(
344 ui->api, static_cast<int>(Settings::values.renderer_backend.GetValue(true))); 416 ui->api, static_cast<int>(Settings::values.renderer_backend.GetValue(true)));
345 ConfigurationShared::InsertGlobalItem( 417 ConfigurationShared::InsertGlobalItem(
diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui
index beae74344..660b68c1c 100644
--- a/src/yuzu/configuration/configure_graphics.ui
+++ b/src/yuzu/configuration/configure_graphics.ui
@@ -310,6 +310,173 @@
310 </widget> 310 </widget>
311 </item> 311 </item>
312 <item> 312 <item>
313 <widget class="QWidget" name="resolution_layout" native="true">
314 <layout class="QHBoxLayout" name="horizontalLayout_5">
315 <property name="leftMargin">
316 <number>0</number>
317 </property>
318 <property name="topMargin">
319 <number>0</number>
320 </property>
321 <property name="rightMargin">
322 <number>0</number>
323 </property>
324 <property name="bottomMargin">
325 <number>0</number>
326 </property>
327 <item>
328 <widget class="QLabel" name="resolution_label">
329 <property name="text">
330 <string>Resolution:</string>
331 </property>
332 </widget>
333 </item>
334 <item>
335 <widget class="QComboBox" name="resolution_combobox">
336 <item>
337 <property name="text">
338 <string>0.5X (360p/540p) [EXPERIMENTAL]</string>
339 </property>
340 </item>
341 <item>
342 <property name="text">
343 <string>0.75X (540p/810p) [EXPERIMENTAL]</string>
344 </property>
345 </item>
346 <item>
347 <property name="text">
348 <string>1X (720p/1080p)</string>
349 </property>
350 </item>
351 <item>
352 <property name="text">
353 <string>2X (1440p/2160p)</string>
354 </property>
355 </item>
356 <item>
357 <property name="text">
358 <string>3X (2160p/3240p)</string>
359 </property>
360 </item>
361 <item>
362 <property name="text">
363 <string>4X (2880p/4320p)</string>
364 </property>
365 </item>
366 <item>
367 <property name="text">
368 <string>5X (3600p/5400p)</string>
369 </property>
370 </item>
371 <item>
372 <property name="text">
373 <string>6X (4320p/6480p)</string>
374 </property>
375 </item>
376 </widget>
377 </item>
378 </layout>
379 </widget>
380 </item>
381 <item>
382 <widget class="QWidget" name="scaling_filter_layout" native="true">
383 <layout class="QHBoxLayout" name="horizontalLayout_6">
384 <property name="leftMargin">
385 <number>0</number>
386 </property>
387 <property name="topMargin">
388 <number>0</number>
389 </property>
390 <property name="rightMargin">
391 <number>0</number>
392 </property>
393 <property name="bottomMargin">
394 <number>0</number>
395 </property>
396 <item>
397 <widget class="QLabel" name="scaling_filter_label">
398 <property name="text">
399 <string>Window Adapting Filter:</string>
400 </property>
401 </widget>
402 </item>
403 <item>
404 <widget class="QComboBox" name="scaling_filter_combobox">
405 <item>
406 <property name="text">
407 <string>Nearest Neighbor</string>
408 </property>
409 </item>
410 <item>
411 <property name="text">
412 <string>Bilinear</string>
413 </property>
414 </item>
415 <item>
416 <property name="text">
417 <string>Bicubic</string>
418 </property>
419 </item>
420 <item>
421 <property name="text">
422 <string>Gaussian</string>
423 </property>
424 </item>
425 <item>
426 <property name="text">
427 <string>ScaleForce</string>
428 </property>
429 </item>
430 <item>
431 <property name="text">
432 <string>AMD's FidelityFX™️ Super Resolution [Vulkan Only]</string>
433 </property>
434 </item>
435 </widget>
436 </item>
437 </layout>
438 </widget>
439 </item>
440 <item>
441 <widget class="QWidget" name="anti_aliasing_layout" native="true">
442 <layout class="QHBoxLayout" name="horizontalLayout_7">
443 <property name="leftMargin">
444 <number>0</number>
445 </property>
446 <property name="topMargin">
447 <number>0</number>
448 </property>
449 <property name="rightMargin">
450 <number>0</number>
451 </property>
452 <property name="bottomMargin">
453 <number>0</number>
454 </property>
455 <item>
456 <widget class="QLabel" name="anti_aliasing_label">
457 <property name="text">
458 <string>Anti-Aliasing Method:</string>
459 </property>
460 </widget>
461 </item>
462 <item>
463 <widget class="QComboBox" name="anti_aliasing_combobox">
464 <item>
465 <property name="text">
466 <string>None</string>
467 </property>
468 </item>
469 <item>
470 <property name="text">
471 <string>FXAA</string>
472 </property>
473 </item>
474 </widget>
475 </item>
476 </layout>
477 </widget>
478 </item>
479 <item>
313 <widget class="QWidget" name="bg_layout" native="true"> 480 <widget class="QWidget" name="bg_layout" native="true">
314 <property name="sizePolicy"> 481 <property name="sizePolicy">
315 <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> 482 <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui
index d06b45f17..96de0b3d1 100644
--- a/src/yuzu/configuration/configure_graphics_advanced.ui
+++ b/src/yuzu/configuration/configure_graphics_advanced.ui
@@ -125,27 +125,32 @@
125 <widget class="QComboBox" name="anisotropic_filtering_combobox"> 125 <widget class="QComboBox" name="anisotropic_filtering_combobox">
126 <item> 126 <item>
127 <property name="text"> 127 <property name="text">
128 <string>Automatic</string>
129 </property>
130 </item>
131 <item>
132 <property name="text">
128 <string>Default</string> 133 <string>Default</string>
129 </property> 134 </property>
130 </item> 135 </item>
131 <item> 136 <item>
132 <property name="text"> 137 <property name="text">
133 <string>2x (WILL BREAK THINGS)</string> 138 <string>2x</string>
134 </property> 139 </property>
135 </item> 140 </item>
136 <item> 141 <item>
137 <property name="text"> 142 <property name="text">
138 <string>4x (WILL BREAK THINGS)</string> 143 <string>4x</string>
139 </property> 144 </property>
140 </item> 145 </item>
141 <item> 146 <item>
142 <property name="text"> 147 <property name="text">
143 <string>8x (WILL BREAK THINGS)</string> 148 <string>8x</string>
144 </property> 149 </property>
145 </item> 150 </item>
146 <item> 151 <item>
147 <property name="text"> 152 <property name="text">
148 <string>16x (WILL BREAK THINGS)</string> 153 <string>16x</string>
149 </property> 154 </property>
150 </item> 155 </item>
151 </widget> 156 </widget>
diff --git a/src/yuzu/debugger/profiler.cpp b/src/yuzu/debugger/profiler.cpp
index 33110685a..a8b254199 100644
--- a/src/yuzu/debugger/profiler.cpp
+++ b/src/yuzu/debugger/profiler.cpp
@@ -163,7 +163,7 @@ void MicroProfileWidget::mouseReleaseEvent(QMouseEvent* ev) {
163} 163}
164 164
165void MicroProfileWidget::wheelEvent(QWheelEvent* ev) { 165void MicroProfileWidget::wheelEvent(QWheelEvent* ev) {
166 const auto wheel_position = ev->position().toPoint(); 166 const auto wheel_position = ev->pos();
167 MicroProfileMousePosition(wheel_position.x() / x_scale, wheel_position.y() / y_scale, 167 MicroProfileMousePosition(wheel_position.x() / x_scale, wheel_position.y() / y_scale,
168 ev->angleDelta().y() / 120); 168 ev->angleDelta().y() / 120);
169 ev->accept(); 169 ev->accept();
diff --git a/src/yuzu/game_list.cpp b/src/yuzu/game_list.cpp
index 6bd0f9ee9..2af95dbe5 100644
--- a/src/yuzu/game_list.cpp
+++ b/src/yuzu/game_list.cpp
@@ -159,7 +159,7 @@ GameListSearchField::GameListSearchField(GameList* parent) : QWidget{parent} {
159 * @return true if the haystack contains all words of userinput 159 * @return true if the haystack contains all words of userinput
160 */ 160 */
161static bool ContainsAllWords(const QString& haystack, const QString& userinput) { 161static bool ContainsAllWords(const QString& haystack, const QString& userinput) {
162 const QStringList userinput_split = userinput.split(QLatin1Char{' '}, Qt::SkipEmptyParts); 162 const QStringList userinput_split = userinput.split(QLatin1Char{' '}, QString::SkipEmptyParts);
163 163
164 return std::all_of(userinput_split.begin(), userinput_split.end(), 164 return std::all_of(userinput_split.begin(), userinput_split.end(),
165 [&haystack](const QString& s) { return haystack.contains(s); }); 165 [&haystack](const QString& s) { return haystack.contains(s); });
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index 4e5552d2a..d057dc889 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -747,6 +747,8 @@ void GMainWindow::InitializeWidgets() {
747 747
748 shader_building_label = new QLabel(); 748 shader_building_label = new QLabel();
749 shader_building_label->setToolTip(tr("The amount of shaders currently being built")); 749 shader_building_label->setToolTip(tr("The amount of shaders currently being built"));
750 res_scale_label = new QLabel();
751 res_scale_label->setToolTip(tr("The current selected resolution scaling multiplier."));
750 emu_speed_label = new QLabel(); 752 emu_speed_label = new QLabel();
751 emu_speed_label->setToolTip( 753 emu_speed_label->setToolTip(
752 tr("Current emulation speed. Values higher or lower than 100% " 754 tr("Current emulation speed. Values higher or lower than 100% "
@@ -759,8 +761,8 @@ void GMainWindow::InitializeWidgets() {
759 tr("Time taken to emulate a Switch frame, not counting framelimiting or v-sync. For " 761 tr("Time taken to emulate a Switch frame, not counting framelimiting or v-sync. For "
760 "full-speed emulation this should be at most 16.67 ms.")); 762 "full-speed emulation this should be at most 16.67 ms."));
761 763
762 for (auto& label : 764 for (auto& label : {shader_building_label, res_scale_label, emu_speed_label, game_fps_label,
763 {shader_building_label, emu_speed_label, game_fps_label, emu_frametime_label}) { 765 emu_frametime_label}) {
764 label->setVisible(false); 766 label->setVisible(false);
765 label->setFrameStyle(QFrame::NoFrame); 767 label->setFrameStyle(QFrame::NoFrame);
766 label->setContentsMargins(4, 0, 4, 0); 768 label->setContentsMargins(4, 0, 4, 0);
@@ -772,6 +774,55 @@ void GMainWindow::InitializeWidgets() {
772 tas_label->setFocusPolicy(Qt::NoFocus); 774 tas_label->setFocusPolicy(Qt::NoFocus);
773 statusBar()->insertPermanentWidget(0, tas_label); 775 statusBar()->insertPermanentWidget(0, tas_label);
774 776
777 // setup AA button
778 aa_status_button = new QPushButton();
779 aa_status_button->setObjectName(QStringLiteral("TogglableStatusBarButton"));
780 aa_status_button->setFocusPolicy(Qt::NoFocus);
781 connect(aa_status_button, &QPushButton::clicked, [&] {
782 auto aa_mode = Settings::values.anti_aliasing.GetValue();
783 if (aa_mode == Settings::AntiAliasing::LastAA) {
784 aa_mode = Settings::AntiAliasing::None;
785 } else {
786 aa_mode = static_cast<Settings::AntiAliasing>(static_cast<u32>(aa_mode) + 1);
787 }
788 Settings::values.anti_aliasing.SetValue(aa_mode);
789 aa_status_button->setChecked(true);
790 UpdateAAText();
791 });
792 UpdateAAText();
793 aa_status_button->setCheckable(true);
794 aa_status_button->setChecked(true);
795 statusBar()->insertPermanentWidget(0, aa_status_button);
796
797 // Setup Filter button
798 filter_status_button = new QPushButton();
799 filter_status_button->setObjectName(QStringLiteral("TogglableStatusBarButton"));
800 filter_status_button->setFocusPolicy(Qt::NoFocus);
801 connect(filter_status_button, &QPushButton::clicked, [&] {
802 auto filter = Settings::values.scaling_filter.GetValue();
803 if (filter == Settings::ScalingFilter::LastFilter) {
804 filter = Settings::ScalingFilter::NearestNeighbor;
805 } else {
806 filter = static_cast<Settings::ScalingFilter>(static_cast<u32>(filter) + 1);
807 }
808 if (Settings::values.renderer_backend.GetValue() == Settings::RendererBackend::OpenGL &&
809 filter == Settings::ScalingFilter::Fsr) {
810 filter = Settings::ScalingFilter::NearestNeighbor;
811 }
812 Settings::values.scaling_filter.SetValue(filter);
813 filter_status_button->setChecked(true);
814 UpdateFilterText();
815 });
816 auto filter = Settings::values.scaling_filter.GetValue();
817 if (Settings::values.renderer_backend.GetValue() == Settings::RendererBackend::OpenGL &&
818 filter == Settings::ScalingFilter::Fsr) {
819 Settings::values.scaling_filter.SetValue(Settings::ScalingFilter::NearestNeighbor);
820 }
821 UpdateFilterText();
822 filter_status_button->setCheckable(true);
823 filter_status_button->setChecked(true);
824 statusBar()->insertPermanentWidget(0, filter_status_button);
825
775 // Setup Dock button 826 // Setup Dock button
776 dock_status_button = new QPushButton(); 827 dock_status_button = new QPushButton();
777 dock_status_button->setObjectName(QStringLiteral("TogglableStatusBarButton")); 828 dock_status_button->setObjectName(QStringLiteral("TogglableStatusBarButton"));
@@ -842,6 +893,11 @@ void GMainWindow::InitializeWidgets() {
842 Settings::values.renderer_backend.SetValue(Settings::RendererBackend::Vulkan); 893 Settings::values.renderer_backend.SetValue(Settings::RendererBackend::Vulkan);
843 } else { 894 } else {
844 Settings::values.renderer_backend.SetValue(Settings::RendererBackend::OpenGL); 895 Settings::values.renderer_backend.SetValue(Settings::RendererBackend::OpenGL);
896 const auto filter = Settings::values.scaling_filter.GetValue();
897 if (filter == Settings::ScalingFilter::Fsr) {
898 Settings::values.scaling_filter.SetValue(Settings::ScalingFilter::NearestNeighbor);
899 UpdateFilterText();
900 }
845 } 901 }
846 902
847 system->ApplySettings(); 903 system->ApplySettings();
@@ -1535,6 +1591,7 @@ void GMainWindow::ShutdownGame() {
1535 // Disable status bar updates 1591 // Disable status bar updates
1536 status_bar_update_timer.stop(); 1592 status_bar_update_timer.stop();
1537 shader_building_label->setVisible(false); 1593 shader_building_label->setVisible(false);
1594 res_scale_label->setVisible(false);
1538 emu_speed_label->setVisible(false); 1595 emu_speed_label->setVisible(false);
1539 game_fps_label->setVisible(false); 1596 game_fps_label->setVisible(false);
1540 emu_frametime_label->setVisible(false); 1597 emu_frametime_label->setVisible(false);
@@ -2889,8 +2946,7 @@ void GMainWindow::OnCaptureScreenshot() {
2889 } 2946 }
2890 } 2947 }
2891#endif 2948#endif
2892 render_window->CaptureScreenshot(UISettings::values.screenshot_resolution_factor.GetValue(), 2949 render_window->CaptureScreenshot(filename);
2893 filename);
2894} 2950}
2895 2951
2896// TODO: Written 2020-10-01: Remove per-game config migration code when it is irrelevant 2952// TODO: Written 2020-10-01: Remove per-game config migration code when it is irrelevant
@@ -2981,6 +3037,11 @@ void GMainWindow::UpdateStatusBar() {
2981 shader_building_label->setVisible(false); 3037 shader_building_label->setVisible(false);
2982 } 3038 }
2983 3039
3040 const auto res_info = Settings::values.resolution_info;
3041 const auto res_scale = res_info.up_factor;
3042 res_scale_label->setText(
3043 tr("Scale: %1x", "%1 is the resolution scaling factor").arg(res_scale));
3044
2984 if (Settings::values.use_speed_limit.GetValue()) { 3045 if (Settings::values.use_speed_limit.GetValue()) {
2985 emu_speed_label->setText(tr("Speed: %1% / %2%") 3046 emu_speed_label->setText(tr("Speed: %1% / %2%")
2986 .arg(results.emulation_speed * 100.0, 0, 'f', 0) 3047 .arg(results.emulation_speed * 100.0, 0, 'f', 0)
@@ -2996,6 +3057,7 @@ void GMainWindow::UpdateStatusBar() {
2996 } 3057 }
2997 emu_frametime_label->setText(tr("Frame: %1 ms").arg(results.frametime * 1000.0, 0, 'f', 2)); 3058 emu_frametime_label->setText(tr("Frame: %1 ms").arg(results.frametime * 1000.0, 0, 'f', 2));
2998 3059
3060 res_scale_label->setVisible(true);
2999 emu_speed_label->setVisible(!Settings::values.use_multi_core.GetValue()); 3061 emu_speed_label->setVisible(!Settings::values.use_multi_core.GetValue());
3000 game_fps_label->setVisible(true); 3062 game_fps_label->setVisible(true);
3001 emu_frametime_label->setVisible(true); 3063 emu_frametime_label->setVisible(true);
@@ -3025,11 +3087,55 @@ void GMainWindow::UpdateGPUAccuracyButton() {
3025 } 3087 }
3026} 3088}
3027 3089
3090void GMainWindow::UpdateFilterText() {
3091 const auto filter = Settings::values.scaling_filter.GetValue();
3092 switch (filter) {
3093 case Settings::ScalingFilter::NearestNeighbor:
3094 filter_status_button->setText(tr("NEAREST"));
3095 break;
3096 case Settings::ScalingFilter::Bilinear:
3097 filter_status_button->setText(tr("BILINEAR"));
3098 break;
3099 case Settings::ScalingFilter::Bicubic:
3100 filter_status_button->setText(tr("BICUBIC"));
3101 break;
3102 case Settings::ScalingFilter::Gaussian:
3103 filter_status_button->setText(tr("GAUSSIAN"));
3104 break;
3105 case Settings::ScalingFilter::ScaleForce:
3106 filter_status_button->setText(tr("SCALEFORCE"));
3107 break;
3108 case Settings::ScalingFilter::Fsr:
3109 filter_status_button->setText(tr("AMD'S FIDELITYFX SR"));
3110 break;
3111 default:
3112 filter_status_button->setText(tr("BILINEAR"));
3113 break;
3114 }
3115}
3116
3117void GMainWindow::UpdateAAText() {
3118 const auto aa_mode = Settings::values.anti_aliasing.GetValue();
3119 switch (aa_mode) {
3120 case Settings::AntiAliasing::Fxaa:
3121 aa_status_button->setText(tr("FXAA"));
3122 break;
3123 case Settings::AntiAliasing::None:
3124 aa_status_button->setText(tr("NO AA"));
3125 break;
3126 default:
3127 aa_status_button->setText(tr("FXAA"));
3128 break;
3129 }
3130}
3131
3028void GMainWindow::UpdateStatusButtons() { 3132void GMainWindow::UpdateStatusButtons() {
3029 dock_status_button->setChecked(Settings::values.use_docked_mode.GetValue()); 3133 dock_status_button->setChecked(Settings::values.use_docked_mode.GetValue());
3030 renderer_status_button->setChecked(Settings::values.renderer_backend.GetValue() == 3134 renderer_status_button->setChecked(Settings::values.renderer_backend.GetValue() ==
3031 Settings::RendererBackend::Vulkan); 3135 Settings::RendererBackend::Vulkan);
3032 UpdateGPUAccuracyButton(); 3136 UpdateGPUAccuracyButton();
3137 UpdateFilterText();
3138 UpdateAAText();
3033} 3139}
3034 3140
3035void GMainWindow::UpdateUISettings() { 3141void GMainWindow::UpdateUISettings() {
diff --git a/src/yuzu/main.h b/src/yuzu/main.h
index 981102daa..24633ff2d 100644
--- a/src/yuzu/main.h
+++ b/src/yuzu/main.h
@@ -302,6 +302,8 @@ private:
302 void MigrateConfigFiles(); 302 void MigrateConfigFiles();
303 void UpdateWindowTitle(std::string_view title_name = {}, std::string_view title_version = {}, 303 void UpdateWindowTitle(std::string_view title_name = {}, std::string_view title_version = {},
304 std::string_view gpu_vendor = {}); 304 std::string_view gpu_vendor = {});
305 void UpdateFilterText();
306 void UpdateAAText();
305 void UpdateStatusBar(); 307 void UpdateStatusBar();
306 void UpdateGPUAccuracyButton(); 308 void UpdateGPUAccuracyButton();
307 void UpdateStatusButtons(); 309 void UpdateStatusButtons();
@@ -328,6 +330,7 @@ private:
328 // Status bar elements 330 // Status bar elements
329 QLabel* message_label = nullptr; 331 QLabel* message_label = nullptr;
330 QLabel* shader_building_label = nullptr; 332 QLabel* shader_building_label = nullptr;
333 QLabel* res_scale_label = nullptr;
331 QLabel* emu_speed_label = nullptr; 334 QLabel* emu_speed_label = nullptr;
332 QLabel* game_fps_label = nullptr; 335 QLabel* game_fps_label = nullptr;
333 QLabel* emu_frametime_label = nullptr; 336 QLabel* emu_frametime_label = nullptr;
@@ -335,6 +338,8 @@ private:
335 QPushButton* gpu_accuracy_button = nullptr; 338 QPushButton* gpu_accuracy_button = nullptr;
336 QPushButton* renderer_status_button = nullptr; 339 QPushButton* renderer_status_button = nullptr;
337 QPushButton* dock_status_button = nullptr; 340 QPushButton* dock_status_button = nullptr;
341 QPushButton* filter_status_button = nullptr;
342 QPushButton* aa_status_button = nullptr;
338 QTimer status_bar_update_timer; 343 QTimer status_bar_update_timer;
339 344
340 std::unique_ptr<Config> config; 345 std::unique_ptr<Config> config;
diff --git a/src/yuzu/uisettings.h b/src/yuzu/uisettings.h
index cac19452f..936914ef3 100644
--- a/src/yuzu/uisettings.h
+++ b/src/yuzu/uisettings.h
@@ -68,7 +68,6 @@ struct Values {
68 Settings::BasicSetting<bool> enable_discord_presence{true, "enable_discord_presence"}; 68 Settings::BasicSetting<bool> enable_discord_presence{true, "enable_discord_presence"};
69 69
70 Settings::BasicSetting<bool> enable_screenshot_save_as{true, "enable_screenshot_save_as"}; 70 Settings::BasicSetting<bool> enable_screenshot_save_as{true, "enable_screenshot_save_as"};
71 Settings::BasicSetting<u16> screenshot_resolution_factor{0, "screenshot_resolution_factor"};
72 71
73 QString roms_path; 72 QString roms_path;
74 QString symbols_path; 73 QString symbols_path;
diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp
index 0b8fde691..33241ea98 100644
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -451,6 +451,9 @@ void Config::ReadValues() {
451 ReadSetting("Renderer", Settings::values.disable_shader_loop_safety_checks); 451 ReadSetting("Renderer", Settings::values.disable_shader_loop_safety_checks);
452 ReadSetting("Renderer", Settings::values.vulkan_device); 452 ReadSetting("Renderer", Settings::values.vulkan_device);
453 453
454 ReadSetting("Renderer", Settings::values.resolution_setup);
455 ReadSetting("Renderer", Settings::values.scaling_filter);
456 ReadSetting("Renderer", Settings::values.anti_aliasing);
454 ReadSetting("Renderer", Settings::values.fullscreen_mode); 457 ReadSetting("Renderer", Settings::values.fullscreen_mode);
455 ReadSetting("Renderer", Settings::values.aspect_ratio); 458 ReadSetting("Renderer", Settings::values.aspect_ratio);
456 ReadSetting("Renderer", Settings::values.max_anisotropy); 459 ReadSetting("Renderer", Settings::values.max_anisotropy);
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h
index 339dca766..ecdc271a8 100644
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -236,6 +236,29 @@ disable_shader_loop_safety_checks =
236# Which Vulkan physical device to use (defaults to 0) 236# Which Vulkan physical device to use (defaults to 0)
237vulkan_device = 237vulkan_device =
238 238
239# 0: 0.5x (360p/540p) [EXPERIMENTAL]
240# 1: 0.75x (540p/810p) [EXPERIMENTAL]
241# 2 (default): 1x (720p/1080p)
242# 3: 2x (1440p/2160p)
243# 4: 3x (2160p/3240p)
244# 5: 4x (2880p/4320p)
245# 6: 5x (3600p/5400p)
246# 7: 6x (4320p/6480p)
247resolution_setup =
248
249# Pixel filter to use when up- or down-sampling rendered frames.
250# 0: Nearest Neighbor
251# 1 (default): Bilinear
252# 2: Bicubic
253# 3: Gaussian
254# 4: ScaleForce
255# 5: AMD FidelityFX™️ Super Resolution [Vulkan Only]
256scaling_filter =
257
258# Anti-Aliasing (AA)
259# 0 (default): None, 1: FXAA
260anti_aliasing =
261
239# Whether to use fullscreen or borderless window mode 262# Whether to use fullscreen or borderless window mode
240# 0 (Windows default): Borderless window, 1 (All other default): Exclusive fullscreen 263# 0 (Windows default): Borderless window, 1 (All other default): Exclusive fullscreen
241fullscreen_mode = 264fullscreen_mode =