diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/shader_recompiler/backend/glsl/emit_glsl_image.cpp | 29 | ||||
| -rw-r--r-- | src/shader_recompiler/backend/spirv/emit_spirv_image.cpp | 39 | ||||
| -rw-r--r-- | src/shader_recompiler/profile.h | 4 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.cpp | 1 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.h | 5 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_cache.cpp | 1 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_pipeline_cache.cpp | 1 | ||||
| -rw-r--r-- | src/video_core/vulkan_common/vulkan_device.cpp | 1 | ||||
| -rw-r--r-- | src/video_core/vulkan_common/vulkan_device.h | 5 |
9 files changed, 86 insertions, 0 deletions
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp index f335c8af0..418505475 100644 --- a/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp +++ b/src/shader_recompiler/backend/glsl/emit_glsl_image.cpp | |||
| @@ -143,6 +143,21 @@ IR::Inst* PrepareSparse(IR::Inst& inst) { | |||
| 143 | } | 143 | } |
| 144 | return sparse_inst; | 144 | return sparse_inst; |
| 145 | } | 145 | } |
| 146 | |||
| 147 | std::string ImageGatherSubpixelOffset(const IR::TextureInstInfo& info, std::string_view texture, | ||
| 148 | std::string_view coords) { | ||
| 149 | switch (info.type) { | ||
| 150 | case TextureType::Color2D: | ||
| 151 | case TextureType::Color2DRect: | ||
| 152 | return fmt::format("{}+vec2(0.001953125)/vec2(textureSize({}, 0))", coords, texture); | ||
| 153 | case TextureType::ColorArray2D: | ||
| 154 | case TextureType::ColorCube: | ||
| 155 | return fmt::format("vec3({0}.xy+vec2(0.001953125)/vec2(textureSize({1}, 0)),{0}.z)", coords, | ||
| 156 | texture); | ||
| 157 | default: | ||
| 158 | return std::string{coords}; | ||
| 159 | } | ||
| 160 | } | ||
| 146 | } // Anonymous namespace | 161 | } // Anonymous namespace |
| 147 | 162 | ||
| 148 | void EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, | 163 | void EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, |
| @@ -340,6 +355,13 @@ void EmitImageGather(EmitContext& ctx, IR::Inst& inst, const IR::Value& index, | |||
| 340 | LOG_WARNING(Shader_GLSL, "Device does not support sparse texture queries. STUBBING"); | 355 | LOG_WARNING(Shader_GLSL, "Device does not support sparse texture queries. STUBBING"); |
| 341 | ctx.AddU1("{}=true;", *sparse_inst); | 356 | ctx.AddU1("{}=true;", *sparse_inst); |
| 342 | } | 357 | } |
| 358 | std::string coords_with_subpixel_offset; | ||
| 359 | if (ctx.profile.need_gather_subpixel_offset) { | ||
| 360 | // Apply a subpixel offset of 1/512 the texel size of the texture to ensure the same | ||
| 361 | // rounding on AMD hardware as on Maxwell or other Nvidia architectures. | ||
| 362 | coords_with_subpixel_offset = ImageGatherSubpixelOffset(info, texture, coords); | ||
| 363 | coords = coords_with_subpixel_offset; | ||
| 364 | } | ||
| 343 | if (!sparse_inst || !supports_sparse) { | 365 | if (!sparse_inst || !supports_sparse) { |
| 344 | if (offset.IsEmpty()) { | 366 | if (offset.IsEmpty()) { |
| 345 | ctx.Add("{}=textureGather({},{},int({}));", texel, texture, coords, | 367 | ctx.Add("{}=textureGather({},{},int({}));", texel, texture, coords, |
| @@ -387,6 +409,13 @@ void EmitImageGatherDref(EmitContext& ctx, IR::Inst& inst, const IR::Value& inde | |||
| 387 | LOG_WARNING(Shader_GLSL, "Device does not support sparse texture queries. STUBBING"); | 409 | LOG_WARNING(Shader_GLSL, "Device does not support sparse texture queries. STUBBING"); |
| 388 | ctx.AddU1("{}=true;", *sparse_inst); | 410 | ctx.AddU1("{}=true;", *sparse_inst); |
| 389 | } | 411 | } |
| 412 | std::string coords_with_subpixel_offset; | ||
| 413 | if (ctx.profile.need_gather_subpixel_offset) { | ||
| 414 | // Apply a subpixel offset of 1/512 the texel size of the texture to ensure the same | ||
| 415 | // rounding on AMD hardware as on Maxwell or other Nvidia architectures. | ||
| 416 | coords_with_subpixel_offset = ImageGatherSubpixelOffset(info, texture, coords); | ||
| 417 | coords = coords_with_subpixel_offset; | ||
| 418 | } | ||
| 390 | if (!sparse_inst || !supports_sparse) { | 419 | if (!sparse_inst || !supports_sparse) { |
| 391 | if (offset.IsEmpty()) { | 420 | if (offset.IsEmpty()) { |
| 392 | ctx.Add("{}=textureGather({},{},{});", texel, texture, coords, dref); | 421 | ctx.Add("{}=textureGather({},{},{});", texel, texture, coords, dref); |
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 02073c420..968901d42 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp | |||
| @@ -261,6 +261,39 @@ Id BitTest(EmitContext& ctx, Id mask, Id bit) { | |||
| 261 | const Id bit_value{ctx.OpBitwiseAnd(ctx.U32[1], shifted, ctx.Const(1u))}; | 261 | const Id bit_value{ctx.OpBitwiseAnd(ctx.U32[1], shifted, ctx.Const(1u))}; |
| 262 | return ctx.OpINotEqual(ctx.U1, bit_value, ctx.u32_zero_value); | 262 | return ctx.OpINotEqual(ctx.U1, bit_value, ctx.u32_zero_value); |
| 263 | } | 263 | } |
| 264 | |||
| 265 | Id ImageGatherSubpixelOffset(EmitContext& ctx, const IR::TextureInstInfo& info, Id texture, | ||
| 266 | Id coords) { | ||
| 267 | // Apply a subpixel offset of 1/512 the texel size of the texture to ensure the same | ||
| 268 | // rounding on AMD hardware as on Maxwell or other Nvidia architectures. | ||
| 269 | const auto calculate_offset{[&](size_t dim) -> std::array<Id, 2> { | ||
| 270 | const Id nudge{ctx.Const(0x1p-9f)}; | ||
| 271 | const Id image_size{ctx.OpImageQuerySizeLod(ctx.U32[dim], texture, ctx.u32_zero_value)}; | ||
| 272 | const Id offset_x{ctx.OpFDiv( | ||
| 273 | ctx.F32[1], nudge, | ||
| 274 | ctx.OpConvertUToF(ctx.F32[1], ctx.OpCompositeExtract(ctx.U32[1], image_size, 0)))}; | ||
| 275 | const Id offset_y{ctx.OpFDiv( | ||
| 276 | ctx.F32[1], nudge, | ||
| 277 | ctx.OpConvertUToF(ctx.F32[1], ctx.OpCompositeExtract(ctx.U32[1], image_size, 1)))}; | ||
| 278 | return {ctx.OpFAdd(ctx.F32[1], ctx.OpCompositeExtract(ctx.F32[1], coords, 0), offset_x), | ||
| 279 | ctx.OpFAdd(ctx.F32[1], ctx.OpCompositeExtract(ctx.F32[1], coords, 1), offset_y)}; | ||
| 280 | }}; | ||
| 281 | switch (info.type) { | ||
| 282 | case TextureType::Color2D: | ||
| 283 | case TextureType::Color2DRect: { | ||
| 284 | const auto offset{calculate_offset(2)}; | ||
| 285 | return ctx.OpCompositeConstruct(ctx.F32[2], offset[0], offset[1]); | ||
| 286 | } | ||
| 287 | case TextureType::ColorArray2D: | ||
| 288 | case TextureType::ColorCube: { | ||
| 289 | const auto offset{calculate_offset(3)}; | ||
| 290 | return ctx.OpCompositeConstruct(ctx.F32[3], offset[0], offset[1], | ||
| 291 | ctx.OpCompositeExtract(ctx.F32[1], coords, 2)); | ||
| 292 | } | ||
| 293 | default: | ||
| 294 | return coords; | ||
| 295 | } | ||
| 296 | } | ||
| 264 | } // Anonymous namespace | 297 | } // Anonymous namespace |
| 265 | 298 | ||
| 266 | Id EmitBindlessImageSampleImplicitLod(EmitContext&) { | 299 | Id EmitBindlessImageSampleImplicitLod(EmitContext&) { |
| @@ -423,6 +456,9 @@ Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id | |||
| 423 | const IR::Value& offset, const IR::Value& offset2) { | 456 | const IR::Value& offset, const IR::Value& offset2) { |
| 424 | const auto info{inst->Flags<IR::TextureInstInfo>()}; | 457 | const auto info{inst->Flags<IR::TextureInstInfo>()}; |
| 425 | const ImageOperands operands(ctx, offset, offset2); | 458 | const ImageOperands operands(ctx, offset, offset2); |
| 459 | if (ctx.profile.need_gather_subpixel_offset) { | ||
| 460 | coords = ImageGatherSubpixelOffset(ctx, info, TextureImage(ctx, info, index), coords); | ||
| 461 | } | ||
| 426 | return Emit(&EmitContext::OpImageSparseGather, &EmitContext::OpImageGather, ctx, inst, | 462 | return Emit(&EmitContext::OpImageSparseGather, &EmitContext::OpImageGather, ctx, inst, |
| 427 | ctx.F32[4], Texture(ctx, info, index), coords, ctx.Const(info.gather_component), | 463 | ctx.F32[4], Texture(ctx, info, index), coords, ctx.Const(info.gather_component), |
| 428 | operands.MaskOptional(), operands.Span()); | 464 | operands.MaskOptional(), operands.Span()); |
| @@ -432,6 +468,9 @@ Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, | |||
| 432 | const IR::Value& offset, const IR::Value& offset2, Id dref) { | 468 | const IR::Value& offset, const IR::Value& offset2, Id dref) { |
| 433 | const auto info{inst->Flags<IR::TextureInstInfo>()}; | 469 | const auto info{inst->Flags<IR::TextureInstInfo>()}; |
| 434 | const ImageOperands operands(ctx, offset, offset2); | 470 | const ImageOperands operands(ctx, offset, offset2); |
| 471 | if (ctx.profile.need_gather_subpixel_offset) { | ||
| 472 | coords = ImageGatherSubpixelOffset(ctx, info, TextureImage(ctx, info, index), coords); | ||
| 473 | } | ||
| 435 | return Emit(&EmitContext::OpImageSparseDrefGather, &EmitContext::OpImageDrefGather, ctx, inst, | 474 | return Emit(&EmitContext::OpImageSparseDrefGather, &EmitContext::OpImageDrefGather, ctx, inst, |
| 436 | ctx.F32[4], Texture(ctx, info, index), coords, dref, operands.MaskOptional(), | 475 | ctx.F32[4], Texture(ctx, info, index), coords, dref, operands.MaskOptional(), |
| 437 | operands.Span()); | 476 | operands.Span()); |
diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index 253e0d0bd..31390e869 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h | |||
| @@ -52,6 +52,10 @@ struct Profile { | |||
| 52 | bool need_declared_frag_colors{}; | 52 | bool need_declared_frag_colors{}; |
| 53 | /// Prevents fast math optimizations that may cause inaccuracies | 53 | /// Prevents fast math optimizations that may cause inaccuracies |
| 54 | bool need_fastmath_off{}; | 54 | bool need_fastmath_off{}; |
| 55 | /// Some GPU vendors use a lower fixed point format of 16.8 when calculating pixel coordinates | ||
| 56 | /// in the ImageGather instruction than the Maxwell architecture does. Applying an offset | ||
| 57 | /// fixes this mismatched rounding behaviour. | ||
| 58 | bool need_gather_subpixel_offset{}; | ||
| 55 | 59 | ||
| 56 | /// OpFClamp is broken and OpFMax + OpFMin should be used instead | 60 | /// OpFClamp is broken and OpFMax + OpFMin should be used instead |
| 57 | bool has_broken_spirv_clamp{}; | 61 | bool has_broken_spirv_clamp{}; |
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 22ed16ebf..d36a0a7a1 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp | |||
| @@ -169,6 +169,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) { | |||
| 169 | has_draw_texture = GLAD_GL_NV_draw_texture; | 169 | has_draw_texture = GLAD_GL_NV_draw_texture; |
| 170 | warp_size_potentially_larger_than_guest = !is_nvidia && !is_intel; | 170 | warp_size_potentially_larger_than_guest = !is_nvidia && !is_intel; |
| 171 | need_fastmath_off = is_nvidia; | 171 | need_fastmath_off = is_nvidia; |
| 172 | need_gather_subpixel_offset = is_amd; | ||
| 172 | can_report_memory = GLAD_GL_NVX_gpu_memory_info; | 173 | can_report_memory = GLAD_GL_NVX_gpu_memory_info; |
| 173 | 174 | ||
| 174 | // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive | 175 | // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive |
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 3ff8cad83..e8104c4de 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h | |||
| @@ -160,6 +160,10 @@ public: | |||
| 160 | return need_fastmath_off; | 160 | return need_fastmath_off; |
| 161 | } | 161 | } |
| 162 | 162 | ||
| 163 | bool NeedsGatherSubpixelOffset() const { | ||
| 164 | return need_gather_subpixel_offset; | ||
| 165 | } | ||
| 166 | |||
| 163 | bool HasCbufFtouBug() const { | 167 | bool HasCbufFtouBug() const { |
| 164 | return has_cbuf_ftou_bug; | 168 | return has_cbuf_ftou_bug; |
| 165 | } | 169 | } |
| @@ -225,6 +229,7 @@ private: | |||
| 225 | bool has_draw_texture{}; | 229 | bool has_draw_texture{}; |
| 226 | bool warp_size_potentially_larger_than_guest{}; | 230 | bool warp_size_potentially_larger_than_guest{}; |
| 227 | bool need_fastmath_off{}; | 231 | bool need_fastmath_off{}; |
| 232 | bool need_gather_subpixel_offset{}; | ||
| 228 | bool has_cbuf_ftou_bug{}; | 233 | bool has_cbuf_ftou_bug{}; |
| 229 | bool has_bool_ref_bug{}; | 234 | bool has_bool_ref_bug{}; |
| 230 | bool can_report_memory{}; | 235 | bool can_report_memory{}; |
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 479bb8ba3..b40aa6f5e 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp | |||
| @@ -218,6 +218,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo | |||
| 218 | .lower_left_origin_mode = true, | 218 | .lower_left_origin_mode = true, |
| 219 | .need_declared_frag_colors = true, | 219 | .need_declared_frag_colors = true, |
| 220 | .need_fastmath_off = device.NeedsFastmathOff(), | 220 | .need_fastmath_off = device.NeedsFastmathOff(), |
| 221 | .need_gather_subpixel_offset = device.NeedsGatherSubpixelOffset(), | ||
| 221 | 222 | ||
| 222 | .has_broken_spirv_clamp = true, | 223 | .has_broken_spirv_clamp = true, |
| 223 | .has_broken_unsigned_image_offsets = true, | 224 | .has_broken_unsigned_image_offsets = true, |
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 0684cceed..f51257267 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp | |||
| @@ -329,6 +329,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device | |||
| 329 | 329 | ||
| 330 | .lower_left_origin_mode = false, | 330 | .lower_left_origin_mode = false, |
| 331 | .need_declared_frag_colors = false, | 331 | .need_declared_frag_colors = false, |
| 332 | .need_gather_subpixel_offset = device.NeedsGatherSubpixelOffset(), | ||
| 332 | 333 | ||
| 333 | .has_broken_spirv_clamp = driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS, | 334 | .has_broken_spirv_clamp = driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS, |
| 334 | .has_broken_spirv_position_input = driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY, | 335 | .has_broken_spirv_position_input = driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY, |
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 6f288b3f8..0939b62c9 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp | |||
| @@ -431,6 +431,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR | |||
| 431 | "AMD GCN4 and earlier have broken VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"); | 431 | "AMD GCN4 and earlier have broken VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"); |
| 432 | has_broken_cube_compatibility = true; | 432 | has_broken_cube_compatibility = true; |
| 433 | } | 433 | } |
| 434 | need_gather_subpixel_offset = true; | ||
| 434 | } | 435 | } |
| 435 | if (extensions.sampler_filter_minmax && is_amd) { | 436 | if (extensions.sampler_filter_minmax && is_amd) { |
| 436 | // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken. | 437 | // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken. |
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 41b5da18a..50e95bcca 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h | |||
| @@ -554,6 +554,10 @@ public: | |||
| 554 | return features.robustness2.nullDescriptor; | 554 | return features.robustness2.nullDescriptor; |
| 555 | } | 555 | } |
| 556 | 556 | ||
| 557 | bool NeedsGatherSubpixelOffset() const { | ||
| 558 | return need_gather_subpixel_offset; | ||
| 559 | } | ||
| 560 | |||
| 557 | u32 GetMaxVertexInputAttributes() const { | 561 | u32 GetMaxVertexInputAttributes() const { |
| 558 | return properties.properties.limits.maxVertexInputAttributes; | 562 | return properties.properties.limits.maxVertexInputAttributes; |
| 559 | } | 563 | } |
| @@ -664,6 +668,7 @@ private: | |||
| 664 | bool must_emulate_bgr565{}; ///< Emulates BGR565 by swizzling RGB565 format. | 668 | bool must_emulate_bgr565{}; ///< Emulates BGR565 by swizzling RGB565 format. |
| 665 | bool dynamic_state3_blending{}; ///< Has all blending features of dynamic_state3. | 669 | bool dynamic_state3_blending{}; ///< Has all blending features of dynamic_state3. |
| 666 | bool dynamic_state3_enables{}; ///< Has all enables features of dynamic_state3. | 670 | bool dynamic_state3_enables{}; ///< Has all enables features of dynamic_state3. |
| 671 | bool need_gather_subpixel_offset{}; ///< Needs offset at ImageGather for correct rounding. | ||
| 667 | u64 device_access_memory{}; ///< Total size of device local memory in bytes. | 672 | u64 device_access_memory{}; ///< Total size of device local memory in bytes. |
| 668 | u32 sets_per_pool{}; ///< Sets per Description Pool | 673 | u32 sets_per_pool{}; ///< Sets per Description Pool |
| 669 | 674 | ||