diff options
| author | 2022-12-30 13:45:00 +0000 | |
|---|---|---|
| committer | 2023-01-05 22:13:07 +0000 | |
| commit | 68ed60cee41ce93a637fd8463657b137d95fdae4 (patch) | |
| tree | 6657dc4e7ea4fbd96bb655cffc8c69a4ad8915d2 /src | |
| parent | Vulkan, OpenGL: Hook up geometry shader passthrough emulation (diff) | |
| download | yuzu-68ed60cee41ce93a637fd8463657b137d95fdae4.tar.gz yuzu-68ed60cee41ce93a637fd8463657b137d95fdae4.tar.xz yuzu-68ed60cee41ce93a637fd8463657b137d95fdae4.zip | |
shader_recompiler: Fix shuffle partitioning for GPUs with more than 64 invocations per subgroup
The existing implementation only supports GPUs with up to 64 invocations per subgroup (two emulated 32-wide partitions), and misbehaves on Adreno when invocations need to be split into 4 emulated subgroups.
Diffstat (limited to 'src')
| -rw-r--r-- | src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp | 58 |
1 file changed, 28 insertions, 30 deletions
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index 2c90f2368..c5db19d09 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp | |||
| @@ -58,11 +58,10 @@ Id SelectValue(EmitContext& ctx, Id in_range, Id value, Id src_thread_id) { | |||
| 58 | ctx.OpGroupNonUniformShuffle(ctx.U32[1], SubgroupScope(ctx), value, src_thread_id), value); | 58 | ctx.OpGroupNonUniformShuffle(ctx.U32[1], SubgroupScope(ctx), value, src_thread_id), value); |
| 59 | } | 59 | } |
| 60 | 60 | ||
| 61 | Id GetUpperClamp(EmitContext& ctx, Id invocation_id, Id clamp) { | 61 | Id AddPartitionBase(EmitContext& ctx, Id thread_id) { |
| 62 | const Id thirty_two{ctx.Const(32u)}; | 62 | const Id partition_idx{ctx.OpShiftRightLogical(ctx.U32[1], GetThreadId(ctx), ctx.Const(5u))}; |
| 63 | const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, invocation_id, thirty_two)}; | 63 | const Id partition_base{ctx.OpShiftLeftLogical(ctx.U32[1], partition_idx, ctx.Const(5u))}; |
| 64 | const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)}; | 64 | return ctx.OpIAdd(ctx.U32[1], thread_id, partition_base); |
| 65 | return ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp); | ||
| 66 | } | 65 | } |
| 67 | } // Anonymous namespace | 66 | } // Anonymous namespace |
| 68 | 67 | ||
| @@ -145,64 +144,63 @@ Id EmitSubgroupGeMask(EmitContext& ctx) { | |||
| 145 | Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, | 144 | Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, |
| 146 | Id segmentation_mask) { | 145 | Id segmentation_mask) { |
| 147 | const Id not_seg_mask{ctx.OpNot(ctx.U32[1], segmentation_mask)}; | 146 | const Id not_seg_mask{ctx.OpNot(ctx.U32[1], segmentation_mask)}; |
| 148 | const Id thread_id{GetThreadId(ctx)}; | 147 | const Id thread_id{EmitLaneId(ctx)}; |
| 149 | if (ctx.profile.warp_size_potentially_larger_than_guest) { | ||
| 150 | const Id thirty_two{ctx.Const(32u)}; | ||
| 151 | const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, thread_id, thirty_two)}; | ||
| 152 | const Id upper_index{ctx.OpIAdd(ctx.U32[1], thirty_two, index)}; | ||
| 153 | const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)}; | ||
| 154 | index = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_index, index); | ||
| 155 | clamp = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp); | ||
| 156 | } | ||
| 157 | const Id min_thread_id{ComputeMinThreadId(ctx, thread_id, segmentation_mask)}; | 148 | const Id min_thread_id{ComputeMinThreadId(ctx, thread_id, segmentation_mask)}; |
| 158 | const Id max_thread_id{ComputeMaxThreadId(ctx, min_thread_id, clamp, not_seg_mask)}; | 149 | const Id max_thread_id{ComputeMaxThreadId(ctx, min_thread_id, clamp, not_seg_mask)}; |
| 159 | 150 | ||
| 160 | const Id lhs{ctx.OpBitwiseAnd(ctx.U32[1], index, not_seg_mask)}; | 151 | const Id lhs{ctx.OpBitwiseAnd(ctx.U32[1], index, not_seg_mask)}; |
| 161 | const Id src_thread_id{ctx.OpBitwiseOr(ctx.U32[1], lhs, min_thread_id)}; | 152 | Id src_thread_id{ctx.OpBitwiseOr(ctx.U32[1], lhs, min_thread_id)}; |
| 162 | const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)}; | 153 | const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)}; |
| 163 | 154 | ||
| 155 | if (ctx.profile.warp_size_potentially_larger_than_guest) { | ||
| 156 | src_thread_id = AddPartitionBase(ctx, src_thread_id); | ||
| 157 | } | ||
| 158 | |||
| 164 | SetInBoundsFlag(inst, in_range); | 159 | SetInBoundsFlag(inst, in_range); |
| 165 | return SelectValue(ctx, in_range, value, src_thread_id); | 160 | return SelectValue(ctx, in_range, value, src_thread_id); |
| 166 | } | 161 | } |
| 167 | 162 | ||
| 168 | Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, | 163 | Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, |
| 169 | Id segmentation_mask) { | 164 | Id segmentation_mask) { |
| 170 | const Id thread_id{GetThreadId(ctx)}; | 165 | const Id thread_id{EmitLaneId(ctx)}; |
| 171 | if (ctx.profile.warp_size_potentially_larger_than_guest) { | ||
| 172 | clamp = GetUpperClamp(ctx, thread_id, clamp); | ||
| 173 | } | ||
| 174 | const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; | 166 | const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; |
| 175 | const Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)}; | 167 | Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)}; |
| 176 | const Id in_range{ctx.OpSGreaterThanEqual(ctx.U1, src_thread_id, max_thread_id)}; | 168 | const Id in_range{ctx.OpSGreaterThanEqual(ctx.U1, src_thread_id, max_thread_id)}; |
| 177 | 169 | ||
| 170 | if (ctx.profile.warp_size_potentially_larger_than_guest) { | ||
| 171 | src_thread_id = AddPartitionBase(ctx, src_thread_id); | ||
| 172 | } | ||
| 173 | |||
| 178 | SetInBoundsFlag(inst, in_range); | 174 | SetInBoundsFlag(inst, in_range); |
| 179 | return SelectValue(ctx, in_range, value, src_thread_id); | 175 | return SelectValue(ctx, in_range, value, src_thread_id); |
| 180 | } | 176 | } |
| 181 | 177 | ||
| 182 | Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, | 178 | Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, |
| 183 | Id segmentation_mask) { | 179 | Id segmentation_mask) { |
| 184 | const Id thread_id{GetThreadId(ctx)}; | 180 | const Id thread_id{EmitLaneId(ctx)}; |
| 185 | if (ctx.profile.warp_size_potentially_larger_than_guest) { | ||
| 186 | clamp = GetUpperClamp(ctx, thread_id, clamp); | ||
| 187 | } | ||
| 188 | const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; | 181 | const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; |
| 189 | const Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)}; | 182 | Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)}; |
| 190 | const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)}; | 183 | const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)}; |
| 191 | 184 | ||
| 185 | if (ctx.profile.warp_size_potentially_larger_than_guest) { | ||
| 186 | src_thread_id = AddPartitionBase(ctx, src_thread_id); | ||
| 187 | } | ||
| 188 | |||
| 192 | SetInBoundsFlag(inst, in_range); | 189 | SetInBoundsFlag(inst, in_range); |
| 193 | return SelectValue(ctx, in_range, value, src_thread_id); | 190 | return SelectValue(ctx, in_range, value, src_thread_id); |
| 194 | } | 191 | } |
| 195 | 192 | ||
| 196 | Id EmitShuffleButterfly(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, | 193 | Id EmitShuffleButterfly(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, |
| 197 | Id segmentation_mask) { | 194 | Id segmentation_mask) { |
| 198 | const Id thread_id{GetThreadId(ctx)}; | 195 | const Id thread_id{EmitLaneId(ctx)}; |
| 199 | if (ctx.profile.warp_size_potentially_larger_than_guest) { | ||
| 200 | clamp = GetUpperClamp(ctx, thread_id, clamp); | ||
| 201 | } | ||
| 202 | const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; | 196 | const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; |
| 203 | const Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)}; | 197 | Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)}; |
| 204 | const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)}; | 198 | const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)}; |
| 205 | 199 | ||
| 200 | if (ctx.profile.warp_size_potentially_larger_than_guest) { | ||
| 201 | src_thread_id = AddPartitionBase(ctx, src_thread_id); | ||
| 202 | } | ||
| 203 | |||
| 206 | SetInBoundsFlag(inst, in_range); | 204 | SetInBoundsFlag(inst, in_range); |
| 207 | return SelectValue(ctx, in_range, value, src_thread_id); | 205 | return SelectValue(ctx, in_range, value, src_thread_id); |
| 208 | } | 206 | } |