author     2021-09-12 13:53:29 -0400
committer  2021-09-12 13:53:29 -0400
commit     9248442bb2759c071b565a10e959883980ff09d6 (patch)
tree       1058129dc40331f7f688d32467da17214a2dc013 /src/shader_recompiler/backend
parent     Merge pull request #6975 from ogniK5377/acc-async-ctx (diff)
parent     emit_glsl_warp: Fix shuffle ops for 64-thread warp sizes (diff)
download   yuzu-9248442bb2759c071b565a10e959883980ff09d6.tar.gz
           yuzu-9248442bb2759c071b565a10e959883980ff09d6.tar.xz
           yuzu-9248442bb2759c071b565a10e959883980ff09d6.zip
Merge pull request #6948 from ameerj/amd-warp-fix
shaders: Fix warp instructions on 64-thread warp devices
Diffstat (limited to 'src/shader_recompiler/backend')
 -rw-r--r--  src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp   | 122
 -rw-r--r--  src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp |  41
 2 files changed, 109 insertions, 54 deletions
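Note: this fix targets GPUs whose hardware subgroup is 64 threads wide (the branch name amd-warp-fix suggests AMD) while the guest warp is 32 threads. `ballotARB` yields a 64-bit mask exposed as a `uvec2`, so the emitters below now pick the 32-bit word containing the current invocation (`invocation >> 5`) instead of always reading `.x`, keep the guest lane id in 0..31 (`invocation & 31`), and shift shuffle indices/clamps by +32 for invocations in the upper half. The following host-side C++ sketch models only that indexing; it is illustrative and not part of the patch.

```cpp
#include <cstdint>
#include <cstdio>

// Model of a 64-wide subgroup: the ballot is 64 bits, exposed to the shader
// as two 32-bit words (the uvec2 produced by uvec2(ballotARB(...))).
struct Ballot64 {
    uint64_t bits;

    // Word selection corresponding to BallotIndex(): ".x" on 32-wide warps,
    // "[gl_SubGroupInvocationARB>>5]" on 64-wide warps.
    uint32_t WordFor(uint32_t invocation) const {
        return static_cast<uint32_t>(bits >> ((invocation >> 5) * 32));
    }
};

int main() {
    const Ballot64 ballot{0xDEADBEEF'00C0FFEEULL};
    for (uint32_t invocation : {3u, 37u}) {
        const uint32_t lane = invocation & 31u;           // EmitLaneId: guest lane id stays in 0..31
        const uint32_t word = ballot.WordFor(invocation); // the 32-bit half this invocation reads
        const bool set = (word >> lane) & 1u;             // e.g. testing this lane's ballot bit
        std::printf("invocation %2u -> lane %2u, word 0x%08X, bit %d\n", invocation, lane, word, set);
    }
}
```

Running it prints the lower word for invocation 3 and the upper word for invocation 37, mirroring what the new `BallotIndex()`/`EmitLaneId()` paths emit in GLSL below.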
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
index a982dd8a2..cd285e2c8 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
@@ -11,6 +11,8 @@
 
 namespace Shader::Backend::GLSL {
 namespace {
+constexpr char THREAD_ID[]{"gl_SubGroupInvocationARB"};
+
 void SetInBoundsFlag(EmitContext& ctx, IR::Inst& inst) {
     IR::Inst* const in_bounds{inst.GetAssociatedPseudoOperation(IR::Opcode::GetInBoundsFromOp)};
     if (!in_bounds) {
@@ -43,84 +45,100 @@ void UseShuffleNv(EmitContext& ctx, IR::Inst& inst, std::string_view shfl_op,
     ctx.AddU32("{}={}({},{},{},shfl_in_bounds);", inst, shfl_op, value, index, width);
     SetInBoundsFlag(ctx, inst);
 }
+
+std::string_view BallotIndex(EmitContext& ctx) {
+    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
+        return ".x";
+    }
+    return "[gl_SubGroupInvocationARB>>5]";
+}
+
+std::string GetMask(EmitContext& ctx, std::string_view mask) {
+    const auto ballot_index{BallotIndex(ctx)};
+    return fmt::format("uint(uvec2({}){})", mask, ballot_index);
+}
 } // Anonymous namespace
 
 void EmitLaneId(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=gl_SubGroupInvocationARB&31u;", inst);
+    ctx.AddU32("{}={}&31u;", inst, THREAD_ID);
 }
 
 void EmitVoteAll(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred);
-    } else {
-        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
-        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
-        ctx.AddU1("{}=({}&{})=={};", inst, ballot, active_mask, active_mask);
+        return;
     }
+    const auto ballot_index{BallotIndex(ctx)};
+    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
+    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
+    ctx.AddU1("{}=({}&{})=={};", inst, ballot, active_mask, active_mask);
 }
 
 void EmitVoteAny(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         ctx.AddU1("{}=anyInvocationARB({});", inst, pred);
-    } else {
-        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
-        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
-        ctx.AddU1("{}=({}&{})!=0u;", inst, ballot, active_mask, active_mask);
+        return;
     }
+    const auto ballot_index{BallotIndex(ctx)};
+    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
+    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
+    ctx.AddU1("{}=({}&{})!=0u;", inst, ballot, active_mask, active_mask);
 }
 
 void EmitVoteEqual(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred);
-    } else {
-        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
-        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
-        const auto value{fmt::format("({}^{})", ballot, active_mask)};
-        ctx.AddU1("{}=({}==0)||({}=={});", inst, value, value, active_mask);
+        return;
     }
+    const auto ballot_index{BallotIndex(ctx)};
+    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
+    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
+    const auto value{fmt::format("({}^{})", ballot, active_mask)};
+    ctx.AddU1("{}=({}==0)||({}=={});", inst, value, value, active_mask);
 }
 
 void EmitSubgroupBallot(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
-    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
-        ctx.AddU32("{}=uvec2(ballotARB({})).x;", inst, pred);
-    } else {
-        ctx.AddU32("{}=uvec2(ballotARB({}))[gl_SubGroupInvocationARB];", inst, pred);
-    }
+    const auto ballot_index{BallotIndex(ctx)};
+    ctx.AddU32("{}=uvec2(ballotARB({})){};", inst, pred, ballot_index);
 }
 
 void EmitSubgroupEqMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupEqMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupEqMaskARB"));
 }
 
 void EmitSubgroupLtMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupLtMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupLtMaskARB"));
 }
 
 void EmitSubgroupLeMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupLeMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupLeMaskARB"));
}
 
 void EmitSubgroupGtMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupGtMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupGtMaskARB"));
 }
 
 void EmitSubgroupGeMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupGeMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupGeMaskARB"));
 }
 
 void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
-                      std::string_view index, std::string_view clamp,
-                      std::string_view segmentation_mask) {
+                      std::string_view index, std::string_view clamp, std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto not_seg_mask{fmt::format("(~{})", segmentation_mask)};
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto min_thread_id{ComputeMinThreadId(thread_id, segmentation_mask)};
-    const auto max_thread_id{ComputeMaxThreadId(min_thread_id, clamp, not_seg_mask)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_index{fmt::format("{}?{}+32:{}", is_upper_partition, index, index)};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto not_seg_mask{fmt::format("(~{})", seg_mask)};
+    const auto min_thread_id{ComputeMinThreadId(THREAD_ID, seg_mask)};
+    const auto max_thread_id{
+        ComputeMaxThreadId(min_thread_id, big_warp ? upper_clamp : clamp, not_seg_mask)};
 
-    const auto lhs{fmt::format("({}&{})", index, not_seg_mask)};
+    const auto lhs{fmt::format("({}&{})", big_warp ? upper_index : index, not_seg_mask)};
     const auto src_thread_id{fmt::format("({})|({})", lhs, min_thread_id)};
     ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
@@ -128,29 +146,34 @@ void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
 }
 
 void EmitShuffleUp(EmitContext& ctx, IR::Inst& inst, std::string_view value, std::string_view index,
-                   std::string_view clamp, std::string_view segmentation_mask) {
+                   std::string_view clamp, std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleUpNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleUpNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
-    const auto src_thread_id{fmt::format("({}-{})", thread_id, index)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
+    const auto src_thread_id{fmt::format("({}-{})", THREAD_ID, index)};
     ctx.Add("shfl_in_bounds=int({})>=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
     ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
 }
 
 void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,
-                     std::string_view index, std::string_view clamp,
-                     std::string_view segmentation_mask) {
+                     std::string_view index, std::string_view clamp, std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleDownNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleDownNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
-    const auto src_thread_id{fmt::format("({}+{})", thread_id, index)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
+    const auto src_thread_id{fmt::format("({}+{})", THREAD_ID, index)};
     ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
     ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
@@ -158,14 +181,17 @@ void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,
 
 void EmitShuffleButterfly(EmitContext& ctx, IR::Inst& inst, std::string_view value,
                           std::string_view index, std::string_view clamp,
-                          std::string_view segmentation_mask) {
+                          std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleXorNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleXorNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
-    const auto src_thread_id{fmt::format("({}^{})", thread_id, index)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
+    const auto src_thread_id{fmt::format("({}^{})", THREAD_ID, index)};
     ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
     ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
index 78b1e1ba7..cef52c56e 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
@@ -7,8 +7,13 @@
 
 namespace Shader::Backend::SPIRV {
 namespace {
+Id GetThreadId(EmitContext& ctx) {
+    return ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id);
+}
+
 Id WarpExtract(EmitContext& ctx, Id value) {
-    const Id local_index{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    const Id local_index{ctx.OpShiftRightArithmetic(ctx.U32[1], thread_id, ctx.Const(5U))};
     return ctx.OpVectorExtractDynamic(ctx.U32[1], value, local_index);
 }
 
@@ -48,10 +53,17 @@ Id SelectValue(EmitContext& ctx, Id in_range, Id value, Id src_thread_id) {
     return ctx.OpSelect(ctx.U32[1], in_range,
                         ctx.OpSubgroupReadInvocationKHR(ctx.U32[1], value, src_thread_id), value);
 }
+
+Id GetUpperClamp(EmitContext& ctx, Id invocation_id, Id clamp) {
+    const Id thirty_two{ctx.Const(32u)};
+    const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, invocation_id, thirty_two)};
+    const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
+    return ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
+}
 } // Anonymous namespace
 
 Id EmitLaneId(EmitContext& ctx) {
-    const Id id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id id{GetThreadId(ctx)};
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         return id;
     }
@@ -123,7 +135,15 @@ Id EmitSubgroupGeMask(EmitContext& ctx) {
 Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                     Id segmentation_mask) {
     const Id not_seg_mask{ctx.OpNot(ctx.U32[1], segmentation_mask)};
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        const Id thirty_two{ctx.Const(32u)};
+        const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, thread_id, thirty_two)};
+        const Id upper_index{ctx.OpIAdd(ctx.U32[1], thirty_two, index)};
+        const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
+        index = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_index, index);
+        clamp = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
+    }
     const Id min_thread_id{ComputeMinThreadId(ctx, thread_id, segmentation_mask)};
     const Id max_thread_id{ComputeMaxThreadId(ctx, min_thread_id, clamp, not_seg_mask)};
 
@@ -137,7 +157,10 @@ Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id cla
 
 Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                  Id segmentation_mask) {
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        clamp = GetUpperClamp(ctx, thread_id, clamp);
+    }
     const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
     const Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)};
     const Id in_range{ctx.OpSGreaterThanEqual(ctx.U1, src_thread_id, max_thread_id)};
@@ -148,7 +171,10 @@ Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
 
 Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                    Id segmentation_mask) {
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        clamp = GetUpperClamp(ctx, thread_id, clamp);
+    }
     const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
     const Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)};
     const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};
@@ -159,7 +185,10 @@ Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clam
 
 Id EmitShuffleButterfly(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                         Id segmentation_mask) {
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        clamp = GetUpperClamp(ctx, thread_id, clamp);
+    }
     const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
     const Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)};
     const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};
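For the shuffle paths, both backends keep the guest's 32-wide window semantics: the source lane is computed inside the invocation's own half of the 64-wide subgroup, and for invocations 32..63 the clamp (and, for indexed shuffles, the index) is shifted up by 32. A small C++ model of that bounds check follows; the helper name `ShuffleIndexSource` is hypothetical, and the min/max formulas are an assumption based on common shfl lane-window semantics (`ComputeMinThreadId`/`ComputeMaxThreadId` are defined outside the hunks above).

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical model of the shfl.idx lane-window check on a 64-wide subgroup.
// Only the +32 upper-partition adjustment mirrors the patch itself; the min/max
// window formulas below are assumed, not taken from the diff.
static uint32_t ShuffleIndexSource(uint32_t tid, uint32_t index, uint32_t clamp,
                                   uint32_t seg_mask, bool& in_bounds) {
    if (tid >= 32) {  // is_upper_partition: keep the window in the upper half
        index += 32;
        clamp += 32;
    }
    const uint32_t min_tid = tid & seg_mask;                // assumed ComputeMinThreadId
    const uint32_t max_tid = min_tid | (clamp & ~seg_mask); // assumed ComputeMaxThreadId
    const uint32_t src = (index & ~seg_mask) | min_tid;     // matches the emitted lhs | min
    in_bounds = src <= max_tid;                             // shfl_in_bounds
    return in_bounds ? src : tid;  // out-of-bounds lanes keep their own value
}

int main() {
    bool ok{};
    // Invocation 40 (upper partition) reading guest lane 5, clamp 31, no segmentation:
    const uint32_t src = ShuffleIndexSource(40, 5, 31, 0, ok);
    std::printf("src=%u in_bounds=%d\n", src, ok); // prints src=37 in_bounds=1
}
```

In this example guest lane 5 resolves to physical lane 37, i.e. lane 5 of the upper 32-thread partition, which is the behavior the upper-partition adjustment is meant to preserve.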