summary | refs | log | tree | commit | diff
path: root/src/shader_recompiler/backend
diff options
context:
space:
mode:
author: Morph, 2021-09-12 13:53:29 -0400
committer: GitHub, 2021-09-12 13:53:29 -0400
commit 9248442bb2759c071b565a10e959883980ff09d6 (patch)
tree 1058129dc40331f7f688d32467da17214a2dc013 /src/shader_recompiler/backend
parent Merge pull request #6975 from ogniK5377/acc-async-ctx (diff)
parent emit_glsl_warp: Fix shuffle ops for 64-thread warp sizes (diff)
download yuzu-9248442bb2759c071b565a10e959883980ff09d6.tar.gz
yuzu-9248442bb2759c071b565a10e959883980ff09d6.tar.xz
yuzu-9248442bb2759c071b565a10e959883980ff09d6.zip
Merge pull request #6948 from ameerj/amd-warp-fix
shaders: Fix warp instructions on 64-thread warp devices
Diffstat (limited to 'src/shader_recompiler/backend')
-rw-r--r-- src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp 122
-rw-r--r-- src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp 41
2 files changed, 109 insertions, 54 deletions
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
index a982dd8a2..cd285e2c8 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
@@ -11,6 +11,8 @@
11 11
12namespace Shader::Backend::GLSL { 12namespace Shader::Backend::GLSL {
13namespace { 13namespace {
14constexpr char THREAD_ID[]{"gl_SubGroupInvocationARB"};
15
14void SetInBoundsFlag(EmitContext& ctx, IR::Inst& inst) { 16void SetInBoundsFlag(EmitContext& ctx, IR::Inst& inst) {
15 IR::Inst* const in_bounds{inst.GetAssociatedPseudoOperation(IR::Opcode::GetInBoundsFromOp)}; 17 IR::Inst* const in_bounds{inst.GetAssociatedPseudoOperation(IR::Opcode::GetInBoundsFromOp)};
16 if (!in_bounds) { 18 if (!in_bounds) {
@@ -43,84 +45,100 @@ void UseShuffleNv(EmitContext& ctx, IR::Inst& inst, std::string_view shfl_op,
43 ctx.AddU32("{}={}({},{},{},shfl_in_bounds);", inst, shfl_op, value, index, width); 45 ctx.AddU32("{}={}({},{},{},shfl_in_bounds);", inst, shfl_op, value, index, width);
44 SetInBoundsFlag(ctx, inst); 46 SetInBoundsFlag(ctx, inst);
45} 47}
48
49std::string_view BallotIndex(EmitContext& ctx) {
50 if (!ctx.profile.warp_size_potentially_larger_than_guest) {
51 return ".x";
52 }
53 return "[gl_SubGroupInvocationARB>>5]";
54}
55
56std::string GetMask(EmitContext& ctx, std::string_view mask) {
57 const auto ballot_index{BallotIndex(ctx)};
58 return fmt::format("uint(uvec2({}){})", mask, ballot_index);
59}
46} // Anonymous namespace 60} // Anonymous namespace
47 61
48void EmitLaneId(EmitContext& ctx, IR::Inst& inst) { 62void EmitLaneId(EmitContext& ctx, IR::Inst& inst) {
49 ctx.AddU32("{}=gl_SubGroupInvocationARB&31u;", inst); 63 ctx.AddU32("{}={}&31u;", inst, THREAD_ID);
50} 64}
51 65
52void EmitVoteAll(EmitContext& ctx, IR::Inst& inst, std::string_view pred) { 66void EmitVoteAll(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
53 if (!ctx.profile.warp_size_potentially_larger_than_guest) { 67 if (!ctx.profile.warp_size_potentially_larger_than_guest) {
54 ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred); 68 ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred);
55 } else { 69 return;
56 const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
57 const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
58 ctx.AddU1("{}=({}&{})=={};", inst, ballot, active_mask, active_mask);
59 } 70 }
71 const auto ballot_index{BallotIndex(ctx)};
72 const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
73 const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
74 ctx.AddU1("{}=({}&{})=={};", inst, ballot, active_mask, active_mask);
60} 75}
61 76
62void EmitVoteAny(EmitContext& ctx, IR::Inst& inst, std::string_view pred) { 77void EmitVoteAny(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
63 if (!ctx.profile.warp_size_potentially_larger_than_guest) { 78 if (!ctx.profile.warp_size_potentially_larger_than_guest) {
64 ctx.AddU1("{}=anyInvocationARB({});", inst, pred); 79 ctx.AddU1("{}=anyInvocationARB({});", inst, pred);
65 } else { 80 return;
66 const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
67 const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
68 ctx.AddU1("{}=({}&{})!=0u;", inst, ballot, active_mask, active_mask);
69 } 81 }
82 const auto ballot_index{BallotIndex(ctx)};
83 const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
84 const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
85 ctx.AddU1("{}=({}&{})!=0u;", inst, ballot, active_mask, active_mask);
70} 86}
71 87
72void EmitVoteEqual(EmitContext& ctx, IR::Inst& inst, std::string_view pred) { 88void EmitVoteEqual(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
73 if (!ctx.profile.warp_size_potentially_larger_than_guest) { 89 if (!ctx.profile.warp_size_potentially_larger_than_guest) {
74 ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred); 90 ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred);
75 } else { 91 return;
76 const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
77 const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
78 const auto value{fmt::format("({}^{})", ballot, active_mask)};
79 ctx.AddU1("{}=({}==0)||({}=={});", inst, value, value, active_mask);
80 } 92 }
93 const auto ballot_index{BallotIndex(ctx)};
94 const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
95 const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
96 const auto value{fmt::format("({}^{})", ballot, active_mask)};
97 ctx.AddU1("{}=({}==0)||({}=={});", inst, value, value, active_mask);
81} 98}
82 99
83void EmitSubgroupBallot(EmitContext& ctx, IR::Inst& inst, std::string_view pred) { 100void EmitSubgroupBallot(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
84 if (!ctx.profile.warp_size_potentially_larger_than_guest) { 101 const auto ballot_index{BallotIndex(ctx)};
85 ctx.AddU32("{}=uvec2(ballotARB({})).x;", inst, pred); 102 ctx.AddU32("{}=uvec2(ballotARB({})){};", inst, pred, ballot_index);
86 } else {
87 ctx.AddU32("{}=uvec2(ballotARB({}))[gl_SubGroupInvocationARB];", inst, pred);
88 }
89} 103}
90 104
91void EmitSubgroupEqMask(EmitContext& ctx, IR::Inst& inst) { 105void EmitSubgroupEqMask(EmitContext& ctx, IR::Inst& inst) {
92 ctx.AddU32("{}=uint(gl_SubGroupEqMaskARB.x);", inst); 106 ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupEqMaskARB"));
93} 107}
94 108
95void EmitSubgroupLtMask(EmitContext& ctx, IR::Inst& inst) { 109void EmitSubgroupLtMask(EmitContext& ctx, IR::Inst& inst) {
96 ctx.AddU32("{}=uint(gl_SubGroupLtMaskARB.x);", inst); 110 ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupLtMaskARB"));
97} 111}
98 112
99void EmitSubgroupLeMask(EmitContext& ctx, IR::Inst& inst) { 113void EmitSubgroupLeMask(EmitContext& ctx, IR::Inst& inst) {
100 ctx.AddU32("{}=uint(gl_SubGroupLeMaskARB.x);", inst); 114 ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupLeMaskARB"));
101} 115}
102 116
103void EmitSubgroupGtMask(EmitContext& ctx, IR::Inst& inst) { 117void EmitSubgroupGtMask(EmitContext& ctx, IR::Inst& inst) {
104 ctx.AddU32("{}=uint(gl_SubGroupGtMaskARB.x);", inst); 118 ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupGtMaskARB"));
105} 119}
106 120
107void EmitSubgroupGeMask(EmitContext& ctx, IR::Inst& inst) { 121void EmitSubgroupGeMask(EmitContext& ctx, IR::Inst& inst) {
108 ctx.AddU32("{}=uint(gl_SubGroupGeMaskARB.x);", inst); 122 ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupGeMaskARB"));
109} 123}
110 124
111void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value, 125void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
112 std::string_view index, std::string_view clamp, 126 std::string_view index, std::string_view clamp, std::string_view seg_mask) {
113 std::string_view segmentation_mask) {
114 if (ctx.profile.support_gl_warp_intrinsics) { 127 if (ctx.profile.support_gl_warp_intrinsics) {
115 UseShuffleNv(ctx, inst, "shuffleNV", value, index, clamp, segmentation_mask); 128 UseShuffleNv(ctx, inst, "shuffleNV", value, index, clamp, seg_mask);
116 return; 129 return;
117 } 130 }
118 const auto not_seg_mask{fmt::format("(~{})", segmentation_mask)}; 131 const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
119 const auto thread_id{"gl_SubGroupInvocationARB"}; 132 const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
120 const auto min_thread_id{ComputeMinThreadId(thread_id, segmentation_mask)}; 133 const auto upper_index{fmt::format("{}?{}+32:{}", is_upper_partition, index, index)};
121 const auto max_thread_id{ComputeMaxThreadId(min_thread_id, clamp, not_seg_mask)}; 134 const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
135
136 const auto not_seg_mask{fmt::format("(~{})", seg_mask)};
137 const auto min_thread_id{ComputeMinThreadId(THREAD_ID, seg_mask)};
138 const auto max_thread_id{
139 ComputeMaxThreadId(min_thread_id, big_warp ? upper_clamp : clamp, not_seg_mask)};
122 140
123 const auto lhs{fmt::format("({}&{})", index, not_seg_mask)}; 141 const auto lhs{fmt::format("({}&{})", big_warp ? upper_index : index, not_seg_mask)};
124 const auto src_thread_id{fmt::format("({})|({})", lhs, min_thread_id)}; 142 const auto src_thread_id{fmt::format("({})|({})", lhs, min_thread_id)};
125 ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id); 143 ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
126 SetInBoundsFlag(ctx, inst); 144 SetInBoundsFlag(ctx, inst);
@@ -128,29 +146,34 @@ void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
128} 146}
129 147
130void EmitShuffleUp(EmitContext& ctx, IR::Inst& inst, std::string_view value, std::string_view index, 148void EmitShuffleUp(EmitContext& ctx, IR::Inst& inst, std::string_view value, std::string_view index,
131 std::string_view clamp, std::string_view segmentation_mask) { 149 std::string_view clamp, std::string_view seg_mask) {
132 if (ctx.profile.support_gl_warp_intrinsics) { 150 if (ctx.profile.support_gl_warp_intrinsics) {
133 UseShuffleNv(ctx, inst, "shuffleUpNV", value, index, clamp, segmentation_mask); 151 UseShuffleNv(ctx, inst, "shuffleUpNV", value, index, clamp, seg_mask);
134 return; 152 return;
135 } 153 }
136 const auto thread_id{"gl_SubGroupInvocationARB"}; 154 const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
137 const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)}; 155 const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
138 const auto src_thread_id{fmt::format("({}-{})", thread_id, index)}; 156 const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
157
158 const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
159 const auto src_thread_id{fmt::format("({}-{})", THREAD_ID, index)};
139 ctx.Add("shfl_in_bounds=int({})>=int({});", src_thread_id, max_thread_id); 160 ctx.Add("shfl_in_bounds=int({})>=int({});", src_thread_id, max_thread_id);
140 SetInBoundsFlag(ctx, inst); 161 SetInBoundsFlag(ctx, inst);
141 ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value); 162 ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
142} 163}
143 164
144void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value, 165void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,
145 std::string_view index, std::string_view clamp, 166 std::string_view index, std::string_view clamp, std::string_view seg_mask) {
146 std::string_view segmentation_mask) {
147 if (ctx.profile.support_gl_warp_intrinsics) { 167 if (ctx.profile.support_gl_warp_intrinsics) {
148 UseShuffleNv(ctx, inst, "shuffleDownNV", value, index, clamp, segmentation_mask); 168 UseShuffleNv(ctx, inst, "shuffleDownNV", value, index, clamp, seg_mask);
149 return; 169 return;
150 } 170 }
151 const auto thread_id{"gl_SubGroupInvocationARB"}; 171 const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
152 const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)}; 172 const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
153 const auto src_thread_id{fmt::format("({}+{})", thread_id, index)}; 173 const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
174
175 const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
176 const auto src_thread_id{fmt::format("({}+{})", THREAD_ID, index)};
154 ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id); 177 ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
155 SetInBoundsFlag(ctx, inst); 178 SetInBoundsFlag(ctx, inst);
156 ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value); 179 ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
@@ -158,14 +181,17 @@ void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,
158 181
159void EmitShuffleButterfly(EmitContext& ctx, IR::Inst& inst, std::string_view value, 182void EmitShuffleButterfly(EmitContext& ctx, IR::Inst& inst, std::string_view value,
160 std::string_view index, std::string_view clamp, 183 std::string_view index, std::string_view clamp,
161 std::string_view segmentation_mask) { 184 std::string_view seg_mask) {
162 if (ctx.profile.support_gl_warp_intrinsics) { 185 if (ctx.profile.support_gl_warp_intrinsics) {
163 UseShuffleNv(ctx, inst, "shuffleXorNV", value, index, clamp, segmentation_mask); 186 UseShuffleNv(ctx, inst, "shuffleXorNV", value, index, clamp, seg_mask);
164 return; 187 return;
165 } 188 }
166 const auto thread_id{"gl_SubGroupInvocationARB"}; 189 const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
167 const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)}; 190 const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
168 const auto src_thread_id{fmt::format("({}^{})", thread_id, index)}; 191 const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
192
193 const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
194 const auto src_thread_id{fmt::format("({}^{})", THREAD_ID, index)};
169 ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id); 195 ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
170 SetInBoundsFlag(ctx, inst); 196 SetInBoundsFlag(ctx, inst);
171 ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value); 197 ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
index 78b1e1ba7..cef52c56e 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
@@ -7,8 +7,13 @@
7 7
8namespace Shader::Backend::SPIRV { 8namespace Shader::Backend::SPIRV {
9namespace { 9namespace {
10Id GetThreadId(EmitContext& ctx) {
11 return ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id);
12}
13
10Id WarpExtract(EmitContext& ctx, Id value) { 14Id WarpExtract(EmitContext& ctx, Id value) {
11 const Id local_index{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)}; 15 const Id thread_id{GetThreadId(ctx)};
16 const Id local_index{ctx.OpShiftRightArithmetic(ctx.U32[1], thread_id, ctx.Const(5U))};
12 return ctx.OpVectorExtractDynamic(ctx.U32[1], value, local_index); 17 return ctx.OpVectorExtractDynamic(ctx.U32[1], value, local_index);
13} 18}
14 19
@@ -48,10 +53,17 @@ Id SelectValue(EmitContext& ctx, Id in_range, Id value, Id src_thread_id) {
48 return ctx.OpSelect(ctx.U32[1], in_range, 53 return ctx.OpSelect(ctx.U32[1], in_range,
49 ctx.OpSubgroupReadInvocationKHR(ctx.U32[1], value, src_thread_id), value); 54 ctx.OpSubgroupReadInvocationKHR(ctx.U32[1], value, src_thread_id), value);
50} 55}
56
57Id GetUpperClamp(EmitContext& ctx, Id invocation_id, Id clamp) {
58 const Id thirty_two{ctx.Const(32u)};
59 const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, invocation_id, thirty_two)};
60 const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
61 return ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
62}
51} // Anonymous namespace 63} // Anonymous namespace
52 64
53Id EmitLaneId(EmitContext& ctx) { 65Id EmitLaneId(EmitContext& ctx) {
54 const Id id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)}; 66 const Id id{GetThreadId(ctx)};
55 if (!ctx.profile.warp_size_potentially_larger_than_guest) { 67 if (!ctx.profile.warp_size_potentially_larger_than_guest) {
56 return id; 68 return id;
57 } 69 }
@@ -123,7 +135,15 @@ Id EmitSubgroupGeMask(EmitContext& ctx) {
123Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, 135Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
124 Id segmentation_mask) { 136 Id segmentation_mask) {
125 const Id not_seg_mask{ctx.OpNot(ctx.U32[1], segmentation_mask)}; 137 const Id not_seg_mask{ctx.OpNot(ctx.U32[1], segmentation_mask)};
126 const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)}; 138 const Id thread_id{GetThreadId(ctx)};
139 if (ctx.profile.warp_size_potentially_larger_than_guest) {
140 const Id thirty_two{ctx.Const(32u)};
141 const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, thread_id, thirty_two)};
142 const Id upper_index{ctx.OpIAdd(ctx.U32[1], thirty_two, index)};
143 const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
144 index = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_index, index);
145 clamp = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
146 }
127 const Id min_thread_id{ComputeMinThreadId(ctx, thread_id, segmentation_mask)}; 147 const Id min_thread_id{ComputeMinThreadId(ctx, thread_id, segmentation_mask)};
128 const Id max_thread_id{ComputeMaxThreadId(ctx, min_thread_id, clamp, not_seg_mask)}; 148 const Id max_thread_id{ComputeMaxThreadId(ctx, min_thread_id, clamp, not_seg_mask)};
129 149
@@ -137,7 +157,10 @@ Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id cla
137 157
138Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, 158Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
139 Id segmentation_mask) { 159 Id segmentation_mask) {
140 const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)}; 160 const Id thread_id{GetThreadId(ctx)};
161 if (ctx.profile.warp_size_potentially_larger_than_guest) {
162 clamp = GetUpperClamp(ctx, thread_id, clamp);
163 }
141 const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; 164 const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
142 const Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)}; 165 const Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)};
143 const Id in_range{ctx.OpSGreaterThanEqual(ctx.U1, src_thread_id, max_thread_id)}; 166 const Id in_range{ctx.OpSGreaterThanEqual(ctx.U1, src_thread_id, max_thread_id)};
@@ -148,7 +171,10 @@ Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
148 171
149Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, 172Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
150 Id segmentation_mask) { 173 Id segmentation_mask) {
151 const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)}; 174 const Id thread_id{GetThreadId(ctx)};
175 if (ctx.profile.warp_size_potentially_larger_than_guest) {
176 clamp = GetUpperClamp(ctx, thread_id, clamp);
177 }
152 const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; 178 const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
153 const Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)}; 179 const Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)};
154 const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)}; 180 const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};
@@ -159,7 +185,10 @@ Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clam
159 185
160Id EmitShuffleButterfly(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, 186Id EmitShuffleButterfly(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
161 Id segmentation_mask) { 187 Id segmentation_mask) {
162 const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)}; 188 const Id thread_id{GetThreadId(ctx)};
189 if (ctx.profile.warp_size_potentially_larger_than_guest) {
190 clamp = GetUpperClamp(ctx, thread_id, clamp);
191 }
163 const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; 192 const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
164 const Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)}; 193 const Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)};
165 const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)}; 194 const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};