author     2023-08-24 03:58:59 +0200
committer  2023-09-23 23:05:30 +0200
commit     57d8cd6c40bbadeb30e7a4792267061cbad4d446 (patch)
tree       4ed8c078eee5983e875e5104cf0ff61242964185 /src/video_core/host_shaders
parent     Query Cache: Fix behavior in Normal Accuracy (diff)
download   yuzu-57d8cd6c40bbadeb30e7a4792267061cbad4d446.tar.gz
           yuzu-57d8cd6c40bbadeb30e7a4792267061cbad4d446.tar.xz
           yuzu-57d8cd6c40bbadeb30e7a4792267061cbad4d446.zip
Query Cache: Fix Prefix Sums
Diffstat (limited to 'src/video_core/host_shaders')
-rw-r--r--   src/video_core/host_shaders/queries_prefix_scan_sum.comp              | 131
-rw-r--r--   src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp  |  60
2 files changed, 109 insertions, 82 deletions
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
index 8f10e248e..6faa8981f 100644
--- a/src/video_core/host_shaders/queries_prefix_scan_sum.comp
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
@@ -34,11 +34,16 @@
 #endif
 
 BEGIN_PUSH_CONSTANTS
-UNIFORM(0) uint max_accumulation_base;
-UNIFORM(1) uint accumulation_limit;
+UNIFORM(0) uint min_accumulation_base;
+UNIFORM(1) uint max_accumulation_base;
+UNIFORM(2) uint accumulation_limit;
+UNIFORM(3) uint buffer_offset;
 END_PUSH_CONSTANTS
 
-layout(local_size_x = 32) in;
+#define LOCAL_RESULTS 8
+#define QUERIES_PER_INVOC 2048
+
+layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;
 
 layout(std430, binding = 0) readonly buffer block1 {
     uvec2 input_data[];
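Note on the new layout: QUERIES_PER_INVOC / LOCAL_RESULTS gives local_size_x = 2048 / 8 = 256, so one workgroup now covers 2048 query results with 256 invocations, each owning 8 consecutive entries starting at the new buffer_offset push constant (despite its name, QUERIES_PER_INVOC counts queries per workgroup, not per invocation). A quick host-side C++ check of that arithmetic, for illustration only and not part of the commit:

    #include <cstdint>
    #include <cstdio>

    constexpr uint32_t QUERIES_PER_INVOC = 2048;
    constexpr uint32_t LOCAL_RESULTS = 8;
    constexpr uint32_t local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS;
    static_assert(local_size_x == 256, "256 invocations x 8 results = 2048 queries per workgroup");

    int main() {
        const uint32_t buffer_offset = 0; // hypothetical value of the new push constant
        for (uint32_t id : {0u, 1u, local_size_x - 1}) {
            const uint32_t first = buffer_offset + id * LOCAL_RESULTS;
            std::printf("invocation %u -> queries [%u, %u)\n", id, first, first + LOCAL_RESULTS);
        }
    }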
@@ -52,7 +57,7 @@ layout(std430, binding = 2) coherent buffer block3 {
     uvec2 accumulated_data;
 };
 
-shared uvec2 shared_data[2];
+shared uvec2 shared_data[128];
 
 // Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64
 uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
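Both shaders keep query counters as uvec2 pairs (low word in .x, high word in .y) because 64-bit integers are not available on every target GPU; AddUint64, visible as context above, adds the low words and carries into the high words. A minimal host-side C++ sketch of that limb arithmetic (illustration only; the pair layout mirrors the shader, everything else is assumed):

    #include <cassert>
    #include <cstdint>
    #include <utility>

    using U64Pair = std::pair<uint32_t, uint32_t>; // {low word, high word}, like the shader's uvec2

    U64Pair AddUint64(U64Pair a, U64Pair b) {
        const uint32_t lo = a.first + b.first;
        const uint32_t carry = lo < a.first ? 1u : 0u; // unsigned wrap-around means a carry out of the low word
        return {lo, a.second + b.second + carry};
    }

    int main() {
        // 0xFFFFFFFF + 1 must carry into the high word.
        const U64Pair r = AddUint64({0xFFFFFFFFu, 0u}, {1u, 0u});
        assert(r.first == 0u && r.second == 1u);
    }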
@@ -67,8 +72,8 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
 uvec2 subgroupInclusiveAddUint64(uvec2 value) {
     uvec2 result = value;
     for (uint i = 1; i < gl_SubgroupSize; i *= 2) {
+        uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i;
         if (i <= gl_SubgroupInvocationID) {
-            uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i;
             result = AddUint64(result, other);
         }
     }
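The loop in subgroupInclusiveAddUint64 is a stride-doubling (Hillis-Steele style) inclusive scan: at stride i, lane j adds the value held by lane j - i. The fix hoists subgroupShuffleUp out of the branch; a shuffle reads another invocation's data, and its result can be undefined when the source lane is not executing the call, so issuing it unconditionally keeps the lower lanes' contributions valid while only the accumulation stays predicated. A host-side C++ simulation of the recurrence (illustration only, not shader code):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
        const uint32_t subgroup_size = 32;
        std::vector<uint64_t> lanes(subgroup_size);
        for (uint32_t j = 0; j < subgroup_size; ++j) {
            lanes[j] = j + 1; // arbitrary per-lane input
        }
        for (uint32_t i = 1; i < subgroup_size; i *= 2) {
            const std::vector<uint64_t> shuffled = lanes; // every lane executes the "shuffle"
            for (uint32_t j = 0; j < subgroup_size; ++j) {
                if (i <= j) {
                    lanes[j] += shuffled[j - i]; // AddUint64(result, other) in the shader
                }
            }
        }
        for (uint32_t j = 0; j < subgroup_size; ++j) {
            assert(lanes[j] == uint64_t(j + 1) * (j + 2) / 2); // inclusive prefix sum of 1..j+1
        }
    }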
@@ -76,89 +81,93 @@ uvec2 subgroupInclusiveAddUint64(uvec2 value) {
 }
 
 // Writes down the results to the output buffer and to the accumulation buffer
-void WriteResults(uvec2 result) {
-    uint current_global_id = gl_GlobalInvocationID.x;
-    uvec2 base_data = current_global_id < max_accumulation_base ? accumulated_data : uvec2(0);
-    output_data[current_global_id] = result + base_data;
-    if (max_accumulation_base >= accumulation_limit + 1) {
-        if (current_global_id == accumulation_limit) {
-            accumulated_data = result;
+void WriteResults(uvec2 results[LOCAL_RESULTS]) {
+    const uint current_id = gl_LocalInvocationID.x;
+    const uvec2 accum = accumulated_data;
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        uvec2 base_data = current_id * LOCAL_RESULTS + i < min_accumulation_base ? accum : uvec2(0, 0);
+        AddUint64(results[i], base_data);
+    }
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        output_data[buffer_offset + current_id * LOCAL_RESULTS + i] = results[i];
+    }
+    uint index = accumulation_limit % LOCAL_RESULTS;
+    uint base_id = accumulation_limit / LOCAL_RESULTS;
+    if (min_accumulation_base >= accumulation_limit + 1) {
+        if (current_id == base_id) {
+            accumulated_data = results[index];
         }
         return;
     }
     // We have that ugly case in which the accumulation data is reset in the middle somewhere.
     barrier();
     groupMemoryBarrier();
-    if (current_global_id == accumulation_limit) {
-        uvec2 value_1 = output_data[max_accumulation_base];
-        accumulated_data = AddUint64(result, -value_1);
+
+    if (current_id == base_id) {
+        uvec2 reset_value = output_data[max_accumulation_base - 1];
+        // Calculate two complement / negate manually
+        reset_value = AddUint64(uvec2(1,0), ~reset_value);
+        accumulated_data = AddUint64(results[index], reset_value);
     }
 }
 
 void main() {
-    uint subgroup_inv_id = gl_SubgroupInvocationID;
-    uint subgroup_id = gl_SubgroupID;
-    uint last_subgroup_id = subgroupMax(subgroup_inv_id);
-    uint current_global_id = gl_GlobalInvocationID.x;
-    uint total_work = gl_NumWorkGroups.x * gl_WorkGroupSize.x;
-    uvec2 data = input_data[current_global_id];
+    const uint subgroup_inv_id = gl_SubgroupInvocationID;
+    const uint subgroup_id = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups;
+    const uint last_subgroup_id = subgroupMax(subgroup_inv_id);
+    const uint current_id = gl_LocalInvocationID.x;
+    const uint total_work = accumulation_limit;
+    const uint last_result_id = LOCAL_RESULTS - 1;
+    uvec2 data[LOCAL_RESULTS];
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        data[i] = input_data[buffer_offset + current_id * LOCAL_RESULTS + i];
+    }
+    uvec2 results[LOCAL_RESULTS];
+    results[0] = data[0];
+    for (uint i = 1; i < LOCAL_RESULTS; i++) {
+        results[i] = AddUint64(data[i], results[i - 1]);
+    }
     // make sure all input data has been loaded
     subgroupBarrier();
     subgroupMemoryBarrier();
 
-    uvec2 result = subgroupInclusiveAddUint64(data);
+    // on the last local result, do a subgroup inclusive scan sum
+    results[last_result_id] = subgroupInclusiveAddUint64(results[last_result_id]);
+    // get the last local result from the subgroup behind the current
+    uvec2 result_behind = subgroupShuffleUp(results[last_result_id], 1);
+    if (subgroup_inv_id != 0) {
+        for (uint i = 1; i < LOCAL_RESULTS; i++) {
+            results[i - 1] = AddUint64(results[i - 1], result_behind);
+        }
+    }
 
     // if we had less queries than our subgroup, just write down the results.
-    if (total_work <= gl_SubgroupSize) { // This condition is constant per dispatch.
-        WriteResults(result);
+    if (total_work <= gl_SubgroupSize * LOCAL_RESULTS) { // This condition is constant per dispatch.
+        WriteResults(results);
         return;
     }
 
     // We now have more, so lets write the last result into shared memory.
     // Only pick the last subgroup.
     if (subgroup_inv_id == last_subgroup_id) {
-        shared_data[subgroup_id] = result;
+        shared_data[subgroup_id] = results[last_result_id];
     }
     // wait until everyone loaded their stuffs
     barrier();
     memoryBarrierShared();
 
-    // Case 1: the total work for the grouped results can be calculated in a single subgroup
-    // operation (about 1024 queries).
-    uint total_extra_work = gl_NumSubgroups * gl_NumWorkGroups.x;
-    if (total_extra_work <= gl_SubgroupSize) { // This condition is constant per dispatch.
-        if (subgroup_id != 0) {
-            uvec2 tmp = shared_data[subgroup_inv_id];
-            subgroupBarrier();
-            subgroupMemoryBarrierShared();
-            tmp = subgroupInclusiveAddUint64(tmp);
-            result = AddUint64(result, subgroupShuffle(tmp, subgroup_id - 1));
-        }
-
-        WriteResults(result);
-        return;
-    }
-
-    // Case 2: our work amount is huge, so lets do it in O(log n) steps.
-    const uint extra = (total_extra_work ^ (total_extra_work - 1)) != 0 ? 1 : 0;
-    const uint steps = 1 << (findMSB(total_extra_work) + extra);
-    uint step;
-    // Hillis and Steele's algorithm
-    for (step = 1; step < steps; step *= 2) {
-        if (current_global_id < steps && current_global_id >= step) {
-            uvec2 current = shared_data[current_global_id];
-            uvec2 other = shared_data[current_global_id - step];
-            shared_data[current_global_id] = AddUint64(current, other);
-        }
-        // steps is constant, so this will always execute in ever workgroup's thread.
-        barrier();
-        memoryBarrierShared();
-    }
-    // Only add results for groups higher than 0
+    // only if it's not the first subgroup
     if (subgroup_id != 0) {
-        result = AddUint64(result, shared_data[subgroup_id - 1]);
+        // get the results from some previous invocation
+        uvec2 tmp = shared_data[subgroup_inv_id];
+        subgroupBarrier();
+        subgroupMemoryBarrierShared();
+        tmp = subgroupInclusiveAddUint64(tmp);
+        // obtain the result that would be equivalent to the previous result
+        uvec2 shuffled_result = subgroupShuffle(tmp, subgroup_id - 1);
+        for (uint i = 0; i < LOCAL_RESULTS; i++) {
+            results[i] = AddUint64(results[i], shuffled_result);
+        }
     }
-
-    // Just write the final results. We are done
-    WriteResults(result);
+    WriteResults(results);
 }
\ No newline at end of file
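Structurally, the rewritten main() is a blocked scan: each invocation first scans its LOCAL_RESULTS inputs serially, only the last of those partial sums goes through the subgroup-wide inclusive scan, and the per-subgroup totals parked in shared_data bridge across subgroups. The old two-case combination over gl_NumWorkGroups (including the shared-memory Hillis-Steele fallback) is removed, presumably because each workgroup now covers a fixed 2048-query slice addressed via buffer_offset. A host-side C++ sketch of that blocked-scan structure, checked against a plain running sum (illustration only; chunk count and input values are arbitrary):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
        constexpr uint32_t LOCAL_RESULTS = 8;
        constexpr uint32_t num_chunks = 32; // stands in for the invocations of one subgroup
        std::vector<uint64_t> data(LOCAL_RESULTS * num_chunks, 3); // arbitrary query results

        // Stage 1: each "invocation" scans its own chunk serially.
        std::vector<uint64_t> results = data;
        for (uint32_t c = 0; c < num_chunks; ++c) {
            for (uint32_t i = 1; i < LOCAL_RESULTS; ++i) {
                results[c * LOCAL_RESULTS + i] += results[c * LOCAL_RESULTS + i - 1];
            }
        }
        // Stage 2: inclusive scan over the chunk totals (the subgroup scan of results[last_result_id]).
        std::vector<uint64_t> totals(num_chunks);
        for (uint32_t c = 0; c < num_chunks; ++c) {
            totals[c] = results[c * LOCAL_RESULTS + LOCAL_RESULTS - 1];
            if (c > 0) {
                totals[c] += totals[c - 1];
            }
        }
        // Stage 3: chunk c picks up the combined total of chunks 0..c-1; in the shader the last
        // element gets this via the subgroup scan and the rest via the result_behind shuffle.
        for (uint32_t c = 1; c < num_chunks; ++c) {
            for (uint32_t i = 0; i < LOCAL_RESULTS; ++i) {
                results[c * LOCAL_RESULTS + i] += totals[c - 1];
            }
        }
        // Check against a plain running sum.
        uint64_t running = 0;
        for (size_t k = 0; k < data.size(); ++k) {
            running += data[k];
            assert(results[k] == running);
        }
    }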
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
index 8021476ed..559a213b9 100644
--- a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
@@ -32,25 +32,30 @@
 #endif
 
 BEGIN_PUSH_CONSTANTS
-UNIFORM(0) uint max_accumulation_base;
-UNIFORM(1) uint accumulation_limit;
+UNIFORM(0) uint min_accumulation_base;
+UNIFORM(1) uint max_accumulation_base;
+UNIFORM(2) uint accumulation_limit;
+UNIFORM(3) uint buffer_offset;
 END_PUSH_CONSTANTS
 
-layout(local_size_x = 32) in;
+#define LOCAL_RESULTS 4
+#define QUERIES_PER_INVOC 2048
+
+layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;
 
 layout(std430, binding = 0) readonly buffer block1 {
-    uvec2 input_data[gl_WorkGroupSize.x];
+    uvec2 input_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
 };
 
 layout(std430, binding = 1) writeonly coherent buffer block2 {
-    uvec2 output_data[gl_WorkGroupSize.x];
+    uvec2 output_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
 };
 
 layout(std430, binding = 2) coherent buffer block3 {
     uvec2 accumulated_data;
 };
 
-shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
+shared uvec2 shared_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
 
 uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
     uint carry = 0;
@@ -62,23 +67,31 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
 
 void main(void) {
     uint id = gl_LocalInvocationID.x;
-    uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
-    uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
+    uvec2 base_value[LOCAL_RESULTS];
+    const uvec2 accum = accumulated_data;
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        base_value[i] = (buffer_offset + id * LOCAL_RESULTS + i) < min_accumulation_base
+                            ? accumulated_data
+                            : uvec2(0);
+    }
     uint work_size = gl_WorkGroupSize.x;
     uint rd_id;
     uint wr_id;
     uint mask;
-    uvec2 input_1 = input_data[id * 2];
-    uvec2 input_2 = input_data[id * 2 + 1];
+    uvec2 inputs[LOCAL_RESULTS];
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        inputs[i] = input_data[buffer_offset + id * LOCAL_RESULTS + i];
+    }
     // The number of steps is the log base 2 of the
     // work group size, which should be a power of 2
-    const uint steps = uint(log2(work_size)) + 1;
+    const uint steps = uint(log2(work_size)) + uint(log2(LOCAL_RESULTS));
     uint step = 0;
 
     // Each invocation is responsible for the content of
     // two elements of the output array
-    shared_data[id * 2] = input_1;
-    shared_data[id * 2 + 1] = input_2;
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        shared_data[id * LOCAL_RESULTS + i] = inputs[i];
+    }
     // Synchronize to make sure that everyone has initialized
     // their elements of shared_data[] with data loaded from
     // the input arrays
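The no-subgroups variant keeps its stride-doubling shared-memory scan (the unchanged loop between these two hunks); since every invocation now seeds LOCAL_RESULTS shared elements instead of two, the number of doubling steps becomes log2(work_size) + log2(LOCAL_RESULTS), i.e. the log2 of the total element count: 9 + 2 = 11 for 512 x 4 = 2048 elements. A small C++ check of that relation (illustration only):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    int main() {
        const uint32_t QUERIES_PER_INVOC = 2048;
        const uint32_t LOCAL_RESULTS = 4;
        const uint32_t work_size = QUERIES_PER_INVOC / LOCAL_RESULTS; // 512, the new local_size_x
        const uint32_t steps =
            uint32_t(std::log2(double(work_size))) + uint32_t(std::log2(double(LOCAL_RESULTS)));
        assert(steps == 11);                                 // 9 + 2
        assert((1u << steps) == work_size * LOCAL_RESULTS);  // one doubling per bit covers all 2048 elements
    }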
@@ -100,21 +113,26 @@ void main(void) {
         memoryBarrierShared();
     }
     // Add the accumulation
-    shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
-    shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        shared_data[id * LOCAL_RESULTS + i] =
+            AddUint64(shared_data[id * LOCAL_RESULTS + i], base_value[i]);
+    }
     barrier();
     memoryBarrierShared();
 
     // Finally write our data back to the output buffer
-    output_data[id * 2] = shared_data[id * 2];
-    output_data[id * 2 + 1] = shared_data[id * 2 + 1];
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        output_data[buffer_offset + id * LOCAL_RESULTS + i] = shared_data[id * LOCAL_RESULTS + i];
+    }
     if (id == 0) {
-        if (max_accumulation_base >= accumulation_limit + 1) {
+        if (min_accumulation_base >= accumulation_limit + 1) {
             accumulated_data = shared_data[accumulation_limit];
             return;
         }
-        uvec2 value_1 = shared_data[max_accumulation_base];
-        uvec2 value_2 = shared_data[accumulation_limit];
-        accumulated_data = AddUint64(value_1, -value_2);
+        uvec2 reset_value = shared_data[max_accumulation_base - 1];
+        uvec2 final_value = shared_data[accumulation_limit];
+        // Two complements
+        reset_value = AddUint64(uvec2(1, 0), ~reset_value);
+        accumulated_data = AddUint64(final_value, reset_value);
     }
 }
\ No newline at end of file
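Both shaders now derive the accumulator reset the same way: the running total to subtract (the entry just before max_accumulation_base; note the index changes from max_accumulation_base to max_accumulation_base - 1) is negated with an explicit two's complement, ~x plus 1 propagated through AddUint64, and then added to the final value. That matters because the old AddUint64(value_1, -value_2) form negates the two 32-bit words of a uvec2 independently, so the borrow from the low word never reaches the high word. A host-side C++ sketch contrasting both behaviours (illustration only):

    #include <cassert>
    #include <cstdint>
    #include <utility>

    using U64Pair = std::pair<uint32_t, uint32_t>; // {low, high}, like the shaders' uvec2

    U64Pair AddUint64(U64Pair a, U64Pair b) {
        const uint32_t lo = a.first + b.first;
        const uint32_t carry = lo < a.first ? 1u : 0u;
        return {lo, a.second + b.second + carry};
    }

    uint64_t ToU64(U64Pair v) {
        return (uint64_t(v.second) << 32) | v.first;
    }

    int main() {
        const U64Pair final_value = {5u, 7u};          // accumulated counter
        const U64Pair reset_value = {0x80000001u, 2u}; // running total to subtract

        // New shader logic: negate via ~x + 1, carried across both words, then add.
        const U64Pair negated = AddUint64({1u, 0u}, {~reset_value.first, ~reset_value.second});
        const U64Pair diff = AddUint64(final_value, negated);
        assert(ToU64(diff) == ToU64(final_value) - ToU64(reset_value));

        // Component-wise negation (what negating a uvec2 does) drops the borrow into the
        // high word whenever the low word is non-zero, so the result is off by one there.
        const U64Pair componentwise = {0u - reset_value.first, 0u - reset_value.second};
        assert(ToU64(AddUint64(final_value, componentwise)) != ToU64(final_value) - ToU64(reset_value));
    }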