5 files changed, 175 insertions(+), 132 deletions(-)
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
index 8f10e248e..6faa8981f 100644
--- a/src/video_core/host_shaders/queries_prefix_scan_sum.comp
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
@@ -34,11 +34,16 @@
 #endif
 
 BEGIN_PUSH_CONSTANTS
-UNIFORM(0) uint max_accumulation_base;
-UNIFORM(1) uint accumulation_limit;
+UNIFORM(0) uint min_accumulation_base;
+UNIFORM(1) uint max_accumulation_base;
+UNIFORM(2) uint accumulation_limit;
+UNIFORM(3) uint buffer_offset;
 END_PUSH_CONSTANTS
 
-layout(local_size_x = 32) in;
+#define LOCAL_RESULTS 8
+#define QUERIES_PER_INVOC 2048
+
+layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;
 
 layout(std430, binding = 0) readonly buffer block1 {
     uvec2 input_data[];
@@ -52,7 +57,7 @@ layout(std430, binding = 2) coherent buffer block3 {
     uvec2 accumulated_data;
 };
 
-shared uvec2 shared_data[2];
+shared uvec2 shared_data[128];
 
 // Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64
 uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
@@ -67,8 +72,8 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
 uvec2 subgroupInclusiveAddUint64(uvec2 value) {
     uvec2 result = value;
     for (uint i = 1; i < gl_SubgroupSize; i *= 2) {
+        uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i;
         if (i <= gl_SubgroupInvocationID) {
-            uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i;
            result = AddUint64(result, other);
         }
     }
@@ -76,89 +81,93 @@ uvec2 subgroupInclusiveAddUint64(uvec2 value) {
 }
 
 // Writes down the results to the output buffer and to the accumulation buffer
-void WriteResults(uvec2 result) {
-    uint current_global_id = gl_GlobalInvocationID.x;
-    uvec2 base_data = current_global_id < max_accumulation_base ? accumulated_data : uvec2(0);
-    output_data[current_global_id] = result + base_data;
-    if (max_accumulation_base >= accumulation_limit + 1) {
-        if (current_global_id == accumulation_limit) {
-            accumulated_data = result;
+void WriteResults(uvec2 results[LOCAL_RESULTS]) {
+    const uint current_id = gl_LocalInvocationID.x;
+    const uvec2 accum = accumulated_data;
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        uvec2 base_data = current_id * LOCAL_RESULTS + i < min_accumulation_base ? accum : uvec2(0, 0);
+        AddUint64(results[i], base_data);
+    }
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        output_data[buffer_offset + current_id * LOCAL_RESULTS + i] = results[i];
+    }
+    uint index = accumulation_limit % LOCAL_RESULTS;
+    uint base_id = accumulation_limit / LOCAL_RESULTS;
+    if (min_accumulation_base >= accumulation_limit + 1) {
+        if (current_id == base_id) {
+            accumulated_data = results[index];
         }
         return;
     }
     // We have that ugly case in which the accumulation data is reset in the middle somewhere.
     barrier();
     groupMemoryBarrier();
-    if (current_global_id == accumulation_limit) {
-        uvec2 value_1 = output_data[max_accumulation_base];
-        accumulated_data = AddUint64(result, -value_1);
+
+    if (current_id == base_id) {
+        uvec2 reset_value = output_data[max_accumulation_base - 1];
+        // Calculate two complement / negate manually
+        reset_value = AddUint64(uvec2(1,0), ~reset_value);
+        accumulated_data = AddUint64(results[index], reset_value);
     }
 }
 
 void main() {
-    uint subgroup_inv_id = gl_SubgroupInvocationID;
-    uint subgroup_id = gl_SubgroupID;
-    uint last_subgroup_id = subgroupMax(subgroup_inv_id);
-    uint current_global_id = gl_GlobalInvocationID.x;
-    uint total_work = gl_NumWorkGroups.x * gl_WorkGroupSize.x;
-    uvec2 data = input_data[current_global_id];
+    const uint subgroup_inv_id = gl_SubgroupInvocationID;
+    const uint subgroup_id = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups;
+    const uint last_subgroup_id = subgroupMax(subgroup_inv_id);
+    const uint current_id = gl_LocalInvocationID.x;
+    const uint total_work = accumulation_limit;
+    const uint last_result_id = LOCAL_RESULTS - 1;
+    uvec2 data[LOCAL_RESULTS];
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        data[i] = input_data[buffer_offset + current_id * LOCAL_RESULTS + i];
+    }
+    uvec2 results[LOCAL_RESULTS];
+    results[0] = data[0];
+    for (uint i = 1; i < LOCAL_RESULTS; i++) {
+        results[i] = AddUint64(data[i], results[i - 1]);
+    }
     // make sure all input data has been loaded
     subgroupBarrier();
     subgroupMemoryBarrier();
 
-    uvec2 result = subgroupInclusiveAddUint64(data);
+    // on the last local result, do a subgroup inclusive scan sum
+    results[last_result_id] = subgroupInclusiveAddUint64(results[last_result_id]);
+    // get the last local result from the subgroup behind the current
+    uvec2 result_behind = subgroupShuffleUp(results[last_result_id], 1);
+    if (subgroup_inv_id != 0) {
+        for (uint i = 1; i < LOCAL_RESULTS; i++) {
+            results[i - 1] = AddUint64(results[i - 1], result_behind);
+        }
+    }
 
     // if we had less queries than our subgroup, just write down the results.
-    if (total_work <= gl_SubgroupSize) { // This condition is constant per dispatch.
-        WriteResults(result);
+    if (total_work <= gl_SubgroupSize * LOCAL_RESULTS) { // This condition is constant per dispatch.
+        WriteResults(results);
         return;
     }
 
     // We now have more, so lets write the last result into shared memory.
     // Only pick the last subgroup.
     if (subgroup_inv_id == last_subgroup_id) {
-        shared_data[subgroup_id] = result;
+        shared_data[subgroup_id] = results[last_result_id];
     }
     // wait until everyone loaded their stuffs
     barrier();
     memoryBarrierShared();
 
-    // Case 1: the total work for the grouped results can be calculated in a single subgroup
-    // operation (about 1024 queries).
-    uint total_extra_work = gl_NumSubgroups * gl_NumWorkGroups.x;
-    if (total_extra_work <= gl_SubgroupSize) { // This condition is constant per dispatch.
-        if (subgroup_id != 0) {
-            uvec2 tmp = shared_data[subgroup_inv_id];
-            subgroupBarrier();
-            subgroupMemoryBarrierShared();
-            tmp = subgroupInclusiveAddUint64(tmp);
-            result = AddUint64(result, subgroupShuffle(tmp, subgroup_id - 1));
-        }
-
-        WriteResults(result);
-        return;
-    }
-
-    // Case 2: our work amount is huge, so lets do it in O(log n) steps.
-    const uint extra = (total_extra_work ^ (total_extra_work - 1)) != 0 ? 1 : 0;
-    const uint steps = 1 << (findMSB(total_extra_work) + extra);
-    uint step;
-    // Hillis and Steele's algorithm
-    for (step = 1; step < steps; step *= 2) {
-        if (current_global_id < steps && current_global_id >= step) {
-            uvec2 current = shared_data[current_global_id];
-            uvec2 other = shared_data[current_global_id - step];
-            shared_data[current_global_id] = AddUint64(current, other);
-        }
-        // steps is constant, so this will always execute in ever workgroup's thread.
-        barrier();
-        memoryBarrierShared();
-    }
-    // Only add results for groups higher than 0
+    // only if it's not the first subgroup
     if (subgroup_id != 0) {
-        result = AddUint64(result, shared_data[subgroup_id - 1]);
+        // get the results from some previous invocation
+        uvec2 tmp = shared_data[subgroup_inv_id];
+        subgroupBarrier();
+        subgroupMemoryBarrierShared();
+        tmp = subgroupInclusiveAddUint64(tmp);
+        // obtain the result that would be equivalent to the previous result
+        uvec2 shuffled_result = subgroupShuffle(tmp, subgroup_id - 1);
+        for (uint i = 0; i < LOCAL_RESULTS; i++) {
+            results[i] = AddUint64(results[i], shuffled_result);
+        }
     }
-
-    // Just write the final results. We are done
-    WriteResults(result);
+    WriteResults(results);
 }
\ No newline at end of file
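
Note: the shader can no longer negate a uvec2 pair with the unary minus it used before, so the reset value is now negated manually as ~x + 1 before being added. A minimal host-side C++ sketch of the same trick follows, assuming a {low, high} pair layout; AddUint64 and Negate here are illustrative stand-ins, not code from the repository.

    #include <cstdint>
    #include <utility>

    // Hypothetical stand-in for the shader's uvec2: first = low 32 bits, second = high 32 bits.
    using U64Pair = std::pair<uint32_t, uint32_t>;

    // Mirrors the shader's AddUint64: add two 64-bit values stored as 32-bit pairs,
    // carrying the overflow of the low word into the high word.
    U64Pair AddUint64(U64Pair a, U64Pair b) {
        const uint32_t lo = a.first + b.first;
        const uint32_t carry = lo < a.first ? 1u : 0u; // low-word overflow
        return {lo, a.second + b.second + carry};
    }

    // Two's-complement negation as done in the patch: ~v + 1, so that
    // AddUint64(x, Negate(y)) == x - y (mod 2^64) without native 64-bit integers.
    U64Pair Negate(U64Pair v) {
        return AddUint64({1u, 0u}, {~v.first, ~v.second});
    }
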
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
index 8021476ed..559a213b9 100644
--- a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
@@ -32,25 +32,30 @@
 #endif
 
 BEGIN_PUSH_CONSTANTS
-UNIFORM(0) uint max_accumulation_base;
-UNIFORM(1) uint accumulation_limit;
+UNIFORM(0) uint min_accumulation_base;
+UNIFORM(1) uint max_accumulation_base;
+UNIFORM(2) uint accumulation_limit;
+UNIFORM(3) uint buffer_offset;
 END_PUSH_CONSTANTS
 
-layout(local_size_x = 32) in;
+#define LOCAL_RESULTS 4
+#define QUERIES_PER_INVOC 2048
+
+layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;
 
 layout(std430, binding = 0) readonly buffer block1 {
-    uvec2 input_data[gl_WorkGroupSize.x];
+    uvec2 input_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
 };
 
 layout(std430, binding = 1) writeonly coherent buffer block2 {
-    uvec2 output_data[gl_WorkGroupSize.x];
+    uvec2 output_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
 };
 
 layout(std430, binding = 2) coherent buffer block3 {
     uvec2 accumulated_data;
 };
 
-shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
+shared uvec2 shared_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
 
 uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
     uint carry = 0;
@@ -62,23 +67,31 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
 
 void main(void) {
     uint id = gl_LocalInvocationID.x;
-    uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
-    uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
+    uvec2 base_value[LOCAL_RESULTS];
+    const uvec2 accum = accumulated_data;
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        base_value[i] = (buffer_offset + id * LOCAL_RESULTS + i) < min_accumulation_base
+                            ? accumulated_data
+                            : uvec2(0);
+    }
     uint work_size = gl_WorkGroupSize.x;
     uint rd_id;
     uint wr_id;
     uint mask;
-    uvec2 input_1 = input_data[id * 2];
-    uvec2 input_2 = input_data[id * 2 + 1];
+    uvec2 inputs[LOCAL_RESULTS];
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        inputs[i] = input_data[buffer_offset + id * LOCAL_RESULTS + i];
+    }
     // The number of steps is the log base 2 of the
     // work group size, which should be a power of 2
-    const uint steps = uint(log2(work_size)) + 1;
+    const uint steps = uint(log2(work_size)) + uint(log2(LOCAL_RESULTS));
     uint step = 0;
 
     // Each invocation is responsible for the content of
     // two elements of the output array
-    shared_data[id * 2] = input_1;
-    shared_data[id * 2 + 1] = input_2;
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        shared_data[id * LOCAL_RESULTS + i] = inputs[i];
+    }
     // Synchronize to make sure that everyone has initialized
     // their elements of shared_data[] with data loaded from
     // the input arrays
@@ -100,21 +113,26 @@
         memoryBarrierShared();
     }
     // Add the accumulation
-    shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
-    shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        shared_data[id * LOCAL_RESULTS + i] =
+            AddUint64(shared_data[id * LOCAL_RESULTS + i], base_value[i]);
+    }
     barrier();
     memoryBarrierShared();
 
     // Finally write our data back to the output buffer
-    output_data[id * 2] = shared_data[id * 2];
-    output_data[id * 2 + 1] = shared_data[id * 2 + 1];
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        output_data[buffer_offset + id * LOCAL_RESULTS + i] = shared_data[id * LOCAL_RESULTS + i];
+    }
     if (id == 0) {
-        if (max_accumulation_base >= accumulation_limit + 1) {
+        if (min_accumulation_base >= accumulation_limit + 1) {
             accumulated_data = shared_data[accumulation_limit];
             return;
         }
-        uvec2 value_1 = shared_data[max_accumulation_base];
-        uvec2 value_2 = shared_data[accumulation_limit];
-        accumulated_data = AddUint64(value_1, -value_2);
+        uvec2 reset_value = shared_data[max_accumulation_base - 1];
+        uvec2 final_value = shared_data[accumulation_limit];
+        // Two complements
+        reset_value = AddUint64(uvec2(1, 0), ~reset_value);
+        accumulated_data = AddUint64(final_value, reset_value);
     }
 }
\ No newline at end of file
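
Note: what one dispatch of either shader is meant to produce can be modelled on the CPU with a simple serial scan. The sketch below is a reference model only, under the assumption that counts, accum and min_accumulation_base correspond to the shader's input buffer, accumulation value and push constant; the function name is made up.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // CPU reference for one dispatch: inclusive prefix sum over `counts`, with `accum`
    // added to every element whose index lies below `min_accumulation_base`.
    std::vector<uint64_t> PrefixScan(const std::vector<uint64_t>& counts, uint64_t accum,
                                     std::size_t min_accumulation_base) {
        std::vector<uint64_t> out(counts.size());
        uint64_t running = 0;
        for (std::size_t i = 0; i < counts.size(); ++i) {
            running += counts[i];
            out[i] = running + (i < min_accumulation_base ? accum : 0);
        }
        return out;
    }
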
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index 44ec5a032..289d5b25c 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -179,8 +179,10 @@ struct AstcPushConstants {
 };
 
 struct QueriesPrefixScanPushConstants {
+    u32 min_accumulation_base;
     u32 max_accumulation_base;
     u32 accumulation_limit;
+    u32 buffer_offset;
 };
 } // Anonymous namespace
 
@@ -416,56 +418,65 @@ QueriesPrefixScanPass::QueriesPrefixScanPass(
           device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) &&
           device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT)
               ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV)
-              : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV),
-          {32}),
+              : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV)),
       scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
 
 void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer,
                                 VkBuffer src_buffer, size_t number_of_sums,
-                                size_t max_accumulation_limit) {
-    size_t aligned_runs = Common::AlignUp(number_of_sums, 32);
-
-    compute_pass_descriptor_queue.Acquire();
-    compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64));
-    compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64));
-    compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64));
-    const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};
-
-    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums,
-                      aligned_runs](vk::CommandBuffer cmdbuf) {
-        static constexpr VkMemoryBarrier read_barrier{
-            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
-            .pNext = nullptr,
-            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
-        };
-        static constexpr VkMemoryBarrier write_barrier{
-            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
-            .pNext = nullptr,
-            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
-            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT |
-                             VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
-                             VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT |
-                             VK_ACCESS_UNIFORM_READ_BIT |
-                             VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
-        };
-        const QueriesPrefixScanPushConstants uniforms{
-            .max_accumulation_base = static_cast<u32>(max_accumulation_limit),
-            .accumulation_limit = static_cast<u32>(number_of_sums - 1),
-        };
-        const VkDescriptorSet set = descriptor_allocator.Commit();
-        device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
-
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
-                               VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
-        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
-        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
-        cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms);
-        cmdbuf.Dispatch(static_cast<u32>(aligned_runs / 32U), 1, 1);
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                               VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier);
-    });
+                                size_t min_accumulation_limit, size_t max_accumulation_limit) {
+    size_t current_runs = number_of_sums;
+    size_t offset = 0;
+    while (current_runs != 0) {
+        static constexpr size_t DISPATCH_SIZE = 2048U;
+        size_t runs_to_do = std::min<size_t>(current_runs, DISPATCH_SIZE);
+        current_runs -= runs_to_do;
+        compute_pass_descriptor_queue.Acquire();
+        compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, number_of_sums * sizeof(u64));
+        compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, number_of_sums * sizeof(u64));
+        compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64));
+        const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};
+        size_t used_offset = offset;
+        offset += runs_to_do;
+
+        scheduler.RequestOutsideRenderPassOperationContext();
+        scheduler.Record([this, descriptor_data, min_accumulation_limit, max_accumulation_limit,
+                          runs_to_do, used_offset](vk::CommandBuffer cmdbuf) {
+            static constexpr VkMemoryBarrier read_barrier{
+                .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+                .pNext = nullptr,
+                .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+                .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
+            };
+            static constexpr VkMemoryBarrier write_barrier{
+                .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+                .pNext = nullptr,
+                .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
+                .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT |
+                                 VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
+                                 VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT |
+                                 VK_ACCESS_UNIFORM_READ_BIT |
+                                 VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
+            };
+            const QueriesPrefixScanPushConstants uniforms{
+                .min_accumulation_base = static_cast<u32>(min_accumulation_limit),
+                .max_accumulation_base = static_cast<u32>(max_accumulation_limit),
+                .accumulation_limit = static_cast<u32>(runs_to_do - 1),
+                .buffer_offset = static_cast<u32>(used_offset),
+            };
+            const VkDescriptorSet set = descriptor_allocator.Commit();
+            device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
+
+            cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                                   VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
+            cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
+            cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
+            cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms);
+            cmdbuf.Dispatch(1, 1, 1);
+            cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+                                   VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0,
+                                   write_barrier);
+        });
+    }
 }
 
 ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
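
Note: the Run() rewrite replaces the single aligned dispatch with a loop over chunks of at most 2048 sums, handing each chunk's offset and size to the shader through the new push constants. The control flow reduces to roughly the sketch below; EnqueueChunk is a hypothetical placeholder for the per-chunk descriptor update and command recording, not an actual function in the codebase.

    #include <algorithm>
    #include <cstddef>

    // Shape of the new dispatch loop: each iteration covers at most DISPATCH_SIZE sums and
    // forwards the chunk's offset and size so the shader can index into the buffers.
    void RunChunked(std::size_t number_of_sums, void (*EnqueueChunk)(std::size_t offset, std::size_t runs)) {
        static constexpr std::size_t DISPATCH_SIZE = 2048;
        std::size_t remaining = number_of_sums;
        std::size_t offset = 0;
        while (remaining != 0) {
            const std::size_t runs_to_do = std::min(remaining, DISPATCH_SIZE);
            EnqueueChunk(offset, runs_to_do);
            offset += runs_to_do;
            remaining -= runs_to_do;
        }
    }
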
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h
index 68ffb1b82..3ff935639 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -104,7 +104,7 @@ public:
                           ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
 
     void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer,
-             size_t number_of_sums, size_t max_accumulation_limit);
+             size_t number_of_sums, size_t min_accumulation_limit, size_t max_accumulation_limit);
 
 private:
     Scheduler& scheduler;
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp
index 2cc007716..a32da3ba3 100644
--- a/src/video_core/renderer_vulkan/vk_query_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp
@@ -181,7 +181,8 @@ public:
         });
         rasterizer->SyncOperation(std::move(func));
         accumulation_since_last_sync = false;
-        last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used);
+        first_accumulation_checkpoint = std::min(first_accumulation_checkpoint, num_slots_used);
+        last_accumulation_checkpoint = std::max(last_accumulation_checkpoint, num_slots_used);
     }
 
     void CloseCounter() override {
@@ -285,7 +286,9 @@ public:
             resolve_buffers.push_back(intermediary_buffer_index);
             queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index],
                                           *buffers[resolve_buffer_index], num_slots_used,
-                                          std::min(last_accumulation_checkpoint, num_slots_used));
+                                          std::min(first_accumulation_checkpoint, num_slots_used),
+                                          last_accumulation_checkpoint);
+
         } else {
             scheduler.RequestOutsideRenderPassOperationContext();
             scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
@@ -298,7 +301,8 @@ public:
         rasterizer->SyncOperation(std::move(func));
         AbandonCurrentQuery();
         num_slots_used = 0;
-        last_accumulation_checkpoint = std::numeric_limits<size_t>::max();
+        first_accumulation_checkpoint = std::numeric_limits<size_t>::max();
+        last_accumulation_checkpoint = 0;
         accumulation_since_last_sync = has_multi_queries;
         pending_sync.clear();
     }
@@ -506,7 +510,7 @@ private:
 
     template <bool is_resolve>
     size_t ObtainBuffer(size_t num_needed) {
-        const size_t log_2 = std::max<size_t>(6U, Common::Log2Ceil64(num_needed));
+        const size_t log_2 = std::max<size_t>(11U, Common::Log2Ceil64(num_needed));
        if constexpr (is_resolve) {
            if (resolve_table[log_2] != 0) {
                return resolve_table[log_2] - 1;
@@ -563,6 +567,7 @@ private:
     VkQueryPool current_query_pool;
     size_t current_query_id;
     size_t num_slots_used{};
+    size_t first_accumulation_checkpoint{};
     size_t last_accumulation_checkpoint{};
     bool accumulation_since_last_sync{};
     VideoCommon::HostQueryBase* current_query;
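
Note: the query cache now tracks both ends of the accumulation window. The first checkpoint only shrinks (std::min) while the last only grows (std::max), and both are reset together after a sync. A small standalone sketch of that bookkeeping, with illustrative names only:

    #include <algorithm>
    #include <cstddef>
    #include <limits>

    // Keeps the first and last slot count at which an accumulation sync happened,
    // mirroring the first/last_accumulation_checkpoint handling in the patch.
    struct CheckpointTracker {
        std::size_t first_accumulation_checkpoint = std::numeric_limits<std::size_t>::max();
        std::size_t last_accumulation_checkpoint = 0;

        void OnAccumulationSync(std::size_t num_slots_used) {
            first_accumulation_checkpoint = std::min(first_accumulation_checkpoint, num_slots_used);
            last_accumulation_checkpoint = std::max(last_accumulation_checkpoint, num_slots_used);
        }

        void Reset() {
            first_accumulation_checkpoint = std::numeric_limits<std::size_t>::max();
            last_accumulation_checkpoint = 0;
        }
    };
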