author    2021-07-25 11:39:04 -0700
committer 2021-07-25 11:39:04 -0700
commit    98b26b6e126d4775fdf3f773fe8a8ac808a8ff8f (patch)
tree      816faa96c2c4d291825063433331a8ea4b3d08f1 /src/shader_recompiler/ir_opt
parent    Merge pull request #6699 from lat9nq/common-threads (diff)
parent    shader: Support out of bound local memory reads and immediate writes (diff)
Merge pull request #6585 from ameerj/hades
Shader Decompiler Rewrite
Diffstat (limited to 'src/shader_recompiler/ir_opt')
 -rw-r--r--  src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp             | 928
 -rw-r--r--  src/shader_recompiler/ir_opt/constant_propagation_pass.cpp            | 610
 -rw-r--r--  src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp           |  26
 -rw-r--r--  src/shader_recompiler/ir_opt/dual_vertex_pass.cpp                     |  30
 -rw-r--r--  src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp | 526
 -rw-r--r--  src/shader_recompiler/ir_opt/identity_removal_pass.cpp                |  38
 -rw-r--r--  src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp                   | 143
 -rw-r--r--  src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp                 | 218
 -rw-r--r--  src/shader_recompiler/ir_opt/passes.h                                 |  32
 -rw-r--r--  src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp                     | 383
 -rw-r--r--  src/shader_recompiler/ir_opt/texture_pass.cpp                         | 523
 -rw-r--r--  src/shader_recompiler/ir_opt/verification_pass.cpp                    |  98
12 files changed, 3555 insertions(+), 0 deletions(-)
diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
new file mode 100644
index 000000000..5ead930f1
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
@@ -0,0 +1,928 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#include "common/alignment.h"
#include "shader_recompiler/environment.h"
#include "shader_recompiler/frontend/ir/modifiers.h"
#include "shader_recompiler/frontend/ir/program.h"
#include "shader_recompiler/frontend/ir/value.h"
#include "shader_recompiler/ir_opt/passes.h"
#include "shader_recompiler/shader_info.h"

namespace Shader::Optimization {
namespace {
void AddConstantBufferDescriptor(Info& info, u32 index, u32 count) {
    if (count != 1) {
        throw NotImplementedException("Constant buffer descriptor indexing");
    }
    if ((info.constant_buffer_mask & (1U << index)) != 0) {
        return;
    }
    info.constant_buffer_mask |= 1U << index;

    auto& cbufs{info.constant_buffer_descriptors};
    cbufs.insert(std::ranges::lower_bound(cbufs, index, {}, &ConstantBufferDescriptor::index),
                 ConstantBufferDescriptor{
                     .index = index,
                     .count = 1,
                 });
}

void GetPatch(Info& info, IR::Patch patch) {
    if (!IR::IsGeneric(patch)) {
        throw NotImplementedException("Reading non-generic patch {}", patch);
    }
    info.uses_patches.at(IR::GenericPatchIndex(patch)) = true;
}

void SetPatch(Info& info, IR::Patch patch) {
    if (IR::IsGeneric(patch)) {
        info.uses_patches.at(IR::GenericPatchIndex(patch)) = true;
        return;
    }
    switch (patch) {
    case IR::Patch::TessellationLodLeft:
    case IR::Patch::TessellationLodTop:
    case IR::Patch::TessellationLodRight:
    case IR::Patch::TessellationLodBottom:
        info.stores_tess_level_outer = true;
        break;
    case IR::Patch::TessellationLodInteriorU:
    case IR::Patch::TessellationLodInteriorV:
        info.stores_tess_level_inner = true;
        break;
    default:
        throw NotImplementedException("Set patch {}", patch);
    }
}

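// Editorial note: tracks reads of the 16 storage buffer descriptor slots
// (0x10 bytes each, starting at info.nvn_buffer_base inside constant buffer 0)
// that NVN-built games appear to reserve; a non-immediate index or offset
// conservatively marks every slot as used.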
void CheckCBufNVN(Info& info, IR::Inst& inst) {
    const IR::Value cbuf_index{inst.Arg(0)};
    if (!cbuf_index.IsImmediate()) {
        info.nvn_buffer_used.set();
        return;
    }
    const u32 index{cbuf_index.U32()};
    if (index != 0) {
        return;
    }
    const IR::Value cbuf_offset{inst.Arg(1)};
    if (!cbuf_offset.IsImmediate()) {
        info.nvn_buffer_used.set();
        return;
    }
    const u32 offset{cbuf_offset.U32()};
    const u32 descriptor_size{0x10};
    const u32 upper_limit{info.nvn_buffer_base + descriptor_size * 16};
    if (offset >= info.nvn_buffer_base && offset < upper_limit) {
        const std::size_t nvn_index{(offset - info.nvn_buffer_base) / descriptor_size};
        info.nvn_buffer_used.set(nvn_index, true);
    }
}

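// Editorial note: records which features and resource types an instruction
// requires (fp16/fp64, 8/16-bit integers, int64, global and storage memory,
// subgroup operations, ...) so backends can enable only the capabilities a
// shader actually uses.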
void VisitUsages(Info& info, IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::CompositeConstructF16x2:
    case IR::Opcode::CompositeConstructF16x3:
    case IR::Opcode::CompositeConstructF16x4:
    case IR::Opcode::CompositeExtractF16x2:
    case IR::Opcode::CompositeExtractF16x3:
    case IR::Opcode::CompositeExtractF16x4:
    case IR::Opcode::CompositeInsertF16x2:
    case IR::Opcode::CompositeInsertF16x3:
    case IR::Opcode::CompositeInsertF16x4:
    case IR::Opcode::SelectF16:
    case IR::Opcode::BitCastU16F16:
    case IR::Opcode::BitCastF16U16:
    case IR::Opcode::PackFloat2x16:
    case IR::Opcode::UnpackFloat2x16:
    case IR::Opcode::ConvertS16F16:
    case IR::Opcode::ConvertS32F16:
    case IR::Opcode::ConvertS64F16:
    case IR::Opcode::ConvertU16F16:
    case IR::Opcode::ConvertU32F16:
    case IR::Opcode::ConvertU64F16:
    case IR::Opcode::ConvertF16S8:
    case IR::Opcode::ConvertF16S16:
    case IR::Opcode::ConvertF16S32:
    case IR::Opcode::ConvertF16S64:
    case IR::Opcode::ConvertF16U8:
    case IR::Opcode::ConvertF16U16:
    case IR::Opcode::ConvertF16U32:
    case IR::Opcode::ConvertF16U64:
    case IR::Opcode::FPAbs16:
    case IR::Opcode::FPAdd16:
    case IR::Opcode::FPCeil16:
    case IR::Opcode::FPFloor16:
    case IR::Opcode::FPFma16:
    case IR::Opcode::FPMul16:
    case IR::Opcode::FPNeg16:
    case IR::Opcode::FPRoundEven16:
    case IR::Opcode::FPSaturate16:
    case IR::Opcode::FPClamp16:
    case IR::Opcode::FPTrunc16:
    case IR::Opcode::FPOrdEqual16:
    case IR::Opcode::FPUnordEqual16:
    case IR::Opcode::FPOrdNotEqual16:
    case IR::Opcode::FPUnordNotEqual16:
    case IR::Opcode::FPOrdLessThan16:
    case IR::Opcode::FPUnordLessThan16:
    case IR::Opcode::FPOrdGreaterThan16:
    case IR::Opcode::FPUnordGreaterThan16:
    case IR::Opcode::FPOrdLessThanEqual16:
    case IR::Opcode::FPUnordLessThanEqual16:
    case IR::Opcode::FPOrdGreaterThanEqual16:
    case IR::Opcode::FPUnordGreaterThanEqual16:
    case IR::Opcode::FPIsNan16:
    case IR::Opcode::GlobalAtomicAddF16x2:
    case IR::Opcode::GlobalAtomicMinF16x2:
    case IR::Opcode::GlobalAtomicMaxF16x2:
    case IR::Opcode::StorageAtomicAddF16x2:
    case IR::Opcode::StorageAtomicMinF16x2:
    case IR::Opcode::StorageAtomicMaxF16x2:
        info.uses_fp16 = true;
        break;
    case IR::Opcode::CompositeConstructF64x2:
    case IR::Opcode::CompositeConstructF64x3:
    case IR::Opcode::CompositeConstructF64x4:
    case IR::Opcode::CompositeExtractF64x2:
    case IR::Opcode::CompositeExtractF64x3:
    case IR::Opcode::CompositeExtractF64x4:
    case IR::Opcode::CompositeInsertF64x2:
    case IR::Opcode::CompositeInsertF64x3:
    case IR::Opcode::CompositeInsertF64x4:
    case IR::Opcode::SelectF64:
    case IR::Opcode::BitCastU64F64:
    case IR::Opcode::BitCastF64U64:
    case IR::Opcode::PackDouble2x32:
    case IR::Opcode::UnpackDouble2x32:
    case IR::Opcode::FPAbs64:
    case IR::Opcode::FPAdd64:
    case IR::Opcode::FPCeil64:
    case IR::Opcode::FPFloor64:
    case IR::Opcode::FPFma64:
    case IR::Opcode::FPMax64:
    case IR::Opcode::FPMin64:
    case IR::Opcode::FPMul64:
    case IR::Opcode::FPNeg64:
    case IR::Opcode::FPRecip64:
    case IR::Opcode::FPRecipSqrt64:
    case IR::Opcode::FPRoundEven64:
    case IR::Opcode::FPSaturate64:
    case IR::Opcode::FPClamp64:
    case IR::Opcode::FPTrunc64:
    case IR::Opcode::FPOrdEqual64:
    case IR::Opcode::FPUnordEqual64:
    case IR::Opcode::FPOrdNotEqual64:
    case IR::Opcode::FPUnordNotEqual64:
    case IR::Opcode::FPOrdLessThan64:
    case IR::Opcode::FPUnordLessThan64:
    case IR::Opcode::FPOrdGreaterThan64:
    case IR::Opcode::FPUnordGreaterThan64:
    case IR::Opcode::FPOrdLessThanEqual64:
    case IR::Opcode::FPUnordLessThanEqual64:
    case IR::Opcode::FPOrdGreaterThanEqual64:
    case IR::Opcode::FPUnordGreaterThanEqual64:
    case IR::Opcode::FPIsNan64:
    case IR::Opcode::ConvertS16F64:
    case IR::Opcode::ConvertS32F64:
    case IR::Opcode::ConvertS64F64:
    case IR::Opcode::ConvertU16F64:
    case IR::Opcode::ConvertU32F64:
    case IR::Opcode::ConvertU64F64:
    case IR::Opcode::ConvertF32F64:
    case IR::Opcode::ConvertF64F32:
    case IR::Opcode::ConvertF64S8:
    case IR::Opcode::ConvertF64S16:
    case IR::Opcode::ConvertF64S32:
    case IR::Opcode::ConvertF64S64:
    case IR::Opcode::ConvertF64U8:
    case IR::Opcode::ConvertF64U16:
    case IR::Opcode::ConvertF64U32:
    case IR::Opcode::ConvertF64U64:
        info.uses_fp64 = true;
        break;
    default:
        break;
    }
    switch (inst.GetOpcode()) {
    case IR::Opcode::GetCbufU8:
    case IR::Opcode::GetCbufS8:
    case IR::Opcode::UndefU8:
    case IR::Opcode::LoadGlobalU8:
    case IR::Opcode::LoadGlobalS8:
    case IR::Opcode::WriteGlobalU8:
    case IR::Opcode::WriteGlobalS8:
    case IR::Opcode::LoadStorageU8:
    case IR::Opcode::LoadStorageS8:
    case IR::Opcode::WriteStorageU8:
    case IR::Opcode::WriteStorageS8:
    case IR::Opcode::LoadSharedU8:
    case IR::Opcode::LoadSharedS8:
    case IR::Opcode::WriteSharedU8:
    case IR::Opcode::SelectU8:
    case IR::Opcode::ConvertF16S8:
    case IR::Opcode::ConvertF16U8:
    case IR::Opcode::ConvertF32S8:
    case IR::Opcode::ConvertF32U8:
    case IR::Opcode::ConvertF64S8:
    case IR::Opcode::ConvertF64U8:
        info.uses_int8 = true;
        break;
    default:
        break;
    }
    switch (inst.GetOpcode()) {
    case IR::Opcode::GetCbufU16:
    case IR::Opcode::GetCbufS16:
    case IR::Opcode::UndefU16:
    case IR::Opcode::LoadGlobalU16:
    case IR::Opcode::LoadGlobalS16:
    case IR::Opcode::WriteGlobalU16:
    case IR::Opcode::WriteGlobalS16:
    case IR::Opcode::LoadStorageU16:
    case IR::Opcode::LoadStorageS16:
    case IR::Opcode::WriteStorageU16:
    case IR::Opcode::WriteStorageS16:
    case IR::Opcode::LoadSharedU16:
    case IR::Opcode::LoadSharedS16:
    case IR::Opcode::WriteSharedU16:
    case IR::Opcode::SelectU16:
    case IR::Opcode::BitCastU16F16:
    case IR::Opcode::BitCastF16U16:
    case IR::Opcode::ConvertS16F16:
    case IR::Opcode::ConvertS16F32:
    case IR::Opcode::ConvertS16F64:
    case IR::Opcode::ConvertU16F16:
    case IR::Opcode::ConvertU16F32:
    case IR::Opcode::ConvertU16F64:
    case IR::Opcode::ConvertF16S16:
    case IR::Opcode::ConvertF16U16:
    case IR::Opcode::ConvertF32S16:
    case IR::Opcode::ConvertF32U16:
    case IR::Opcode::ConvertF64S16:
    case IR::Opcode::ConvertF64U16:
        info.uses_int16 = true;
        break;
    default:
        break;
    }
    switch (inst.GetOpcode()) {
    case IR::Opcode::UndefU64:
    case IR::Opcode::LoadGlobalU8:
    case IR::Opcode::LoadGlobalS8:
    case IR::Opcode::LoadGlobalU16:
    case IR::Opcode::LoadGlobalS16:
    case IR::Opcode::LoadGlobal32:
    case IR::Opcode::LoadGlobal64:
    case IR::Opcode::LoadGlobal128:
    case IR::Opcode::WriteGlobalU8:
    case IR::Opcode::WriteGlobalS8:
    case IR::Opcode::WriteGlobalU16:
    case IR::Opcode::WriteGlobalS16:
    case IR::Opcode::WriteGlobal32:
    case IR::Opcode::WriteGlobal64:
    case IR::Opcode::WriteGlobal128:
    case IR::Opcode::SelectU64:
    case IR::Opcode::BitCastU64F64:
    case IR::Opcode::BitCastF64U64:
    case IR::Opcode::PackUint2x32:
    case IR::Opcode::UnpackUint2x32:
    case IR::Opcode::IAdd64:
    case IR::Opcode::ISub64:
    case IR::Opcode::INeg64:
    case IR::Opcode::ShiftLeftLogical64:
    case IR::Opcode::ShiftRightLogical64:
    case IR::Opcode::ShiftRightArithmetic64:
    case IR::Opcode::ConvertS64F16:
    case IR::Opcode::ConvertS64F32:
    case IR::Opcode::ConvertS64F64:
    case IR::Opcode::ConvertU64F16:
    case IR::Opcode::ConvertU64F32:
    case IR::Opcode::ConvertU64F64:
    case IR::Opcode::ConvertU64U32:
    case IR::Opcode::ConvertU32U64:
    case IR::Opcode::ConvertF16U64:
    case IR::Opcode::ConvertF32U64:
    case IR::Opcode::ConvertF64U64:
    case IR::Opcode::SharedAtomicExchange64:
    case IR::Opcode::GlobalAtomicIAdd64:
    case IR::Opcode::GlobalAtomicSMin64:
    case IR::Opcode::GlobalAtomicUMin64:
    case IR::Opcode::GlobalAtomicSMax64:
    case IR::Opcode::GlobalAtomicUMax64:
    case IR::Opcode::GlobalAtomicAnd64:
    case IR::Opcode::GlobalAtomicOr64:
    case IR::Opcode::GlobalAtomicXor64:
    case IR::Opcode::GlobalAtomicExchange64:
    case IR::Opcode::StorageAtomicIAdd64:
    case IR::Opcode::StorageAtomicSMin64:
    case IR::Opcode::StorageAtomicUMin64:
    case IR::Opcode::StorageAtomicSMax64:
    case IR::Opcode::StorageAtomicUMax64:
    case IR::Opcode::StorageAtomicAnd64:
    case IR::Opcode::StorageAtomicOr64:
    case IR::Opcode::StorageAtomicXor64:
    case IR::Opcode::StorageAtomicExchange64:
        info.uses_int64 = true;
        break;
    default:
        break;
    }
    switch (inst.GetOpcode()) {
    case IR::Opcode::WriteGlobalU8:
    case IR::Opcode::WriteGlobalS8:
    case IR::Opcode::WriteGlobalU16:
    case IR::Opcode::WriteGlobalS16:
    case IR::Opcode::WriteGlobal32:
    case IR::Opcode::WriteGlobal64:
    case IR::Opcode::WriteGlobal128:
    case IR::Opcode::GlobalAtomicIAdd32:
    case IR::Opcode::GlobalAtomicSMin32:
    case IR::Opcode::GlobalAtomicUMin32:
    case IR::Opcode::GlobalAtomicSMax32:
    case IR::Opcode::GlobalAtomicUMax32:
    case IR::Opcode::GlobalAtomicInc32:
    case IR::Opcode::GlobalAtomicDec32:
    case IR::Opcode::GlobalAtomicAnd32:
    case IR::Opcode::GlobalAtomicOr32:
    case IR::Opcode::GlobalAtomicXor32:
    case IR::Opcode::GlobalAtomicExchange32:
    case IR::Opcode::GlobalAtomicIAdd64:
    case IR::Opcode::GlobalAtomicSMin64:
    case IR::Opcode::GlobalAtomicUMin64:
    case IR::Opcode::GlobalAtomicSMax64:
    case IR::Opcode::GlobalAtomicUMax64:
    case IR::Opcode::GlobalAtomicAnd64:
    case IR::Opcode::GlobalAtomicOr64:
    case IR::Opcode::GlobalAtomicXor64:
    case IR::Opcode::GlobalAtomicExchange64:
    case IR::Opcode::GlobalAtomicAddF32:
    case IR::Opcode::GlobalAtomicAddF16x2:
    case IR::Opcode::GlobalAtomicAddF32x2:
    case IR::Opcode::GlobalAtomicMinF16x2:
    case IR::Opcode::GlobalAtomicMinF32x2:
    case IR::Opcode::GlobalAtomicMaxF16x2:
    case IR::Opcode::GlobalAtomicMaxF32x2:
        info.stores_global_memory = true;
        [[fallthrough]];
    case IR::Opcode::LoadGlobalU8:
    case IR::Opcode::LoadGlobalS8:
    case IR::Opcode::LoadGlobalU16:
    case IR::Opcode::LoadGlobalS16:
    case IR::Opcode::LoadGlobal32:
    case IR::Opcode::LoadGlobal64:
    case IR::Opcode::LoadGlobal128:
        info.uses_int64 = true;
        info.uses_global_memory = true;
        info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2;
        info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4;
        break;
    default:
        break;
    }
    switch (inst.GetOpcode()) {
    case IR::Opcode::DemoteToHelperInvocation:
        info.uses_demote_to_helper_invocation = true;
        break;
    case IR::Opcode::GetAttribute:
        info.loads.mask[static_cast<size_t>(inst.Arg(0).Attribute())] = true;
        break;
    case IR::Opcode::SetAttribute:
        info.stores.mask[static_cast<size_t>(inst.Arg(0).Attribute())] = true;
        break;
    case IR::Opcode::GetPatch:
        GetPatch(info, inst.Arg(0).Patch());
        break;
    case IR::Opcode::SetPatch:
        SetPatch(info, inst.Arg(0).Patch());
        break;
    case IR::Opcode::GetAttributeIndexed:
        info.loads_indexed_attributes = true;
        break;
    case IR::Opcode::SetAttributeIndexed:
        info.stores_indexed_attributes = true;
        break;
    case IR::Opcode::SetFragColor:
        info.stores_frag_color[inst.Arg(0).U32()] = true;
        break;
    case IR::Opcode::SetSampleMask:
        info.stores_sample_mask = true;
        break;
    case IR::Opcode::SetFragDepth:
        info.stores_frag_depth = true;
        break;
    case IR::Opcode::WorkgroupId:
        info.uses_workgroup_id = true;
        break;
    case IR::Opcode::LocalInvocationId:
        info.uses_local_invocation_id = true;
        break;
    case IR::Opcode::InvocationId:
        info.uses_invocation_id = true;
        break;
    case IR::Opcode::SampleId:
        info.uses_sample_id = true;
        break;
    case IR::Opcode::IsHelperInvocation:
        info.uses_is_helper_invocation = true;
        break;
    case IR::Opcode::LaneId:
        info.uses_subgroup_invocation_id = true;
        break;
    case IR::Opcode::ShuffleIndex:
    case IR::Opcode::ShuffleUp:
    case IR::Opcode::ShuffleDown:
    case IR::Opcode::ShuffleButterfly:
        info.uses_subgroup_shuffles = true;
        break;
    case IR::Opcode::GetCbufU8:
    case IR::Opcode::GetCbufS8:
    case IR::Opcode::GetCbufU16:
    case IR::Opcode::GetCbufS16:
    case IR::Opcode::GetCbufU32:
    case IR::Opcode::GetCbufF32:
    case IR::Opcode::GetCbufU32x2: {
        const IR::Value index{inst.Arg(0)};
        const IR::Value offset{inst.Arg(1)};
        if (!index.IsImmediate()) {
            throw NotImplementedException("Constant buffer with non-immediate index");
        }
        AddConstantBufferDescriptor(info, index.U32(), 1);
        u32 element_size{};
        switch (inst.GetOpcode()) {
        case IR::Opcode::GetCbufU8:
        case IR::Opcode::GetCbufS8:
            info.used_constant_buffer_types |= IR::Type::U8;
            element_size = 1;
            break;
        case IR::Opcode::GetCbufU16:
        case IR::Opcode::GetCbufS16:
            info.used_constant_buffer_types |= IR::Type::U16;
            element_size = 2;
            break;
        case IR::Opcode::GetCbufU32:
            info.used_constant_buffer_types |= IR::Type::U32;
            element_size = 4;
            break;
        case IR::Opcode::GetCbufF32:
            info.used_constant_buffer_types |= IR::Type::F32;
            element_size = 4;
            break;
        case IR::Opcode::GetCbufU32x2:
            info.used_constant_buffer_types |= IR::Type::U32x2;
            element_size = 8;
            break;
        default:
            break;
        }
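        // Editorial note: tracks the highest offset read from each constant
        // buffer so its used size can be bound tightly; an unknown
        // (non-immediate) offset falls back to the 64 KiB maximum.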
        u32& size{info.constant_buffer_used_sizes[index.U32()]};
        if (offset.IsImmediate()) {
            size = Common::AlignUp(std::max(size, offset.U32() + element_size), 16u);
        } else {
            size = 0x10'000;
        }
        break;
    }
    case IR::Opcode::BindlessImageSampleImplicitLod:
    case IR::Opcode::BindlessImageSampleExplicitLod:
    case IR::Opcode::BindlessImageSampleDrefImplicitLod:
    case IR::Opcode::BindlessImageSampleDrefExplicitLod:
    case IR::Opcode::BindlessImageGather:
    case IR::Opcode::BindlessImageGatherDref:
    case IR::Opcode::BindlessImageFetch:
    case IR::Opcode::BindlessImageQueryDimensions:
    case IR::Opcode::BindlessImageQueryLod:
    case IR::Opcode::BindlessImageGradient:
    case IR::Opcode::BoundImageSampleImplicitLod:
    case IR::Opcode::BoundImageSampleExplicitLod:
    case IR::Opcode::BoundImageSampleDrefImplicitLod:
    case IR::Opcode::BoundImageSampleDrefExplicitLod:
    case IR::Opcode::BoundImageGather:
    case IR::Opcode::BoundImageGatherDref:
    case IR::Opcode::BoundImageFetch:
    case IR::Opcode::BoundImageQueryDimensions:
    case IR::Opcode::BoundImageQueryLod:
    case IR::Opcode::BoundImageGradient:
    case IR::Opcode::ImageGather:
    case IR::Opcode::ImageGatherDref:
    case IR::Opcode::ImageFetch:
    case IR::Opcode::ImageQueryDimensions:
    case IR::Opcode::ImageGradient: {
        const TextureType type{inst.Flags<IR::TextureInstInfo>().type};
        info.uses_sampled_1d |= type == TextureType::Color1D || type == TextureType::ColorArray1D;
        info.uses_sparse_residency |=
            inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr;
        break;
    }
    case IR::Opcode::ImageSampleImplicitLod:
    case IR::Opcode::ImageSampleExplicitLod:
    case IR::Opcode::ImageSampleDrefImplicitLod:
    case IR::Opcode::ImageSampleDrefExplicitLod:
    case IR::Opcode::ImageQueryLod: {
        const auto flags{inst.Flags<IR::TextureInstInfo>()};
        const TextureType type{flags.type};
        info.uses_sampled_1d |= type == TextureType::Color1D || type == TextureType::ColorArray1D;
        info.uses_shadow_lod |= flags.is_depth != 0;
        info.uses_sparse_residency |=
            inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr;
        break;
    }
    case IR::Opcode::ImageRead: {
        const auto flags{inst.Flags<IR::TextureInstInfo>()};
        info.uses_typeless_image_reads |= flags.image_format == ImageFormat::Typeless;
        info.uses_sparse_residency |=
            inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr;
        break;
    }
    case IR::Opcode::ImageWrite: {
        const auto flags{inst.Flags<IR::TextureInstInfo>()};
        info.uses_typeless_image_writes |= flags.image_format == ImageFormat::Typeless;
        info.uses_image_buffers |= flags.type == TextureType::Buffer;
        break;
    }
    case IR::Opcode::SubgroupEqMask:
    case IR::Opcode::SubgroupLtMask:
    case IR::Opcode::SubgroupLeMask:
    case IR::Opcode::SubgroupGtMask:
    case IR::Opcode::SubgroupGeMask:
        info.uses_subgroup_mask = true;
        break;
    case IR::Opcode::VoteAll:
    case IR::Opcode::VoteAny:
    case IR::Opcode::VoteEqual:
    case IR::Opcode::SubgroupBallot:
        info.uses_subgroup_vote = true;
        break;
    case IR::Opcode::FSwizzleAdd:
        info.uses_fswzadd = true;
        break;
    case IR::Opcode::DPdxFine:
    case IR::Opcode::DPdyFine:
    case IR::Opcode::DPdxCoarse:
    case IR::Opcode::DPdyCoarse:
        info.uses_derivatives = true;
        break;
    case IR::Opcode::LoadStorageU8:
    case IR::Opcode::LoadStorageS8:
    case IR::Opcode::WriteStorageU8:
    case IR::Opcode::WriteStorageS8:
        info.used_storage_buffer_types |= IR::Type::U8;
        break;
    case IR::Opcode::LoadStorageU16:
    case IR::Opcode::LoadStorageS16:
    case IR::Opcode::WriteStorageU16:
    case IR::Opcode::WriteStorageS16:
        info.used_storage_buffer_types |= IR::Type::U16;
        break;
    case IR::Opcode::LoadStorage32:
    case IR::Opcode::WriteStorage32:
    case IR::Opcode::StorageAtomicIAdd32:
    case IR::Opcode::StorageAtomicUMin32:
    case IR::Opcode::StorageAtomicUMax32:
    case IR::Opcode::StorageAtomicAnd32:
    case IR::Opcode::StorageAtomicOr32:
    case IR::Opcode::StorageAtomicXor32:
    case IR::Opcode::StorageAtomicExchange32:
        info.used_storage_buffer_types |= IR::Type::U32;
        break;
    case IR::Opcode::LoadStorage64:
    case IR::Opcode::WriteStorage64:
        info.used_storage_buffer_types |= IR::Type::U32x2;
        break;
    case IR::Opcode::LoadStorage128:
    case IR::Opcode::WriteStorage128:
        info.used_storage_buffer_types |= IR::Type::U32x4;
        break;
    case IR::Opcode::SharedAtomicSMin32:
        info.uses_atomic_s32_min = true;
        break;
    case IR::Opcode::SharedAtomicSMax32:
        info.uses_atomic_s32_max = true;
        break;
    case IR::Opcode::SharedAtomicInc32:
        info.uses_shared_increment = true;
        break;
    case IR::Opcode::SharedAtomicDec32:
        info.uses_shared_decrement = true;
        break;
    case IR::Opcode::SharedAtomicExchange64:
        info.uses_int64_bit_atomics = true;
        break;
    case IR::Opcode::GlobalAtomicInc32:
    case IR::Opcode::StorageAtomicInc32:
        info.used_storage_buffer_types |= IR::Type::U32;
        info.uses_global_increment = true;
        break;
    case IR::Opcode::GlobalAtomicDec32:
    case IR::Opcode::StorageAtomicDec32:
        info.used_storage_buffer_types |= IR::Type::U32;
        info.uses_global_decrement = true;
        break;
    case IR::Opcode::GlobalAtomicAddF32:
    case IR::Opcode::StorageAtomicAddF32:
        info.used_storage_buffer_types |= IR::Type::U32;
        info.uses_atomic_f32_add = true;
        break;
    case IR::Opcode::GlobalAtomicAddF16x2:
    case IR::Opcode::StorageAtomicAddF16x2:
        info.used_storage_buffer_types |= IR::Type::U32;
        info.uses_atomic_f16x2_add = true;
        break;
    case IR::Opcode::GlobalAtomicAddF32x2:
    case IR::Opcode::StorageAtomicAddF32x2:
        info.used_storage_buffer_types |= IR::Type::U32;
        info.uses_atomic_f32x2_add = true;
        break;
    case IR::Opcode::GlobalAtomicMinF16x2:
    case IR::Opcode::StorageAtomicMinF16x2:
        info.used_storage_buffer_types |= IR::Type::U32;
        info.uses_atomic_f16x2_min = true;
        break;
    case IR::Opcode::GlobalAtomicMinF32x2:
    case IR::Opcode::StorageAtomicMinF32x2:
        info.used_storage_buffer_types |= IR::Type::U32;
        info.uses_atomic_f32x2_min = true;
        break;
    case IR::Opcode::GlobalAtomicMaxF16x2:
    case IR::Opcode::StorageAtomicMaxF16x2:
        info.used_storage_buffer_types |= IR::Type::U32;
        info.uses_atomic_f16x2_max = true;
        break;
    case IR::Opcode::GlobalAtomicMaxF32x2:
    case IR::Opcode::StorageAtomicMaxF32x2:
        info.used_storage_buffer_types |= IR::Type::U32;
        info.uses_atomic_f32x2_max = true;
        break;
    case IR::Opcode::StorageAtomicSMin32:
        info.used_storage_buffer_types |= IR::Type::U32;
        info.uses_atomic_s32_min = true;
        break;
    case IR::Opcode::StorageAtomicSMax32:
        info.used_storage_buffer_types |= IR::Type::U32;
        info.uses_atomic_s32_max = true;
        break;
    case IR::Opcode::GlobalAtomicIAdd64:
    case IR::Opcode::GlobalAtomicSMin64:
    case IR::Opcode::GlobalAtomicUMin64:
    case IR::Opcode::GlobalAtomicSMax64:
    case IR::Opcode::GlobalAtomicUMax64:
    case IR::Opcode::GlobalAtomicAnd64:
    case IR::Opcode::GlobalAtomicOr64:
    case IR::Opcode::GlobalAtomicXor64:
    case IR::Opcode::GlobalAtomicExchange64:
    case IR::Opcode::StorageAtomicIAdd64:
    case IR::Opcode::StorageAtomicSMin64:
    case IR::Opcode::StorageAtomicUMin64:
    case IR::Opcode::StorageAtomicSMax64:
    case IR::Opcode::StorageAtomicUMax64:
    case IR::Opcode::StorageAtomicAnd64:
    case IR::Opcode::StorageAtomicOr64:
    case IR::Opcode::StorageAtomicXor64:
        info.used_storage_buffer_types |= IR::Type::U64;
        info.uses_int64_bit_atomics = true;
        break;
    case IR::Opcode::BindlessImageAtomicIAdd32:
    case IR::Opcode::BindlessImageAtomicSMin32:
    case IR::Opcode::BindlessImageAtomicUMin32:
    case IR::Opcode::BindlessImageAtomicSMax32:
    case IR::Opcode::BindlessImageAtomicUMax32:
    case IR::Opcode::BindlessImageAtomicInc32:
    case IR::Opcode::BindlessImageAtomicDec32:
    case IR::Opcode::BindlessImageAtomicAnd32:
    case IR::Opcode::BindlessImageAtomicOr32:
    case IR::Opcode::BindlessImageAtomicXor32:
    case IR::Opcode::BindlessImageAtomicExchange32:
    case IR::Opcode::BoundImageAtomicIAdd32:
    case IR::Opcode::BoundImageAtomicSMin32:
    case IR::Opcode::BoundImageAtomicUMin32:
    case IR::Opcode::BoundImageAtomicSMax32:
    case IR::Opcode::BoundImageAtomicUMax32:
    case IR::Opcode::BoundImageAtomicInc32:
    case IR::Opcode::BoundImageAtomicDec32:
    case IR::Opcode::BoundImageAtomicAnd32:
    case IR::Opcode::BoundImageAtomicOr32:
    case IR::Opcode::BoundImageAtomicXor32:
    case IR::Opcode::BoundImageAtomicExchange32:
    case IR::Opcode::ImageAtomicIAdd32:
    case IR::Opcode::ImageAtomicSMin32:
    case IR::Opcode::ImageAtomicUMin32:
    case IR::Opcode::ImageAtomicSMax32:
    case IR::Opcode::ImageAtomicUMax32:
    case IR::Opcode::ImageAtomicInc32:
    case IR::Opcode::ImageAtomicDec32:
    case IR::Opcode::ImageAtomicAnd32:
    case IR::Opcode::ImageAtomicOr32:
    case IR::Opcode::ImageAtomicXor32:
    case IR::Opcode::ImageAtomicExchange32:
        info.uses_atomic_image_u32 = true;
        break;
    default:
        break;
    }
}

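// Editorial note: records which floating-point denormal behaviours
// (flush-to-zero vs preserve) the shader depends on, based on each
// instruction's FMZ control flags.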
void VisitFpModifiers(Info& info, IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::FPAdd16:
    case IR::Opcode::FPFma16:
    case IR::Opcode::FPMul16:
    case IR::Opcode::FPRoundEven16:
    case IR::Opcode::FPFloor16:
    case IR::Opcode::FPCeil16:
    case IR::Opcode::FPTrunc16: {
        const auto control{inst.Flags<IR::FpControl>()};
        switch (control.fmz_mode) {
        case IR::FmzMode::DontCare:
            break;
        case IR::FmzMode::FTZ:
        case IR::FmzMode::FMZ:
            info.uses_fp16_denorms_flush = true;
            break;
        case IR::FmzMode::None:
            info.uses_fp16_denorms_preserve = true;
            break;
        }
        break;
    }
    case IR::Opcode::FPAdd32:
    case IR::Opcode::FPFma32:
    case IR::Opcode::FPMul32:
    case IR::Opcode::FPRoundEven32:
    case IR::Opcode::FPFloor32:
    case IR::Opcode::FPCeil32:
    case IR::Opcode::FPTrunc32:
    case IR::Opcode::FPOrdEqual32:
    case IR::Opcode::FPUnordEqual32:
    case IR::Opcode::FPOrdNotEqual32:
    case IR::Opcode::FPUnordNotEqual32:
    case IR::Opcode::FPOrdLessThan32:
    case IR::Opcode::FPUnordLessThan32:
    case IR::Opcode::FPOrdGreaterThan32:
    case IR::Opcode::FPUnordGreaterThan32:
    case IR::Opcode::FPOrdLessThanEqual32:
    case IR::Opcode::FPUnordLessThanEqual32:
    case IR::Opcode::FPOrdGreaterThanEqual32:
    case IR::Opcode::FPUnordGreaterThanEqual32:
    case IR::Opcode::ConvertF16F32:
    case IR::Opcode::ConvertF64F32: {
        const auto control{inst.Flags<IR::FpControl>()};
        switch (control.fmz_mode) {
        case IR::FmzMode::DontCare:
            break;
        case IR::FmzMode::FTZ:
        case IR::FmzMode::FMZ:
            info.uses_fp32_denorms_flush = true;
            break;
        case IR::FmzMode::None:
            info.uses_fp32_denorms_preserve = true;
            break;
        }
        break;
    }
    default:
        break;
    }
}

void VisitCbufs(Info& info, IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::GetCbufU8:
    case IR::Opcode::GetCbufS8:
    case IR::Opcode::GetCbufU16:
    case IR::Opcode::GetCbufS16:
    case IR::Opcode::GetCbufU32:
    case IR::Opcode::GetCbufF32:
    case IR::Opcode::GetCbufU32x2: {
        CheckCBufNVN(info, inst);
        break;
    }
    default:
        break;
    }
}

void Visit(Info& info, IR::Inst& inst) {
    VisitUsages(info, inst);
    VisitFpModifiers(info, inst);
    VisitCbufs(info, inst);
}

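// Editorial note: the shader program header (SPH) declares which input/output
// attributes each non-compute stage touches; since indexed attribute accesses
// cannot be resolved statically, the header masks are merged into the
// load/store sets here.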
void GatherInfoFromHeader(Environment& env, Info& info) {
    Stage stage{env.ShaderStage()};
    if (stage == Stage::Compute) {
        return;
    }
    const auto& header{env.SPH()};
    if (stage == Stage::Fragment) {
        if (!info.loads_indexed_attributes) {
            return;
        }
        for (size_t index = 0; index < IR::NUM_GENERICS; ++index) {
            const size_t offset{static_cast<size_t>(IR::Attribute::Generic0X) + index * 4};
            const auto vector{header.ps.imap_generic_vector[index]};
            info.loads.mask[offset + 0] = vector.x != PixelImap::Unused;
            info.loads.mask[offset + 1] = vector.y != PixelImap::Unused;
            info.loads.mask[offset + 2] = vector.z != PixelImap::Unused;
            info.loads.mask[offset + 3] = vector.w != PixelImap::Unused;
        }
        return;
    }
    if (info.loads_indexed_attributes) {
        for (size_t index = 0; index < IR::NUM_GENERICS; ++index) {
            const IR::Attribute attribute{IR::Attribute::Generic0X + index * 4};
            const auto mask = header.vtg.InputGeneric(index);
            for (size_t i = 0; i < 4; ++i) {
                info.loads.Set(attribute + i, mask[i]);
            }
        }
        for (size_t index = 0; index < 8; ++index) {
            const u16 mask{header.vtg.clip_distances};
            info.loads.Set(IR::Attribute::ClipDistance0 + index, ((mask >> index) & 1) != 0);
        }
        info.loads.Set(IR::Attribute::PrimitiveId, header.vtg.imap_systemb.primitive_array_id != 0);
        info.loads.Set(IR::Attribute::Layer, header.vtg.imap_systemb.rt_array_index != 0);
        info.loads.Set(IR::Attribute::ViewportIndex, header.vtg.imap_systemb.viewport_index != 0);
        info.loads.Set(IR::Attribute::PointSize, header.vtg.imap_systemb.point_size != 0);
        info.loads.Set(IR::Attribute::PositionX, header.vtg.imap_systemb.position_x != 0);
        info.loads.Set(IR::Attribute::PositionY, header.vtg.imap_systemb.position_y != 0);
        info.loads.Set(IR::Attribute::PositionZ, header.vtg.imap_systemb.position_z != 0);
        info.loads.Set(IR::Attribute::PositionW, header.vtg.imap_systemb.position_w != 0);
        info.loads.Set(IR::Attribute::PointSpriteS, header.vtg.point_sprite_s != 0);
        info.loads.Set(IR::Attribute::PointSpriteT, header.vtg.point_sprite_t != 0);
        info.loads.Set(IR::Attribute::FogCoordinate, header.vtg.fog_coordinate != 0);
        info.loads.Set(IR::Attribute::TessellationEvaluationPointU,
                       header.vtg.tessellation_eval_point_u != 0);
        info.loads.Set(IR::Attribute::TessellationEvaluationPointV,
                       header.vtg.tessellation_eval_point_v != 0);
        info.loads.Set(IR::Attribute::InstanceId, header.vtg.instance_id != 0);
        info.loads.Set(IR::Attribute::VertexId, header.vtg.vertex_id != 0);
        // TODO: Legacy varyings
    }
    if (info.stores_indexed_attributes) {
        for (size_t index = 0; index < IR::NUM_GENERICS; ++index) {
            const IR::Attribute attribute{IR::Attribute::Generic0X + index * 4};
            const auto mask{header.vtg.OutputGeneric(index)};
            for (size_t i = 0; i < 4; ++i) {
                info.stores.Set(attribute + i, mask[i]);
            }
        }
        for (size_t index = 0; index < 8; ++index) {
            const u16 mask{header.vtg.omap_systemc.clip_distances};
            info.stores.Set(IR::Attribute::ClipDistance0 + index, ((mask >> index) & 1) != 0);
        }
        info.stores.Set(IR::Attribute::PrimitiveId,
                        header.vtg.omap_systemb.primitive_array_id != 0);
        info.stores.Set(IR::Attribute::Layer, header.vtg.omap_systemb.rt_array_index != 0);
        info.stores.Set(IR::Attribute::ViewportIndex, header.vtg.omap_systemb.viewport_index != 0);
        info.stores.Set(IR::Attribute::PointSize, header.vtg.omap_systemb.point_size != 0);
        info.stores.Set(IR::Attribute::PositionX, header.vtg.omap_systemb.position_x != 0);
        info.stores.Set(IR::Attribute::PositionY, header.vtg.omap_systemb.position_y != 0);
        info.stores.Set(IR::Attribute::PositionZ, header.vtg.omap_systemb.position_z != 0);
        info.stores.Set(IR::Attribute::PositionW, header.vtg.omap_systemb.position_w != 0);
        info.stores.Set(IR::Attribute::PointSpriteS, header.vtg.omap_systemc.point_sprite_s != 0);
        info.stores.Set(IR::Attribute::PointSpriteT, header.vtg.omap_systemc.point_sprite_t != 0);
        info.stores.Set(IR::Attribute::FogCoordinate, header.vtg.omap_systemc.fog_coordinate != 0);
        info.stores.Set(IR::Attribute::TessellationEvaluationPointU,
                        header.vtg.omap_systemc.tessellation_eval_point_u != 0);
        info.stores.Set(IR::Attribute::TessellationEvaluationPointV,
                        header.vtg.omap_systemc.tessellation_eval_point_v != 0);
        info.stores.Set(IR::Attribute::InstanceId, header.vtg.omap_systemc.instance_id != 0);
        info.stores.Set(IR::Attribute::VertexId, header.vtg.omap_systemc.vertex_id != 0);
        // TODO: Legacy varyings
    }
}
} // Anonymous namespace

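// Editorial note: the per-stage base constants below presumably mirror the
// NVN driver's constant buffer 0 layout, pointing at the start of its storage
// buffer descriptor table (see CheckCBufNVN above).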
void CollectShaderInfoPass(Environment& env, IR::Program& program) {
    Info& info{program.info};
    const u32 base{[&] {
        switch (program.stage) {
        case Stage::VertexA:
        case Stage::VertexB:
            return 0x110u;
        case Stage::TessellationControl:
            return 0x210u;
        case Stage::TessellationEval:
            return 0x310u;
        case Stage::Geometry:
            return 0x410u;
        case Stage::Fragment:
            return 0x510u;
        case Stage::Compute:
            return 0x310u;
        }
        throw InvalidArgument("Invalid stage {}", program.stage);
    }()};
    info.nvn_buffer_base = base;

    for (IR::Block* const block : program.post_order_blocks) {
        for (IR::Inst& inst : block->Instructions()) {
            Visit(info, inst);
        }
    }
    GatherInfoFromHeader(env, info);
}

} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
new file mode 100644
index 000000000..8dd6d6c2c
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
@@ -0,0 +1,610 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#include <algorithm>
#include <tuple>
#include <type_traits>

#include "common/bit_cast.h"
#include "common/bit_util.h"
#include "shader_recompiler/exception.h"
#include "shader_recompiler/frontend/ir/ir_emitter.h"
#include "shader_recompiler/frontend/ir/value.h"
#include "shader_recompiler/ir_opt/passes.h"

namespace Shader::Optimization {
namespace {
// Metaprogramming stuff to get arguments information out of a lambda
template <typename Func>
struct LambdaTraits : LambdaTraits<decltype(&std::remove_reference_t<Func>::operator())> {};

template <typename ReturnType, typename LambdaType, typename... Args>
struct LambdaTraits<ReturnType (LambdaType::*)(Args...) const> {
    template <size_t I>
    using ArgType = std::tuple_element_t<I, std::tuple<Args...>>;

    static constexpr size_t NUM_ARGS{sizeof...(Args)};
};
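// Editorial note: for example, given [](u32 a, u32 b) { return a + b; },
// LambdaTraits yields NUM_ARGS == 2 and ArgType<0> == ArgType<1> == u32,
// which EvalImmediates() below uses to pull correctly-typed immediates out
// of IR values.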

template <typename T>
[[nodiscard]] T Arg(const IR::Value& value) {
    if constexpr (std::is_same_v<T, bool>) {
        return value.U1();
    } else if constexpr (std::is_same_v<T, u32>) {
        return value.U32();
    } else if constexpr (std::is_same_v<T, s32>) {
        return static_cast<s32>(value.U32());
    } else if constexpr (std::is_same_v<T, f32>) {
        return value.F32();
    } else if constexpr (std::is_same_v<T, u64>) {
        return value.U64();
    }
}

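// Editorial note: folds a commutative binary operation. When both operands
// are immediate it evaluates the operation outright; otherwise it normalizes
// the immediate onto the right-hand side and merges immediates across chained
// applications of the same opcode. Returns false when the instruction was
// fully replaced.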
template <typename T, typename ImmFn>
bool FoldCommutative(IR::Inst& inst, ImmFn&& imm_fn) {
    const IR::Value lhs{inst.Arg(0)};
    const IR::Value rhs{inst.Arg(1)};

    const bool is_lhs_immediate{lhs.IsImmediate()};
    const bool is_rhs_immediate{rhs.IsImmediate()};

    if (is_lhs_immediate && is_rhs_immediate) {
        const auto result{imm_fn(Arg<T>(lhs), Arg<T>(rhs))};
        inst.ReplaceUsesWith(IR::Value{result});
        return false;
    }
    if (is_lhs_immediate && !is_rhs_immediate) {
        IR::Inst* const rhs_inst{rhs.InstRecursive()};
        if (rhs_inst->GetOpcode() == inst.GetOpcode() && rhs_inst->Arg(1).IsImmediate()) {
            const auto combined{imm_fn(Arg<T>(lhs), Arg<T>(rhs_inst->Arg(1)))};
            inst.SetArg(0, rhs_inst->Arg(0));
            inst.SetArg(1, IR::Value{combined});
        } else {
            // Normalize
            inst.SetArg(0, rhs);
            inst.SetArg(1, lhs);
        }
    }
    if (!is_lhs_immediate && is_rhs_immediate) {
        const IR::Inst* const lhs_inst{lhs.InstRecursive()};
        if (lhs_inst->GetOpcode() == inst.GetOpcode() && lhs_inst->Arg(1).IsImmediate()) {
            const auto combined{imm_fn(Arg<T>(rhs), Arg<T>(lhs_inst->Arg(1)))};
            inst.SetArg(0, lhs_inst->Arg(0));
            inst.SetArg(1, IR::Value{combined});
        }
    }
    return true;
}

template <typename Func>
bool FoldWhenAllImmediates(IR::Inst& inst, Func&& func) {
    if (!inst.AreAllArgsImmediates() || inst.HasAssociatedPseudoOperation()) {
        return false;
    }
    using Indices = std::make_index_sequence<LambdaTraits<decltype(func)>::NUM_ARGS>;
    inst.ReplaceUsesWith(EvalImmediates(inst, func, Indices{}));
    return true;
}

void FoldGetRegister(IR::Inst& inst) {
    if (inst.Arg(0).Reg() == IR::Reg::RZ) {
        inst.ReplaceUsesWith(IR::Value{u32{0}});
    }
}

void FoldGetPred(IR::Inst& inst) {
    if (inst.Arg(0).Pred() == IR::Pred::PT) {
        inst.ReplaceUsesWith(IR::Value{true});
    }
}

/// Replaces the pattern generated by two XMAD multiplications
bool FoldXmadMultiply(IR::Block& block, IR::Inst& inst) {
    /*
     * We are looking for this pattern:
     *   %rhs_bfe = BitFieldUExtract %factor_a, #0, #16
     *   %rhs_mul = IMul32 %rhs_bfe, %factor_b
     *   %lhs_bfe = BitFieldUExtract %factor_a, #16, #16
     *   %lhs_mul = IMul32 %lhs_bfe, %factor_b
     *   %lhs_shl = ShiftLeftLogical32 %lhs_mul, #16
     *   %result  = IAdd32 %lhs_shl, %rhs_mul
     *
     * And replacing it with
     *   %result  = IMul32 %factor_a, %factor_b
     *
     * LLVM and MSVC perform the same transformation, so it is known to be safe.
     */
    const IR::Value lhs_arg{inst.Arg(0)};
    const IR::Value rhs_arg{inst.Arg(1)};
    if (lhs_arg.IsImmediate() || rhs_arg.IsImmediate()) {
        return false;
    }
    IR::Inst* const lhs_shl{lhs_arg.InstRecursive()};
    if (lhs_shl->GetOpcode() != IR::Opcode::ShiftLeftLogical32 ||
        lhs_shl->Arg(1) != IR::Value{16U}) {
        return false;
    }
    if (lhs_shl->Arg(0).IsImmediate()) {
        return false;
    }
    IR::Inst* const lhs_mul{lhs_shl->Arg(0).InstRecursive()};
    IR::Inst* const rhs_mul{rhs_arg.InstRecursive()};
    if (lhs_mul->GetOpcode() != IR::Opcode::IMul32 || rhs_mul->GetOpcode() != IR::Opcode::IMul32) {
        return false;
    }
    if (lhs_mul->Arg(1).Resolve() != rhs_mul->Arg(1).Resolve()) {
        return false;
    }
    const IR::U32 factor_b{lhs_mul->Arg(1)};
    if (lhs_mul->Arg(0).IsImmediate() || rhs_mul->Arg(0).IsImmediate()) {
        return false;
    }
    IR::Inst* const lhs_bfe{lhs_mul->Arg(0).InstRecursive()};
    IR::Inst* const rhs_bfe{rhs_mul->Arg(0).InstRecursive()};
    if (lhs_bfe->GetOpcode() != IR::Opcode::BitFieldUExtract) {
        return false;
    }
    if (rhs_bfe->GetOpcode() != IR::Opcode::BitFieldUExtract) {
        return false;
    }
    if (lhs_bfe->Arg(1) != IR::Value{16U} || lhs_bfe->Arg(2) != IR::Value{16U}) {
        return false;
    }
    if (rhs_bfe->Arg(1) != IR::Value{0U} || rhs_bfe->Arg(2) != IR::Value{16U}) {
        return false;
    }
    if (lhs_bfe->Arg(0).Resolve() != rhs_bfe->Arg(0).Resolve()) {
        return false;
    }
    const IR::U32 factor_a{lhs_bfe->Arg(0)};
    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
    inst.ReplaceUsesWith(ir.IMul(factor_a, factor_b));
    return true;
}

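// Editorial note: folds additions: combines immediate operands, drops "+ 0",
// and (for u32) tries to collapse the XMAD multiply pattern above into a
// single IMul32.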
template <typename T>
void FoldAdd(IR::Block& block, IR::Inst& inst) {
    if (inst.HasAssociatedPseudoOperation()) {
        return;
    }
    if (!FoldCommutative<T>(inst, [](T a, T b) { return a + b; })) {
        return;
    }
    const IR::Value rhs{inst.Arg(1)};
    if (rhs.IsImmediate() && Arg<T>(rhs) == 0) {
        inst.ReplaceUsesWith(inst.Arg(0));
        return;
    }
    if constexpr (std::is_same_v<T, u32>) {
        if (FoldXmadMultiply(block, inst)) {
            return;
        }
    }
}

void FoldISub32(IR::Inst& inst) {
    if (FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a - b; })) {
        return;
    }
    if (inst.Arg(0).IsImmediate() || inst.Arg(1).IsImmediate()) {
        return;
    }
    // ISub32 is generally used to subtract two constant buffer reads; replace
    // the result with zero when both operands read the same cbuf value.
    const auto equal_cbuf{[](IR::Inst* a, IR::Inst* b) {
        return a->GetOpcode() == IR::Opcode::GetCbufU32 &&
               b->GetOpcode() == IR::Opcode::GetCbufU32 && a->Arg(0) == b->Arg(0) &&
               a->Arg(1) == b->Arg(1);
    }};
    IR::Inst* op_a{inst.Arg(0).InstRecursive()};
    IR::Inst* op_b{inst.Arg(1).InstRecursive()};
    if (equal_cbuf(op_a, op_b)) {
        inst.ReplaceUsesWith(IR::Value{u32{0}});
        return;
    }
    // It's also possible a value is being added to a cbuf and then subtracted
    if (op_b->GetOpcode() == IR::Opcode::IAdd32) {
        // Canonicalize local variables to simplify the following logic
        std::swap(op_a, op_b);
    }
    if (op_b->GetOpcode() != IR::Opcode::GetCbufU32) {
        return;
    }
    IR::Inst* const inst_cbuf{op_b};
    if (op_a->GetOpcode() != IR::Opcode::IAdd32) {
        return;
    }
    IR::Value add_op_a{op_a->Arg(0)};
    IR::Value add_op_b{op_a->Arg(1)};
    if (add_op_b.IsImmediate()) {
        // Canonicalize
        std::swap(add_op_a, add_op_b);
    }
    if (add_op_b.IsImmediate()) {
        return;
    }
    IR::Inst* const add_cbuf{add_op_b.InstRecursive()};
    if (equal_cbuf(add_cbuf, inst_cbuf)) {
        inst.ReplaceUsesWith(add_op_a);
    }
}

void FoldSelect(IR::Inst& inst) {
    const IR::Value cond{inst.Arg(0)};
    if (cond.IsImmediate()) {
        inst.ReplaceUsesWith(cond.U1() ? inst.Arg(1) : inst.Arg(2));
    }
}

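// Editorial note: folds the interpolation pattern
// (x * GetAttribute(a)) * FPRecip(GetAttribute(a)) -> x, where both sides
// read the same attribute.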
void FoldFPMul32(IR::Inst& inst) {
    const auto control{inst.Flags<IR::FpControl>()};
    if (control.no_contraction) {
        return;
    }
    // Fold interpolation operations
    const IR::Value lhs_value{inst.Arg(0)};
    const IR::Value rhs_value{inst.Arg(1)};
    if (lhs_value.IsImmediate() || rhs_value.IsImmediate()) {
        return;
    }
    IR::Inst* const lhs_op{lhs_value.InstRecursive()};
    IR::Inst* const rhs_op{rhs_value.InstRecursive()};
    if (lhs_op->GetOpcode() != IR::Opcode::FPMul32 ||
        rhs_op->GetOpcode() != IR::Opcode::FPRecip32) {
        return;
    }
    const IR::Value recip_source{rhs_op->Arg(0)};
    const IR::Value lhs_mul_source{lhs_op->Arg(1).Resolve()};
    if (recip_source.IsImmediate() || lhs_mul_source.IsImmediate()) {
        return;
    }
    IR::Inst* const attr_a{recip_source.InstRecursive()};
    IR::Inst* const attr_b{lhs_mul_source.InstRecursive()};
    if (attr_a->GetOpcode() != IR::Opcode::GetAttribute ||
        attr_b->GetOpcode() != IR::Opcode::GetAttribute) {
        return;
    }
    if (attr_a->Arg(0).Attribute() == attr_b->Arg(0).Attribute()) {
        inst.ReplaceUsesWith(lhs_op->Arg(0));
    }
}

void FoldLogicalAnd(IR::Inst& inst) {
    if (!FoldCommutative<bool>(inst, [](bool a, bool b) { return a && b; })) {
        return;
    }
    const IR::Value rhs{inst.Arg(1)};
    if (rhs.IsImmediate()) {
        if (rhs.U1()) {
            inst.ReplaceUsesWith(inst.Arg(0));
        } else {
            inst.ReplaceUsesWith(IR::Value{false});
        }
    }
}

void FoldLogicalOr(IR::Inst& inst) {
    if (!FoldCommutative<bool>(inst, [](bool a, bool b) { return a || b; })) {
        return;
    }
    const IR::Value rhs{inst.Arg(1)};
    if (rhs.IsImmediate()) {
        if (rhs.U1()) {
            inst.ReplaceUsesWith(IR::Value{true});
        } else {
            inst.ReplaceUsesWith(inst.Arg(0));
        }
    }
}

void FoldLogicalNot(IR::Inst& inst) {
    const IR::U1 value{inst.Arg(0)};
    if (value.IsImmediate()) {
        inst.ReplaceUsesWith(IR::Value{!value.U1()});
        return;
    }
    IR::Inst* const arg{value.InstRecursive()};
    if (arg->GetOpcode() == IR::Opcode::LogicalNot) {
        inst.ReplaceUsesWith(arg->Arg(0));
    }
}

template <IR::Opcode op, typename Dest, typename Source>
void FoldBitCast(IR::Inst& inst, IR::Opcode reverse) {
    const IR::Value value{inst.Arg(0)};
    if (value.IsImmediate()) {
        inst.ReplaceUsesWith(IR::Value{Common::BitCast<Dest>(Arg<Source>(value))});
        return;
    }
    IR::Inst* const arg_inst{value.InstRecursive()};
    if (arg_inst->GetOpcode() == reverse) {
        inst.ReplaceUsesWith(arg_inst->Arg(0));
        return;
    }
    if constexpr (op == IR::Opcode::BitCastF32U32) {
        if (arg_inst->GetOpcode() == IR::Opcode::GetCbufU32) {
            // Replace the bitcast with a typed constant buffer read
            inst.ReplaceOpcode(IR::Opcode::GetCbufF32);
            inst.SetArg(0, arg_inst->Arg(0));
            inst.SetArg(1, arg_inst->Arg(1));
            return;
        }
    }
}

void FoldInverseFunc(IR::Inst& inst, IR::Opcode reverse) {
    const IR::Value value{inst.Arg(0)};
    if (value.IsImmediate()) {
        return;
    }
    IR::Inst* const arg_inst{value.InstRecursive()};
    if (arg_inst->GetOpcode() == reverse) {
        inst.ReplaceUsesWith(arg_inst->Arg(0));
        return;
    }
}

template <typename Func, size_t... I>
IR::Value EvalImmediates(const IR::Inst& inst, Func&& func, std::index_sequence<I...>) {
    using Traits = LambdaTraits<decltype(func)>;
    return IR::Value{func(Arg<typename Traits::template ArgType<I>>(inst.Arg(I))...)};
}

| 355 | std::optional<IR::Value> FoldCompositeExtractImpl(IR::Value inst_value, IR::Opcode insert, | ||
| 356 | IR::Opcode construct, u32 first_index) { | ||
| 357 | IR::Inst* const inst{inst_value.InstRecursive()}; | ||
| 358 | if (inst->GetOpcode() == construct) { | ||
| 359 | return inst->Arg(first_index); | ||
| 360 | } | ||
| 361 | if (inst->GetOpcode() != insert) { | ||
| 362 | return std::nullopt; | ||
| 363 | } | ||
| 364 | IR::Value value_index{inst->Arg(2)}; | ||
| 365 | if (!value_index.IsImmediate()) { | ||
| 366 | return std::nullopt; | ||
| 367 | } | ||
| 368 | const u32 second_index{value_index.U32()}; | ||
| 369 | if (first_index != second_index) { | ||
| 370 | IR::Value value_composite{inst->Arg(0)}; | ||
| 371 | if (value_composite.IsImmediate()) { | ||
| 372 | return std::nullopt; | ||
| 373 | } | ||
| 374 | return FoldCompositeExtractImpl(value_composite, insert, construct, first_index); | ||
| 375 | } | ||
| 376 | return inst->Arg(1); | ||
| 377 | } | ||
| 378 | |||
| 379 | void FoldCompositeExtract(IR::Inst& inst, IR::Opcode construct, IR::Opcode insert) { | ||
| 380 | const IR::Value value_1{inst.Arg(0)}; | ||
| 381 | const IR::Value value_2{inst.Arg(1)}; | ||
| 382 | if (value_1.IsImmediate()) { | ||
| 383 | return; | ||
| 384 | } | ||
| 385 | if (!value_2.IsImmediate()) { | ||
| 386 | return; | ||
| 387 | } | ||
| 388 | const u32 first_index{value_2.U32()}; | ||
| 389 | const std::optional result{FoldCompositeExtractImpl(value_1, insert, construct, first_index)}; | ||
| 390 | if (!result) { | ||
| 391 | return; | ||
| 392 | } | ||
| 393 | inst.ReplaceUsesWith(*result); | ||
| 394 | } | ||
| 395 | |||
| 396 | IR::Value GetThroughCast(IR::Value value, IR::Opcode expected_cast) { | ||
| 397 | if (value.IsImmediate()) { | ||
| 398 | return value; | ||
| 399 | } | ||
| 400 | IR::Inst* const inst{value.InstRecursive()}; | ||
| 401 | if (inst->GetOpcode() == expected_cast) { | ||
| 402 | return inst->Arg(0).Resolve(); | ||
| 403 | } | ||
| 404 | return value; | ||
| 405 | } | ||
| 406 | |||
| 407 | void FoldFSwizzleAdd(IR::Block& block, IR::Inst& inst) { | ||
| 408 | const IR::Value swizzle{inst.Arg(2)}; | ||
| 409 | if (!swizzle.IsImmediate()) { | ||
| 410 | return; | ||
| 411 | } | ||
| 412 | const IR::Value value_1{GetThroughCast(inst.Arg(0).Resolve(), IR::Opcode::BitCastF32U32)}; | ||
| 413 | const IR::Value value_2{GetThroughCast(inst.Arg(1).Resolve(), IR::Opcode::BitCastF32U32)}; | ||
| 414 | if (value_1.IsImmediate()) { | ||
| 415 | return; | ||
| 416 | } | ||
| 417 | const u32 swizzle_value{swizzle.U32()}; | ||
| 418 | if (swizzle_value != 0x99 && swizzle_value != 0xA5) { | ||
| 419 | return; | ||
| 420 | } | ||
| 421 | IR::Inst* const inst2{value_1.InstRecursive()}; | ||
| 422 | if (inst2->GetOpcode() != IR::Opcode::ShuffleButterfly) { | ||
| 423 | return; | ||
| 424 | } | ||
| 425 | const IR::Value value_3{GetThroughCast(inst2->Arg(0).Resolve(), IR::Opcode::BitCastU32F32)}; | ||
| 426 | if (value_2 != value_3) { | ||
| 427 | return; | ||
| 428 | } | ||
| 429 | const IR::Value index{inst2->Arg(1)}; | ||
| 430 | const IR::Value clamp{inst2->Arg(2)}; | ||
| 431 | const IR::Value segmentation_mask{inst2->Arg(3)}; | ||
| 432 | if (!index.IsImmediate() || !clamp.IsImmediate() || !segmentation_mask.IsImmediate()) { | ||
| 433 | return; | ||
| 434 | } | ||
| 435 | if (clamp.U32() != 3 || segmentation_mask.U32() != 28) { | ||
| 436 | return; | ||
| 437 | } | ||
| 438 | if (swizzle_value == 0x99) { | ||
| 439 | // DPdxFine | ||
| 440 | if (index.U32() == 1) { | ||
| 441 | IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; | ||
| 442 | inst.ReplaceUsesWith(ir.DPdxFine(IR::F32{inst.Arg(1)})); | ||
| 443 | } | ||
| 444 | } else if (swizzle_value == 0xA5) { | ||
| 445 | // DPdyFine | ||
| 446 | if (index.U32() == 2) { | ||
| 447 | IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; | ||
| 448 | inst.ReplaceUsesWith(ir.DPdyFine(IR::F32{inst.Arg(1)})); | ||
| 449 | } | ||
| 450 | } | ||
| 451 | } | ||
| 452 | |||
| 453 | void ConstantPropagation(IR::Block& block, IR::Inst& inst) { | ||
| 454 | switch (inst.GetOpcode()) { | ||
| 455 | case IR::Opcode::GetRegister: | ||
| 456 | return FoldGetRegister(inst); | ||
| 457 | case IR::Opcode::GetPred: | ||
| 458 | return FoldGetPred(inst); | ||
| 459 | case IR::Opcode::IAdd32: | ||
| 460 | return FoldAdd<u32>(block, inst); | ||
| 461 | case IR::Opcode::ISub32: | ||
| 462 | return FoldISub32(inst); | ||
| 463 | case IR::Opcode::IMul32: | ||
| 464 | FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; }); | ||
| 465 | return; | ||
| 466 | case IR::Opcode::ShiftRightArithmetic32: | ||
| 467 | FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return static_cast<u32>(a >> b); }); | ||
| 468 | return; | ||
| 469 | case IR::Opcode::BitCastF32U32: | ||
| 470 | return FoldBitCast<IR::Opcode::BitCastF32U32, f32, u32>(inst, IR::Opcode::BitCastU32F32); | ||
| 471 | case IR::Opcode::BitCastU32F32: | ||
| 472 | return FoldBitCast<IR::Opcode::BitCastU32F32, u32, f32>(inst, IR::Opcode::BitCastF32U32); | ||
| 473 | case IR::Opcode::IAdd64: | ||
| 474 | return FoldAdd<u64>(block, inst); | ||
| 475 | case IR::Opcode::PackHalf2x16: | ||
| 476 | return FoldInverseFunc(inst, IR::Opcode::UnpackHalf2x16); | ||
| 477 | case IR::Opcode::UnpackHalf2x16: | ||
| 478 | return FoldInverseFunc(inst, IR::Opcode::PackHalf2x16); | ||
| 479 | case IR::Opcode::SelectU1: | ||
| 480 | case IR::Opcode::SelectU8: | ||
| 481 | case IR::Opcode::SelectU16: | ||
| 482 | case IR::Opcode::SelectU32: | ||
| 483 | case IR::Opcode::SelectU64: | ||
| 484 | case IR::Opcode::SelectF16: | ||
| 485 | case IR::Opcode::SelectF32: | ||
| 486 | case IR::Opcode::SelectF64: | ||
| 487 | return FoldSelect(inst); | ||
| 488 | case IR::Opcode::FPMul32: | ||
| 489 | return FoldFPMul32(inst); | ||
| 490 | case IR::Opcode::LogicalAnd: | ||
| 491 | return FoldLogicalAnd(inst); | ||
| 492 | case IR::Opcode::LogicalOr: | ||
| 493 | return FoldLogicalOr(inst); | ||
| 494 | case IR::Opcode::LogicalNot: | ||
| 495 | return FoldLogicalNot(inst); | ||
| 496 | case IR::Opcode::SLessThan: | ||
| 497 | FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a < b; }); | ||
| 498 | return; | ||
| 499 | case IR::Opcode::ULessThan: | ||
| 500 | FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a < b; }); | ||
| 501 | return; | ||
| 502 | case IR::Opcode::SLessThanEqual: | ||
| 503 | FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a <= b; }); | ||
| 504 | return; | ||
| 505 | case IR::Opcode::ULessThanEqual: | ||
| 506 | FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a <= b; }); | ||
| 507 | return; | ||
| 508 | case IR::Opcode::SGreaterThan: | ||
| 509 | FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a > b; }); | ||
| 510 | return; | ||
| 511 | case IR::Opcode::UGreaterThan: | ||
| 512 | FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a > b; }); | ||
| 513 | return; | ||
| 514 | case IR::Opcode::SGreaterThanEqual: | ||
| 515 | FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a >= b; }); | ||
| 516 | return; | ||
| 517 | case IR::Opcode::UGreaterThanEqual: | ||
| 518 | FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a >= b; }); | ||
| 519 | return; | ||
| 520 | case IR::Opcode::IEqual: | ||
| 521 | FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a == b; }); | ||
| 522 | return; | ||
| 523 | case IR::Opcode::INotEqual: | ||
| 524 | FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a != b; }); | ||
| 525 | return; | ||
| 526 | case IR::Opcode::BitwiseAnd32: | ||
| 527 | FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a & b; }); | ||
| 528 | return; | ||
| 529 | case IR::Opcode::BitwiseOr32: | ||
| 530 | FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a | b; }); | ||
| 531 | return; | ||
| 532 | case IR::Opcode::BitwiseXor32: | ||
| 533 | FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a ^ b; }); | ||
| 534 | return; | ||
| 535 | case IR::Opcode::BitFieldUExtract: | ||
| 536 | FoldWhenAllImmediates(inst, [](u32 base, u32 shift, u32 count) { | ||
| 537 | if (static_cast<size_t>(shift) + static_cast<size_t>(count) > 32) { | ||
| 538 | throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldUExtract, | ||
| 539 | base, shift, count); | ||
| 540 | } | ||
| 541 | return (base >> shift) & ((1U << count) - 1); | ||
| 542 | }); | ||
| 543 | return; | ||
| 544 | case IR::Opcode::BitFieldSExtract: | ||
| 545 | FoldWhenAllImmediates(inst, [](s32 base, u32 shift, u32 count) { | ||
| 546 | const size_t back_shift{static_cast<size_t>(shift) + static_cast<size_t>(count)}; | ||
| 547 | const size_t left_shift{32 - back_shift}; | ||
| 548 | const size_t right_shift{static_cast<size_t>(32 - count)}; | ||
| 549 | if (back_shift > 32 || left_shift >= 32 || right_shift >= 32) { | ||
| 550 | throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldSExtract, | ||
| 551 | base, shift, count); | ||
| 552 | } | ||
| 553 | return static_cast<u32>((base << left_shift) >> right_shift); | ||
| 554 | }); | ||
| 555 | return; | ||
| 556 | case IR::Opcode::BitFieldInsert: | ||
| 557 | FoldWhenAllImmediates(inst, [](u32 base, u32 insert, u32 offset, u32 bits) { | ||
| 558 | if (bits >= 32 || offset >= 32) { | ||
| 559 | throw LogicError("Undefined result in {}({}, {}, {}, {})", | ||
| 560 | IR::Opcode::BitFieldInsert, base, insert, offset, bits); | ||
| 561 | } | ||
| 562 | return (base & ~(~(~0u << bits) << offset)) | (insert << offset); | ||
| 563 | }); | ||
| 564 | return; | ||
| 565 | case IR::Opcode::CompositeExtractU32x2: | ||
| 566 | return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x2, | ||
| 567 | IR::Opcode::CompositeInsertU32x2); | ||
| 568 | case IR::Opcode::CompositeExtractU32x3: | ||
| 569 | return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x3, | ||
| 570 | IR::Opcode::CompositeInsertU32x3); | ||
| 571 | case IR::Opcode::CompositeExtractU32x4: | ||
| 572 | return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x4, | ||
| 573 | IR::Opcode::CompositeInsertU32x4); | ||
| 574 | case IR::Opcode::CompositeExtractF32x2: | ||
| 575 | return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x2, | ||
| 576 | IR::Opcode::CompositeInsertF32x2); | ||
| 577 | case IR::Opcode::CompositeExtractF32x3: | ||
| 578 | return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x3, | ||
| 579 | IR::Opcode::CompositeInsertF32x3); | ||
| 580 | case IR::Opcode::CompositeExtractF32x4: | ||
| 581 | return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x4, | ||
| 582 | IR::Opcode::CompositeInsertF32x4); | ||
| 583 | case IR::Opcode::CompositeExtractF16x2: | ||
| 584 | return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x2, | ||
| 585 | IR::Opcode::CompositeInsertF16x2); | ||
| 586 | case IR::Opcode::CompositeExtractF16x3: | ||
| 587 | return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x3, | ||
| 588 | IR::Opcode::CompositeInsertF16x3); | ||
| 589 | case IR::Opcode::CompositeExtractF16x4: | ||
| 590 | return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x4, | ||
| 591 | IR::Opcode::CompositeInsertF16x4); | ||
| 592 | case IR::Opcode::FSwizzleAdd: | ||
| 593 | return FoldFSwizzleAdd(block, inst); | ||
| 594 | default: | ||
| 595 | break; | ||
| 596 | } | ||
| 597 | } | ||
| 598 | } // Anonymous namespace | ||
| 599 | |||
| 600 | void ConstantPropagationPass(IR::Program& program) { | ||
| 601 | const auto end{program.post_order_blocks.rend()}; | ||
| 602 | for (auto it = program.post_order_blocks.rbegin(); it != end; ++it) { | ||
| 603 | IR::Block* const block{*it}; | ||
| 604 | for (IR::Inst& inst : block->Instructions()) { | ||
| 605 | ConstantPropagation(*block, inst); | ||
| 606 | } | ||
| 607 | } | ||
| 608 | } | ||
| 609 | |||
| 610 | } // namespace Shader::Optimization | ||
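The BitFieldInsert case above folds `(base & ~(~(~0u << bits) << offset)) | (insert << offset)`: clear a `bits`-wide field of `base` at `offset`, then OR the new field in. A standalone check of that formula with hypothetical operands (the fold itself rejects `bits >= 32` or `offset >= 32` before reaching it):

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the constant fold for BitFieldInsert above; assumes bits < 32 and
// offset < 32, as the pass enforces before evaluating.
std::uint32_t BitFieldInsert(std::uint32_t base, std::uint32_t insert,
                             std::uint32_t offset, std::uint32_t bits) {
    return (base & ~(~(~0u << bits) << offset)) | (insert << offset);
}

int main() {
    // Insert the 4-bit field 0xF at bit offset 8 of 0xAABBCCDD.
    assert(BitFieldInsert(0xAABBCCDDu, 0xFu, 8u, 4u) == 0xAABBCFDDu);
}
```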
diff --git a/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp b/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp new file mode 100644 index 000000000..400836301 --- /dev/null +++ b/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp | |||
| @@ -0,0 +1,26 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include "shader_recompiler/frontend/ir/basic_block.h" | ||
| 6 | #include "shader_recompiler/frontend/ir/value.h" | ||
| 7 | #include "shader_recompiler/ir_opt/passes.h" | ||
| 8 | |||
| 9 | namespace Shader::Optimization { | ||
| 10 | |||
| 11 | void DeadCodeEliminationPass(IR::Program& program) { | ||
| 12 | // We iterate over the instructions in reverse order. | ||
| 13 | // This is because removing an instruction reduces the number of uses for earlier instructions. | ||
| 14 | for (IR::Block* const block : program.post_order_blocks) { | ||
| 15 | auto it{block->end()}; | ||
| 16 | while (it != block->begin()) { | ||
| 17 | --it; | ||
| 18 | if (!it->HasUses() && !it->MayHaveSideEffects()) { | ||
| 19 | it->Invalidate(); | ||
| 20 | it = block->Instructions().erase(it); | ||
| 21 | } | ||
| 22 | } | ||
| 23 | } | ||
| 24 | } | ||
| 25 | |||
| 26 | } // namespace Shader::Optimization | ||
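The reverse walk matters: erasing an instruction decrements the use counts of its operands, so an entire dead chain disappears in a single backward sweep. A toy model of the same idea (standalone, not the real IR types):

```cpp
#include <cstdio>
#include <vector>

// Toy instruction: each entry lists which earlier entries it uses.
struct Inst {
    std::vector<int> args;
    int uses = 0;
    bool side_effects = false;
    bool alive = true;
};

int main() {
    // v0 = ...; v1 = f(v0); v2 = g(v1); and v2 is never used.
    std::vector<Inst> insts(3);
    insts[1].args = {0};
    insts[2].args = {1};
    for (const Inst& inst : insts) {
        for (int arg : inst.args) {
            ++insts[arg].uses;
        }
    }
    // Reverse sweep: killing v2 frees v1, which in turn frees v0.
    for (int i = static_cast<int>(insts.size()) - 1; i >= 0; --i) {
        if (insts[i].uses == 0 && !insts[i].side_effects) {
            insts[i].alive = false;
            for (int arg : insts[i].args) {
                --insts[arg].uses;
            }
        }
    }
    for (size_t i = 0; i < insts.size(); ++i) {
        std::printf("v%zu %s\n", i, insts[i].alive ? "alive" : "dead");
    }
}
```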
diff --git a/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp b/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp new file mode 100644 index 000000000..055ba9c54 --- /dev/null +++ b/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp | |||
| @@ -0,0 +1,30 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include "shader_recompiler/frontend/ir/ir_emitter.h" | ||
| 6 | #include "shader_recompiler/ir_opt/passes.h" | ||
| 7 | |||
| 8 | namespace Shader::Optimization { | ||
| 9 | |||
| 10 | void VertexATransformPass(IR::Program& program) { | ||
| 11 | for (IR::Block* const block : program.blocks) { | ||
| 12 | for (IR::Inst& inst : block->Instructions()) { | ||
| 13 | if (inst.GetOpcode() == IR::Opcode::Epilogue) { | ||
| 14 | return inst.Invalidate(); | ||
| 15 | } | ||
| 16 | } | ||
| 17 | } | ||
| 18 | } | ||
| 19 | |||
| 20 | void VertexBTransformPass(IR::Program& program) { | ||
| 21 | for (IR::Block* const block : program.blocks) { | ||
| 22 | for (IR::Inst& inst : block->Instructions()) { | ||
| 23 | if (inst.GetOpcode() == IR::Opcode::Prologue) { | ||
| 24 | return inst.Invalidate(); | ||
| 25 | } | ||
| 26 | } | ||
| 27 | } | ||
| 28 | } | ||
| 29 | |||
| 30 | } // namespace Shader::Optimization | ||
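Taken together, these two passes appear to prepare a pair of translated vertex programs for merging into a single dual-vertex shader: vertex A drops its Epilogue and vertex B its Prologue, so control can fall through from one into the other. A hypothetical call sequence (the wrapper name and the merge step are assumptions, not part of this commit):

```cpp
#include "shader_recompiler/frontend/ir/program.h"
#include "shader_recompiler/ir_opt/passes.h"

// Hypothetical helper, assuming two already-translated IR programs.
void PrepareDualVertex(Shader::IR::Program& vertex_a, Shader::IR::Program& vertex_b) {
    Shader::Optimization::VertexATransformPass(vertex_a); // strip A's Epilogue
    Shader::Optimization::VertexBTransformPass(vertex_b); // strip B's Prologue
    // A caller could now stitch A's blocks ahead of B's entry block.
}
```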
diff --git a/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp new file mode 100644 index 000000000..4197b0095 --- /dev/null +++ b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp | |||
| @@ -0,0 +1,526 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <compare> | ||
| 7 | #include <optional> | ||
| 8 | #include <queue> | ||
| 9 | |||
| 10 | #include <boost/container/flat_set.hpp> | ||
| 11 | #include <boost/container/small_vector.hpp> | ||
| 12 | |||
| 13 | #include "common/alignment.h" | ||
| 14 | #include "shader_recompiler/frontend/ir/basic_block.h" | ||
| 15 | #include "shader_recompiler/frontend/ir/breadth_first_search.h" | ||
| 16 | #include "shader_recompiler/frontend/ir/ir_emitter.h" | ||
| 17 | #include "shader_recompiler/frontend/ir/value.h" | ||
| 18 | #include "shader_recompiler/ir_opt/passes.h" | ||
| 19 | |||
| 20 | namespace Shader::Optimization { | ||
| 21 | namespace { | ||
| 22 | /// Address in the constant buffers of a storage buffer descriptor | ||
| 23 | struct StorageBufferAddr { | ||
| 24 | auto operator<=>(const StorageBufferAddr&) const noexcept = default; | ||
| 25 | |||
| 26 | u32 index; | ||
| 27 | u32 offset; | ||
| 28 | }; | ||
| 29 | |||
| 30 | /// Global memory instruction, the block that contains it, and the storage buffer it uses | ||
| 31 | struct StorageInst { | ||
| 32 | StorageBufferAddr storage_buffer; | ||
| 33 | IR::Inst* inst; | ||
| 34 | IR::Block* block; | ||
| 35 | }; | ||
| 36 | |||
| 37 | /// Bias towards a certain range of constant buffers when looking for storage buffers | ||
| 38 | struct Bias { | ||
| 39 | u32 index; | ||
| 40 | u32 offset_begin; | ||
| 41 | u32 offset_end; | ||
| 42 | }; | ||
| 43 | |||
| 44 | using boost::container::flat_set; | ||
| 45 | using boost::container::small_vector; | ||
| 46 | using StorageBufferSet = | ||
| 47 | flat_set<StorageBufferAddr, std::less<StorageBufferAddr>, small_vector<StorageBufferAddr, 16>>; | ||
| 48 | using StorageInstVector = small_vector<StorageInst, 24>; | ||
| 49 | using StorageWritesSet = | ||
| 50 | flat_set<StorageBufferAddr, std::less<StorageBufferAddr>, small_vector<StorageBufferAddr, 16>>; | ||
| 51 | |||
| 52 | struct StorageInfo { | ||
| 53 | StorageBufferSet set; | ||
| 54 | StorageInstVector to_replace; | ||
| 55 | StorageWritesSet writes; | ||
| 56 | }; | ||
| 57 | |||
| 58 | /// Returns true when the instruction is a global memory instruction | ||
| 59 | bool IsGlobalMemory(const IR::Inst& inst) { | ||
| 60 | switch (inst.GetOpcode()) { | ||
| 61 | case IR::Opcode::LoadGlobalS8: | ||
| 62 | case IR::Opcode::LoadGlobalU8: | ||
| 63 | case IR::Opcode::LoadGlobalS16: | ||
| 64 | case IR::Opcode::LoadGlobalU16: | ||
| 65 | case IR::Opcode::LoadGlobal32: | ||
| 66 | case IR::Opcode::LoadGlobal64: | ||
| 67 | case IR::Opcode::LoadGlobal128: | ||
| 68 | case IR::Opcode::WriteGlobalS8: | ||
| 69 | case IR::Opcode::WriteGlobalU8: | ||
| 70 | case IR::Opcode::WriteGlobalS16: | ||
| 71 | case IR::Opcode::WriteGlobalU16: | ||
| 72 | case IR::Opcode::WriteGlobal32: | ||
| 73 | case IR::Opcode::WriteGlobal64: | ||
| 74 | case IR::Opcode::WriteGlobal128: | ||
| 75 | case IR::Opcode::GlobalAtomicIAdd32: | ||
| 76 | case IR::Opcode::GlobalAtomicSMin32: | ||
| 77 | case IR::Opcode::GlobalAtomicUMin32: | ||
| 78 | case IR::Opcode::GlobalAtomicSMax32: | ||
| 79 | case IR::Opcode::GlobalAtomicUMax32: | ||
| 80 | case IR::Opcode::GlobalAtomicInc32: | ||
| 81 | case IR::Opcode::GlobalAtomicDec32: | ||
| 82 | case IR::Opcode::GlobalAtomicAnd32: | ||
| 83 | case IR::Opcode::GlobalAtomicOr32: | ||
| 84 | case IR::Opcode::GlobalAtomicXor32: | ||
| 85 | case IR::Opcode::GlobalAtomicExchange32: | ||
| 86 | case IR::Opcode::GlobalAtomicIAdd64: | ||
| 87 | case IR::Opcode::GlobalAtomicSMin64: | ||
| 88 | case IR::Opcode::GlobalAtomicUMin64: | ||
| 89 | case IR::Opcode::GlobalAtomicSMax64: | ||
| 90 | case IR::Opcode::GlobalAtomicUMax64: | ||
| 91 | case IR::Opcode::GlobalAtomicAnd64: | ||
| 92 | case IR::Opcode::GlobalAtomicOr64: | ||
| 93 | case IR::Opcode::GlobalAtomicXor64: | ||
| 94 | case IR::Opcode::GlobalAtomicExchange64: | ||
| 95 | case IR::Opcode::GlobalAtomicAddF32: | ||
| 96 | case IR::Opcode::GlobalAtomicAddF16x2: | ||
| 97 | case IR::Opcode::GlobalAtomicAddF32x2: | ||
| 98 | case IR::Opcode::GlobalAtomicMinF16x2: | ||
| 99 | case IR::Opcode::GlobalAtomicMinF32x2: | ||
| 100 | case IR::Opcode::GlobalAtomicMaxF16x2: | ||
| 101 | case IR::Opcode::GlobalAtomicMaxF32x2: | ||
| 102 | return true; | ||
| 103 | default: | ||
| 104 | return false; | ||
| 105 | } | ||
| 106 | } | ||
| 107 | |||
| 108 | /// Returns true when the instruction writes to global memory | ||
| 109 | bool IsGlobalMemoryWrite(const IR::Inst& inst) { | ||
| 110 | switch (inst.GetOpcode()) { | ||
| 111 | case IR::Opcode::WriteGlobalS8: | ||
| 112 | case IR::Opcode::WriteGlobalU8: | ||
| 113 | case IR::Opcode::WriteGlobalS16: | ||
| 114 | case IR::Opcode::WriteGlobalU16: | ||
| 115 | case IR::Opcode::WriteGlobal32: | ||
| 116 | case IR::Opcode::WriteGlobal64: | ||
| 117 | case IR::Opcode::WriteGlobal128: | ||
| 118 | case IR::Opcode::GlobalAtomicIAdd32: | ||
| 119 | case IR::Opcode::GlobalAtomicSMin32: | ||
| 120 | case IR::Opcode::GlobalAtomicUMin32: | ||
| 121 | case IR::Opcode::GlobalAtomicSMax32: | ||
| 122 | case IR::Opcode::GlobalAtomicUMax32: | ||
| 123 | case IR::Opcode::GlobalAtomicInc32: | ||
| 124 | case IR::Opcode::GlobalAtomicDec32: | ||
| 125 | case IR::Opcode::GlobalAtomicAnd32: | ||
| 126 | case IR::Opcode::GlobalAtomicOr32: | ||
| 127 | case IR::Opcode::GlobalAtomicXor32: | ||
| 128 | case IR::Opcode::GlobalAtomicExchange32: | ||
| 129 | case IR::Opcode::GlobalAtomicIAdd64: | ||
| 130 | case IR::Opcode::GlobalAtomicSMin64: | ||
| 131 | case IR::Opcode::GlobalAtomicUMin64: | ||
| 132 | case IR::Opcode::GlobalAtomicSMax64: | ||
| 133 | case IR::Opcode::GlobalAtomicUMax64: | ||
| 134 | case IR::Opcode::GlobalAtomicAnd64: | ||
| 135 | case IR::Opcode::GlobalAtomicOr64: | ||
| 136 | case IR::Opcode::GlobalAtomicXor64: | ||
| 137 | case IR::Opcode::GlobalAtomicExchange64: | ||
| 138 | case IR::Opcode::GlobalAtomicAddF32: | ||
| 139 | case IR::Opcode::GlobalAtomicAddF16x2: | ||
| 140 | case IR::Opcode::GlobalAtomicAddF32x2: | ||
| 141 | case IR::Opcode::GlobalAtomicMinF16x2: | ||
| 142 | case IR::Opcode::GlobalAtomicMinF32x2: | ||
| 143 | case IR::Opcode::GlobalAtomicMaxF16x2: | ||
| 144 | case IR::Opcode::GlobalAtomicMaxF32x2: | ||
| 145 | return true; | ||
| 146 | default: | ||
| 147 | return false; | ||
| 148 | } | ||
| 149 | } | ||
| 150 | |||
| 151 | /// Converts a global memory opcode to its storage buffer equivalent | ||
| 152 | IR::Opcode GlobalToStorage(IR::Opcode opcode) { | ||
| 153 | switch (opcode) { | ||
| 154 | case IR::Opcode::LoadGlobalS8: | ||
| 155 | return IR::Opcode::LoadStorageS8; | ||
| 156 | case IR::Opcode::LoadGlobalU8: | ||
| 157 | return IR::Opcode::LoadStorageU8; | ||
| 158 | case IR::Opcode::LoadGlobalS16: | ||
| 159 | return IR::Opcode::LoadStorageS16; | ||
| 160 | case IR::Opcode::LoadGlobalU16: | ||
| 161 | return IR::Opcode::LoadStorageU16; | ||
| 162 | case IR::Opcode::LoadGlobal32: | ||
| 163 | return IR::Opcode::LoadStorage32; | ||
| 164 | case IR::Opcode::LoadGlobal64: | ||
| 165 | return IR::Opcode::LoadStorage64; | ||
| 166 | case IR::Opcode::LoadGlobal128: | ||
| 167 | return IR::Opcode::LoadStorage128; | ||
| 168 | case IR::Opcode::WriteGlobalS8: | ||
| 169 | return IR::Opcode::WriteStorageS8; | ||
| 170 | case IR::Opcode::WriteGlobalU8: | ||
| 171 | return IR::Opcode::WriteStorageU8; | ||
| 172 | case IR::Opcode::WriteGlobalS16: | ||
| 173 | return IR::Opcode::WriteStorageS16; | ||
| 174 | case IR::Opcode::WriteGlobalU16: | ||
| 175 | return IR::Opcode::WriteStorageU16; | ||
| 176 | case IR::Opcode::WriteGlobal32: | ||
| 177 | return IR::Opcode::WriteStorage32; | ||
| 178 | case IR::Opcode::WriteGlobal64: | ||
| 179 | return IR::Opcode::WriteStorage64; | ||
| 180 | case IR::Opcode::WriteGlobal128: | ||
| 181 | return IR::Opcode::WriteStorage128; | ||
| 182 | case IR::Opcode::GlobalAtomicIAdd32: | ||
| 183 | return IR::Opcode::StorageAtomicIAdd32; | ||
| 184 | case IR::Opcode::GlobalAtomicSMin32: | ||
| 185 | return IR::Opcode::StorageAtomicSMin32; | ||
| 186 | case IR::Opcode::GlobalAtomicUMin32: | ||
| 187 | return IR::Opcode::StorageAtomicUMin32; | ||
| 188 | case IR::Opcode::GlobalAtomicSMax32: | ||
| 189 | return IR::Opcode::StorageAtomicSMax32; | ||
| 190 | case IR::Opcode::GlobalAtomicUMax32: | ||
| 191 | return IR::Opcode::StorageAtomicUMax32; | ||
| 192 | case IR::Opcode::GlobalAtomicInc32: | ||
| 193 | return IR::Opcode::StorageAtomicInc32; | ||
| 194 | case IR::Opcode::GlobalAtomicDec32: | ||
| 195 | return IR::Opcode::StorageAtomicDec32; | ||
| 196 | case IR::Opcode::GlobalAtomicAnd32: | ||
| 197 | return IR::Opcode::StorageAtomicAnd32; | ||
| 198 | case IR::Opcode::GlobalAtomicOr32: | ||
| 199 | return IR::Opcode::StorageAtomicOr32; | ||
| 200 | case IR::Opcode::GlobalAtomicXor32: | ||
| 201 | return IR::Opcode::StorageAtomicXor32; | ||
| 202 | case IR::Opcode::GlobalAtomicIAdd64: | ||
| 203 | return IR::Opcode::StorageAtomicIAdd64; | ||
| 204 | case IR::Opcode::GlobalAtomicSMin64: | ||
| 205 | return IR::Opcode::StorageAtomicSMin64; | ||
| 206 | case IR::Opcode::GlobalAtomicUMin64: | ||
| 207 | return IR::Opcode::StorageAtomicUMin64; | ||
| 208 | case IR::Opcode::GlobalAtomicSMax64: | ||
| 209 | return IR::Opcode::StorageAtomicSMax64; | ||
| 210 | case IR::Opcode::GlobalAtomicUMax64: | ||
| 211 | return IR::Opcode::StorageAtomicUMax64; | ||
| 212 | case IR::Opcode::GlobalAtomicAnd64: | ||
| 213 | return IR::Opcode::StorageAtomicAnd64; | ||
| 214 | case IR::Opcode::GlobalAtomicOr64: | ||
| 215 | return IR::Opcode::StorageAtomicOr64; | ||
| 216 | case IR::Opcode::GlobalAtomicXor64: | ||
| 217 | return IR::Opcode::StorageAtomicXor64; | ||
| 218 | case IR::Opcode::GlobalAtomicExchange32: | ||
| 219 | return IR::Opcode::StorageAtomicExchange32; | ||
| 220 | case IR::Opcode::GlobalAtomicExchange64: | ||
| 221 | return IR::Opcode::StorageAtomicExchange64; | ||
| 222 | case IR::Opcode::GlobalAtomicAddF32: | ||
| 223 | return IR::Opcode::StorageAtomicAddF32; | ||
| 224 | case IR::Opcode::GlobalAtomicAddF16x2: | ||
| 225 | return IR::Opcode::StorageAtomicAddF16x2; | ||
| 226 | case IR::Opcode::GlobalAtomicMinF16x2: | ||
| 227 | return IR::Opcode::StorageAtomicMinF16x2; | ||
| 228 | case IR::Opcode::GlobalAtomicMaxF16x2: | ||
| 229 | return IR::Opcode::StorageAtomicMaxF16x2; | ||
| 230 | case IR::Opcode::GlobalAtomicAddF32x2: | ||
| 231 | return IR::Opcode::StorageAtomicAddF32x2; | ||
| 232 | case IR::Opcode::GlobalAtomicMinF32x2: | ||
| 233 | return IR::Opcode::StorageAtomicMinF32x2; | ||
| 234 | case IR::Opcode::GlobalAtomicMaxF32x2: | ||
| 235 | return IR::Opcode::StorageAtomicMaxF32x2; | ||
| 236 | default: | ||
| 237 | throw InvalidArgument("Invalid global memory opcode {}", opcode); | ||
| 238 | } | ||
| 239 | } | ||
| 240 | |||
| 241 | /// Returns true when a storage buffer address satisfies a bias | ||
| 242 | bool MeetsBias(const StorageBufferAddr& storage_buffer, const Bias& bias) noexcept { | ||
| 243 | return storage_buffer.index == bias.index && storage_buffer.offset >= bias.offset_begin && | ||
| 244 | storage_buffer.offset < bias.offset_end; | ||
| 245 | } | ||
| 246 | |||
| 247 | struct LowAddrInfo { | ||
| 248 | IR::U32 value; | ||
| 249 | s32 imm_offset; | ||
| 250 | }; | ||
| 251 | |||
| 252 | /// Tries to track the low 32 bits of a global memory instruction's address | ||
| 253 | std::optional<LowAddrInfo> TrackLowAddress(IR::Inst* inst) { | ||
| 254 | // The first argument is the raw 64-bit GPU pointer of the global memory instruction | ||
| 255 | const IR::Value addr{inst->Arg(0)}; | ||
| 256 | if (addr.IsImmediate()) { | ||
| 257 | // Not much we can do if it's an immediate | ||
| 258 | return std::nullopt; | ||
| 259 | } | ||
| 260 | // This address is expected to be either a PackUint2x32, an IAdd64, or a CompositeConstructU32x2 | ||
| 261 | IR::Inst* addr_inst{addr.InstRecursive()}; | ||
| 262 | s32 imm_offset{0}; | ||
| 263 | if (addr_inst->GetOpcode() == IR::Opcode::IAdd64) { | ||
| 264 | // If it's an IAdd64, get the immediate offset it is applying and grab the address | ||
| 265 | // instruction. This expects the instruction to be canonicalized, with the address in | ||
| 266 | // the first argument and the immediate offset in the second. | ||
| 267 | const IR::U64 imm_offset_value{addr_inst->Arg(1)}; | ||
| 268 | if (!imm_offset_value.IsImmediate()) { | ||
| 269 | return std::nullopt; | ||
| 270 | } | ||
| 271 | imm_offset = static_cast<s32>(static_cast<s64>(imm_offset_value.U64())); | ||
| 272 | const IR::U64 iadd_addr{addr_inst->Arg(0)}; | ||
| 273 | if (iadd_addr.IsImmediate()) { | ||
| 274 | return std::nullopt; | ||
| 275 | } | ||
| 276 | addr_inst = iadd_addr.InstRecursive(); | ||
| 277 | } | ||
| 278 | // With IAdd64 handled, a PackUint2x32 is expected next | ||
| 279 | if (addr_inst->GetOpcode() == IR::Opcode::PackUint2x32) { | ||
| 280 | // PackUint2x32 is expected to be generated from a vector | ||
| 281 | const IR::Value vector{addr_inst->Arg(0)}; | ||
| 282 | if (vector.IsImmediate()) { | ||
| 283 | return std::nullopt; | ||
| 284 | } | ||
| 285 | addr_inst = vector.InstRecursive(); | ||
| 286 | } | ||
| 287 | // The vector is expected to be a CompositeConstructU32x2 | ||
| 288 | if (addr_inst->GetOpcode() != IR::Opcode::CompositeConstructU32x2) { | ||
| 289 | return std::nullopt; | ||
| 290 | } | ||
| 291 | // Grab the first argument from the CompositeConstructU32x2; this is the low address. | ||
| 292 | return LowAddrInfo{ | ||
| 293 | .value{IR::U32{addr_inst->Arg(0)}}, | ||
| 294 | .imm_offset = imm_offset, | ||
| 295 | }; | ||
| 296 | } | ||
| 297 | |||
| 298 | /// Tries to track the storage buffer address used by a global memory instruction | ||
| 299 | std::optional<StorageBufferAddr> Track(const IR::Value& value, const Bias* bias) { | ||
| 300 | const auto pred{[bias](const IR::Inst* inst) -> std::optional<StorageBufferAddr> { | ||
| 301 | if (inst->GetOpcode() != IR::Opcode::GetCbufU32) { | ||
| 302 | return std::nullopt; | ||
| 303 | } | ||
| 304 | const IR::Value index{inst->Arg(0)}; | ||
| 305 | const IR::Value offset{inst->Arg(1)}; | ||
| 306 | if (!index.IsImmediate()) { | ||
| 307 | // Definitely not a storage buffer if it's read from a | ||
| 308 | // non-immediate index | ||
| 309 | return std::nullopt; | ||
| 310 | } | ||
| 311 | if (!offset.IsImmediate()) { | ||
| 312 | // TODO: Support SSBO arrays | ||
| 313 | return std::nullopt; | ||
| 314 | } | ||
| 315 | const StorageBufferAddr storage_buffer{ | ||
| 316 | .index = index.U32(), | ||
| 317 | .offset = offset.U32(), | ||
| 318 | }; | ||
| 319 | if (!Common::IsAligned(storage_buffer.offset, 16)) { | ||
| 320 | // The SSBO pointer has to be aligned | ||
| 321 | return std::nullopt; | ||
| 322 | } | ||
| 323 | if (bias && !MeetsBias(storage_buffer, *bias)) { | ||
| 324 | // When a bias is given, reject addresses outside of it to | ||
| 325 | // avoid false positives | ||
| 326 | return std::nullopt; | ||
| 327 | } | ||
| 328 | return storage_buffer; | ||
| 329 | }}; | ||
| 330 | return BreadthFirstSearch(value, pred); | ||
| 331 | } | ||
| 332 | |||
| 333 | /// Collects the storage buffer used by a global memory instruction and the instruction itself | ||
| 334 | void CollectStorageBuffers(IR::Block& block, IR::Inst& inst, StorageInfo& info) { | ||
| 335 | // NVN puts storage buffers in a specific range; we have to bias towards these addresses to | ||
| 336 | // avoid getting false positives | ||
| 337 | static constexpr Bias nvn_bias{ | ||
| 338 | .index = 0, | ||
| 339 | .offset_begin = 0x110, | ||
| 340 | .offset_end = 0x610, | ||
| 341 | }; | ||
| 342 | // Track the low address of the instruction | ||
| 343 | const std::optional<LowAddrInfo> low_addr_info{TrackLowAddress(&inst)}; | ||
| 344 | if (!low_addr_info) { | ||
| 345 | // Failed to track the low address, use NVN fallbacks | ||
| 346 | return; | ||
| 347 | } | ||
| 348 | // First try to find storage buffers in the NVN address range | ||
| 349 | const IR::U32 low_addr{low_addr_info->value}; | ||
| 350 | std::optional<StorageBufferAddr> storage_buffer{Track(low_addr, &nvn_bias)}; | ||
| 351 | if (!storage_buffer) { | ||
| 352 | // If it fails, track without a bias | ||
| 353 | storage_buffer = Track(low_addr, nullptr); | ||
| 354 | if (!storage_buffer) { | ||
| 355 | // If that also fails, use NVN fallbacks | ||
| 356 | return; | ||
| 357 | } | ||
| 358 | } | ||
| 359 | // Collect storage buffer and the instruction | ||
| 360 | if (IsGlobalMemoryWrite(inst)) { | ||
| 361 | info.writes.insert(*storage_buffer); | ||
| 362 | } | ||
| 363 | info.set.insert(*storage_buffer); | ||
| 364 | info.to_replace.push_back(StorageInst{ | ||
| 365 | .storage_buffer{*storage_buffer}, | ||
| 366 | .inst = &inst, | ||
| 367 | .block = &block, | ||
| 368 | }); | ||
| 369 | } | ||
| 370 | |||
| 371 | /// Returns the offset in bytes for an equivalent storage instruction | ||
| 372 | IR::U32 StorageOffset(IR::Block& block, IR::Inst& inst, StorageBufferAddr buffer) { | ||
| 373 | IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; | ||
| 374 | IR::U32 offset; | ||
| 375 | if (const std::optional<LowAddrInfo> low_addr{TrackLowAddress(&inst)}) { | ||
| 376 | offset = low_addr->value; | ||
| 377 | if (low_addr->imm_offset != 0) { | ||
| 378 | offset = ir.IAdd(offset, ir.Imm32(low_addr->imm_offset)); | ||
| 379 | } | ||
| 380 | } else { | ||
| 381 | offset = ir.UConvert(32, IR::U64{inst.Arg(0)}); | ||
| 382 | } | ||
| 383 | // Subtract the low 32 bits of the storage buffer's GPU address (read back from the | ||
| 384 | // constant buffer) from the guest address. The result is the storage buffer offset in bytes. | ||
| 385 | const IR::U32 low_cbuf{ir.GetCbuf(ir.Imm32(buffer.index), ir.Imm32(buffer.offset))}; | ||
| 386 | return ir.ISub(offset, low_cbuf); | ||
| 387 | } | ||
| 388 | |||
| 389 | /// Replace a global memory load instruction with its storage buffer equivalent | ||
| 390 | void ReplaceLoad(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index, | ||
| 391 | const IR::U32& offset) { | ||
| 392 | const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())}; | ||
| 393 | const auto it{IR::Block::InstructionList::s_iterator_to(inst)}; | ||
| 394 | const IR::Value value{&*block.PrependNewInst(it, new_opcode, {storage_index, offset})}; | ||
| 395 | inst.ReplaceUsesWith(value); | ||
| 396 | } | ||
| 397 | |||
| 398 | /// Replace a global memory write instruction with its storage buffer equivalent | ||
| 399 | void ReplaceWrite(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index, | ||
| 400 | const IR::U32& offset) { | ||
| 401 | const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())}; | ||
| 402 | const auto it{IR::Block::InstructionList::s_iterator_to(inst)}; | ||
| 403 | block.PrependNewInst(it, new_opcode, {storage_index, offset, inst.Arg(1)}); | ||
| 404 | inst.Invalidate(); | ||
| 405 | } | ||
| 406 | |||
| 407 | /// Replace a global memory atomic instruction with its storage buffer equivalent | ||
| 408 | void ReplaceAtomic(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index, | ||
| 409 | const IR::U32& offset) { | ||
| 410 | const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())}; | ||
| 411 | const auto it{IR::Block::InstructionList::s_iterator_to(inst)}; | ||
| 412 | const IR::Value value{ | ||
| 413 | &*block.PrependNewInst(it, new_opcode, {storage_index, offset, inst.Arg(1)})}; | ||
| 414 | inst.ReplaceUsesWith(value); | ||
| 415 | } | ||
| 416 | |||
| 417 | /// Replace a global memory instruction with its storage buffer equivalent | ||
| 418 | void Replace(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index, | ||
| 419 | const IR::U32& offset) { | ||
| 420 | switch (inst.GetOpcode()) { | ||
| 421 | case IR::Opcode::LoadGlobalS8: | ||
| 422 | case IR::Opcode::LoadGlobalU8: | ||
| 423 | case IR::Opcode::LoadGlobalS16: | ||
| 424 | case IR::Opcode::LoadGlobalU16: | ||
| 425 | case IR::Opcode::LoadGlobal32: | ||
| 426 | case IR::Opcode::LoadGlobal64: | ||
| 427 | case IR::Opcode::LoadGlobal128: | ||
| 428 | return ReplaceLoad(block, inst, storage_index, offset); | ||
| 429 | case IR::Opcode::WriteGlobalS8: | ||
| 430 | case IR::Opcode::WriteGlobalU8: | ||
| 431 | case IR::Opcode::WriteGlobalS16: | ||
| 432 | case IR::Opcode::WriteGlobalU16: | ||
| 433 | case IR::Opcode::WriteGlobal32: | ||
| 434 | case IR::Opcode::WriteGlobal64: | ||
| 435 | case IR::Opcode::WriteGlobal128: | ||
| 436 | return ReplaceWrite(block, inst, storage_index, offset); | ||
| 437 | case IR::Opcode::GlobalAtomicIAdd32: | ||
| 438 | case IR::Opcode::GlobalAtomicSMin32: | ||
| 439 | case IR::Opcode::GlobalAtomicUMin32: | ||
| 440 | case IR::Opcode::GlobalAtomicSMax32: | ||
| 441 | case IR::Opcode::GlobalAtomicUMax32: | ||
| 442 | case IR::Opcode::GlobalAtomicInc32: | ||
| 443 | case IR::Opcode::GlobalAtomicDec32: | ||
| 444 | case IR::Opcode::GlobalAtomicAnd32: | ||
| 445 | case IR::Opcode::GlobalAtomicOr32: | ||
| 446 | case IR::Opcode::GlobalAtomicXor32: | ||
| 447 | case IR::Opcode::GlobalAtomicExchange32: | ||
| 448 | case IR::Opcode::GlobalAtomicIAdd64: | ||
| 449 | case IR::Opcode::GlobalAtomicSMin64: | ||
| 450 | case IR::Opcode::GlobalAtomicUMin64: | ||
| 451 | case IR::Opcode::GlobalAtomicSMax64: | ||
| 452 | case IR::Opcode::GlobalAtomicUMax64: | ||
| 453 | case IR::Opcode::GlobalAtomicAnd64: | ||
| 454 | case IR::Opcode::GlobalAtomicOr64: | ||
| 455 | case IR::Opcode::GlobalAtomicXor64: | ||
| 456 | case IR::Opcode::GlobalAtomicExchange64: | ||
| 457 | case IR::Opcode::GlobalAtomicAddF32: | ||
| 458 | case IR::Opcode::GlobalAtomicAddF16x2: | ||
| 459 | case IR::Opcode::GlobalAtomicAddF32x2: | ||
| 460 | case IR::Opcode::GlobalAtomicMinF16x2: | ||
| 461 | case IR::Opcode::GlobalAtomicMinF32x2: | ||
| 462 | case IR::Opcode::GlobalAtomicMaxF16x2: | ||
| 463 | case IR::Opcode::GlobalAtomicMaxF32x2: | ||
| 464 | return ReplaceAtomic(block, inst, storage_index, offset); | ||
| 465 | default: | ||
| 466 | throw InvalidArgument("Invalid global memory opcode {}", inst.GetOpcode()); | ||
| 467 | } | ||
| 468 | } | ||
| 469 | } // Anonymous namespace | ||
| 470 | |||
| 471 | void GlobalMemoryToStorageBufferPass(IR::Program& program) { | ||
| 472 | StorageInfo info; | ||
| 473 | for (IR::Block* const block : program.post_order_blocks) { | ||
| 474 | for (IR::Inst& inst : block->Instructions()) { | ||
| 475 | if (!IsGlobalMemory(inst)) { | ||
| 476 | continue; | ||
| 477 | } | ||
| 478 | CollectStorageBuffers(*block, inst, info); | ||
| 479 | } | ||
| 480 | } | ||
| 481 | for (const StorageBufferAddr& storage_buffer : info.set) { | ||
| 482 | program.info.storage_buffers_descriptors.push_back({ | ||
| 483 | .cbuf_index = storage_buffer.index, | ||
| 484 | .cbuf_offset = storage_buffer.offset, | ||
| 485 | .count = 1, | ||
| 486 | .is_written = info.writes.contains(storage_buffer), | ||
| 487 | }); | ||
| 488 | } | ||
| 489 | for (const StorageInst& storage_inst : info.to_replace) { | ||
| 490 | const StorageBufferAddr storage_buffer{storage_inst.storage_buffer}; | ||
| 491 | const auto it{info.set.find(storage_inst.storage_buffer)}; | ||
| 492 | const IR::U32 index{IR::Value{static_cast<u32>(info.set.index_of(it))}}; | ||
| 493 | IR::Block* const block{storage_inst.block}; | ||
| 494 | IR::Inst* const inst{storage_inst.inst}; | ||
| 495 | const IR::U32 offset{StorageOffset(*block, *inst, storage_buffer)}; | ||
| 496 | Replace(*block, *inst, index, offset); | ||
| 497 | } | ||
| 498 | } | ||
| 499 | |||
| 500 | template <typename Descriptors, typename Descriptor, typename Func> | ||
| 501 | static u32 Add(Descriptors& descriptors, const Descriptor& desc, Func&& pred) { | ||
| 502 | // TODO: Handle arrays | ||
| 503 | const auto it{std::ranges::find_if(descriptors, pred)}; | ||
| 504 | if (it != descriptors.end()) { | ||
| 505 | return static_cast<u32>(std::distance(descriptors.begin(), it)); | ||
| 506 | } | ||
| 507 | descriptors.push_back(desc); | ||
| 508 | return static_cast<u32>(descriptors.size()) - 1; | ||
| 509 | } | ||
| 510 | |||
| 511 | void JoinStorageInfo(Info& base, Info& source) { | ||
| 512 | auto& descriptors = base.storage_buffers_descriptors; | ||
| 513 | for (auto& desc : source.storage_buffers_descriptors) { | ||
| 514 | auto it{std::ranges::find_if(descriptors, [&desc](const auto& existing) { | ||
| 515 | return desc.cbuf_index == existing.cbuf_index && | ||
| 516 | desc.cbuf_offset == existing.cbuf_offset && desc.count == existing.count; | ||
| 517 | })}; | ||
| 518 | if (it != descriptors.end()) { | ||
| 519 | it->is_written |= desc.is_written; | ||
| 520 | continue; | ||
| 521 | } | ||
| 522 | descriptors.push_back(desc); | ||
| 523 | } | ||
| 524 | } | ||
| 525 | |||
| 526 | } // namespace Shader::Optimization | ||
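The rewritten accesses index the storage buffer by `guest_address_low32 - ssbo_base_low32`, with the base pointer read back from the same constant buffer slot the descriptor was found in. The arithmetic, with hypothetical addresses (standalone check):

```cpp
#include <cassert>
#include <cstdint>

int main() {
    // Hypothetical values: the SSBO descriptor at c0[0x110] holds the
    // buffer's 64-bit GPU address; a load touches base + 0x30.
    const std::uint64_t ssbo_base = 0x0000'0001'2000'0000ULL;
    const std::uint64_t guest_addr = ssbo_base + 0x30;

    // What the pass emits: low 32 bits of the guest address minus the low
    // 32 bits of the base pointer, giving a byte offset into the buffer.
    const std::uint32_t offset =
        static_cast<std::uint32_t>(guest_addr) - static_cast<std::uint32_t>(ssbo_base);
    assert(offset == 0x30);
}
```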
diff --git a/src/shader_recompiler/ir_opt/identity_removal_pass.cpp b/src/shader_recompiler/ir_opt/identity_removal_pass.cpp new file mode 100644 index 000000000..e9b55f835 --- /dev/null +++ b/src/shader_recompiler/ir_opt/identity_removal_pass.cpp | |||
| @@ -0,0 +1,38 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <vector> | ||
| 6 | |||
| 7 | #include "shader_recompiler/frontend/ir/basic_block.h" | ||
| 8 | #include "shader_recompiler/frontend/ir/value.h" | ||
| 9 | #include "shader_recompiler/ir_opt/passes.h" | ||
| 10 | |||
| 11 | namespace Shader::Optimization { | ||
| 12 | |||
| 13 | void IdentityRemovalPass(IR::Program& program) { | ||
| 14 | std::vector<IR::Inst*> to_invalidate; | ||
| 15 | for (IR::Block* const block : program.blocks) { | ||
| 16 | for (auto inst = block->begin(); inst != block->end();) { | ||
| 17 | const size_t num_args{inst->NumArgs()}; | ||
| 18 | for (size_t i = 0; i < num_args; ++i) { | ||
| 19 | IR::Value arg; | ||
| 20 | while ((arg = inst->Arg(i)).IsIdentity()) { | ||
| 21 | inst->SetArg(i, arg.Inst()->Arg(0)); | ||
| 22 | } | ||
| 23 | } | ||
| 24 | if (inst->GetOpcode() == IR::Opcode::Identity || | ||
| 25 | inst->GetOpcode() == IR::Opcode::Void) { | ||
| 26 | to_invalidate.push_back(&*inst); | ||
| 27 | inst = block->Instructions().erase(inst); | ||
| 28 | } else { | ||
| 29 | ++inst; | ||
| 30 | } | ||
| 31 | } | ||
| 32 | } | ||
| 33 | for (IR::Inst* const inst : to_invalidate) { | ||
| 34 | inst->Invalidate(); | ||
| 35 | } | ||
| 36 | } | ||
| 37 | |||
| 38 | } // namespace Shader::Optimization | ||
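The inner `while` collapses whole identity chains before any Identity instruction is erased, so an argument like `Identity(Identity(x))` is rewritten directly to `x`. A toy version of that chain-following loop (standalone, not the real IR types):

```cpp
#include <cassert>

// Toy value node: an identity simply forwards its operand.
struct Node {
    bool is_identity = false;
    Node* operand = nullptr;
};

Node* CollapseIdentities(Node* arg) {
    // Mirrors `while ((arg = inst->Arg(i)).IsIdentity()) ...` above.
    while (arg->is_identity) {
        arg = arg->operand;
    }
    return arg;
}

int main() {
    Node x{};
    Node id1{true, &x};
    Node id2{true, &id1};
    assert(CollapseIdentities(&id2) == &x);
}
```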
diff --git a/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp b/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp new file mode 100644 index 000000000..773e1f961 --- /dev/null +++ b/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp | |||
| @@ -0,0 +1,143 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | |||
| 7 | #include "shader_recompiler/frontend/ir/ir_emitter.h" | ||
| 8 | #include "shader_recompiler/frontend/ir/value.h" | ||
| 9 | #include "shader_recompiler/ir_opt/passes.h" | ||
| 10 | |||
| 11 | namespace Shader::Optimization { | ||
| 12 | namespace { | ||
| 13 | IR::Opcode Replace(IR::Opcode op) { | ||
| 14 | switch (op) { | ||
| 15 | case IR::Opcode::FPAbs16: | ||
| 16 | return IR::Opcode::FPAbs32; | ||
| 17 | case IR::Opcode::FPAdd16: | ||
| 18 | return IR::Opcode::FPAdd32; | ||
| 19 | case IR::Opcode::FPCeil16: | ||
| 20 | return IR::Opcode::FPCeil32; | ||
| 21 | case IR::Opcode::FPFloor16: | ||
| 22 | return IR::Opcode::FPFloor32; | ||
| 23 | case IR::Opcode::FPFma16: | ||
| 24 | return IR::Opcode::FPFma32; | ||
| 25 | case IR::Opcode::FPMul16: | ||
| 26 | return IR::Opcode::FPMul32; | ||
| 27 | case IR::Opcode::FPNeg16: | ||
| 28 | return IR::Opcode::FPNeg32; | ||
| 29 | case IR::Opcode::FPRoundEven16: | ||
| 30 | return IR::Opcode::FPRoundEven32; | ||
| 31 | case IR::Opcode::FPSaturate16: | ||
| 32 | return IR::Opcode::FPSaturate32; | ||
| 33 | case IR::Opcode::FPClamp16: | ||
| 34 | return IR::Opcode::FPClamp32; | ||
| 35 | case IR::Opcode::FPTrunc16: | ||
| 36 | return IR::Opcode::FPTrunc32; | ||
| 37 | case IR::Opcode::CompositeConstructF16x2: | ||
| 38 | return IR::Opcode::CompositeConstructF32x2; | ||
| 39 | case IR::Opcode::CompositeConstructF16x3: | ||
| 40 | return IR::Opcode::CompositeConstructF32x3; | ||
| 41 | case IR::Opcode::CompositeConstructF16x4: | ||
| 42 | return IR::Opcode::CompositeConstructF32x4; | ||
| 43 | case IR::Opcode::CompositeExtractF16x2: | ||
| 44 | return IR::Opcode::CompositeExtractF32x2; | ||
| 45 | case IR::Opcode::CompositeExtractF16x3: | ||
| 46 | return IR::Opcode::CompositeExtractF32x3; | ||
| 47 | case IR::Opcode::CompositeExtractF16x4: | ||
| 48 | return IR::Opcode::CompositeExtractF32x4; | ||
| 49 | case IR::Opcode::CompositeInsertF16x2: | ||
| 50 | return IR::Opcode::CompositeInsertF32x2; | ||
| 51 | case IR::Opcode::CompositeInsertF16x3: | ||
| 52 | return IR::Opcode::CompositeInsertF32x3; | ||
| 53 | case IR::Opcode::CompositeInsertF16x4: | ||
| 54 | return IR::Opcode::CompositeInsertF32x4; | ||
| 55 | case IR::Opcode::FPOrdEqual16: | ||
| 56 | return IR::Opcode::FPOrdEqual32; | ||
| 57 | case IR::Opcode::FPUnordEqual16: | ||
| 58 | return IR::Opcode::FPUnordEqual32; | ||
| 59 | case IR::Opcode::FPOrdNotEqual16: | ||
| 60 | return IR::Opcode::FPOrdNotEqual32; | ||
| 61 | case IR::Opcode::FPUnordNotEqual16: | ||
| 62 | return IR::Opcode::FPUnordNotEqual32; | ||
| 63 | case IR::Opcode::FPOrdLessThan16: | ||
| 64 | return IR::Opcode::FPOrdLessThan32; | ||
| 65 | case IR::Opcode::FPUnordLessThan16: | ||
| 66 | return IR::Opcode::FPUnordLessThan32; | ||
| 67 | case IR::Opcode::FPOrdGreaterThan16: | ||
| 68 | return IR::Opcode::FPOrdGreaterThan32; | ||
| 69 | case IR::Opcode::FPUnordGreaterThan16: | ||
| 70 | return IR::Opcode::FPUnordGreaterThan32; | ||
| 71 | case IR::Opcode::FPOrdLessThanEqual16: | ||
| 72 | return IR::Opcode::FPOrdLessThanEqual32; | ||
| 73 | case IR::Opcode::FPUnordLessThanEqual16: | ||
| 74 | return IR::Opcode::FPUnordLessThanEqual32; | ||
| 75 | case IR::Opcode::FPOrdGreaterThanEqual16: | ||
| 76 | return IR::Opcode::FPOrdGreaterThanEqual32; | ||
| 77 | case IR::Opcode::FPUnordGreaterThanEqual16: | ||
| 78 | return IR::Opcode::FPUnordGreaterThanEqual32; | ||
| 79 | case IR::Opcode::FPIsNan16: | ||
| 80 | return IR::Opcode::FPIsNan32; | ||
| 81 | case IR::Opcode::ConvertS16F16: | ||
| 82 | return IR::Opcode::ConvertS16F32; | ||
| 83 | case IR::Opcode::ConvertS32F16: | ||
| 84 | return IR::Opcode::ConvertS32F32; | ||
| 85 | case IR::Opcode::ConvertS64F16: | ||
| 86 | return IR::Opcode::ConvertS64F32; | ||
| 87 | case IR::Opcode::ConvertU16F16: | ||
| 88 | return IR::Opcode::ConvertU16F32; | ||
| 89 | case IR::Opcode::ConvertU32F16: | ||
| 90 | return IR::Opcode::ConvertU32F32; | ||
| 91 | case IR::Opcode::ConvertU64F16: | ||
| 92 | return IR::Opcode::ConvertU64F32; | ||
| 93 | case IR::Opcode::PackFloat2x16: | ||
| 94 | return IR::Opcode::PackHalf2x16; | ||
| 95 | case IR::Opcode::UnpackFloat2x16: | ||
| 96 | return IR::Opcode::UnpackHalf2x16; | ||
| 97 | case IR::Opcode::ConvertF32F16: | ||
| 98 | return IR::Opcode::Identity; | ||
| 99 | case IR::Opcode::ConvertF16F32: | ||
| 100 | return IR::Opcode::Identity; | ||
| 101 | case IR::Opcode::ConvertF16S8: | ||
| 102 | return IR::Opcode::ConvertF32S8; | ||
| 103 | case IR::Opcode::ConvertF16S16: | ||
| 104 | return IR::Opcode::ConvertF32S16; | ||
| 105 | case IR::Opcode::ConvertF16S32: | ||
| 106 | return IR::Opcode::ConvertF32S32; | ||
| 107 | case IR::Opcode::ConvertF16S64: | ||
| 108 | return IR::Opcode::ConvertF32S64; | ||
| 109 | case IR::Opcode::ConvertF16U8: | ||
| 110 | return IR::Opcode::ConvertF32U8; | ||
| 111 | case IR::Opcode::ConvertF16U16: | ||
| 112 | return IR::Opcode::ConvertF32U16; | ||
| 113 | case IR::Opcode::ConvertF16U32: | ||
| 114 | return IR::Opcode::ConvertF32U32; | ||
| 115 | case IR::Opcode::ConvertF16U64: | ||
| 116 | return IR::Opcode::ConvertF32U64; | ||
| 117 | case IR::Opcode::GlobalAtomicAddF16x2: | ||
| 118 | return IR::Opcode::GlobalAtomicAddF32x2; | ||
| 119 | case IR::Opcode::StorageAtomicAddF16x2: | ||
| 120 | return IR::Opcode::StorageAtomicAddF32x2; | ||
| 121 | case IR::Opcode::GlobalAtomicMinF16x2: | ||
| 122 | return IR::Opcode::GlobalAtomicMinF32x2; | ||
| 123 | case IR::Opcode::StorageAtomicMinF16x2: | ||
| 124 | return IR::Opcode::StorageAtomicMinF32x2; | ||
| 125 | case IR::Opcode::GlobalAtomicMaxF16x2: | ||
| 126 | return IR::Opcode::GlobalAtomicMaxF32x2; | ||
| 127 | case IR::Opcode::StorageAtomicMaxF16x2: | ||
| 128 | return IR::Opcode::StorageAtomicMaxF32x2; | ||
| 129 | default: | ||
| 130 | return op; | ||
| 131 | } | ||
| 132 | } | ||
| 133 | } // Anonymous namespace | ||
| 134 | |||
| 135 | void LowerFp16ToFp32(IR::Program& program) { | ||
| 136 | for (IR::Block* const block : program.blocks) { | ||
| 137 | for (IR::Inst& inst : block->Instructions()) { | ||
| 138 | inst.ReplaceOpcode(Replace(inst.GetOpcode())); | ||
| 139 | } | ||
| 140 | } | ||
| 141 | } | ||
| 142 | |||
| 143 | } // namespace Shader::Optimization | ||
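A minimal mirror of the table above, showing its two kinds of entries: f16 arithmetic widens to the f32 counterpart, while f16<->f32 conversions collapse to Identity because every former f16 value is now carried as f32 (the enum here is a standalone stand-in, not the real IR::Opcode):

```cpp
#include <cassert>

enum class Op { FPAdd16, FPAdd32, ConvertF32F16, Identity };

// Sketch of a slice of Replace() above.
Op Lower(Op op) {
    switch (op) {
    case Op::FPAdd16:
        return Op::FPAdd32;      // widen f16 arithmetic to f32
    case Op::ConvertF32F16:
        return Op::Identity;     // the value is already an f32 after lowering
    default:
        return op;
    }
}

int main() {
    assert(Lower(Op::FPAdd16) == Op::FPAdd32);
    assert(Lower(Op::ConvertF32F16) == Op::Identity);
}
```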
diff --git a/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp b/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp new file mode 100644 index 000000000..e80d3d1d9 --- /dev/null +++ b/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp | |||
| @@ -0,0 +1,218 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <utility> | ||
| 6 | |||
| 7 | #include "shader_recompiler/exception.h" | ||
| 8 | #include "shader_recompiler/frontend/ir/basic_block.h" | ||
| 9 | #include "shader_recompiler/frontend/ir/ir_emitter.h" | ||
| 10 | #include "shader_recompiler/frontend/ir/program.h" | ||
| 11 | #include "shader_recompiler/frontend/ir/value.h" | ||
| 12 | #include "shader_recompiler/ir_opt/passes.h" | ||
| 13 | |||
| 14 | namespace Shader::Optimization { | ||
| 15 | namespace { | ||
| 16 | std::pair<IR::U32, IR::U32> Unpack(IR::IREmitter& ir, const IR::Value& packed) { | ||
| 17 | if (packed.IsImmediate()) { | ||
| 18 | const u64 value{packed.U64()}; | ||
| 19 | return { | ||
| 20 | ir.Imm32(static_cast<u32>(value)), | ||
| 21 | ir.Imm32(static_cast<u32>(value >> 32)), | ||
| 22 | }; | ||
| 23 | } else { | ||
| 24 | return std::pair<IR::U32, IR::U32>{ | ||
| 25 | ir.CompositeExtract(packed, 0u), | ||
| 26 | ir.CompositeExtract(packed, 1u), | ||
| 27 | }; | ||
| 28 | } | ||
| 29 | } | ||
| 30 | |||
| 31 | void IAdd64To32(IR::Block& block, IR::Inst& inst) { | ||
| 32 | if (inst.HasAssociatedPseudoOperation()) { | ||
| 33 | throw NotImplementedException("IAdd64 emulation with pseudo instructions"); | ||
| 34 | } | ||
| 35 | IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); | ||
| 36 | const auto [a_lo, a_hi]{Unpack(ir, inst.Arg(0))}; | ||
| 37 | const auto [b_lo, b_hi]{Unpack(ir, inst.Arg(1))}; | ||
| 38 | |||
| 39 | const IR::U32 ret_lo{ir.IAdd(a_lo, b_lo)}; | ||
| 40 | const IR::U32 carry{ir.Select(ir.GetCarryFromOp(ret_lo), ir.Imm32(1u), ir.Imm32(0u))}; | ||
| 41 | |||
| 42 | const IR::U32 ret_hi{ir.IAdd(ir.IAdd(a_hi, b_hi), carry)}; | ||
| 43 | inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi)); | ||
| 44 | } | ||
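IAdd64To32 is the textbook wide add: sum the low words, derive a carry, and fold the carry into the high-word sum. The IR reads the carry flag from the add; the standalone mirror below detects it by unsigned wraparound instead.

```cpp
#include <cassert>
#include <cstdint>

int main() {
    const std::uint64_t a = 0x0000'0001'FFFF'FFF0ULL;
    const std::uint64_t b = 0x0000'0000'0000'0020ULL;

    const std::uint32_t a_lo = static_cast<std::uint32_t>(a);
    const std::uint32_t a_hi = static_cast<std::uint32_t>(a >> 32);
    const std::uint32_t b_lo = static_cast<std::uint32_t>(b);
    const std::uint32_t b_hi = static_cast<std::uint32_t>(b >> 32);

    const std::uint32_t ret_lo = a_lo + b_lo;
    const std::uint32_t carry = ret_lo < a_lo ? 1u : 0u; // unsigned wraparound
    const std::uint32_t ret_hi = a_hi + b_hi + carry;

    assert(((static_cast<std::uint64_t>(ret_hi) << 32) | ret_lo) == a + b);
}
```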
| 45 | |||
| 46 | void ISub64To32(IR::Block& block, IR::Inst& inst) { | ||
| 47 | if (inst.HasAssociatedPseudoOperation()) { | ||
| 48 | throw NotImplementedException("ISub64 emulation with pseudo instructions"); | ||
| 49 | } | ||
| 50 | IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); | ||
| 51 | const auto [a_lo, a_hi]{Unpack(ir, inst.Arg(0))}; | ||
| 52 | const auto [b_lo, b_hi]{Unpack(ir, inst.Arg(1))}; | ||
| 53 | |||
| 54 | const IR::U32 ret_lo{ir.ISub(a_lo, b_lo)}; | ||
| 55 | const IR::U1 underflow{ir.IGreaterThan(ret_lo, a_lo, false)}; | ||
| 56 | const IR::U32 underflow_bit{ir.Select(underflow, ir.Imm32(1u), ir.Imm32(0u))}; | ||
| 57 | |||
| 58 | const IR::U32 ret_hi{ir.ISub(ir.ISub(a_hi, b_hi), underflow_bit)}; | ||
| 59 | inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi)); | ||
| 60 | } | ||
| 61 | |||
| 62 | void INeg64To32(IR::Block& block, IR::Inst& inst) { | ||
| 63 | if (inst.HasAssociatedPseudoOperation()) { | ||
| 64 | throw NotImplementedException("INeg64 emulation with pseudo instructions"); | ||
| 65 | } | ||
| 66 | IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); | ||
| 67 | auto [lo, hi]{Unpack(ir, inst.Arg(0))}; | ||
| 68 | lo = ir.BitwiseNot(lo); | ||
| 69 | hi = ir.BitwiseNot(hi); | ||
| 70 | |||
| 71 | lo = ir.IAdd(lo, ir.Imm32(1)); | ||
| 72 | |||
| 73 | const IR::U32 carry{ir.Select(ir.GetCarryFromOp(lo), ir.Imm32(1u), ir.Imm32(0u))}; | ||
| 74 | hi = ir.IAdd(hi, carry); | ||
| 75 | |||
| 76 | inst.ReplaceUsesWith(ir.CompositeConstruct(lo, hi)); | ||
| 77 | } | ||
| 78 | |||
| 79 | void ShiftLeftLogical64To32(IR::Block& block, IR::Inst& inst) { | ||
| 80 | if (inst.HasAssociatedPseudoOperation()) { | ||
| 81 | throw NotImplementedException("ShiftLeftLogical64 emulation with pseudo instructions"); | ||
| 82 | } | ||
| 83 | IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); | ||
| 84 | const auto [lo, hi]{Unpack(ir, inst.Arg(0))}; | ||
| 85 | const IR::U32 shift{inst.Arg(1)}; | ||
| 86 | |||
| 87 | const IR::U32 shifted_lo{ir.ShiftLeftLogical(lo, shift)}; | ||
| 88 | const IR::U32 shifted_hi{ir.ShiftLeftLogical(hi, shift)}; | ||
| 89 | |||
| 90 | const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))}; | ||
| 91 | const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), true)}; | ||
| 92 | const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))}; | ||
| 93 | |||
| 94 | const IR::U32 long_ret_lo{ir.Imm32(0)}; | ||
| 95 | const IR::U32 long_ret_hi{ir.ShiftLeftLogical(lo, inv_shift)}; | ||
| 96 | |||
| 97 | const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)}; | ||
| 98 | const IR::U32 lo_extract{ir.BitFieldExtract(lo, shift_complement, shift, false)}; | ||
| 99 | const IR::U32 short_ret_lo{shifted_lo}; | ||
| 100 | const IR::U32 short_ret_hi{ir.BitwiseOr(shifted_hi, lo_extract)}; | ||
| 101 | |||
| 102 | const IR::U32 zero_ret_lo{lo}; | ||
| 103 | const IR::U32 zero_ret_hi{hi}; | ||
| 104 | |||
| 105 | const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)}; | ||
| 106 | const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)}; | ||
| 107 | |||
| 108 | const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)}; | ||
| 109 | const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)}; | ||
| 110 | inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi)); | ||
| 111 | } | ||
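The three-way select exists because 32-bit shifts cannot express every case directly: a zero shift must pass the value through (otherwise the short path would have to shift or extract a full 32 bits, which 32-bit operations do not define), shifts below 32 merge the spilled high bits of `lo` into `hi`, and shifts of 32 or more move `lo` into `hi` outright. The same split in plain integers (standalone check, assuming `0 <= shift < 64`):

```cpp
#include <cassert>
#include <cstdint>

// Mirrors ShiftLeftLogical64To32 above, assuming 0 <= shift < 64.
std::uint64_t Shl64Via32(std::uint32_t lo, std::uint32_t hi, std::uint32_t shift) {
    std::uint32_t ret_lo, ret_hi;
    if (shift == 0) {            // shifting by 32 below would be undefined
        ret_lo = lo;
        ret_hi = hi;
    } else if (shift >= 32) {    // the "long" path
        ret_lo = 0;
        ret_hi = lo << (shift - 32);
    } else {                     // the "short" path
        ret_lo = lo << shift;
        ret_hi = (hi << shift) | (lo >> (32 - shift));
    }
    return (static_cast<std::uint64_t>(ret_hi) << 32) | ret_lo;
}

int main() {
    const std::uint64_t v = 0x1234'5678'9ABC'DEF0ULL;
    const std::uint32_t lo = static_cast<std::uint32_t>(v);
    const std::uint32_t hi = static_cast<std::uint32_t>(v >> 32);
    for (std::uint32_t s : {0u, 1u, 31u, 32u, 33u, 63u}) {
        assert(Shl64Via32(lo, hi, s) == v << s);
    }
}
```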
| 112 | |||
| 113 | void ShiftRightLogical64To32(IR::Block& block, IR::Inst& inst) { | ||
| 114 | if (inst.HasAssociatedPseudoOperation()) { | ||
| 115 | throw NotImplementedException("ShiftRightLogical64 emulation with pseudo instructions"); | ||
| 116 | } | ||
| 117 | IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); | ||
| 118 | const auto [lo, hi]{Unpack(ir, inst.Arg(0))}; | ||
| 119 | const IR::U32 shift{inst.Arg(1)}; | ||
| 120 | |||
| 121 | const IR::U32 shifted_lo{ir.ShiftRightLogical(lo, shift)}; | ||
| 122 | const IR::U32 shifted_hi{ir.ShiftRightLogical(hi, shift)}; | ||
| 123 | |||
| 124 | const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))}; | ||
| 125 | const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), true)}; | ||
| 126 | const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))}; | ||
| 127 | |||
| 128 | const IR::U32 long_ret_hi{ir.Imm32(0)}; | ||
| 129 | const IR::U32 long_ret_lo{ir.ShiftRightLogical(hi, inv_shift)}; | ||
| 130 | |||
| 131 | const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)}; | ||
| 132 | const IR::U32 short_hi_extract{ir.BitFieldExtract(hi, ir.Imm32(0), shift)}; | ||
| 133 | const IR::U32 short_ret_hi{shifted_hi}; | ||
| 134 | const IR::U32 short_ret_lo{ | ||
| 135 | ir.BitFieldInsert(shifted_lo, short_hi_extract, shift_complement, shift)}; | ||
| 136 | |||
| 137 | const IR::U32 zero_ret_lo{lo}; | ||
| 138 | const IR::U32 zero_ret_hi{hi}; | ||
| 139 | |||
| 140 | const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)}; | ||
| 141 | const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)}; | ||
| 142 | |||
| 143 | const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)}; | ||
| 144 | const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)}; | ||
| 145 | inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi)); | ||
| 146 | } | ||
| 147 | |||
| 148 | void ShiftRightArithmetic64To32(IR::Block& block, IR::Inst& inst) { | ||
| 149 | if (inst.HasAssociatedPseudoOperation()) { | ||
| 150 | throw NotImplementedException("ShiftRightArithmetic64 emulation with pseudo instructions"); | ||
| 151 | } | ||
| 152 | IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); | ||
| 153 | const auto [lo, hi]{Unpack(ir, inst.Arg(0))}; | ||
| 154 | const IR::U32 shift{inst.Arg(1)}; | ||
| 155 | |||
| 156 | const IR::U32 shifted_lo{ir.ShiftRightLogical(lo, shift)}; | ||
| 157 | const IR::U32 shifted_hi{ir.ShiftRightArithmetic(hi, shift)}; | ||
| 158 | |||
| 159 | const IR::U32 sign_extension{ir.ShiftRightArithmetic(hi, ir.Imm32(31))}; | ||
| 160 | |||
| 161 | const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))}; | ||
| 162 | const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), /*is_signed=*/true)}; | ||
| 163 | const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))}; | ||
| 164 | |||
| 165 | const IR::U32 long_ret_hi{sign_extension}; | ||
| 166 | const IR::U32 long_ret_lo{ir.ShiftRightArithmetic(hi, inv_shift)}; | ||
| 167 | |||
| 168 | const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)}; | ||
| 169 | const IR::U32 short_hi_extract{ir.BitFieldExtract(hi, ir.Imm32(0), shift)}; | ||
| 170 | const IR::U32 short_ret_hi{shifted_hi}; | ||
| 171 | const IR::U32 short_ret_lo{ | ||
| 172 | ir.BitFieldInsert(shifted_lo, short_hi_extract, shift_complement, shift)}; | ||
| 173 | |||
| 174 | const IR::U32 zero_ret_lo{lo}; | ||
| 175 | const IR::U32 zero_ret_hi{hi}; | ||
| 176 | |||
| 177 | const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)}; | ||
| 178 | const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)}; | ||
| 179 | |||
| 180 | const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)}; | ||
| 181 | const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)}; | ||
| 182 | inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi)); | ||
| 183 | } | ||
| 184 | |||
| 185 | void Lower(IR::Block& block, IR::Inst& inst) { | ||
| 186 | switch (inst.GetOpcode()) { | ||
| 187 | case IR::Opcode::PackUint2x32: | ||
| 188 | case IR::Opcode::UnpackUint2x32: | ||
| 189 | return inst.ReplaceOpcode(IR::Opcode::Identity); | ||
| 190 | case IR::Opcode::IAdd64: | ||
| 191 | return IAdd64To32(block, inst); | ||
| 192 | case IR::Opcode::ISub64: | ||
| 193 | return ISub64To32(block, inst); | ||
| 194 | case IR::Opcode::INeg64: | ||
| 195 | return INeg64To32(block, inst); | ||
| 196 | case IR::Opcode::ShiftLeftLogical64: | ||
| 197 | return ShiftLeftLogical64To32(block, inst); | ||
| 198 | case IR::Opcode::ShiftRightLogical64: | ||
| 199 | return ShiftRightLogical64To32(block, inst); | ||
| 200 | case IR::Opcode::ShiftRightArithmetic64: | ||
| 201 | return ShiftRightArithmetic64To32(block, inst); | ||
| 202 | default: | ||
| 203 | break; | ||
| 204 | } | ||
| 205 | } | ||
| 206 | } // Anonymous namespace | ||
| 207 | |||
| 208 | void LowerInt64ToInt32(IR::Program& program) { | ||
| 209 | const auto end{program.post_order_blocks.rend()}; | ||
| 210 | for (auto it = program.post_order_blocks.rbegin(); it != end; ++it) { | ||
| 211 | IR::Block* const block{*it}; | ||
| 212 | for (IR::Inst& inst : block->Instructions()) { | ||
| 213 | Lower(*block, inst); | ||
| 214 | } | ||
| 215 | } | ||
| 216 | } | ||
| 217 | |||
| 218 | } // namespace Shader::Optimization | ||
diff --git a/src/shader_recompiler/ir_opt/passes.h b/src/shader_recompiler/ir_opt/passes.h new file mode 100644 index 000000000..2f89b1ea0 --- /dev/null +++ b/src/shader_recompiler/ir_opt/passes.h | |||
| @@ -0,0 +1,32 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <span> | ||
| 8 | |||
| 9 | #include "shader_recompiler/environment.h" | ||
| 10 | #include "shader_recompiler/frontend/ir/basic_block.h" | ||
| 11 | #include "shader_recompiler/frontend/ir/program.h" | ||
| 12 | |||
| 13 | namespace Shader::Optimization { | ||
| 14 | |||
| 15 | void CollectShaderInfoPass(Environment& env, IR::Program& program); | ||
| 16 | void ConstantPropagationPass(IR::Program& program); | ||
| 17 | void DeadCodeEliminationPass(IR::Program& program); | ||
| 18 | void GlobalMemoryToStorageBufferPass(IR::Program& program); | ||
| 19 | void IdentityRemovalPass(IR::Program& program); | ||
| 20 | void LowerFp16ToFp32(IR::Program& program); | ||
| 21 | void LowerInt64ToInt32(IR::Program& program); | ||
| 22 | void SsaRewritePass(IR::Program& program); | ||
| 23 | void TexturePass(Environment& env, IR::Program& program); | ||
| 24 | void VerificationPass(const IR::Program& program); | ||
| 25 | |||
| 26 | // Dual Vertex | ||
| 27 | void VertexATransformPass(IR::Program& program); | ||
| 28 | void VertexBTransformPass(IR::Program& program); | ||
| 29 | void JoinTextureInfo(Info& base, Info& source); | ||
| 30 | void JoinStorageInfo(Info& base, Info& source); | ||
| 31 | |||
| 32 | } // namespace Shader::Optimization | ||
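This header only declares the entry points; the order in which the frontend strings them together matters (SSA construction first, verification near the end). One plausible ordering as a sketch (the authoritative sequence lives in the frontend's translation code, not in this header):

    #include "shader_recompiler/ir_opt/passes.h"

    // Illustrative ordering only; see the frontend for the real pipeline.
    void RunPasses(Shader::Environment& env, Shader::IR::Program& program) {
        using namespace Shader::Optimization;
        SsaRewritePass(program);                  // leave Get/Set form first
        GlobalMemoryToStorageBufferPass(program); // relies on SSA value tracking
        TexturePass(env, program);                // resolve texture handles
        ConstantPropagationPass(program);
        DeadCodeEliminationPass(program);
        IdentityRemovalPass(program);             // strip Identity chains
        VerificationPass(program);                // sanity-check the result
        CollectShaderInfoPass(env, program);
    }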
diff --git a/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp b/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp new file mode 100644 index 000000000..53145fb5e --- /dev/null +++ b/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp | |||
| @@ -0,0 +1,383 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | // This file implements the SSA rewriting algorithm proposed in | ||
| 6 | // | ||
| 7 | // Simple and Efficient Construction of Static Single Assignment Form. | ||
| 8 | // Braun M., Buchwald S., Hack S., Leißa R., Mallon C., Zwinkau A. (2013) | ||
| 9 | // In: Jhala R., De Bosschere K. (eds) | ||
| 10 | // Compiler Construction. CC 2013. | ||
| 11 | // Lecture Notes in Computer Science, vol 7791. | ||
| 12 | // Springer, Berlin, Heidelberg | ||
| 13 | // | ||
| 14 | // https://link.springer.com/chapter/10.1007/978-3-642-37051-9_6 | ||
| 15 | // | ||
| 16 | |||
| 17 | #include <span> | ||
| 18 | #include <variant> | ||
| 19 | #include <vector> | ||
| 20 | |||
| 21 | #include <boost/container/flat_map.hpp> | ||
| 22 | #include <boost/container/flat_set.hpp> | ||
| 23 | |||
| 24 | #include "shader_recompiler/frontend/ir/basic_block.h" | ||
| 25 | #include "shader_recompiler/frontend/ir/opcodes.h" | ||
| 26 | #include "shader_recompiler/frontend/ir/pred.h" | ||
| 27 | #include "shader_recompiler/frontend/ir/reg.h" | ||
| 28 | #include "shader_recompiler/frontend/ir/value.h" | ||
| 29 | #include "shader_recompiler/ir_opt/passes.h" | ||
| 30 | |||
| 31 | namespace Shader::Optimization { | ||
| 32 | namespace { | ||
| 33 | struct FlagTag { | ||
| 34 | auto operator<=>(const FlagTag&) const noexcept = default; | ||
| 35 | }; | ||
| 36 | struct ZeroFlagTag : FlagTag {}; | ||
| 37 | struct SignFlagTag : FlagTag {}; | ||
| 38 | struct CarryFlagTag : FlagTag {}; | ||
| 39 | struct OverflowFlagTag : FlagTag {}; | ||
| 40 | |||
| 41 | struct GotoVariable : FlagTag { | ||
| 42 | GotoVariable() = default; | ||
| 43 | explicit GotoVariable(u32 index_) : index{index_} {} | ||
| 44 | |||
| 45 | auto operator<=>(const GotoVariable&) const noexcept = default; | ||
| 46 | |||
| 47 | u32 index; | ||
| 48 | }; | ||
| 49 | |||
| 50 | struct IndirectBranchVariable { | ||
| 51 | auto operator<=>(const IndirectBranchVariable&) const noexcept = default; | ||
| 52 | }; | ||
| 53 | |||
| 54 | using Variant = std::variant<IR::Reg, IR::Pred, ZeroFlagTag, SignFlagTag, CarryFlagTag, | ||
| 55 | OverflowFlagTag, GotoVariable, IndirectBranchVariable>; | ||
| 56 | using ValueMap = boost::container::flat_map<IR::Block*, IR::Value>; | ||
| 57 | |||
| 58 | struct DefTable { | ||
| 59 | const IR::Value& Def(IR::Block* block, IR::Reg variable) { | ||
| 60 | return block->SsaRegValue(variable); | ||
| 61 | } | ||
| 62 | void SetDef(IR::Block* block, IR::Reg variable, const IR::Value& value) { | ||
| 63 | block->SetSsaRegValue(variable, value); | ||
| 64 | } | ||
| 65 | |||
| 66 | const IR::Value& Def(IR::Block* block, IR::Pred variable) { | ||
| 67 | return preds[IR::PredIndex(variable)][block]; | ||
| 68 | } | ||
| 69 | void SetDef(IR::Block* block, IR::Pred variable, const IR::Value& value) { | ||
| 70 | preds[IR::PredIndex(variable)].insert_or_assign(block, value); | ||
| 71 | } | ||
| 72 | |||
| 73 | const IR::Value& Def(IR::Block* block, GotoVariable variable) { | ||
| 74 | return goto_vars[variable.index][block]; | ||
| 75 | } | ||
| 76 | void SetDef(IR::Block* block, GotoVariable variable, const IR::Value& value) { | ||
| 77 | goto_vars[variable.index].insert_or_assign(block, value); | ||
| 78 | } | ||
| 79 | |||
| 80 | const IR::Value& Def(IR::Block* block, IndirectBranchVariable) { | ||
| 81 | return indirect_branch_var[block]; | ||
| 82 | } | ||
| 83 | void SetDef(IR::Block* block, IndirectBranchVariable, const IR::Value& value) { | ||
| 84 | indirect_branch_var.insert_or_assign(block, value); | ||
| 85 | } | ||
| 86 | |||
| 87 | const IR::Value& Def(IR::Block* block, ZeroFlagTag) { | ||
| 88 | return zero_flag[block]; | ||
| 89 | } | ||
| 90 | void SetDef(IR::Block* block, ZeroFlagTag, const IR::Value& value) { | ||
| 91 | zero_flag.insert_or_assign(block, value); | ||
| 92 | } | ||
| 93 | |||
| 94 | const IR::Value& Def(IR::Block* block, SignFlagTag) { | ||
| 95 | return sign_flag[block]; | ||
| 96 | } | ||
| 97 | void SetDef(IR::Block* block, SignFlagTag, const IR::Value& value) { | ||
| 98 | sign_flag.insert_or_assign(block, value); | ||
| 99 | } | ||
| 100 | |||
| 101 | const IR::Value& Def(IR::Block* block, CarryFlagTag) { | ||
| 102 | return carry_flag[block]; | ||
| 103 | } | ||
| 104 | void SetDef(IR::Block* block, CarryFlagTag, const IR::Value& value) { | ||
| 105 | carry_flag.insert_or_assign(block, value); | ||
| 106 | } | ||
| 107 | |||
| 108 | const IR::Value& Def(IR::Block* block, OverflowFlagTag) { | ||
| 109 | return overflow_flag[block]; | ||
| 110 | } | ||
| 111 | void SetDef(IR::Block* block, OverflowFlagTag, const IR::Value& value) { | ||
| 112 | overflow_flag.insert_or_assign(block, value); | ||
| 113 | } | ||
| 114 | |||
| 115 | std::array<ValueMap, IR::NUM_USER_PREDS> preds; | ||
| 116 | boost::container::flat_map<u32, ValueMap> goto_vars; | ||
| 117 | ValueMap indirect_branch_var; | ||
| 118 | ValueMap zero_flag; | ||
| 119 | ValueMap sign_flag; | ||
| 120 | ValueMap carry_flag; | ||
| 121 | ValueMap overflow_flag; | ||
| 122 | }; | ||
| 123 | |||
| 124 | IR::Opcode UndefOpcode(IR::Reg) noexcept { | ||
| 125 | return IR::Opcode::UndefU32; | ||
| 126 | } | ||
| 127 | |||
| 128 | IR::Opcode UndefOpcode(IR::Pred) noexcept { | ||
| 129 | return IR::Opcode::UndefU1; | ||
| 130 | } | ||
| 131 | |||
| 132 | IR::Opcode UndefOpcode(const FlagTag&) noexcept { | ||
| 133 | return IR::Opcode::UndefU1; | ||
| 134 | } | ||
| 135 | |||
| 136 | IR::Opcode UndefOpcode(IndirectBranchVariable) noexcept { | ||
| 137 | return IR::Opcode::UndefU32; | ||
| 138 | } | ||
| 139 | |||
| 140 | enum class Status { | ||
| 141 | Start, | ||
| 142 | SetValue, | ||
| 143 | PreparePhiArgument, | ||
| 144 | PushPhiArgument, | ||
| 145 | }; | ||
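In terms of the algorithm below: Start looks up the block-local definition and either recurses into a lone predecessor or materializes a phi; SetValue writes the value produced by a single-predecessor read back into the current block and returns it to the parent frame; PushPhiArgument attaches the value just produced for one predecessor as a phi operand and advances to the next predecessor; PreparePhiArgument descends into that predecessor or, once all have been visited, attempts trivial-phi removal. Together the four states make the cited paper's recursion iterative with an explicit ReadState stack.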
| 146 | |||
| 147 | template <typename Type> | ||
| 148 | struct ReadState { | ||
| 149 | ReadState(IR::Block* block_) : block{block_} {} | ||
| 150 | ReadState() = default; | ||
| 151 | |||
| 152 | IR::Block* block{}; | ||
| 153 | IR::Value result{}; | ||
| 154 | IR::Inst* phi{}; | ||
| 155 | IR::Block* const* pred_it{}; | ||
| 156 | IR::Block* const* pred_end{}; | ||
| 157 | Status pc{Status::Start}; | ||
| 158 | }; | ||
| 159 | |||
| 160 | class Pass { | ||
| 161 | public: | ||
| 162 | template <typename Type> | ||
| 163 | void WriteVariable(Type variable, IR::Block* block, const IR::Value& value) { | ||
| 164 | current_def.SetDef(block, variable, value); | ||
| 165 | } | ||
| 166 | |||
| 167 | template <typename Type> | ||
| 168 | IR::Value ReadVariable(Type variable, IR::Block* root_block) { | ||
| 169 | boost::container::small_vector<ReadState<Type>, 64> stack{ | ||
| 170 | ReadState<Type>(nullptr), | ||
| 171 | ReadState<Type>(root_block), | ||
| 172 | }; | ||
| 173 | const auto prepare_phi_operand{[&] { | ||
| 174 | if (stack.back().pred_it == stack.back().pred_end) { | ||
| 175 | IR::Inst* const phi{stack.back().phi}; | ||
| 176 | IR::Block* const block{stack.back().block}; | ||
| 177 | const IR::Value result{TryRemoveTrivialPhi(*phi, block, UndefOpcode(variable))}; | ||
| 178 | stack.pop_back(); | ||
| 179 | stack.back().result = result; | ||
| 180 | WriteVariable(variable, block, result); | ||
| 181 | } else { | ||
| 182 | IR::Block* const imm_pred{*stack.back().pred_it}; | ||
| 183 | stack.back().pc = Status::PushPhiArgument; | ||
| 184 | stack.emplace_back(imm_pred); | ||
| 185 | } | ||
| 186 | }}; | ||
| 187 | do { | ||
| 188 | IR::Block* const block{stack.back().block}; | ||
| 189 | switch (stack.back().pc) { | ||
| 190 | case Status::Start: { | ||
| 191 | if (const IR::Value& def = current_def.Def(block, variable); !def.IsEmpty()) { | ||
| 192 | stack.back().result = def; | ||
| 193 | } else if (!block->IsSsaSealed()) { | ||
| 194 | // Incomplete CFG | ||
| 195 | IR::Inst* phi{&*block->PrependNewInst(block->begin(), IR::Opcode::Phi)}; | ||
| 196 | phi->SetFlags(IR::TypeOf(UndefOpcode(variable))); | ||
| 197 | |||
| 198 | incomplete_phis[block].insert_or_assign(variable, phi); | ||
| 199 | stack.back().result = IR::Value{&*phi}; | ||
| 200 | } else if (const std::span imm_preds = block->ImmPredecessors(); | ||
| 201 | imm_preds.size() == 1) { | ||
| 202 | // Optimize the common case of one predecessor: no phi needed | ||
| 203 | stack.back().pc = Status::SetValue; | ||
| 204 | stack.emplace_back(imm_preds.front()); | ||
| 205 | break; | ||
| 206 | } else { | ||
| 207 | // Break potential cycles with operandless phi | ||
| 208 | IR::Inst* const phi{&*block->PrependNewInst(block->begin(), IR::Opcode::Phi)}; | ||
| 209 | phi->SetFlags(IR::TypeOf(UndefOpcode(variable))); | ||
| 210 | |||
| 211 | WriteVariable(variable, block, IR::Value{phi}); | ||
| 212 | |||
| 213 | stack.back().phi = phi; | ||
| 214 | stack.back().pred_it = imm_preds.data(); | ||
| 215 | stack.back().pred_end = imm_preds.data() + imm_preds.size(); | ||
| 216 | prepare_phi_operand(); | ||
| 217 | break; | ||
| 218 | } | ||
| 219 | } | ||
| 220 | [[fallthrough]]; | ||
| 221 | case Status::SetValue: { | ||
| 222 | const IR::Value result{stack.back().result}; | ||
| 223 | WriteVariable(variable, block, result); | ||
| 224 | stack.pop_back(); | ||
| 225 | stack.back().result = result; | ||
| 226 | break; | ||
| 227 | } | ||
| 228 | case Status::PushPhiArgument: { | ||
| 229 | IR::Inst* const phi{stack.back().phi}; | ||
| 230 | phi->AddPhiOperand(*stack.back().pred_it, stack.back().result); | ||
| 231 | ++stack.back().pred_it; | ||
| 232 | } | ||
| 233 | [[fallthrough]]; | ||
| 234 | case Status::PreparePhiArgument: | ||
| 235 | prepare_phi_operand(); | ||
| 236 | break; | ||
| 237 | } | ||
| 238 | } while (stack.size() > 1); | ||
| 239 | return stack.back().result; | ||
| 240 | } | ||
| 241 | |||
| 242 | void SealBlock(IR::Block* block) { | ||
| 243 | const auto it{incomplete_phis.find(block)}; | ||
| 244 | if (it != incomplete_phis.end()) { | ||
| 245 | for (auto& pair : it->second) { | ||
| 246 | auto& variant{pair.first}; | ||
| 247 | auto& phi{pair.second}; | ||
| 248 | std::visit([&](auto& variable) { AddPhiOperands(variable, *phi, block); }, variant); | ||
| 249 | } | ||
| 250 | } | ||
| 251 | block->SsaSeal(); | ||
| 252 | } | ||
| 253 | |||
| 254 | private: | ||
| 255 | template <typename Type> | ||
| 256 | IR::Value AddPhiOperands(Type variable, IR::Inst& phi, IR::Block* block) { | ||
| 257 | for (IR::Block* const imm_pred : block->ImmPredecessors()) { | ||
| 258 | phi.AddPhiOperand(imm_pred, ReadVariable(variable, imm_pred)); | ||
| 259 | } | ||
| 260 | return TryRemoveTrivialPhi(phi, block, UndefOpcode(variable)); | ||
| 261 | } | ||
| 262 | |||
| 263 | IR::Value TryRemoveTrivialPhi(IR::Inst& phi, IR::Block* block, IR::Opcode undef_opcode) { | ||
| 264 | IR::Value same; | ||
| 265 | const size_t num_args{phi.NumArgs()}; | ||
| 266 | for (size_t arg_index = 0; arg_index < num_args; ++arg_index) { | ||
| 267 | const IR::Value& op{phi.Arg(arg_index)}; | ||
| 268 | if (op.Resolve() == same.Resolve() || op == IR::Value{&phi}) { | ||
| 269 | // Unique value or self-reference | ||
| 270 | continue; | ||
| 271 | } | ||
| 272 | if (!same.IsEmpty()) { | ||
| 273 | // The phi merges at least two values: not trivial | ||
| 274 | return IR::Value{&phi}; | ||
| 275 | } | ||
| 276 | same = op; | ||
| 277 | } | ||
| 278 | // Remove the phi node from the block; it will be reinserted | ||
| 279 | IR::Block::InstructionList& list{block->Instructions()}; | ||
| 280 | list.erase(IR::Block::InstructionList::s_iterator_to(phi)); | ||
| 281 | |||
| 282 | // Find the first non-phi instruction and use it as an insertion point | ||
| 283 | IR::Block::iterator reinsert_point{std::ranges::find_if_not(list, IR::IsPhi)}; | ||
| 284 | if (same.IsEmpty()) { | ||
| 285 | // The phi is unreachable or in the start block | ||
| 286 | // Insert an undefined instruction and make it the phi node replacement | ||
| 287 | // The "phi" node reinsertion point is specified after this instruction | ||
| 288 | reinsert_point = block->PrependNewInst(reinsert_point, undef_opcode); | ||
| 289 | same = IR::Value{&*reinsert_point}; | ||
| 290 | ++reinsert_point; | ||
| 291 | } | ||
| 292 | // Reinsert the phi node and reroute all its uses to the "same" value | ||
| 293 | list.insert(reinsert_point, phi); | ||
| 294 | phi.ReplaceUsesWith(same); | ||
| 295 | // TODO: Try to recursively remove all phi users, which might have become trivial | ||
| 296 | return same; | ||
| 297 | } | ||
| 298 | |||
| 299 | boost::container::flat_map<IR::Block*, boost::container::flat_map<Variant, IR::Inst*>> | ||
| 300 | incomplete_phis; | ||
| 301 | DefTable current_def; | ||
| 302 | }; | ||
| 303 | |||
| 304 | void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { | ||
| 305 | switch (inst.GetOpcode()) { | ||
| 306 | case IR::Opcode::SetRegister: | ||
| 307 | if (const IR::Reg reg{inst.Arg(0).Reg()}; reg != IR::Reg::RZ) { | ||
| 308 | pass.WriteVariable(reg, block, inst.Arg(1)); | ||
| 309 | } | ||
| 310 | break; | ||
| 311 | case IR::Opcode::SetPred: | ||
| 312 | if (const IR::Pred pred{inst.Arg(0).Pred()}; pred != IR::Pred::PT) { | ||
| 313 | pass.WriteVariable(pred, block, inst.Arg(1)); | ||
| 314 | } | ||
| 315 | break; | ||
| 316 | case IR::Opcode::SetGotoVariable: | ||
| 317 | pass.WriteVariable(GotoVariable{inst.Arg(0).U32()}, block, inst.Arg(1)); | ||
| 318 | break; | ||
| 319 | case IR::Opcode::SetIndirectBranchVariable: | ||
| 320 | pass.WriteVariable(IndirectBranchVariable{}, block, inst.Arg(0)); | ||
| 321 | break; | ||
| 322 | case IR::Opcode::SetZFlag: | ||
| 323 | pass.WriteVariable(ZeroFlagTag{}, block, inst.Arg(0)); | ||
| 324 | break; | ||
| 325 | case IR::Opcode::SetSFlag: | ||
| 326 | pass.WriteVariable(SignFlagTag{}, block, inst.Arg(0)); | ||
| 327 | break; | ||
| 328 | case IR::Opcode::SetCFlag: | ||
| 329 | pass.WriteVariable(CarryFlagTag{}, block, inst.Arg(0)); | ||
| 330 | break; | ||
| 331 | case IR::Opcode::SetOFlag: | ||
| 332 | pass.WriteVariable(OverflowFlagTag{}, block, inst.Arg(0)); | ||
| 333 | break; | ||
| 334 | case IR::Opcode::GetRegister: | ||
| 335 | if (const IR::Reg reg{inst.Arg(0).Reg()}; reg != IR::Reg::RZ) { | ||
| 336 | inst.ReplaceUsesWith(pass.ReadVariable(reg, block)); | ||
| 337 | } | ||
| 338 | break; | ||
| 339 | case IR::Opcode::GetPred: | ||
| 340 | if (const IR::Pred pred{inst.Arg(0).Pred()}; pred != IR::Pred::PT) { | ||
| 341 | inst.ReplaceUsesWith(pass.ReadVariable(pred, block)); | ||
| 342 | } | ||
| 343 | break; | ||
| 344 | case IR::Opcode::GetGotoVariable: | ||
| 345 | inst.ReplaceUsesWith(pass.ReadVariable(GotoVariable{inst.Arg(0).U32()}, block)); | ||
| 346 | break; | ||
| 347 | case IR::Opcode::GetIndirectBranchVariable: | ||
| 348 | inst.ReplaceUsesWith(pass.ReadVariable(IndirectBranchVariable{}, block)); | ||
| 349 | break; | ||
| 350 | case IR::Opcode::GetZFlag: | ||
| 351 | inst.ReplaceUsesWith(pass.ReadVariable(ZeroFlagTag{}, block)); | ||
| 352 | break; | ||
| 353 | case IR::Opcode::GetSFlag: | ||
| 354 | inst.ReplaceUsesWith(pass.ReadVariable(SignFlagTag{}, block)); | ||
| 355 | break; | ||
| 356 | case IR::Opcode::GetCFlag: | ||
| 357 | inst.ReplaceUsesWith(pass.ReadVariable(CarryFlagTag{}, block)); | ||
| 358 | break; | ||
| 359 | case IR::Opcode::GetOFlag: | ||
| 360 | inst.ReplaceUsesWith(pass.ReadVariable(OverflowFlagTag{}, block)); | ||
| 361 | break; | ||
| 362 | default: | ||
| 363 | break; | ||
| 364 | } | ||
| 365 | } | ||
| 366 | |||
| 367 | void VisitBlock(Pass& pass, IR::Block* block) { | ||
| 368 | for (IR::Inst& inst : block->Instructions()) { | ||
| 369 | VisitInst(pass, block, inst); | ||
| 370 | } | ||
| 371 | pass.SealBlock(block); | ||
| 372 | } | ||
| 373 | } // Anonymous namespace | ||
| 374 | |||
| 375 | void SsaRewritePass(IR::Program& program) { | ||
| 376 | Pass pass; | ||
| 377 | const auto end{program.post_order_blocks.rend()}; | ||
| 378 | for (auto block = program.post_order_blocks.rbegin(); block != end; ++block) { | ||
| 379 | VisitBlock(pass, *block); | ||
| 380 | } | ||
| 381 | } | ||
| 382 | |||
| 383 | } // namespace Shader::Optimization | ||
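For reference, the recursive formulation from Braun et al. that the ReadVariable state machine above flattens into an explicit stack, as a self-contained toy model over a minimal CFG (an illustration, not yuzu code; rerouting of phi uses and the sealing of incomplete CFGs are omitted for brevity):

    #include <map>
    #include <memory>
    #include <string>
    #include <vector>

    struct Block;

    struct Value {
        std::string name;             // "undef", "phi", or a concrete definition
        Block* block = nullptr;       // owning block for phis
        std::vector<Value*> operands; // phi operands, one per predecessor
    };

    struct Block {
        std::vector<Block*> preds;
        std::map<std::string, Value*> defs; // variable -> current definition
    };

    static std::vector<std::unique_ptr<Value>> pool;

    static Value* NewValue(std::string name, Block* block = nullptr) {
        pool.push_back(std::make_unique<Value>(Value{std::move(name), block, {}}));
        return pool.back().get();
    }

    static Value* ReadVariable(const std::string& var, Block* block);

    static Value* TryRemoveTrivialPhi(Value* phi) {
        Value* same = nullptr;
        for (Value* op : phi->operands) {
            if (op == same || op == phi) {
                continue; // duplicate operand or self-reference
            }
            if (same != nullptr) {
                return phi; // merges two or more distinct values: not trivial
            }
            same = op;
        }
        // Unreachable or start block: substitute an undefined value
        return same != nullptr ? same : NewValue("undef");
    }

    static Value* AddPhiOperands(const std::string& var, Value* phi) {
        for (Block* pred : phi->block->preds) {
            phi->operands.push_back(ReadVariable(var, pred));
        }
        return TryRemoveTrivialPhi(phi);
    }

    static Value* ReadVariable(const std::string& var, Block* block) {
        if (const auto it = block->defs.find(var); it != block->defs.end()) {
            return it->second; // local value numbering
        }
        Value* val;
        if (block->preds.size() == 1) {
            val = ReadVariable(var, block->preds.front()); // no phi needed
        } else {
            val = NewValue("phi", block); // break potential cycles
            block->defs[var] = val;
            val = AddPhiOperands(var, val);
        }
        block->defs[var] = val;
        return val;
    }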
diff --git a/src/shader_recompiler/ir_opt/texture_pass.cpp b/src/shader_recompiler/ir_opt/texture_pass.cpp new file mode 100644 index 000000000..44ad10d43 --- /dev/null +++ b/src/shader_recompiler/ir_opt/texture_pass.cpp | |||
| @@ -0,0 +1,523 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <bit> | ||
| 7 | #include <optional> | ||
| 8 | |||
| 9 | #include <boost/container/small_vector.hpp> | ||
| 10 | |||
| 11 | #include "shader_recompiler/environment.h" | ||
| 12 | #include "shader_recompiler/frontend/ir/basic_block.h" | ||
| 13 | #include "shader_recompiler/frontend/ir/breadth_first_search.h" | ||
| 14 | #include "shader_recompiler/frontend/ir/ir_emitter.h" | ||
| 15 | #include "shader_recompiler/ir_opt/passes.h" | ||
| 16 | #include "shader_recompiler/shader_info.h" | ||
| 17 | |||
| 18 | namespace Shader::Optimization { | ||
| 19 | namespace { | ||
| 20 | struct ConstBufferAddr { | ||
| 21 | u32 index; | ||
| 22 | u32 offset; | ||
| 23 | u32 secondary_index; | ||
| 24 | u32 secondary_offset; | ||
| 25 | IR::U32 dynamic_offset; | ||
| 26 | u32 count; | ||
| 27 | bool has_secondary; | ||
| 28 | }; | ||
| 29 | |||
| 30 | struct TextureInst { | ||
| 31 | ConstBufferAddr cbuf; | ||
| 32 | IR::Inst* inst; | ||
| 33 | IR::Block* block; | ||
| 34 | }; | ||
| 35 | |||
| 36 | using TextureInstVector = boost::container::small_vector<TextureInst, 24>; | ||
| 37 | |||
| 38 | constexpr u32 DESCRIPTOR_SIZE = 8; | ||
| 39 | constexpr u32 DESCRIPTOR_SIZE_SHIFT = static_cast<u32>(std::countr_zero(DESCRIPTOR_SIZE)); | ||
| 40 | |||
| 41 | IR::Opcode IndexedInstruction(const IR::Inst& inst) { | ||
| 42 | switch (inst.GetOpcode()) { | ||
| 43 | case IR::Opcode::BindlessImageSampleImplicitLod: | ||
| 44 | case IR::Opcode::BoundImageSampleImplicitLod: | ||
| 45 | return IR::Opcode::ImageSampleImplicitLod; | ||
| 46 | case IR::Opcode::BoundImageSampleExplicitLod: | ||
| 47 | case IR::Opcode::BindlessImageSampleExplicitLod: | ||
| 48 | return IR::Opcode::ImageSampleExplicitLod; | ||
| 49 | case IR::Opcode::BoundImageSampleDrefImplicitLod: | ||
| 50 | case IR::Opcode::BindlessImageSampleDrefImplicitLod: | ||
| 51 | return IR::Opcode::ImageSampleDrefImplicitLod; | ||
| 52 | case IR::Opcode::BoundImageSampleDrefExplicitLod: | ||
| 53 | case IR::Opcode::BindlessImageSampleDrefExplicitLod: | ||
| 54 | return IR::Opcode::ImageSampleDrefExplicitLod; | ||
| 55 | case IR::Opcode::BindlessImageGather: | ||
| 56 | case IR::Opcode::BoundImageGather: | ||
| 57 | return IR::Opcode::ImageGather; | ||
| 58 | case IR::Opcode::BindlessImageGatherDref: | ||
| 59 | case IR::Opcode::BoundImageGatherDref: | ||
| 60 | return IR::Opcode::ImageGatherDref; | ||
| 61 | case IR::Opcode::BindlessImageFetch: | ||
| 62 | case IR::Opcode::BoundImageFetch: | ||
| 63 | return IR::Opcode::ImageFetch; | ||
| 64 | case IR::Opcode::BoundImageQueryDimensions: | ||
| 65 | case IR::Opcode::BindlessImageQueryDimensions: | ||
| 66 | return IR::Opcode::ImageQueryDimensions; | ||
| 67 | case IR::Opcode::BoundImageQueryLod: | ||
| 68 | case IR::Opcode::BindlessImageQueryLod: | ||
| 69 | return IR::Opcode::ImageQueryLod; | ||
| 70 | case IR::Opcode::BoundImageGradient: | ||
| 71 | case IR::Opcode::BindlessImageGradient: | ||
| 72 | return IR::Opcode::ImageGradient; | ||
| 73 | case IR::Opcode::BoundImageRead: | ||
| 74 | case IR::Opcode::BindlessImageRead: | ||
| 75 | return IR::Opcode::ImageRead; | ||
| 76 | case IR::Opcode::BoundImageWrite: | ||
| 77 | case IR::Opcode::BindlessImageWrite: | ||
| 78 | return IR::Opcode::ImageWrite; | ||
| 79 | case IR::Opcode::BoundImageAtomicIAdd32: | ||
| 80 | case IR::Opcode::BindlessImageAtomicIAdd32: | ||
| 81 | return IR::Opcode::ImageAtomicIAdd32; | ||
| 82 | case IR::Opcode::BoundImageAtomicSMin32: | ||
| 83 | case IR::Opcode::BindlessImageAtomicSMin32: | ||
| 84 | return IR::Opcode::ImageAtomicSMin32; | ||
| 85 | case IR::Opcode::BoundImageAtomicUMin32: | ||
| 86 | case IR::Opcode::BindlessImageAtomicUMin32: | ||
| 87 | return IR::Opcode::ImageAtomicUMin32; | ||
| 88 | case IR::Opcode::BoundImageAtomicSMax32: | ||
| 89 | case IR::Opcode::BindlessImageAtomicSMax32: | ||
| 90 | return IR::Opcode::ImageAtomicSMax32; | ||
| 91 | case IR::Opcode::BoundImageAtomicUMax32: | ||
| 92 | case IR::Opcode::BindlessImageAtomicUMax32: | ||
| 93 | return IR::Opcode::ImageAtomicUMax32; | ||
| 94 | case IR::Opcode::BoundImageAtomicInc32: | ||
| 95 | case IR::Opcode::BindlessImageAtomicInc32: | ||
| 96 | return IR::Opcode::ImageAtomicInc32; | ||
| 97 | case IR::Opcode::BoundImageAtomicDec32: | ||
| 98 | case IR::Opcode::BindlessImageAtomicDec32: | ||
| 99 | return IR::Opcode::ImageAtomicDec32; | ||
| 100 | case IR::Opcode::BoundImageAtomicAnd32: | ||
| 101 | case IR::Opcode::BindlessImageAtomicAnd32: | ||
| 102 | return IR::Opcode::ImageAtomicAnd32; | ||
| 103 | case IR::Opcode::BoundImageAtomicOr32: | ||
| 104 | case IR::Opcode::BindlessImageAtomicOr32: | ||
| 105 | return IR::Opcode::ImageAtomicOr32; | ||
| 106 | case IR::Opcode::BoundImageAtomicXor32: | ||
| 107 | case IR::Opcode::BindlessImageAtomicXor32: | ||
| 108 | return IR::Opcode::ImageAtomicXor32; | ||
| 109 | case IR::Opcode::BoundImageAtomicExchange32: | ||
| 110 | case IR::Opcode::BindlessImageAtomicExchange32: | ||
| 111 | return IR::Opcode::ImageAtomicExchange32; | ||
| 112 | default: | ||
| 113 | return IR::Opcode::Void; | ||
| 114 | } | ||
| 115 | } | ||
| 116 | |||
| 117 | bool IsBindless(const IR::Inst& inst) { | ||
| 118 | switch (inst.GetOpcode()) { | ||
| 119 | case IR::Opcode::BindlessImageSampleImplicitLod: | ||
| 120 | case IR::Opcode::BindlessImageSampleExplicitLod: | ||
| 121 | case IR::Opcode::BindlessImageSampleDrefImplicitLod: | ||
| 122 | case IR::Opcode::BindlessImageSampleDrefExplicitLod: | ||
| 123 | case IR::Opcode::BindlessImageGather: | ||
| 124 | case IR::Opcode::BindlessImageGatherDref: | ||
| 125 | case IR::Opcode::BindlessImageFetch: | ||
| 126 | case IR::Opcode::BindlessImageQueryDimensions: | ||
| 127 | case IR::Opcode::BindlessImageQueryLod: | ||
| 128 | case IR::Opcode::BindlessImageGradient: | ||
| 129 | case IR::Opcode::BindlessImageRead: | ||
| 130 | case IR::Opcode::BindlessImageWrite: | ||
| 131 | case IR::Opcode::BindlessImageAtomicIAdd32: | ||
| 132 | case IR::Opcode::BindlessImageAtomicSMin32: | ||
| 133 | case IR::Opcode::BindlessImageAtomicUMin32: | ||
| 134 | case IR::Opcode::BindlessImageAtomicSMax32: | ||
| 135 | case IR::Opcode::BindlessImageAtomicUMax32: | ||
| 136 | case IR::Opcode::BindlessImageAtomicInc32: | ||
| 137 | case IR::Opcode::BindlessImageAtomicDec32: | ||
| 138 | case IR::Opcode::BindlessImageAtomicAnd32: | ||
| 139 | case IR::Opcode::BindlessImageAtomicOr32: | ||
| 140 | case IR::Opcode::BindlessImageAtomicXor32: | ||
| 141 | case IR::Opcode::BindlessImageAtomicExchange32: | ||
| 142 | return true; | ||
| 143 | case IR::Opcode::BoundImageSampleImplicitLod: | ||
| 144 | case IR::Opcode::BoundImageSampleExplicitLod: | ||
| 145 | case IR::Opcode::BoundImageSampleDrefImplicitLod: | ||
| 146 | case IR::Opcode::BoundImageSampleDrefExplicitLod: | ||
| 147 | case IR::Opcode::BoundImageGather: | ||
| 148 | case IR::Opcode::BoundImageGatherDref: | ||
| 149 | case IR::Opcode::BoundImageFetch: | ||
| 150 | case IR::Opcode::BoundImageQueryDimensions: | ||
| 151 | case IR::Opcode::BoundImageQueryLod: | ||
| 152 | case IR::Opcode::BoundImageGradient: | ||
| 153 | case IR::Opcode::BoundImageRead: | ||
| 154 | case IR::Opcode::BoundImageWrite: | ||
| 155 | case IR::Opcode::BoundImageAtomicIAdd32: | ||
| 156 | case IR::Opcode::BoundImageAtomicSMin32: | ||
| 157 | case IR::Opcode::BoundImageAtomicUMin32: | ||
| 158 | case IR::Opcode::BoundImageAtomicSMax32: | ||
| 159 | case IR::Opcode::BoundImageAtomicUMax32: | ||
| 160 | case IR::Opcode::BoundImageAtomicInc32: | ||
| 161 | case IR::Opcode::BoundImageAtomicDec32: | ||
| 162 | case IR::Opcode::BoundImageAtomicAnd32: | ||
| 163 | case IR::Opcode::BoundImageAtomicOr32: | ||
| 164 | case IR::Opcode::BoundImageAtomicXor32: | ||
| 165 | case IR::Opcode::BoundImageAtomicExchange32: | ||
| 166 | return false; | ||
| 167 | default: | ||
| 168 | throw InvalidArgument("Invalid opcode {}", inst.GetOpcode()); | ||
| 169 | } | ||
| 170 | } | ||
| 171 | |||
| 172 | bool IsTextureInstruction(const IR::Inst& inst) { | ||
| 173 | return IndexedInstruction(inst) != IR::Opcode::Void; | ||
| 174 | } | ||
| 175 | |||
| 176 | std::optional<ConstBufferAddr> TryGetConstBuffer(const IR::Inst* inst); | ||
| 177 | |||
| 178 | std::optional<ConstBufferAddr> Track(const IR::Value& value) { | ||
| 179 | return IR::BreadthFirstSearch(value, TryGetConstBuffer); | ||
| 180 | } | ||
| 181 | |||
| 182 | std::optional<ConstBufferAddr> TryGetConstBuffer(const IR::Inst* inst) { | ||
| 183 | switch (inst->GetOpcode()) { | ||
| 184 | default: | ||
| 185 | return std::nullopt; | ||
| 186 | case IR::Opcode::BitwiseOr32: { | ||
| 187 | std::optional lhs{Track(inst->Arg(0))}; | ||
| 188 | std::optional rhs{Track(inst->Arg(1))}; | ||
| 189 | if (!lhs || !rhs) { | ||
| 190 | return std::nullopt; | ||
| 191 | } | ||
| 192 | if (lhs->has_secondary || rhs->has_secondary) { | ||
| 193 | return std::nullopt; | ||
| 194 | } | ||
| 195 | if (lhs->count > 1 || rhs->count > 1) { | ||
| 196 | return std::nullopt; | ||
| 197 | } | ||
| 198 | if (lhs->index > rhs->index || lhs->offset > rhs->offset) { | ||
| 199 | std::swap(lhs, rhs); | ||
| 200 | } | ||
| 201 | return ConstBufferAddr{ | ||
| 202 | .index = lhs->index, | ||
| 203 | .offset = lhs->offset, | ||
| 204 | .secondary_index = rhs->index, | ||
| 205 | .secondary_offset = rhs->offset, | ||
| 206 | .dynamic_offset = {}, | ||
| 207 | .count = 1, | ||
| 208 | .has_secondary = true, | ||
| 209 | }; | ||
| 210 | } | ||
| 211 | case IR::Opcode::GetCbufU32x2: | ||
| 212 | case IR::Opcode::GetCbufU32: | ||
| 213 | break; | ||
| 214 | } | ||
| 215 | const IR::Value index{inst->Arg(0)}; | ||
| 216 | const IR::Value offset{inst->Arg(1)}; | ||
| 217 | if (!index.IsImmediate()) { | ||
| 218 | // Reading a bindless texture from variable indices is valid | ||
| 219 | // but not supported here at the moment | ||
| 220 | return std::nullopt; | ||
| 221 | } | ||
| 222 | if (offset.IsImmediate()) { | ||
| 223 | return ConstBufferAddr{ | ||
| 224 | .index = index.U32(), | ||
| 225 | .offset = offset.U32(), | ||
| 226 | .secondary_index = 0, | ||
| 227 | .secondary_offset = 0, | ||
| 228 | .dynamic_offset = {}, | ||
| 229 | .count = 1, | ||
| 230 | .has_secondary = false, | ||
| 231 | }; | ||
| 232 | } | ||
| 233 | IR::Inst* const offset_inst{offset.InstRecursive()}; | ||
| 234 | if (offset_inst->GetOpcode() != IR::Opcode::IAdd32) { | ||
| 235 | return std::nullopt; | ||
| 236 | } | ||
| 237 | u32 base_offset{}; | ||
| 238 | IR::U32 dynamic_offset; | ||
| 239 | if (offset_inst->Arg(0).IsImmediate()) { | ||
| 240 | base_offset = offset_inst->Arg(0).U32(); | ||
| 241 | dynamic_offset = IR::U32{offset_inst->Arg(1)}; | ||
| 242 | } else if (offset_inst->Arg(1).IsImmediate()) { | ||
| 243 | base_offset = offset_inst->Arg(1).U32(); | ||
| 244 | dynamic_offset = IR::U32{offset_inst->Arg(0)}; | ||
| 245 | } else { | ||
| 246 | return std::nullopt; | ||
| 247 | } | ||
| 248 | return ConstBufferAddr{ | ||
| 249 | .index = index.U32(), | ||
| 250 | .offset = base_offset, | ||
| 251 | .secondary_index = 0, | ||
| 252 | .secondary_offset = 0, | ||
| 253 | .dynamic_offset = dynamic_offset, | ||
| 254 | .count = 8, | ||
| 255 | .has_secondary = false, | ||
| 256 | }; | ||
| 257 | } | ||
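Taken together, the tracker accepts exactly three handle shapes, searched breadth-first from the bindless argument (a schematic summary; the arrow notation is illustrative, not IR dump syntax):

    // GetCbufU32(imm_index, imm_offset)                -> single handle, count = 1
    // GetCbufU32(imm_index, IAdd32(dynamic, imm_base)) -> handle array, count = 8,
    //                                                     dynamic byte offset kept
    // BitwiseOr32(cbuf_read, cbuf_read)                -> separate texture/sampler
    //                                                     pair, has_secondary = true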
| 258 | |||
| 259 | TextureInst MakeInst(Environment& env, IR::Block* block, IR::Inst& inst) { | ||
| 260 | ConstBufferAddr addr; | ||
| 261 | if (IsBindless(inst)) { | ||
| 262 | const std::optional<ConstBufferAddr> track_addr{Track(inst.Arg(0))}; | ||
| 263 | if (!track_addr) { | ||
| 264 | throw NotImplementedException("Failed to track bindless texture constant buffer"); | ||
| 265 | } | ||
| 266 | addr = *track_addr; | ||
| 267 | } else { | ||
| 268 | addr = ConstBufferAddr{ | ||
| 269 | .index = env.TextureBoundBuffer(), | ||
| 270 | .offset = inst.Arg(0).U32(), | ||
| 271 | .secondary_index = 0, | ||
| 272 | .secondary_offset = 0, | ||
| 273 | .dynamic_offset = {}, | ||
| 274 | .count = 1, | ||
| 275 | .has_secondary = false, | ||
| 276 | }; | ||
| 277 | } | ||
| 278 | return TextureInst{ | ||
| 279 | .cbuf = addr, | ||
| 280 | .inst = &inst, | ||
| 281 | .block = block, | ||
| 282 | }; | ||
| 283 | } | ||
| 284 | |||
| 285 | TextureType ReadTextureType(Environment& env, const ConstBufferAddr& cbuf) { | ||
| 286 | const u32 secondary_index{cbuf.has_secondary ? cbuf.secondary_index : cbuf.index}; | ||
| 287 | const u32 secondary_offset{cbuf.has_secondary ? cbuf.secondary_offset : cbuf.offset}; | ||
| 288 | const u32 lhs_raw{env.ReadCbufValue(cbuf.index, cbuf.offset)}; | ||
| 289 | const u32 rhs_raw{env.ReadCbufValue(secondary_index, secondary_offset)}; | ||
| 290 | return env.ReadTextureType(lhs_raw | rhs_raw); | ||
| 291 | } | ||
| 292 | |||
| 293 | class Descriptors { | ||
| 294 | public: | ||
| 295 | explicit Descriptors(TextureBufferDescriptors& texture_buffer_descriptors_, | ||
| 296 | ImageBufferDescriptors& image_buffer_descriptors_, | ||
| 297 | TextureDescriptors& texture_descriptors_, | ||
| 298 | ImageDescriptors& image_descriptors_) | ||
| 299 | : texture_buffer_descriptors{texture_buffer_descriptors_}, | ||
| 300 | image_buffer_descriptors{image_buffer_descriptors_}, | ||
| 301 | texture_descriptors{texture_descriptors_}, image_descriptors{image_descriptors_} {} | ||
| 302 | |||
| 303 | u32 Add(const TextureBufferDescriptor& desc) { | ||
| 304 | return Add(texture_buffer_descriptors, desc, [&desc](const auto& existing) { | ||
| 305 | return desc.cbuf_index == existing.cbuf_index && | ||
| 306 | desc.cbuf_offset == existing.cbuf_offset && | ||
| 307 | desc.secondary_cbuf_index == existing.secondary_cbuf_index && | ||
| 308 | desc.secondary_cbuf_offset == existing.secondary_cbuf_offset && | ||
| 309 | desc.count == existing.count && desc.size_shift == existing.size_shift && | ||
| 310 | desc.has_secondary == existing.has_secondary; | ||
| 311 | }); | ||
| 312 | } | ||
| 313 | |||
| 314 | u32 Add(const ImageBufferDescriptor& desc) { | ||
| 315 | const u32 index{Add(image_buffer_descriptors, desc, [&desc](const auto& existing) { | ||
| 316 | return desc.format == existing.format && desc.cbuf_index == existing.cbuf_index && | ||
| 317 | desc.cbuf_offset == existing.cbuf_offset && desc.count == existing.count && | ||
| 318 | desc.size_shift == existing.size_shift; | ||
| 319 | })}; | ||
| 320 | image_buffer_descriptors[index].is_written |= desc.is_written; | ||
| 321 | image_buffer_descriptors[index].is_read |= desc.is_read; | ||
| 322 | return index; | ||
| 323 | } | ||
| 324 | |||
| 325 | u32 Add(const TextureDescriptor& desc) { | ||
| 326 | return Add(texture_descriptors, desc, [&desc](const auto& existing) { | ||
| 327 | return desc.type == existing.type && desc.is_depth == existing.is_depth && | ||
| 328 | desc.has_secondary == existing.has_secondary && | ||
| 329 | desc.cbuf_index == existing.cbuf_index && | ||
| 330 | desc.cbuf_offset == existing.cbuf_offset && | ||
| 331 | desc.secondary_cbuf_index == existing.secondary_cbuf_index && | ||
| 332 | desc.secondary_cbuf_offset == existing.secondary_cbuf_offset && | ||
| 333 | desc.count == existing.count && desc.size_shift == existing.size_shift; | ||
| 334 | }); | ||
| 335 | } | ||
| 336 | |||
| 337 | u32 Add(const ImageDescriptor& desc) { | ||
| 338 | const u32 index{Add(image_descriptors, desc, [&desc](const auto& existing) { | ||
| 339 | return desc.type == existing.type && desc.format == existing.format && | ||
| 340 | desc.cbuf_index == existing.cbuf_index && | ||
| 341 | desc.cbuf_offset == existing.cbuf_offset && desc.count == existing.count && | ||
| 342 | desc.size_shift == existing.size_shift; | ||
| 343 | })}; | ||
| 344 | image_descriptors[index].is_written |= desc.is_written; | ||
| 345 | image_descriptors[index].is_read |= desc.is_read; | ||
| 346 | return index; | ||
| 347 | } | ||
| 348 | |||
| 349 | private: | ||
| 350 | template <typename Descriptors, typename Descriptor, typename Func> | ||
| 351 | static u32 Add(Descriptors& descriptors, const Descriptor& desc, Func&& pred) { | ||
| 352 | // TODO: Handle arrays | ||
| 353 | const auto it{std::ranges::find_if(descriptors, pred)}; | ||
| 354 | if (it != descriptors.end()) { | ||
| 355 | return static_cast<u32>(std::distance(descriptors.begin(), it)); | ||
| 356 | } | ||
| 357 | descriptors.push_back(desc); | ||
| 358 | return static_cast<u32>(descriptors.size()) - 1; | ||
| 359 | } | ||
| 360 | |||
| 361 | TextureBufferDescriptors& texture_buffer_descriptors; | ||
| 362 | ImageBufferDescriptors& image_buffer_descriptors; | ||
| 363 | TextureDescriptors& texture_descriptors; | ||
| 364 | ImageDescriptors& image_descriptors; | ||
| 365 | }; | ||
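Add deduplicates structurally equal descriptors, returning the existing index while accumulating is_written/is_read on the image variants; this is also what lets JoinTextureInfo at the bottom of this file merge two Info sets by simply re-adding every descriptor. A hypothetical usage sketch (field names come from the equality predicates above; the containers are assumed default-constructible):

    TextureBufferDescriptors texture_buffers;
    ImageBufferDescriptors image_buffers;
    TextureDescriptors textures;
    ImageDescriptors images;
    Descriptors registry{texture_buffers, image_buffers, textures, images};

    TextureDescriptor desc{}; // value-initialized, then filled field by field
    desc.cbuf_index = 2;
    desc.cbuf_offset = 0x30;
    desc.count = 1;
    const u32 first{registry.Add(desc)};
    const u32 second{registry.Add(desc)}; // same index: the entry is deduplicated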
| 366 | } // Anonymous namespace | ||
| 367 | |||
| 368 | void TexturePass(Environment& env, IR::Program& program) { | ||
| 369 | TextureInstVector to_replace; | ||
| 370 | for (IR::Block* const block : program.post_order_blocks) { | ||
| 371 | for (IR::Inst& inst : block->Instructions()) { | ||
| 372 | if (!IsTextureInstruction(inst)) { | ||
| 373 | continue; | ||
| 374 | } | ||
| 375 | to_replace.push_back(MakeInst(env, block, inst)); | ||
| 376 | } | ||
| 377 | } | ||
| 378 | // Sort instructions to visit textures by constant buffer index, then by offset | ||
| 379 | std::ranges::sort(to_replace, [](const auto& lhs, const auto& rhs) { | ||
| 380 | return lhs.cbuf.offset < rhs.cbuf.offset; | ||
| 381 | }); | ||
| 382 | std::stable_sort(to_replace.begin(), to_replace.end(), [](const auto& lhs, const auto& rhs) { | ||
| 383 | return lhs.cbuf.index < rhs.cbuf.index; | ||
| 384 | }); | ||
| 385 | Descriptors descriptors{ | ||
| 386 | program.info.texture_buffer_descriptors, | ||
| 387 | program.info.image_buffer_descriptors, | ||
| 388 | program.info.texture_descriptors, | ||
| 389 | program.info.image_descriptors, | ||
| 390 | }; | ||
| 391 | for (TextureInst& texture_inst : to_replace) { | ||
| 392 | // TODO: Handle arrays | ||
| 393 | IR::Inst* const inst{texture_inst.inst}; | ||
| 394 | inst->ReplaceOpcode(IndexedInstruction(*inst)); | ||
| 395 | |||
| 396 | const auto& cbuf{texture_inst.cbuf}; | ||
| 397 | auto flags{inst->Flags<IR::TextureInstInfo>()}; | ||
| 398 | switch (inst->GetOpcode()) { | ||
| 399 | case IR::Opcode::ImageQueryDimensions: | ||
| 400 | flags.type.Assign(ReadTextureType(env, cbuf)); | ||
| 401 | inst->SetFlags(flags); | ||
| 402 | break; | ||
| 403 | case IR::Opcode::ImageFetch: | ||
| 404 | if (flags.type != TextureType::Color1D) { | ||
| 405 | break; | ||
| 406 | } | ||
| 407 | if (ReadTextureType(env, cbuf) == TextureType::Buffer) { | ||
| 408 | // Replace with the bound texture type only when it's a texture buffer | ||
| 409 | // If the instruction is 1D and the bound type is 2D, don't change the code and let | ||
| 410 | // the rasterizer robustness handle it | ||
| 411 | // This happens on Fire Emblem: Three Houses | ||
| 412 | flags.type.Assign(TextureType::Buffer); | ||
| 413 | } | ||
| 414 | break; | ||
| 415 | default: | ||
| 416 | break; | ||
| 417 | } | ||
| 418 | u32 index; | ||
| 419 | switch (inst->GetOpcode()) { | ||
| 420 | case IR::Opcode::ImageRead: | ||
| 421 | case IR::Opcode::ImageAtomicIAdd32: | ||
| 422 | case IR::Opcode::ImageAtomicSMin32: | ||
| 423 | case IR::Opcode::ImageAtomicUMin32: | ||
| 424 | case IR::Opcode::ImageAtomicSMax32: | ||
| 425 | case IR::Opcode::ImageAtomicUMax32: | ||
| 426 | case IR::Opcode::ImageAtomicInc32: | ||
| 427 | case IR::Opcode::ImageAtomicDec32: | ||
| 428 | case IR::Opcode::ImageAtomicAnd32: | ||
| 429 | case IR::Opcode::ImageAtomicOr32: | ||
| 430 | case IR::Opcode::ImageAtomicXor32: | ||
| 431 | case IR::Opcode::ImageAtomicExchange32: | ||
| 432 | case IR::Opcode::ImageWrite: { | ||
| 433 | if (cbuf.has_secondary) { | ||
| 434 | throw NotImplementedException("Unexpected separate sampler"); | ||
| 435 | } | ||
| 436 | const bool is_written{inst->GetOpcode() != IR::Opcode::ImageRead}; | ||
| 437 | const bool is_read{inst->GetOpcode() != IR::Opcode::ImageWrite}; | ||
| 438 | if (flags.type == TextureType::Buffer) { | ||
| 439 | index = descriptors.Add(ImageBufferDescriptor{ | ||
| 440 | .format = flags.image_format, | ||
| 441 | .is_written = is_written, | ||
| 442 | .is_read = is_read, | ||
| 443 | .cbuf_index = cbuf.index, | ||
| 444 | .cbuf_offset = cbuf.offset, | ||
| 445 | .count = cbuf.count, | ||
| 446 | .size_shift = DESCRIPTOR_SIZE_SHIFT, | ||
| 447 | }); | ||
| 448 | } else { | ||
| 449 | index = descriptors.Add(ImageDescriptor{ | ||
| 450 | .type = flags.type, | ||
| 451 | .format = flags.image_format, | ||
| 452 | .is_written = is_written, | ||
| 453 | .is_read = is_read, | ||
| 454 | .cbuf_index = cbuf.index, | ||
| 455 | .cbuf_offset = cbuf.offset, | ||
| 456 | .count = cbuf.count, | ||
| 457 | .size_shift = DESCRIPTOR_SIZE_SHIFT, | ||
| 458 | }); | ||
| 459 | } | ||
| 460 | break; | ||
| 461 | } | ||
| 462 | default: | ||
| 463 | if (flags.type == TextureType::Buffer) { | ||
| 464 | index = descriptors.Add(TextureBufferDescriptor{ | ||
| 465 | .has_secondary = cbuf.has_secondary, | ||
| 466 | .cbuf_index = cbuf.index, | ||
| 467 | .cbuf_offset = cbuf.offset, | ||
| 468 | .secondary_cbuf_index = cbuf.secondary_index, | ||
| 469 | .secondary_cbuf_offset = cbuf.secondary_offset, | ||
| 470 | .count = cbuf.count, | ||
| 471 | .size_shift = DESCRIPTOR_SIZE_SHIFT, | ||
| 472 | }); | ||
| 473 | } else { | ||
| 474 | index = descriptors.Add(TextureDescriptor{ | ||
| 475 | .type = flags.type, | ||
| 476 | .is_depth = flags.is_depth != 0, | ||
| 477 | .has_secondary = cbuf.has_secondary, | ||
| 478 | .cbuf_index = cbuf.index, | ||
| 479 | .cbuf_offset = cbuf.offset, | ||
| 480 | .secondary_cbuf_index = cbuf.secondary_index, | ||
| 481 | .secondary_cbuf_offset = cbuf.secondary_offset, | ||
| 482 | .count = cbuf.count, | ||
| 483 | .size_shift = DESCRIPTOR_SIZE_SHIFT, | ||
| 484 | }); | ||
| 485 | } | ||
| 486 | break; | ||
| 487 | } | ||
| 488 | flags.descriptor_index.Assign(index); | ||
| 489 | inst->SetFlags(flags); | ||
| 490 | |||
| 491 | if (cbuf.count > 1) { | ||
| 492 | const auto insert_point{IR::Block::InstructionList::s_iterator_to(*inst)}; | ||
| 493 | IR::IREmitter ir{*texture_inst.block, insert_point}; | ||
| 494 | const IR::U32 shift{ir.Imm32(DESCRIPTOR_SIZE_SHIFT)}; | ||
| 495 | inst->SetArg(0, ir.ShiftRightArithmetic(cbuf.dynamic_offset, shift)); | ||
| 496 | } else { | ||
| 497 | inst->SetArg(0, IR::Value{}); | ||
| 498 | } | ||
| 499 | } | ||
| 500 | } | ||
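When cbuf.count > 1 the handle was read through a dynamic constant buffer offset, so the final rewrite above turns the instruction's first argument from a byte offset into a descriptor array index. A worked example with hypothetical values:

    // Descriptors are DESCRIPTOR_SIZE (8) bytes apart, so array element i sits
    // at byte offset base + 8 * i. TryGetConstBuffer stored base in cbuf.offset
    // and the 8 * i term in cbuf.dynamic_offset; shifting right by
    // DESCRIPTOR_SIZE_SHIFT (3) recovers i:
    //   dynamic_offset = 40 bytes  ->  40 >> 3  ->  element 5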
| 501 | |||
| 502 | void JoinTextureInfo(Info& base, Info& source) { | ||
| 503 | Descriptors descriptors{ | ||
| 504 | base.texture_buffer_descriptors, | ||
| 505 | base.image_buffer_descriptors, | ||
| 506 | base.texture_descriptors, | ||
| 507 | base.image_descriptors, | ||
| 508 | }; | ||
| 509 | for (auto& desc : source.texture_buffer_descriptors) { | ||
| 510 | descriptors.Add(desc); | ||
| 511 | } | ||
| 512 | for (auto& desc : source.image_buffer_descriptors) { | ||
| 513 | descriptors.Add(desc); | ||
| 514 | } | ||
| 515 | for (auto& desc : source.texture_descriptors) { | ||
| 516 | descriptors.Add(desc); | ||
| 517 | } | ||
| 518 | for (auto& desc : source.image_descriptors) { | ||
| 519 | descriptors.Add(desc); | ||
| 520 | } | ||
| 521 | } | ||
| 522 | |||
| 523 | } // namespace Shader::Optimization | ||
diff --git a/src/shader_recompiler/ir_opt/verification_pass.cpp b/src/shader_recompiler/ir_opt/verification_pass.cpp new file mode 100644 index 000000000..975d5aadf --- /dev/null +++ b/src/shader_recompiler/ir_opt/verification_pass.cpp | |||
| @@ -0,0 +1,98 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <map> | ||
| 6 | #include <set> | ||
| 7 | |||
| 8 | #include "shader_recompiler/exception.h" | ||
| 9 | #include "shader_recompiler/frontend/ir/basic_block.h" | ||
| 10 | #include "shader_recompiler/frontend/ir/value.h" | ||
| 11 | #include "shader_recompiler/ir_opt/passes.h" | ||
| 12 | |||
| 13 | namespace Shader::Optimization { | ||
| 14 | |||
| 15 | static void ValidateTypes(const IR::Program& program) { | ||
| 16 | for (const auto& block : program.blocks) { | ||
| 17 | for (const IR::Inst& inst : *block) { | ||
| 18 | if (inst.GetOpcode() == IR::Opcode::Phi) { | ||
| 19 | // Skip validation on phi nodes | ||
| 20 | continue; | ||
| 21 | } | ||
| 22 | const size_t num_args{inst.NumArgs()}; | ||
| 23 | for (size_t i = 0; i < num_args; ++i) { | ||
| 24 | const IR::Type t1{inst.Arg(i).Type()}; | ||
| 25 | const IR::Type t2{IR::ArgTypeOf(inst.GetOpcode(), i)}; | ||
| 26 | if (!IR::AreTypesCompatible(t1, t2)) { | ||
| 27 | throw LogicError("Invalid types in block:\n{}", IR::DumpBlock(*block)); | ||
| 28 | } | ||
| 29 | } | ||
| 30 | } | ||
| 31 | } | ||
| 32 | } | ||
| 33 | |||
| 34 | static void ValidateUses(const IR::Program& program) { | ||
| 35 | std::map<IR::Inst*, int> actual_uses; | ||
| 36 | for (const auto& block : program.blocks) { | ||
| 37 | for (const IR::Inst& inst : *block) { | ||
| 38 | const size_t num_args{inst.NumArgs()}; | ||
| 39 | for (size_t i = 0; i < num_args; ++i) { | ||
| 40 | const IR::Value arg{inst.Arg(i)}; | ||
| 41 | if (!arg.IsImmediate()) { | ||
| 42 | ++actual_uses[arg.Inst()]; | ||
| 43 | } | ||
| 44 | } | ||
| 45 | } | ||
| 46 | } | ||
| 47 | for (const auto [inst, uses] : actual_uses) { | ||
| 48 | if (inst->UseCount() != uses) { | ||
| 49 | throw LogicError("Invalid uses in block: {}", IR::DumpProgram(program)); | ||
| 50 | } | ||
| 51 | } | ||
| 52 | } | ||
| 53 | |||
| 54 | static void ValidateForwardDeclarations(const IR::Program& program) { | ||
| 55 | std::set<const IR::Inst*> definitions; | ||
| 56 | for (const IR::Block* const block : program.blocks) { | ||
| 57 | for (const IR::Inst& inst : *block) { | ||
| 58 | definitions.emplace(&inst); | ||
| 59 | if (inst.GetOpcode() == IR::Opcode::Phi) { | ||
| 60 | // Phi nodes can have forward declarations | ||
| 61 | continue; | ||
| 62 | } | ||
| 63 | const size_t num_args{inst.NumArgs()}; | ||
| 64 | for (size_t arg = 0; arg < num_args; ++arg) { | ||
| 65 | if (inst.Arg(arg).IsImmediate()) { | ||
| 66 | continue; | ||
| 67 | } | ||
| 68 | if (!definitions.contains(inst.Arg(arg).Inst())) { | ||
| 69 | throw LogicError("Forward declaration in block: {}", IR::DumpBlock(*block)); | ||
| 70 | } | ||
| 71 | } | ||
| 72 | } | ||
| 73 | } | ||
| 74 | } | ||
| 75 | |||
| 76 | static void ValidatePhiNodes(const IR::Program& program) { | ||
| 77 | for (const IR::Block* const block : program.blocks) { | ||
| 78 | bool no_more_phis{false}; | ||
| 79 | for (const IR::Inst& inst : *block) { | ||
| 80 | if (inst.GetOpcode() == IR::Opcode::Phi) { | ||
| 81 | if (no_more_phis) { | ||
| 82 | throw LogicError("Interleaved phi nodes: {}", IR::DumpBlock(*block)); | ||
| 83 | } | ||
| 84 | } else { | ||
| 85 | no_more_phis = true; | ||
| 86 | } | ||
| 87 | } | ||
| 88 | } | ||
| 89 | } | ||
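The invariant checked here is that phi nodes form a contiguous prefix of their block, which the trivial-phi reinsertion in the SSA pass preserves. Schematically:

    // Legal:   Phi, Phi, IAdd32, ...   (phis form a contiguous prefix)
    // Illegal: Phi, IAdd32, Phi, ...   (interleaved phi nodes: rejected above)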
| 90 | |||
| 91 | void VerificationPass(const IR::Program& program) { | ||
| 92 | ValidateTypes(program); | ||
| 93 | ValidateUses(program); | ||
| 94 | ValidateForwardDeclarations(program); | ||
| 95 | ValidatePhiNodes(program); | ||
| 96 | } | ||
| 97 | |||
| 98 | } // namespace Shader::Optimization | ||