diff options
| author | 2016-05-13 08:49:20 +0200 | |
|---|---|---|
| committer | 2016-05-16 18:55:51 +0200 | |
| commit | ff0fa86b17e8133263bb54c1338ade8ecd97e5d9 (patch) | |
| tree | aedf8d5ac4ecc967ab7bacff7ea011104e95f99f /src | |
| parent | Merge pull request #1787 from JayFoxRox/refactor-jit (diff) | |
| download | yuzu-ff0fa86b17e8133263bb54c1338ade8ecd97e5d9.tar.gz yuzu-ff0fa86b17e8133263bb54c1338ade8ecd97e5d9.tar.xz yuzu-ff0fa86b17e8133263bb54c1338ade8ecd97e5d9.zip | |
Retrieve shader result from new OutputRegisters-type
Diffstat (limited to 'src')
| -rw-r--r-- | src/video_core/command_processor.cpp | 21 | ||||
| -rw-r--r-- | src/video_core/shader/shader.cpp | 103 | ||||
| -rw-r--r-- | src/video_core/shader/shader.h | 17 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 4 |
4 files changed, 81 insertions, 64 deletions
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index ad0da796e..c29a3fe51 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp | |||
| @@ -149,7 +149,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 149 | // Send to vertex shader | 149 | // Send to vertex shader |
| 150 | if (g_debug_context) | 150 | if (g_debug_context) |
| 151 | g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast<void*>(&immediate_input)); | 151 | g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast<void*>(&immediate_input)); |
| 152 | Shader::OutputVertex output = g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1); | 152 | g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1); |
| 153 | Shader::OutputVertex output_vertex = shader_unit.output_registers.ToVertex(regs.vs); | ||
| 153 | 154 | ||
| 154 | // Send to renderer | 155 | // Send to renderer |
| 155 | using Pica::Shader::OutputVertex; | 156 | using Pica::Shader::OutputVertex; |
| @@ -157,7 +158,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 157 | VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); | 158 | VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); |
| 158 | }; | 159 | }; |
| 159 | 160 | ||
| 160 | g_state.primitive_assembler.SubmitVertex(output, AddTriangle); | 161 | g_state.primitive_assembler.SubmitVertex(output_vertex, AddTriangle); |
| 161 | } | 162 | } |
| 162 | } | 163 | } |
| 163 | } | 164 | } |
| @@ -231,7 +232,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 231 | // The size has been tuned for optimal balance between hit-rate and the cost of lookup | 232 | // The size has been tuned for optimal balance between hit-rate and the cost of lookup |
| 232 | const size_t VERTEX_CACHE_SIZE = 32; | 233 | const size_t VERTEX_CACHE_SIZE = 32; |
| 233 | std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids; | 234 | std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids; |
| 234 | std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache; | 235 | std::array<Shader::OutputRegisters, VERTEX_CACHE_SIZE> vertex_cache; |
| 235 | 236 | ||
| 236 | unsigned int vertex_cache_pos = 0; | 237 | unsigned int vertex_cache_pos = 0; |
| 237 | vertex_cache_ids.fill(-1); | 238 | vertex_cache_ids.fill(-1); |
| @@ -249,7 +250,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 249 | ASSERT(vertex != -1); | 250 | ASSERT(vertex != -1); |
| 250 | 251 | ||
| 251 | bool vertex_cache_hit = false; | 252 | bool vertex_cache_hit = false; |
| 252 | Shader::OutputVertex output; | 253 | Shader::OutputRegisters output_registers; |
| 253 | 254 | ||
| 254 | if (is_indexed) { | 255 | if (is_indexed) { |
| 255 | if (g_debug_context && Pica::g_debug_context->recorder) { | 256 | if (g_debug_context && Pica::g_debug_context->recorder) { |
| @@ -259,7 +260,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 259 | 260 | ||
| 260 | for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) { | 261 | for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) { |
| 261 | if (vertex == vertex_cache_ids[i]) { | 262 | if (vertex == vertex_cache_ids[i]) { |
| 262 | output = vertex_cache[i]; | 263 | output_registers = vertex_cache[i]; |
| 263 | vertex_cache_hit = true; | 264 | vertex_cache_hit = true; |
| 264 | break; | 265 | break; |
| 265 | } | 266 | } |
| @@ -274,15 +275,19 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 274 | // Send to vertex shader | 275 | // Send to vertex shader |
| 275 | if (g_debug_context) | 276 | if (g_debug_context) |
| 276 | g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input); | 277 | g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input); |
| 277 | output = g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes()); | 278 | g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes()); |
| 279 | output_registers = shader_unit.output_registers; | ||
| 278 | 280 | ||
| 279 | if (is_indexed) { | 281 | if (is_indexed) { |
| 280 | vertex_cache[vertex_cache_pos] = output; | 282 | vertex_cache[vertex_cache_pos] = output_registers; |
| 281 | vertex_cache_ids[vertex_cache_pos] = vertex; | 283 | vertex_cache_ids[vertex_cache_pos] = vertex; |
| 282 | vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE; | 284 | vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE; |
| 283 | } | 285 | } |
| 284 | } | 286 | } |
| 285 | 287 | ||
| 288 | // Retrieve vertex from register data | ||
| 289 | Shader::OutputVertex output_vertex = output_registers.ToVertex(regs.vs); | ||
| 290 | |||
| 286 | // Send to renderer | 291 | // Send to renderer |
| 287 | using Pica::Shader::OutputVertex; | 292 | using Pica::Shader::OutputVertex; |
| 288 | auto AddTriangle = []( | 293 | auto AddTriangle = []( |
| @@ -290,7 +295,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 290 | VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); | 295 | VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); |
| 291 | }; | 296 | }; |
| 292 | 297 | ||
| 293 | primitive_assembler.SubmitVertex(output, AddTriangle); | 298 | primitive_assembler.SubmitVertex(output_vertex, AddTriangle); |
| 294 | } | 299 | } |
| 295 | 300 | ||
| 296 | for (auto& range : memory_accesses.ranges) { | 301 | for (auto& range : memory_accesses.ranges) { |
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 161097610..f565e2c91 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp | |||
| @@ -30,6 +30,58 @@ namespace Pica { | |||
| 30 | 30 | ||
| 31 | namespace Shader { | 31 | namespace Shader { |
| 32 | 32 | ||
| 33 | OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) { | ||
| 34 | // Setup output data | ||
| 35 | OutputVertex ret; | ||
| 36 | // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to | ||
| 37 | // figure out what those circumstances are and enable the remaining outputs then. | ||
| 38 | unsigned index = 0; | ||
| 39 | for (unsigned i = 0; i < 7; ++i) { | ||
| 40 | |||
| 41 | if (index >= g_state.regs.vs_output_total) | ||
| 42 | break; | ||
| 43 | |||
| 44 | if ((config.output_mask & (1 << i)) == 0) | ||
| 45 | continue; | ||
| 46 | |||
| 47 | const auto& output_register_map = g_state.regs.vs_output_attributes[index]; | ||
| 48 | |||
| 49 | u32 semantics[4] = { | ||
| 50 | output_register_map.map_x, output_register_map.map_y, | ||
| 51 | output_register_map.map_z, output_register_map.map_w | ||
| 52 | }; | ||
| 53 | |||
| 54 | for (unsigned comp = 0; comp < 4; ++comp) { | ||
| 55 | float24* out = ((float24*)&ret) + semantics[comp]; | ||
| 56 | if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { | ||
| 57 | *out = value[i][comp]; | ||
| 58 | } else { | ||
| 59 | // Zero output so that attributes which aren't output won't have denormals in them, | ||
| 60 | // which would slow us down later. | ||
| 61 | memset(out, 0, sizeof(*out)); | ||
| 62 | } | ||
| 63 | } | ||
| 64 | |||
| 65 | index++; | ||
| 66 | } | ||
| 67 | |||
| 68 | // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation | ||
| 69 | for (unsigned i = 0; i < 4; ++i) { | ||
| 70 | ret.color[i] = float24::FromFloat32( | ||
| 71 | std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); | ||
| 72 | } | ||
| 73 | |||
| 74 | LOG_TRACE(HW_GPU, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), " | ||
| 75 | "col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f), view(%.2f, %.2f, %.2f)", | ||
| 76 | ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), | ||
| 77 | ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(), ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(), | ||
| 78 | ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), | ||
| 79 | ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32(), | ||
| 80 | ret.view.x.ToFloat32(), ret.view.y.ToFloat32(), ret.view.z.ToFloat32()); | ||
| 81 | |||
| 82 | return ret; | ||
| 83 | } | ||
| 84 | |||
| 33 | #ifdef ARCHITECTURE_x86_64 | 85 | #ifdef ARCHITECTURE_x86_64 |
| 34 | static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map; | 86 | static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map; |
| 35 | static const JitShader* jit_shader; | 87 | static const JitShader* jit_shader; |
| @@ -62,7 +114,7 @@ void ShaderSetup::Setup() { | |||
| 62 | 114 | ||
| 63 | MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); | 115 | MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); |
| 64 | 116 | ||
| 65 | OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, int num_attributes) { | 117 | void ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, int num_attributes) { |
| 66 | auto& config = g_state.regs.vs; | 118 | auto& config = g_state.regs.vs; |
| 67 | auto& setup = g_state.vs; | 119 | auto& setup = g_state.vs; |
| 68 | 120 | ||
| @@ -89,55 +141,6 @@ OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, | |||
| 89 | RunInterpreter(setup, state, config.main_offset); | 141 | RunInterpreter(setup, state, config.main_offset); |
| 90 | #endif // ARCHITECTURE_x86_64 | 142 | #endif // ARCHITECTURE_x86_64 |
| 91 | 143 | ||
| 92 | // Setup output data | ||
| 93 | OutputVertex ret; | ||
| 94 | // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to | ||
| 95 | // figure out what those circumstances are and enable the remaining outputs then. | ||
| 96 | unsigned index = 0; | ||
| 97 | for (unsigned i = 0; i < 7; ++i) { | ||
| 98 | |||
| 99 | if (index >= g_state.regs.vs_output_total) | ||
| 100 | break; | ||
| 101 | |||
| 102 | if ((g_state.regs.vs.output_mask & (1 << i)) == 0) | ||
| 103 | continue; | ||
| 104 | |||
| 105 | const auto& output_register_map = g_state.regs.vs_output_attributes[index]; // TODO: Don't hardcode VS here | ||
| 106 | |||
| 107 | u32 semantics[4] = { | ||
| 108 | output_register_map.map_x, output_register_map.map_y, | ||
| 109 | output_register_map.map_z, output_register_map.map_w | ||
| 110 | }; | ||
| 111 | |||
| 112 | for (unsigned comp = 0; comp < 4; ++comp) { | ||
| 113 | float24* out = ((float24*)&ret) + semantics[comp]; | ||
| 114 | if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { | ||
| 115 | *out = state.registers.output[i][comp]; | ||
| 116 | } else { | ||
| 117 | // Zero output so that attributes which aren't output won't have denormals in them, | ||
| 118 | // which would slow us down later. | ||
| 119 | memset(out, 0, sizeof(*out)); | ||
| 120 | } | ||
| 121 | } | ||
| 122 | |||
| 123 | index++; | ||
| 124 | } | ||
| 125 | |||
| 126 | // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation | ||
| 127 | for (unsigned i = 0; i < 4; ++i) { | ||
| 128 | ret.color[i] = float24::FromFloat32( | ||
| 129 | std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); | ||
| 130 | } | ||
| 131 | |||
| 132 | LOG_TRACE(HW_GPU, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), " | ||
| 133 | "col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f), view(%.2f, %.2f, %.2f)", | ||
| 134 | ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), | ||
| 135 | ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(), ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(), | ||
| 136 | ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), | ||
| 137 | ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32(), | ||
| 138 | ret.view.x.ToFloat32(), ret.view.y.ToFloat32(), ret.view.z.ToFloat32()); | ||
| 139 | |||
| 140 | return ret; | ||
| 141 | } | 144 | } |
| 142 | 145 | ||
| 143 | DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) { | 146 | DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) { |
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 84898f21c..fee16df62 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h | |||
| @@ -84,6 +84,15 @@ struct OutputVertex { | |||
| 84 | static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); | 84 | static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); |
| 85 | static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); | 85 | static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); |
| 86 | 86 | ||
| 87 | struct OutputRegisters { | ||
| 88 | OutputRegisters() = default; | ||
| 89 | |||
| 90 | alignas(16) Math::Vec4<float24> value[16]; | ||
| 91 | |||
| 92 | OutputVertex ToVertex(const Regs::ShaderConfig& config); | ||
| 93 | }; | ||
| 94 | static_assert(std::is_pod<OutputRegisters>::value, "Structure is not POD"); | ||
| 95 | |||
| 87 | // Helper structure used to keep track of data useful for inspection of shader emulation | 96 | // Helper structure used to keep track of data useful for inspection of shader emulation |
| 88 | template<bool full_debugging> | 97 | template<bool full_debugging> |
| 89 | struct DebugData; | 98 | struct DebugData; |
| @@ -267,11 +276,12 @@ struct UnitState { | |||
| 267 | // The registers are accessed by the shader JIT using SSE instructions, and are therefore | 276 | // The registers are accessed by the shader JIT using SSE instructions, and are therefore |
| 268 | // required to be 16-byte aligned. | 277 | // required to be 16-byte aligned. |
| 269 | alignas(16) Math::Vec4<float24> input[16]; | 278 | alignas(16) Math::Vec4<float24> input[16]; |
| 270 | alignas(16) Math::Vec4<float24> output[16]; | ||
| 271 | alignas(16) Math::Vec4<float24> temporary[16]; | 279 | alignas(16) Math::Vec4<float24> temporary[16]; |
| 272 | } registers; | 280 | } registers; |
| 273 | static_assert(std::is_pod<Registers>::value, "Structure is not POD"); | 281 | static_assert(std::is_pod<Registers>::value, "Structure is not POD"); |
| 274 | 282 | ||
| 283 | OutputRegisters output_registers; | ||
| 284 | |||
| 275 | bool conditional_code[2]; | 285 | bool conditional_code[2]; |
| 276 | 286 | ||
| 277 | // Two Address registers and one loop counter | 287 | // Two Address registers and one loop counter |
| @@ -297,7 +307,7 @@ struct UnitState { | |||
| 297 | static size_t OutputOffset(const DestRegister& reg) { | 307 | static size_t OutputOffset(const DestRegister& reg) { |
| 298 | switch (reg.GetRegisterType()) { | 308 | switch (reg.GetRegisterType()) { |
| 299 | case RegisterType::Output: | 309 | case RegisterType::Output: |
| 300 | return offsetof(UnitState, registers.output) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | 310 | return offsetof(UnitState, output_registers.value) + reg.GetIndex()*sizeof(Math::Vec4<float24>); |
| 301 | 311 | ||
| 302 | case RegisterType::Temporary: | 312 | case RegisterType::Temporary: |
| 303 | return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | 313 | return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); |
| @@ -354,9 +364,8 @@ struct ShaderSetup { | |||
| 354 | * @param state Shader unit state, must be setup per shader and per shader unit | 364 | * @param state Shader unit state, must be setup per shader and per shader unit |
| 355 | * @param input Input vertex into the shader | 365 | * @param input Input vertex into the shader |
| 356 | * @param num_attributes The number of vertex shader attributes | 366 | * @param num_attributes The number of vertex shader attributes |
| 357 | * @return The output vertex, after having been processed by the vertex shader | ||
| 358 | */ | 367 | */ |
| 359 | OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes); | 368 | void Run(UnitState<false>& state, const InputVertex& input, int num_attributes); |
| 360 | 369 | ||
| 361 | /** | 370 | /** |
| 362 | * Produce debug information based on the given shader and input vertex | 371 | * Produce debug information based on the given shader and input vertex |
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 714e8bfd5..b1eadc071 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp | |||
| @@ -144,7 +144,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState<Debug>& state, unsigned | |||
| 144 | src2[3] = src2[3] * float24::FromFloat32(-1); | 144 | src2[3] = src2[3] * float24::FromFloat32(-1); |
| 145 | } | 145 | } |
| 146 | 146 | ||
| 147 | float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] | 147 | float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers.value[instr.common.dest.Value().GetIndex()][0] |
| 148 | : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] | 148 | : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] |
| 149 | : dummy_vec4_float24; | 149 | : dummy_vec4_float24; |
| 150 | 150 | ||
| @@ -483,7 +483,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState<Debug>& state, unsigned | |||
| 483 | src3[3] = src3[3] * float24::FromFloat32(-1); | 483 | src3[3] = src3[3] * float24::FromFloat32(-1); |
| 484 | } | 484 | } |
| 485 | 485 | ||
| 486 | float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] | 486 | float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers.value[instr.mad.dest.Value().GetIndex()][0] |
| 487 | : (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] | 487 | : (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] |
| 488 | : dummy_vec4_float24; | 488 | : dummy_vec4_float24; |
| 489 | 489 | ||