diff options
| author | 2015-07-21 19:38:59 -0400 | |
|---|---|---|
| committer | 2015-08-15 17:33:44 -0400 | |
| commit | 3f69c2039de1c3d084ac2c9eb0aa9315490346bf (patch) | |
| tree | 743f6bae0c3f1d475eabb083335ad7d6377bb97e /src | |
| parent | Shader: Move shader code to its own subdirectory, "shader". (diff) | |
| download | yuzu-3f69c2039de1c3d084ac2c9eb0aa9315490346bf.tar.gz yuzu-3f69c2039de1c3d084ac2c9eb0aa9315490346bf.tar.xz yuzu-3f69c2039de1c3d084ac2c9eb0aa9315490346bf.zip | |
Shader: Define a common interface for running vertex shader programs.
Diffstat (limited to 'src')
| -rw-r--r-- | src/video_core/CMakeLists.txt | 2 | ||||
| -rw-r--r-- | src/video_core/command_processor.cpp | 5 | ||||
| -rw-r--r-- | src/video_core/pica.h | 6 | ||||
| -rw-r--r-- | src/video_core/shader/shader.cpp | 105 | ||||
| -rw-r--r-- | src/video_core/shader/shader.h | 163 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 135 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.h | 59 |
7 files changed, 289 insertions, 186 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 22252ea1d..2b859a077 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt | |||
| @@ -11,6 +11,7 @@ set(SRCS | |||
| 11 | pica.cpp | 11 | pica.cpp |
| 12 | primitive_assembly.cpp | 12 | primitive_assembly.cpp |
| 13 | rasterizer.cpp | 13 | rasterizer.cpp |
| 14 | shader/shader.cpp | ||
| 14 | shader/shader_interpreter.cpp | 15 | shader/shader_interpreter.cpp |
| 15 | utils.cpp | 16 | utils.cpp |
| 16 | video_core.cpp | 17 | video_core.cpp |
| @@ -35,6 +36,7 @@ set(HEADERS | |||
| 35 | primitive_assembly.h | 36 | primitive_assembly.h |
| 36 | rasterizer.h | 37 | rasterizer.h |
| 37 | renderer_base.h | 38 | renderer_base.h |
| 39 | shader/shader.h | ||
| 38 | shader/shader_interpreter.h | 40 | shader/shader_interpreter.h |
| 39 | utils.h | 41 | utils.h |
| 40 | video_core.h | 42 | video_core.h |
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 8f8e9872d..374c4748d 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp | |||
| @@ -215,6 +215,9 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 215 | unsigned int vertex_cache_pos = 0; | 215 | unsigned int vertex_cache_pos = 0; |
| 216 | vertex_cache_ids.fill(-1); | 216 | vertex_cache_ids.fill(-1); |
| 217 | 217 | ||
| 218 | Shader::UnitState shader_unit; | ||
| 219 | Shader::Setup(shader_unit); | ||
| 220 | |||
| 218 | for (unsigned int index = 0; index < regs.num_vertices; ++index) | 221 | for (unsigned int index = 0; index < regs.num_vertices; ++index) |
| 219 | { | 222 | { |
| 220 | unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index; | 223 | unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index; |
| @@ -307,7 +310,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 307 | &geometry_dumper, _1, _2, _3)); | 310 | &geometry_dumper, _1, _2, _3)); |
| 308 | #endif | 311 | #endif |
| 309 | // Send to vertex shader | 312 | // Send to vertex shader |
| 310 | output = Shader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs); | 313 | output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes()); |
| 311 | 314 | ||
| 312 | if (is_indexed) { | 315 | if (is_indexed) { |
| 313 | vertex_cache[vertex_cache_pos] = output; | 316 | vertex_cache[vertex_cache_pos] = output; |
diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 34b02b2f8..6ce90f95a 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h | |||
| @@ -1083,6 +1083,7 @@ private: | |||
| 1083 | // TODO: Perform proper arithmetic on this! | 1083 | // TODO: Perform proper arithmetic on this! |
| 1084 | float value; | 1084 | float value; |
| 1085 | }; | 1085 | }; |
| 1086 | static_assert(sizeof(float24) == sizeof(float), "Shader JIT assumes float24 is implemented as a 32-bit float"); | ||
| 1086 | 1087 | ||
| 1087 | /// Struct used to describe current Pica state | 1088 | /// Struct used to describe current Pica state |
| 1088 | struct State { | 1089 | struct State { |
| @@ -1092,7 +1093,10 @@ struct State { | |||
| 1092 | /// Vertex shader memory | 1093 | /// Vertex shader memory |
| 1093 | struct ShaderSetup { | 1094 | struct ShaderSetup { |
| 1094 | struct { | 1095 | struct { |
| 1095 | Math::Vec4<float24> f[96]; | 1096 | // The float uniforms are accessed by the shader JIT using SSE instructions, and are |
| 1097 | // therefore required to be 16-byte aligned. | ||
| 1098 | Math::Vec4<float24> MEMORY_ALIGNED16(f[96]); | ||
| 1099 | |||
| 1096 | std::array<bool, 16> b; | 1100 | std::array<bool, 16> b; |
| 1097 | std::array<Math::Vec4<u8>, 4> i; | 1101 | std::array<Math::Vec4<u8>, 4> i; |
| 1098 | } uniforms; | 1102 | } uniforms; |
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp new file mode 100644 index 000000000..e397e8e03 --- /dev/null +++ b/src/video_core/shader/shader.cpp | |||
| @@ -0,0 +1,105 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include "common/logging/log.h" | ||
| 6 | #include "common/profiler.h" | ||
| 7 | |||
| 8 | #include "video_core/debug_utils/debug_utils.h" | ||
| 9 | #include "video_core/pica.h" | ||
| 10 | |||
| 11 | #include "shader.h" | ||
| 12 | #include "shader_interpreter.h" | ||
| 13 | |||
| 14 | namespace Pica { | ||
| 15 | |||
| 16 | namespace Shader { | ||
| 17 | |||
| 18 | void Setup(UnitState& state) { | ||
| 19 | // Intentionally a no-op for now; `state` is left untouched. TODO(bunnei): This will be used by the JIT in a subsequent commit | ||
| 20 | } | ||
| 21 | |||
| 22 | static Common::Profiling::TimingCategory shader_category("Vertex Shader"); | ||
| 23 | |||
| 24 | OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes) { | ||
| 25 | auto& config = g_state.regs.vs; | ||
| 26 | auto& setup = g_state.vs; | ||
| 27 | |||
| 28 | Common::Profiling::ScopeTimer timer(shader_category); | ||
| 29 | |||
| 30 | state.program_counter = config.main_offset; | ||
| 31 | state.debug.max_offset = 0; | ||
| 32 | state.debug.max_opdesc_id = 0; | ||
| 33 | |||
| 34 | // Copy the first num_attributes input attributes into the input registers selected by the register map | ||
| 35 | const auto& attribute_register_map = config.input_register_map; | ||
| 36 | |||
| 37 | if (num_attributes > 0) state.input_registers[attribute_register_map.attribute0_register] = input.attr[0]; | ||
| 38 | if (num_attributes > 1) state.input_registers[attribute_register_map.attribute1_register] = input.attr[1]; | ||
| 39 | if (num_attributes > 2) state.input_registers[attribute_register_map.attribute2_register] = input.attr[2]; | ||
| 40 | if (num_attributes > 3) state.input_registers[attribute_register_map.attribute3_register] = input.attr[3]; | ||
| 41 | if (num_attributes > 4) state.input_registers[attribute_register_map.attribute4_register] = input.attr[4]; | ||
| 42 | if (num_attributes > 5) state.input_registers[attribute_register_map.attribute5_register] = input.attr[5]; | ||
| 43 | if (num_attributes > 6) state.input_registers[attribute_register_map.attribute6_register] = input.attr[6]; | ||
| 44 | if (num_attributes > 7) state.input_registers[attribute_register_map.attribute7_register] = input.attr[7]; | ||
| 45 | if (num_attributes > 8) state.input_registers[attribute_register_map.attribute8_register] = input.attr[8]; | ||
| 46 | if (num_attributes > 9) state.input_registers[attribute_register_map.attribute9_register] = input.attr[9]; | ||
| 47 | if (num_attributes > 10) state.input_registers[attribute_register_map.attribute10_register] = input.attr[10]; | ||
| 48 | if (num_attributes > 11) state.input_registers[attribute_register_map.attribute11_register] = input.attr[11]; | ||
| 49 | if (num_attributes > 12) state.input_registers[attribute_register_map.attribute12_register] = input.attr[12]; | ||
| 50 | if (num_attributes > 13) state.input_registers[attribute_register_map.attribute13_register] = input.attr[13]; | ||
| 51 | if (num_attributes > 14) state.input_registers[attribute_register_map.attribute14_register] = input.attr[14]; | ||
| 52 | if (num_attributes > 15) state.input_registers[attribute_register_map.attribute15_register] = input.attr[15]; | ||
| 53 | |||
| 54 | state.conditional_code[0] = false; | ||
| 55 | state.conditional_code[1] = false; | ||
| 56 | |||
| 57 | RunInterpreter(state); | ||
| 58 | |||
| 59 | #if PICA_DUMP_SHADERS | ||
| 60 | DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(), | ||
| 61 | state.debug.max_opdesc_id, config.main_offset, | ||
| 62 | g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here | ||
| 63 | #endif | ||
| 64 | |||
| 65 | // Gather the shader's output registers into the returned vertex, one component at a time | ||
| 66 | OutputVertex ret; | ||
| 67 | // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to | ||
| 68 | // figure out what those circumstances are and enable the remaining outputs then. | ||
| 69 | for (int i = 0; i < 7; ++i) { | ||
| 70 | const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here | ||
| 71 | |||
| 72 | u32 semantics[4] = { | ||
| 73 | output_register_map.map_x, output_register_map.map_y, | ||
| 74 | output_register_map.map_z, output_register_map.map_w | ||
| 75 | }; | ||
| 76 | |||
| 77 | for (int comp = 0; comp < 4; ++comp) { | ||
| 78 | float24* out = ((float24*)&ret) + semantics[comp]; | ||
| 79 | if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { | ||
| 80 | *out = state.output_registers[i][comp]; | ||
| 81 | } else { | ||
| 82 | // Component has no valid semantic: zero the slot `out` points at (the one indexed by INVALID) so that | ||
| 83 | // attributes which aren't output won't have denormals in them, which would slow us down later. | ||
| 84 | memset(out, 0, sizeof(*out)); | ||
| 85 | } | ||
| 86 | } | ||
| 87 | } | ||
| 88 | |||
| 89 | // The hardware takes the absolute value of vertex colors and saturates them like this, *before* doing interpolation | ||
| 90 | for (int i = 0; i < 4; ++i) { | ||
| 91 | ret.color[i] = float24::FromFloat32( | ||
| 92 | std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); | ||
| 93 | } | ||
| 94 | |||
| 95 | LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", | ||
| 96 | ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), | ||
| 97 | ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), | ||
| 98 | ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); | ||
| 99 | |||
| 100 | return ret; | ||
| 101 | } | ||
| 102 | |||
| 103 | } // namespace Shader | ||
| 104 | |||
| 105 | } // namespace Pica | ||
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h new file mode 100644 index 000000000..38c00768d --- /dev/null +++ b/src/video_core/shader/shader.h | |||
| @@ -0,0 +1,163 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <boost/container/static_vector.hpp> | ||
| 8 | #include <nihstro/shader_binary.h> | ||
| 9 | |||
| 10 | #include "common/common_funcs.h" | ||
| 11 | #include "common/common_types.h" | ||
| 12 | #include "common/vector_math.h" | ||
| 13 | |||
| 14 | #include "video_core/pica.h" | ||
| 15 | |||
| 16 | using nihstro::RegisterType; | ||
| 17 | using nihstro::SourceRegister; | ||
| 18 | using nihstro::DestRegister; | ||
| 19 | |||
| 20 | namespace Pica { | ||
| 21 | |||
| 22 | namespace Shader { | ||
| 23 | |||
| 24 | struct InputVertex { | ||
| 25 | Math::Vec4<float24> attr[16]; | ||
| 26 | }; | ||
| 27 | |||
| 28 | struct OutputVertex { | ||
| 29 | OutputVertex() = default; | ||
| 30 | |||
| 31 | // VS output attributes | ||
| 32 | Math::Vec4<float24> pos; | ||
| 33 | Math::Vec4<float24> dummy; // quaternions (not implemented, yet) | ||
| 34 | Math::Vec4<float24> color; | ||
| 35 | Math::Vec2<float24> tc0; | ||
| 36 | Math::Vec2<float24> tc1; | ||
| 37 | float24 pad[6]; | ||
| 38 | Math::Vec2<float24> tc2; | ||
| 39 | |||
| 40 | // Padding for optimal alignment | ||
| 41 | float24 pad2[4]; | ||
| 42 | |||
| 43 | // Attributes used to store intermediate results | ||
| 44 | |||
| 45 | // position after perspective divide | ||
| 46 | Math::Vec3<float24> screenpos; | ||
| 47 | float24 pad3; | ||
| 48 | |||
| 49 | // Linear interpolation, applied in place to pos, tc0-tc2, screenpos and color | ||
| 50 | // factor: 1=this, 0=vtx (each field becomes this * factor + vtx * (1 - factor)) | ||
| 51 | void Lerp(float24 factor, const OutputVertex& vtx) { | ||
| 52 | pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); | ||
| 53 | |||
| 54 | // TODO: Should perform perspective correct interpolation here... | ||
| 55 | tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); | ||
| 56 | tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); | ||
| 57 | tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); | ||
| 58 | |||
| 59 | screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); | ||
| 60 | |||
| 61 | color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); | ||
| 62 | } | ||
| 63 | |||
| 64 | // Linear interpolation, returning a new vertex (a copy of v0 blended towards v1) | ||
| 65 | // factor: 1=v0, 0=v1 (each field becomes v0 * factor + v1 * (1 - factor)) | ||
| 66 | static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { | ||
| 67 | OutputVertex ret = v0; | ||
| 68 | ret.Lerp(factor, v1); | ||
| 69 | return ret; | ||
| 70 | } | ||
| 71 | }; | ||
| 72 | static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); | ||
| 73 | static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); | ||
| 74 | |||
| 75 | /** | ||
| 76 | * This structure contains the state information that needs to be unique for a shader unit. The 3DS | ||
| 77 | * has four shader units that process shaders in parallel. At present, Citra only implements a | ||
| 78 | * single shader unit that processes all shaders serially. Putting the state information in a struct | ||
| 79 | * here will make it easier for us to parallelize the shader processing later. | ||
| 80 | */ | ||
| 81 | struct UnitState { | ||
| 82 | // The registers are accessed by the shader JIT using SSE instructions, and are therefore | ||
| 83 | // required to be 16-byte aligned. | ||
| 84 | Math::Vec4<float24> MEMORY_ALIGNED16(input_registers[16]); | ||
| 85 | Math::Vec4<float24> MEMORY_ALIGNED16(output_registers[16]); | ||
| 86 | Math::Vec4<float24> MEMORY_ALIGNED16(temporary_registers[16]); | ||
| 87 | |||
| 88 | u32 program_counter; | ||
| 89 | bool conditional_code[2]; | ||
| 90 | |||
| 91 | // Two address registers and one loop counter | ||
| 92 | // TODO: How many bits do these actually have? | ||
| 93 | s32 address_registers[3]; | ||
| 94 | |||
| 95 | enum { | ||
| 96 | INVALID_ADDRESS = 0xFFFFFFFF | ||
| 97 | }; | ||
| 98 | |||
| 99 | struct CallStackElement { | ||
| 100 | u32 final_address; // Address upon which we jump to return_address | ||
| 101 | u32 return_address; // Where to jump when leaving scope | ||
| 102 | u8 repeat_counter; // How often to repeat until this call stack element is removed | ||
| 103 | u8 loop_increment; // Which value to add to the loop counter after an iteration | ||
| 104 | // TODO: Should this be a signed value? Does it even matter? | ||
| 105 | u32 loop_address; // The address where we'll return to after each loop iteration | ||
| 106 | }; | ||
| 107 | |||
| 108 | // TODO: Is there a maximal size for this? (The static_vector below caps it at 16 entries.) | ||
| 109 | boost::container::static_vector<CallStackElement, 16> call_stack; | ||
| 110 | |||
| 111 | struct { | ||
| 112 | u32 max_offset; // maximum program counter ever reached | ||
| 113 | u32 max_opdesc_id; // maximum swizzle pattern index ever used | ||
| 114 | } debug; | ||
| 115 | |||
| 116 | static int InputOffset(const SourceRegister& reg) { | ||
| 117 | switch (reg.GetRegisterType()) { | ||
| 118 | case RegisterType::Input: | ||
| 119 | return (int)offsetof(UnitState, input_registers) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||
| 120 | |||
| 121 | case RegisterType::Temporary: | ||
| 122 | return (int)offsetof(UnitState, temporary_registers) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||
| 123 | |||
| 124 | default: | ||
| 125 | UNREACHABLE(); | ||
| 126 | return 0; | ||
| 127 | } | ||
| 128 | } | ||
| 129 | |||
| 130 | static int OutputOffset(const DestRegister& reg) { | ||
| 131 | switch (reg.GetRegisterType()) { | ||
| 132 | case RegisterType::Output: | ||
| 133 | return (int)offsetof(UnitState, output_registers) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||
| 134 | |||
| 135 | case RegisterType::Temporary: | ||
| 136 | return (int)offsetof(UnitState, temporary_registers) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||
| 137 | |||
| 138 | default: | ||
| 139 | UNREACHABLE(); | ||
| 140 | return 0; | ||
| 141 | } | ||
| 142 | } | ||
| 143 | }; | ||
| 144 | |||
| 145 | /** | ||
| 146 | * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per | ||
| 147 | * vertex, which would happen within the `Run` function). | ||
| 148 | * @param state Shader unit state, must be setup per shader and per shader unit | ||
| 149 | */ | ||
| 150 | void Setup(UnitState& state); | ||
| 151 | |||
| 152 | /** | ||
| 153 | * Runs the currently setup shader | ||
| 154 | * @param state Shader unit state, must be setup per shader and per shader unit | ||
| 155 | * @param input Input vertex into the shader | ||
| 156 | * @param num_attributes The number of vertex shader attributes | ||
| 157 | * @return The output vertex, after having been processed by the vertex shader | ||
| 158 | */ | ||
| 159 | OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes); | ||
| 160 | |||
| 161 | } // namespace Shader | ||
| 162 | |||
| 163 | } // namespace Pica | ||
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 369883225..eb48e7053 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp | |||
| @@ -2,18 +2,14 @@ | |||
| 2 | // Licensed under GPLv2 or any later version | 2 | // Licensed under GPLv2 or any later version |
| 3 | // Refer to the license.txt file included. | 3 | // Refer to the license.txt file included. |
| 4 | 4 | ||
| 5 | #include <boost/container/static_vector.hpp> | ||
| 6 | #include <boost/range/algorithm.hpp> | ||
| 7 | |||
| 8 | #include <common/file_util.h> | 5 | #include <common/file_util.h> |
| 9 | 6 | ||
| 10 | #include <nihstro/shader_bytecode.h> | 7 | #include <nihstro/shader_bytecode.h> |
| 11 | 8 | ||
| 12 | #include "common/profiler.h" | ||
| 13 | |||
| 14 | #include "video_core/pica.h" | 9 | #include "video_core/pica.h" |
| 15 | #include "video_core/shader/shader_interpreter.h" | 10 | |
| 16 | #include "video_core/debug_utils/debug_utils.h" | 11 | #include "shader.h" |
| 12 | #include "shader_interpreter.h" | ||
| 17 | 13 | ||
| 18 | using nihstro::OpCode; | 14 | using nihstro::OpCode; |
| 19 | using nihstro::Instruction; | 15 | using nihstro::Instruction; |
| @@ -25,42 +21,7 @@ namespace Pica { | |||
| 25 | 21 | ||
| 26 | namespace Shader { | 22 | namespace Shader { |
| 27 | 23 | ||
| 28 | struct ShaderState { | 24 | void RunInterpreter(UnitState& state) { |
| 29 | u32 program_counter; | ||
| 30 | |||
| 31 | const float24* input_register_table[16]; | ||
| 32 | Math::Vec4<float24> output_registers[16]; | ||
| 33 | |||
| 34 | Math::Vec4<float24> temporary_registers[16]; | ||
| 35 | bool conditional_code[2]; | ||
| 36 | |||
| 37 | // Two Address registers and one loop counter | ||
| 38 | // TODO: How many bits do these actually have? | ||
| 39 | s32 address_registers[3]; | ||
| 40 | |||
| 41 | enum { | ||
| 42 | INVALID_ADDRESS = 0xFFFFFFFF | ||
| 43 | }; | ||
| 44 | |||
| 45 | struct CallStackElement { | ||
| 46 | u32 final_address; // Address upon which we jump to return_address | ||
| 47 | u32 return_address; // Where to jump when leaving scope | ||
| 48 | u8 repeat_counter; // How often to repeat until this call stack element is removed | ||
| 49 | u8 loop_increment; // Which value to add to the loop counter after an iteration | ||
| 50 | // TODO: Should this be a signed value? Does it even matter? | ||
| 51 | u32 loop_address; // The address where we'll return to after each loop iteration | ||
| 52 | }; | ||
| 53 | |||
| 54 | // TODO: Is there a maximal size for this? | ||
| 55 | boost::container::static_vector<CallStackElement, 16> call_stack; | ||
| 56 | |||
| 57 | struct { | ||
| 58 | u32 max_offset; // maximum program counter ever reached | ||
| 59 | u32 max_opdesc_id; // maximum swizzle pattern index ever used | ||
| 60 | } debug; | ||
| 61 | }; | ||
| 62 | |||
| 63 | static void ProcessShaderCode(ShaderState& state) { | ||
| 64 | const auto& uniforms = g_state.vs.uniforms; | 25 | const auto& uniforms = g_state.vs.uniforms; |
| 65 | const auto& swizzle_data = g_state.vs.swizzle_data; | 26 | const auto& swizzle_data = g_state.vs.swizzle_data; |
| 66 | const auto& program_code = g_state.vs.program_code; | 27 | const auto& program_code = g_state.vs.program_code; |
| @@ -90,7 +51,7 @@ static void ProcessShaderCode(ShaderState& state) { | |||
| 90 | const Instruction instr = { program_code[state.program_counter] }; | 51 | const Instruction instr = { program_code[state.program_counter] }; |
| 91 | const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; | 52 | const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; |
| 92 | 53 | ||
| 93 | static auto call = [](ShaderState& state, u32 offset, u32 num_instructions, | 54 | static auto call = [](UnitState& state, u32 offset, u32 num_instructions, |
| 94 | u32 return_offset, u8 repeat_count, u8 loop_increment) { | 55 | u32 return_offset, u8 repeat_count, u8 loop_increment) { |
| 95 | state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset | 56 | state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset |
| 96 | ASSERT(state.call_stack.size() < state.call_stack.capacity()); | 57 | ASSERT(state.call_stack.size() < state.call_stack.capacity()); |
| @@ -101,7 +62,7 @@ static void ProcessShaderCode(ShaderState& state) { | |||
| 101 | auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { | 62 | auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { |
| 102 | switch (source_reg.GetRegisterType()) { | 63 | switch (source_reg.GetRegisterType()) { |
| 103 | case RegisterType::Input: | 64 | case RegisterType::Input: |
| 104 | return state.input_register_table[source_reg.GetIndex()]; | 65 | return &state.input_registers[source_reg.GetIndex()].x; |
| 105 | 66 | ||
| 106 | case RegisterType::Temporary: | 67 | case RegisterType::Temporary: |
| 107 | return &state.temporary_registers[source_reg.GetIndex()].x; | 68 | return &state.temporary_registers[source_reg.GetIndex()].x; |
| @@ -413,7 +374,7 @@ static void ProcessShaderCode(ShaderState& state) { | |||
| 413 | 374 | ||
| 414 | default: | 375 | default: |
| 415 | { | 376 | { |
| 416 | static auto evaluate_condition = [](const ShaderState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { | 377 | static auto evaluate_condition = [](const UnitState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { |
| 417 | bool results[2] = { refx == state.conditional_code[0], | 378 | bool results[2] = { refx == state.conditional_code[0], |
| 418 | refy == state.conditional_code[1] }; | 379 | refy == state.conditional_code[1] }; |
| 419 | 380 | ||
| @@ -542,88 +503,6 @@ static void ProcessShaderCode(ShaderState& state) { | |||
| 542 | } | 503 | } |
| 543 | } | 504 | } |
| 544 | 505 | ||
| 545 | static Common::Profiling::TimingCategory shader_category("Vertex Shader"); | ||
| 546 | |||
| 547 | OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) { | ||
| 548 | Common::Profiling::ScopeTimer timer(shader_category); | ||
| 549 | |||
| 550 | ShaderState state; | ||
| 551 | |||
| 552 | state.program_counter = config.main_offset; | ||
| 553 | state.debug.max_offset = 0; | ||
| 554 | state.debug.max_opdesc_id = 0; | ||
| 555 | |||
| 556 | // Setup input register table | ||
| 557 | const auto& attribute_register_map = config.input_register_map; | ||
| 558 | float24 dummy_register; | ||
| 559 | boost::fill(state.input_register_table, &dummy_register); | ||
| 560 | |||
| 561 | if (num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x; | ||
| 562 | if (num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x; | ||
| 563 | if (num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x; | ||
| 564 | if (num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x; | ||
| 565 | if (num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x; | ||
| 566 | if (num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x; | ||
| 567 | if (num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x; | ||
| 568 | if (num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x; | ||
| 569 | if (num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x; | ||
| 570 | if (num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x; | ||
| 571 | if (num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x; | ||
| 572 | if (num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x; | ||
| 573 | if (num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x; | ||
| 574 | if (num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x; | ||
| 575 | if (num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x; | ||
| 576 | if (num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x; | ||
| 577 | |||
| 578 | state.conditional_code[0] = false; | ||
| 579 | state.conditional_code[1] = false; | ||
| 580 | |||
| 581 | ProcessShaderCode(state); | ||
| 582 | #if PICA_DUMP_SHADERS | ||
| 583 | DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(), | ||
| 584 | state.debug.max_opdesc_id, config.main_offset, | ||
| 585 | g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here | ||
| 586 | #endif | ||
| 587 | |||
| 588 | // Setup output data | ||
| 589 | OutputVertex ret; | ||
| 590 | // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to | ||
| 591 | // figure out what those circumstances are and enable the remaining outputs then. | ||
| 592 | for (int i = 0; i < 7; ++i) { | ||
| 593 | const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here | ||
| 594 | |||
| 595 | u32 semantics[4] = { | ||
| 596 | output_register_map.map_x, output_register_map.map_y, | ||
| 597 | output_register_map.map_z, output_register_map.map_w | ||
| 598 | }; | ||
| 599 | |||
| 600 | for (int comp = 0; comp < 4; ++comp) { | ||
| 601 | float24* out = ((float24*)&ret) + semantics[comp]; | ||
| 602 | if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { | ||
| 603 | *out = state.output_registers[i][comp]; | ||
| 604 | } else { | ||
| 605 | // Zero output so that attributes which aren't output won't have denormals in them, | ||
| 606 | // which would slow us down later. | ||
| 607 | memset(out, 0, sizeof(*out)); | ||
| 608 | } | ||
| 609 | } | ||
| 610 | } | ||
| 611 | |||
| 612 | // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation | ||
| 613 | for (int i = 0; i < 4; ++i) { | ||
| 614 | ret.color[i] = float24::FromFloat32( | ||
| 615 | std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); | ||
| 616 | } | ||
| 617 | |||
| 618 | LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", | ||
| 619 | ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), | ||
| 620 | ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), | ||
| 621 | ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); | ||
| 622 | |||
| 623 | return ret; | ||
| 624 | } | ||
| 625 | |||
| 626 | |||
| 627 | } // namespace | 506 | } // namespace |
| 628 | 507 | ||
| 629 | } // namespace | 508 | } // namespace |
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h index f2900bfc6..ad6e58e39 100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h | |||
| @@ -4,68 +4,15 @@ | |||
| 4 | 4 | ||
| 5 | #pragma once | 5 | #pragma once |
| 6 | 6 | ||
| 7 | #include <type_traits> | ||
| 8 | |||
| 9 | #include "common/vector_math.h" | ||
| 10 | |||
| 11 | #include "video_core/pica.h" | 7 | #include "video_core/pica.h" |
| 12 | 8 | ||
| 9 | #include "shader.h" | ||
| 10 | |||
| 13 | namespace Pica { | 11 | namespace Pica { |
| 14 | 12 | ||
| 15 | namespace Shader { | 13 | namespace Shader { |
| 16 | 14 | ||
| 17 | struct InputVertex { | 15 | void RunInterpreter(UnitState& state); |
| 18 | Math::Vec4<float24> attr[16]; | ||
| 19 | }; | ||
| 20 | |||
| 21 | struct OutputVertex { | ||
| 22 | OutputVertex() = default; | ||
| 23 | |||
| 24 | // VS output attributes | ||
| 25 | Math::Vec4<float24> pos; | ||
| 26 | Math::Vec4<float24> dummy; // quaternions (not implemented, yet) | ||
| 27 | Math::Vec4<float24> color; | ||
| 28 | Math::Vec2<float24> tc0; | ||
| 29 | Math::Vec2<float24> tc1; | ||
| 30 | float24 pad[6]; | ||
| 31 | Math::Vec2<float24> tc2; | ||
| 32 | |||
| 33 | // Padding for optimal alignment | ||
| 34 | float24 pad2[4]; | ||
| 35 | |||
| 36 | // Attributes used to store intermediate results | ||
| 37 | |||
| 38 | // position after perspective divide | ||
| 39 | Math::Vec3<float24> screenpos; | ||
| 40 | float24 pad3; | ||
| 41 | |||
| 42 | // Linear interpolation | ||
| 43 | // factor: 0=this, 1=vtx | ||
| 44 | void Lerp(float24 factor, const OutputVertex& vtx) { | ||
| 45 | pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); | ||
| 46 | |||
| 47 | // TODO: Should perform perspective correct interpolation here... | ||
| 48 | tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); | ||
| 49 | tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); | ||
| 50 | tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); | ||
| 51 | |||
| 52 | screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); | ||
| 53 | |||
| 54 | color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); | ||
| 55 | } | ||
| 56 | |||
| 57 | // Linear interpolation | ||
| 58 | // factor: 0=v0, 1=v1 | ||
| 59 | static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { | ||
| 60 | OutputVertex ret = v0; | ||
| 61 | ret.Lerp(factor, v1); | ||
| 62 | return ret; | ||
| 63 | } | ||
| 64 | }; | ||
| 65 | static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); | ||
| 66 | static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); | ||
| 67 | |||
| 68 | OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup); | ||
| 69 | 16 | ||
| 70 | } // namespace | 17 | } // namespace |
| 71 | 18 | ||