summary refs log tree commit diff
path: root/src/video_core
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core')
-rw-r--r-- src/video_core/CMakeLists.txt 4
-rw-r--r-- src/video_core/command_processor.cpp 492
-rw-r--r-- src/video_core/geometry_pipeline.cpp 274
-rw-r--r-- src/video_core/geometry_pipeline.h 49
-rw-r--r-- src/video_core/pica.cpp 21
-rw-r--r-- src/video_core/pica_state.h 13
-rw-r--r-- src/video_core/pica_types.h 18
-rw-r--r-- src/video_core/primitive_assembly.cpp 15
-rw-r--r-- src/video_core/primitive_assembly.h 7
-rw-r--r-- src/video_core/regs_framebuffer.h 10
-rw-r--r-- src/video_core/regs_pipeline.h 43
-rw-r--r-- src/video_core/regs_rasterizer.h 14
-rw-r--r-- src/video_core/regs_shader.h 7
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer.cpp 56
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer.h 9
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_gen.cpp 115
-rw-r--r-- src/video_core/renderer_opengl/gl_state.cpp 13
-rw-r--r-- src/video_core/renderer_opengl/gl_state.h 3
-rw-r--r-- src/video_core/shader/shader.cpp 44
-rw-r--r-- src/video_core/shader/shader.h 49
-rw-r--r-- src/video_core/shader/shader_interpreter.cpp 18
-rw-r--r-- src/video_core/shader/shader_jit_x64_compiler.cpp 49
-rw-r--r-- src/video_core/shader/shader_jit_x64_compiler.h 2
-rw-r--r-- src/video_core/swrasterizer/clipper.cpp 46
-rw-r--r-- src/video_core/swrasterizer/framebuffer.cpp 2
-rw-r--r-- src/video_core/swrasterizer/lighting.cpp 308
-rw-r--r-- src/video_core/swrasterizer/lighting.h 19
-rw-r--r-- src/video_core/swrasterizer/rasterizer.cpp 29
-rw-r--r-- src/video_core/swrasterizer/rasterizer.h 6
-rw-r--r-- src/video_core/swrasterizer/texturing.cpp 4
-rw-r--r-- src/video_core/utils.h 14
31 files changed, 1401 insertions, 352 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 0961a3251..82f47d8a9 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,6 +1,7 @@
1set(SRCS 1set(SRCS
2 command_processor.cpp 2 command_processor.cpp
3 debug_utils/debug_utils.cpp 3 debug_utils/debug_utils.cpp
4 geometry_pipeline.cpp
4 pica.cpp 5 pica.cpp
5 primitive_assembly.cpp 6 primitive_assembly.cpp
6 regs.cpp 7 regs.cpp
@@ -15,6 +16,7 @@ set(SRCS
15 shader/shader_interpreter.cpp 16 shader/shader_interpreter.cpp
16 swrasterizer/clipper.cpp 17 swrasterizer/clipper.cpp
17 swrasterizer/framebuffer.cpp 18 swrasterizer/framebuffer.cpp
19 swrasterizer/lighting.cpp
18 swrasterizer/proctex.cpp 20 swrasterizer/proctex.cpp
19 swrasterizer/rasterizer.cpp 21 swrasterizer/rasterizer.cpp
20 swrasterizer/swrasterizer.cpp 22 swrasterizer/swrasterizer.cpp
@@ -28,6 +30,7 @@ set(SRCS
28set(HEADERS 30set(HEADERS
29 command_processor.h 31 command_processor.h
30 debug_utils/debug_utils.h 32 debug_utils/debug_utils.h
33 geometry_pipeline.h
31 gpu_debugger.h 34 gpu_debugger.h
32 pica.h 35 pica.h
33 pica_state.h 36 pica_state.h
@@ -55,6 +58,7 @@ set(HEADERS
55 shader/shader_interpreter.h 58 shader/shader_interpreter.h
56 swrasterizer/clipper.h 59 swrasterizer/clipper.h
57 swrasterizer/framebuffer.h 60 swrasterizer/framebuffer.h
61 swrasterizer/lighting.h
58 swrasterizer/proctex.h 62 swrasterizer/proctex.h
59 swrasterizer/rasterizer.h 63 swrasterizer/rasterizer.h
60 swrasterizer/swrasterizer.h 64 swrasterizer/swrasterizer.h
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 4633a1df1..caf9f7a06 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -119,24 +119,221 @@ static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup,
119 } 119 }
120} 120}
121 121
122static void WriteProgramCode(ShaderRegs& config, Shader::ShaderSetup& setup, 122static void LoadDefaultVertexAttributes(u32 register_value) {
123 unsigned max_program_code_length, u32 value) { 123 auto& regs = g_state.regs;
124 if (config.program.offset >= max_program_code_length) { 124
125 LOG_ERROR(HW_GPU, "Invalid %s program offset %d", GetShaderSetupTypeName(setup), 125 // TODO: Does actual hardware indeed keep an intermediate buffer or does
126 (int)config.program.offset); 126 // it directly write the values?
127 } else { 127 default_attr_write_buffer[default_attr_counter++] = register_value;
128 setup.program_code[config.program.offset] = value; 128
129 config.program.offset++; 129 // Default attributes are written in a packed format such that four float24 values are encoded
130 // in three 32-bit numbers.
131 // We write to internal memory once a full such vector is written.
132 if (default_attr_counter >= 3) {
133 default_attr_counter = 0;
134
135 auto& setup = regs.pipeline.vs_default_attributes_setup;
136
137 if (setup.index >= 16) {
138 LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
139 return;
140 }
141
142 Math::Vec4<float24> attribute;
143
144 // NOTE: The destination component order indeed is "backwards"
145 attribute.w = float24::FromRaw(default_attr_write_buffer[0] >> 8);
146 attribute.z = float24::FromRaw(((default_attr_write_buffer[0] & 0xFF) << 16) |
147 ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
148 attribute.y = float24::FromRaw(((default_attr_write_buffer[1] & 0xFFFF) << 8) |
149 ((default_attr_write_buffer[2] >> 24) & 0xFF));
150 attribute.x = float24::FromRaw(default_attr_write_buffer[2] & 0xFFFFFF);
151
152 LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
153 attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
154 attribute.w.ToFloat32());
155
156 // TODO: Verify that this actually modifies the register!
157 if (setup.index < 15) {
158 g_state.input_default_attributes.attr[setup.index] = attribute;
159 setup.index++;
160 } else {
161 // Put each attribute into an immediate input buffer. When all specified immediate
162 // attributes are present, the Vertex Shader is invoked and everything is sent to
163 // the primitive assembler.
164
165 auto& immediate_input = g_state.immediate.input_vertex;
166 auto& immediate_attribute_id = g_state.immediate.current_attribute;
167
168 immediate_input.attr[immediate_attribute_id] = attribute;
169
170 if (immediate_attribute_id < regs.pipeline.max_input_attrib_index) {
171 immediate_attribute_id += 1;
172 } else {
173 MICROPROFILE_SCOPE(GPU_Drawing);
174 immediate_attribute_id = 0;
175
176 auto* shader_engine = Shader::GetEngine();
177 shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
178
179 // Send to vertex shader
180 if (g_debug_context)
181 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
182 static_cast<void*>(&immediate_input));
183 Shader::UnitState shader_unit;
184 Shader::AttributeBuffer output{};
185
186 shader_unit.LoadInput(regs.vs, immediate_input);
187 shader_engine->Run(g_state.vs, shader_unit);
188 shader_unit.WriteOutput(regs.vs, output);
189
190 // Send to geometry pipeline
191 if (g_state.immediate.reset_geometry_pipeline) {
192 g_state.geometry_pipeline.Reconfigure();
193 g_state.immediate.reset_geometry_pipeline = false;
194 }
195 ASSERT(!g_state.geometry_pipeline.NeedIndexInput());
196 g_state.geometry_pipeline.Setup(shader_engine);
197 g_state.geometry_pipeline.SubmitVertex(output);
198
199 // TODO: If drawing after every immediate mode triangle kills performance,
200 // change it to flush triangles whenever a drawing config register changes
201 // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550
202 VideoCore::g_renderer->Rasterizer()->DrawTriangles();
203 if (g_debug_context) {
204 g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
205 }
206 }
207 }
130 } 208 }
131} 209}
132 210
133static void WriteSwizzlePatterns(ShaderRegs& config, Shader::ShaderSetup& setup, u32 value) { 211static void Draw(u32 command_id) {
134 if (config.swizzle_patterns.offset >= setup.swizzle_data.size()) { 212 MICROPROFILE_SCOPE(GPU_Drawing);
135 LOG_ERROR(HW_GPU, "Invalid %s swizzle pattern offset %d", GetShaderSetupTypeName(setup), 213 auto& regs = g_state.regs;
136 (int)config.swizzle_patterns.offset); 214
137 } else { 215#if PICA_LOG_TEV
138 setup.swizzle_data[config.swizzle_patterns.offset] = value; 216 DebugUtils::DumpTevStageConfig(regs.GetTevStages());
139 config.swizzle_patterns.offset++; 217#endif
218 if (g_debug_context)
219 g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
220
221 // Processes information about internal vertex attributes to figure out how a vertex is
222 // loaded.
223 // Later, these can be compiled and cached.
224 const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
225 VertexLoader loader(regs.pipeline);
226
227 // Load vertices
228 bool is_indexed = (command_id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
229
230 const auto& index_info = regs.pipeline.index_array;
231 const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
232 const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
233 bool index_u16 = index_info.format != 0;
234
235 PrimitiveAssembler<Shader::OutputVertex>& primitive_assembler = g_state.primitive_assembler;
236
237 if (g_debug_context && g_debug_context->recorder) {
238 for (int i = 0; i < 3; ++i) {
239 const auto texture = regs.texturing.GetTextures()[i];
240 if (!texture.enabled)
241 continue;
242
243 u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
244 g_debug_context->recorder->MemoryAccessed(
245 texture_data, Pica::TexturingRegs::NibblesPerPixel(texture.format) *
246 texture.config.width / 2 * texture.config.height,
247 texture.config.GetPhysicalAddress());
248 }
249 }
250
251 DebugUtils::MemoryAccessTracker memory_accesses;
252
253 // Simple circular-replacement vertex cache
254 // The size has been tuned for optimal balance between hit-rate and the cost of lookup
255 const size_t VERTEX_CACHE_SIZE = 32;
256 std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
257 std::array<Shader::AttributeBuffer, VERTEX_CACHE_SIZE> vertex_cache;
258 Shader::AttributeBuffer vs_output;
259
260 unsigned int vertex_cache_pos = 0;
261 vertex_cache_ids.fill(-1);
262
263 auto* shader_engine = Shader::GetEngine();
264 Shader::UnitState shader_unit;
265
266 shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
267
268 g_state.geometry_pipeline.Reconfigure();
269 g_state.geometry_pipeline.Setup(shader_engine);
270 if (g_state.geometry_pipeline.NeedIndexInput())
271 ASSERT(is_indexed);
272
273 for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
274 // Indexed rendering doesn't use the start offset
275 unsigned int vertex = is_indexed
276 ? (index_u16 ? index_address_16[index] : index_address_8[index])
277 : (index + regs.pipeline.vertex_offset);
278
279 // -1 is a common special value used for primitive restart. Since it's unknown if
280 // the PICA supports it, and it would mess up the caching, guard against it here.
281 ASSERT(vertex != -1);
282
283 bool vertex_cache_hit = false;
284
285 if (is_indexed) {
286 if (g_state.geometry_pipeline.NeedIndexInput()) {
287 g_state.geometry_pipeline.SubmitIndex(vertex);
288 continue;
289 }
290
291 if (g_debug_context && Pica::g_debug_context->recorder) {
292 int size = index_u16 ? 2 : 1;
293 memory_accesses.AddAccess(base_address + index_info.offset + size * index, size);
294 }
295
296 for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
297 if (vertex == vertex_cache_ids[i]) {
298 vs_output = vertex_cache[i];
299 vertex_cache_hit = true;
300 break;
301 }
302 }
303 }
304
305 if (!vertex_cache_hit) {
306 // Initialize data for the current vertex
307 Shader::AttributeBuffer input;
308 loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
309
310 // Send to vertex shader
311 if (g_debug_context)
312 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
313 (void*)&input);
314 shader_unit.LoadInput(regs.vs, input);
315 shader_engine->Run(g_state.vs, shader_unit);
316 shader_unit.WriteOutput(regs.vs, vs_output);
317
318 if (is_indexed) {
319 vertex_cache[vertex_cache_pos] = vs_output;
320 vertex_cache_ids[vertex_cache_pos] = vertex;
321 vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
322 }
323 }
324
325 // Send to geometry pipeline
326 g_state.geometry_pipeline.SubmitVertex(vs_output);
327 }
328
329 for (auto& range : memory_accesses.ranges) {
330 g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
331 range.second, range.first);
332 }
333
334 VideoCore::g_renderer->Rasterizer()->DrawTriangles();
335 if (g_debug_context) {
336 g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
140 } 337 }
141} 338}
142 339
@@ -182,106 +379,19 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
182 379
183 case PICA_REG_INDEX(pipeline.vs_default_attributes_setup.index): 380 case PICA_REG_INDEX(pipeline.vs_default_attributes_setup.index):
184 g_state.immediate.current_attribute = 0; 381 g_state.immediate.current_attribute = 0;
382 g_state.immediate.reset_geometry_pipeline = true;
185 default_attr_counter = 0; 383 default_attr_counter = 0;
186 break; 384 break;
187 385
188 // Load default vertex input attributes 386 // Load default vertex input attributes
189 case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[0], 0x233): 387 case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[0], 0x233):
190 case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[1], 0x234): 388 case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[1], 0x234):
191 case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[2], 0x235): { 389 case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[2], 0x235):
192 // TODO: Does actual hardware indeed keep an intermediate buffer or does 390 LoadDefaultVertexAttributes(value);
193 // it directly write the values?
194 default_attr_write_buffer[default_attr_counter++] = value;
195
196 // Default attributes are written in a packed format such that four float24 values are
197 // encoded in
198 // three 32-bit numbers. We write to internal memory once a full such vector is
199 // written.
200 if (default_attr_counter >= 3) {
201 default_attr_counter = 0;
202
203 auto& setup = regs.pipeline.vs_default_attributes_setup;
204
205 if (setup.index >= 16) {
206 LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
207 break;
208 }
209
210 Math::Vec4<float24> attribute;
211
212 // NOTE: The destination component order indeed is "backwards"
213 attribute.w = float24::FromRaw(default_attr_write_buffer[0] >> 8);
214 attribute.z = float24::FromRaw(((default_attr_write_buffer[0] & 0xFF) << 16) |
215 ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
216 attribute.y = float24::FromRaw(((default_attr_write_buffer[1] & 0xFFFF) << 8) |
217 ((default_attr_write_buffer[2] >> 24) & 0xFF));
218 attribute.x = float24::FromRaw(default_attr_write_buffer[2] & 0xFFFFFF);
219
220 LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
221 attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
222 attribute.w.ToFloat32());
223
224 // TODO: Verify that this actually modifies the register!
225 if (setup.index < 15) {
226 g_state.input_default_attributes.attr[setup.index] = attribute;
227 setup.index++;
228 } else {
229 // Put each attribute into an immediate input buffer. When all specified immediate
230 // attributes are present, the Vertex Shader is invoked and everything is sent to
231 // the primitive assembler.
232
233 auto& immediate_input = g_state.immediate.input_vertex;
234 auto& immediate_attribute_id = g_state.immediate.current_attribute;
235
236 immediate_input.attr[immediate_attribute_id] = attribute;
237
238 if (immediate_attribute_id < regs.pipeline.max_input_attrib_index) {
239 immediate_attribute_id += 1;
240 } else {
241 MICROPROFILE_SCOPE(GPU_Drawing);
242 immediate_attribute_id = 0;
243
244 auto* shader_engine = Shader::GetEngine();
245 shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
246
247 // Send to vertex shader
248 if (g_debug_context)
249 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
250 static_cast<void*>(&immediate_input));
251 Shader::UnitState shader_unit;
252 Shader::AttributeBuffer output{};
253
254 shader_unit.LoadInput(regs.vs, immediate_input);
255 shader_engine->Run(g_state.vs, shader_unit);
256 shader_unit.WriteOutput(regs.vs, output);
257
258 // Send to renderer
259 using Pica::Shader::OutputVertex;
260 auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
261 const OutputVertex& v2) {
262 VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
263 };
264
265 g_state.primitive_assembler.SubmitVertex(
266 Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output),
267 AddTriangle);
268 }
269 }
270 }
271 break; 391 break;
272 }
273 392
274 case PICA_REG_INDEX(pipeline.gpu_mode): 393 case PICA_REG_INDEX(pipeline.gpu_mode):
275 if (regs.pipeline.gpu_mode == PipelineRegs::GPUMode::Configuring) { 394 // This register likely just enables vertex processing and doesn't need any special handling
276 MICROPROFILE_SCOPE(GPU_Drawing);
277
278 // Draw immediate mode triangles when GPU Mode is set to GPUMode::Configuring
279 VideoCore::g_renderer->Rasterizer()->DrawTriangles();
280
281 if (g_debug_context) {
282 g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
283 }
284 }
285 break; 395 break;
286 396
287 case PICA_REG_INDEX_WORKAROUND(pipeline.command_buffer.trigger[0], 0x23c): 397 case PICA_REG_INDEX_WORKAROUND(pipeline.command_buffer.trigger[0], 0x23c):
@@ -297,130 +407,9 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
297 407
298 // It seems like these trigger vertex rendering 408 // It seems like these trigger vertex rendering
299 case PICA_REG_INDEX(pipeline.trigger_draw): 409 case PICA_REG_INDEX(pipeline.trigger_draw):
300 case PICA_REG_INDEX(pipeline.trigger_draw_indexed): { 410 case PICA_REG_INDEX(pipeline.trigger_draw_indexed):
301 MICROPROFILE_SCOPE(GPU_Drawing); 411 Draw(id);
302
303#if PICA_LOG_TEV
304 DebugUtils::DumpTevStageConfig(regs.GetTevStages());
305#endif
306 if (g_debug_context)
307 g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
308
309 // Processes information about internal vertex attributes to figure out how a vertex is
310 // loaded.
311 // Later, these can be compiled and cached.
312 const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
313 VertexLoader loader(regs.pipeline);
314
315 // Load vertices
316 bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
317
318 const auto& index_info = regs.pipeline.index_array;
319 const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
320 const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
321 bool index_u16 = index_info.format != 0;
322
323 PrimitiveAssembler<Shader::OutputVertex>& primitive_assembler = g_state.primitive_assembler;
324
325 if (g_debug_context && g_debug_context->recorder) {
326 for (int i = 0; i < 3; ++i) {
327 const auto texture = regs.texturing.GetTextures()[i];
328 if (!texture.enabled)
329 continue;
330
331 u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
332 g_debug_context->recorder->MemoryAccessed(
333 texture_data, Pica::TexturingRegs::NibblesPerPixel(texture.format) *
334 texture.config.width / 2 * texture.config.height,
335 texture.config.GetPhysicalAddress());
336 }
337 }
338
339 DebugUtils::MemoryAccessTracker memory_accesses;
340
341 // Simple circular-replacement vertex cache
342 // The size has been tuned for optimal balance between hit-rate and the cost of lookup
343 const size_t VERTEX_CACHE_SIZE = 32;
344 std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
345 std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache;
346 Shader::OutputVertex output_vertex;
347
348 unsigned int vertex_cache_pos = 0;
349 vertex_cache_ids.fill(-1);
350
351 auto* shader_engine = Shader::GetEngine();
352 Shader::UnitState shader_unit;
353
354 shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
355
356 for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
357 // Indexed rendering doesn't use the start offset
358 unsigned int vertex =
359 is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index])
360 : (index + regs.pipeline.vertex_offset);
361
362 // -1 is a common special value used for primitive restart. Since it's unknown if
363 // the PICA supports it, and it would mess up the caching, guard against it here.
364 ASSERT(vertex != -1);
365
366 bool vertex_cache_hit = false;
367
368 if (is_indexed) {
369 if (g_debug_context && Pica::g_debug_context->recorder) {
370 int size = index_u16 ? 2 : 1;
371 memory_accesses.AddAccess(base_address + index_info.offset + size * index,
372 size);
373 }
374
375 for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
376 if (vertex == vertex_cache_ids[i]) {
377 output_vertex = vertex_cache[i];
378 vertex_cache_hit = true;
379 break;
380 }
381 }
382 }
383
384 if (!vertex_cache_hit) {
385 // Initialize data for the current vertex
386 Shader::AttributeBuffer input, output{};
387 loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
388
389 // Send to vertex shader
390 if (g_debug_context)
391 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
392 (void*)&input);
393 shader_unit.LoadInput(regs.vs, input);
394 shader_engine->Run(g_state.vs, shader_unit);
395 shader_unit.WriteOutput(regs.vs, output);
396
397 // Retrieve vertex from register data
398 output_vertex = Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output);
399
400 if (is_indexed) {
401 vertex_cache[vertex_cache_pos] = output_vertex;
402 vertex_cache_ids[vertex_cache_pos] = vertex;
403 vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
404 }
405 }
406
407 // Send to renderer
408 using Pica::Shader::OutputVertex;
409 auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
410 const OutputVertex& v2) {
411 VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
412 };
413
414 primitive_assembler.SubmitVertex(output_vertex, AddTriangle);
415 }
416
417 for (auto& range : memory_accesses.ranges) {
418 g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
419 range.second, range.first);
420 }
421
422 break; 412 break;
423 }
424 413
425 case PICA_REG_INDEX(gs.bool_uniforms): 414 case PICA_REG_INDEX(gs.bool_uniforms):
426 WriteUniformBoolReg(g_state.gs, g_state.regs.gs.bool_uniforms.Value()); 415 WriteUniformBoolReg(g_state.gs, g_state.regs.gs.bool_uniforms.Value());
@@ -458,7 +447,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
458 case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1): 447 case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1):
459 case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2): 448 case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2):
460 case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3): { 449 case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3): {
461 WriteProgramCode(g_state.regs.gs, g_state.gs, 4096, value); 450 u32& offset = g_state.regs.gs.program.offset;
451 if (offset >= 4096) {
452 LOG_ERROR(HW_GPU, "Invalid GS program offset %u", offset);
453 } else {
454 g_state.gs.program_code[offset] = value;
455 offset++;
456 }
462 break; 457 break;
463 } 458 }
464 459
@@ -470,11 +465,18 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
470 case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab): 465 case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab):
471 case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac): 466 case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac):
472 case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad): { 467 case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad): {
473 WriteSwizzlePatterns(g_state.regs.gs, g_state.gs, value); 468 u32& offset = g_state.regs.gs.swizzle_patterns.offset;
469 if (offset >= g_state.gs.swizzle_data.size()) {
470 LOG_ERROR(HW_GPU, "Invalid GS swizzle pattern offset %u", offset);
471 } else {
472 g_state.gs.swizzle_data[offset] = value;
473 offset++;
474 }
474 break; 475 break;
475 } 476 }
476 477
477 case PICA_REG_INDEX(vs.bool_uniforms): 478 case PICA_REG_INDEX(vs.bool_uniforms):
479 // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
478 WriteUniformBoolReg(g_state.vs, g_state.regs.vs.bool_uniforms.Value()); 480 WriteUniformBoolReg(g_state.vs, g_state.regs.vs.bool_uniforms.Value());
479 break; 481 break;
480 482
@@ -482,6 +484,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
482 case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[1], 0x2b2): 484 case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[1], 0x2b2):
483 case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3): 485 case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3):
484 case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4): { 486 case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4): {
487 // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
485 unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1)); 488 unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1));
486 auto values = regs.vs.int_uniforms[index]; 489 auto values = regs.vs.int_uniforms[index];
487 WriteUniformIntReg(g_state.vs, index, 490 WriteUniformIntReg(g_state.vs, index,
@@ -497,6 +500,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
497 case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[5], 0x2c6): 500 case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[5], 0x2c6):
498 case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7): 501 case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7):
499 case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8): { 502 case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8): {
503 // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
500 WriteUniformFloatReg(g_state.regs.vs, g_state.vs, vs_float_regs_counter, 504 WriteUniformFloatReg(g_state.regs.vs, g_state.vs, vs_float_regs_counter,
501 vs_uniform_write_buffer, value); 505 vs_uniform_write_buffer, value);
502 break; 506 break;
@@ -510,7 +514,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
510 case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[5], 0x2d1): 514 case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[5], 0x2d1):
511 case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2): 515 case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2):
512 case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3): { 516 case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3): {
513 WriteProgramCode(g_state.regs.vs, g_state.vs, 512, value); 517 u32& offset = g_state.regs.vs.program.offset;
518 if (offset >= 512) {
519 LOG_ERROR(HW_GPU, "Invalid VS program offset %u", offset);
520 } else {
521 g_state.vs.program_code[offset] = value;
522 if (!g_state.regs.pipeline.gs_unit_exclusive_configuration) {
523 g_state.gs.program_code[offset] = value;
524 }
525 offset++;
526 }
514 break; 527 break;
515 } 528 }
516 529
@@ -522,7 +535,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
522 case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[5], 0x2db): 535 case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[5], 0x2db):
523 case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc): 536 case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc):
524 case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd): { 537 case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd): {
525 WriteSwizzlePatterns(g_state.regs.vs, g_state.vs, value); 538 u32& offset = g_state.regs.vs.swizzle_patterns.offset;
539 if (offset >= g_state.vs.swizzle_data.size()) {
540 LOG_ERROR(HW_GPU, "Invalid VS swizzle pattern offset %u", offset);
541 } else {
542 g_state.vs.swizzle_data[offset] = value;
543 if (!g_state.regs.pipeline.gs_unit_exclusive_configuration) {
544 g_state.gs.swizzle_data[offset] = value;
545 }
546 offset++;
547 }
526 break; 548 break;
527 } 549 }
528 550
@@ -620,6 +642,6 @@ void ProcessCommandList(const u32* list, u32 size) {
620 } 642 }
621} 643}
622 644
623} // namespace 645} // namespace CommandProcessor
624 646
625} // namespace 647} // namespace Pica
diff --git a/src/video_core/geometry_pipeline.cpp b/src/video_core/geometry_pipeline.cpp
new file mode 100644
index 000000000..98ff2ccd3
--- /dev/null
+++ b/src/video_core/geometry_pipeline.cpp
@@ -0,0 +1,274 @@
1// Copyright 2017 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "video_core/geometry_pipeline.h"
6#include "video_core/pica_state.h"
7#include "video_core/regs.h"
8#include "video_core/renderer_base.h"
9#include "video_core/video_core.h"
10
11namespace Pica {
12
13/// An attribute buffering interface for different pipeline modes
14class GeometryPipelineBackend {
15public:
16 virtual ~GeometryPipelineBackend() = default;
17
18 /// Checks if there is no incomplete data transfer
19 virtual bool IsEmpty() const = 0;
20
21 /// Checks if the pipeline needs a direct input from index buffer
22 virtual bool NeedIndexInput() const = 0;
23
24 /// Submits an index from index buffer
25 virtual void SubmitIndex(unsigned int val) = 0;
26
27 /**
28 * Submits vertex attributes
29 * @param input attributes of a vertex output from vertex shader
30 * @return true if the buffer is full and the geometry shader should be invoked
31 */
32 virtual bool SubmitVertex(const Shader::AttributeBuffer& input) = 0;
33};
34
35// In the Point mode, vertex attributes are sent to the input registers in the geometry shader unit.
36// The sizes of vertex shader outputs and geometry shader inputs are constant. The geometry shader
37// is invoked once its input buffer is filled by vertex shader outputs. For example, if we have a
38// geometry shader that takes 6 inputs, and the vertex shader outputs 2 attributes, it would take
39// 3 vertices for one geometry shader invocation.
40// TODO: what happens when the input size is not divisible by the output size?
41class GeometryPipeline_Point : public GeometryPipelineBackend {
42public:
43 GeometryPipeline_Point(const Regs& regs, Shader::GSUnitState& unit) : regs(regs), unit(unit) {
44 ASSERT(regs.pipeline.variable_primitive == 0);
45 ASSERT(regs.gs.input_to_uniform == 0);
46 vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
47 size_t gs_input_num = regs.gs.max_input_attribute_index + 1;
48 ASSERT(gs_input_num % vs_output_num == 0);
49 buffer_cur = attribute_buffer.attr;
50 buffer_end = attribute_buffer.attr + gs_input_num;
51 }
52
53 bool IsEmpty() const override {
54 return buffer_cur == attribute_buffer.attr;
55 }
56
57 bool NeedIndexInput() const override {
58 return false;
59 }
60
61 void SubmitIndex(unsigned int val) override {
62 UNREACHABLE();
63 }
64
65 bool SubmitVertex(const Shader::AttributeBuffer& input) override {
66 buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
67 if (buffer_cur == buffer_end) {
68 buffer_cur = attribute_buffer.attr;
69 unit.LoadInput(regs.gs, attribute_buffer);
70 return true;
71 }
72 return false;
73 }
74
75private:
76 const Regs& regs;
77 Shader::GSUnitState& unit;
78 Shader::AttributeBuffer attribute_buffer;
79 Math::Vec4<float24>* buffer_cur;
80 Math::Vec4<float24>* buffer_end;
81 unsigned int vs_output_num;
82};
83
84// In VariablePrimitive mode, vertex attributes are buffered into the uniform registers in the
85// geometry shader unit. The number of vertices is variable and is specified by the first index
86// value in the batch. This mode is usually used for subdivision.
87class GeometryPipeline_VariablePrimitive : public GeometryPipelineBackend {
88public:
89 GeometryPipeline_VariablePrimitive(const Regs& regs, Shader::ShaderSetup& setup)
90 : regs(regs), setup(setup) {
91 ASSERT(regs.pipeline.variable_primitive == 1);
92 ASSERT(regs.gs.input_to_uniform == 1);
93 vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
94 }
95
96 bool IsEmpty() const override {
97 return need_index;
98 }
99
100 bool NeedIndexInput() const override {
101 return need_index;
102 }
103
104 void SubmitIndex(unsigned int val) override {
105 DEBUG_ASSERT(need_index);
106
107 // The number of input vertices is put into the first uniform register
108 float24 vertex_num = float24::FromFloat32(static_cast<float>(val));
109 setup.uniforms.f[0] = Math::MakeVec(vertex_num, vertex_num, vertex_num, vertex_num);
110
111 // The second uniform register and so on are used for receiving input vertices
112 buffer_cur = setup.uniforms.f + 1;
113
114 main_vertex_num = regs.pipeline.variable_vertex_main_num_minus_1 + 1;
115 total_vertex_num = val;
116 need_index = false;
117 }
118
119 bool SubmitVertex(const Shader::AttributeBuffer& input) override {
120 DEBUG_ASSERT(!need_index);
121 if (main_vertex_num != 0) {
122 // For main vertices, receive all attributes
123 buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
124 --main_vertex_num;
125 } else {
126 // For other vertices, only receive the first attribute (usually the position)
127 *(buffer_cur++) = input.attr[0];
128 }
129 --total_vertex_num;
130
131 if (total_vertex_num == 0) {
132 need_index = true;
133 return true;
134 }
135
136 return false;
137 }
138
139private:
140 bool need_index = true;
141 const Regs& regs;
142 Shader::ShaderSetup& setup;
143 unsigned int main_vertex_num;
144 unsigned int total_vertex_num;
145 Math::Vec4<float24>* buffer_cur;
146 unsigned int vs_output_num;
147};
148
149// In FixedPrimitive mode, vertex attributes are buffered into the uniform registers in the geometry
150// shader unit. The number of vertices per shader invocation is constant. This is usually used
151// for particle systems.
152class GeometryPipeline_FixedPrimitive : public GeometryPipelineBackend {
153public:
154 GeometryPipeline_FixedPrimitive(const Regs& regs, Shader::ShaderSetup& setup)
155 : regs(regs), setup(setup) {
156 ASSERT(regs.pipeline.variable_primitive == 0);
157 ASSERT(regs.gs.input_to_uniform == 1);
158 vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
159 ASSERT(vs_output_num == regs.pipeline.gs_config.stride_minus_1 + 1);
160 size_t vertex_num = regs.pipeline.gs_config.fixed_vertex_num_minus_1 + 1;
161 buffer_cur = buffer_begin = setup.uniforms.f + regs.pipeline.gs_config.start_index;
162 buffer_end = buffer_begin + vs_output_num * vertex_num;
163 }
164
165 bool IsEmpty() const override {
166 return buffer_cur == buffer_begin;
167 }
168
169 bool NeedIndexInput() const override {
170 return false;
171 }
172
173 void SubmitIndex(unsigned int val) override {
174 UNREACHABLE();
175 }
176
177 bool SubmitVertex(const Shader::AttributeBuffer& input) override {
178 buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
179 if (buffer_cur == buffer_end) {
180 buffer_cur = buffer_begin;
181 return true;
182 }
183 return false;
184 }
185
186private:
187 const Regs& regs;
188 Shader::ShaderSetup& setup;
189 Math::Vec4<float24>* buffer_begin;
190 Math::Vec4<float24>* buffer_cur;
191 Math::Vec4<float24>* buffer_end;
192 unsigned int vs_output_num;
193};
194
195GeometryPipeline::GeometryPipeline(State& state) : state(state) {}
196
197GeometryPipeline::~GeometryPipeline() = default;
198
199void GeometryPipeline::SetVertexHandler(Shader::VertexHandler vertex_handler) {
200 this->vertex_handler = vertex_handler;
201}
202
203void GeometryPipeline::Setup(Shader::ShaderEngine* shader_engine) {
204 if (!backend)
205 return;
206
207 this->shader_engine = shader_engine;
208 shader_engine->SetupBatch(state.gs, state.regs.gs.main_offset);
209}
210
211void GeometryPipeline::Reconfigure() {
212 ASSERT(!backend || backend->IsEmpty());
213
214 if (state.regs.pipeline.use_gs == PipelineRegs::UseGS::No) {
215 backend = nullptr;
216 return;
217 }
218
219 ASSERT(state.regs.pipeline.use_gs == PipelineRegs::UseGS::Yes);
220
221 // The following assumes that when geometry shader is in use, the shader unit 3 is configured as
222 // a geometry shader unit.
223 // TODO: what happens if this is not true?
224 ASSERT(state.regs.pipeline.gs_unit_exclusive_configuration == 1);
225 ASSERT(state.regs.gs.shader_mode == ShaderRegs::ShaderMode::GS);
226
227 state.gs_unit.ConfigOutput(state.regs.gs);
228
229 ASSERT(state.regs.pipeline.vs_outmap_total_minus_1_a ==
230 state.regs.pipeline.vs_outmap_total_minus_1_b);
231
232 switch (state.regs.pipeline.gs_config.mode) {
233 case PipelineRegs::GSMode::Point:
234 backend = std::make_unique<GeometryPipeline_Point>(state.regs, state.gs_unit);
235 break;
236 case PipelineRegs::GSMode::VariablePrimitive:
237 backend = std::make_unique<GeometryPipeline_VariablePrimitive>(state.regs, state.gs);
238 break;
239 case PipelineRegs::GSMode::FixedPrimitive:
240 backend = std::make_unique<GeometryPipeline_FixedPrimitive>(state.regs, state.gs);
241 break;
242 default:
243 UNREACHABLE();
244 }
245}
246
247bool GeometryPipeline::NeedIndexInput() const {
248 if (!backend)
249 return false;
250 return backend->NeedIndexInput();
251}
252
253void GeometryPipeline::SubmitIndex(unsigned int val) {
254 backend->SubmitIndex(val);
255}
256
257void GeometryPipeline::SubmitVertex(const Shader::AttributeBuffer& input) {
258 if (!backend) {
259 // No backend means the geometry shader is disabled, so we send the vertex shader output
260 // directly to the primitive assembler.
261 vertex_handler(input);
262 } else {
263 if (backend->SubmitVertex(input)) {
264 shader_engine->Run(state.gs, state.gs_unit);
265
266 // The uniform b15 is set to true after every geometry shader invocation. This is useful
267 // for the shader to know if this is the first invocation in a batch, if the program set
268 // b15 to false first.
269 state.gs.uniforms.b[15] = true;
270 }
271 }
272}
273
274} // namespace Pica
diff --git a/src/video_core/geometry_pipeline.h b/src/video_core/geometry_pipeline.h
new file mode 100644
index 000000000..91fdd3192
--- /dev/null
+++ b/src/video_core/geometry_pipeline.h
@@ -0,0 +1,49 @@
1// Copyright 2017 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <memory>
8#include "video_core/shader/shader.h"
9
10namespace Pica {
11
12struct State;
13
14class GeometryPipelineBackend;
15
16/// A pipeline receiving from vertex shader and sending to geometry shader and primitive assembler
17class GeometryPipeline {
18public:
19 explicit GeometryPipeline(State& state);
20 ~GeometryPipeline();
21
22 /// Sets the handler for receiving vertex outputs from vertex shader
23 void SetVertexHandler(Shader::VertexHandler vertex_handler);
24
25 /**
26 * Sets up the geometry shader unit if it is in use
27 * @param shader_engine the shader engine for the geometry shader to run
28 */
29 void Setup(Shader::ShaderEngine* shader_engine);
30
31 /// Reconfigures the pipeline according to current register settings
32 void Reconfigure();
33
34 /// Checks if the pipeline needs a direct input from index buffer
35 bool NeedIndexInput() const;
36
37 /// Submits an index from index buffer. Call this only when NeedIndexInput returns true
38 void SubmitIndex(unsigned int val);
39
40 /// Submits vertex attributes output from vertex shader
41 void SubmitVertex(const Shader::AttributeBuffer& input);
42
43private:
44 Shader::VertexHandler vertex_handler;
45 Shader::ShaderEngine* shader_engine;
46 std::unique_ptr<GeometryPipelineBackend> backend;
47 State& state;
48};
49} // namespace Pica
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index b95148a6a..218e06883 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -3,9 +3,11 @@
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <cstring> 5#include <cstring>
6#include "video_core/geometry_pipeline.h"
6#include "video_core/pica.h" 7#include "video_core/pica.h"
7#include "video_core/pica_state.h" 8#include "video_core/pica_state.h"
8#include "video_core/regs_pipeline.h" 9#include "video_core/renderer_base.h"
10#include "video_core/video_core.h"
9 11
10namespace Pica { 12namespace Pica {
11 13
@@ -24,6 +26,23 @@ void Zero(T& o) {
24 memset(&o, 0, sizeof(o)); 26 memset(&o, 0, sizeof(o));
25} 27}
26 28
29State::State() : geometry_pipeline(*this) {
30 auto SubmitVertex = [this](const Shader::AttributeBuffer& vertex) {
31 using Pica::Shader::OutputVertex;
32 auto AddTriangle = [this](const OutputVertex& v0, const OutputVertex& v1,
33 const OutputVertex& v2) {
34 VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
35 };
36 primitive_assembler.SubmitVertex(
37 Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, vertex), AddTriangle);
38 };
39
40 auto SetWinding = [this]() { primitive_assembler.SetWinding(); };
41
42 g_state.gs_unit.SetVertexHandler(SubmitVertex, SetWinding);
43 g_state.geometry_pipeline.SetVertexHandler(SubmitVertex);
44}
45
27void State::Reset() { 46void State::Reset() {
28 Zero(regs); 47 Zero(regs);
29 Zero(vs); 48 Zero(vs);
diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index 2d23d34e6..c6634a0bc 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -8,6 +8,7 @@
8#include "common/bit_field.h" 8#include "common/bit_field.h"
9#include "common/common_types.h" 9#include "common/common_types.h"
10#include "common/vector_math.h" 10#include "common/vector_math.h"
11#include "video_core/geometry_pipeline.h"
11#include "video_core/primitive_assembly.h" 12#include "video_core/primitive_assembly.h"
12#include "video_core/regs.h" 13#include "video_core/regs.h"
13#include "video_core/shader/shader.h" 14#include "video_core/shader/shader.h"
@@ -16,6 +17,7 @@ namespace Pica {
16 17
17/// Struct used to describe current Pica state 18/// Struct used to describe current Pica state
18struct State { 19struct State {
20 State();
19 void Reset(); 21 void Reset();
20 22
21 /// Pica registers 23 /// Pica registers
@@ -79,7 +81,7 @@ struct State {
79 std::array<ColorDifferenceEntry, 256> color_diff_table; 81 std::array<ColorDifferenceEntry, 256> color_diff_table;
80 } proctex; 82 } proctex;
81 83
82 struct { 84 struct Lighting {
83 union LutEntry { 85 union LutEntry {
84 // Used for raw access 86 // Used for raw access
85 u32 raw; 87 u32 raw;
@@ -137,8 +139,17 @@ struct State {
137 Shader::AttributeBuffer input_vertex; 139 Shader::AttributeBuffer input_vertex;
138 // Index of the next attribute to be loaded into `input_vertex`. 140 // Index of the next attribute to be loaded into `input_vertex`.
139 u32 current_attribute = 0; 141 u32 current_attribute = 0;
142 // Indicates the immediate mode just started and the geometry pipeline needs to reconfigure
143 bool reset_geometry_pipeline = true;
140 } immediate; 144 } immediate;
141 145
146 // The geometry shader needs to be kept in the global state because some shaders rely on
147 // preserved register values across shader invocations.
148 // TODO: also bring the three vertex shader units here and implement the shader scheduler.
149 Shader::GSUnitState gs_unit;
150
151 GeometryPipeline geometry_pipeline;
152
142 // This is constructed with a dummy triangle topology 153 // This is constructed with a dummy triangle topology
143 PrimitiveAssembler<Shader::OutputVertex> primitive_assembler; 154 PrimitiveAssembler<Shader::OutputVertex> primitive_assembler;
144}; 155};
diff --git a/src/video_core/pica_types.h b/src/video_core/pica_types.h
index 5d7e10066..2eafa7e9e 100644
--- a/src/video_core/pica_types.h
+++ b/src/video_core/pica_types.h
@@ -58,11 +58,12 @@ public:
58 } 58 }
59 59
60 Float<M, E> operator*(const Float<M, E>& flt) const { 60 Float<M, E> operator*(const Float<M, E>& flt) const {
61 if ((this->value == 0.f && !std::isnan(flt.value)) || 61 float result = value * flt.ToFloat32();
62 (flt.value == 0.f && !std::isnan(this->value))) 62 // PICA gives 0 instead of NaN when multiplying by inf
63 // PICA gives 0 instead of NaN when multiplying by inf 63 if (!std::isnan(value) && !std::isnan(flt.ToFloat32()))
64 return Zero(); 64 if (std::isnan(result))
65 return Float<M, E>::FromFloat32(ToFloat32() * flt.ToFloat32()); 65 result = 0.f;
66 return Float<M, E>::FromFloat32(result);
66 } 67 }
67 68
68 Float<M, E> operator/(const Float<M, E>& flt) const { 69 Float<M, E> operator/(const Float<M, E>& flt) const {
@@ -78,12 +79,7 @@ public:
78 } 79 }
79 80
80 Float<M, E>& operator*=(const Float<M, E>& flt) { 81 Float<M, E>& operator*=(const Float<M, E>& flt) {
81 if ((this->value == 0.f && !std::isnan(flt.value)) || 82 value = operator*(flt).value;
82 (flt.value == 0.f && !std::isnan(this->value)))
83 // PICA gives 0 instead of NaN when multiplying by inf
84 *this = Zero();
85 else
86 value *= flt.ToFloat32();
87 return *this; 83 return *this;
88 } 84 }
89 85
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index acd2ac5e2..9c3dd4cab 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -17,15 +17,18 @@ template <typename VertexType>
17void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx, 17void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx,
18 TriangleHandler triangle_handler) { 18 TriangleHandler triangle_handler) {
19 switch (topology) { 19 switch (topology) {
20 // TODO: Figure out what's different with TriangleTopology::Shader.
21 case PipelineRegs::TriangleTopology::List: 20 case PipelineRegs::TriangleTopology::List:
22 case PipelineRegs::TriangleTopology::Shader: 21 case PipelineRegs::TriangleTopology::Shader:
23 if (buffer_index < 2) { 22 if (buffer_index < 2) {
24 buffer[buffer_index++] = vtx; 23 buffer[buffer_index++] = vtx;
25 } else { 24 } else {
26 buffer_index = 0; 25 buffer_index = 0;
27 26 if (topology == PipelineRegs::TriangleTopology::Shader && winding) {
28 triangle_handler(buffer[0], buffer[1], vtx); 27 triangle_handler(buffer[1], buffer[0], vtx);
28 winding = false;
29 } else {
30 triangle_handler(buffer[0], buffer[1], vtx);
31 }
29 } 32 }
30 break; 33 break;
31 34
@@ -51,9 +54,15 @@ void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx,
51} 54}
52 55
53template <typename VertexType> 56template <typename VertexType>
57void PrimitiveAssembler<VertexType>::SetWinding() {
58 winding = true;
59}
60
61template <typename VertexType>
54void PrimitiveAssembler<VertexType>::Reset() { 62void PrimitiveAssembler<VertexType>::Reset() {
55 buffer_index = 0; 63 buffer_index = 0;
56 strip_ready = false; 64 strip_ready = false;
65 winding = false;
57} 66}
58 67
59template <typename VertexType> 68template <typename VertexType>
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
index e8eccdf27..12de8e3b9 100644
--- a/src/video_core/primitive_assembly.h
+++ b/src/video_core/primitive_assembly.h
@@ -30,6 +30,12 @@ struct PrimitiveAssembler {
30 void SubmitVertex(const VertexType& vtx, TriangleHandler triangle_handler); 30 void SubmitVertex(const VertexType& vtx, TriangleHandler triangle_handler);
31 31
32 /** 32 /**
33 * Invert the vertex order of the next triangle. Called by geometry shader emitter.
34 * This only takes effect for TriangleTopology::Shader.
35 */
36 void SetWinding();
37
38 /**
33 * Resets the internal state of the PrimitiveAssembler. 39 * Resets the internal state of the PrimitiveAssembler.
34 */ 40 */
35 void Reset(); 41 void Reset();
@@ -45,6 +51,7 @@ private:
45 int buffer_index; 51 int buffer_index;
46 VertexType buffer[2]; 52 VertexType buffer[2];
47 bool strip_ready = false; 53 bool strip_ready = false;
54 bool winding = false;
48}; 55};
49 56
50} // namespace 57} // namespace
diff --git a/src/video_core/regs_framebuffer.h b/src/video_core/regs_framebuffer.h
index a50bd4111..7b565f911 100644
--- a/src/video_core/regs_framebuffer.h
+++ b/src/video_core/regs_framebuffer.h
@@ -256,10 +256,9 @@ struct FramebufferRegs {
256 return 3; 256 return 3;
257 case DepthFormat::D24S8: 257 case DepthFormat::D24S8:
258 return 4; 258 return 4;
259 default:
260 LOG_CRITICAL(HW_GPU, "Unknown depth format %u", format);
261 UNIMPLEMENTED();
262 } 259 }
260
261 ASSERT_MSG(false, "Unknown depth format %u", format);
263 } 262 }
264 263
265 // Returns the number of bits per depth component of the specified depth format 264 // Returns the number of bits per depth component of the specified depth format
@@ -270,10 +269,9 @@ struct FramebufferRegs {
270 case DepthFormat::D24: 269 case DepthFormat::D24:
271 case DepthFormat::D24S8: 270 case DepthFormat::D24S8:
272 return 24; 271 return 24;
273 default:
274 LOG_CRITICAL(HW_GPU, "Unknown depth format %u", format);
275 UNIMPLEMENTED();
276 } 272 }
273
274 ASSERT_MSG(false, "Unknown depth format %u", format);
277 } 275 }
278 276
279 INSERT_PADDING_WORDS(0x20); 277 INSERT_PADDING_WORDS(0x20);
diff --git a/src/video_core/regs_pipeline.h b/src/video_core/regs_pipeline.h
index 31c747d77..e78c3e331 100644
--- a/src/video_core/regs_pipeline.h
+++ b/src/video_core/regs_pipeline.h
@@ -147,7 +147,15 @@ struct PipelineRegs {
147 // Number of vertices to render 147 // Number of vertices to render
148 u32 num_vertices; 148 u32 num_vertices;
149 149
150 INSERT_PADDING_WORDS(0x1); 150 enum class UseGS : u32 {
151 No = 0,
152 Yes = 2,
153 };
154
155 union {
156 BitField<0, 2, UseGS> use_gs;
157 BitField<31, 1, u32> variable_primitive;
158 };
151 159
152 // The index of the first vertex to render 160 // The index of the first vertex to render
153 u32 vertex_offset; 161 u32 vertex_offset;
@@ -202,7 +210,14 @@ struct PipelineRegs {
202 /// Number of input attributes to the vertex shader minus 1 210 /// Number of input attributes to the vertex shader minus 1
203 BitField<0, 4, u32> max_input_attrib_index; 211 BitField<0, 4, u32> max_input_attrib_index;
204 212
205 INSERT_PADDING_WORDS(2); 213 INSERT_PADDING_WORDS(1);
214
215 // The shader unit 3, which can be used for both vertex and geometry shader, gets its
216 // configuration depending on this register. If this is not set, unit 3 will share some
217 // configuration with other units. It is known that program code and swizzle pattern uploaded
218 // via regs.vs will be also uploaded to unit 3 if this is not set. Although very likely, it is
219 // still unclear whether uniforms and other configuration can be also shared.
220 BitField<0, 1, u32> gs_unit_exclusive_configuration;
206 221
207 enum class GPUMode : u32 { 222 enum class GPUMode : u32 {
208 Drawing = 0, 223 Drawing = 0,
@@ -211,7 +226,29 @@ struct PipelineRegs {
211 226
212 GPUMode gpu_mode; 227 GPUMode gpu_mode;
213 228
214 INSERT_PADDING_WORDS(0x18); 229 INSERT_PADDING_WORDS(0x4);
230 BitField<0, 4, u32> vs_outmap_total_minus_1_a;
231 INSERT_PADDING_WORDS(0x6);
232 BitField<0, 4, u32> vs_outmap_total_minus_1_b;
233
234 enum class GSMode : u32 {
235 Point = 0,
236 VariablePrimitive = 1,
237 FixedPrimitive = 2,
238 };
239
240 union {
241 BitField<0, 8, GSMode> mode;
242 BitField<8, 4, u32> fixed_vertex_num_minus_1;
243 BitField<12, 4, u32> stride_minus_1;
244 BitField<16, 4, u32> start_index;
245 } gs_config;
246
247 INSERT_PADDING_WORDS(0x1);
248
249 u32 variable_vertex_main_num_minus_1;
250
251 INSERT_PADDING_WORDS(0x9);
215 252
216 enum class TriangleTopology : u32 { 253 enum class TriangleTopology : u32 {
217 List = 0, 254 List = 0,
diff --git a/src/video_core/regs_rasterizer.h b/src/video_core/regs_rasterizer.h
index 2874fd127..4fef00d76 100644
--- a/src/video_core/regs_rasterizer.h
+++ b/src/video_core/regs_rasterizer.h
@@ -5,10 +5,10 @@
5#pragma once 5#pragma once
6 6
7#include <array> 7#include <array>
8
9#include "common/bit_field.h" 8#include "common/bit_field.h"
10#include "common/common_funcs.h" 9#include "common/common_funcs.h"
11#include "common/common_types.h" 10#include "common/common_types.h"
11#include "video_core/pica_types.h"
12 12
13namespace Pica { 13namespace Pica {
14 14
@@ -31,7 +31,17 @@ struct RasterizerRegs {
31 31
32 BitField<0, 24, u32> viewport_size_y; 32 BitField<0, 24, u32> viewport_size_y;
33 33
34 INSERT_PADDING_WORDS(0x9); 34 INSERT_PADDING_WORDS(0x3);
35
36 BitField<0, 1, u32> clip_enable;
37 BitField<0, 24, u32> clip_coef[4]; // float24
38
39 Math::Vec4<float24> GetClipCoef() const {
40 return {float24::FromRaw(clip_coef[0]), float24::FromRaw(clip_coef[1]),
41 float24::FromRaw(clip_coef[2]), float24::FromRaw(clip_coef[3])};
42 }
43
44 INSERT_PADDING_WORDS(0x1);
35 45
36 BitField<0, 24, u32> viewport_depth_range; // float24 46 BitField<0, 24, u32> viewport_depth_range; // float24
37 BitField<0, 24, u32> viewport_depth_near_plane; // float24 47 BitField<0, 24, u32> viewport_depth_near_plane; // float24
diff --git a/src/video_core/regs_shader.h b/src/video_core/regs_shader.h
index ddb1ee451..c15d4d162 100644
--- a/src/video_core/regs_shader.h
+++ b/src/video_core/regs_shader.h
@@ -24,9 +24,16 @@ struct ShaderRegs {
24 24
25 INSERT_PADDING_WORDS(0x4); 25 INSERT_PADDING_WORDS(0x4);
26 26
27 enum ShaderMode {
28 GS = 0x08,
29 VS = 0xA0,
30 };
31
27 union { 32 union {
28 // Number of input attributes to shader unit - 1 33 // Number of input attributes to shader unit - 1
29 BitField<0, 4, u32> max_input_attribute_index; 34 BitField<0, 4, u32> max_input_attribute_index;
35 BitField<8, 8, u32> input_to_uniform;
36 BitField<24, 8, ShaderMode> shader_mode;
30 }; 37 };
31 38
32 // Offset to shader program entry point (in words) 39 // Offset to shader program entry point (in words)
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 1c6c15a58..7e09e4712 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -28,6 +28,9 @@ MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
28MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100)); 28MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
29 29
30RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) { 30RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
31 // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0
32 state.clip_distance[0] = true;
33
31 // Create sampler objects 34 // Create sampler objects
32 for (size_t i = 0; i < texture_samplers.size(); ++i) { 35 for (size_t i = 0; i < texture_samplers.size(); ++i) {
33 texture_samplers[i].Create(); 36 texture_samplers[i].Create();
@@ -166,6 +169,8 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
166 glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, proctex_diff_lut_buffer.handle); 169 glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, proctex_diff_lut_buffer.handle);
167 170
168 // Sync fixed function OpenGL state 171 // Sync fixed function OpenGL state
172 SyncClipEnabled();
173 SyncClipCoef();
169 SyncCullMode(); 174 SyncCullMode();
170 SyncBlendEnabled(); 175 SyncBlendEnabled();
171 SyncBlendFuncs(); 176 SyncBlendFuncs();
@@ -232,13 +237,24 @@ void RasterizerOpenGL::DrawTriangles() {
232 237
233 glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 238 glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
234 color_surface != nullptr ? color_surface->texture.handle : 0, 0); 239 color_surface != nullptr ? color_surface->texture.handle : 0, 0);
235 glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, 240 if (depth_surface != nullptr) {
236 depth_surface != nullptr ? depth_surface->texture.handle : 0, 0); 241 if (regs.framebuffer.framebuffer.depth_format ==
237 bool has_stencil = 242 Pica::FramebufferRegs::DepthFormat::D24S8) {
238 regs.framebuffer.framebuffer.depth_format == Pica::FramebufferRegs::DepthFormat::D24S8; 243 // attach both depth and stencil
239 glFramebufferTexture2D( 244 glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
240 GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 245 depth_surface->texture.handle, 0);
241 (has_stencil && depth_surface != nullptr) ? depth_surface->texture.handle : 0, 0); 246 } else {
247 // attach depth
248 glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
249 depth_surface->texture.handle, 0);
250 // clear stencil attachment
251 glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
252 }
253 } else {
254 // clear both depth and stencil attachment
255 glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
256 0);
257 }
242 258
243 // Sync the viewport 259 // Sync the viewport
244 // These registers hold half-width and half-height, so must be multiplied by 2 260 // These registers hold half-width and half-height, so must be multiplied by 2
@@ -398,6 +414,18 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
398 SyncCullMode(); 414 SyncCullMode();
399 break; 415 break;
400 416
417 // Clipping plane
418 case PICA_REG_INDEX(rasterizer.clip_enable):
419 SyncClipEnabled();
420 break;
421
422 case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[0], 0x48):
423 case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[1], 0x49):
424 case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[2], 0x4a):
425 case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[3], 0x4b):
426 SyncClipCoef();
427 break;
428
401 // Depth modifiers 429 // Depth modifiers
402 case PICA_REG_INDEX(rasterizer.viewport_depth_range): 430 case PICA_REG_INDEX(rasterizer.viewport_depth_range):
403 SyncDepthScale(); 431 SyncDepthScale();
@@ -1277,6 +1305,20 @@ void RasterizerOpenGL::SetShader() {
1277 } 1305 }
1278} 1306}
1279 1307
1308void RasterizerOpenGL::SyncClipEnabled() {
1309 state.clip_distance[1] = Pica::g_state.regs.rasterizer.clip_enable != 0;
1310}
1311
1312void RasterizerOpenGL::SyncClipCoef() {
1313 const auto raw_clip_coef = Pica::g_state.regs.rasterizer.GetClipCoef();
1314 const GLvec4 new_clip_coef = {raw_clip_coef.x.ToFloat32(), raw_clip_coef.y.ToFloat32(),
1315 raw_clip_coef.z.ToFloat32(), raw_clip_coef.w.ToFloat32()};
1316 if (new_clip_coef != uniform_block_data.data.clip_coef) {
1317 uniform_block_data.data.clip_coef = new_clip_coef;
1318 uniform_block_data.dirty = true;
1319 }
1320}
1321
1280void RasterizerOpenGL::SyncCullMode() { 1322void RasterizerOpenGL::SyncCullMode() {
1281 const auto& regs = Pica::g_state.regs; 1323 const auto& regs = Pica::g_state.regs;
1282 1324
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 78e218efe..46c62961c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -151,14 +151,21 @@ private:
151 LightSrc light_src[8]; 151 LightSrc light_src[8];
152 alignas(16) GLvec4 const_color[6]; // A vec4 color for each of the six tev stages 152 alignas(16) GLvec4 const_color[6]; // A vec4 color for each of the six tev stages
153 alignas(16) GLvec4 tev_combiner_buffer_color; 153 alignas(16) GLvec4 tev_combiner_buffer_color;
154 alignas(16) GLvec4 clip_coef;
154 }; 155 };
155 156
156 static_assert( 157 static_assert(
157 sizeof(UniformData) == 0x460, 158 sizeof(UniformData) == 0x470,
158 "The size of the UniformData structure has changed, update the structure in the shader"); 159 "The size of the UniformData structure has changed, update the structure in the shader");
159 static_assert(sizeof(UniformData) < 16384, 160 static_assert(sizeof(UniformData) < 16384,
160 "UniformData structure must be less than 16kb as per the OpenGL spec"); 161 "UniformData structure must be less than 16kb as per the OpenGL spec");
161 162
163 /// Syncs the clip enabled status to match the PICA register
164 void SyncClipEnabled();
165
166 /// Syncs the clip coefficients to match the PICA register
167 void SyncClipCoef();
168
162 /// Sets the OpenGL shader in accordance with the current PICA register state 169 /// Sets the OpenGL shader in accordance with the current PICA register state
163 void SetShader(); 170 void SetShader();
164 171
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index bb192affd..9fe183944 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -8,6 +8,7 @@
8#include "common/assert.h" 8#include "common/assert.h"
9#include "common/bit_field.h" 9#include "common/bit_field.h"
10#include "common/logging/log.h" 10#include "common/logging/log.h"
11#include "core/core.h"
11#include "video_core/regs_framebuffer.h" 12#include "video_core/regs_framebuffer.h"
12#include "video_core/regs_lighting.h" 13#include "video_core/regs_lighting.h"
13#include "video_core/regs_rasterizer.h" 14#include "video_core/regs_rasterizer.h"
@@ -24,6 +25,42 @@ using TevStageConfig = TexturingRegs::TevStageConfig;
24 25
25namespace GLShader { 26namespace GLShader {
26 27
28static const std::string UniformBlockDef = R"(
29#define NUM_TEV_STAGES 6
30#define NUM_LIGHTS 8
31
32struct LightSrc {
33 vec3 specular_0;
34 vec3 specular_1;
35 vec3 diffuse;
36 vec3 ambient;
37 vec3 position;
38 vec3 spot_direction;
39 float dist_atten_bias;
40 float dist_atten_scale;
41};
42
43layout (std140) uniform shader_data {
44 vec2 framebuffer_scale;
45 int alphatest_ref;
46 float depth_scale;
47 float depth_offset;
48 int scissor_x1;
49 int scissor_y1;
50 int scissor_x2;
51 int scissor_y2;
52 vec3 fog_color;
53 vec2 proctex_noise_f;
54 vec2 proctex_noise_a;
55 vec2 proctex_noise_p;
56 vec3 lighting_global_ambient;
57 LightSrc light_src[NUM_LIGHTS];
58 vec4 const_color[NUM_TEV_STAGES];
59 vec4 tev_combiner_buffer_color;
60 vec4 clip_coef;
61};
62)";
63
27PicaShaderConfig PicaShaderConfig::BuildFromRegs(const Pica::Regs& regs) { 64PicaShaderConfig PicaShaderConfig::BuildFromRegs(const Pica::Regs& regs) {
28 PicaShaderConfig res; 65 PicaShaderConfig res;
29 66
@@ -525,11 +562,12 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
525 "float geo_factor = 1.0;\n"; 562 "float geo_factor = 1.0;\n";
526 563
527 // Compute fragment normals and tangents 564 // Compute fragment normals and tangents
528 const std::string pertubation = 565 auto Perturbation = [&]() {
529 "2.0 * (" + SampleTexture(config, lighting.bump_selector) + ").rgb - 1.0"; 566 return "2.0 * (" + SampleTexture(config, lighting.bump_selector) + ").rgb - 1.0";
567 };
530 if (lighting.bump_mode == LightingRegs::LightingBumpMode::NormalMap) { 568 if (lighting.bump_mode == LightingRegs::LightingBumpMode::NormalMap) {
531 // Bump mapping is enabled using a normal map 569 // Bump mapping is enabled using a normal map
532 out += "vec3 surface_normal = " + pertubation + ";\n"; 570 out += "vec3 surface_normal = " + Perturbation() + ";\n";
533 571
534 // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher 572 // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher
535 // precision result 573 // precision result
@@ -543,7 +581,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
543 out += "vec3 surface_tangent = vec3(1.0, 0.0, 0.0);\n"; 581 out += "vec3 surface_tangent = vec3(1.0, 0.0, 0.0);\n";
544 } else if (lighting.bump_mode == LightingRegs::LightingBumpMode::TangentMap) { 582 } else if (lighting.bump_mode == LightingRegs::LightingBumpMode::TangentMap) {
545 // Bump mapping is enabled using a tangent map 583 // Bump mapping is enabled using a tangent map
546 out += "vec3 surface_tangent = " + pertubation + ";\n"; 584 out += "vec3 surface_tangent = " + Perturbation() + ";\n";
547 // Mathematically, recomputing Z-component of the tangent vector won't affect the relevant 585 // Mathematically, recomputing Z-component of the tangent vector won't affect the relevant
548 // computation below, which is also confirmed on 3DS. So we don't bother recomputing here 586 // computation below, which is also confirmed on 3DS. So we don't bother recomputing here
549 // even if 'renorm' is enabled. 587 // even if 'renorm' is enabled.
@@ -593,8 +631,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
593 // Note: even if the normal vector is modified by normal map, which is not the 631 // Note: even if the normal vector is modified by normal map, which is not the
594 // normal of the tangent plane anymore, the half angle vector is still projected 632 // normal of the tangent plane anymore, the half angle vector is still projected
595 // using the modified normal vector. 633 // using the modified normal vector.
596 std::string half_angle_proj = "normalize(half_vector) - normal / dot(normal, " 634 std::string half_angle_proj =
597 "normal) * dot(normal, normalize(half_vector))"; 635 "normalize(half_vector) - normal * dot(normal, normalize(half_vector))";
598 // Note: the half angle vector projection is confirmed not normalized before the dot 636 // Note: the half angle vector projection is confirmed not normalized before the dot
599 // product. The result is in fact not cos(phi) as the name suggested. 637 // product. The result is in fact not cos(phi) as the name suggested.
600 index = "dot(" + half_angle_proj + ", tangent)"; 638 index = "dot(" + half_angle_proj + ", tangent)";
@@ -749,7 +787,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
749 } 787 }
750 788
751 // Fresnel 789 // Fresnel
752 if (lighting.lut_fr.enable && 790 // Note: only the last entry in the light slots applies the Fresnel factor
791 if (light_index == lighting.src_num - 1 && lighting.lut_fr.enable &&
753 LightingRegs::IsLightingSamplerSupported(lighting.config, 792 LightingRegs::IsLightingSamplerSupported(lighting.config,
754 LightingRegs::LightingSampler::Fresnel)) { 793 LightingRegs::LightingSampler::Fresnel)) {
755 // Lookup fresnel LUT value 794 // Lookup fresnel LUT value
@@ -758,17 +797,17 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
758 lighting.lut_fr.type, lighting.lut_fr.abs_input); 797 lighting.lut_fr.type, lighting.lut_fr.abs_input);
759 value = "(" + std::to_string(lighting.lut_fr.scale) + " * " + value + ")"; 798 value = "(" + std::to_string(lighting.lut_fr.scale) + " * " + value + ")";
760 799
761 // Enabled for difffuse lighting alpha component 800 // Enabled for diffuse lighting alpha component
762 if (lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::PrimaryAlpha || 801 if (lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
763 lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) { 802 lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
764 out += "diffuse_sum.a *= " + value + ";\n"; 803 out += "diffuse_sum.a = " + value + ";\n";
765 } 804 }
766 805
767 // Enabled for the specular lighting alpha component 806 // Enabled for the specular lighting alpha component
768 if (lighting.fresnel_selector == 807 if (lighting.fresnel_selector ==
769 LightingRegs::LightingFresnelSelector::SecondaryAlpha || 808 LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
770 lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) { 809 lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
771 out += "specular_sum.a *= " + value + ";\n"; 810 out += "specular_sum.a = " + value + ";\n";
772 } 811 }
773 } 812 }
774 813
@@ -1007,8 +1046,6 @@ std::string GenerateFragmentShader(const PicaShaderConfig& config) {
1007 1046
1008 std::string out = R"( 1047 std::string out = R"(
1009#version 330 core 1048#version 330 core
1010#define NUM_TEV_STAGES 6
1011#define NUM_LIGHTS 8
1012 1049
1013in vec4 primary_color; 1050in vec4 primary_color;
1014in vec2 texcoord[3]; 1051in vec2 texcoord[3];
@@ -1020,36 +1057,6 @@ in vec4 gl_FragCoord;
1020 1057
1021out vec4 color; 1058out vec4 color;
1022 1059
1023struct LightSrc {
1024 vec3 specular_0;
1025 vec3 specular_1;
1026 vec3 diffuse;
1027 vec3 ambient;
1028 vec3 position;
1029 vec3 spot_direction;
1030 float dist_atten_bias;
1031 float dist_atten_scale;
1032};
1033
1034layout (std140) uniform shader_data {
1035 vec2 framebuffer_scale;
1036 int alphatest_ref;
1037 float depth_scale;
1038 float depth_offset;
1039 int scissor_x1;
1040 int scissor_y1;
1041 int scissor_x2;
1042 int scissor_y2;
1043 vec3 fog_color;
1044 vec2 proctex_noise_f;
1045 vec2 proctex_noise_a;
1046 vec2 proctex_noise_p;
1047 vec3 lighting_global_ambient;
1048 LightSrc light_src[NUM_LIGHTS];
1049 vec4 const_color[NUM_TEV_STAGES];
1050 vec4 tev_combiner_buffer_color;
1051};
1052
1053uniform sampler2D tex[3]; 1060uniform sampler2D tex[3];
1054uniform samplerBuffer lighting_lut; 1061uniform samplerBuffer lighting_lut;
1055uniform samplerBuffer fog_lut; 1062uniform samplerBuffer fog_lut;
@@ -1058,7 +1065,11 @@ uniform samplerBuffer proctex_color_map;
1058uniform samplerBuffer proctex_alpha_map; 1065uniform samplerBuffer proctex_alpha_map;
1059uniform samplerBuffer proctex_lut; 1066uniform samplerBuffer proctex_lut;
1060uniform samplerBuffer proctex_diff_lut; 1067uniform samplerBuffer proctex_diff_lut;
1068)";
1069
1070 out += UniformBlockDef;
1061 1071
1072 out += R"(
1062// Rotate the vector v by the quaternion q 1073// Rotate the vector v by the quaternion q
1063vec3 quaternion_rotate(vec4 q, vec3 v) { 1074vec3 quaternion_rotate(vec4 q, vec3 v) {
1064 return v + 2.0 * cross(q.xyz, cross(q.xyz, v) + q.w * v); 1075 return v + 2.0 * cross(q.xyz, cross(q.xyz, v) + q.w * v);
@@ -1111,7 +1122,10 @@ vec4 secondary_fragment_color = vec4(0.0);
1111 "gl_FragCoord.y < scissor_y2)) discard;\n"; 1122 "gl_FragCoord.y < scissor_y2)) discard;\n";
1112 } 1123 }
1113 1124
1114 out += "float z_over_w = 1.0 - gl_FragCoord.z * 2.0;\n"; 1125 // After perspective divide, OpenGL transform z_over_w from [-1, 1] to [near, far]. Here we use
1126 // default near = 0 and far = 1, and undo the transformation to get the original z_over_w, then
1127 // do our own transformation according to PICA specification.
1128 out += "float z_over_w = 2.0 * gl_FragCoord.z - 1.0;\n";
1115 out += "float depth = z_over_w * depth_scale + depth_offset;\n"; 1129 out += "float depth = z_over_w * depth_scale + depth_offset;\n";
1116 if (state.depthmap_enable == RasterizerRegs::DepthBuffering::WBuffering) { 1130 if (state.depthmap_enable == RasterizerRegs::DepthBuffering::WBuffering) {
1117 out += "depth /= gl_FragCoord.w;\n"; 1131 out += "depth /= gl_FragCoord.w;\n";
@@ -1151,6 +1165,11 @@ vec4 secondary_fragment_color = vec4(0.0);
1151 1165
1152 // Blend the fog 1166 // Blend the fog
1153 out += "last_tex_env_out.rgb = mix(fog_color.rgb, last_tex_env_out.rgb, fog_factor);\n"; 1167 out += "last_tex_env_out.rgb = mix(fog_color.rgb, last_tex_env_out.rgb, fog_factor);\n";
1168 } else if (state.fog_mode == TexturingRegs::FogMode::Gas) {
1169 Core::Telemetry().AddField(Telemetry::FieldType::Session, "VideoCore_Pica_UseGasMode",
1170 true);
1171 LOG_CRITICAL(Render_OpenGL, "Unimplemented gas mode");
1172 UNIMPLEMENTED();
1154 } 1173 }
1155 1174
1156 out += "gl_FragDepth = depth;\n"; 1175 out += "gl_FragDepth = depth;\n";
@@ -1186,6 +1205,12 @@ out float texcoord0_w;
1186out vec4 normquat; 1205out vec4 normquat;
1187out vec3 view; 1206out vec3 view;
1188 1207
1208)";
1209
1210 out += UniformBlockDef;
1211
1212 out += R"(
1213
1189void main() { 1214void main() {
1190 primary_color = vert_color; 1215 primary_color = vert_color;
1191 texcoord[0] = vert_texcoord0; 1216 texcoord[0] = vert_texcoord0;
@@ -1194,7 +1219,9 @@ void main() {
1194 texcoord0_w = vert_texcoord0_w; 1219 texcoord0_w = vert_texcoord0_w;
1195 normquat = vert_normquat; 1220 normquat = vert_normquat;
1196 view = vert_view; 1221 view = vert_view;
1197 gl_Position = vec4(vert_position.x, vert_position.y, -vert_position.z, vert_position.w); 1222 gl_Position = vert_position;
1223 gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0
1224 gl_ClipDistance[1] = dot(clip_coef, vert_position);
1198} 1225}
1199)"; 1226)";
1200 1227
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index bc9d34b84..5770ae08f 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -68,6 +68,8 @@ OpenGLState::OpenGLState() {
68 draw.vertex_buffer = 0; 68 draw.vertex_buffer = 0;
69 draw.uniform_buffer = 0; 69 draw.uniform_buffer = 0;
70 draw.shader_program = 0; 70 draw.shader_program = 0;
71
72 clip_distance = {};
71} 73}
72 74
73void OpenGLState::Apply() const { 75void OpenGLState::Apply() const {
@@ -261,6 +263,17 @@ void OpenGLState::Apply() const {
261 glUseProgram(draw.shader_program); 263 glUseProgram(draw.shader_program);
262 } 264 }
263 265
266 // Clip distance
267 for (size_t i = 0; i < clip_distance.size(); ++i) {
268 if (clip_distance[i] != cur_state.clip_distance[i]) {
269 if (clip_distance[i]) {
270 glEnable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
271 } else {
272 glDisable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
273 }
274 }
275 }
276
264 cur_state = *this; 277 cur_state = *this;
265} 278}
266 279
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 745a74479..437fe34c4 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -4,6 +4,7 @@
4 4
5#pragma once 5#pragma once
6 6
7#include <array>
7#include <glad/glad.h> 8#include <glad/glad.h>
8 9
9namespace TextureUnits { 10namespace TextureUnits {
@@ -123,6 +124,8 @@ public:
123 GLuint shader_program; // GL_CURRENT_PROGRAM 124 GLuint shader_program; // GL_CURRENT_PROGRAM
124 } draw; 125 } draw;
125 126
127 std::array<bool, 2> clip_distance; // GL_CLIP_DISTANCE
128
126 OpenGLState(); 129 OpenGLState();
127 130
128 /// Get the currently active OpenGL state 131 /// Get the currently active OpenGL state
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 67ed19ba8..2857d2829 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -21,7 +21,8 @@ namespace Pica {
21 21
22namespace Shader { 22namespace Shader {
23 23
24OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, AttributeBuffer& input) { 24OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
25 const AttributeBuffer& input) {
25 // Setup output data 26 // Setup output data
26 union { 27 union {
27 OutputVertex ret{}; 28 OutputVertex ret{};
@@ -51,7 +52,8 @@ OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, Attri
51 // The hardware takes the absolute and saturates vertex colors like this, *before* doing 52 // The hardware takes the absolute and saturates vertex colors like this, *before* doing
52 // interpolation 53 // interpolation
53 for (unsigned i = 0; i < 4; ++i) { 54 for (unsigned i = 0; i < 4; ++i) {
54 ret.color[i] = float24::FromFloat32(std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); 55 float c = std::fabs(ret.color[i].ToFloat32());
56 ret.color[i] = float24::FromFloat32(c < 1.0f ? c : 1.0f);
55 } 57 }
56 58
57 LOG_TRACE(HW_GPU, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), " 59 LOG_TRACE(HW_GPU, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), "
@@ -82,6 +84,44 @@ void UnitState::WriteOutput(const ShaderRegs& config, AttributeBuffer& output) {
82 } 84 }
83} 85}
84 86
87UnitState::UnitState(GSEmitter* emitter) : emitter_ptr(emitter) {}
88
89GSEmitter::GSEmitter() {
90 handlers = new Handlers;
91}
92
93GSEmitter::~GSEmitter() {
94 delete handlers;
95}
96
97void GSEmitter::Emit(Math::Vec4<float24> (&vertex)[16]) {
98 ASSERT(vertex_id < 3);
99 std::copy(std::begin(vertex), std::end(vertex), buffer[vertex_id].begin());
100 if (prim_emit) {
101 if (winding)
102 handlers->winding_setter();
103 for (size_t i = 0; i < buffer.size(); ++i) {
104 AttributeBuffer output;
105 unsigned int output_i = 0;
106 for (unsigned int reg : Common::BitSet<u32>(output_mask)) {
107 output.attr[output_i++] = buffer[i][reg];
108 }
109 handlers->vertex_handler(output);
110 }
111 }
112}
113
114GSUnitState::GSUnitState() : UnitState(&emitter) {}
115
116void GSUnitState::SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter) {
117 emitter.handlers->vertex_handler = std::move(vertex_handler);
118 emitter.handlers->winding_setter = std::move(winding_setter);
119}
120
121void GSUnitState::ConfigOutput(const ShaderRegs& config) {
122 emitter.output_mask = config.output_mask;
123}
124
85MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); 125MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
86 126
87#ifdef ARCHITECTURE_x86_64 127#ifdef ARCHITECTURE_x86_64
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index e156f6aef..a3789da01 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -6,6 +6,7 @@
6 6
7#include <array> 7#include <array>
8#include <cstddef> 8#include <cstddef>
9#include <functional>
9#include <type_traits> 10#include <type_traits>
10#include <nihstro/shader_bytecode.h> 11#include <nihstro/shader_bytecode.h>
11#include "common/assert.h" 12#include "common/assert.h"
@@ -31,6 +32,12 @@ struct AttributeBuffer {
31 alignas(16) Math::Vec4<float24> attr[16]; 32 alignas(16) Math::Vec4<float24> attr[16];
32}; 33};
33 34
35/// Handler type for receiving vertex outputs from vertex shader or geometry shader
36using VertexHandler = std::function<void(const AttributeBuffer&)>;
37
38/// Handler type for signaling to invert the vertex order of the next triangle
39using WindingSetter = std::function<void()>;
40
34struct OutputVertex { 41struct OutputVertex {
35 Math::Vec4<float24> pos; 42 Math::Vec4<float24> pos;
36 Math::Vec4<float24> quat; 43 Math::Vec4<float24> quat;
@@ -43,7 +50,8 @@ struct OutputVertex {
43 INSERT_PADDING_WORDS(1); 50 INSERT_PADDING_WORDS(1);
44 Math::Vec2<float24> tc2; 51 Math::Vec2<float24> tc2;
45 52
46 static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs, AttributeBuffer& output); 53 static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs,
54 const AttributeBuffer& output);
47}; 55};
48#define ASSERT_POS(var, pos) \ 56#define ASSERT_POS(var, pos) \
49 static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong " \ 57 static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong " \
@@ -61,12 +69,36 @@ static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
61static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has invalid size"); 69static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has invalid size");
62 70
63/** 71/**
72 * This structure contains state information for primitive emitting in geometry shader.
73 */
74struct GSEmitter {
75 std::array<std::array<Math::Vec4<float24>, 16>, 3> buffer;
76 u8 vertex_id;
77 bool prim_emit;
78 bool winding;
79 u32 output_mask;
80
81 // Function objects are hidden behind a raw pointer to make the structure standard layout type,
82 // for JIT to use offsetof to access other members.
83 struct Handlers {
84 VertexHandler vertex_handler;
85 WindingSetter winding_setter;
86 } * handlers;
87
88 GSEmitter();
89 ~GSEmitter();
90 void Emit(Math::Vec4<float24> (&vertex)[16]);
91};
92static_assert(std::is_standard_layout<GSEmitter>::value, "GSEmitter is not standard layout type");
93
94/**
64 * This structure contains the state information that needs to be unique for a shader unit. The 3DS 95 * This structure contains the state information that needs to be unique for a shader unit. The 3DS
65 * has four shader units that process shaders in parallel. At the present, Citra only implements a 96 * has four shader units that process shaders in parallel. At the present, Citra only implements a
66 * single shader unit that processes all shaders serially. Putting the state information in a struct 97 * single shader unit that processes all shaders serially. Putting the state information in a struct
67 * here will make it easier for us to parallelize the shader processing later. 98 * here will make it easier for us to parallelize the shader processing later.
68 */ 99 */
69struct UnitState { 100struct UnitState {
101 explicit UnitState(GSEmitter* emitter = nullptr);
70 struct Registers { 102 struct Registers {
71 // The registers are accessed by the shader JIT using SSE instructions, and are therefore 103 // The registers are accessed by the shader JIT using SSE instructions, and are therefore
72 // required to be 16-byte aligned. 104 // required to be 16-byte aligned.
@@ -82,6 +114,8 @@ struct UnitState {
82 // TODO: How many bits do these actually have? 114 // TODO: How many bits do these actually have?
83 s32 address_registers[3]; 115 s32 address_registers[3];
84 116
117 GSEmitter* emitter_ptr;
118
85 static size_t InputOffset(const SourceRegister& reg) { 119 static size_t InputOffset(const SourceRegister& reg) {
86 switch (reg.GetRegisterType()) { 120 switch (reg.GetRegisterType()) {
87 case RegisterType::Input: 121 case RegisterType::Input:
@@ -125,6 +159,19 @@ struct UnitState {
125 void WriteOutput(const ShaderRegs& config, AttributeBuffer& output); 159 void WriteOutput(const ShaderRegs& config, AttributeBuffer& output);
126}; 160};
127 161
162/**
163 * This is an extended shader unit state that represents the special unit that can run both vertex
164 * shader and geometry shader. It contains an additional primitive emitter and utilities for
165 * geometry shader.
166 */
167struct GSUnitState : public UnitState {
168 GSUnitState();
169 void SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter);
170 void ConfigOutput(const ShaderRegs& config);
171
172 GSEmitter emitter;
173};
174
128struct ShaderSetup { 175struct ShaderSetup {
129 struct { 176 struct {
130 // The float uniforms are accessed by the shader JIT using SSE instructions, and are 177 // The float uniforms are accessed by the shader JIT using SSE instructions, and are
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index aa1cec81f..9d4da4904 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -631,11 +631,27 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
631 state.address_registers[2] = loop_param.y; 631 state.address_registers[2] = loop_param.y;
632 632
633 Record<DebugDataRecord::LOOP_INT_IN>(debug_data, iteration, loop_param); 633 Record<DebugDataRecord::LOOP_INT_IN>(debug_data, iteration, loop_param);
634 call(program_counter + 1, instr.flow_control.dest_offset - program_counter + 1, 634 call(program_counter + 1, instr.flow_control.dest_offset - program_counter,
635 instr.flow_control.dest_offset + 1, loop_param.x, loop_param.z); 635 instr.flow_control.dest_offset + 1, loop_param.x, loop_param.z);
636 break; 636 break;
637 } 637 }
638 638
639 case OpCode::Id::EMIT: {
640 GSEmitter* emitter = state.emitter_ptr;
641 ASSERT_MSG(emitter, "Execute EMIT on VS");
642 emitter->Emit(state.registers.output);
643 break;
644 }
645
646 case OpCode::Id::SETEMIT: {
647 GSEmitter* emitter = state.emitter_ptr;
648 ASSERT_MSG(emitter, "Execute SETEMIT on VS");
649 emitter->vertex_id = instr.setemit.vertex_id;
650 emitter->prim_emit = instr.setemit.prim_emit != 0;
651 emitter->winding = instr.setemit.winding != 0;
652 break;
653 }
654
639 default: 655 default:
640 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", 656 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
641 (int)instr.opcode.Value().EffectiveOpCode(), 657 (int)instr.opcode.Value().EffectiveOpCode(),
diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp
index 42a57aab1..1b31623bd 100644
--- a/src/video_core/shader/shader_jit_x64_compiler.cpp
+++ b/src/video_core/shader/shader_jit_x64_compiler.cpp
@@ -75,8 +75,8 @@ const JitFunction instr_table[64] = {
75 &JitShader::Compile_IF, // ifu 75 &JitShader::Compile_IF, // ifu
76 &JitShader::Compile_IF, // ifc 76 &JitShader::Compile_IF, // ifc
77 &JitShader::Compile_LOOP, // loop 77 &JitShader::Compile_LOOP, // loop
78 nullptr, // emit 78 &JitShader::Compile_EMIT, // emit
79 nullptr, // sete 79 &JitShader::Compile_SETE, // sete
80 &JitShader::Compile_JMP, // jmpc 80 &JitShader::Compile_JMP, // jmpc
81 &JitShader::Compile_JMP, // jmpu 81 &JitShader::Compile_JMP, // jmpu
82 &JitShader::Compile_CMP, // cmp 82 &JitShader::Compile_CMP, // cmp
@@ -772,6 +772,51 @@ void JitShader::Compile_JMP(Instruction instr) {
772 } 772 }
773} 773}
774 774
775static void Emit(GSEmitter* emitter, Math::Vec4<float24> (*output)[16]) {
776 emitter->Emit(*output);
777}
778
779void JitShader::Compile_EMIT(Instruction instr) {
780 Label have_emitter, end;
781 mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]);
782 test(rax, rax);
783 jnz(have_emitter);
784
785 ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
786 mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute EMIT on VS"));
787 CallFarFunction(*this, LogCritical);
788 ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
789 jmp(end);
790
791 L(have_emitter);
792 ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
793 mov(ABI_PARAM1, rax);
794 mov(ABI_PARAM2, STATE);
795 add(ABI_PARAM2, static_cast<Xbyak::uint32>(offsetof(UnitState, registers.output)));
796 CallFarFunction(*this, Emit);
797 ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
798 L(end);
799}
800
801void JitShader::Compile_SETE(Instruction instr) {
802 Label have_emitter, end;
803 mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]);
804 test(rax, rax);
805 jnz(have_emitter);
806
807 ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
808 mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute SETEMIT on VS"));
809 CallFarFunction(*this, LogCritical);
810 ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
811 jmp(end);
812
813 L(have_emitter);
814 mov(byte[rax + offsetof(GSEmitter, vertex_id)], instr.setemit.vertex_id);
815 mov(byte[rax + offsetof(GSEmitter, prim_emit)], instr.setemit.prim_emit);
816 mov(byte[rax + offsetof(GSEmitter, winding)], instr.setemit.winding);
817 L(end);
818}
819
775void JitShader::Compile_Block(unsigned end) { 820void JitShader::Compile_Block(unsigned end) {
776 while (program_counter < end) { 821 while (program_counter < end) {
777 Compile_NextInstr(); 822 Compile_NextInstr();
diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h
index 31af0ca48..4aee56b1d 100644
--- a/src/video_core/shader/shader_jit_x64_compiler.h
+++ b/src/video_core/shader/shader_jit_x64_compiler.h
@@ -66,6 +66,8 @@ public:
66 void Compile_JMP(Instruction instr); 66 void Compile_JMP(Instruction instr);
67 void Compile_CMP(Instruction instr); 67 void Compile_CMP(Instruction instr);
68 void Compile_MAD(Instruction instr); 68 void Compile_MAD(Instruction instr);
69 void Compile_EMIT(Instruction instr);
70 void Compile_SETE(Instruction instr);
69 71
70private: 72private:
71 void Compile_Block(unsigned end); 73 void Compile_Block(unsigned end);
diff --git a/src/video_core/swrasterizer/clipper.cpp b/src/video_core/swrasterizer/clipper.cpp
index 6fb923756..c1ed48398 100644
--- a/src/video_core/swrasterizer/clipper.cpp
+++ b/src/video_core/swrasterizer/clipper.cpp
@@ -31,7 +31,7 @@ public:
31 : coeffs(coeffs), bias(bias) {} 31 : coeffs(coeffs), bias(bias) {}
32 32
33 bool IsInside(const Vertex& vertex) const { 33 bool IsInside(const Vertex& vertex) const {
34 return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0); 34 return Math::Dot(vertex.pos + bias, coeffs) >= float24::FromFloat32(0);
35 } 35 }
36 36
37 bool IsOutSide(const Vertex& vertex) const { 37 bool IsOutSide(const Vertex& vertex) const {
@@ -95,6 +95,17 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
95 static const size_t MAX_VERTICES = 9; 95 static const size_t MAX_VERTICES = 9;
96 static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2}; 96 static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2};
97 static_vector<Vertex, MAX_VERTICES> buffer_b; 97 static_vector<Vertex, MAX_VERTICES> buffer_b;
98
99 auto FlipQuaternionIfOpposite = [](auto& a, const auto& b) {
100 if (Math::Dot(a, b) < float24::Zero())
101 a = a * float24::FromFloat32(-1.0f);
102 };
103
104 // Flip the quaternions if they are opposite to prevent interpolating them over the wrong
105 // direction.
106 FlipQuaternionIfOpposite(buffer_a[1].quat, buffer_a[0].quat);
107 FlipQuaternionIfOpposite(buffer_a[2].quat, buffer_a[0].quat);
108
98 auto* output_list = &buffer_a; 109 auto* output_list = &buffer_a;
99 auto* input_list = &buffer_b; 110 auto* input_list = &buffer_b;
100 111
@@ -105,23 +116,18 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
105 static const float24 f0 = float24::FromFloat32(0.0); 116 static const float24 f0 = float24::FromFloat32(0.0);
106 static const float24 f1 = float24::FromFloat32(1.0); 117 static const float24 f1 = float24::FromFloat32(1.0);
107 static const std::array<ClippingEdge, 7> clipping_edges = {{ 118 static const std::array<ClippingEdge, 7> clipping_edges = {{
108 {Math::MakeVec(f1, f0, f0, -f1)}, // x = +w 119 {Math::MakeVec(-f1, f0, f0, f1)}, // x = +w
109 {Math::MakeVec(-f1, f0, f0, -f1)}, // x = -w 120 {Math::MakeVec(f1, f0, f0, f1)}, // x = -w
110 {Math::MakeVec(f0, f1, f0, -f1)}, // y = +w 121 {Math::MakeVec(f0, -f1, f0, f1)}, // y = +w
111 {Math::MakeVec(f0, -f1, f0, -f1)}, // y = -w 122 {Math::MakeVec(f0, f1, f0, f1)}, // y = -w
112 {Math::MakeVec(f0, f0, f1, f0)}, // z = 0 123 {Math::MakeVec(f0, f0, -f1, f0)}, // z = 0
113 {Math::MakeVec(f0, f0, -f1, -f1)}, // z = -w 124 {Math::MakeVec(f0, f0, f1, f1)}, // z = -w
114 {Math::MakeVec(f0, f0, f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON)}, // w = EPSILON 125 {Math::MakeVec(f0, f0, f0, f1), Math::Vec4<float24>(f0, f0, f0, EPSILON)}, // w = EPSILON
115 }}; 126 }};
116 127
117 // TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii)
118 // drop the whole primitive instead of clipping the primitive properly. We should test if
119 // this happens on the 3DS, too.
120
121 // Simple implementation of the Sutherland-Hodgman clipping algorithm. 128 // Simple implementation of the Sutherland-Hodgman clipping algorithm.
122 // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here) 129 // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
123 for (auto edge : clipping_edges) { 130 auto Clip = [&](const ClippingEdge& edge) {
124
125 std::swap(input_list, output_list); 131 std::swap(input_list, output_list);
126 output_list->clear(); 132 output_list->clear();
127 133
@@ -140,12 +146,24 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
140 } 146 }
141 reference_vertex = &vertex; 147 reference_vertex = &vertex;
142 } 148 }
149 };
150
151 for (auto edge : clipping_edges) {
152 Clip(edge);
143 153
144 // Need to have at least a full triangle to continue... 154 // Need to have at least a full triangle to continue...
145 if (output_list->size() < 3) 155 if (output_list->size() < 3)
146 return; 156 return;
147 } 157 }
148 158
159 if (g_state.regs.rasterizer.clip_enable) {
160 ClippingEdge custom_edge{g_state.regs.rasterizer.GetClipCoef()};
161 Clip(custom_edge);
162
163 if (output_list->size() < 3)
164 return;
165 }
166
149 InitScreenCoordinates((*output_list)[0]); 167 InitScreenCoordinates((*output_list)[0]);
150 InitScreenCoordinates((*output_list)[1]); 168 InitScreenCoordinates((*output_list)[1]);
151 169
diff --git a/src/video_core/swrasterizer/framebuffer.cpp b/src/video_core/swrasterizer/framebuffer.cpp
index 7de3aac75..f34eab6cf 100644
--- a/src/video_core/swrasterizer/framebuffer.cpp
+++ b/src/video_core/swrasterizer/framebuffer.cpp
@@ -352,6 +352,8 @@ u8 LogicOp(u8 src, u8 dest, FramebufferRegs::LogicOp op) {
352 case FramebufferRegs::LogicOp::OrInverted: 352 case FramebufferRegs::LogicOp::OrInverted:
353 return ~src | dest; 353 return ~src | dest;
354 } 354 }
355
356 UNREACHABLE();
355}; 357};
356 358
357} // namespace Rasterizer 359} // namespace Rasterizer
diff --git a/src/video_core/swrasterizer/lighting.cpp b/src/video_core/swrasterizer/lighting.cpp
new file mode 100644
index 000000000..5fa748611
--- /dev/null
+++ b/src/video_core/swrasterizer/lighting.cpp
@@ -0,0 +1,308 @@
1// Copyright 2017 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "common/math_util.h"
6#include "video_core/swrasterizer/lighting.h"
7
8namespace Pica {
9
10static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut_index, u8 index,
11 float delta) {
12 ASSERT_MSG(lut_index < lighting.luts.size(), "Out of range lut");
13 ASSERT_MSG(index < lighting.luts[lut_index].size(), "Out of range index");
14
15 const auto& lut = lighting.luts[lut_index][index];
16
17 float lut_value = lut.ToFloat();
18 float lut_diff = lut.DiffToFloat();
19
20 return lut_value + lut_diff * delta;
21}
22
23std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
24 const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
25 const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view,
26 const Math::Vec4<u8> (&texture_color)[4]) {
27
28 Math::Vec3<float> surface_normal;
29 Math::Vec3<float> surface_tangent;
30
31 if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) {
32 Math::Vec3<float> perturbation =
33 texture_color[lighting.config0.bump_selector].xyz().Cast<float>() / 127.5f -
34 Math::MakeVec(1.0f, 1.0f, 1.0f);
35 if (lighting.config0.bump_mode == LightingRegs::LightingBumpMode::NormalMap) {
36 if (!lighting.config0.disable_bump_renorm) {
37 const float z_square = 1 - perturbation.xy().Length2();
38 perturbation.z = std::sqrt(std::max(z_square, 0.0f));
39 }
40 surface_normal = perturbation;
41 surface_tangent = Math::MakeVec(1.0f, 0.0f, 0.0f);
42 } else if (lighting.config0.bump_mode == LightingRegs::LightingBumpMode::TangentMap) {
43 surface_normal = Math::MakeVec(0.0f, 0.0f, 1.0f);
44 surface_tangent = perturbation;
45 } else {
46 LOG_ERROR(HW_GPU, "Unknown bump mode %u", lighting.config0.bump_mode.Value());
47 }
48 } else {
49 surface_normal = Math::MakeVec(0.0f, 0.0f, 1.0f);
50 surface_tangent = Math::MakeVec(1.0f, 0.0f, 0.0f);
51 }
52
53 // Use the normalized the quaternion when performing the rotation
54 auto normal = Math::QuaternionRotate(normquat, surface_normal);
55 auto tangent = Math::QuaternionRotate(normquat, surface_tangent);
56
57 Math::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f};
58 Math::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f};
59
60 for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
61 unsigned num = lighting.light_enable.GetNum(light_index);
62 const auto& light_config = lighting.light[num];
63
64 Math::Vec3<float> refl_value = {};
65 Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(),
66 float16::FromRaw(light_config.y).ToFloat32(),
67 float16::FromRaw(light_config.z).ToFloat32()};
68 Math::Vec3<float> light_vector;
69
70 if (light_config.config.directional)
71 light_vector = position;
72 else
73 light_vector = position + view;
74
75 light_vector.Normalize();
76
77 Math::Vec3<float> norm_view = view.Normalized();
78 Math::Vec3<float> half_vector = norm_view + light_vector;
79
80 float dist_atten = 1.0f;
81 if (!lighting.IsDistAttenDisabled(num)) {
82 auto distance = (-view - position).Length();
83 float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
84 float bias = Pica::float20::FromRaw(light_config.dist_atten_bias).ToFloat32();
85 size_t lut =
86 static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
87
88 float sample_loc = MathUtil::Clamp(scale * distance + bias, 0.0f, 1.0f);
89
90 u8 lutindex =
91 static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.0f), 0.0f, 255.0f));
92 float delta = sample_loc * 256 - lutindex;
93 dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta);
94 }
95
96 auto GetLutValue = [&](LightingRegs::LightingLutInput input, bool abs,
97 LightingRegs::LightingScale scale_enum,
98 LightingRegs::LightingSampler sampler) {
99 float result = 0.0f;
100
101 switch (input) {
102 case LightingRegs::LightingLutInput::NH:
103 result = Math::Dot(normal, half_vector.Normalized());
104 break;
105
106 case LightingRegs::LightingLutInput::VH:
107 result = Math::Dot(norm_view, half_vector.Normalized());
108 break;
109
110 case LightingRegs::LightingLutInput::NV:
111 result = Math::Dot(normal, norm_view);
112 break;
113
114 case LightingRegs::LightingLutInput::LN:
115 result = Math::Dot(light_vector, normal);
116 break;
117
118 case LightingRegs::LightingLutInput::SP: {
119 Math::Vec3<s32> spot_dir{light_config.spot_x.Value(), light_config.spot_y.Value(),
120 light_config.spot_z.Value()};
121 result = Math::Dot(light_vector, spot_dir.Cast<float>() / 2047.0f);
122 break;
123 }
124 case LightingRegs::LightingLutInput::CP:
125 if (lighting.config0.config == LightingRegs::LightingConfig::Config7) {
126 const Math::Vec3<float> norm_half_vector = half_vector.Normalized();
127 const Math::Vec3<float> half_vector_proj =
128 norm_half_vector - normal * Math::Dot(normal, norm_half_vector);
129 result = Math::Dot(half_vector_proj, tangent);
130 } else {
131 result = 0.0f;
132 }
133 break;
134 default:
135 LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
136 UNIMPLEMENTED();
137 result = 0.0f;
138 }
139
140 u8 index;
141 float delta;
142
143 if (abs) {
144 if (light_config.config.two_sided_diffuse)
145 result = std::abs(result);
146 else
147 result = std::max(result, 0.0f);
148
149 float flr = std::floor(result * 256.0f);
150 index = static_cast<u8>(MathUtil::Clamp(flr, 0.0f, 255.0f));
151 delta = result * 256 - index;
152 } else {
153 float flr = std::floor(result * 128.0f);
154 s8 signed_index = static_cast<s8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
155 delta = result * 128.0f - signed_index;
156 index = static_cast<u8>(signed_index);
157 }
158
159 float scale = lighting.lut_scale.GetScale(scale_enum);
160 return scale *
161 LookupLightingLut(lighting_state, static_cast<size_t>(sampler), index, delta);
162 };
163
164 // If enabled, compute spot light attenuation value
165 float spot_atten = 1.0f;
166 if (!lighting.IsSpotAttenDisabled(num) &&
167 LightingRegs::IsLightingSamplerSupported(
168 lighting.config0.config, LightingRegs::LightingSampler::SpotlightAttenuation)) {
169 auto lut = LightingRegs::SpotlightAttenuationSampler(num);
170 spot_atten = GetLutValue(lighting.lut_input.sp, lighting.abs_lut_input.disable_sp == 0,
171 lighting.lut_scale.sp, lut);
172 }
173
174 // Specular 0 component
175 float d0_lut_value = 1.0f;
176 if (lighting.config1.disable_lut_d0 == 0 &&
177 LightingRegs::IsLightingSamplerSupported(
178 lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) {
179 d0_lut_value =
180 GetLutValue(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0,
181 lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0);
182 }
183
184 Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
185
186 // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
187 if (lighting.config1.disable_lut_rr == 0 &&
188 LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
189 LightingRegs::LightingSampler::ReflectRed)) {
190 refl_value.x =
191 GetLutValue(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0,
192 lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed);
193 } else {
194 refl_value.x = 1.0f;
195 }
196
197 // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
198 if (lighting.config1.disable_lut_rg == 0 &&
199 LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
200 LightingRegs::LightingSampler::ReflectGreen)) {
201 refl_value.y =
202 GetLutValue(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0,
203 lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen);
204 } else {
205 refl_value.y = refl_value.x;
206 }
207
208 // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
209 if (lighting.config1.disable_lut_rb == 0 &&
210 LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
211 LightingRegs::LightingSampler::ReflectBlue)) {
212 refl_value.z =
213 GetLutValue(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0,
214 lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue);
215 } else {
216 refl_value.z = refl_value.x;
217 }
218
219 // Specular 1 component
220 float d1_lut_value = 1.0f;
221 if (lighting.config1.disable_lut_d1 == 0 &&
222 LightingRegs::IsLightingSamplerSupported(
223 lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) {
224 d1_lut_value =
225 GetLutValue(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0,
226 lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1);
227 }
228
229 Math::Vec3<float> specular_1 =
230 d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
231
232 // Fresnel
233 // Note: only the last entry in the light slots applies the Fresnel factor
234 if (light_index == lighting.max_light_index && lighting.config1.disable_lut_fr == 0 &&
235 LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
236 LightingRegs::LightingSampler::Fresnel)) {
237
238 float lut_value =
239 GetLutValue(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0,
240 lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel);
241
242 // Enabled for diffuse lighting alpha component
243 if (lighting.config0.fresnel_selector ==
244 LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
245 lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
246 diffuse_sum.a() = lut_value;
247 }
248
249 // Enabled for the specular lighting alpha component
250 if (lighting.config0.fresnel_selector ==
251 LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
252 lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
253 specular_sum.a() = lut_value;
254 }
255 }
256
257 auto dot_product = Math::Dot(light_vector, normal);
258
259 // Calculate clamp highlights before applying the two-sided diffuse configuration to the dot
260 // product.
261 float clamp_highlights = 1.0f;
262 if (lighting.config0.clamp_highlights) {
263 if (dot_product <= 0.0f)
264 clamp_highlights = 0.0f;
265 else
266 clamp_highlights = 1.0f;
267 }
268
269 if (light_config.config.two_sided_diffuse)
270 dot_product = std::abs(dot_product);
271 else
272 dot_product = std::max(dot_product, 0.0f);
273
274 if (light_config.config.geometric_factor_0 || light_config.config.geometric_factor_1) {
275 float geo_factor = half_vector.Length2();
276 geo_factor = geo_factor == 0.0f ? 0.0f : std::min(dot_product / geo_factor, 1.0f);
277 if (light_config.config.geometric_factor_0) {
278 specular_0 *= geo_factor;
279 }
280 if (light_config.config.geometric_factor_1) {
281 specular_1 *= geo_factor;
282 }
283 }
284
285 auto diffuse =
286 light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
287 diffuse_sum += Math::MakeVec(diffuse * dist_atten * spot_atten, 0.0f);
288
289 specular_sum += Math::MakeVec(
290 (specular_0 + specular_1) * clamp_highlights * dist_atten * spot_atten, 0.0f);
291 }
292
293 diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
294
295 auto diffuse = Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255,
296 MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255,
297 MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255,
298 MathUtil::Clamp(diffuse_sum.w, 0.0f, 1.0f) * 255)
299 .Cast<u8>();
300 auto specular = Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255,
301 MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255,
302 MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255,
303 MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255)
304 .Cast<u8>();
305 return std::make_tuple(diffuse, specular);
306}
307
308} // namespace Pica
diff --git a/src/video_core/swrasterizer/lighting.h b/src/video_core/swrasterizer/lighting.h
new file mode 100644
index 000000000..d807a3d94
--- /dev/null
+++ b/src/video_core/swrasterizer/lighting.h
@@ -0,0 +1,19 @@
1// Copyright 2017 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <tuple>
8#include "common/quaternion.h"
9#include "common/vector_math.h"
10#include "video_core/pica_state.h"
11
12namespace Pica {
13
/**
 * Computes the fragment lighting colors for a single fragment in the software rasterizer.
 *
 * @param lighting       Lighting registers
 * @param lighting_state Lighting LUT state
 * @param normquat       Quaternion used to rotate the surface normal and tangent
 * @param view           View vector of the fragment
 * @param texture_color  The four sampled texture colors (used for bump mapping)
 * @return Tuple of the (diffuse, specular) colors; the rasterizer uses them as the primary and
 *         secondary fragment colors
 */
14std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
15 const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
16 const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view,
17 const Math::Vec4<u8> (&texture_color)[4]);
18
19} // namespace Pica
diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 512e81c08..862135614 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -13,6 +13,7 @@
13#include "common/logging/log.h" 13#include "common/logging/log.h"
14#include "common/math_util.h" 14#include "common/math_util.h"
15#include "common/microprofile.h" 15#include "common/microprofile.h"
16#include "common/quaternion.h"
16#include "common/vector_math.h" 17#include "common/vector_math.h"
17#include "core/hw/gpu.h" 18#include "core/hw/gpu.h"
18#include "core/memory.h" 19#include "core/memory.h"
@@ -24,6 +25,7 @@
24#include "video_core/regs_texturing.h" 25#include "video_core/regs_texturing.h"
25#include "video_core/shader/shader.h" 26#include "video_core/shader/shader.h"
26#include "video_core/swrasterizer/framebuffer.h" 27#include "video_core/swrasterizer/framebuffer.h"
28#include "video_core/swrasterizer/lighting.h"
27#include "video_core/swrasterizer/proctex.h" 29#include "video_core/swrasterizer/proctex.h"
28#include "video_core/swrasterizer/rasterizer.h" 30#include "video_core/swrasterizer/rasterizer.h"
29#include "video_core/swrasterizer/texturing.h" 31#include "video_core/swrasterizer/texturing.h"
@@ -419,6 +421,26 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
419 regs.texturing.tev_combiner_buffer_color.a, 421 regs.texturing.tev_combiner_buffer_color.a,
420 }; 422 };
421 423
424 Math::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
425 Math::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
426
427 if (!g_state.regs.lighting.disable) {
428 Math::Quaternion<float> normquat = Math::Quaternion<float>{
429 {GetInterpolatedAttribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
430 GetInterpolatedAttribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
431 GetInterpolatedAttribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()},
432 GetInterpolatedAttribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
433 }.Normalized();
434
435 Math::Vec3<float> view{
436 GetInterpolatedAttribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
437 GetInterpolatedAttribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
438 GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
439 };
440 std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors(
441 g_state.regs.lighting, g_state.lighting, normquat, view, texture_color);
442 }
443
422 for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size(); 444 for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size();
423 ++tev_stage_index) { 445 ++tev_stage_index) {
424 const auto& tev_stage = tev_stages[tev_stage_index]; 446 const auto& tev_stage = tev_stages[tev_stage_index];
@@ -427,14 +449,13 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
427 auto GetSource = [&](Source source) -> Math::Vec4<u8> { 449 auto GetSource = [&](Source source) -> Math::Vec4<u8> {
428 switch (source) { 450 switch (source) {
429 case Source::PrimaryColor: 451 case Source::PrimaryColor:
452 return primary_color;
430 453
431 // HACK: Until we implement fragment lighting, use primary_color
432 case Source::PrimaryFragmentColor: 454 case Source::PrimaryFragmentColor:
433 return primary_color; 455 return primary_fragment_color;
434 456
435 // HACK: Until we implement fragment lighting, use zero
436 case Source::SecondaryFragmentColor: 457 case Source::SecondaryFragmentColor:
437 return {0, 0, 0, 0}; 458 return secondary_fragment_color;
438 459
439 case Source::Texture0: 460 case Source::Texture0:
440 return texture_color[0]; 461 return texture_color[0];
diff --git a/src/video_core/swrasterizer/rasterizer.h b/src/video_core/swrasterizer/rasterizer.h
index 2f0877581..66cd6cfd4 100644
--- a/src/video_core/swrasterizer/rasterizer.h
+++ b/src/video_core/swrasterizer/rasterizer.h
@@ -19,10 +19,9 @@ struct Vertex : Shader::OutputVertex {
19 19
20 // Linear interpolation 20 // Linear interpolation
21 // factor: 0=this, 1=vtx 21 // factor: 0=this, 1=vtx
22 // Note: This function cannot be called after perspective divide
22 void Lerp(float24 factor, const Vertex& vtx) { 23 void Lerp(float24 factor, const Vertex& vtx) {
23 pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); 24 pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
24
25 // TODO: Should perform perspective correct interpolation here...
26 quat = quat * factor + vtx.quat * (float24::FromFloat32(1) - factor); 25 quat = quat * factor + vtx.quat * (float24::FromFloat32(1) - factor);
27 color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); 26 color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
28 tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); 27 tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
@@ -30,12 +29,11 @@ struct Vertex : Shader::OutputVertex {
30 tc0_w = tc0_w * factor + vtx.tc0_w * (float24::FromFloat32(1) - factor); 29 tc0_w = tc0_w * factor + vtx.tc0_w * (float24::FromFloat32(1) - factor);
31 view = view * factor + vtx.view * (float24::FromFloat32(1) - factor); 30 view = view * factor + vtx.view * (float24::FromFloat32(1) - factor);
32 tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); 31 tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
33
34 screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
35 } 32 }
36 33
37 // Linear interpolation 34 // Linear interpolation
38 // factor: 0=v0, 1=v1 35 // factor: 0=v0, 1=v1
36 // Note: This function cannot be called after perspective divide
39 static Vertex Lerp(float24 factor, const Vertex& v0, const Vertex& v1) { 37 static Vertex Lerp(float24 factor, const Vertex& v0, const Vertex& v1) {
40 Vertex ret = v0; 38 Vertex ret = v0;
41 ret.Lerp(factor, v1); 39 ret.Lerp(factor, v1);
diff --git a/src/video_core/swrasterizer/texturing.cpp b/src/video_core/swrasterizer/texturing.cpp
index 4f02b93f2..79b1ce841 100644
--- a/src/video_core/swrasterizer/texturing.cpp
+++ b/src/video_core/swrasterizer/texturing.cpp
@@ -89,6 +89,8 @@ Math::Vec3<u8> GetColorModifier(TevStageConfig::ColorModifier factor,
89 case ColorModifier::OneMinusSourceBlue: 89 case ColorModifier::OneMinusSourceBlue:
90 return (Math::Vec3<u8>(255, 255, 255) - values.bbb()).Cast<u8>(); 90 return (Math::Vec3<u8>(255, 255, 255) - values.bbb()).Cast<u8>();
91 } 91 }
92
93 UNREACHABLE();
92}; 94};
93 95
94u8 GetAlphaModifier(TevStageConfig::AlphaModifier factor, const Math::Vec4<u8>& values) { 96u8 GetAlphaModifier(TevStageConfig::AlphaModifier factor, const Math::Vec4<u8>& values) {
@@ -119,6 +121,8 @@ u8 GetAlphaModifier(TevStageConfig::AlphaModifier factor, const Math::Vec4<u8>&
119 case AlphaModifier::OneMinusSourceBlue: 121 case AlphaModifier::OneMinusSourceBlue:
120 return 255 - values.b(); 122 return 255 - values.b();
121 } 123 }
124
125 UNREACHABLE();
122}; 126};
123 127
124Math::Vec3<u8> ColorCombine(TevStageConfig::Operation op, const Math::Vec3<u8> input[3]) { 128Math::Vec3<u8> ColorCombine(TevStageConfig::Operation op, const Math::Vec3<u8> input[3]) {
diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index 7ce83a055..d8567f314 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -8,17 +8,11 @@
8 8
9namespace VideoCore { 9namespace VideoCore {
10 10
11/** 11// 8x8 Z-Order coordinate from 2D coordinates
12 * Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
13 * arranged in a Z-order curve. More details on the bit manipulation at:
14 * https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
15 */
16static inline u32 MortonInterleave(u32 x, u32 y) { 12static inline u32 MortonInterleave(u32 x, u32 y) {
17 u32 i = (x & 7) | ((y & 7) << 8); // ---- -210 13 static const u32 xlut[] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15};
18 i = (i ^ (i << 2)) & 0x1313; // ---2 --10 14 static const u32 ylut[] = {0x00, 0x02, 0x08, 0x0a, 0x20, 0x22, 0x28, 0x2a};
19 i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0 15 return xlut[x % 8] + ylut[y % 8];
20 i = (i | (i >> 7)) & 0x3F;
21 return i;
22} 16}
23 17
24/** 18/**