summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar bunnei2015-02-18 17:19:38 -0500
committerGravatar bunnei2015-02-18 17:19:38 -0500
commit4a48b017ca7fe8fe68dfc84d70864ef6aea6a266 (patch)
treedcd7914a3a2147790d384ce0992f70d40bce8704 /src
parentMerge pull request #570 from purpasmart96/config_mem (diff)
parentPica/Rasterizer: Replace exit() calls with UNIMPLEMENTED(). (diff)
downloadyuzu-4a48b017ca7fe8fe68dfc84d70864ef6aea6a266.tar.gz
yuzu-4a48b017ca7fe8fe68dfc84d70864ef6aea6a266.tar.xz
yuzu-4a48b017ca7fe8fe68dfc84d70864ef6aea6a266.zip
Merge pull request #562 from neobrain/pica_progress3
More PICA200 Emulation Fixes
Diffstat (limited to 'src')
-rw-r--r--src/core/hle/service/gsp_gpu.cpp34
-rw-r--r--src/core/hle/service/gsp_gpu.h4
-rw-r--r--src/core/hw/gpu.cpp41
-rw-r--r--src/core/hw/gpu.h32
-rw-r--r--src/video_core/clipper.cpp84
-rw-r--r--src/video_core/command_processor.cpp32
-rw-r--r--src/video_core/pica.h24
-rw-r--r--src/video_core/rasterizer.cpp252
-rw-r--r--src/video_core/vertex_shader.cpp50
9 files changed, 341 insertions, 212 deletions
diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp
index 31e61391f..c23cfa3c8 100644
--- a/src/core/hle/service/gsp_gpu.cpp
+++ b/src/core/hle/service/gsp_gpu.cpp
@@ -368,28 +368,28 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {
368 case CommandId::SET_MEMORY_FILL: 368 case CommandId::SET_MEMORY_FILL:
369 { 369 {
370 auto& params = command.memory_fill; 370 auto& params = command.memory_fill;
371 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_start)), 371 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_start)),
372 Memory::VirtualToPhysicalAddress(params.start1) >> 3); 372 Memory::VirtualToPhysicalAddress(params.start1) >> 3);
373 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)), 373 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)),
374 Memory::VirtualToPhysicalAddress(params.end1) >> 3); 374 Memory::VirtualToPhysicalAddress(params.end1) >> 3);
375 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].size)), params.end1 - params.start1); 375 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value_32bit)), params.value1);
376 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value)), params.value1); 376 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].control)), params.control1);
377 377
378 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)), 378 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)),
379 Memory::VirtualToPhysicalAddress(params.start2) >> 3); 379 Memory::VirtualToPhysicalAddress(params.start2) >> 3);
380 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)), 380 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)),
381 Memory::VirtualToPhysicalAddress(params.end2) >> 3); 381 Memory::VirtualToPhysicalAddress(params.end2) >> 3);
382 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].size)), params.end2 - params.start2); 382 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value_32bit)), params.value2);
383 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value)), params.value2); 383 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].control)), params.control2);
384 break; 384 break;
385 } 385 }
386 386
387 case CommandId::SET_DISPLAY_TRANSFER: 387 case CommandId::SET_DISPLAY_TRANSFER:
388 { 388 {
389 auto& params = command.image_copy; 389 auto& params = command.image_copy;
390 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)), 390 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
391 Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3); 391 Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
392 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), 392 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
393 Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3); 393 Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
394 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size); 394 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size);
395 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size); 395 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size);
@@ -402,9 +402,9 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {
402 case CommandId::SET_TEXTURE_COPY: 402 case CommandId::SET_TEXTURE_COPY:
403 { 403 {
404 auto& params = command.image_copy; 404 auto& params = command.image_copy;
405 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)), 405 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
406 Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3); 406 Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
407 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), 407 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
408 Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3); 408 Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
409 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size); 409 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size);
410 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size); 410 WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size);
diff --git a/src/core/hle/service/gsp_gpu.h b/src/core/hle/service/gsp_gpu.h
index 65abb194a..a435d418a 100644
--- a/src/core/hle/service/gsp_gpu.h
+++ b/src/core/hle/service/gsp_gpu.h
@@ -109,9 +109,13 @@ struct Command {
109 u32 start1; 109 u32 start1;
110 u32 value1; 110 u32 value1;
111 u32 end1; 111 u32 end1;
112
112 u32 start2; 113 u32 start2;
113 u32 value2; 114 u32 value2;
114 u32 end2; 115 u32 end2;
116
117 u16 control1;
118 u16 control2;
115 } memory_fill; 119 } memory_fill;
116 120
117 struct { 121 struct {
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index aad0e5d0d..bd7d92cd1 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -67,23 +67,38 @@ inline void Write(u32 addr, const T data) {
67 switch (index) { 67 switch (index) {
68 68
69 // Memory fills are triggered once the fill value is written. 69 // Memory fills are triggered once the fill value is written.
70 // NOTE: This is not verified. 70 case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].trigger, 0x00004 + 0x3):
71 case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].value, 0x00004 + 0x3): 71 case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].trigger, 0x00008 + 0x3):
72 case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].value, 0x00008 + 0x3):
73 { 72 {
74 const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].value)); 73 const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].trigger));
75 const auto& config = g_regs.memory_fill_config[is_second_filler]; 74 auto& config = g_regs.memory_fill_config[is_second_filler];
76 75
77 // TODO: Not sure if this check should be done at GSP level instead 76 if (config.address_start && config.trigger) {
78 if (config.address_start) { 77 u8* start = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress()));
79 // TODO: Not sure if this algorithm is correct, particularly because it doesn't use the size member at all 78 u8* end = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress()));
80 u32* start = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress())); 79
81 u32* end = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress())); 80 if (config.fill_24bit) {
82 for (u32* ptr = start; ptr < end; ++ptr) 81 // fill with 24-bit values
83 *ptr = bswap32(config.value); // TODO: This is just a workaround to missing framebuffer format emulation 82 for (u8* ptr = start; ptr < end; ptr += 3) {
83 ptr[0] = config.value_24bit_b;
84 ptr[1] = config.value_24bit_g;
85 ptr[2] = config.value_24bit_r;
86 }
87 } else if (config.fill_32bit) {
88 // fill with 32-bit values
89 for (u32* ptr = (u32*)start; ptr < (u32*)end; ++ptr)
90 *ptr = config.value_32bit;
91 } else {
92 // fill with 16-bit values
93 for (u16* ptr = (u16*)start; ptr < (u16*)end; ++ptr)
94 *ptr = config.value_16bit;
95 }
84 96
85 LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress()); 97 LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress());
86 98
99 config.trigger = 0;
100 config.finished = 1;
101
87 if (!is_second_filler) { 102 if (!is_second_filler) {
88 GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0); 103 GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0);
89 } else { 104 } else {
diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
index 9fd694f65..df9aa0d71 100644
--- a/src/core/hw/gpu.h
+++ b/src/core/hw/gpu.h
@@ -84,9 +84,35 @@ struct Regs {
84 84
85 struct { 85 struct {
86 u32 address_start; 86 u32 address_start;
87 u32 address_end; // ? 87 u32 address_end;
88 u32 size; 88
89 u32 value; // ? 89 union {
90 u32 value_32bit;
91
92 BitField<0, 16, u32> value_16bit;
93
94 // TODO: Verify component order
95 BitField< 0, 8, u32> value_24bit_r;
96 BitField< 8, 8, u32> value_24bit_g;
97 BitField<16, 8, u32> value_24bit_b;
98 };
99
100 union {
101 u32 control;
102
103 // Setting this field to 1 triggers the memory fill.
104 // This field also acts as a status flag, and gets reset to 0 upon completion.
105 BitField<0, 1, u32> trigger;
106
107 // Set to 1 upon completion.
108 BitField<0, 1, u32> finished;
109
110 // 0: fill with 16- or 32-bit wide values; 1: fill with 24-bit wide values
111 BitField<8, 1, u32> fill_24bit;
112
113 // 0: fill with 16-bit wide values; 1: fill with 32-bit wide values
114 BitField<9, 1, u32> fill_32bit;
115 };
90 116
91 inline u32 GetStartAddress() const { 117 inline u32 GetStartAddress() const {
92 return DecodeAddressRegister(address_start); 118 return DecodeAddressRegister(address_start);
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 1744066ba..ba3876a76 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -15,30 +15,18 @@ namespace Clipper {
15 15
16struct ClippingEdge { 16struct ClippingEdge {
17public: 17public:
18 enum Type { 18 ClippingEdge(Math::Vec4<float24> coeffs,
19 POS_X = 0, 19 Math::Vec4<float24> bias = Math::Vec4<float24>(float24::FromFloat32(0),
20 NEG_X = 1, 20 float24::FromFloat32(0),
21 POS_Y = 2, 21 float24::FromFloat32(0),
22 NEG_Y = 3, 22 float24::FromFloat32(0)))
23 POS_Z = 4, 23 : coeffs(coeffs),
24 NEG_Z = 5, 24 bias(bias)
25 }; 25 {
26 26 }
27 ClippingEdge(Type type, float24 position) : type(type), pos(position) {}
28 27
29 bool IsInside(const OutputVertex& vertex) const { 28 bool IsInside(const OutputVertex& vertex) const {
30 switch (type) { 29 return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0);
31 case POS_X: return vertex.pos.x <= pos * vertex.pos.w;
32 case NEG_X: return vertex.pos.x >= pos * vertex.pos.w;
33 case POS_Y: return vertex.pos.y <= pos * vertex.pos.w;
34 case NEG_Y: return vertex.pos.y >= pos * vertex.pos.w;
35
36 // TODO: Check z compares ... should be 0..1 instead?
37 case POS_Z: return vertex.pos.z <= pos * vertex.pos.w;
38
39 default:
40 case NEG_Z: return vertex.pos.z >= pos * vertex.pos.w;
41 }
42 } 30 }
43 31
44 bool IsOutSide(const OutputVertex& vertex) const { 32 bool IsOutSide(const OutputVertex& vertex) const {
@@ -46,31 +34,17 @@ public:
46 } 34 }
47 35
48 OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const { 36 OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const {
49 auto dotpr = [this](const OutputVertex& vtx) { 37 float24 dp = Math::Dot(v0.pos + bias, coeffs);
50 switch (type) { 38 float24 dp_prev = Math::Dot(v1.pos + bias, coeffs);
51 case POS_X: return vtx.pos.x - vtx.pos.w;
52 case NEG_X: return -vtx.pos.x - vtx.pos.w;
53 case POS_Y: return vtx.pos.y - vtx.pos.w;
54 case NEG_Y: return -vtx.pos.y - vtx.pos.w;
55
56 // TODO: Verify z clipping
57 case POS_Z: return vtx.pos.z - vtx.pos.w;
58
59 default:
60 case NEG_Z: return -vtx.pos.w;
61 }
62 };
63
64 float24 dp = dotpr(v0);
65 float24 dp_prev = dotpr(v1);
66 float24 factor = dp_prev / (dp_prev - dp); 39 float24 factor = dp_prev / (dp_prev - dp);
67 40
68 return OutputVertex::Lerp(factor, v0, v1); 41 return OutputVertex::Lerp(factor, v0, v1);
69 } 42 }
70 43
71private: 44private:
72 Type type;
73 float24 pos; 45 float24 pos;
46 Math::Vec4<float24> coeffs;
47 Math::Vec4<float24> bias;
74}; 48};
75 49
76static void InitScreenCoordinates(OutputVertex& vtx) 50static void InitScreenCoordinates(OutputVertex& vtx)
@@ -98,10 +72,9 @@ static void InitScreenCoordinates(OutputVertex& vtx)
98 vtx.tc2 *= inv_w; 72 vtx.tc2 *= inv_w;
99 vtx.pos.w = inv_w; 73 vtx.pos.w = inv_w;
100 74
101 // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
102 vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; 75 vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
103 vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; 76 vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
104 vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale; 77 vtx.screenpos[2] = viewport.offset_z + vtx.pos.z * inv_w * viewport.zscale;
105} 78}
106 79
107void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { 80void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
@@ -117,14 +90,29 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
117 auto* output_list = &buffer_a; 90 auto* output_list = &buffer_a;
118 auto* input_list = &buffer_b; 91 auto* input_list = &buffer_b;
119 92
93 // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
94 // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
95 // epsilon possible within float24 accuracy.
96 static const float24 EPSILON = float24::FromFloat32(0.00001);
97 static const float24 f0 = float24::FromFloat32(0.0);
98 static const float24 f1 = float24::FromFloat32(1.0);
99 static const std::array<ClippingEdge, 7> clipping_edges = {{
100 { Math::MakeVec( f1, f0, f0, -f1) }, // x = +w
101 { Math::MakeVec(-f1, f0, f0, -f1) }, // x = -w
102 { Math::MakeVec( f0, f1, f0, -f1) }, // y = +w
103 { Math::MakeVec( f0, -f1, f0, -f1) }, // y = -w
104 { Math::MakeVec( f0, f0, f1, f0) }, // z = 0
105 { Math::MakeVec( f0, f0, -f1, -f1) }, // z = -w
106 { Math::MakeVec( f0, f0, f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON) }, // w = EPSILON
107 }};
108
109 // TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii)
110 // drop the whole primitive instead of clipping the primitive properly. We should test if
111 // this happens on the 3DS, too.
112
120 // Simple implementation of the Sutherland-Hodgman clipping algorithm. 113 // Simple implementation of the Sutherland-Hodgman clipping algorithm.
121 // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here) 114 // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
122 for (auto edge : { ClippingEdge(ClippingEdge::POS_X, float24::FromFloat32(+1.0)), 115 for (auto edge : clipping_edges) {
123 ClippingEdge(ClippingEdge::NEG_X, float24::FromFloat32(-1.0)),
124 ClippingEdge(ClippingEdge::POS_Y, float24::FromFloat32(+1.0)),
125 ClippingEdge(ClippingEdge::NEG_Y, float24::FromFloat32(-1.0)),
126 ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
127 ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
128 116
129 std::swap(input_list, output_list); 117 std::swap(input_list, output_list);
130 output_list->clear(); 118 output_list->clear();
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 0d9f4ba66..586ad62b6 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -2,6 +2,8 @@
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <boost/range/algorithm/fill.hpp>
6
5#include "clipper.h" 7#include "clipper.h"
6#include "command_processor.h" 8#include "command_processor.h"
7#include "math.h" 9#include "math.h"
@@ -23,10 +25,6 @@ static int float_regs_counter = 0;
23 25
24static u32 uniform_write_buffer[4]; 26static u32 uniform_write_buffer[4];
25 27
26// Used for VSLoadProgramData and VSLoadSwizzleData
27static u32 vs_binary_write_offset = 0;
28static u32 vs_swizzle_write_offset = 0;
29
30static inline void WritePicaReg(u32 id, u32 value, u32 mask) { 28static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
31 29
32 if (id >= registers.NumIds()) 30 if (id >= registers.NumIds())
@@ -65,10 +63,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
65 63
66 // Information about internal vertex attributes 64 // Information about internal vertex attributes
67 u32 vertex_attribute_sources[16]; 65 u32 vertex_attribute_sources[16];
68 std::fill(vertex_attribute_sources, &vertex_attribute_sources[16], 0xdeadbeef); 66 boost::fill(vertex_attribute_sources, 0xdeadbeef);
69 u32 vertex_attribute_strides[16]; 67 u32 vertex_attribute_strides[16];
70 u32 vertex_attribute_formats[16]; 68 u32 vertex_attribute_formats[16];
71 u32 vertex_attribute_elements[16]; 69
70 // HACK: Initialize vertex_attribute_elements to zero to prevent infinite loops below.
71 // This is one of the hacks required to deal with uninitalized vertex attributes.
72 // TODO: Fix this properly.
73 u32 vertex_attribute_elements[16] = {};
72 u32 vertex_attribute_element_size[16]; 74 u32 vertex_attribute_element_size[16];
73 75
74 // Setup attribute data from loaders 76 // Setup attribute data from loaders
@@ -252,11 +254,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
252 break; 254 break;
253 } 255 }
254 256
255 // Seems to be used to reset the write pointer for VSLoadProgramData
256 case PICA_REG_INDEX(vs_program.begin_load):
257 vs_binary_write_offset = 0;
258 break;
259
260 // Load shader program code 257 // Load shader program code
261 case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc): 258 case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
262 case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd): 259 case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):
@@ -267,16 +264,11 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
267 case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2): 264 case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2):
268 case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3): 265 case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3):
269 { 266 {
270 VertexShader::SubmitShaderMemoryChange(vs_binary_write_offset, value); 267 VertexShader::SubmitShaderMemoryChange(registers.vs_program.offset, value);
271 vs_binary_write_offset++; 268 registers.vs_program.offset++;
272 break; 269 break;
273 } 270 }
274 271
275 // Seems to be used to reset the write pointer for VSLoadSwizzleData
276 case PICA_REG_INDEX(vs_swizzle_patterns.begin_load):
277 vs_swizzle_write_offset = 0;
278 break;
279
280 // Load swizzle pattern data 272 // Load swizzle pattern data
281 case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6): 273 case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6):
282 case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7): 274 case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7):
@@ -287,8 +279,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
287 case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc): 279 case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc):
288 case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd): 280 case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd):
289 { 281 {
290 VertexShader::SubmitSwizzleDataChange(vs_swizzle_write_offset, value); 282 VertexShader::SubmitSwizzleDataChange(registers.vs_swizzle_patterns.offset, value);
291 vs_swizzle_write_offset++; 283 registers.vs_swizzle_patterns.offset++;
292 break; 284 break;
293 } 285 }
294 286
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 9c1a12dc8..e4a5ef78e 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -118,8 +118,9 @@ struct Regs {
118 118
119 struct TextureConfig { 119 struct TextureConfig {
120 enum WrapMode : u32 { 120 enum WrapMode : u32 {
121 ClampToEdge = 0, 121 ClampToEdge = 0,
122 Repeat = 2, 122 Repeat = 2,
123 MirroredRepeat = 3,
123 }; 124 };
124 125
125 INSERT_PADDING_WORDS(0x1); 126 INSERT_PADDING_WORDS(0x1);
@@ -131,7 +132,7 @@ struct Regs {
131 132
132 union { 133 union {
133 BitField< 8, 2, WrapMode> wrap_s; 134 BitField< 8, 2, WrapMode> wrap_s;
134 BitField<11, 2, WrapMode> wrap_t; 135 BitField<12, 2, WrapMode> wrap_t;
135 }; 136 };
136 137
137 INSERT_PADDING_WORDS(0x1); 138 INSERT_PADDING_WORDS(0x1);
@@ -223,6 +224,8 @@ struct Regs {
223 struct TevStageConfig { 224 struct TevStageConfig {
224 enum class Source : u32 { 225 enum class Source : u32 {
225 PrimaryColor = 0x0, 226 PrimaryColor = 0x0,
227 PrimaryFragmentColor = 0x1,
228
226 Texture0 = 0x3, 229 Texture0 = 0x3,
227 Texture1 = 0x4, 230 Texture1 = 0x4,
228 Texture2 = 0x5, 231 Texture2 = 0x5,
@@ -265,6 +268,9 @@ struct Regs {
265 AddSigned = 3, 268 AddSigned = 3,
266 Lerp = 4, 269 Lerp = 4,
267 Subtract = 5, 270 Subtract = 5,
271
272 MultiplyThenAdd = 8,
273 AddThenMultiply = 9,
268 }; 274 };
269 275
270 union { 276 union {
@@ -337,7 +343,7 @@ struct Regs {
337 }; 343 };
338 344
339 union { 345 union {
340 enum BlendEquation : u32 { 346 enum class BlendEquation : u32 {
341 Add = 0, 347 Add = 0,
342 Subtract = 1, 348 Subtract = 1,
343 ReverseSubtract = 2, 349 ReverseSubtract = 2,
@@ -421,7 +427,7 @@ struct Regs {
421 INSERT_PADDING_WORDS(0x6); 427 INSERT_PADDING_WORDS(0x6);
422 428
423 u32 depth_format; 429 u32 depth_format;
424 u32 color_format; 430 BitField<16, 3, u32> color_format;
425 431
426 INSERT_PADDING_WORDS(0x4); 432 INSERT_PADDING_WORDS(0x4);
427 433
@@ -678,7 +684,9 @@ struct Regs {
678 INSERT_PADDING_WORDS(0x2); 684 INSERT_PADDING_WORDS(0x2);
679 685
680 struct { 686 struct {
681 u32 begin_load; 687 // Offset of the next instruction to write code to.
688 // Incremented with each instruction write.
689 u32 offset;
682 690
683 // Writing to these registers sets the "current" word in the shader program. 691 // Writing to these registers sets the "current" word in the shader program.
684 // TODO: It's not clear how the hardware stores what the "current" word is. 692 // TODO: It's not clear how the hardware stores what the "current" word is.
@@ -690,7 +698,9 @@ struct Regs {
690 // This register group is used to load an internal table of swizzling patterns, 698 // This register group is used to load an internal table of swizzling patterns,
691 // which are indexed by each shader instruction to specify vector component swizzling. 699 // which are indexed by each shader instruction to specify vector component swizzling.
692 struct { 700 struct {
693 u32 begin_load; 701 // Offset of the next swizzle pattern to write code to.
702 // Incremented with each instruction write.
703 u32 offset;
694 704
695 // Writing to these registers sets the "current" swizzle pattern in the table. 705 // Writing to these registers sets the "current" swizzle pattern in the table.
696 // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is. 706 // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is.
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 3faa10153..94873f406 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -5,6 +5,7 @@
5#include <algorithm> 5#include <algorithm>
6 6
7#include "common/common_types.h" 7#include "common/common_types.h"
8#include "common/math_util.h"
8 9
9#include "math.h" 10#include "math.h"
10#include "pica.h" 11#include "pica.h"
@@ -20,16 +21,31 @@ namespace Rasterizer {
20static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) { 21static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
21 const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress(); 22 const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
22 u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr))); 23 u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
23 u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
24 24
25 // Assuming RGBA8 format until actual framebuffer format handling is implemented 25 // Similarly to textures, the render framebuffer is laid out from bottom to top, too.
26 *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value; 26 // NOTE: The framebuffer height register contains the actual FB height minus one.
27 y = (registers.framebuffer.height - y);
28
29 switch (registers.framebuffer.color_format) {
30 case registers.framebuffer.RGBA8:
31 {
32 u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
33 *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value;
34 break;
35 }
36
37 default:
38 LOG_CRITICAL(Render_Software, "Unknown framebuffer color format %x", registers.framebuffer.color_format);
39 UNIMPLEMENTED();
40 }
27} 41}
28 42
29static const Math::Vec4<u8> GetPixel(int x, int y) { 43static const Math::Vec4<u8> GetPixel(int x, int y) {
30 const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress(); 44 const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
31 u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr))); 45 u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
32 46
47 y = (registers.framebuffer.height - y);
48
33 u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth()); 49 u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth());
34 Math::Vec4<u8> ret; 50 Math::Vec4<u8> ret;
35 ret.a() = value >> 24; 51 ret.a() = value >> 24;
@@ -43,6 +59,8 @@ static u32 GetDepth(int x, int y) {
43 const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress(); 59 const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
44 u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr))); 60 u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
45 61
62 y = (registers.framebuffer.height - y);
63
46 // Assuming 16-bit depth buffer format until actual format handling is implemented 64 // Assuming 16-bit depth buffer format until actual format handling is implemented
47 return *(depth_buffer + x + y * registers.framebuffer.GetWidth()); 65 return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
48} 66}
@@ -51,6 +69,8 @@ static void SetDepth(int x, int y, u16 value) {
51 const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress(); 69 const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
52 u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr))); 70 u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
53 71
72 y = (registers.framebuffer.height - y);
73
54 // Assuming 16-bit depth buffer format until actual format handling is implemented 74 // Assuming 16-bit depth buffer format until actual format handling is implemented
55 *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value; 75 *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
56} 76}
@@ -90,30 +110,43 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1,
90 return Math::Cross(vec1, vec2).z; 110 return Math::Cross(vec1, vec2).z;
91}; 111};
92 112
93void ProcessTriangle(const VertexShader::OutputVertex& v0, 113/**
94 const VertexShader::OutputVertex& v1, 114 * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing
95 const VertexShader::OutputVertex& v2) 115 * culling via recursion.
116 */
117static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
118 const VertexShader::OutputVertex& v1,
119 const VertexShader::OutputVertex& v2,
120 bool reversed = false)
96{ 121{
97 // vertex positions in rasterizer coordinates 122 // vertex positions in rasterizer coordinates
98 auto FloatToFix = [](float24 flt) { 123 static auto FloatToFix = [](float24 flt) {
99 return Fix12P4(static_cast<unsigned short>(flt.ToFloat32() * 16.0f)); 124 // TODO: Rounding here is necessary to prevent garbage pixels at
100 }; 125 // triangle borders. Is it that the correct solution, though?
101 auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) { 126 return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f)));
102 return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)}; 127 };
103 }; 128 static auto ScreenToRasterizerCoordinates = [](const Math::Vec3<float24>& vec) {
129 return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
130 };
104 131
105 Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos), 132 Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos),
106 ScreenToRasterizerCoordinates(v1.screenpos), 133 ScreenToRasterizerCoordinates(v1.screenpos),
107 ScreenToRasterizerCoordinates(v2.screenpos) }; 134 ScreenToRasterizerCoordinates(v2.screenpos) };
108 135
109 if (registers.cull_mode == Regs::CullMode::KeepClockWise) { 136 if (registers.cull_mode == Regs::CullMode::KeepAll) {
110 // Reverse vertex order and use the CCW code path. 137 // Make sure we always end up with a triangle wound counter-clockwise
111 std::swap(vtxpos[1], vtxpos[2]); 138 if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
112 } 139 ProcessTriangleInternal(v0, v2, v1, true);
140 return;
141 }
142 } else {
143 if (!reversed && registers.cull_mode == Regs::CullMode::KeepClockWise) {
144 // Reverse vertex order and use the CCW code path.
145 ProcessTriangleInternal(v0, v2, v1, true);
146 return;
147 }
113 148
114 if (registers.cull_mode != Regs::CullMode::KeepAll) {
115 // Cull away triangles which are wound clockwise. 149 // Cull away triangles which are wound clockwise.
116 // TODO: A check for degenerate triangles ("== 0") should be considered for CullMode::KeepAll
117 if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) 150 if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
118 return; 151 return;
119 } 152 }
@@ -155,9 +188,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
155 auto textures = registers.GetTextures(); 188 auto textures = registers.GetTextures();
156 auto tev_stages = registers.GetTevStages(); 189 auto tev_stages = registers.GetTevStages();
157 190
191 // Enter rasterization loop, starting at the center of the topleft bounding box corner.
158 // TODO: Not sure if looping through x first might be faster 192 // TODO: Not sure if looping through x first might be faster
159 for (u16 y = min_y; y < max_y; y += 0x10) { 193 for (u16 y = min_y + 8; y < max_y; y += 0x10) {
160 for (u16 x = min_x; x < max_x; x += 0x10) { 194 for (u16 x = min_x + 8; x < max_x; x += 0x10) {
161 195
162 // Calculate the barycentric coordinates w0, w1 and w2 196 // Calculate the barycentric coordinates w0, w1 and w2
163 int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); 197 int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
@@ -220,7 +254,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
220 254
221 int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32(); 255 int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
222 int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32(); 256 int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
223 auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) { 257 static auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
224 switch (mode) { 258 switch (mode) {
225 case Regs::TextureConfig::ClampToEdge: 259 case Regs::TextureConfig::ClampToEdge:
226 val = std::max(val, 0); 260 val = std::max(val, 0);
@@ -228,7 +262,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
228 return val; 262 return val;
229 263
230 case Regs::TextureConfig::Repeat: 264 case Regs::TextureConfig::Repeat:
231 return (int)(((unsigned)val) % size); 265 return (int)((unsigned)val % size);
266
267 case Regs::TextureConfig::MirroredRepeat:
268 {
269 int val = (int)((unsigned)val % (2 * size));
270 if (val >= size)
271 val = 2 * size - 1 - val;
272 return val;
273 }
232 274
233 default: 275 default:
234 LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode); 276 LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode);
@@ -236,6 +278,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
236 return 0; 278 return 0;
237 } 279 }
238 }; 280 };
281
282 // Textures are laid out from bottom to top, hence we invert the t coordinate.
283 // NOTE: This may not be the right place for the inversion.
284 // TODO: Check if this applies to ETC textures, too.
239 s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width); 285 s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
240 t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); 286 t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
241 287
@@ -262,7 +308,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
262 308
263 auto GetSource = [&](Source source) -> Math::Vec4<u8> { 309 auto GetSource = [&](Source source) -> Math::Vec4<u8> {
264 switch (source) { 310 switch (source) {
311 // TODO: What's the difference between these two?
265 case Source::PrimaryColor: 312 case Source::PrimaryColor:
313 case Source::PrimaryFragmentColor:
266 return primary_color; 314 return primary_color;
267 315
268 case Source::Texture0: 316 case Source::Texture0:
@@ -378,6 +426,25 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
378 return result.Cast<u8>(); 426 return result.Cast<u8>();
379 } 427 }
380 428
429 case Operation::MultiplyThenAdd:
430 {
431 auto result = (input[0] * input[1] + 255 * input[2].Cast<int>()) / 255;
432 result.r() = std::min(255, result.r());
433 result.g() = std::min(255, result.g());
434 result.b() = std::min(255, result.b());
435 return result.Cast<u8>();
436 }
437
438 case Operation::AddThenMultiply:
439 {
440 auto result = input[0] + input[1];
441 result.r() = std::min(255, result.r());
442 result.g() = std::min(255, result.g());
443 result.b() = std::min(255, result.b());
444 result = (result * input[2].Cast<int>()) / 255;
445 return result.Cast<u8>();
446 }
447
381 default: 448 default:
382 LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op); 449 LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
383 UNIMPLEMENTED(); 450 UNIMPLEMENTED();
@@ -402,6 +469,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
402 case Operation::Subtract: 469 case Operation::Subtract:
403 return std::max(0, (int)input[0] - (int)input[1]); 470 return std::max(0, (int)input[0] - (int)input[1]);
404 471
472 case Operation::MultiplyThenAdd:
473 return std::min(255, (input[0] * input[1] + 255 * input[2]) / 255);
474
475 case Operation::AddThenMultiply:
476 return (std::min(255, (input[0] + input[1])) * input[2]) / 255;
477
405 default: 478 default:
406 LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op); 479 LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op);
407 UNIMPLEMENTED(); 480 UNIMPLEMENTED();
@@ -475,7 +548,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
475 548
476 // TODO: Does depth indeed only get written even if depth testing is enabled? 549 // TODO: Does depth indeed only get written even if depth testing is enabled?
477 if (registers.output_merger.depth_test_enable) { 550 if (registers.output_merger.depth_test_enable) {
478 u16 z = (u16)(-(v0.screenpos[2].ToFloat32() * w0 + 551 u16 z = (u16)((v0.screenpos[2].ToFloat32() * w0 +
479 v1.screenpos[2].ToFloat32() * w1 + 552 v1.screenpos[2].ToFloat32() * w1 +
480 v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum); 553 v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
481 u16 ref_z = GetDepth(x >> 4, y >> 4); 554 u16 ref_z = GetDepth(x >> 4, y >> 4);
@@ -524,6 +597,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
524 } 597 }
525 598
526 auto dest = GetPixel(x >> 4, y >> 4); 599 auto dest = GetPixel(x >> 4, y >> 4);
600 Math::Vec4<u8> blend_output = combiner_output;
527 601
528 if (registers.output_merger.alphablend_enable) { 602 if (registers.output_merger.alphablend_enable) {
529 auto params = registers.output_merger.alpha_blending; 603 auto params = registers.output_merger.alpha_blending;
@@ -574,7 +648,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
574 648
575 default: 649 default:
576 LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor); 650 LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor);
577 exit(0); 651 UNIMPLEMENTED();
578 break; 652 break;
579 } 653 }
580 }; 654 };
@@ -607,86 +681,78 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
607 681
608 default: 682 default:
609 LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor); 683 LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor);
610 exit(0); 684 UNIMPLEMENTED();
685 break;
686 }
687 };
688
689 using BlendEquation = decltype(params)::BlendEquation;
690 static auto EvaluateBlendEquation = [](const Math::Vec4<u8>& src, const Math::Vec4<u8>& srcfactor,
691 const Math::Vec4<u8>& dest, const Math::Vec4<u8>& destfactor,
692 BlendEquation equation) {
693 Math::Vec4<int> result;
694
695 auto src_result = (src * srcfactor).Cast<int>();
696 auto dst_result = (dest * destfactor).Cast<int>();
697
698 switch (equation) {
699 case BlendEquation::Add:
700 result = (src_result + dst_result) / 255;
611 break; 701 break;
702
703 case BlendEquation::Subtract:
704 result = (src_result - dst_result) / 255;
705 break;
706
707 case BlendEquation::ReverseSubtract:
708 result = (dst_result - src_result) / 255;
709 break;
710
711 // TODO: How do these two actually work?
712 // OpenGL doesn't include the blend factors in the min/max computations,
713 // but is this what the 3DS actually does?
714 case BlendEquation::Min:
715 result.r() = std::min(src.r(), dest.r());
716 result.g() = std::min(src.g(), dest.g());
717 result.b() = std::min(src.b(), dest.b());
718 result.a() = std::min(src.a(), dest.a());
719 break;
720
721 case BlendEquation::Max:
722 result.r() = std::max(src.r(), dest.r());
723 result.g() = std::max(src.g(), dest.g());
724 result.b() = std::max(src.b(), dest.b());
725 result.a() = std::max(src.a(), dest.a());
726 break;
727
728 default:
729 LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", equation);
730 UNIMPLEMENTED();
612 } 731 }
732
733 return Math::Vec4<u8>(MathUtil::Clamp(result.r(), 0, 255),
734 MathUtil::Clamp(result.g(), 0, 255),
735 MathUtil::Clamp(result.b(), 0, 255),
736 MathUtil::Clamp(result.a(), 0, 255));
613 }; 737 };
614 738
615 auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb), 739 auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb),
616 LookupFactorA(params.factor_source_a)); 740 LookupFactorA(params.factor_source_a));
617 auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb), 741 auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb),
618 LookupFactorA(params.factor_dest_a)); 742 LookupFactorA(params.factor_dest_a));
619
620 auto src_result = (combiner_output * srcfactor).Cast<int>();
621 auto dst_result = (dest * dstfactor).Cast<int>();
622
623 switch (params.blend_equation_rgb) {
624 case params.Add:
625 {
626 auto result = (src_result + dst_result) / 255;
627 result.r() = std::min(255, result.r());
628 result.g() = std::min(255, result.g());
629 result.b() = std::min(255, result.b());
630 combiner_output = result.Cast<u8>();
631 break;
632 }
633
634 case params.Subtract:
635 {
636 auto result = (src_result - dst_result) / 255;
637 result.r() = std::max(0, result.r());
638 result.g() = std::max(0, result.g());
639 result.b() = std::max(0, result.b());
640 combiner_output = result.Cast<u8>();
641 break;
642 }
643
644 case params.ReverseSubtract:
645 {
646 auto result = (dst_result - src_result) / 255;
647 result.r() = std::max(0, result.r());
648 result.g() = std::max(0, result.g());
649 result.b() = std::max(0, result.b());
650 combiner_output = result.Cast<u8>();
651 break;
652 }
653
654 case params.Min:
655 {
656 // TODO: GL spec says to do it without the factors, but is this what the 3DS does?
657 Math::Vec4<int> result;
658 result.r() = std::min(combiner_output.r(),dest.r());
659 result.g() = std::min(combiner_output.g(),dest.g());
660 result.b() = std::min(combiner_output.b(),dest.b());
661 combiner_output = result.Cast<u8>();
662 break;
663 }
664
665 case params.Max:
666 {
667 // TODO: GL spec says to do it without the factors, but is this what the 3DS does?
668 Math::Vec4<int> result;
669 result.r() = std::max(combiner_output.r(),dest.r());
670 result.g() = std::max(combiner_output.g(),dest.g());
671 result.b() = std::max(combiner_output.b(),dest.b());
672 combiner_output = result.Cast<u8>();
673 break;
674 }
675 743
676 default: 744 blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_rgb);
677 LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", params.blend_equation_rgb.Value()); 745 blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a();
678 exit(0);
679 }
680 } else { 746 } else {
681 LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op); 747 LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op);
682 exit(0); 748 UNIMPLEMENTED();
683 } 749 }
684 750
685 const Math::Vec4<u8> result = { 751 const Math::Vec4<u8> result = {
686 registers.output_merger.red_enable ? combiner_output.r() : dest.r(), 752 registers.output_merger.red_enable ? blend_output.r() : dest.r(),
687 registers.output_merger.green_enable ? combiner_output.g() : dest.g(), 753 registers.output_merger.green_enable ? blend_output.g() : dest.g(),
688 registers.output_merger.blue_enable ? combiner_output.b() : dest.b(), 754 registers.output_merger.blue_enable ? blend_output.b() : dest.b(),
689 registers.output_merger.alpha_enable ? combiner_output.a() : dest.a() 755 registers.output_merger.alpha_enable ? blend_output.a() : dest.a()
690 }; 756 };
691 757
692 DrawPixel(x >> 4, y >> 4, result); 758 DrawPixel(x >> 4, y >> 4, result);
@@ -694,6 +760,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
694 } 760 }
695} 761}
696 762
763void ProcessTriangle(const VertexShader::OutputVertex& v0,
764 const VertexShader::OutputVertex& v1,
765 const VertexShader::OutputVertex& v2) {
766 ProcessTriangleInternal(v0, v1, v2);
767}
768
697} // namespace Rasterizer 769} // namespace Rasterizer
698 770
699} // namespace Pica 771} // namespace Pica
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 80935a50a..def868ac7 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -85,8 +85,11 @@ struct VertexShaderState {
85 }; 85 };
86 86
87 struct CallStackElement { 87 struct CallStackElement {
88 u32 final_address; 88 u32 final_address; // Address upon which we jump to return_address
89 u32 return_address; 89 u32 return_address; // Where to jump when leaving scope
90 u8 repeat_counter; // How often to repeat until this call stack element is removed
91 u8 loop_increment; // Which value to add to the loop counter after an iteration
92 // TODO: Should this be a signed value? Does it even matter?
90 }; 93 };
91 94
92 // TODO: Is there a maximal size for this? 95 // TODO: Is there a maximal size for this?
@@ -105,9 +108,14 @@ static void ProcessShaderCode(VertexShaderState& state) {
105 108
106 while (true) { 109 while (true) {
107 if (!state.call_stack.empty()) { 110 if (!state.call_stack.empty()) {
108 if (state.program_counter - shader_memory.data() == state.call_stack.top().final_address) { 111 auto& top = state.call_stack.top();
109 state.program_counter = &shader_memory[state.call_stack.top().return_address]; 112 if (state.program_counter - shader_memory.data() == top.final_address) {
110 state.call_stack.pop(); 113 state.address_registers[2] += top.loop_increment;
114
115 if (top.repeat_counter-- == 0) {
116 state.program_counter = &shader_memory[top.return_address];
117 state.call_stack.pop();
118 }
111 119
112 // TODO: Is "trying again" accurate to hardware? 120 // TODO: Is "trying again" accurate to hardware?
113 continue; 121 continue;
@@ -118,9 +126,10 @@ static void ProcessShaderCode(VertexShaderState& state) {
118 const Instruction& instr = *(const Instruction*)state.program_counter; 126 const Instruction& instr = *(const Instruction*)state.program_counter;
119 const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id]; 127 const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
120 128
121 auto call = [&](VertexShaderState& state, u32 offset, u32 num_instructions, u32 return_offset) { 129 static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions,
130 u32 return_offset, u8 repeat_count, u8 loop_increment) {
122 state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset 131 state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
123 state.call_stack.push({ offset + num_instructions, return_offset }); 132 state.call_stack.push({ offset + num_instructions, return_offset, repeat_count, loop_increment });
124 }; 133 };
125 u32 binary_offset = state.program_counter - shader_memory.data(); 134 u32 binary_offset = state.program_counter - shader_memory.data();
126 135
@@ -457,7 +466,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
457 call(state, 466 call(state,
458 instr.flow_control.dest_offset, 467 instr.flow_control.dest_offset,
459 instr.flow_control.num_instructions, 468 instr.flow_control.num_instructions,
460 binary_offset + 1); 469 binary_offset + 1, 0, 0);
461 break; 470 break;
462 471
463 case Instruction::OpCode::CALLU: 472 case Instruction::OpCode::CALLU:
@@ -465,7 +474,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
465 call(state, 474 call(state,
466 instr.flow_control.dest_offset, 475 instr.flow_control.dest_offset,
467 instr.flow_control.num_instructions, 476 instr.flow_control.num_instructions,
468 binary_offset + 1); 477 binary_offset + 1, 0, 0);
469 } 478 }
470 break; 479 break;
471 480
@@ -474,7 +483,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
474 call(state, 483 call(state,
475 instr.flow_control.dest_offset, 484 instr.flow_control.dest_offset,
476 instr.flow_control.num_instructions, 485 instr.flow_control.num_instructions,
477 binary_offset + 1); 486 binary_offset + 1, 0, 0);
478 } 487 }
479 break; 488 break;
480 489
@@ -486,12 +495,12 @@ static void ProcessShaderCode(VertexShaderState& state) {
486 call(state, 495 call(state,
487 binary_offset + 1, 496 binary_offset + 1,
488 instr.flow_control.dest_offset - binary_offset - 1, 497 instr.flow_control.dest_offset - binary_offset - 1,
489 instr.flow_control.dest_offset + instr.flow_control.num_instructions); 498 instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
490 } else { 499 } else {
491 call(state, 500 call(state,
492 instr.flow_control.dest_offset, 501 instr.flow_control.dest_offset,
493 instr.flow_control.num_instructions, 502 instr.flow_control.num_instructions,
494 instr.flow_control.dest_offset + instr.flow_control.num_instructions); 503 instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
495 } 504 }
496 505
497 break; 506 break;
@@ -504,17 +513,30 @@ static void ProcessShaderCode(VertexShaderState& state) {
504 call(state, 513 call(state,
505 binary_offset + 1, 514 binary_offset + 1,
506 instr.flow_control.dest_offset - binary_offset - 1, 515 instr.flow_control.dest_offset - binary_offset - 1,
507 instr.flow_control.dest_offset + instr.flow_control.num_instructions); 516 instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
508 } else { 517 } else {
509 call(state, 518 call(state,
510 instr.flow_control.dest_offset, 519 instr.flow_control.dest_offset,
511 instr.flow_control.num_instructions, 520 instr.flow_control.num_instructions,
512 instr.flow_control.dest_offset + instr.flow_control.num_instructions); 521 instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
513 } 522 }
514 523
515 break; 524 break;
516 } 525 }
517 526
527 case Instruction::OpCode::LOOP:
528 {
529 state.address_registers[2] = shader_uniforms.i[instr.flow_control.int_uniform_id].y;
530
531 call(state,
532 binary_offset + 1,
533 instr.flow_control.dest_offset - binary_offset + 1,
534 instr.flow_control.dest_offset + 1,
535 shader_uniforms.i[instr.flow_control.int_uniform_id].x,
536 shader_uniforms.i[instr.flow_control.int_uniform_id].z);
537 break;
538 }
539
518 default: 540 default:
519 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", 541 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
520 (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex); 542 (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex);