5 files changed, 55 insertions, 43 deletions
diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp
index b40f13cae..f80e26ecd 100644
--- a/src/core/hw/y2r.cpp
+++ b/src/core/hw/y2r.cpp
@@ -111,7 +111,7 @@ static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data
        while (output < unit_end) {
            u32 color = *input++;
            Math::Vec4<u8> col_vec{
-                (color >> 24) & 0xFF, (color >> 16) & 0xFF, (color >>  8) & 0xFF, alpha,
+                (u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha
            };
            switch (output_format) {
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 224132d71..558b49d60 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -153,7 +153,7 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                  "Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), "
                  "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
                  "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
-                  i, output_list->size(),
+                  i + 1, output_list->size() - 2,
                  vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
                  vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
                  vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 2a1c885a7..f2e3aee85 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -60,6 +60,46 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
            GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::P3D);
            break;
+        // Load default vertex input attributes
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[0], 0x233):
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[1], 0x234):
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[2], 0x235):
+        {
+            // TODO: Does actual hardware indeed keep an intermediate buffer or does
+            //       it directly write the values?
+            default_attr_write_buffer[default_attr_counter++] = value;
+            // Default attributes are written in a packed format such that four float24 values are encoded in
+            // three 32-bit numbers. We write to internal memory once a full such vector is
+            // written.
+            if (default_attr_counter >= 3) {
+                default_attr_counter = 0;
+                auto& setup = regs.vs_default_attributes_setup;
+                if (setup.index >= 16) {
+                    LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
+                    break;
+                }
+                Math::Vec4<float24>& attribute = g_state.vs.default_attributes[setup.index];
+                // NOTE: The destination component order indeed is "backwards"
+                attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
+                attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
+                attribute.y = float24::FromRawFloat24(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
+                attribute.x = float24::FromRawFloat24(default_attr_write_buffer[2] & 0xFFFFFF);
+                LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
+                          attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
+                          attribute.w.ToFloat32());
+                // TODO: Verify that this actually modifies the register!
+                setup.index = setup.index + 1;
+            }
+            break;
+        }
        case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[0], 0x23c):
        case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[1], 0x23d):
        {
@@ -351,46 +391,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
            break;
        }
-        // Load default vertex input attributes
-        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[0], 0x233):
-        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[1], 0x234):
-        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[2], 0x235):
-        {
-            // TODO: Does actual hardware indeed keep an intermediate buffer or does
-            //       it directly write the values?
-            default_attr_write_buffer[default_attr_counter++] = value;
-            // Default attributes are written in a packed format such that four float24 values are encoded in
-            // three 32-bit numbers. We write to internal memory once a full such vector is
-            // written.
-            if (default_attr_counter >= 3) {
-                default_attr_counter = 0;
-                auto& setup = regs.vs_default_attributes_setup;
-                if (setup.index >= 16) {
-                    LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
-                    break;
-                }
-                Math::Vec4<float24>& attribute = g_state.vs.default_attributes[setup.index];
-                // NOTE: The destination component order indeed is "backwards"
-                attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
-                attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
-                attribute.y = float24::FromRawFloat24(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
-                attribute.x = float24::FromRawFloat24(default_attr_write_buffer[2] & 0xFFFFFF);
-                LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
-                          attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
-                          attribute.w.ToFloat32());
-                // TODO: Verify that this actually modifies the register!
-                setup.index = setup.index + 1;
-            }
-            break;
-        }
        // Load shader program code
        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 46a7b21dc..026b10a62 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -290,6 +290,7 @@ struct Regs {
            AddSigned       = 3,
            Lerp            = 4,
            Subtract        = 5,
+            Dot3_RGB        = 6,
            MultiplyThenAdd = 8,
            AddThenMultiply = 9,
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index c381c2bd9..a6b7997ce 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -641,7 +641,18 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                        result = (result * input[2].Cast<int>()) / 255;
                        return result.Cast<u8>();
                    }
+                    case Operation::Dot3_RGB:
+                    {
+                        // Not fully accurate.
+                        // Worst case scenario seems to yield a +/-3 error
+                        // Some HW results indicate that the per-component computation can't have a higher precision than 1/256,
+                        // while dot3_rgb( (0x80,g0,b0),(0x7F,g1,b1) ) and dot3_rgb( (0x80,g0,b0),(0x80,g1,b1) ) give different results
+                        int result = ((input[0].r() * 2 - 255) * (input[1].r() * 2 - 255) + 128) / 256 +
+                                     ((input[0].g() * 2 - 255) * (input[1].g() * 2 - 255) + 128) / 256 +
+                                     ((input[0].b() * 2 - 255) * (input[1].b() * 2 - 255) + 128) / 256;
+                        result = std::max(0, std::min(255, result));
+                        return { (u8)result, (u8)result, (u8)result };
+                    }
                    default:
                        LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
                        UNIMPLEMENTED();

diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp index b40f13cae..f80e26ecd 100644 --- a/src/core/hw/y2r.cpp +++ b/src/core/hw/y2r.cpp
@@ -111,7 +111,7 @@ static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data
111	while (output < unit_end) {	111	while (output < unit_end) {
112	u32 color = *input++;	112	u32 color = *input++;
113	Math::Vec4<u8> col_vec{	113	Math::Vec4<u8> col_vec{
114	(color >> 24) & 0xFF, (color >> 16) & 0xFF, (color >> 8) & 0xFF, alpha,	114	(u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha
115	};	115	};
116		116
117	switch (output_format) {	117	switch (output_format) {


diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index 224132d71..558b49d60 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp
@@ -153,7 +153,7 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
153	"Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), "	153	"Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), "
154	"(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "	154	"(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
155	"screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",	155	"screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
156	i, output_list->size(),	156	i + 1, output_list->size() - 2,
157	vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),	157	vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
158	vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),	158	vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
159	vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),	159	vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),


diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 2a1c885a7..f2e3aee85 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp
@@ -60,6 +60,46 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
60	GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::P3D);	60	GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::P3D);
61	break;	61	break;
62		62
		63	// Load default vertex input attributes
		64	case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[0], 0x233):
		65	case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[1], 0x234):
		66	case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[2], 0x235):
		67	{
		68	// TODO: Does actual hardware indeed keep an intermediate buffer or does
		69	// it directly write the values?
		70	default_attr_write_buffer[default_attr_counter++] = value;
		71
		72	// Default attributes are written in a packed format such that four float24 values are encoded in
		73	// three 32-bit numbers. We write to internal memory once a full such vector is
		74	// written.
		75	if (default_attr_counter >= 3) {
		76	default_attr_counter = 0;
		77
		78	auto& setup = regs.vs_default_attributes_setup;
		79
		80	if (setup.index >= 16) {
		81	LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
		82	break;
		83	}
		84
		85	Math::Vec4<float24>& attribute = g_state.vs.default_attributes[setup.index];
		86
		87	// NOTE: The destination component order indeed is "backwards"
		88	attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
		89	attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
		90	attribute.y = float24::FromRawFloat24(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
		91	attribute.x = float24::FromRawFloat24(default_attr_write_buffer[2] & 0xFFFFFF);
		92
		93	LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
		94	attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
		95	attribute.w.ToFloat32());
		96
		97	// TODO: Verify that this actually modifies the register!
		98	setup.index = setup.index + 1;
		99	}
		100	break;
		101	}
		102
63	case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[0], 0x23c):	103	case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[0], 0x23c):
64	case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[1], 0x23d):	104	case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[1], 0x23d):
65	{	105	{
@@ -351,46 +391,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
351	break;	391	break;
352	}	392	}
353		393
354	// Load default vertex input attributes
355	case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[0], 0x233):
356	case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[1], 0x234):
357	case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[2], 0x235):
358	{
359	// TODO: Does actual hardware indeed keep an intermediate buffer or does
360	// it directly write the values?
361	default_attr_write_buffer[default_attr_counter++] = value;
362
363	// Default attributes are written in a packed format such that four float24 values are encoded in
364	// three 32-bit numbers. We write to internal memory once a full such vector is
365	// written.
366	if (default_attr_counter >= 3) {
367	default_attr_counter = 0;
368
369	auto& setup = regs.vs_default_attributes_setup;
370
371	if (setup.index >= 16) {
372	LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
373	break;
374	}
375
376	Math::Vec4<float24>& attribute = g_state.vs.default_attributes[setup.index];
377
378	// NOTE: The destination component order indeed is "backwards"
379	attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
380	attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
381	attribute.y = float24::FromRawFloat24(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
382	attribute.x = float24::FromRawFloat24(default_attr_write_buffer[2] & 0xFFFFFF);
383
384	LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
385	attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
386	attribute.w.ToFloat32());
387
388	// TODO: Verify that this actually modifies the register!
389	setup.index = setup.index + 1;
390	}
391	break;
392	}
393
394	// Load shader program code	394	// Load shader program code
395	case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):	395	case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
396	case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):	396	case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):


diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 46a7b21dc..026b10a62 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h
@@ -290,6 +290,7 @@ struct Regs {
290	AddSigned = 3,	290	AddSigned = 3,
291	Lerp = 4,	291	Lerp = 4,
292	Subtract = 5,	292	Subtract = 5,
		293	Dot3_RGB = 6,
293		294
294	MultiplyThenAdd = 8,	295	MultiplyThenAdd = 8,
295	AddThenMultiply = 9,	296	AddThenMultiply = 9,


diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index c381c2bd9..a6b7997ce 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp
@@ -641,7 +641,18 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
641	result = (result * input[2].Cast<int>()) / 255;	641	result = (result * input[2].Cast<int>()) / 255;
642	return result.Cast<u8>();	642	return result.Cast<u8>();
643	}	643	}
644		644	case Operation::Dot3_RGB:
		645	{
		646	// Not fully accurate.
		647	// Worst case scenario seems to yield a +/-3 error
		648	// Some HW results indicate that the per-component computation can't have a higher precision than 1/256,
		649	// while dot3_rgb( (0x80,g0,b0),(0x7F,g1,b1) ) and dot3_rgb( (0x80,g0,b0),(0x80,g1,b1) ) give different results
		650	int result = ((input[0].r() * 2 - 255) * (input[1].r() * 2 - 255) + 128) / 256 +
		651	((input[0].g() * 2 - 255) * (input[1].g() * 2 - 255) + 128) / 256 +
		652	((input[0].b() * 2 - 255) * (input[1].b() * 2 - 255) + 128) / 256;
		653	result = std::max(0, std::min(255, result));
		654	return { (u8)result, (u8)result, (u8)result };
		655	}
645	default:	656	default:
646	LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);	657	LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
647	UNIMPLEMENTED();	658	UNIMPLEMENTED();