Pica/VertexShader: Fix a bug caused by incorrect assumptions about consecutive output register tables.

We now create a temporary buffer for output registers and copy all of them to the actual output vertex structure after the shader has run. This is technically not necessary, but it's easier to vectorize in the future.
author: Tony Wasserka 2015-03-12 14:18:46 +0100
committer: Tony Wasserka 2015-03-12 14:18:46 +0100
commit: e4f5ec6272016dd34afe4e8901a9e8027324ba21 (patch)
tree: 60f2db6f42538875bf8e552926e5df4dbddfab58 /src
parent: Merge pull request #642 from bunnei/touchpad (diff)
download: yuzu-e4f5ec6272016dd34afe4e8901a9e8027324ba21.tar.gz
yuzu-e4f5ec6272016dd34afe4e8901a9e8027324ba21.tar.xz
yuzu-e4f5ec6272016dd34afe4e8901a9e8027324ba21.zip
1 files changed, 24 insertions, 20 deletions
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 4eb3e743e..e8d865172 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -72,7 +72,7 @@ struct VertexShaderState {
    u32* program_counter;
    const float24* input_register_table[16];
-    float24* output_register_table[7*4];
+    Math::Vec4<float24> output_registers[16];
    Math::Vec4<float24> temporary_registers[16];
    bool conditional_code[2];
@@ -198,8 +198,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                src2[3] = src2[3] * float24::FromFloat32(-1);
            }
-            float24* dest = (instr.common.dest.Value() < 0x08) ? state.output_register_table[4*instr.common.dest.Value().GetIndex()]
+            float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers[instr.common.dest.Value().GetIndex()][0]
-                        : (instr.common.dest.Value() < 0x10) ? dummy_vec4_float24
                        : (instr.common.dest.Value() < 0x20) ? &state.temporary_registers[instr.common.dest.Value().GetIndex()][0]
                        : dummy_vec4_float24;
@@ -409,8 +408,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                    src3[3] = src3[3] * float24::FromFloat32(-1);
                }
-                float24* dest = (instr.mad.dest.Value() < 0x08) ? state.output_register_table[4*instr.mad.dest.Value().GetIndex()]
+                float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers[instr.mad.dest.Value().GetIndex()][0]
-                            : (instr.mad.dest.Value() < 0x10) ? dummy_vec4_float24
                            : (instr.mad.dest.Value() < 0x20) ? &state.temporary_registers[instr.mad.dest.Value().GetIndex()][0]
                            : dummy_vec4_float24;
@@ -587,12 +585,18 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) {
    if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;
    if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;
-    // Setup output register table
+    state.conditional_code[0] = false;
-    OutputVertex ret;
+    state.conditional_code[1] = false;
-    // Zero output so that attributes which aren't output won't have denormals in them, which will
-    // slow us down later.
+    ProcessShaderCode(state);
-    memset(&ret, 0, sizeof(ret));
+    DebugUtils::DumpShader(shader_memory.data(), state.debug.max_offset, swizzle_data.data(),
+                           state.debug.max_opdesc_id, registers.vs_main_offset,
+                           registers.vs_output_attributes);
+    // Setup output data
+    OutputVertex ret;
+    // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
+    // figure out what those circumstances are and enable the remaining outputs then.
    for (int i = 0; i < 7; ++i) {
        const auto& output_register_map = registers.vs_output_attributes[i];
@@ -601,18 +605,18 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) {
            output_register_map.map_z, output_register_map.map_w
        };
-        for (int comp = 0; comp < 4; ++comp)
+        for (int comp = 0; comp < 4; ++comp) {
-            state.output_register_table[4*i+comp] = ((float24*)&ret) + semantics[comp];
+            float24* out = ((float24*)&ret) + semantics[comp];
+            if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
+                *out = state.output_registers[i][comp];
+            } else {
+                // Zero output so that attributes which aren't output won't have denormals in them,
+                // which would slow us down later.
+                memset(out, 0, sizeof(*out));
+            }
+        }
    }
-    state.conditional_code[0] = false;
-    state.conditional_code[1] = false;
-    ProcessShaderCode(state);
-    DebugUtils::DumpShader(shader_memory.data(), state.debug.max_offset, swizzle_data.data(),
-                           state.debug.max_opdesc_id, registers.vs_main_offset,
-                           registers.vs_output_attributes);
    LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
        ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
        ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
author	Tony Wasserka	2015-03-12 14:18:46 +0100
committer	Tony Wasserka	2015-03-12 14:18:46 +0100
commit	e4f5ec6272016dd34afe4e8901a9e8027324ba21 (patch)
tree	60f2db6f42538875bf8e552926e5df4dbddfab58 /src
parent	Merge pull request #642 from bunnei/touchpad (diff)
download	yuzu-e4f5ec6272016dd34afe4e8901a9e8027324ba21.tar.gz yuzu-e4f5ec6272016dd34afe4e8901a9e8027324ba21.tar.xz yuzu-e4f5ec6272016dd34afe4e8901a9e8027324ba21.zip

diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp index 4eb3e743e..e8d865172 100644 --- a/src/video_core/vertex_shader.cpp +++ b/src/video_core/vertex_shader.cpp
@@ -72,7 +72,7 @@ struct VertexShaderState {
72	u32* program_counter;	72	u32* program_counter;
73		73
74	const float24* input_register_table[16];	74	const float24* input_register_table[16];
75	float24* output_register_table[7*4];	75	Math::Vec4<float24> output_registers[16];
76		76
77	Math::Vec4<float24> temporary_registers[16];	77	Math::Vec4<float24> temporary_registers[16];
78	bool conditional_code[2];	78	bool conditional_code[2];
@@ -198,8 +198,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
198	src2[3] = src2[3] * float24::FromFloat32(-1);	198	src2[3] = src2[3] * float24::FromFloat32(-1);
199	}	199	}
200		200
201	float24* dest = (instr.common.dest.Value() < 0x08) ? state.output_register_table[4*instr.common.dest.Value().GetIndex()]	201	float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers[instr.common.dest.Value().GetIndex()][0]
202	: (instr.common.dest.Value() < 0x10) ? dummy_vec4_float24
203	: (instr.common.dest.Value() < 0x20) ? &state.temporary_registers[instr.common.dest.Value().GetIndex()][0]	202	: (instr.common.dest.Value() < 0x20) ? &state.temporary_registers[instr.common.dest.Value().GetIndex()][0]
204	: dummy_vec4_float24;	203	: dummy_vec4_float24;
205		204
@@ -409,8 +408,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
409	src3[3] = src3[3] * float24::FromFloat32(-1);	408	src3[3] = src3[3] * float24::FromFloat32(-1);
410	}	409	}
411		410
412	float24* dest = (instr.mad.dest.Value() < 0x08) ? state.output_register_table[4*instr.mad.dest.Value().GetIndex()]	411	float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers[instr.mad.dest.Value().GetIndex()][0]
413	: (instr.mad.dest.Value() < 0x10) ? dummy_vec4_float24
414	: (instr.mad.dest.Value() < 0x20) ? &state.temporary_registers[instr.mad.dest.Value().GetIndex()][0]	412	: (instr.mad.dest.Value() < 0x20) ? &state.temporary_registers[instr.mad.dest.Value().GetIndex()][0]
415	: dummy_vec4_float24;	413	: dummy_vec4_float24;
416		414
@@ -587,12 +585,18 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) {
587	if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;	585	if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;
588	if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;	586	if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;
589		587
590	// Setup output register table	588	state.conditional_code[0] = false;
591	OutputVertex ret;	589	state.conditional_code[1] = false;
592	// Zero output so that attributes which aren't output won't have denormals in them, which will	590
593	// slow us down later.	591	ProcessShaderCode(state);
594	memset(&ret, 0, sizeof(ret));	592	DebugUtils::DumpShader(shader_memory.data(), state.debug.max_offset, swizzle_data.data(),
		593	state.debug.max_opdesc_id, registers.vs_main_offset,
		594	registers.vs_output_attributes);
595		595
		596	// Setup output data
		597	OutputVertex ret;
		598	// TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
		599	// figure out what those circumstances are and enable the remaining outputs then.
596	for (int i = 0; i < 7; ++i) {	600	for (int i = 0; i < 7; ++i) {
597	const auto& output_register_map = registers.vs_output_attributes[i];	601	const auto& output_register_map = registers.vs_output_attributes[i];
598		602
@@ -601,18 +605,18 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) {
601	output_register_map.map_z, output_register_map.map_w	605	output_register_map.map_z, output_register_map.map_w
602	};	606	};
603		607
604	for (int comp = 0; comp < 4; ++comp)	608	for (int comp = 0; comp < 4; ++comp) {
605	state.output_register_table[4*i+comp] = ((float24*)&ret) + semantics[comp];	609	float24* out = ((float24*)&ret) + semantics[comp];
		610	if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
		611	*out = state.output_registers[i][comp];
		612	} else {
		613	// Zero output so that attributes which aren't output won't have denormals in them,
		614	// which would slow us down later.
		615	memset(out, 0, sizeof(*out));
		616	}
		617	}
606	}	618	}
607		619
608	state.conditional_code[0] = false;
609	state.conditional_code[1] = false;
610
611	ProcessShaderCode(state);
612	DebugUtils::DumpShader(shader_memory.data(), state.debug.max_offset, swizzle_data.data(),
613	state.debug.max_opdesc_id, registers.vs_main_offset,
614	registers.vs_output_attributes);
615
616	LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",	620	LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
617	ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),	621	ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
618	ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),	622	ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),