Merge pull request #351 from yuriks/optimize

Rasterizer Optimizations
author: Tony Wasserka 2014-12-30 00:13:48 +0100
committer: Tony Wasserka 2014-12-30 00:13:48 +0100
commit: b7e0b16354bc31521785247d7da3ad84f3829ea8 (patch)
tree: 9ed2e151b59536187946efb3fe34b51507b19c56 /src/video_core/clipper.cpp
parent: Merge pull request #361 from lioncash/moreqops (diff)
parent: Rasterizer: Pre-divide vertex attributes by W (diff)
download: yuzu-b7e0b16354bc31521785247d7da3ad84f3829ea8.tar.gz
yuzu-b7e0b16354bc31521785247d7da3ad84f3829ea8.tar.xz
yuzu-b7e0b16354bc31521785247d7da3ad84f3829ea8.zip
1 files changed, 40 insertions, 37 deletions
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 0bcd0b895..1744066ba 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -2,7 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
-#include <vector>
+#include <boost/container/static_vector.hpp>
 #include "clipper.h"
 #include "pica.h"
@@ -91,25 +91,31 @@ static void InitScreenCoordinates(OutputVertex& vtx)
    viewport.zscale     = float24::FromRawFloat24(registers.viewport_depth_range);
    viewport.offset_z   = float24::FromRawFloat24(registers.viewport_depth_far_plane);
+    float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w;
+    vtx.color *= inv_w;
+    vtx.tc0 *= inv_w;
+    vtx.tc1 *= inv_w;
+    vtx.tc2 *= inv_w;
+    vtx.pos.w = inv_w;
    // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
-    vtx.screenpos[0] = (vtx.pos.x / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
+    vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
-    vtx.screenpos[1] = (vtx.pos.y / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
+    vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
-    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z / vtx.pos.w * viewport.zscale;
+    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale;
 }
 void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
+    using boost::container::static_vector;
-    // TODO (neobrain):
-    // The list of output vertices has some fixed maximum size,
+    // Clipping a planar n-gon against a plane will remove at least 1 vertex and introduces 2 at
-    // however I haven't taken the time to figure out what it is exactly.
+    // the new edge (or less in degenerate cases). As such, we can say that each clipping plane
-    // For now, we hence just assume a maximal size of 1000 vertices.
+    // introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a
-    const size_t max_vertices = 1000;
+    // fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9.
-    std::vector<OutputVertex> buffer_vertices;
+    static const size_t MAX_VERTICES = 9;
-    std::vector<OutputVertex*> output_list{ &v0, &v1, &v2 };
+    static_vector<OutputVertex, MAX_VERTICES> buffer_a = { v0, v1, v2 };
+    static_vector<OutputVertex, MAX_VERTICES> buffer_b;
-    // Make sure to reserve space for all vertices.
+    auto* output_list = &buffer_a;
-    // Without this, buffer reallocation would invalidate references.
+    auto* input_list  = &buffer_b;
-    buffer_vertices.reserve(max_vertices);
    // Simple implementation of the Sutherland-Hodgman clipping algorithm.
    // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
@@ -120,48 +126,45 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                       ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
                       ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
-        const std::vector<OutputVertex*> input_list = output_list;
+        std::swap(input_list, output_list);
-        output_list.clear();
+        output_list->clear();
-        const OutputVertex* reference_vertex = input_list.back();
+        const OutputVertex* reference_vertex = &input_list->back();
-        for (const auto& vertex : input_list) {
+        for (const auto& vertex : *input_list) {
            // NOTE: This algorithm changes vertex order in some cases!
-            if (edge.IsInside(*vertex)) {
+            if (edge.IsInside(vertex)) {
                if (edge.IsOutSide(*reference_vertex)) {
-                    buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
+                    output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
-                    output_list.push_back(&(buffer_vertices.back()));
                }
-                output_list.push_back(vertex);
+                output_list->push_back(vertex);
            } else if (edge.IsInside(*reference_vertex)) {
-                buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
+                output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
-                output_list.push_back(&(buffer_vertices.back()));
            }
+            reference_vertex = &vertex;
-            reference_vertex = vertex;
        }
        // Need to have at least a full triangle to continue...
-        if (output_list.size() < 3)
+        if (output_list->size() < 3)
            return;
    }
-    InitScreenCoordinates(*(output_list[0]));
+    InitScreenCoordinates((*output_list)[0]);
-    InitScreenCoordinates(*(output_list[1]));
+    InitScreenCoordinates((*output_list)[1]);
-    for (size_t i = 0; i < output_list.size() - 2; i ++) {
+    for (size_t i = 0; i < output_list->size() - 2; i ++) {
-        OutputVertex& vtx0 = *(output_list[0]);
+        OutputVertex& vtx0 = (*output_list)[0];
-        OutputVertex& vtx1 = *(output_list[i+1]);
+        OutputVertex& vtx1 = (*output_list)[i+1];
-        OutputVertex& vtx2 = *(output_list[i+2]);
+        OutputVertex& vtx2 = (*output_list)[i+2];
        InitScreenCoordinates(vtx2);
        LOG_TRACE(Render_Software,
-                  "Triangle %lu/%lu (%lu buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), "
+                  "Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), "
                  "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
                  "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
-                  i,output_list.size(), buffer_vertices.size(),
+                  i, output_list->size(),
                  vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
                  vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
                  vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),
author	Tony Wasserka	2014-12-30 00:13:48 +0100
committer	Tony Wasserka	2014-12-30 00:13:48 +0100
commit	b7e0b16354bc31521785247d7da3ad84f3829ea8 (patch)
tree	9ed2e151b59536187946efb3fe34b51507b19c56 /src/video_core/clipper.cpp
parent	Merge pull request #361 from lioncash/moreqops (diff)
parent	Rasterizer: Pre-divide vertex attributes by W (diff)
download	yuzu-b7e0b16354bc31521785247d7da3ad84f3829ea8.tar.gz yuzu-b7e0b16354bc31521785247d7da3ad84f3829ea8.tar.xz yuzu-b7e0b16354bc31521785247d7da3ad84f3829ea8.zip

diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 0bcd0b895..1744066ba 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -2,7 +2,7 @@
2	// Licensed under GPLv2 or any later version	2	// Licensed under GPLv2 or any later version
3	// Refer to the license.txt file included.	3	// Refer to the license.txt file included.
4		4
5	#include <vector>	5	#include <boost/container/static_vector.hpp>
6		6
7	#include "clipper.h"	7	#include "clipper.h"
8	#include "pica.h"	8	#include "pica.h"
@@ -91,25 +91,31 @@ static void InitScreenCoordinates(OutputVertex& vtx)
91	viewport.zscale = float24::FromRawFloat24(registers.viewport_depth_range);	91	viewport.zscale = float24::FromRawFloat24(registers.viewport_depth_range);
92	viewport.offset_z = float24::FromRawFloat24(registers.viewport_depth_far_plane);	92	viewport.offset_z = float24::FromRawFloat24(registers.viewport_depth_far_plane);
93		93
		94	float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w;
		95	vtx.color *= inv_w;
		96	vtx.tc0 *= inv_w;
		97	vtx.tc1 *= inv_w;
		98	vtx.tc2 *= inv_w;
		99	vtx.pos.w = inv_w;
		100
94	// TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not	101	// TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
95	vtx.screenpos[0] = (vtx.pos.x / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;	102	vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
96	vtx.screenpos[1] = (vtx.pos.y / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;	103	vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
97	vtx.screenpos[2] = viewport.offset_z - vtx.pos.z / vtx.pos.w * viewport.zscale;	104	vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale;
98	}	105	}
99		106
100	void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {	107	void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
101		108	using boost::container::static_vector;
102	// TODO (neobrain):	109
103	// The list of output vertices has some fixed maximum size,	110	// Clipping a planar n-gon against a plane will remove at least 1 vertex and introduces 2 at
104	// however I haven't taken the time to figure out what it is exactly.	111	// the new edge (or less in degenerate cases). As such, we can say that each clipping plane
105	// For now, we hence just assume a maximal size of 1000 vertices.	112	// introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a
106	const size_t max_vertices = 1000;	113	// fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9.
107	std::vector<OutputVertex> buffer_vertices;	114	static const size_t MAX_VERTICES = 9;
108	std::vector<OutputVertex*> output_list{ &v0, &v1, &v2 };	115	static_vector<OutputVertex, MAX_VERTICES> buffer_a = { v0, v1, v2 };
109		116	static_vector<OutputVertex, MAX_VERTICES> buffer_b;
110	// Make sure to reserve space for all vertices.	117	auto* output_list = &buffer_a;
111	// Without this, buffer reallocation would invalidate references.	118	auto* input_list = &buffer_b;
112	buffer_vertices.reserve(max_vertices);
113		119
114	// Simple implementation of the Sutherland-Hodgman clipping algorithm.	120	// Simple implementation of the Sutherland-Hodgman clipping algorithm.
115	// TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)	121	// TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
@@ -120,48 +126,45 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
120	ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),	126	ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
121	ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {	127	ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
122		128
123	const std::vector<OutputVertex*> input_list = output_list;	129	std::swap(input_list, output_list);
124	output_list.clear();	130	output_list->clear();
125		131
126	const OutputVertex* reference_vertex = input_list.back();	132	const OutputVertex* reference_vertex = &input_list->back();
127		133
128	for (const auto& vertex : input_list) {	134	for (const auto& vertex : *input_list) {
129	// NOTE: This algorithm changes vertex order in some cases!	135	// NOTE: This algorithm changes vertex order in some cases!
130	if (edge.IsInside(*vertex)) {	136	if (edge.IsInside(vertex)) {
131	if (edge.IsOutSide(*reference_vertex)) {	137	if (edge.IsOutSide(*reference_vertex)) {
132	buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));	138	output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
133	output_list.push_back(&(buffer_vertices.back()));
134	}	139	}
135		140
136	output_list.push_back(vertex);	141	output_list->push_back(vertex);
137	} else if (edge.IsInside(*reference_vertex)) {	142	} else if (edge.IsInside(*reference_vertex)) {
138	buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));	143	output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
139	output_list.push_back(&(buffer_vertices.back()));
140	}	144	}
141		145	reference_vertex = &vertex;
142	reference_vertex = vertex;
143	}	146	}
144		147
145	// Need to have at least a full triangle to continue...	148	// Need to have at least a full triangle to continue...
146	if (output_list.size() < 3)	149	if (output_list->size() < 3)
147	return;	150	return;
148	}	151	}
149		152
150	InitScreenCoordinates(*(output_list[0]));	153	InitScreenCoordinates((*output_list)[0]);
151	InitScreenCoordinates(*(output_list[1]));	154	InitScreenCoordinates((*output_list)[1]);
152		155
153	for (size_t i = 0; i < output_list.size() - 2; i ++) {	156	for (size_t i = 0; i < output_list->size() - 2; i ++) {
154	OutputVertex& vtx0 = *(output_list[0]);	157	OutputVertex& vtx0 = (*output_list)[0];
155	OutputVertex& vtx1 = *(output_list[i+1]);	158	OutputVertex& vtx1 = (*output_list)[i+1];
156	OutputVertex& vtx2 = *(output_list[i+2]);	159	OutputVertex& vtx2 = (*output_list)[i+2];
157		160
158	InitScreenCoordinates(vtx2);	161	InitScreenCoordinates(vtx2);
159		162
160	LOG_TRACE(Render_Software,	163	LOG_TRACE(Render_Software,
161	"Triangle %lu/%lu (%lu buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), "	164	"Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), "
162	"(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "	165	"(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
163	"screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",	166	"screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
164	i,output_list.size(), buffer_vertices.size(),	167	i, output_list->size(),
165	vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),	168	vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
166	vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),	169	vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
167	vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),	170	vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),