6 files changed, 105 insertions, 78 deletions
diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp
index 23d4925b8..b12e6a02b 100644
--- a/src/citra_qt/main.cpp
+++ b/src/citra_qt/main.cpp
@@ -230,12 +230,15 @@ void GMainWindow::ToggleWindowMode()
        render_window->setParent(nullptr);
        render_window->setVisible(true);
        render_window->RestoreGeometry();
+        render_window->setFocusPolicy(Qt::NoFocus);
    }
    else if (!enable && render_window->parent() == nullptr)
    {
        render_window->BackupGeometry();
        ui.horizontalLayout->addWidget(render_window);
        render_window->setVisible(true);
+        render_window->setFocusPolicy(Qt::ClickFocus);
+        render_window->setFocus();
    }
 }
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 0bcd0b895..1744066ba 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -2,7 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
-#include <vector>
+#include <boost/container/static_vector.hpp>
 #include "clipper.h"
 #include "pica.h"
@@ -91,25 +91,31 @@ static void InitScreenCoordinates(OutputVertex& vtx)
    viewport.zscale     = float24::FromRawFloat24(registers.viewport_depth_range);
    viewport.offset_z   = float24::FromRawFloat24(registers.viewport_depth_far_plane);
+    float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w;
+    vtx.color *= inv_w;
+    vtx.tc0 *= inv_w;
+    vtx.tc1 *= inv_w;
+    vtx.tc2 *= inv_w;
+    vtx.pos.w = inv_w;
    // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
-    vtx.screenpos[0] = (vtx.pos.x / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
+    vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
-    vtx.screenpos[1] = (vtx.pos.y / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
+    vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
-    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z / vtx.pos.w * viewport.zscale;
+    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale;
 }
 void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
+    using boost::container::static_vector;
-    // TODO (neobrain):
-    // The list of output vertices has some fixed maximum size,
+    // Clipping a planar n-gon against a plane will remove at least 1 vertex and introduce 2 at
-    // however I haven't taken the time to figure out what it is exactly.
+    // the new edge (or fewer in degenerate cases). As such, we can say that each clipping plane
-    // For now, we hence just assume a maximal size of 1000 vertices.
+    // introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a
-    const size_t max_vertices = 1000;
+    // fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9.
-    std::vector<OutputVertex> buffer_vertices;
+    static const size_t MAX_VERTICES = 9;
-    std::vector<OutputVertex*> output_list{ &v0, &v1, &v2 };
+    static_vector<OutputVertex, MAX_VERTICES> buffer_a = { v0, v1, v2 };
+    static_vector<OutputVertex, MAX_VERTICES> buffer_b;
-    // Make sure to reserve space for all vertices.
+    auto* output_list = &buffer_a;
-    // Without this, buffer reallocation would invalidate references.
+    auto* input_list  = &buffer_b;
-    buffer_vertices.reserve(max_vertices);
    // Simple implementation of the Sutherland-Hodgman clipping algorithm.
    // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
@@ -120,48 +126,45 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                       ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
                       ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
-        const std::vector<OutputVertex*> input_list = output_list;
+        std::swap(input_list, output_list);
-        output_list.clear();
+        output_list->clear();
-        const OutputVertex* reference_vertex = input_list.back();
+        const OutputVertex* reference_vertex = &input_list->back();
-        for (const auto& vertex : input_list) {
+        for (const auto& vertex : *input_list) {
            // NOTE: This algorithm changes vertex order in some cases!
-            if (edge.IsInside(*vertex)) {
+            if (edge.IsInside(vertex)) {
                if (edge.IsOutSide(*reference_vertex)) {
-                    buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
+                    output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
-                    output_list.push_back(&(buffer_vertices.back()));
                }
-                output_list.push_back(vertex);
+                output_list->push_back(vertex);
            } else if (edge.IsInside(*reference_vertex)) {
-                buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
+                output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
-                output_list.push_back(&(buffer_vertices.back()));
            }
+            reference_vertex = &vertex;
-            reference_vertex = vertex;
        }
        // Need to have at least a full triangle to continue...
-        if (output_list.size() < 3)
+        if (output_list->size() < 3)
            return;
    }
-    InitScreenCoordinates(*(output_list[0]));
+    InitScreenCoordinates((*output_list)[0]);
-    InitScreenCoordinates(*(output_list[1]));
+    InitScreenCoordinates((*output_list)[1]);
-    for (size_t i = 0; i < output_list.size() - 2; i ++) {
+    for (size_t i = 0; i < output_list->size() - 2; i ++) {
-        OutputVertex& vtx0 = *(output_list[0]);
+        OutputVertex& vtx0 = (*output_list)[0];
-        OutputVertex& vtx1 = *(output_list[i+1]);
+        OutputVertex& vtx1 = (*output_list)[i+1];
-        OutputVertex& vtx2 = *(output_list[i+2]);
+        OutputVertex& vtx2 = (*output_list)[i+2];
        InitScreenCoordinates(vtx2);
        LOG_TRACE(Render_Software,
-                  "Triangle %lu/%lu (%lu buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), "
+                  "Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), "
                  "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
                  "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
-                  i,output_list.size(), buffer_vertices.size(),
+                  i, output_list->size(),
                  vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
                  vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
                  vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 328386b7e..5921185a6 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -304,7 +304,6 @@ std::unique_ptr<PicaTrace> FinishPicaTracing()
 }
 const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const TextureInfo& info, bool disable_alpha) {
    // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each
    // of which is composed of four 2x2 subtiles each of which is composed of four texels.
    // Each structure is embedded into the next-bigger one in a diagonal pattern, e.g.
@@ -323,41 +322,39 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
    // 02 03 06 07 18 19 22 23
    // 00 01 04 05 16 17 20 21
-    // TODO(neobrain): Not sure if this swizzling pattern is used for all textures.
+    const unsigned int block_width = 8;
-    // To be flexible in case different but similar patterns are used, we keep this
+    const unsigned int block_height = 8;
-    // somewhat inefficient code around for now.
-    int texel_index_within_tile = 0;
+    const unsigned int coarse_x = x & ~7;
-    for (int block_size_index = 0; block_size_index < 3; ++block_size_index) {
+    const unsigned int coarse_y = y & ~7;
-        int sub_tile_width = 1 << block_size_index;
-        int sub_tile_height = 1 << block_size_index;
-        int sub_tile_index = (x & sub_tile_width) << block_size_index;
-        sub_tile_index += 2 * ((y & sub_tile_height) << block_size_index);
-        texel_index_within_tile += sub_tile_index;
-    }
-    const int block_width = 8;
+    // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
-    const int block_height = 8;
+    // arranged in a Z-order curve. More details on the bit manipulation at:
+    // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+    unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
+    i = (i ^ (i << 2)) & 0x1313;              // ---2 --10
+    i = (i ^ (i << 1)) & 0x1515;              // ---2 -1-0
+    i = (i | (i >> 7)) & 0x3F;
-    int coarse_x = (x / block_width) * block_width;
+    source += coarse_y * info.stride;
-    int coarse_y = (y / block_height) * block_height;
+    const unsigned int offset = coarse_x * block_height + i;
    switch (info.format) {
    case Regs::TextureFormat::RGBA8:
    {
-        const u8* source_ptr = source + coarse_x * block_height * 4 + coarse_y * info.stride + texel_index_within_tile * 4;
+        const u8* source_ptr = source + offset * 4;
        return { source_ptr[3], source_ptr[2], source_ptr[1], disable_alpha ? (u8)255 : source_ptr[0] };
    }
    case Regs::TextureFormat::RGB8:
    {
-        const u8* source_ptr = source + coarse_x * block_height * 3 + coarse_y * info.stride + texel_index_within_tile * 3;
+        const u8* source_ptr = source + offset * 3;
        return { source_ptr[2], source_ptr[1], source_ptr[0], 255 };
    }
    case Regs::TextureFormat::RGBA5551:
    {
-        const u16 source_ptr = *(const u16*)(source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2);
+        const u16 source_ptr = *(const u16*)(source + offset * 2);
        u8 r = (source_ptr >> 11) & 0x1F;
        u8 g = ((source_ptr) >> 6) & 0x1F;
        u8 b = (source_ptr >> 1) & 0x1F;
@@ -367,7 +364,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
    case Regs::TextureFormat::RGB565:
    {
-        const u16 source_ptr = *(const u16*)(source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2);
+        const u16 source_ptr = *(const u16*)(source + offset * 2);
        u8 r = (source_ptr >> 11) & 0x1F;
        u8 g = ((source_ptr) >> 5) & 0x3F;
        u8 b = (source_ptr) & 0x1F;
@@ -376,7 +373,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
    case Regs::TextureFormat::RGBA4:
    {
-        const u8* source_ptr = source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2;
+        const u8* source_ptr = source + offset * 2;
        u8 r = source_ptr[1] >> 4;
        u8 g = source_ptr[1] & 0xFF;
        u8 b = source_ptr[0] >> 4;
@@ -390,7 +387,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
    case Regs::TextureFormat::IA8:
    {
-        const u8* source_ptr = source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2;
+        const u8* source_ptr = source + offset * 2;
        // TODO: component order not verified
@@ -404,13 +401,13 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
    case Regs::TextureFormat::I8:
    {
-        const u8* source_ptr = source + coarse_x * block_height + coarse_y * info.stride + texel_index_within_tile;
+        const u8* source_ptr = source + offset;
        return { *source_ptr, *source_ptr, *source_ptr, 255 };
    }
    case Regs::TextureFormat::A8:
    {
-        const u8* source_ptr = source + coarse_x * block_height + coarse_y * info.stride + texel_index_within_tile;
+        const u8* source_ptr = source + offset;
        if (disable_alpha) {
            return { *source_ptr, *source_ptr, *source_ptr, 255 };
@@ -421,7 +418,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
    case Regs::TextureFormat::IA4:
    {
-        const u8* source_ptr = source + coarse_x * block_height / 2 + coarse_y * info.stride + texel_index_within_tile / 2;
+        const u8* source_ptr = source + offset / 2;
        // TODO: component order not verified
@@ -440,7 +437,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
    case Regs::TextureFormat::A4:
    {
-        const u8* source_ptr = source + coarse_x * block_height / 2 + coarse_y * info.stride + texel_index_within_tile / 2;
+        const u8* source_ptr = source + offset / 2;
        // TODO: component order not verified
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 89d97e4e9..38bac748c 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -757,6 +757,26 @@ struct float24 {
        return float24::FromFloat32(ToFloat32() - flt.ToFloat32());
    }
+    float24& operator *= (const float24& flt) {
+        value *= flt.ToFloat32();
+        return *this;
+    }
+    float24& operator /= (const float24& flt) {
+        value /= flt.ToFloat32();
+        return *this;
+    }
+    float24& operator += (const float24& flt) {
+        value += flt.ToFloat32();
+        return *this;
+    }
+    float24& operator -= (const float24& flt) {
+        value -= flt.ToFloat32();
+        return *this;
+    }
    float24 operator - () const {
        return float24::FromFloat32(-ToFloat32());
    }
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index df1f88c79..a80148872 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -106,6 +106,11 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
    int bias1 = IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0;
    int bias2 = IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;
+    auto w_inverse = Math::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w);
+    auto textures = registers.GetTextures();
+    auto tev_stages = registers.GetTevStages();
    // TODO: Not sure if looping through x first might be faster
    for (u16 y = min_y; y < max_y; y += 0x10) {
        for (u16 x = min_x; x < max_x; x += 0x10) {
@@ -129,6 +134,11 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
            if (w0 < 0 || w1 < 0 || w2 < 0)
                continue;
+            auto baricentric_coordinates = Math::MakeVec(float24::FromFloat32(static_cast<float>(w0)),
+                                                float24::FromFloat32(static_cast<float>(w1)),
+                                                float24::FromFloat32(static_cast<float>(w2)));
+            float24 interpolated_w_inverse = float24::FromFloat32(1.0f) / Math::Dot(w_inverse, baricentric_coordinates);
            // Perspective correct attribute interpolation:
            // Attribute values cannot be calculated by simple linear interpolation since
            // they are not linear in screen space. For example, when interpolating a
@@ -145,19 +155,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
            //
            // The generalization to three vertices is straightforward in baricentric coordinates.
            auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) {
-                auto attr_over_w = Math::MakeVec(attr0 / v0.pos.w,
+                auto attr_over_w = Math::MakeVec(attr0, attr1, attr2);
-                                                 attr1 / v1.pos.w,
-                                                 attr2 / v2.pos.w);
-                auto w_inverse   = Math::MakeVec(float24::FromFloat32(1.f) / v0.pos.w,
-                                                 float24::FromFloat32(1.f) / v1.pos.w,
-                                                 float24::FromFloat32(1.f) / v2.pos.w);
-                auto baricentric_coordinates = Math::MakeVec(float24::FromFloat32(static_cast<float>(w0)),
-                                                             float24::FromFloat32(static_cast<float>(w1)),
-                                                             float24::FromFloat32(static_cast<float>(w2)));
                float24 interpolated_attr_over_w = Math::Dot(attr_over_w, baricentric_coordinates);
-                float24 interpolated_w_inverse   = Math::Dot(w_inverse,   baricentric_coordinates);
+                return interpolated_attr_over_w * interpolated_w_inverse;
-                return interpolated_attr_over_w / interpolated_w_inverse;
            };
            Math::Vec4<u8> primary_color{
@@ -177,7 +177,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
            Math::Vec4<u8> texture_color[3]{};
            for (int i = 0; i < 3; ++i) {
-                auto texture = registers.GetTextures()[i];
+                const auto& texture = textures[i];
                if (!texture.enabled)
                    continue;
@@ -219,7 +219,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
            // with some basic arithmetic. Alpha combiners can be configured separately but work
            // analogously.
            Math::Vec4<u8> combiner_output;
-            for (auto tev_stage : registers.GetTevStages()) {
+            for (const auto& tev_stage : tev_stages) {
                using Source = Regs::TevStageConfig::Source;
                using ColorModifier = Regs::TevStageConfig::ColorModifier;
                using AlphaModifier = Regs::TevStageConfig::AlphaModifier;
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index e31bc3bc7..bed5081a0 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -469,6 +469,10 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes)
    // Setup output register table
    OutputVertex ret;
+    // Zero output so that attributes which aren't output won't have denormals in them, which will
+    // slow us down later.
+    memset(&ret, 0, sizeof(ret));
    for (int i = 0; i < 7; ++i) {
        const auto& output_register_map = registers.vs_output_attributes[i];