8 files changed, 143 insertions, 26 deletions
diff --git a/externals/nihstro b/externals/nihstro
-Subproject 4a78588b308564f7ebae193e0ae00d9a0d5741d
+Subproject 81f1804a43f625e3a1a20752c0db70a41341038
diff --git a/src/core/core.cpp b/src/core/core.cpp
index bb2ed7a92..b5c258230 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -61,10 +61,6 @@ int Init() {
    g_sys_core = new ARM_DynCom(USER32MODE);
    g_app_core = new ARM_DynCom(USER32MODE);
-    // TODO: Whenever TLS is implemented, this should contain
-    // the address of the 0x200-byte TLS
-    g_app_core->SetCP15Register(CP15_THREAD_URO, Memory::TLS_AREA_VADDR);
    LOG_DEBUG(Core, "Initialized OK");
    return 0;
 }
diff --git a/src/core/hle/kernel/session.h b/src/core/hle/kernel/session.h
index 0fd18148a..8c3886ffd 100644
--- a/src/core/hle/kernel/session.h
+++ b/src/core/hle/kernel/session.h
@@ -5,6 +5,7 @@
 #pragma once
 #include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/thread.h"
 #include "core/mem_map.h"
 namespace Kernel {
@@ -12,12 +13,15 @@ namespace Kernel {
 static const int kCommandHeaderOffset = 0x80; ///< Offset into command buffer of header
 /**
- * Returns a pointer to the command buffer in kernel memory
+ * Returns a pointer to the command buffer in the current thread's TLS
+ * TODO(Subv): This is not entirely correct, the command buffer should be copied from
+ * the thread's TLS to an intermediate buffer in kernel memory, and then copied again to
+ * the service handler process' memory.
 * @param offset Optional offset into command buffer
 * @return Pointer to command buffer
 */
-inline static u32* GetCommandBuffer(const int offset=0) {
+inline static u32* GetCommandBuffer(const int offset = 0) {
-    return (u32*)Memory::GetPointer(Memory::TLS_AREA_VADDR + kCommandHeaderOffset + offset);
+    return (u32*)Memory::GetPointer(GetCurrentThread()->GetTLSAddress() + kCommandHeaderOffset + offset);
 }
 /**
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index 0a3fd7cb1..5de8f9a73 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -197,6 +197,7 @@ static void SwitchContext(Thread* new_thread) {
        new_thread->current_priority = new_thread->nominal_priority;
        Core::g_app_core->LoadContext(new_thread->context);
+        Core::g_app_core->SetCP15Register(CP15_THREAD_URO, new_thread->GetTLSAddress());
    } else {
        current_thread = nullptr;
    }
@@ -402,6 +403,12 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
    thread->name = std::move(name);
    thread->callback_handle = wakeup_callback_handle_table.Create(thread).MoveFrom();
+    VAddr tls_address = Memory::TLS_AREA_VADDR + (thread->thread_id - 1) * 0x200;
+    ASSERT_MSG(tls_address < Memory::TLS_AREA_VADDR_END, "Too many threads");
+    thread->tls_address = tls_address;
    // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used
    // to initialize the context
    Core::g_app_core->ResetContext(thread->context, stack_top, entry_point, arg);
@@ -495,6 +502,10 @@ void Thread::SetWaitSynchronizationOutput(s32 output) {
    context.cpu_registers[1] = output;
 }
+VAddr Thread::GetTLSAddress() const {
+    return tls_address;
+}
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 void ThreadingInit() {
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index 9958b16e6..6891c8c2f 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -135,6 +135,12 @@ public:
     */
    void Stop();
+    /*
+     * Returns the Thread Local Storage address of this thread
+     * @returns VAddr of the thread's TLS
+     */
+    VAddr GetTLSAddress() const;
    Core::ThreadContext context;
    u32 thread_id;
@@ -150,6 +156,8 @@ public:
    s32 processor_id;
+    VAddr tls_address; ///< Address of the Thread Local Storage of the thread
    /// Mutexes currently held by this thread, which will be released when it exits.
    boost::container::flat_set<SharedPtr<Mutex>> held_mutexes;
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index e4a91058c..5e169ff69 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -226,7 +226,8 @@ struct Regs {
            Texture1               = 0x4,
            Texture2               = 0x5,
            Texture3               = 0x6,
-            // 0x7-0xc = primary color??
+            PreviousBuffer         = 0xd,
            Constant               = 0xe,
            Previous               = 0xf,
        };
@@ -299,7 +300,18 @@ struct Regs {
            BitField<24, 8, u32> const_a;
        };
-        INSERT_PADDING_WORDS(0x1);
+        union {
+            BitField< 0, 2, u32> color_scale;
+            BitField<16, 2, u32> alpha_scale;
+        };
+        inline unsigned GetColorMultiplier() const {
+            return (color_scale < 3) ? (1 << color_scale) : 1;
+        }
+        inline unsigned GetAlphaMultiplier() const {
+            return (alpha_scale < 3) ? (1 << alpha_scale) : 1;
+        }
    };
    TevStageConfig tev_stage0;
@@ -309,11 +321,36 @@ struct Regs {
    TevStageConfig tev_stage2;
    INSERT_PADDING_WORDS(0x3);
    TevStageConfig tev_stage3;
-    INSERT_PADDING_WORDS(0x13);
+    INSERT_PADDING_WORDS(0x3);
+    union {
+        // Tev stages 0-3 write their output to the combiner buffer if the corresponding bit in
+        // these masks is set
+        BitField< 8, 4, u32> update_mask_rgb;
+        BitField<12, 4, u32> update_mask_a;
+        bool TevStageUpdatesCombinerBufferColor(unsigned stage_index) const {
+            return (stage_index < 4) && (update_mask_rgb & (1 << stage_index));
+        }
+        bool TevStageUpdatesCombinerBufferAlpha(unsigned stage_index) const {
+            return (stage_index < 4) && (update_mask_a & (1 << stage_index));
+        }
+    } tev_combiner_buffer_input;
+    
+    INSERT_PADDING_WORDS(0xf);
    TevStageConfig tev_stage4;
    INSERT_PADDING_WORDS(0x3);
    TevStageConfig tev_stage5;
-    INSERT_PADDING_WORDS(0x3);
+    union {
+        BitField< 0, 8, u32> r;
+        BitField< 8, 8, u32> g;
+        BitField<16, 8, u32> b;
+        BitField<24, 8, u32> a;
+    } tev_combiner_buffer_color;
+    INSERT_PADDING_WORDS(0x2);
    const std::array<Regs::TevStageConfig,6> GetTevStages() const {
        return { tev_stage0, tev_stage1,
@@ -426,9 +463,7 @@ struct Regs {
        D24S8  = 3
    };
-    /*
+    // Returns the number of bytes in the specified depth format
-     * Returns the number of bytes in the specified depth format
-     */
    static u32 BytesPerDepthPixel(DepthFormat format) {
        switch (format) {
        case DepthFormat::D16:
@@ -443,6 +478,20 @@ struct Regs {
        }
    }
+    // Returns the number of bits per depth component of the specified depth format
+    static u32 DepthBitsPerPixel(DepthFormat format) {
+        switch (format) {
+        case DepthFormat::D16:
+            return 16;
+        case DepthFormat::D24:
+        case DepthFormat::D24S8:
+            return 24;
+        default:
+            LOG_CRITICAL(HW_GPU, "Unknown depth format %u", format);
+            UNIMPLEMENTED();
+        }
+    }
    struct {
        // Components are laid out in reverse byte order, most significant bits first.
        enum ColorFormat : u32 {
@@ -784,8 +833,10 @@ struct Regs {
        ADD_FIELD(tev_stage1);
        ADD_FIELD(tev_stage2);
        ADD_FIELD(tev_stage3);
+        ADD_FIELD(tev_combiner_buffer_input);
        ADD_FIELD(tev_stage4);
        ADD_FIELD(tev_stage5);
+        ADD_FIELD(tev_combiner_buffer_color);
        ADD_FIELD(output_merger);
        ADD_FIELD(framebuffer);
        ADD_FIELD(vertex_attributes);
@@ -859,8 +910,10 @@ ASSERT_REG_POSITION(tev_stage0, 0xc0);
 ASSERT_REG_POSITION(tev_stage1, 0xc8);
 ASSERT_REG_POSITION(tev_stage2, 0xd0);
 ASSERT_REG_POSITION(tev_stage3, 0xd8);
+ASSERT_REG_POSITION(tev_combiner_buffer_input, 0xe0);
 ASSERT_REG_POSITION(tev_stage4, 0xf0);
 ASSERT_REG_POSITION(tev_stage5, 0xf8);
+ASSERT_REG_POSITION(tev_combiner_buffer_color, 0xfd);
 ASSERT_REG_POSITION(output_merger, 0x100);
 ASSERT_REG_POSITION(framebuffer, 0x110);
 ASSERT_REG_POSITION(vertex_attributes, 0x200);
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 3b3fef484..46a326bb4 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -90,7 +90,7 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
        UNIMPLEMENTED();
    }
-    return {};
+    return {0, 0, 0, 0};
 }
 static u32 GetDepth(int x, int y) {
@@ -376,7 +376,13 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
            // with some basic arithmetic. Alpha combiners can be configured separately but work
            // analogously.
            Math::Vec4<u8> combiner_output;
-            for (const auto& tev_stage : tev_stages) {
+            Math::Vec4<u8> combiner_buffer = {
+                registers.tev_combiner_buffer_color.r, registers.tev_combiner_buffer_color.g,
+                registers.tev_combiner_buffer_color.b, registers.tev_combiner_buffer_color.a
+            };
+            for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size(); ++tev_stage_index) {
+                const auto& tev_stage = tev_stages[tev_stage_index];
                using Source = Regs::TevStageConfig::Source;
                using ColorModifier = Regs::TevStageConfig::ColorModifier;
                using AlphaModifier = Regs::TevStageConfig::AlphaModifier;
@@ -398,6 +404,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                    case Source::Texture2:
                        return texture_color[2];
+                    case Source::PreviousBuffer:
+                        return combiner_buffer;
                    case Source::Constant:
                        return {tev_stage.const_r, tev_stage.const_g, tev_stage.const_b, tev_stage.const_a};
@@ -407,7 +416,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                    default:
                        LOG_ERROR(HW_GPU, "Unknown color combiner source %d\n", (int)source);
                        UNIMPLEMENTED();
-                        return {};
+                        return {0, 0, 0, 0};
                    }
                };
@@ -490,6 +499,16 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                        return result.Cast<u8>();
                    }
+                    case Operation::AddSigned:
+                    {
+                        // TODO(bunnei): Verify that the color conversion from (float) 0.5f to (byte) 128 is correct
+                        auto result = input[0].Cast<int>() + input[1].Cast<int>() - Math::MakeVec<int>(128, 128, 128);
+                        result.r() = MathUtil::Clamp<int>(result.r(), 0, 255);
+                        result.g() = MathUtil::Clamp<int>(result.g(), 0, 255);
+                        result.b() = MathUtil::Clamp<int>(result.b(), 0, 255);
+                        return result.Cast<u8>();
+                    }
                    case Operation::Lerp:
                        return ((input[0] * input[2] + input[1] * (Math::MakeVec<u8>(255, 255, 255) - input[2]).Cast<u8>()) / 255).Cast<u8>();
@@ -524,7 +543,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                    default:
                        LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
                        UNIMPLEMENTED();
-                        return {};
+                        return {0, 0, 0};
                    }
                };
@@ -578,7 +597,20 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                };
                auto alpha_output = AlphaCombine(tev_stage.alpha_op, alpha_result);
-                combiner_output = Math::MakeVec(color_output, alpha_output);
+                combiner_output[0] = std::min((unsigned)255, color_output.r() * tev_stage.GetColorMultiplier());
+                combiner_output[1] = std::min((unsigned)255, color_output.g() * tev_stage.GetColorMultiplier());
+                combiner_output[2] = std::min((unsigned)255, color_output.b() * tev_stage.GetColorMultiplier());
+                combiner_output[3] = std::min((unsigned)255, alpha_output * tev_stage.GetAlphaMultiplier());
+                if (registers.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferColor(tev_stage_index)) {
+                    combiner_buffer.r() = combiner_output.r();
+                    combiner_buffer.g() = combiner_output.g();
+                    combiner_buffer.b() = combiner_output.b();
+                }
+                if (registers.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferAlpha(tev_stage_index)) {
+                    combiner_buffer.a() = combiner_output.a();
+                }
            }
            if (registers.output_merger.alpha_test.enable) {
@@ -624,9 +656,10 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
            // TODO: Does depth indeed only get written even if depth testing is enabled?
            if (registers.output_merger.depth_test_enable) {
-                u16 z = (u16)((v0.screenpos[2].ToFloat32() * w0 +
+                unsigned num_bits = Pica::Regs::DepthBitsPerPixel(registers.framebuffer.depth_format);
-                            v1.screenpos[2].ToFloat32() * w1 +
+                u32 z = (u32)((v0.screenpos[2].ToFloat32() * w0 +
-                            v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
+                               v1.screenpos[2].ToFloat32() * w1 +
+                               v2.screenpos[2].ToFloat32() * w2) * ((1 << num_bits) - 1) / wsum);
                u32 ref_z = GetDepth(x >> 4, y >> 4);
                bool pass = false;
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 51f4e58bf..885b7de59 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -235,6 +235,15 @@ static void ProcessShaderCode(VertexShaderState& state) {
                break;
            }
+            case OpCode::Id::FLR:
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+                    dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32()));
+                }
+                break;
            case OpCode::Id::MAX:
                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
@@ -366,12 +375,15 @@ static void ProcessShaderCode(VertexShaderState& state) {
        case OpCode::Type::MultiplyAdd:
        {
-            if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) {
+            if ((instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) || 
+                (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI)) {
                const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.mad.operand_desc_id];
-                const float24* src1_ = LookupSourceRegister(instr.mad.src1);
+                bool is_inverted = (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI);
-                const float24* src2_ = LookupSourceRegister(instr.mad.src2);
-                const float24* src3_ = LookupSourceRegister(instr.mad.src3);
+                const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted));
+                const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted));
+                const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted));
                const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
                const bool negate_src2 = ((bool)swizzle.negate_src2 != false);

diff --git a/externals/nihstro b/externals/nihstro
	Subproject 4a78588b308564f7ebae193e0ae00d9a0d5741d		Subproject 81f1804a43f625e3a1a20752c0db70a41341038


diff --git a/src/core/core.cpp b/src/core/core.cpp index bb2ed7a92..b5c258230 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp
@@ -61,10 +61,6 @@ int Init() {
61	g_sys_core = new ARM_DynCom(USER32MODE);	61	g_sys_core = new ARM_DynCom(USER32MODE);
62	g_app_core = new ARM_DynCom(USER32MODE);	62	g_app_core = new ARM_DynCom(USER32MODE);
63		63
64	// TODO: Whenever TLS is implemented, this should contain
65	// the address of the 0x200-byte TLS
66	g_app_core->SetCP15Register(CP15_THREAD_URO, Memory::TLS_AREA_VADDR);
67
68	LOG_DEBUG(Core, "Initialized OK");	64	LOG_DEBUG(Core, "Initialized OK");
69	return 0;	65	return 0;
70	}	66	}


diff --git a/src/core/hle/kernel/session.h b/src/core/hle/kernel/session.h index 0fd18148a..8c3886ffd 100644 --- a/src/core/hle/kernel/session.h +++ b/src/core/hle/kernel/session.h
@@ -5,6 +5,7 @@
5	#pragma once	5	#pragma once
6		6
7	#include "core/hle/kernel/kernel.h"	7	#include "core/hle/kernel/kernel.h"
		8	#include "core/hle/kernel/thread.h"
8	#include "core/mem_map.h"	9	#include "core/mem_map.h"
9		10
10	namespace Kernel {	11	namespace Kernel {
@@ -12,12 +13,15 @@ namespace Kernel {
12	static const int kCommandHeaderOffset = 0x80; ///< Offset into command buffer of header	13	static const int kCommandHeaderOffset = 0x80; ///< Offset into command buffer of header
13		14
14	/**	15	/**
15	* Returns a pointer to the command buffer in kernel memory	16	* Returns a pointer to the command buffer in the current thread's TLS
		17	* TODO(Subv): This is not entirely correct, the command buffer should be copied from
		18	* the thread's TLS to an intermediate buffer in kernel memory, and then copied again to
		19	* the service handler process' memory.
16	* @param offset Optional offset into command buffer	20	* @param offset Optional offset into command buffer
17	* @return Pointer to command buffer	21	* @return Pointer to command buffer
18	*/	22	*/
19	inline static u32* GetCommandBuffer(const int offset=0) {	23	inline static u32* GetCommandBuffer(const int offset = 0) {
20	return (u32*)Memory::GetPointer(Memory::TLS_AREA_VADDR + kCommandHeaderOffset + offset);	24	return (u32*)Memory::GetPointer(GetCurrentThread()->GetTLSAddress() + kCommandHeaderOffset + offset);
21	}	25	}
22		26
23	/**	27	/**


diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp index 0a3fd7cb1..5de8f9a73 100644 --- a/src/core/hle/kernel/thread.cpp +++ b/src/core/hle/kernel/thread.cpp
@@ -197,6 +197,7 @@ static void SwitchContext(Thread* new_thread) {
197	new_thread->current_priority = new_thread->nominal_priority;	197	new_thread->current_priority = new_thread->nominal_priority;
198		198
199	Core::g_app_core->LoadContext(new_thread->context);	199	Core::g_app_core->LoadContext(new_thread->context);
		200	Core::g_app_core->SetCP15Register(CP15_THREAD_URO, new_thread->GetTLSAddress());
200	} else {	201	} else {
201	current_thread = nullptr;	202	current_thread = nullptr;
202	}	203	}
@@ -402,6 +403,12 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
402	thread->name = std::move(name);	403	thread->name = std::move(name);
403	thread->callback_handle = wakeup_callback_handle_table.Create(thread).MoveFrom();	404	thread->callback_handle = wakeup_callback_handle_table.Create(thread).MoveFrom();
404		405
		406	VAddr tls_address = Memory::TLS_AREA_VADDR + (thread->thread_id - 1) * 0x200;
		407
		408	ASSERT_MSG(tls_address < Memory::TLS_AREA_VADDR_END, "Too many threads");
		409
		410	thread->tls_address = tls_address;
		411
405	// TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used	412	// TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used
406	// to initialize the context	413	// to initialize the context
407	Core::g_app_core->ResetContext(thread->context, stack_top, entry_point, arg);	414	Core::g_app_core->ResetContext(thread->context, stack_top, entry_point, arg);
@@ -495,6 +502,10 @@ void Thread::SetWaitSynchronizationOutput(s32 output) {
495	context.cpu_registers[1] = output;	502	context.cpu_registers[1] = output;
496	}	503	}
497		504
		505	VAddr Thread::GetTLSAddress() const {
		506	return tls_address;
		507	}
		508
498	////////////////////////////////////////////////////////////////////////////////////////////////////	509	////////////////////////////////////////////////////////////////////////////////////////////////////
499		510
500	void ThreadingInit() {	511	void ThreadingInit() {


diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h index 9958b16e6..6891c8c2f 100644 --- a/src/core/hle/kernel/thread.h +++ b/src/core/hle/kernel/thread.h
@@ -135,6 +135,12 @@ public:
135	*/	135	*/
136	void Stop();	136	void Stop();
137		137
		138	/*
		139	* Returns the Thread Local Storage address of this thread
		140	* @returns VAddr of the thread's TLS
		141	*/
		142	VAddr GetTLSAddress() const;
		143
138	Core::ThreadContext context;	144	Core::ThreadContext context;
139		145
140	u32 thread_id;	146	u32 thread_id;
@@ -150,6 +156,8 @@ public:
150		156
151	s32 processor_id;	157	s32 processor_id;
152		158
		159	VAddr tls_address; ///< Address of the Thread Local Storage of the thread
		160
153	/// Mutexes currently held by this thread, which will be released when it exits.	161	/// Mutexes currently held by this thread, which will be released when it exits.
154	boost::container::flat_set<SharedPtr<Mutex>> held_mutexes;	162	boost::container::flat_set<SharedPtr<Mutex>> held_mutexes;
155		163


diff --git a/src/video_core/pica.h b/src/video_core/pica.h index e4a91058c..5e169ff69 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h
@@ -226,7 +226,8 @@ struct Regs {
226	Texture1 = 0x4,	226	Texture1 = 0x4,
227	Texture2 = 0x5,	227	Texture2 = 0x5,
228	Texture3 = 0x6,	228	Texture3 = 0x6,
229	// 0x7-0xc = primary color??	229
		230	PreviousBuffer = 0xd,
230	Constant = 0xe,	231	Constant = 0xe,
231	Previous = 0xf,	232	Previous = 0xf,
232	};	233	};
@@ -299,7 +300,18 @@ struct Regs {
299	BitField<24, 8, u32> const_a;	300	BitField<24, 8, u32> const_a;
300	};	301	};
301		302
302	INSERT_PADDING_WORDS(0x1);	303	union {
		304	BitField< 0, 2, u32> color_scale;
		305	BitField<16, 2, u32> alpha_scale;
		306	};
		307
		308	inline unsigned GetColorMultiplier() const {
		309	return (color_scale < 3) ? (1 << color_scale) : 1;
		310	}
		311
		312	inline unsigned GetAlphaMultiplier() const {
		313	return (alpha_scale < 3) ? (1 << alpha_scale) : 1;
		314	}
303	};	315	};
304		316
305	TevStageConfig tev_stage0;	317	TevStageConfig tev_stage0;
@@ -309,11 +321,36 @@ struct Regs {
309	TevStageConfig tev_stage2;	321	TevStageConfig tev_stage2;
310	INSERT_PADDING_WORDS(0x3);	322	INSERT_PADDING_WORDS(0x3);
311	TevStageConfig tev_stage3;	323	TevStageConfig tev_stage3;
312	INSERT_PADDING_WORDS(0x13);	324	INSERT_PADDING_WORDS(0x3);
		325
		326	union {
		327	// Tev stages 0-3 write their output to the combiner buffer if the corresponding bit in
		328	// these masks is set
		329	BitField< 8, 4, u32> update_mask_rgb;
		330	BitField<12, 4, u32> update_mask_a;
		331
		332	bool TevStageUpdatesCombinerBufferColor(unsigned stage_index) const {
		333	return (stage_index < 4) && (update_mask_rgb & (1 << stage_index));
		334	}
		335
		336	bool TevStageUpdatesCombinerBufferAlpha(unsigned stage_index) const {
		337	return (stage_index < 4) && (update_mask_a & (1 << stage_index));
		338	}
		339	} tev_combiner_buffer_input;
		340
		341	INSERT_PADDING_WORDS(0xf);
313	TevStageConfig tev_stage4;	342	TevStageConfig tev_stage4;
314	INSERT_PADDING_WORDS(0x3);	343	INSERT_PADDING_WORDS(0x3);
315	TevStageConfig tev_stage5;	344	TevStageConfig tev_stage5;
316	INSERT_PADDING_WORDS(0x3);	345
		346	union {
		347	BitField< 0, 8, u32> r;
		348	BitField< 8, 8, u32> g;
		349	BitField<16, 8, u32> b;
		350	BitField<24, 8, u32> a;
		351	} tev_combiner_buffer_color;
		352
		353	INSERT_PADDING_WORDS(0x2);
317		354
318	const std::array<Regs::TevStageConfig,6> GetTevStages() const {	355	const std::array<Regs::TevStageConfig,6> GetTevStages() const {
319	return { tev_stage0, tev_stage1,	356	return { tev_stage0, tev_stage1,
@@ -426,9 +463,7 @@ struct Regs {
426	D24S8 = 3	463	D24S8 = 3
427	};	464	};
428		465
429	/*	466	// Returns the number of bytes in the specified depth format
430	* Returns the number of bytes in the specified depth format
431	*/
432	static u32 BytesPerDepthPixel(DepthFormat format) {	467	static u32 BytesPerDepthPixel(DepthFormat format) {
433	switch (format) {	468	switch (format) {
434	case DepthFormat::D16:	469	case DepthFormat::D16:
@@ -443,6 +478,20 @@ struct Regs {
443	}	478	}
444	}	479	}
445		480
		481	// Returns the number of bits per depth component of the specified depth format
		482	static u32 DepthBitsPerPixel(DepthFormat format) {
		483	switch (format) {
		484	case DepthFormat::D16:
		485	return 16;
		486	case DepthFormat::D24:
		487	case DepthFormat::D24S8:
		488	return 24;
		489	default:
		490	LOG_CRITICAL(HW_GPU, "Unknown depth format %u", format);
		491	UNIMPLEMENTED();
		492	}
		493	}
		494
446	struct {	495	struct {
447	// Components are laid out in reverse byte order, most significant bits first.	496	// Components are laid out in reverse byte order, most significant bits first.
448	enum ColorFormat : u32 {	497	enum ColorFormat : u32 {
@@ -784,8 +833,10 @@ struct Regs {
784	ADD_FIELD(tev_stage1);	833	ADD_FIELD(tev_stage1);
785	ADD_FIELD(tev_stage2);	834	ADD_FIELD(tev_stage2);
786	ADD_FIELD(tev_stage3);	835	ADD_FIELD(tev_stage3);
		836	ADD_FIELD(tev_combiner_buffer_input);
787	ADD_FIELD(tev_stage4);	837	ADD_FIELD(tev_stage4);
788	ADD_FIELD(tev_stage5);	838	ADD_FIELD(tev_stage5);
		839	ADD_FIELD(tev_combiner_buffer_color);
789	ADD_FIELD(output_merger);	840	ADD_FIELD(output_merger);
790	ADD_FIELD(framebuffer);	841	ADD_FIELD(framebuffer);
791	ADD_FIELD(vertex_attributes);	842	ADD_FIELD(vertex_attributes);
@@ -859,8 +910,10 @@ ASSERT_REG_POSITION(tev_stage0, 0xc0);
859	ASSERT_REG_POSITION(tev_stage1, 0xc8);	910	ASSERT_REG_POSITION(tev_stage1, 0xc8);
860	ASSERT_REG_POSITION(tev_stage2, 0xd0);	911	ASSERT_REG_POSITION(tev_stage2, 0xd0);
861	ASSERT_REG_POSITION(tev_stage3, 0xd8);	912	ASSERT_REG_POSITION(tev_stage3, 0xd8);
		913	ASSERT_REG_POSITION(tev_combiner_buffer_input, 0xe0);
862	ASSERT_REG_POSITION(tev_stage4, 0xf0);	914	ASSERT_REG_POSITION(tev_stage4, 0xf0);
863	ASSERT_REG_POSITION(tev_stage5, 0xf8);	915	ASSERT_REG_POSITION(tev_stage5, 0xf8);
		916	ASSERT_REG_POSITION(tev_combiner_buffer_color, 0xfd);
864	ASSERT_REG_POSITION(output_merger, 0x100);	917	ASSERT_REG_POSITION(output_merger, 0x100);
865	ASSERT_REG_POSITION(framebuffer, 0x110);	918	ASSERT_REG_POSITION(framebuffer, 0x110);
866	ASSERT_REG_POSITION(vertex_attributes, 0x200);	919	ASSERT_REG_POSITION(vertex_attributes, 0x200);


diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index 3b3fef484..46a326bb4 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp
@@ -90,7 +90,7 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
90	UNIMPLEMENTED();	90	UNIMPLEMENTED();
91	}	91	}
92		92
93	return {};	93	return {0, 0, 0, 0};
94	}	94	}
95		95
96	static u32 GetDepth(int x, int y) {	96	static u32 GetDepth(int x, int y) {
@@ -376,7 +376,13 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
376	// with some basic arithmetic. Alpha combiners can be configured separately but work	376	// with some basic arithmetic. Alpha combiners can be configured separately but work
377	// analogously.	377	// analogously.
378	Math::Vec4<u8> combiner_output;	378	Math::Vec4<u8> combiner_output;
379	for (const auto& tev_stage : tev_stages) {	379	Math::Vec4<u8> combiner_buffer = {
		380	registers.tev_combiner_buffer_color.r, registers.tev_combiner_buffer_color.g,
		381	registers.tev_combiner_buffer_color.b, registers.tev_combiner_buffer_color.a
		382	};
		383
		384	for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size(); ++tev_stage_index) {
		385	const auto& tev_stage = tev_stages[tev_stage_index];
380	using Source = Regs::TevStageConfig::Source;	386	using Source = Regs::TevStageConfig::Source;
381	using ColorModifier = Regs::TevStageConfig::ColorModifier;	387	using ColorModifier = Regs::TevStageConfig::ColorModifier;
382	using AlphaModifier = Regs::TevStageConfig::AlphaModifier;	388	using AlphaModifier = Regs::TevStageConfig::AlphaModifier;
@@ -398,6 +404,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
398	case Source::Texture2:	404	case Source::Texture2:
399	return texture_color[2];	405	return texture_color[2];
400		406
		407	case Source::PreviousBuffer:
		408	return combiner_buffer;
		409
401	case Source::Constant:	410	case Source::Constant:
402	return {tev_stage.const_r, tev_stage.const_g, tev_stage.const_b, tev_stage.const_a};	411	return {tev_stage.const_r, tev_stage.const_g, tev_stage.const_b, tev_stage.const_a};
403		412
@@ -407,7 +416,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
407	default:	416	default:
408	LOG_ERROR(HW_GPU, "Unknown color combiner source %d\n", (int)source);	417	LOG_ERROR(HW_GPU, "Unknown color combiner source %d\n", (int)source);
409	UNIMPLEMENTED();	418	UNIMPLEMENTED();
410	return {};	419	return {0, 0, 0, 0};
411	}	420	}
412	};	421	};
413		422
@@ -490,6 +499,16 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
490	return result.Cast<u8>();	499	return result.Cast<u8>();
491	}	500	}
492		501
		502	case Operation::AddSigned:
		503	{
		504	// TODO(bunnei): Verify that the color conversion from (float) 0.5f to (byte) 128 is correct
		505	auto result = input[0].Cast<int>() + input[1].Cast<int>() - Math::MakeVec<int>(128, 128, 128);
		506	result.r() = MathUtil::Clamp<int>(result.r(), 0, 255);
		507	result.g() = MathUtil::Clamp<int>(result.g(), 0, 255);
		508	result.b() = MathUtil::Clamp<int>(result.b(), 0, 255);
		509	return result.Cast<u8>();
		510	}
		511
493	case Operation::Lerp:	512	case Operation::Lerp:
494	return ((input[0] * input[2] + input[1] * (Math::MakeVec<u8>(255, 255, 255) - input[2]).Cast<u8>()) / 255).Cast<u8>();	513	return ((input[0] * input[2] + input[1] * (Math::MakeVec<u8>(255, 255, 255) - input[2]).Cast<u8>()) / 255).Cast<u8>();
495		514
@@ -524,7 +543,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
524	default:	543	default:
525	LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);	544	LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
526	UNIMPLEMENTED();	545	UNIMPLEMENTED();
527	return {};	546	return {0, 0, 0};
528	}	547	}
529	};	548	};
530		549
@@ -578,7 +597,20 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
578	};	597	};
579	auto alpha_output = AlphaCombine(tev_stage.alpha_op, alpha_result);	598	auto alpha_output = AlphaCombine(tev_stage.alpha_op, alpha_result);
580		599
581	combiner_output = Math::MakeVec(color_output, alpha_output);	600	combiner_output[0] = std::min((unsigned)255, color_output.r() * tev_stage.GetColorMultiplier());
		601	combiner_output[1] = std::min((unsigned)255, color_output.g() * tev_stage.GetColorMultiplier());
		602	combiner_output[2] = std::min((unsigned)255, color_output.b() * tev_stage.GetColorMultiplier());
		603	combiner_output[3] = std::min((unsigned)255, alpha_output * tev_stage.GetAlphaMultiplier());
		604
		605	if (registers.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferColor(tev_stage_index)) {
		606	combiner_buffer.r() = combiner_output.r();
		607	combiner_buffer.g() = combiner_output.g();
		608	combiner_buffer.b() = combiner_output.b();
		609	}
		610
		611	if (registers.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferAlpha(tev_stage_index)) {
		612	combiner_buffer.a() = combiner_output.a();
		613	}
582	}	614	}
583		615
584	if (registers.output_merger.alpha_test.enable) {	616	if (registers.output_merger.alpha_test.enable) {
@@ -624,9 +656,10 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
624		656
625	// TODO: Does depth indeed only get written even if depth testing is enabled?	657	// TODO: Does depth indeed only get written even if depth testing is enabled?
626	if (registers.output_merger.depth_test_enable) {	658	if (registers.output_merger.depth_test_enable) {
627	u16 z = (u16)((v0.screenpos[2].ToFloat32() * w0 +	659	unsigned num_bits = Pica::Regs::DepthBitsPerPixel(registers.framebuffer.depth_format);
628	v1.screenpos[2].ToFloat32() * w1 +	660	u32 z = (u32)((v0.screenpos[2].ToFloat32() * w0 +
629	v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);	661	v1.screenpos[2].ToFloat32() * w1 +
		662	v2.screenpos[2].ToFloat32() * w2) * ((1 << num_bits) - 1) / wsum);
630	u32 ref_z = GetDepth(x >> 4, y >> 4);	663	u32 ref_z = GetDepth(x >> 4, y >> 4);
631		664
632	bool pass = false;	665	bool pass = false;


diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp index 51f4e58bf..885b7de59 100644 --- a/src/video_core/vertex_shader.cpp +++ b/src/video_core/vertex_shader.cpp
@@ -235,6 +235,15 @@ static void ProcessShaderCode(VertexShaderState& state) {
235	break;	235	break;
236	}	236	}
237		237
		238	case OpCode::Id::FLR:
		239	for (int i = 0; i < 4; ++i) {
		240	if (!swizzle.DestComponentEnabled(i))
		241	continue;
		242
		243	dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32()));
		244	}
		245	break;
		246
238	case OpCode::Id::MAX:	247	case OpCode::Id::MAX:
239	for (int i = 0; i < 4; ++i) {	248	for (int i = 0; i < 4; ++i) {
240	if (!swizzle.DestComponentEnabled(i))	249	if (!swizzle.DestComponentEnabled(i))
@@ -366,12 +375,15 @@ static void ProcessShaderCode(VertexShaderState& state) {
366		375
367	case OpCode::Type::MultiplyAdd:	376	case OpCode::Type::MultiplyAdd:
368	{	377	{
369	if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) {	378	if ((instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) ||
		379	(instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI)) {
370	const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.mad.operand_desc_id];	380	const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.mad.operand_desc_id];
371		381
372	const float24* src1_ = LookupSourceRegister(instr.mad.src1);	382	bool is_inverted = (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI);
373	const float24* src2_ = LookupSourceRegister(instr.mad.src2);	383
374	const float24* src3_ = LookupSourceRegister(instr.mad.src3);	384	const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted));
		385	const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted));
		386	const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted));
375		387
376	const bool negate_src1 = ((bool)swizzle.negate_src1 != false);	388	const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
377	const bool negate_src2 = ((bool)swizzle.negate_src2 != false);	389	const bool negate_src2 = ((bool)swizzle.negate_src2 != false);