12 files changed, 121 insertions, 78 deletions
diff --git a/src/citra_qt/debugger/graphics_tracing.cpp b/src/citra_qt/debugger/graphics_tracing.cpp
index 1402f8e79..9c80f7ec9 100644
--- a/src/citra_qt/debugger/graphics_tracing.cpp
+++ b/src/citra_qt/debugger/graphics_tracing.cpp
@@ -74,7 +74,7 @@ void GraphicsTracingWidget::StartRecording() {
    std::array<u32, 4 * 16> default_attributes;
    for (unsigned i = 0; i < 16; ++i) {
        for (unsigned comp = 0; comp < 3; ++comp) {
-            default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs.default_attributes[i][comp].ToFloat32());
+            default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs_default_attributes[i][comp].ToFloat32());
        }
    }
diff --git a/src/core/hle/kernel/memory.cpp b/src/core/hle/kernel/memory.cpp
index 862643448..61a741e28 100644
--- a/src/core/hle/kernel/memory.cpp
+++ b/src/core/hle/kernel/memory.cpp
@@ -109,7 +109,6 @@ struct MemoryArea {
 static MemoryArea memory_areas[] = {
    {SHARED_MEMORY_VADDR, SHARED_MEMORY_SIZE,     "Shared Memory"}, // Shared memory
    {VRAM_VADDR,          VRAM_SIZE,              "VRAM"},          // Video memory (VRAM)
-    {TLS_AREA_VADDR,      TLS_AREA_SIZE,          "TLS Area"},      // TLS memory
 };
 }
diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h
index a06afef2b..d781ef32c 100644
--- a/src/core/hle/kernel/process.h
+++ b/src/core/hle/kernel/process.h
@@ -142,8 +142,11 @@ public:
    MemoryRegionInfo* memory_region = nullptr;
-    /// Bitmask of the used TLS slots
+    /// The Thread Local Storage area is allocated as processes create threads,
-    std::bitset<300> used_tls_slots;
+    /// each TLS area is 0x200 bytes, so one page (0x1000) is split up in 8 parts, and each part
+    /// holds the TLS for a specific thread. This vector contains which parts are in use for each page as a bitmask.
+    /// This vector will grow as more pages are allocated for new threads.
+    std::vector<std::bitset<8>> tls_slots;
    VAddr GetLinearHeapAreaAddress() const;
    VAddr GetLinearHeapBase() const;
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index 6dc95d0f1..68f026918 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -117,9 +117,10 @@ void Thread::Stop() {
    }
    wait_objects.clear();
-    Kernel::g_current_process->used_tls_slots[tls_index] = false;
+    // Mark the TLS slot in the thread's page as free.
-    g_current_process->misc_memory_used -= Memory::TLS_ENTRY_SIZE;
+    u32 tls_page = (tls_address - Memory::TLS_AREA_VADDR) / Memory::PAGE_SIZE;
-    g_current_process->memory_region->used -= Memory::TLS_ENTRY_SIZE;
+    u32 tls_slot = ((tls_address - Memory::TLS_AREA_VADDR) % Memory::PAGE_SIZE) / Memory::TLS_ENTRY_SIZE;
+    Kernel::g_current_process->tls_slots[tls_page].reset(tls_slot);
    HLE::Reschedule(__func__);
 }
@@ -366,6 +367,31 @@ static void DebugThreadQueue() {
    }
 }
+/**
+ * Finds a free location for the TLS section of a thread.
+ * @param tls_slots The TLS page array of the thread's owner process. Each entry is the slot
+ *                  bitmask of one allocated page, a set bit marks a slot that is in use.
+ *                  The array is only inspected, never modified.
+ * @returns A tuple of (page, slot, alloc_needed) where:
+ * page: The index of the first allocated TLS page that has free slots.
+ * slot: The index of the first free slot in the indicated page.
+ * alloc_needed: Whether there's a need to allocate a new TLS page (all pages are full, or no
+ *               page has been allocated yet). When true, page and slot are both 0 and carry
+ *               no meaning.
+ */
+std::tuple<u32, u32, bool> GetFreeThreadLocalSlot(const std::vector<std::bitset<8>>& tls_slots) {
+    // Iterate over all the allocated pages, and try to find one where not all slots are used.
+    for (unsigned page = 0; page < tls_slots.size(); ++page) {
+        const auto& page_tls_slots = tls_slots[page];
+        if (page_tls_slots.all()) {
+            // Every slot of this page is taken, keep looking in the next page
+            continue;
+        }
+        // We found a page with at least one free slot, find which slot it is
+        for (unsigned slot = 0; slot < page_tls_slots.size(); ++slot) {
+            if (!page_tls_slots.test(slot)) {
+                return std::make_tuple(page, slot, false);
+            }
+        }
+    }
+    // No allocated page has a free slot, the caller has to allocate a new page
+    return std::make_tuple(0, 0, true);
+}
 ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point, s32 priority,
        u32 arg, s32 processor_id, VAddr stack_top) {
    if (priority < THREADPRIO_HIGHEST || priority > THREADPRIO_LOWEST) {
@@ -403,22 +429,50 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
    thread->name = std::move(name);
    thread->callback_handle = wakeup_callback_handle_table.Create(thread).MoveFrom();
    thread->owner_process = g_current_process;
-    thread->tls_index = -1;
    thread->waitsynch_waited = false;
    // Find the next available TLS index, and mark it as used
-    auto& used_tls_slots = Kernel::g_current_process->used_tls_slots;
+    auto& tls_slots = Kernel::g_current_process->tls_slots;
-    for (unsigned int i = 0; i < used_tls_slots.size(); ++i) {
+    bool needs_allocation = true;
-        if (used_tls_slots[i] == false) {
+    u32 available_page; // Which allocated page has free space
-            thread->tls_index = i;
+    u32 available_slot; // Which slot within the page is free
-            used_tls_slots[i] = true;
-            break;
+    std::tie(available_page, available_slot, needs_allocation) = GetFreeThreadLocalSlot(tls_slots);
+    if (needs_allocation) {
+        // There are no already-allocated pages with free slots, let's allocate a new one.
+        // TLS pages are allocated from the BASE region in the linear heap.
+        MemoryRegionInfo* memory_region = GetMemoryRegion(MemoryRegion::BASE);
+        auto& linheap_memory = memory_region->linear_heap_memory;
+        if (linheap_memory->size() + Memory::PAGE_SIZE > memory_region->size) {
+            LOG_ERROR(Kernel_SVC, "Not enough space in region to allocate a new TLS page for thread");
+            return ResultCode(ErrorDescription::OutOfMemory, ErrorModule::Kernel, ErrorSummary::OutOfResource, ErrorLevel::Permanent);
        }
+        u32 offset = linheap_memory->size();
+        // Allocate some memory from the end of the linear heap for this region.
+        linheap_memory->insert(linheap_memory->end(), Memory::PAGE_SIZE, 0);
+        memory_region->used += Memory::PAGE_SIZE;
+        Kernel::g_current_process->linear_heap_used += Memory::PAGE_SIZE;
+        tls_slots.emplace_back(0); // The page is completely available at the start
+        available_page = tls_slots.size() - 1;
+        available_slot = 0; // Use the first slot in the new page
+        auto& vm_manager = Kernel::g_current_process->vm_manager;
+        vm_manager.RefreshMemoryBlockMappings(linheap_memory.get());
+        // Map the page to the current process' address space.
+        // TODO(Subv): Find the correct MemoryState for this region.
+        vm_manager.MapMemoryBlock(Memory::TLS_AREA_VADDR + available_page * Memory::PAGE_SIZE,
+                                  linheap_memory, offset, Memory::PAGE_SIZE, MemoryState::Private);
    }
-    ASSERT_MSG(thread->tls_index != -1, "Out of TLS space");
+    // Mark the slot as used
-    g_current_process->misc_memory_used += Memory::TLS_ENTRY_SIZE;
+    tls_slots[available_page].set(available_slot);
-    g_current_process->memory_region->used += Memory::TLS_ENTRY_SIZE;
+    thread->tls_address = Memory::TLS_AREA_VADDR + available_page * Memory::PAGE_SIZE + available_slot * Memory::TLS_ENTRY_SIZE;
    // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used
    // to initialize the context
@@ -509,10 +563,6 @@ void Thread::SetWaitSynchronizationOutput(s32 output) {
    context.cpu_registers[1] = output;
 }
-VAddr Thread::GetTLSAddress() const {
-    return Memory::TLS_AREA_VADDR + tls_index * Memory::TLS_ENTRY_SIZE;
-}
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 void ThreadingInit() {
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index 97ba57fc5..deab5d5a6 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -127,7 +127,7 @@ public:
     * Returns the Thread Local Storage address of the current thread
     * @returns VAddr of the thread's TLS
     */
-    VAddr GetTLSAddress() const;
+    VAddr GetTLSAddress() const { return tls_address; }
    Core::ThreadContext context;
@@ -144,7 +144,7 @@ public:
    s32 processor_id;
-    s32 tls_index; ///< Index of the Thread Local Storage of the thread
+    VAddr tls_address; ///< Virtual address of the Thread Local Storage of the thread
    bool waitsynch_waited; ///< Set to true if the last svcWaitSynch call caused the thread to wait
diff --git a/src/core/memory.h b/src/core/memory.h
index 9caa3c3f5..126d60471 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -100,15 +100,9 @@ enum : VAddr {
    SHARED_PAGE_SIZE      = 0x00001000,
    SHARED_PAGE_VADDR_END = SHARED_PAGE_VADDR + SHARED_PAGE_SIZE,
-    // TODO(yuriks): The size of this area is dynamic, the kernel grows
-    // it as more and more threads are created. For now we'll just use a
-    // hardcoded value.
    /// Area where TLS (Thread-Local Storage) buffers are allocated.
    TLS_AREA_VADDR     = 0x1FF82000,
    TLS_ENTRY_SIZE     = 0x200,
-    TLS_AREA_SIZE      = 300 * TLS_ENTRY_SIZE + 0x800, // Space for up to 300 threads + round to page size
-    TLS_AREA_VADDR_END = TLS_AREA_VADDR + TLS_AREA_SIZE,
    /// Equivalent to LINEAR_HEAP_VADDR, but expanded to cover the extra memory in the New 3DS.
    NEW_LINEAR_HEAP_VADDR     = 0x30000000,
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index e7dc5ddac..ad0da796e 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -128,7 +128,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                // TODO: Verify that this actually modifies the register!
                if (setup.index < 15) {
-                    g_state.vs.default_attributes[setup.index] = attribute;
+                    g_state.vs_default_attributes[setup.index] = attribute;
                    setup.index++;
                } else {
                    // Put each attribute into an immediate input buffer.
diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index 1059c6ae4..495174c25 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -25,6 +25,8 @@ struct State {
    Shader::ShaderSetup vs;
    Shader::ShaderSetup gs;
+    std::array<Math::Vec4<float24>, 16> vs_default_attributes;
    struct {
        union LutEntry {
            // Used for raw access
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 449fc703f..e93a9d92a 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -67,7 +67,6 @@ OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input,
    MICROPROFILE_SCOPE(GPU_Shader);
-    state.program_counter = config.main_offset;
    state.debug.max_offset = 0;
    state.debug.max_opdesc_id = 0;
@@ -143,7 +142,6 @@ OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input,
 DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) {
    UnitState<true> state;
-    state.program_counter = config.main_offset;
    state.debug.max_offset = 0;
    state.debug.max_opdesc_id = 0;
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 7f417675a..983e4a967 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -272,29 +272,12 @@ struct UnitState {
    } registers;
    static_assert(std::is_pod<Registers>::value, "Structure is not POD");
-    u32 program_counter;
    bool conditional_code[2];
    // Two Address registers and one loop counter
    // TODO: How many bits do these actually have?
    s32 address_registers[3];
-    enum {
-        INVALID_ADDRESS = 0xFFFFFFFF
-    };
-    struct CallStackElement {
-        u32 final_address;  // Address upon which we jump to return_address
-        u32 return_address; // Where to jump when leaving scope
-        u8 repeat_counter;  // How often to repeat until this call stack element is removed
-        u8 loop_increment;  // Which value to add to the loop counter after an iteration
-                            // TODO: Should this be a signed value? Does it even matter?
-        u32 loop_address;   // The address where we'll return to after each loop iteration
-    };
-    // TODO: Is there a maximal size for this?
-    boost::container::static_vector<CallStackElement, 16> call_stack;
    DebugData<Debug> debug;
    static size_t InputOffset(const SourceRegister& reg) {
@@ -340,8 +323,6 @@ struct ShaderSetup {
        std::array<Math::Vec4<u8>, 4> i;
    } uniforms;
-    Math::Vec4<float24> default_attributes[16];
    std::array<u32, 1024> program_code;
    std::array<u32, 1024> swizzle_data;
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index 7710f7fbc..3a827d11f 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -29,8 +29,24 @@ namespace Pica {
 namespace Shader {
+constexpr u32 INVALID_ADDRESS = 0xFFFFFFFF;
+struct CallStackElement {
+    u32 final_address;  // Address upon which we jump to return_address (or to loop_address while repeats remain)
+    u32 return_address; // Where to jump when leaving scope, i.e. once repeat_counter has run out
+    u8 repeat_counter;  // How often to repeat until this call stack element is removed
+    u8 loop_increment;  // Which value to add to the loop counter (address_registers[2]) after an iteration
+                        // TODO: Should this be a signed value? Does it even matter?
+    u32 loop_address;   // The address where we'll return to after each loop iteration
+};
 template<bool Debug>
 void RunInterpreter(UnitState<Debug>& state) {
+    // TODO: Is there a maximal size for this?
+    boost::container::static_vector<CallStackElement, 16> call_stack;
+    u32 program_counter = g_state.regs.vs.main_offset;
    const auto& uniforms = g_state.vs.uniforms;
    const auto& swizzle_data = g_state.vs.swizzle_data;
    const auto& program_code = g_state.vs.program_code;
@@ -41,16 +57,16 @@ void RunInterpreter(UnitState<Debug>& state) {
    unsigned iteration = 0;
    bool exit_loop = false;
    while (!exit_loop) {
-        if (!state.call_stack.empty()) {
+        if (!call_stack.empty()) {
-            auto& top = state.call_stack.back();
+            auto& top = call_stack.back();
-            if (state.program_counter == top.final_address) {
+            if (program_counter == top.final_address) {
                state.address_registers[2] += top.loop_increment;
                if (top.repeat_counter-- == 0) {
-                    state.program_counter = top.return_address;
+                    program_counter = top.return_address;
-                    state.call_stack.pop_back();
+                    call_stack.pop_back();
                } else {
-                    state.program_counter = top.loop_address;
+                    program_counter = top.loop_address;
                }
                // TODO: Is "trying again" accurate to hardware?
@@ -58,20 +74,20 @@ void RunInterpreter(UnitState<Debug>& state) {
            }
        }
-        const Instruction instr = { program_code[state.program_counter] };
+        const Instruction instr = { program_code[program_counter] };
        const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] };
-        static auto call = [](UnitState<Debug>& state, u32 offset, u32 num_instructions,
+        // Pushes a new call stack element and redirects execution to `offset`.
+        // This lambda must NOT be static: it captures the locals program_counter and call_stack by
+        // reference, and a static closure would keep referring to the locals of the very first
+        // RunInterpreter invocation (dangling references) on every later invocation.
+        auto call = [&program_counter, &call_stack](UnitState<Debug>& state, u32 offset, u32 num_instructions,
                              u32 return_offset, u8 repeat_count, u8 loop_increment) {
-            state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
+            program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
-            ASSERT(state.call_stack.size() < state.call_stack.capacity());
+            ASSERT(call_stack.size() < call_stack.capacity());
-            state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
+            call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
        };
-        Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, state.program_counter);
+        Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, program_counter);
        if (iteration > 0)
-            Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, state.program_counter);
+            Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, program_counter);
-        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter);
+        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + program_counter);
        auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
            switch (source_reg.GetRegisterType()) {
@@ -519,7 +535,7 @@ void RunInterpreter(UnitState<Debug>& state) {
            case OpCode::Id::JMPC:
                Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
                if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
-                    state.program_counter = instr.flow_control.dest_offset - 1;
+                    program_counter = instr.flow_control.dest_offset - 1;
                }
                break;
@@ -527,7 +543,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
                if (uniforms.b[instr.flow_control.bool_uniform_id] == !(instr.flow_control.num_instructions & 1)) {
-                    state.program_counter = instr.flow_control.dest_offset - 1;
+                    program_counter = instr.flow_control.dest_offset - 1;
                }
                break;
@@ -535,7 +551,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                call(state,
                     instr.flow_control.dest_offset,
                     instr.flow_control.num_instructions,
-                     state.program_counter + 1, 0, 0);
+                     program_counter + 1, 0, 0);
                break;
            case OpCode::Id::CALLU:
@@ -544,7 +560,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                    call(state,
                        instr.flow_control.dest_offset,
                        instr.flow_control.num_instructions,
-                        state.program_counter + 1, 0, 0);
+                        program_counter + 1, 0, 0);
                }
                break;
@@ -554,7 +570,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                    call(state,
                        instr.flow_control.dest_offset,
                        instr.flow_control.num_instructions,
-                        state.program_counter + 1, 0, 0);
+                        program_counter + 1, 0, 0);
                }
                break;
@@ -565,8 +581,8 @@ void RunInterpreter(UnitState<Debug>& state) {
                Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
                if (uniforms.b[instr.flow_control.bool_uniform_id]) {
                    call(state,
-                         state.program_counter + 1,
+                         program_counter + 1,
-                         instr.flow_control.dest_offset - state.program_counter - 1,
+                         instr.flow_control.dest_offset - program_counter - 1,
                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                } else {
                    call(state,
@@ -584,8 +600,8 @@ void RunInterpreter(UnitState<Debug>& state) {
                Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
                if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
                    call(state,
-                         state.program_counter + 1,
+                         program_counter + 1,
-                         instr.flow_control.dest_offset - state.program_counter - 1,
+                         instr.flow_control.dest_offset - program_counter - 1,
                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                } else {
                    call(state,
@@ -607,8 +623,8 @@ void RunInterpreter(UnitState<Debug>& state) {
                Record<DebugDataRecord::LOOP_INT_IN>(state.debug, iteration, loop_param);
                call(state,
-                     state.program_counter + 1,
+                     program_counter + 1,
-                     instr.flow_control.dest_offset - state.program_counter + 1,
+                     instr.flow_control.dest_offset - program_counter + 1,
                     instr.flow_control.dest_offset + 1,
                     loop_param.x,
                     loop_param.z);
@@ -625,7 +641,7 @@ void RunInterpreter(UnitState<Debug>& state) {
        }
        }
-        ++state.program_counter;
+        ++program_counter;
        ++iteration;
    }
 }
diff --git a/src/video_core/vertex_loader.cpp b/src/video_core/vertex_loader.cpp
index 21ae52949..83896814f 100644
--- a/src/video_core/vertex_loader.cpp
+++ b/src/video_core/vertex_loader.cpp
@@ -124,7 +124,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::I
                input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
        } else if (vertex_attribute_is_default[i]) {
            // Load the default attribute if we're configured to do so
-            input.attr[i] = g_state.vs.default_attributes[i];
+            input.attr[i] = g_state.vs_default_attributes[i];
            LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
                i, vertex, index,
                input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),