55 files changed, 3198 insertions, 587 deletions
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index ba5b02174..51e4088d2 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -40,6 +40,8 @@ add_library(core STATIC
    hle/config_mem.h
    hle/ipc.h
    hle/ipc_helpers.h
+    hle/kernel/address_arbiter.cpp
+    hle/kernel/address_arbiter.h
    hle/kernel/client_port.cpp
    hle/kernel/client_port.h
    hle/kernel/client_session.cpp
@@ -257,6 +259,8 @@ add_library(core STATIC
    loader/linker.h
    loader/loader.cpp
    loader/loader.h
+    loader/nca.cpp
+    loader/nca.h
    loader/nro.cpp
    loader/nro.h
    loader/nso.cpp
diff --git a/src/core/file_sys/partition_filesystem.cpp b/src/core/file_sys/partition_filesystem.cpp
index 808254ecc..874b9e23b 100644
--- a/src/core/file_sys/partition_filesystem.cpp
+++ b/src/core/file_sys/partition_filesystem.cpp
@@ -19,13 +19,20 @@ Loader::ResultStatus PartitionFilesystem::Load(const std::string& file_path, siz
    if (file.GetSize() < sizeof(Header))
        return Loader::ResultStatus::Error;
+    file.Seek(offset, SEEK_SET);
    // For cartridges, HFSs can get very large, so we need to calculate the size up to
    // the actual content itself instead of just blindly reading in the entire file.
    Header pfs_header;
    if (!file.ReadBytes(&pfs_header, sizeof(Header)))
        return Loader::ResultStatus::Error;
-    bool is_hfs = (memcmp(pfs_header.magic.data(), "HFS", 3) == 0);
+    if (pfs_header.magic != Common::MakeMagic('H', 'F', 'S', '0') &&
+        pfs_header.magic != Common::MakeMagic('P', 'F', 'S', '0')) {
+        return Loader::ResultStatus::ErrorInvalidFormat;
+    }
+    bool is_hfs = pfs_header.magic == Common::MakeMagic('H', 'F', 'S', '0');
    size_t entry_size = is_hfs ? sizeof(HFSEntry) : sizeof(PFSEntry);
    size_t metadata_size =
        sizeof(Header) + (pfs_header.num_entries * entry_size) + pfs_header.strtab_size;
@@ -50,7 +57,12 @@ Loader::ResultStatus PartitionFilesystem::Load(const std::vector<u8>& file_data,
        return Loader::ResultStatus::Error;
    memcpy(&pfs_header, &file_data[offset], sizeof(Header));
-    is_hfs = (memcmp(pfs_header.magic.data(), "HFS", 3) == 0);
+    if (pfs_header.magic != Common::MakeMagic('H', 'F', 'S', '0') &&
+        pfs_header.magic != Common::MakeMagic('P', 'F', 'S', '0')) {
+        return Loader::ResultStatus::ErrorInvalidFormat;
+    }
+    is_hfs = pfs_header.magic == Common::MakeMagic('H', 'F', 'S', '0');
    size_t entries_offset = offset + sizeof(Header);
    size_t entry_size = is_hfs ? sizeof(HFSEntry) : sizeof(PFSEntry);
@@ -73,21 +85,21 @@ u32 PartitionFilesystem::GetNumEntries() const {
    return pfs_header.num_entries;
 }
-u64 PartitionFilesystem::GetEntryOffset(int index) const {
-    if (index > GetNumEntries())
+// Returns content_offset plus the stored offset of entry `index`, or 0 if `index` is out of range.
+u64 PartitionFilesystem::GetEntryOffset(u32 index) const {
+    // Valid indices are 0 .. GetNumEntries() - 1 (the range Print() iterates over), so an index
+    // equal to GetNumEntries() has to be rejected as well.
+    if (index >= GetNumEntries())
        return 0;
    return content_offset + pfs_entries[index].fs_entry.offset;
 }
-u64 PartitionFilesystem::GetEntrySize(int index) const {
-    if (index > GetNumEntries())
+// Returns the stored size of entry `index`, or 0 if `index` is out of range.
+u64 PartitionFilesystem::GetEntrySize(u32 index) const {
+    // Valid indices are 0 .. GetNumEntries() - 1 (the range Print() iterates over), so an index
+    // equal to GetNumEntries() has to be rejected as well.
+    if (index >= GetNumEntries())
        return 0;
    return pfs_entries[index].fs_entry.size;
 }
-std::string PartitionFilesystem::GetEntryName(int index) const {
+std::string PartitionFilesystem::GetEntryName(u32 index) const {
    if (index > GetNumEntries())
        return "";
@@ -113,7 +125,7 @@ u64 PartitionFilesystem::GetFileSize(const std::string& name) const {
 }
 void PartitionFilesystem::Print() const {
-    NGLOG_DEBUG(Service_FS, "Magic:                  {:.4}", pfs_header.magic.data());
+    NGLOG_DEBUG(Service_FS, "Magic:                  {}", pfs_header.magic);
    NGLOG_DEBUG(Service_FS, "Files:                  {}", pfs_header.num_entries);
    for (u32 i = 0; i < pfs_header.num_entries; i++) {
        NGLOG_DEBUG(Service_FS, " > File {}:              {} (0x{:X} bytes, at 0x{:X})", i,
diff --git a/src/core/file_sys/partition_filesystem.h b/src/core/file_sys/partition_filesystem.h
index 573c90057..9c5810cf1 100644
--- a/src/core/file_sys/partition_filesystem.h
+++ b/src/core/file_sys/partition_filesystem.h
@@ -27,9 +27,9 @@ public:
    Loader::ResultStatus Load(const std::vector<u8>& file_data, size_t offset = 0);
    u32 GetNumEntries() const;
-    u64 GetEntryOffset(int index) const;
+    u64 GetEntryOffset(u32 index) const;
-    u64 GetEntrySize(int index) const;
+    u64 GetEntrySize(u32 index) const;
-    std::string GetEntryName(int index) const;
+    std::string GetEntryName(u32 index) const;
    u64 GetFileOffset(const std::string& name) const;
    u64 GetFileSize(const std::string& name) const;
@@ -37,7 +37,7 @@ public:
 private:
    struct Header {
-        std::array<char, 4> magic;
+        u32_le magic;
        u32_le num_entries;
        u32_le strtab_size;
        INSERT_PADDING_BYTES(0x4);
diff --git a/src/core/hle/kernel/address_arbiter.cpp b/src/core/hle/kernel/address_arbiter.cpp
new file mode 100644
index 000000000..e9c8369d7
--- /dev/null
+++ b/src/core/hle/kernel/address_arbiter.cpp
@@ -0,0 +1,173 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#include <algorithm>
+#include <vector>
+#include "common/assert.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "core/core.h"
+#include "core/hle/kernel/errors.h"
+#include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/process.h"
+#include "core/hle/kernel/thread.h"
+#include "core/hle/lock.h"
+#include "core/memory.h"
+namespace Kernel {
+namespace AddressArbiter {
+// Parks the calling thread on the given arbiter address and requests a reschedule.
+// RESULT_TIMEOUT is what this function itself returns; WakeThreads() separately stores
+// RESULT_SUCCESS as the wait result of every thread it wakes.
+static ResultCode WaitForAddress(VAddr address, s64 timeout) {
+    SharedPtr<Thread> waiter = GetCurrentThread();
+    // Flag the thread as blocked on this address, without a custom wakeup callback.
+    waiter->wakeup_callback = nullptr;
+    waiter->status = THREADSTATUS_WAIT_ARB;
+    waiter->arb_wait_address = address;
+    // Arm the delayed wakeup, then ask the thread's own core to reschedule.
+    waiter->WakeAfterDelay(timeout);
+    Core::System::GetInstance().CpuCore(waiter->processor_id).PrepareReschedule();
+    return RESULT_TIMEOUT;
+}
+// Appends every thread whose arb_wait_address equals `address` to `waiting_threads`, then sorts
+// the whole vector so the highest-priority threads (lowest current_priority value) come first.
+static void GetThreadsWaitingOnAddress(std::vector<SharedPtr<Thread>>& waiting_threads,
+                                       VAddr address) {
+    // Schedulers 0 through 3 are queried, in that order.
+    constexpr size_t scheduler_count = 4;
+    for (size_t core_index = 0; core_index < scheduler_count; ++core_index) {
+        const auto& scheduler = Core::System::GetInstance().Scheduler(core_index);
+        for (auto& candidate : scheduler->GetThreadList()) {
+            if (candidate->arb_wait_address != address)
+                continue;
+            waiting_threads.push_back(candidate);
+        }
+    }
+    const auto by_priority = [](const SharedPtr<Thread>& lhs, const SharedPtr<Thread>& rhs) {
+        return lhs->current_priority < rhs->current_priority;
+    };
+    std::sort(waiting_threads.begin(), waiting_threads.end(), by_priority);
+}
+// Wakes up to num_to_wake threads from the front of waiting_threads. A num_to_wake of zero or
+// less wakes every thread in the vector. Each woken thread gets RESULT_SUCCESS as its wait
+// result and has its arb_wait_address cleared.
+static void WakeThreads(std::vector<SharedPtr<Thread>>& waiting_threads, s32 num_to_wake) {
+    // Clamp to the vector size: the caller may ask for more wakeups than there are waiting
+    // threads, and indexing past the end would be out of bounds.
+    size_t last = waiting_threads.size();
+    if (num_to_wake > 0)
+        last = std::min(last, static_cast<size_t>(num_to_wake));
+    // Signal the waiting threads.
+    for (size_t i = 0; i < last; i++) {
+        // Comparison, not assignment: the thread must already be in the arbiter wait state.
+        ASSERT(waiting_threads[i]->status == THREADSTATUS_WAIT_ARB);
+        waiting_threads[i]->SetWaitSynchronizationResult(RESULT_SUCCESS);
+        waiting_threads[i]->arb_wait_address = 0;
+        waiting_threads[i]->ResumeFromWait();
+    }
+}
+// Wakes threads blocked on `address` without reading or writing the memory at that address.
+ResultCode SignalToAddress(VAddr address, s32 num_to_wake) {
+    // Collect the waiters (sorted by priority) and wake the requested number of them.
+    std::vector<SharedPtr<Thread>> sleepers;
+    GetThreadsWaitingOnAddress(sleepers, address);
+    WakeThreads(sleepers, num_to_wake);
+    return RESULT_SUCCESS;
+}
+// Increments the 32-bit value at `address` if it equals `value`, then signals the address.
+// Returns ERR_INVALID_ADDRESS_STATE if the address is not a valid virtual address and
+// ERR_INVALID_STATE if the stored value differs from `value`.
+ResultCode IncrementAndSignalToAddressIfEqual(VAddr address, s32 value, s32 num_to_wake) {
+    // Ensure that we can write to the address.
+    if (!Memory::IsValidVirtualAddress(address)) {
+        return ERR_INVALID_ADDRESS_STATE;
+    }
+    // Only touch memory while it still holds the value the caller expects.
+    if (static_cast<s32>(Memory::Read32(address)) != value) {
+        return ERR_INVALID_STATE;
+    }
+    // Increment in unsigned arithmetic: `value` is supplied by the guest and `value + 1` on an
+    // s32 would be signed overflow (undefined behaviour) for INT32_MAX.
+    Memory::Write32(address, static_cast<u32>(value) + 1);
+    return SignalToAddress(address, num_to_wake);
+}
+// Signals an address being waited on and, if the 32-bit value stored there equals `value`,
+// rewrites it depending on how many threads are waiting:
+//   - no waiters:                              value - 1
+//   - every waiter is going to be woken:       value + 1
+//   - more waiters than num_to_wake:           value (unchanged)
+// Returns ERR_INVALID_ADDRESS_STATE if the address is not a valid virtual address and
+// ERR_INVALID_STATE if the stored value differs from `value`.
+ResultCode ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value,
+                                                         s32 num_to_wake) {
+    // Ensure that we can write to the address.
+    if (!Memory::IsValidVirtualAddress(address)) {
+        return ERR_INVALID_ADDRESS_STATE;
+    }
+    // Get threads waiting on the address.
+    std::vector<SharedPtr<Thread>> waiting_threads;
+    GetThreadsWaitingOnAddress(waiting_threads, address);
+    // Determine the modified value depending on the waiting count. The arithmetic is done on
+    // u32 because `value` is supplied by the guest and `value +/- 1` on an s32 would be signed
+    // overflow (undefined behaviour) at the limits.
+    u32 updated_value = static_cast<u32>(value);
+    if (waiting_threads.empty()) {
+        updated_value -= 1;
+    } else if (num_to_wake <= 0 ||
+               waiting_threads.size() <= static_cast<size_t>(num_to_wake)) {
+        // num_to_wake is known to be positive in the second operand, so the cast is lossless.
+        updated_value += 1;
+    }
+    if (static_cast<s32>(Memory::Read32(address)) != value) {
+        return ERR_INVALID_STATE;
+    }
+    Memory::Write32(address, updated_value);
+    WakeThreads(waiting_threads, num_to_wake);
+    return RESULT_SUCCESS;
+}
+// Waits on an address if the 32-bit value stored there is less than `value`. When
+// `should_decrement` is set, the stored value is decremented before waiting.
+// Returns ERR_INVALID_ADDRESS_STATE if the address is not a valid virtual address,
+// ERR_INVALID_STATE if the stored value is not less than `value`, and RESULT_TIMEOUT
+// immediately when `timeout` is zero.
+ResultCode WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout, bool should_decrement) {
+    // Ensure that we can read the address.
+    if (!Memory::IsValidVirtualAddress(address)) {
+        return ERR_INVALID_ADDRESS_STATE;
+    }
+    const s32 cur_value = static_cast<s32>(Memory::Read32(address));
+    if (cur_value >= value) {
+        return ERR_INVALID_STATE;
+    }
+    // Only the DecrementAndWaitIfLessThan arbitration type asks for the write-back; plain
+    // WaitIfLessThan must leave memory untouched. The subtraction is done on u32 so that a
+    // stored INT32_MIN does not trigger signed overflow.
+    if (should_decrement) {
+        Memory::Write32(address, static_cast<u32>(cur_value) - 1);
+    }
+    // Short-circuit without rescheduling, if timeout is zero.
+    if (timeout == 0) {
+        return RESULT_TIMEOUT;
+    }
+    return WaitForAddress(address, timeout);
+}
+// Blocks the current thread on `address` provided the 32-bit value stored there equals `value`.
+ResultCode WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout) {
+    // The address has to be readable.
+    if (!Memory::IsValidVirtualAddress(address)) {
+        return ERR_INVALID_ADDRESS_STATE;
+    }
+    // A mismatch means there is nothing to wait for.
+    const s32 observed = static_cast<s32>(Memory::Read32(address));
+    if (observed != value) {
+        return ERR_INVALID_STATE;
+    }
+    // A zero timeout reports RESULT_TIMEOUT right away, without rescheduling.
+    return timeout == 0 ? RESULT_TIMEOUT : WaitForAddress(address, timeout);
+}
+} // namespace AddressArbiter
+} // namespace Kernel
diff --git a/src/core/hle/kernel/address_arbiter.h b/src/core/hle/kernel/address_arbiter.h
new file mode 100644
index 000000000..f20f3dbc0
--- /dev/null
+++ b/src/core/hle/kernel/address_arbiter.h
@@ -0,0 +1,32 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#pragma once
+#include "core/hle/result.h"
+namespace Kernel {
+namespace AddressArbiter {
+/// Wait modes of the WaitForAddress SVC; the SVC's raw `type` argument is cast to this enum.
+enum class ArbitrationType {
+    WaitIfLessThan = 0,             ///< Dispatches to WaitForAddressIfLessThan(.., false)
+    DecrementAndWaitIfLessThan = 1, ///< Dispatches to WaitForAddressIfLessThan(.., true)
+    WaitIfEqual = 2,                ///< Dispatches to WaitForAddressIfEqual
+};
+/// Signal modes of the SignalToAddress SVC; the SVC's raw `type` argument is cast to this enum.
+enum class SignalType {
+    Signal = 0,                    ///< Dispatches to SignalToAddress
+    IncrementAndSignalIfEqual = 1, ///< Dispatches to IncrementAndSignalToAddressIfEqual
+    ModifyByWaitingCountAndSignalIfEqual = 2, ///< ModifyByWaitingCountAndSignalToAddressIfEqual
+};
+/// Wakes threads waiting on `address`; a num_to_wake of zero or less wakes all of them.
+ResultCode SignalToAddress(VAddr address, s32 num_to_wake);
+/// Increments the 32-bit value at `address` if it equals `value`, then signals the address.
+/// Returns ERR_INVALID_STATE when the stored value differs from `value`.
+ResultCode IncrementAndSignalToAddressIfEqual(VAddr address, s32 value, s32 num_to_wake);
+/// If the 32-bit value at `address` equals `value`, rewrites it based on the number of waiting
+/// threads and wakes them. Returns ERR_INVALID_STATE when the stored value differs.
+ResultCode ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value, s32 num_to_wake);
+/// Blocks the current thread on `address` if the 32-bit value stored there is less than `value`.
+/// `should_decrement` requests that the stored value be decremented before waiting.
+ResultCode WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout, bool should_decrement);
+/// Blocks the current thread on `address` if the 32-bit value stored there equals `value`.
+/// A timeout of zero returns RESULT_TIMEOUT without blocking.
+ResultCode WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout);
+} // namespace AddressArbiter
+} // namespace Kernel
diff --git a/src/core/hle/kernel/errors.h b/src/core/hle/kernel/errors.h
index e1b5430bf..221cb1bb5 100644
--- a/src/core/hle/kernel/errors.h
+++ b/src/core/hle/kernel/errors.h
@@ -20,13 +20,16 @@ enum {
    MaxConnectionsReached = 52,
    // Confirmed Switch OS error codes
-    MisalignedAddress = 102,
+    InvalidAddress = 102,
+    InvalidMemoryState = 106,
    InvalidProcessorId = 113,
    InvalidHandle = 114,
    InvalidCombination = 116,
    Timeout = 117,
    SynchronizationCanceled = 118,
    TooLarge = 119,
+    InvalidEnumValue = 120,
+    InvalidState = 125,
 };
 }
@@ -39,14 +42,15 @@ constexpr ResultCode ERR_SESSION_CLOSED_BY_REMOTE(-1);
 constexpr ResultCode ERR_PORT_NAME_TOO_LONG(-1);
 constexpr ResultCode ERR_WRONG_PERMISSION(-1);
 constexpr ResultCode ERR_MAX_CONNECTIONS_REACHED(-1);
-constexpr ResultCode ERR_INVALID_ENUM_VALUE(-1);
+constexpr ResultCode ERR_INVALID_ENUM_VALUE(ErrorModule::Kernel, ErrCodes::InvalidEnumValue);
 constexpr ResultCode ERR_INVALID_ENUM_VALUE_FND(-1);
 constexpr ResultCode ERR_INVALID_COMBINATION(-1);
 constexpr ResultCode ERR_INVALID_COMBINATION_KERNEL(-1);
 constexpr ResultCode ERR_OUT_OF_MEMORY(-1);
-constexpr ResultCode ERR_INVALID_ADDRESS(-1);
+constexpr ResultCode ERR_INVALID_ADDRESS(ErrorModule::Kernel, ErrCodes::InvalidAddress);
-constexpr ResultCode ERR_INVALID_ADDRESS_STATE(-1);
+constexpr ResultCode ERR_INVALID_ADDRESS_STATE(ErrorModule::Kernel, ErrCodes::InvalidMemoryState);
 constexpr ResultCode ERR_INVALID_HANDLE(ErrorModule::Kernel, ErrCodes::InvalidHandle);
+constexpr ResultCode ERR_INVALID_STATE(ErrorModule::Kernel, ErrCodes::InvalidState);
 constexpr ResultCode ERR_INVALID_POINTER(-1);
 constexpr ResultCode ERR_INVALID_OBJECT_ADDR(-1);
 constexpr ResultCode ERR_NOT_AUTHORIZED(-1);
diff --git a/src/core/hle/kernel/hle_ipc.cpp b/src/core/hle/kernel/hle_ipc.cpp
index 01904467e..b0d83f401 100644
--- a/src/core/hle/kernel/hle_ipc.cpp
+++ b/src/core/hle/kernel/hle_ipc.cpp
@@ -271,6 +271,11 @@ std::vector<u8> HLERequestContext::ReadBuffer(int buffer_index) const {
 }
 size_t HLERequestContext::WriteBuffer(const void* buffer, size_t size, int buffer_index) const {
+    if (size == 0) {
+        NGLOG_WARNING(Core, "skip empty buffer write");
+        return 0;
+    }
    const bool is_buffer_b{BufferDescriptorB().size() && BufferDescriptorB()[buffer_index].Size()};
    const size_t buffer_size{GetWriteBufferSize(buffer_index)};
    if (size > buffer_size) {
diff --git a/src/core/hle/kernel/mutex.cpp b/src/core/hle/kernel/mutex.cpp
index bc144f3de..65560226d 100644
--- a/src/core/hle/kernel/mutex.cpp
+++ b/src/core/hle/kernel/mutex.cpp
@@ -59,7 +59,7 @@ ResultCode Mutex::TryAcquire(VAddr address, Handle holding_thread_handle,
                             Handle requesting_thread_handle) {
    // The mutex address must be 4-byte aligned
    if ((address % sizeof(u32)) != 0) {
-        return ResultCode(ErrorModule::Kernel, ErrCodes::MisalignedAddress);
+        return ResultCode(ErrorModule::Kernel, ErrCodes::InvalidAddress);
    }
    SharedPtr<Thread> holding_thread = g_handle_table.Get<Thread>(holding_thread_handle);
@@ -97,7 +97,7 @@ ResultCode Mutex::TryAcquire(VAddr address, Handle holding_thread_handle,
 ResultCode Mutex::Release(VAddr address) {
    // The mutex address must be 4-byte aligned
    if ((address % sizeof(u32)) != 0) {
-        return ResultCode(ErrorModule::Kernel, ErrCodes::MisalignedAddress);
+        return ResultCode(ErrorModule::Kernel, ErrCodes::InvalidAddress);
    }
    auto [thread, num_waiters] = GetHighestPriorityMutexWaitingThread(GetCurrentThread(), address);
diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp
index ec3601e8b..1a36e0d02 100644
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -11,6 +11,7 @@
 #include "common/string_util.h"
 #include "core/core.h"
 #include "core/core_timing.h"
+#include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/client_port.h"
 #include "core/hle/kernel/client_session.h"
 #include "core/hle/kernel/event.h"
@@ -316,6 +317,11 @@ static ResultCode GetInfo(u64* result, u64 info_id, u64 handle, u64 info_sub_id)
                      "(STUBBED) Attempted to query privileged process id bounds, returned 0");
        *result = 0;
        break;
+    case GetInfoType::UserExceptionContextAddr:
+        NGLOG_WARNING(Kernel_SVC,
+                      "(STUBBED) Attempted to query user exception context address, returned 0");
+        *result = 0;
+        break;
    default:
        UNIMPLEMENTED();
    }
@@ -575,7 +581,7 @@ static void SleepThread(s64 nanoseconds) {
    Core::System::GetInstance().PrepareReschedule();
 }
-/// Signal process wide key atomic
+/// Wait process wide key atomic
 static ResultCode WaitProcessWideKeyAtomic(VAddr mutex_addr, VAddr condition_variable_addr,
                                           Handle thread_handle, s64 nano_seconds) {
    NGLOG_TRACE(
@@ -684,6 +690,58 @@ static ResultCode SignalProcessWideKey(VAddr condition_variable_addr, s32 target
    return RESULT_SUCCESS;
 }
+/// Wait for an address (via Address Arbiter)
+static ResultCode WaitForAddress(VAddr address, u32 type, s32 value, s64 timeout) {
+    NGLOG_WARNING(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, timeout={}",
+                  address, type, value, timeout);
+    // Kernel virtual addresses are rejected with the invalid-memory-state error.
+    if (Memory::IsKernelVirtualAddress(address)) {
+        return ERR_INVALID_ADDRESS_STATE;
+    }
+    // The address must be aligned to 4 bytes, otherwise it is an invalid address.
+    const bool is_word_aligned = (address % sizeof(u32)) == 0;
+    if (!is_word_aligned) {
+        return ERR_INVALID_ADDRESS;
+    }
+    // Dispatch on the requested arbitration mode; anything unknown is an invalid enum value.
+    const auto arbitration = static_cast<AddressArbiter::ArbitrationType>(type);
+    if (arbitration == AddressArbiter::ArbitrationType::WaitIfLessThan) {
+        return AddressArbiter::WaitForAddressIfLessThan(address, value, timeout, false);
+    }
+    if (arbitration == AddressArbiter::ArbitrationType::DecrementAndWaitIfLessThan) {
+        return AddressArbiter::WaitForAddressIfLessThan(address, value, timeout, true);
+    }
+    if (arbitration == AddressArbiter::ArbitrationType::WaitIfEqual) {
+        return AddressArbiter::WaitForAddressIfEqual(address, value, timeout);
+    }
+    return ERR_INVALID_ENUM_VALUE;
+}
+/// Signals to an address (via Address Arbiter)
+static ResultCode SignalToAddress(VAddr address, u32 type, s32 value, s32 num_to_wake) {
+    NGLOG_WARNING(Kernel_SVC,
+                  "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, num_to_wake=0x{:X}", address,
+                  type, value, num_to_wake);
+    // Kernel virtual addresses are rejected with the invalid-memory-state error.
+    if (Memory::IsKernelVirtualAddress(address)) {
+        return ERR_INVALID_ADDRESS_STATE;
+    }
+    // The address must be aligned to 4 bytes, otherwise it is an invalid address.
+    const bool is_word_aligned = (address % sizeof(u32)) == 0;
+    if (!is_word_aligned) {
+        return ERR_INVALID_ADDRESS;
+    }
+    // Dispatch on the requested signal mode; anything unknown is an invalid enum value.
+    const auto signal = static_cast<AddressArbiter::SignalType>(type);
+    if (signal == AddressArbiter::SignalType::Signal) {
+        return AddressArbiter::SignalToAddress(address, num_to_wake);
+    }
+    if (signal == AddressArbiter::SignalType::IncrementAndSignalIfEqual) {
+        return AddressArbiter::IncrementAndSignalToAddressIfEqual(address, value, num_to_wake);
+    }
+    if (signal == AddressArbiter::SignalType::ModifyByWaitingCountAndSignalIfEqual) {
+        return AddressArbiter::ModifyByWaitingCountAndSignalToAddressIfEqual(address, value,
+                                                                             num_to_wake);
+    }
+    return ERR_INVALID_ENUM_VALUE;
+}
 /// This returns the total CPU ticks elapsed since the CPU was powered-on
 static u64 GetSystemTick() {
    const u64 result{CoreTiming::GetTicks()};
@@ -744,7 +802,7 @@ static ResultCode SetThreadCoreMask(Handle thread_handle, u32 core, u64 mask) {
        ASSERT(thread->owner_process->ideal_processor != THREADPROCESSORID_DEFAULT);
        // Set the target CPU to the one specified in the process' exheader.
        core = thread->owner_process->ideal_processor;
-        mask = 1 << core;
+        mask = 1ull << core;
    }
    if (mask == 0) {
@@ -761,7 +819,7 @@ static ResultCode SetThreadCoreMask(Handle thread_handle, u32 core, u64 mask) {
    }
    // Error out if the input core isn't enabled in the input mask.
-    if (core < Core::NUM_CPU_CORES && (mask & (1 << core)) == 0) {
+    if (core < Core::NUM_CPU_CORES && (mask & (1ull << core)) == 0) {
        return ResultCode(ErrorModule::Kernel, ErrCodes::InvalidCombination);
    }
@@ -856,8 +914,8 @@ static const FunctionDef SVC_Table[] = {
    {0x31, nullptr, "GetResourceLimitCurrentValue"},
    {0x32, SvcWrap<SetThreadActivity>, "SetThreadActivity"},
    {0x33, SvcWrap<GetThreadContext>, "GetThreadContext"},
-    {0x34, nullptr, "WaitForAddress"},
+    {0x34, SvcWrap<WaitForAddress>, "WaitForAddress"},
-    {0x35, nullptr, "SignalToAddress"},
+    {0x35, SvcWrap<SignalToAddress>, "SignalToAddress"},
    {0x36, nullptr, "Unknown"},
    {0x37, nullptr, "Unknown"},
    {0x38, nullptr, "Unknown"},
diff --git a/src/core/hle/kernel/svc_wrap.h b/src/core/hle/kernel/svc_wrap.h
index 40aa88cc1..79c3fe31b 100644
--- a/src/core/hle/kernel/svc_wrap.h
+++ b/src/core/hle/kernel/svc_wrap.h
@@ -179,6 +179,20 @@ void SvcWrap() {
    FuncReturn(retval);
 }
+// Wrapper for SVCs of the form ResultCode(u64, u32, s32, s64): parameters 1 and 2 are truncated
+// to their low 32 bits, parameter 3 is reinterpreted as signed 64-bit.
+template <ResultCode func(u64, u32, s32, s64)>
+void SvcWrap() {
+    const ResultCode result =
+        func(PARAM(0), static_cast<u32>(PARAM(1) & 0xFFFFFFFF),
+             static_cast<s32>(PARAM(2) & 0xFFFFFFFF), static_cast<s64>(PARAM(3)));
+    FuncReturn(result.raw);
+}
+// Wrapper for SVCs of the form ResultCode(u64, u32, s32, s32): parameters 1 through 3 are
+// truncated to their low 32 bits.
+template <ResultCode func(u64, u32, s32, s32)>
+void SvcWrap() {
+    const ResultCode result =
+        func(PARAM(0), static_cast<u32>(PARAM(1) & 0xFFFFFFFF),
+             static_cast<s32>(PARAM(2) & 0xFFFFFFFF), static_cast<s32>(PARAM(3) & 0xFFFFFFFF));
+    FuncReturn(result.raw);
+}
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Function wrappers that return type u32
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index cffa7ca83..2f333ec34 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -140,6 +140,11 @@ static void ThreadWakeupCallback(u64 thread_handle, int cycles_late) {
        }
    }
+    if (thread->arb_wait_address != 0) {
+        ASSERT(thread->status == THREADSTATUS_WAIT_ARB);
+        thread->arb_wait_address = 0;
+    }
    if (resume)
        thread->ResumeFromWait();
 }
@@ -179,6 +184,7 @@ void Thread::ResumeFromWait() {
    case THREADSTATUS_WAIT_SLEEP:
    case THREADSTATUS_WAIT_IPC:
    case THREADSTATUS_WAIT_MUTEX:
+    case THREADSTATUS_WAIT_ARB:
        break;
    case THREADSTATUS_READY:
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index 1d2da6d50..f1e759802 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -45,6 +45,7 @@ enum ThreadStatus {
    THREADSTATUS_WAIT_SYNCH_ANY, ///< Waiting due to WaitSynch1 or WaitSynchN with wait_all = false
    THREADSTATUS_WAIT_SYNCH_ALL, ///< Waiting due to WaitSynchronizationN with wait_all = true
    THREADSTATUS_WAIT_MUTEX,     ///< Waiting due to an ArbitrateLock/WaitProcessWideKey svc
+    THREADSTATUS_WAIT_ARB,       ///< Waiting due to a SignalToAddress/WaitForAddress svc
    THREADSTATUS_DORMANT,        ///< Created but not yet made ready
    THREADSTATUS_DEAD            ///< Run to completion, or forcefully terminated
 };
@@ -230,6 +231,9 @@ public:
    VAddr mutex_wait_address; ///< If waiting on a Mutex, this is the mutex address
    Handle wait_handle;       ///< The handle used to wait for the mutex.
+    // If waiting for an AddressArbiter, this is the address being waited on.
+    VAddr arb_wait_address{0};
    std::string name;
    /// Handle used by guest emulated application to access this thread
diff --git a/src/core/hle/service/audio/audren_u.cpp b/src/core/hle/service/audio/audren_u.cpp
index 6e8002bc9..3dfb3fb52 100644
--- a/src/core/hle/service/audio/audren_u.cpp
+++ b/src/core/hle/service/audio/audren_u.cpp
@@ -17,7 +17,8 @@ constexpr u64 audio_ticks{static_cast<u64>(CoreTiming::BASE_CLOCK_RATE / 200)};
 class IAudioRenderer final : public ServiceFramework<IAudioRenderer> {
 public:
-    IAudioRenderer() : ServiceFramework("IAudioRenderer") {
+    IAudioRenderer(AudioRendererParameter audren_params)
+        : ServiceFramework("IAudioRenderer"), worker_params(audren_params) {
        static const FunctionInfo functions[] = {
            {0, nullptr, "GetAudioRendererSampleRate"},
            {1, nullptr, "GetAudioRendererSampleCount"},
@@ -57,27 +58,37 @@ private:
    }
    void RequestUpdateAudioRenderer(Kernel::HLERequestContext& ctx) {
-        NGLOG_DEBUG(Service_Audio, "{}", ctx.Description());
+        UpdateDataHeader config{};
-        AudioRendererResponseData response_data{};
+        auto buf = ctx.ReadBuffer();
+        std::memcpy(&config, buf.data(), sizeof(UpdateDataHeader));
-        response_data.section_0_size =
+        u32 memory_pool_count = worker_params.effect_count + (worker_params.voice_count * 4);
-            static_cast<u32>(response_data.state_entries.size() * sizeof(AudioRendererStateEntry));
-        response_data.section_1_size = static_cast<u32>(response_data.section_1.size());
+        std::vector<MemoryPoolInfo> mem_pool_info(memory_pool_count);
-        response_data.section_2_size = static_cast<u32>(response_data.section_2.size());
+        std::memcpy(mem_pool_info.data(),
-        response_data.section_3_size = static_cast<u32>(response_data.section_3.size());
+                    buf.data() + sizeof(UpdateDataHeader) + config.behavior_size,
-        response_data.section_4_size = static_cast<u32>(response_data.section_4.size());
+                    memory_pool_count * sizeof(MemoryPoolInfo));
-        response_data.section_5_size = static_cast<u32>(response_data.section_5.size());
-        response_data.total_size = sizeof(AudioRendererResponseData);
+        UpdateDataHeader response_data{worker_params};
-        for (unsigned i = 0; i < response_data.state_entries.size(); i++) {
+        ASSERT(ctx.GetWriteBufferSize() == response_data.total_size);
-            // 4 = Busy and 5 = Ready?
-            response_data.state_entries[i].state = 5;
+        std::vector<u8> output(response_data.total_size);
+        std::memcpy(output.data(), &response_data, sizeof(UpdateDataHeader));
+        std::vector<MemoryPoolEntry> memory_pool(memory_pool_count);
+        for (unsigned i = 0; i < memory_pool.size(); i++) {
+            if (mem_pool_info[i].pool_state == MemoryPoolStates::RequestAttach)
+                memory_pool[i].state = MemoryPoolStates::Attached;
+            else if (mem_pool_info[i].pool_state == MemoryPoolStates::RequestDetach)
+                memory_pool[i].state = MemoryPoolStates::Detached;
+            else
+                memory_pool[i].state = mem_pool_info[i].pool_state;
        }
+        std::memcpy(output.data() + sizeof(UpdateDataHeader), memory_pool.data(),
+                    response_data.memory_pools_size);
-        ctx.WriteBuffer(&response_data, response_data.total_size);
+        ctx.WriteBuffer(output);
        IPC::ResponseBuilder rb{ctx, 2};
        rb.Push(RESULT_SUCCESS);
        NGLOG_WARNING(Service_Audio, "(STUBBED) called");
@@ -109,48 +120,66 @@ private:
        NGLOG_WARNING(Service_Audio, "(STUBBED) called");
    }
-    struct AudioRendererStateEntry {
+    /// State of one audio memory pool, as exchanged with the guest in the update buffers.
+    enum class MemoryPoolStates : u32 { // Should be LE
-        u32_le state;
+        Invalid = 0x0,
+        Unknown = 0x1,
+        RequestDetach = 0x2, ///< RequestUpdateAudioRenderer answers this with Detached
+        Detached = 0x3,
+        RequestAttach = 0x4, ///< RequestUpdateAudioRenderer answers this with Attached
+        Attached = 0x5,
+        Released = 0x6,
+    };
+    /// Per-pool entry copied into the response buffer right after the UpdateDataHeader.
+    /// RequestUpdateAudioRenderer only assigns `state`.
+    struct MemoryPoolEntry {
+        MemoryPoolStates state;
        u32_le unknown_4;
        u32_le unknown_8;
        u32_le unknown_c;
    };
-    static_assert(sizeof(AudioRendererStateEntry) == 0x10,
+    static_assert(sizeof(MemoryPoolEntry) == 0x10, "MemoryPoolEntry has wrong size");
-                  "AudioRendererStateEntry has wrong size");
-    struct AudioRendererResponseData {
-        u32_le unknown_0;
-        u32_le section_5_size;
-        u32_le section_0_size;
-        u32_le section_1_size;
-        u32_le unknown_10;
-        u32_le section_2_size;
-        u32_le unknown_18;
-        u32_le section_3_size;
-        u32_le section_4_size;
-        u32_le unknown_24;
-        u32_le unknown_28;
-        u32_le unknown_2c;
-        u32_le unknown_30;
-        u32_le unknown_34;
-        u32_le unknown_38;
-        u32_le total_size;
-        std::array<AudioRendererStateEntry, 0x18e> state_entries;
+    /// Per-pool entry read from the request buffer; RequestUpdateAudioRenderer copies the array
+    /// from the position following the UpdateDataHeader and its behavior section.
+    struct MemoryPoolInfo {
+        u64_le pool_address;
+        u64_le pool_size;
+        MemoryPoolStates pool_state; ///< Requested state; drives the state reported back
+        INSERT_PADDING_WORDS(3); // Unknown
+    };
+    /// Header placed at the start of the audio renderer update buffers. It records the byte size
+    /// of every section and the total size of the buffer.
+    struct UpdateDataHeader {
+        /// Leaves every field unassigned; intended for instances that are filled by memcpy.
+        UpdateDataHeader() {}
+        /// Builds the response header for a renderer created with the parameters in `config`.
+        UpdateDataHeader(const AudioRendererParameter& config) {
+            revision = Common::MakeMagic('R', 'E', 'V', '4'); // 5.1.0 Revision
+            behavior_size = 0xb0;
+            // One 0x10-byte entry per pool; same pool count as RequestUpdateAudioRenderer uses.
+            memory_pools_size = (config.effect_count + (config.voice_count * 4)) * 0x10;
+            voices_size = config.voice_count * 0x10;
+            // No voice resource or mix section is accounted for in total_size, so report both
+            // as empty instead of copying indeterminate values into the guest's buffer.
+            voice_resource_size = 0x0;
+            effects_size = config.effect_count * 0x10;
+            mixes_size = 0x0;
+            sinks_size = config.sink_count * 0x20;
+            performance_manager_size = 0x10;
+            total_size = sizeof(UpdateDataHeader) + behavior_size + memory_pools_size +
+                         voices_size + effects_size + sinks_size + performance_manager_size;
+        }
-        std::array<u8, 0x600> section_1;
+        u32_le revision;
-        std::array<u8, 0xe0> section_2;
+        u32_le behavior_size;
-        std::array<u8, 0x20> section_3;
+        u32_le memory_pools_size;
-        std::array<u8, 0x10> section_4;
+        u32_le voices_size;
-        std::array<u8, 0xb0> section_5;
+        u32_le voice_resource_size;
+        u32_le effects_size;
+        u32_le mixes_size;
+        u32_le sinks_size;
+        u32_le performance_manager_size;
+        // NOTE(review): these padding words are still never assigned by either constructor.
+        INSERT_PADDING_WORDS(6);
+        u32_le total_size;
    };
-    static_assert(sizeof(AudioRendererResponseData) == 0x20e0,
+    static_assert(sizeof(UpdateDataHeader) == 0x40, "UpdateDataHeader has wrong size");
-                  "AudioRendererResponseData has wrong size");
    /// This is used to trigger the audio event callback.
    CoreTiming::EventType* audio_event;
    Kernel::SharedPtr<Kernel::Event> system_event;
+    AudioRendererParameter worker_params;
 };
 class IAudioDevice final : public ServiceFramework<IAudioDevice> {
@@ -248,31 +277,33 @@ AudRenU::AudRenU() : ServiceFramework("audren:u") {
 }
 void AudRenU::OpenAudioRenderer(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    auto params = rp.PopRaw<AudioRendererParameter>();
    IPC::ResponseBuilder rb{ctx, 2, 0, 1};
    rb.Push(RESULT_SUCCESS);
-    rb.PushIpcInterface<Audio::IAudioRenderer>();
+    rb.PushIpcInterface<Audio::IAudioRenderer>(std::move(params));
    NGLOG_DEBUG(Service_Audio, "called");
 }
 void AudRenU::GetAudioRendererWorkBufferSize(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp{ctx};
-    auto params = rp.PopRaw<WorkerBufferParameters>();
+    auto params = rp.PopRaw<AudioRendererParameter>();
-    u64 buffer_sz = Common::AlignUp(4 * params.unknown8, 0x40);
+    u64 buffer_sz = Common::AlignUp(4 * params.unknown_8, 0x40);
-    buffer_sz += params.unknownC * 1024;
+    buffer_sz += params.unknown_c * 1024;
-    buffer_sz += 0x940 * (params.unknownC + 1);
+    buffer_sz += 0x940 * (params.unknown_c + 1);
    buffer_sz += 0x3F0 * params.voice_count;
-    buffer_sz += Common::AlignUp(8 * (params.unknownC + 1), 0x10);
+    buffer_sz += Common::AlignUp(8 * (params.unknown_c + 1), 0x10);
    buffer_sz += Common::AlignUp(8 * params.voice_count, 0x10);
    buffer_sz +=
-        Common::AlignUp((0x3C0 * (params.sink_count + params.unknownC) + 4 * params.sample_count) *
+        Common::AlignUp((0x3C0 * (params.sink_count + params.unknown_c) + 4 * params.sample_count) *
-                            (params.unknown8 + 6),
+                            (params.unknown_8 + 6),
                        0x40);
-    if (IsFeatureSupported(AudioFeatures::Splitter, params.magic)) {
+    if (IsFeatureSupported(AudioFeatures::Splitter, params.revision)) {
-        u32 count = params.unknownC + 1;
+        u32 count = params.unknown_c + 1;
        u64 node_count = Common::AlignUp(count, 0x40);
        u64 node_state_buffer_sz =
            4 * (node_count * node_count) + 0xC * node_count + 2 * (node_count / 8);
@@ -287,20 +318,20 @@ void AudRenU::GetAudioRendererWorkBufferSize(Kernel::HLERequestContext& ctx) {
    }
    buffer_sz += 0x20 * (params.effect_count + 4 * params.voice_count) + 0x50;
-    if (IsFeatureSupported(AudioFeatures::Splitter, params.magic)) {
+    if (IsFeatureSupported(AudioFeatures::Splitter, params.revision)) {
-        buffer_sz += 0xE0 * params.unknown2c;
+        buffer_sz += 0xE0 * params.unknown_2c;
        buffer_sz += 0x20 * params.splitter_count;
-        buffer_sz += Common::AlignUp(4 * params.unknown2c, 0x10);
+        buffer_sz += Common::AlignUp(4 * params.unknown_2c, 0x10);
    }
    buffer_sz = Common::AlignUp(buffer_sz, 0x40) + 0x170 * params.sink_count;
    u64 output_sz = buffer_sz + 0x280 * params.sink_count + 0x4B0 * params.effect_count +
                    ((params.voice_count * 256) | 0x40);
-    if (params.unknown1c >= 1) {
+    if (params.unknown_1c >= 1) {
        output_sz = Common::AlignUp(((16 * params.sink_count + 16 * params.effect_count +
                                      16 * params.voice_count + 16) +
                                     0x658) *
-                                            (params.unknown1c + 1) +
+                                            (params.unknown_1c + 1) +
                                        0xc0,
                                    0x40) +
                    output_sz;
@@ -328,7 +359,7 @@ bool AudRenU::IsFeatureSupported(AudioFeatures feature, u32_le revision) const {
    u32_be version_num = (revision - Common::MakeMagic('R', 'E', 'V', '0')); // Byte swap
    switch (feature) {
    case AudioFeatures::Splitter:
-        return version_num >= 2;
+        return version_num >= 2u;
    default:
        return false;
    }
diff --git a/src/core/hle/service/audio/audren_u.h b/src/core/hle/service/audio/audren_u.h
index fe53de4ce..b9b81db4f 100644
--- a/src/core/hle/service/audio/audren_u.h
+++ b/src/core/hle/service/audio/audren_u.h
@@ -12,6 +12,24 @@ class HLERequestContext;
 namespace Service::Audio {
+/// Raw parameter block popped from the IPC request by both OpenAudioRenderer and
+/// GetAudioRendererWorkBufferSize. The layout is fixed at 52 bytes (see the static_assert below).
+/// Fields named unknown_XX are not understood yet; the suffix is the field's byte offset.
+struct AudioRendererParameter {
+    u32_le sample_rate;
+    u32_le sample_count;
+    u32_le unknown_8;
+    u32_le unknown_c;
+    u32_le voice_count;
+    u32_le sink_count;
+    u32_le effect_count;
+    u32_le unknown_1c;
+    u8 unknown_20;
+    INSERT_PADDING_BYTES(3);
+    u32_le splitter_count;
+    u32_le unknown_2c;
+    INSERT_PADDING_WORDS(1);
+    // Revision magic; passed to IsFeatureSupported() to gate revision-dependent features.
+    u32_le revision;
+};
+static_assert(sizeof(AudioRendererParameter) == 52, "AudioRendererParameter is an invalid size");
 class AudRenU final : public ServiceFramework<AudRenU> {
 public:
    explicit AudRenU();
@@ -22,25 +40,6 @@ private:
    void GetAudioRendererWorkBufferSize(Kernel::HLERequestContext& ctx);
    void GetAudioDevice(Kernel::HLERequestContext& ctx);
-    struct WorkerBufferParameters {
-        u32_le sample_rate;
-        u32_le sample_count;
-        u32_le unknown8;
-        u32_le unknownC;
-        u32_le voice_count;
-        u32_le sink_count;
-        u32_le effect_count;
-        u32_le unknown1c;
-        u8 unknown20;
-        u8 padding1[3];
-        u32_le splitter_count;
-        u32_le unknown2c;
-        u8 padding2[4];
-        u32_le magic;
-    };
-    static_assert(sizeof(WorkerBufferParameters) == 52,
-                  "WorkerBufferParameters is an invalid size");
    enum class AudioFeatures : u32 {
        Splitter,
    };
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index 00c5308ba..2696a8bf0 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -84,6 +84,10 @@ private:
        for (size_t controller = 0; controller < mem.controllers.size(); controller++) {
            for (int index = 0; index < HID_NUM_LAYOUTS; index++) {
+                // TODO(DarkLordZach): Is this layout/controller config actually invalid?
+                if (controller == Controller_Handheld && index == Layout_Single)
+                    continue;
                ControllerLayout& layout = mem.controllers[controller].layouts[index];
                layout.header.num_entries = HID_NUM_ENTRIES;
                layout.header.max_entry_index = HID_NUM_ENTRIES - 1;
@@ -94,7 +98,6 @@ private:
                layout.header.latest_entry = (layout.header.latest_entry + 1) % HID_NUM_ENTRIES;
                ControllerInputEntry& entry = layout.entries[layout.header.latest_entry];
-                entry.connection_state = ConnectionState_Connected | ConnectionState_Wired;
                entry.timestamp++;
                // TODO(shinyquagsire23): Is this always identical to timestamp?
                entry.timestamp_2++;
@@ -103,6 +106,8 @@ private:
                if (controller != Controller_Handheld)
                    continue;
+                entry.connection_state = ConnectionState_Connected | ConnectionState_Wired;
                // TODO(shinyquagsire23): Set up some LUTs for each layout mapping in the future?
                // For now everything is just the default handheld layout, but split Joy-Con will
                // rotate the face buttons and directions for certain layouts.
diff --git a/src/core/hle/service/hid/hid.h b/src/core/hle/service/hid/hid.h
index 15eee8f01..b499308d6 100644
--- a/src/core/hle/service/hid/hid.h
+++ b/src/core/hle/service/hid/hid.h
@@ -12,7 +12,7 @@ namespace Service::HID {
 // Begin enums and output structs
 constexpr u32 HID_NUM_ENTRIES = 17;
-constexpr u32 HID_NUM_LAYOUTS = 2;
+constexpr u32 HID_NUM_LAYOUTS = 7;
 constexpr s32 HID_JOYSTICK_MAX = 0x8000;
 constexpr s32 HID_JOYSTICK_MIN = -0x8000;
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
index 79aab87f9..ed7b6dc03 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -121,8 +121,9 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector<
 }
 u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& output) {
-    if (input.size() < sizeof(IoctlSubmitGpfifo))
+    if (input.size() < sizeof(IoctlSubmitGpfifo)) {
        UNIMPLEMENTED();
+    }
    IoctlSubmitGpfifo params{};
    std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo));
    NGLOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
diff --git a/src/core/hle/service/set/set.cpp b/src/core/hle/service/set/set.cpp
index f0572bed6..baeecb0ec 100644
--- a/src/core/hle/service/set/set.cpp
+++ b/src/core/hle/service/set/set.cpp
@@ -12,9 +12,6 @@
 namespace Service::Set {
 void SET::GetAvailableLanguageCodes(Kernel::HLERequestContext& ctx) {
-    IPC::RequestParser rp{ctx};
-    u32 id = rp.Pop<u32>();
    static constexpr std::array<LanguageCode, 17> available_language_codes = {{
        LanguageCode::JA,
        LanguageCode::EN_US,
@@ -50,7 +47,7 @@ SET::SET() : ServiceFramework("set") {
        {2, nullptr, "MakeLanguageCode"},
        {3, nullptr, "GetAvailableLanguageCodeCount"},
        {4, nullptr, "GetRegionCode"},
-        {5, nullptr, "GetAvailableLanguageCodes2"},
+        {5, &SET::GetAvailableLanguageCodes, "GetAvailableLanguageCodes2"},
        {6, nullptr, "GetAvailableLanguageCodeCount2"},
        {7, nullptr, "GetKeyCodeMap"},
        {8, nullptr, "GetQuestFlag"},
diff --git a/src/core/loader/loader.cpp b/src/core/loader/loader.cpp
index 6a4fd38cb..20cc0bac0 100644
--- a/src/core/loader/loader.cpp
+++ b/src/core/loader/loader.cpp
@@ -9,6 +9,7 @@
 #include "core/hle/kernel/process.h"
 #include "core/loader/deconstructed_rom_directory.h"
 #include "core/loader/elf.h"
+#include "core/loader/nca.h"
 #include "core/loader/nro.h"
 #include "core/loader/nso.h"
@@ -32,6 +33,7 @@ FileType IdentifyFile(FileUtil::IOFile& file, const std::string& filepath) {
    CHECK_TYPE(ELF)
    CHECK_TYPE(NSO)
    CHECK_TYPE(NRO)
+    CHECK_TYPE(NCA)
 #undef CHECK_TYPE
@@ -57,6 +59,8 @@ FileType GuessFromExtension(const std::string& extension_) {
        return FileType::NRO;
    else if (extension == ".nso")
        return FileType::NSO;
+    else if (extension == ".nca")
+        return FileType::NCA;
    return FileType::Unknown;
 }
@@ -69,6 +73,8 @@ const char* GetFileTypeString(FileType type) {
        return "NRO";
    case FileType::NSO:
        return "NSO";
+    case FileType::NCA:
+        return "NCA";
    case FileType::DeconstructedRomDirectory:
        return "Directory";
    case FileType::Error:
@@ -104,6 +110,10 @@ static std::unique_ptr<AppLoader> GetFileLoader(FileUtil::IOFile&& file, FileTyp
    case FileType::NRO:
        return std::make_unique<AppLoader_NRO>(std::move(file), filepath);
+    // NX NCA file format.
+    case FileType::NCA:
+        return std::make_unique<AppLoader_NCA>(std::move(file), filepath);
    // NX deconstructed ROM directory.
    case FileType::DeconstructedRomDirectory:
        return std::make_unique<AppLoader_DeconstructedRomDirectory>(std::move(file), filepath);
diff --git a/src/core/loader/loader.h b/src/core/loader/loader.h
index b1aabb1cb..b76f7b13d 100644
--- a/src/core/loader/loader.h
+++ b/src/core/loader/loader.h
@@ -29,6 +29,7 @@ enum class FileType {
    ELF,
    NSO,
    NRO,
+    NCA,
    DeconstructedRomDirectory,
 };
diff --git a/src/core/loader/nca.cpp b/src/core/loader/nca.cpp
new file mode 100644
index 000000000..067945d46
--- /dev/null
+++ b/src/core/loader/nca.cpp
@@ -0,0 +1,303 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/file_util.h"
+#include "common/logging/log.h"
+#include "common/swap.h"
+#include "core/core.h"
+#include "core/file_sys/program_metadata.h"
+#include "core/file_sys/romfs_factory.h"
+#include "core/hle/kernel/process.h"
+#include "core/hle/kernel/resource_limit.h"
+#include "core/hle/service/filesystem/filesystem.h"
+#include "core/loader/nca.h"
+#include "core/loader/nso.h"
+#include "core/memory.h"
+namespace Loader {
+// Media offsets in headers are stored divided by 512 (0x200). Multiply by this to get the real
+// byte offset.
+constexpr u64 MEDIA_OFFSET_MULTIPLIER = 0x200;
+constexpr u64 SECTION_HEADER_SIZE = 0x200;
+constexpr u64 SECTION_HEADER_OFFSET = 0x400;
+/// Values of NcaHeader::content_type. AppLoader_NCA::IdentifyType only accepts Program.
+enum class NcaContentType : u8 { Program = 0, Meta = 1, Control = 2, Manual = 3, Data = 4 };
+/// Values of NcaSectionHeaderBlock::filesystem_type; Nca::Load uses it to decide how a section
+/// is handled.
+enum class NcaSectionFilesystemType : u8 { PFS0 = 0x2, ROMFS = 0x3 };
+/// One entry of the section table in the NCA header. Both offsets are stored in media units
+/// and must be multiplied by MEDIA_OFFSET_MULTIPLIER to get byte offsets into the file.
+struct NcaSectionTableEntry {
+    // Start of the section, in media units. Nca::Load treats a value of 0 as "no section".
+    u32_le media_offset;
+    // End of the section, in media units.
+    u32_le media_end_offset;
+    INSERT_PADDING_BYTES(0x8);
+};
+static_assert(sizeof(NcaSectionTableEntry) == 0x10, "NcaSectionTableEntry has incorrect size.");
+/// On-disk layout of the first 0x400 bytes of an NCA file. It is read verbatim from the file,
+/// so the member order and sizes must not change (enforced by the static_assert below).
+struct NcaHeader {
+    std::array<u8, 0x100> rsa_signature_1;
+    std::array<u8, 0x100> rsa_signature_2;
+    // Format magic; IsValidNca() accepts "NCA2" and "NCA3".
+    u32_le magic;
+    u8 is_system;
+    // Checked by AppLoader_NCA::IdentifyType, which only accepts NcaContentType::Program.
+    NcaContentType content_type;
+    u8 crypto_type;
+    u8 key_index;
+    u64_le size;
+    u64_le title_id;
+    INSERT_PADDING_BYTES(0x4);
+    u32_le sdk_version;
+    u8 crypto_type_2;
+    INSERT_PADDING_BYTES(15);
+    std::array<u8, 0x10> rights_id;
+    // Location of up to four sections; entry i pairs with the section header stored at
+    // SECTION_HEADER_OFFSET + i * SECTION_HEADER_SIZE.
+    std::array<NcaSectionTableEntry, 0x4> section_tables;
+    std::array<std::array<u8, 0x20>, 0x4> hash_tables;
+    std::array<std::array<u8, 0x10>, 0x4> key_area;
+    INSERT_PADDING_BYTES(0xC0);
+};
+static_assert(sizeof(NcaHeader) == 0x400, "NcaHeader has incorrect size.");
+/// First 8 bytes of a section header. Nca::Load reads this on its own first to find out which
+/// filesystem type the section holds.
+struct NcaSectionHeaderBlock {
+    INSERT_PADDING_BYTES(3);
+    NcaSectionFilesystemType filesystem_type;
+    u8 crypto_type;
+    INSERT_PADDING_BYTES(3);
+};
+static_assert(sizeof(NcaSectionHeaderBlock) == 0x8, "NcaSectionHeaderBlock has incorrect size.");
+/// Full 0x200-byte section header of a PFS0 section (same size as SECTION_HEADER_SIZE).
+/// Nca::Load only consumes header_block and pfs0_header_offset; the remaining fields are parsed
+/// but unused by the code in this file.
+struct Pfs0Superblock {
+    NcaSectionHeaderBlock header_block;
+    std::array<u8, 0x20> hash;
+    u32_le size;
+    INSERT_PADDING_BYTES(4);
+    u64_le hash_table_offset;
+    u64_le hash_table_size;
+    // Added to the section's byte offset to locate the PFS0 header inside the NCA.
+    u64_le pfs0_header_offset;
+    u64_le pfs0_size;
+    INSERT_PADDING_BYTES(432);
+};
+static_assert(sizeof(Pfs0Superblock) == 0x200, "Pfs0Superblock has incorrect size.");
+/// Returns true when the header carries one of the supported NCA magics ("NCA2" or "NCA3").
+static bool IsValidNca(const NcaHeader& header) {
+    const auto& magic = header.magic;
+    if (magic == Common::MakeMagic('N', 'C', 'A', '2'))
+        return true;
+    return magic == Common::MakeMagic('N', 'C', 'A', '3');
+}
+// TODO(DarkLordZach): Add support for encrypted.
+/// Parsed view of a (decrypted) NCA file: remembers where its PFS0 partitions and RomFS live
+/// and gives access to the files of the partition identified as the ExeFS.
+class Nca final {
+    // Successfully loaded PFS0 partitions, and the byte offset of each one inside the NCA.
+    // Both vectors are appended to together in Load(), so they share indices.
+    std::vector<FileSys::PartitionFilesystem> pfs;
+    std::vector<u64> pfs_offset;
+    // Byte offset and size of the RomFS section; both stay 0 when no RomFS section was found.
+    u64 romfs_offset = 0;
+    u64 romfs_size = 0;
+    // Index into `pfs` of the partition recognised as the ExeFS, if any.
+    boost::optional<u8> exefs_id = boost::none;
+    // Handle and path of the NCA file, taken over in Load().
+    FileUtil::IOFile file;
+    std::string path;
+    // Both return 0 when no ExeFS partition was identified.
+    u64 GetExeFsFileOffset(const std::string& file_name) const;
+    u64 GetExeFsFileSize(const std::string& file_name) const;
+public:
+    /// Parses the NCA headers from `file`; `path` is used to load the PFS0 partitions.
+    ResultStatus Load(FileUtil::IOFile&& file, std::string path);
+    /// Returns a copy of the PFS0 partition with the given index.
+    FileSys::PartitionFilesystem GetPfs(u8 id) const;
+    u64 GetRomFsOffset() const;
+    u64 GetRomFsSize() const;
+    /// Reads the named file out of the ExeFS partition into memory.
+    std::vector<u8> GetExeFsFile(const std::string& file_name);
+};
+/// Heuristic used to pick the ExeFS partition: the partition must report a non-zero size for
+/// both "main" and "main.npdm" (the two files switchbrew lists as mandatory for an ExeFS).
+static bool IsPfsExeFs(const FileSys::PartitionFilesystem& pfs) {
+    if (pfs.GetFileSize("main") > 0) {
+        return pfs.GetFileSize("main.npdm") > 0;
+    }
+    return false;
+}
+/**
+ * Takes ownership of the NCA file and parses its header and section headers.
+ * Records the RomFS location, loads every PFS0 partition that parses successfully and
+ * remembers which of them is the ExeFS.
+ * @param in_file Open handle to the NCA; moved into this object.
+ * @param in_path Path of the same file, used to load the PFS0 partitions.
+ * @return ResultStatus::Success on success, ResultStatus::Error when a header cannot be read,
+ *         ResultStatus::ErrorInvalidFormat when the magic is not a supported NCA magic.
+ */
+ResultStatus Nca::Load(FileUtil::IOFile&& in_file, std::string in_path) {
+    file = std::move(in_file);
+    path = std::move(in_path);
+    file.Seek(0, SEEK_SET);
+    NcaHeader header{};
+    if (sizeof(NcaHeader) != file.ReadBytes(&header, sizeof(NcaHeader))) {
+        // Do not keep parsing a partially read header.
+        NGLOG_CRITICAL(Loader, "File reader errored out during header read.");
+        return ResultStatus::Error;
+    }
+    if (!IsValidNca(header))
+        return ResultStatus::ErrorInvalidFormat;
+    // Walk every slot of the section table and skip the unused ones, so that slot i is always
+    // paired with its own section header even if the table has gaps.
+    for (size_t i = 0; i < header.section_tables.size(); ++i) {
+        const NcaSectionTableEntry& entry = header.section_tables[i];
+        const u64 section_offset = static_cast<u64>(entry.media_offset) * MEDIA_OFFSET_MULTIPLIER;
+        if (section_offset == 0)
+            continue;
+        const u64 section_header_pos = SECTION_HEADER_OFFSET + i * SECTION_HEADER_SIZE;
+        // Seek to beginning of this section.
+        file.Seek(section_header_pos, SEEK_SET);
+        NcaSectionHeaderBlock block{};
+        if (sizeof(NcaSectionHeaderBlock) != file.ReadBytes(&block, sizeof(NcaSectionHeaderBlock))) {
+            NGLOG_CRITICAL(Loader, "File reader errored out during header read.");
+            return ResultStatus::Error;
+        }
+        if (block.filesystem_type == NcaSectionFilesystemType::ROMFS) {
+            romfs_offset = section_offset;
+            romfs_size =
+                static_cast<u64>(entry.media_end_offset) * MEDIA_OFFSET_MULTIPLIER - romfs_offset;
+        } else if (block.filesystem_type == NcaSectionFilesystemType::PFS0) {
+            Pfs0Superblock sb{};
+            // Seek back to beginning of this section.
+            file.Seek(section_header_pos, SEEK_SET);
+            if (sizeof(Pfs0Superblock) != file.ReadBytes(&sb, sizeof(Pfs0Superblock))) {
+                NGLOG_CRITICAL(Loader, "File reader errored out during header read.");
+                return ResultStatus::Error;
+            }
+            const u64 offset = section_offset + sb.pfs0_header_offset;
+            FileSys::PartitionFilesystem npfs{};
+            const ResultStatus status = npfs.Load(path, offset);
+            // Partitions that fail to parse are skipped rather than failing the whole NCA.
+            if (status == ResultStatus::Success) {
+                pfs.emplace_back(std::move(npfs));
+                pfs_offset.emplace_back(offset);
+            }
+        }
+    }
+    // If several partitions look like an ExeFS, the last one wins.
+    for (size_t i = 0; i < pfs.size(); ++i) {
+        if (IsPfsExeFs(pfs[i]))
+            exefs_id = static_cast<u8>(i);
+    }
+    return ResultStatus::Success;
+}
+/// Returns a copy of the loaded PFS0 partition at index `id`.
+/// The index is not range-checked; callers must pass an index of a partition that was loaded.
+FileSys::PartitionFilesystem Nca::GetPfs(u8 id) const {
+    return pfs[id];
+}
+/// Byte offset of `file_name` inside the NCA, i.e. its offset within the ExeFS partition plus
+/// the partition's own offset. Returns 0 when no ExeFS partition was identified.
+u64 Nca::GetExeFsFileOffset(const std::string& file_name) const {
+    if (!exefs_id)
+        return 0;
+    const u8 index = *exefs_id;
+    return pfs[index].GetFileOffset(file_name) + pfs_offset[index];
+}
+/// Size of `file_name` as reported by the ExeFS partition, or 0 when no ExeFS partition was
+/// identified.
+u64 Nca::GetExeFsFileSize(const std::string& file_name) const {
+    if (!exefs_id)
+        return 0;
+    const u8 index = *exefs_id;
+    return pfs[index].GetFileSize(file_name);
+}
+/// Byte offset of the RomFS section inside the NCA; 0 if Load() found no RomFS section.
+u64 Nca::GetRomFsOffset() const {
+    return romfs_offset;
+}
+/// Size in bytes of the RomFS section; 0 if Load() found no RomFS section.
+u64 Nca::GetRomFsSize() const {
+    return romfs_size;
+}
+/**
+ * Reads a file of the ExeFS partition into memory.
+ * @param file_name Name of the file inside the ExeFS.
+ * @return The file contents, or an empty vector when the reported size is 0 (including when no
+ *         ExeFS partition was identified) or the file could not be read completely.
+ */
+std::vector<u8> Nca::GetExeFsFile(const std::string& file_name) {
+    const u64 size = GetExeFsFileSize(file_name);
+    if (size == 0)
+        return {};
+    std::vector<u8> out(size);
+    file.Seek(GetExeFsFileOffset(file_name), SEEK_SET);
+    if (file.ReadBytes(out.data(), size) != size) {
+        // A short read would otherwise hand zero-filled data to the caller.
+        NGLOG_CRITICAL(Loader, "File reader errored out while reading ExeFS file {}.", file_name);
+        return {};
+    }
+    return out;
+}
+/// Hands the open file to the AppLoader base and keeps the path for later use by Load() and
+/// ReadRomFS().
+AppLoader_NCA::AppLoader_NCA(FileUtil::IOFile&& file, std::string filepath)
+    : AppLoader(std::move(file)), filepath(std::move(filepath)) {}
+/// Reads the first 0x400 bytes of `file` and reports FileType::NCA when they form a supported
+/// NCA header whose content type is Program; every other outcome yields FileType::Error.
+FileType AppLoader_NCA::IdentifyType(FileUtil::IOFile& file, const std::string&) {
+    constexpr size_t header_size = 0x400;
+    std::array<u8, header_size> header_enc_array{};
+    file.Seek(0, SEEK_SET);
+    if (file.ReadBytes(header_enc_array.data(), header_size) != header_size)
+        return FileType::Error;
+    // TODO(DarkLordZach): Assuming everything is decrypted. Add crypto support.
+    NcaHeader header{};
+    std::memcpy(&header, header_enc_array.data(), sizeof(NcaHeader));
+    const bool is_program_nca =
+        IsValidNca(header) && header.content_type == NcaContentType::Program;
+    return is_program_nca ? FileType::NCA : FileType::Error;
+}
+/**
+ * Loads the NCA into `process`: parses the container, loads the program metadata from
+ * "main.npdm", maps every ExeFS module that is present, starts the process and registers the
+ * RomFS (if any).
+ * @return ErrorAlreadyLoaded on a second call, Error if the file is not open,
+ *         ErrorUnsupportedArch for 32-bit titles, any error reported by Nca::Load or the
+ *         metadata loader, otherwise Success.
+ */
+ResultStatus AppLoader_NCA::Load(Kernel::SharedPtr<Kernel::Process>& process) {
+    if (is_loaded) {
+        return ResultStatus::ErrorAlreadyLoaded;
+    }
+    if (!file.IsOpen()) {
+        return ResultStatus::Error;
+    }
+    nca = std::make_unique<Nca>();
+    // NOTE(review): `file` is moved into the Nca object here, while GetFileType() in nca.h still
+    // passes the `file` member to IdentifyType() - confirm nothing relies on it afterwards.
+    ResultStatus result = nca->Load(std::move(file), filepath);
+    if (result != ResultStatus::Success) {
+        return result;
+    }
+    result = metadata.Load(nca->GetExeFsFile("main.npdm"));
+    if (result != ResultStatus::Success) {
+        return result;
+    }
+    metadata.Print();
+    const FileSys::ProgramAddressSpaceType arch_bits{metadata.GetAddressSpaceType()};
+    if (arch_bits == FileSys::ProgramAddressSpaceType::Is32Bit) {
+        return ResultStatus::ErrorUnsupportedArch;
+    }
+    // Modules are placed back to back starting at PROCESS_IMAGE_VADDR, in the order listed.
+    VAddr next_load_addr{Memory::PROCESS_IMAGE_VADDR};
+    for (const auto& module : {"rtld", "main", "subsdk0", "subsdk1", "subsdk2", "subsdk3",
+                               "subsdk4", "subsdk5", "subsdk6", "subsdk7", "sdk"}) {
+        const VAddr load_addr = next_load_addr;
+        next_load_addr = AppLoader_NSO::LoadModule(module, nca->GetExeFsFile(module), load_addr);
+        if (next_load_addr) {
+            NGLOG_DEBUG(Loader, "loaded module {} @ 0x{:X}", module, load_addr);
+        } else {
+            // LoadModule returned 0 (nothing was loaded): reuse the same address for the next
+            // module.
+            next_load_addr = load_addr;
+        }
+    }
+    process->program_id = metadata.GetTitleID();
+    process->svc_access_mask.set();
+    process->address_mappings = default_address_mappings;
+    process->resource_limit =
+        Kernel::ResourceLimit::GetForCategory(Kernel::ResourceLimitCategory::APPLICATION);
+    process->Run(Memory::PROCESS_IMAGE_VADDR, metadata.GetMainThreadPriority(),
+                 metadata.GetMainThreadStackSize());
+    // Only expose a RomFS filesystem when the NCA actually contains a RomFS section.
+    if (nca->GetRomFsSize() > 0)
+        Service::FileSystem::RegisterFileSystem(std::make_unique<FileSys::RomFS_Factory>(*this),
+                                                Service::FileSystem::Type::RomFS);
+    is_loaded = true;
+    return ResultStatus::Success;
+}
+/**
+ * Provides access to the RomFS section of the NCA.
+ * @param romfs_file Receives a freshly opened handle to the NCA file.
+ * @param offset Receives the byte offset of the RomFS inside that file.
+ * @param size Receives the size of the RomFS in bytes.
+ * @return ErrorNotUsed when the NCA has not been loaded or has no RomFS, Error when the file
+ *         cannot be reopened, otherwise Success. The outputs are only written on Success.
+ */
+ResultStatus AppLoader_NCA::ReadRomFS(std::shared_ptr<FileUtil::IOFile>& romfs_file, u64& offset,
+                                      u64& size) {
+    // `nca` is only created by Load(); guard against being called before that.
+    if (!nca || nca->GetRomFsSize() == 0) {
+        NGLOG_DEBUG(Loader, "No RomFS available");
+        return ResultStatus::ErrorNotUsed;
+    }
+    auto reopened_file = std::make_shared<FileUtil::IOFile>(filepath, "rb");
+    if (!reopened_file->IsOpen()) {
+        NGLOG_CRITICAL(Loader, "Failed to reopen {} for RomFS access.", filepath);
+        return ResultStatus::Error;
+    }
+    romfs_file = std::move(reopened_file);
+    offset = nca->GetRomFsOffset();
+    size = nca->GetRomFsSize();
+    NGLOG_DEBUG(Loader, "RomFS offset:           0x{:016X}", offset);
+    NGLOG_DEBUG(Loader, "RomFS size:             0x{:016X}", size);
+    return ResultStatus::Success;
+}
+// Defined here rather than in nca.h: the header only forward-declares Nca, and the
+// std::unique_ptr<Nca> member needs the complete type where the destructor is defined.
+AppLoader_NCA::~AppLoader_NCA() = default;
+} // namespace Loader
diff --git a/src/core/loader/nca.h b/src/core/loader/nca.h
new file mode 100644
index 000000000..3b6c451d0
--- /dev/null
+++ b/src/core/loader/nca.h
@@ -0,0 +1,49 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#pragma once
+#include <string>
+#include "common/common_types.h"
+#include "core/file_sys/partition_filesystem.h"
+#include "core/file_sys/program_metadata.h"
+#include "core/hle/kernel/kernel.h"
+#include "core/loader/loader.h"
+namespace Loader {
+class Nca;
+/// Loads an NCA file
+class AppLoader_NCA final : public AppLoader {
+public:
+    /// Takes ownership of the open file; `filepath` is the path it was opened from.
+    AppLoader_NCA(FileUtil::IOFile&& file, std::string filepath);
+    /**
+     * Returns the type of the file
+     * @param file FileUtil::IOFile open file
+     * @param filepath Path of the file that we are opening.
+     * @return FileType found, or FileType::Error if this loader doesn't know it
+     */
+    static FileType IdentifyType(FileUtil::IOFile& file, const std::string& filepath);
+    FileType GetFileType() override {
+        return IdentifyType(file, filepath);
+    }
+    /// Loads the NCA's program into `process` and starts it.
+    ResultStatus Load(Kernel::SharedPtr<Kernel::Process>& process) override;
+    /// Reports where the RomFS lives inside the NCA and opens a new handle to the file.
+    ResultStatus ReadRomFS(std::shared_ptr<FileUtil::IOFile>& romfs_file, u64& offset,
+                           u64& size) override;
+    ~AppLoader_NCA();
+private:
+    // Path of the NCA; used to load its partitions and to reopen it for RomFS access.
+    std::string filepath;
+    // Program metadata parsed from "main.npdm" during Load().
+    FileSys::ProgramMetadata metadata;
+    // Parsed container; created by Load(), null before that.
+    std::unique_ptr<Nca> nca;
+};
+} // namespace Loader
diff --git a/src/core/loader/nso.cpp b/src/core/loader/nso.cpp
index 01be9e217..845ed7e90 100644
--- a/src/core/loader/nso.cpp
+++ b/src/core/loader/nso.cpp
@@ -66,8 +66,22 @@ FileType AppLoader_NSO::IdentifyType(FileUtil::IOFile& file, const std::string&)
    return FileType::Error;
 }
+/// LZ4-decompresses one NSO segment. The output buffer is sized from header.size, and the
+/// function asserts that exactly that many bytes were produced.
+static std::vector<u8> DecompressSegment(const std::vector<u8>& compressed_data,
+                                         const NsoSegmentHeader& header) {
+    std::vector<u8> uncompressed_data(header.size);
+    const char* const source = reinterpret_cast<const char*>(compressed_data.data());
+    char* const destination = reinterpret_cast<char*>(uncompressed_data.data());
+    const int bytes_uncompressed =
+        LZ4_decompress_safe(source, destination, compressed_data.size(), header.size);
+    ASSERT_MSG(bytes_uncompressed == header.size && bytes_uncompressed == uncompressed_data.size(),
+               "{} != {} != {}", bytes_uncompressed, header.size, uncompressed_data.size());
+    return uncompressed_data;
+}
 static std::vector<u8> ReadSegment(FileUtil::IOFile& file, const NsoSegmentHeader& header,
-                                   int compressed_size) {
+                                   size_t compressed_size) {
    std::vector<u8> compressed_data;
    compressed_data.resize(compressed_size);
@@ -77,22 +91,65 @@ static std::vector<u8> ReadSegment(FileUtil::IOFile& file, const NsoSegmentHeade
        return {};
    }
-    std::vector<u8> uncompressed_data;
+    return DecompressSegment(compressed_data, header);
-    uncompressed_data.resize(header.size);
-    const int bytes_uncompressed = LZ4_decompress_safe(
-        reinterpret_cast<const char*>(compressed_data.data()),
-        reinterpret_cast<char*>(uncompressed_data.data()), compressed_size, header.size);
-    ASSERT_MSG(bytes_uncompressed == header.size && bytes_uncompressed == uncompressed_data.size(),
-               "{} != {} != {}", bytes_uncompressed, header.size, uncompressed_data.size());
-    return uncompressed_data;
 }
 static constexpr u32 PageAlignSize(u32 size) {
    return (size + Memory::PAGE_MASK) & ~Memory::PAGE_MASK;
 }
+/**
+ * Loads an NSO module that is already in memory into the current process.
+ * @param name Name given to the resulting code set.
+ * @param file_data Complete contents of the NSO file.
+ * @param load_base Virtual address at which the module is mapped.
+ * @return Address directly after the loaded image, or 0 when `file_data` is not a valid NSO
+ *         (too small, wrong magic, or segment/MOD data pointing outside of the available bytes).
+ */
+VAddr AppLoader_NSO::LoadModule(const std::string& name, const std::vector<u8>& file_data,
+                                VAddr load_base) {
+    if (file_data.size() < sizeof(NsoHeader))
+        return {};
+    NsoHeader nso_header;
+    std::memcpy(&nso_header, file_data.data(), sizeof(NsoHeader));
+    if (nso_header.magic != Common::MakeMagic('N', 'S', 'O', '0'))
+        return {};
+    // Build program image
+    Kernel::SharedPtr<Kernel::CodeSet> codeset = Kernel::CodeSet::Create("");
+    std::vector<u8> program_image;
+    for (size_t i = 0; i < nso_header.segments.size(); ++i) {
+        const size_t segment_offset = nso_header.segments[i].offset;
+        const size_t compressed_size = nso_header.segments_compressed_size[i];
+        // The offsets come straight from the file; never read past the supplied buffer.
+        if (segment_offset > file_data.size() ||
+            compressed_size > file_data.size() - segment_offset) {
+            return {};
+        }
+        const u8* const segment_begin = file_data.data() + segment_offset;
+        const std::vector<u8> compressed_data(segment_begin, segment_begin + compressed_size);
+        std::vector<u8> data = DecompressSegment(compressed_data, nso_header.segments[i]);
+        program_image.resize(nso_header.segments[i].location);
+        program_image.insert(program_image.end(), data.begin(), data.end());
+        codeset->segments[i].addr = nso_header.segments[i].location;
+        codeset->segments[i].offset = nso_header.segments[i].location;
+        codeset->segments[i].size = PageAlignSize(static_cast<u32>(data.size()));
+    }
+    // MOD header pointer is at .text offset + 4
+    if (program_image.size() < 4 + sizeof(u32))
+        return {};
+    u32 module_offset;
+    std::memcpy(&module_offset, program_image.data() + 4, sizeof(u32));
+    // Read MOD header
+    ModHeader mod_header{};
+    // Default .bss to size in segment header if MOD0 section doesn't exist
+    u32 bss_size{PageAlignSize(nso_header.segments[2].bss_size)};
+    // Only read the MOD header when it lies completely inside the image; otherwise the
+    // zero-initialized header fails the magic check below and the default .bss size is used.
+    const size_t mod_header_begin = module_offset;
+    if (mod_header_begin <= program_image.size() &&
+        sizeof(ModHeader) <= program_image.size() - mod_header_begin) {
+        std::memcpy(&mod_header, program_image.data() + mod_header_begin, sizeof(ModHeader));
+    }
+    const bool has_mod_header{mod_header.magic == Common::MakeMagic('M', 'O', 'D', '0')};
+    if (has_mod_header) {
+        // Resize program image to include .bss section and page align each section
+        bss_size = PageAlignSize(mod_header.bss_end_offset - mod_header.bss_start_offset);
+    }
+    codeset->data.size += bss_size;
+    const u32 image_size{PageAlignSize(static_cast<u32>(program_image.size()) + bss_size)};
+    program_image.resize(image_size);
+    // Load codeset for current process
+    codeset->name = name;
+    codeset->memory = std::make_shared<std::vector<u8>>(std::move(program_image));
+    Core::CurrentProcess()->LoadModule(codeset, load_base);
+    return load_base + image_size;
+}
 VAddr AppLoader_NSO::LoadModule(const std::string& path, VAddr load_base) {
    FileUtil::IOFile file(path, "rb");
    if (!file.IsOpen()) {
diff --git a/src/core/loader/nso.h b/src/core/loader/nso.h
index 1ae30a824..386f4d39a 100644
--- a/src/core/loader/nso.h
+++ b/src/core/loader/nso.h
@@ -29,6 +29,9 @@ public:
        return IdentifyType(file, filepath);
    }
+    /// Loads an NSO module from an in-memory copy of the file instead of from a path.
+    /// Returns the address directly after the loaded image, or 0 if `file_data` is too small
+    /// for an NSO header or does not carry the NSO0 magic.
+    static VAddr LoadModule(const std::string& name, const std::vector<u8>& file_data,
+                            VAddr load_base);
    static VAddr LoadModule(const std::string& path, VAddr load_base);
    ResultStatus Load(Kernel::SharedPtr<Kernel::Process>& process) override;
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 3b81acd63..f070dee7d 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -241,6 +241,10 @@ bool IsValidVirtualAddress(const VAddr vaddr) {
    return IsValidVirtualAddress(*Core::CurrentProcess(), vaddr);
 }
+/// True when `vaddr` lies in the half-open range [KERNEL_REGION_VADDR, KERNEL_REGION_END).
+bool IsKernelVirtualAddress(const VAddr vaddr) {
+    const bool at_or_above_start = vaddr >= KERNEL_REGION_VADDR;
+    const bool below_end = vaddr < KERNEL_REGION_END;
+    return at_or_above_start && below_end;
+}
 bool IsValidPhysicalAddress(const PAddr paddr) {
    return GetPhysicalPointer(paddr) != nullptr;
 }
diff --git a/src/core/memory.h b/src/core/memory.h
index 3f56a2c6a..8d5d017a4 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -188,6 +188,11 @@ enum : VAddr {
    MAP_REGION_VADDR = NEW_MAP_REGION_VADDR_END,
    MAP_REGION_SIZE = 0x1000000000,
    MAP_REGION_VADDR_END = MAP_REGION_VADDR + MAP_REGION_SIZE,
+    /// Kernel Virtual Address Range
+    KERNEL_REGION_VADDR = 0xFFFFFF8000000000,
+    KERNEL_REGION_SIZE = 0x7FFFE00000,
+    KERNEL_REGION_END = KERNEL_REGION_VADDR + KERNEL_REGION_SIZE,
 };
 /// Currently active page table
@@ -197,6 +202,8 @@ PageTable* GetCurrentPageTable();
 /// Determines if the given VAddr is valid for the specified process.
 bool IsValidVirtualAddress(const Kernel::Process& process, const VAddr vaddr);
 bool IsValidVirtualAddress(const VAddr addr);
+/// Determines if the given VAddr is a kernel address
+bool IsKernelVirtualAddress(const VAddr addr);
 bool IsValidPhysicalAddress(const PAddr addr);
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 281810357..c6431e722 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -9,6 +9,8 @@ add_library(video_core STATIC
    engines/maxwell_3d.h
    engines/maxwell_compute.cpp
    engines/maxwell_compute.h
+    engines/maxwell_dma.cpp
+    engines/maxwell_dma.h
    engines/shader_bytecode.h
    gpu.cpp
    gpu.h
@@ -39,6 +41,8 @@ add_library(video_core STATIC
    renderer_opengl/maxwell_to_gl.h
    renderer_opengl/renderer_opengl.cpp
    renderer_opengl/renderer_opengl.h
+    textures/astc.cpp
+    textures/astc.h
    textures/decoders.cpp
    textures/decoders.h
    textures/texture.h
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index d72d6f760..cec9cb9f3 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -16,6 +16,7 @@
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
+#include "video_core/engines/maxwell_dma.h"
 #include "video_core/gpu.h"
 #include "video_core/renderer_base.h"
 #include "video_core/video_core.h"
@@ -60,8 +61,11 @@ void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params)
    case EngineID::MAXWELL_COMPUTE_B:
        maxwell_compute->WriteReg(method, value);
        break;
+    case EngineID::MAXWELL_DMA_COPY_A:
+        maxwell_dma->WriteReg(method, value);
+        break;
    default:
-        UNIMPLEMENTED();
+        UNIMPLEMENTED_MSG("Unimplemented engine");
    }
 }
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index bbba8e380..9382a75e5 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -55,8 +55,10 @@ public:
        virtual ~BreakPointObserver() {
            auto context = context_weak.lock();
            if (context) {
-                std::unique_lock<std::mutex> lock(context->breakpoint_mutex);
+                {
-                context->breakpoint_observers.remove(this);
+                    std::unique_lock<std::mutex> lock(context->breakpoint_mutex);
+                    context->breakpoint_observers.remove(this);
+                }
                // If we are the last observer to be destroyed, tell the debugger context that
                // it is free to continue. In particular, this is required for a proper yuzu
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 6b9382f06..998b7c843 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -47,6 +47,7 @@ void Fermi2D::HandleSurfaceCopy() {
    if (regs.src.linear == regs.dst.linear) {
        // If the input layout and the output layout are the same, just perform a raw copy.
+        ASSERT(regs.src.BlockHeight() == regs.dst.BlockHeight());
        Memory::CopyBlock(dest_cpu, source_cpu,
                          src_bytes_per_pixel * regs.dst.width * regs.dst.height);
        return;
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 86e9dc998..93c43c8cb 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -328,8 +328,9 @@ std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderSt
        Texture::FullTextureInfo tex_info{};
        // TODO(Subv): Use the shader to determine which textures are actually accessed.
-        tex_info.index = (current_texture - tex_info_buffer.address - TextureInfoOffset) /
+        tex_info.index =
-                         sizeof(Texture::TextureHandle);
+            static_cast<u32>(current_texture - tex_info_buffer.address - TextureInfoOffset) /
+            sizeof(Texture::TextureHandle);
        // Load the TIC data.
        if (tex_handle.tic_id != 0) {
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
new file mode 100644
index 000000000..442138988
--- /dev/null
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -0,0 +1,69 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#include "core/memory.h"
+#include "video_core/engines/maxwell_dma.h"
+#include "video_core/textures/decoders.h"
+namespace Tegra {
+namespace Engines {
+// Stores a reference to the GPU memory manager; HandleCopy() uses it to translate addresses.
+MaxwellDMA::MaxwellDMA(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
+// Stores `value` in the register selected by `method` and, when that register is `exec`,
+// performs the copy described by the current register state.
+void MaxwellDMA::WriteReg(u32 method, u32 value) {
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid MaxwellDMA register, increase the size of the Regs structure");
+    // Every write is recorded in the raw register array before any side effect runs.
+    regs.reg_array[method] = value;
+    // Index of the exec register inside reg_array, derived from its byte offset within Regs.
+    constexpr u32 exec_method = static_cast<u32>(offsetof(Regs, exec) / sizeof(u32));
+    // exec is the only register whose write has a side effect.
+    if (method == exec_method) {
+        HandleCopy();
+    }
+}
+// Copies between a tiled surface and a linear buffer in the direction selected by the exec
+// register, swizzling or deswizzling the data on the way.
+void MaxwellDMA::HandleCopy() {
+    NGLOG_WARNING(HW_GPU, "Requested a DMA copy");
+    // Translate both endpoints from GPU addresses to CPU addresses.
+    const GPUVAddr src_gpu_addr = regs.src_address.Address();
+    const GPUVAddr dst_gpu_addr = regs.dst_address.Address();
+    const VAddr src_cpu_addr = *memory_manager.GpuToCpuAddress(src_gpu_addr);
+    const VAddr dst_cpu_addr = *memory_manager.GpuToCpuAddress(dst_gpu_addr);
+    // TODO(Subv): Perform more research and implement all features of this engine.
+    // Only the single configuration checked by the assertions below is handled so far.
+    ASSERT(regs.exec.enable_swizzle == 0);
+    ASSERT(regs.exec.enable_2d == 1);
+    ASSERT(regs.exec.query_mode == Regs::QueryMode::None);
+    ASSERT(regs.exec.query_intr == Regs::QueryIntr::None);
+    ASSERT(regs.exec.copy_mode == Regs::CopyMode::Unk2);
+    ASSERT(regs.src_params.pos_x == 0);
+    ASSERT(regs.src_params.pos_y == 0);
+    ASSERT(regs.dst_params.pos_x == 0);
+    ASSERT(regs.dst_params.pos_y == 0);
+    // Exactly one side is expected to be linear.
+    ASSERT(regs.exec.is_dst_linear != regs.exec.is_src_linear);
+    u8* const src_ptr = Memory::GetPointer(src_cpu_addr);
+    u8* const dst_ptr = Memory::GetPointer(dst_cpu_addr);
+    // A tiled input with a linear output is deswizzled; in every other case the input is
+    // treated as linear and swizzled into the tiled output.
+    const bool unswizzle = regs.exec.is_dst_linear && !regs.exec.is_src_linear;
+    // The tiled side provides the dimensions and the block height for either direction.
+    const auto& tiled_params = unswizzle ? regs.src_params : regs.dst_params;
+    u8* const tiled_ptr = unswizzle ? src_ptr : dst_ptr;
+    u8* const linear_ptr = unswizzle ? dst_ptr : src_ptr;
+    Texture::CopySwizzledData(tiled_params.size_x, tiled_params.size_y, 1, 1, tiled_ptr, linear_ptr,
+                              unswizzle, tiled_params.BlockHeight());
+}
+} // namespace Engines
+} // namespace Tegra
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
new file mode 100644
index 000000000..905749bde
--- /dev/null
+++ b/src/video_core/engines/maxwell_dma.h
@@ -0,0 +1,155 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#pragma once
+#include <array>
+#include "common/assert.h"
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+namespace Tegra {
+namespace Engines {
+class MaxwellDMA final { // GPU engine selected via EngineID::MAXWELL_DMA_COPY_A in GPU::WriteReg
+public:
+    explicit MaxwellDMA(MemoryManager& memory_manager);
+    ~MaxwellDMA() = default;
+    /// Write the value to the register identified by method; writing exec starts the copy.
+    void WriteReg(u32 method, u32 value);
+    struct Regs {
+        static constexpr size_t NUM_REGS = 0x1D6; // size of reg_array, in 32-bit registers
+        struct Parameters { // six 32-bit words; used for both dst_params and src_params
+            union {
+                BitField<0, 4, u32> block_depth;
+                BitField<4, 4, u32> block_height; // exponent used by BlockHeight()
+                BitField<8, 4, u32> block_width;
+            };
+            u32 size_x;
+            u32 size_y;
+            u32 size_z;
+            u32 pos_z;
+            union {
+                BitField<0, 16, u32> pos_x;
+                BitField<16, 16, u32> pos_y;
+            };
+            u32 BlockHeight() const {
+                return 1 << block_height; // 2 to the power of the block_height field
+            }
+        };
+        static_assert(sizeof(Parameters) == 24, "Parameters has wrong size");
+        enum class CopyMode : u32 {
+            None = 0,
+            Unk1 = 1,
+            Unk2 = 2, // the only mode HandleCopy() accepts
+        };
+        enum class QueryMode : u32 {
+            None = 0, // the only mode HandleCopy() accepts
+            Short = 1,
+            Long = 2,
+        };
+        enum class QueryIntr : u32 {
+            None = 0, // the only mode HandleCopy() accepts
+            Block = 1,
+            NonBlock = 2,
+        };
+        union {
+            struct {
+                INSERT_PADDING_WORDS(0xC0); // registers below 0xC0 have no named fields
+                struct {
+                    union {
+                        BitField<0, 2, CopyMode> copy_mode;
+                        BitField<2, 1, u32> flush;
+                        BitField<3, 2, QueryMode> query_mode;
+                        BitField<5, 2, QueryIntr> query_intr;
+                        BitField<7, 1, u32> is_src_linear;
+                        BitField<8, 1, u32> is_dst_linear;
+                        BitField<9, 1, u32> enable_2d;
+                        BitField<10, 1, u32> enable_swizzle;
+                    };
+                } exec; // a write to this register makes WriteReg() call HandleCopy()
+                INSERT_PADDING_WORDS(0x3F);
+                struct {
+                    u32 address_high; // upper 32 bits of the source address
+                    u32 address_low;  // lower 32 bits of the source address
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } src_address;
+                struct {
+                    u32 address_high; // upper 32 bits of the destination address
+                    u32 address_low;  // lower 32 bits of the destination address
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } dst_address;
+                u32 src_pitch; // src_pitch through y_count are stored but not read by HandleCopy()
+                u32 dst_pitch;
+                u32 x_count;
+                u32 y_count;
+                INSERT_PADDING_WORDS(0xBB);
+                Parameters dst_params;
+                INSERT_PADDING_WORDS(1);
+                Parameters src_params;
+                INSERT_PADDING_WORDS(0x13);
+            };
+            std::array<u32, NUM_REGS> reg_array; // raw view of the same storage, indexed by method
+        };
+    } regs{};
+    MemoryManager& memory_manager; // used by HandleCopy() to translate GPU addresses
+private:
+    /// Performs the copy from the source buffer to the destination buffer as configured in the
+    /// registers.
+    void HandleCopy();
+};
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(MaxwellDMA::Regs, field_name) == position * 4,                          \
+                  "Field " #field_name " has invalid position")
+ASSERT_REG_POSITION(exec, 0xC0); // positions are register indices; the macro scales them to bytes
+ASSERT_REG_POSITION(src_address, 0x100);
+ASSERT_REG_POSITION(dst_address, 0x102);
+ASSERT_REG_POSITION(src_pitch, 0x104);
+ASSERT_REG_POSITION(dst_pitch, 0x105);
+ASSERT_REG_POSITION(x_count, 0x106);
+ASSERT_REG_POSITION(y_count, 0x107);
+ASSERT_REG_POSITION(dst_params, 0x1C3);
+ASSERT_REG_POSITION(src_params, 0x1CA);
+#undef ASSERT_REG_POSITION // the helper macro is not visible past this point
+} // namespace Engines
+} // namespace Tegra
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index d748026b8..cb4db0679 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -213,16 +213,15 @@ union Instruction {
    BitField<28, 8, Register> gpr28;
    BitField<39, 8, Register> gpr39;
    BitField<48, 16, u64> opcode;
-    BitField<50, 1, u64> saturate_a;
    union {
        BitField<20, 19, u64> imm20_19;
-        BitField<20, 32, u64> imm20_32;
+        BitField<20, 32, s64> imm20_32;
        BitField<45, 1, u64> negate_b;
        BitField<46, 1, u64> abs_a;
        BitField<48, 1, u64> negate_a;
        BitField<49, 1, u64> abs_b;
-        BitField<50, 1, u64> abs_d;
+        BitField<50, 1, u64> saturate_d;
        BitField<56, 1, u64> negate_imm;
        union {
@@ -231,10 +230,18 @@ union Instruction {
        } fmnmx;
        union {
+            BitField<39, 1, u64> invert_a;
+            BitField<40, 1, u64> invert_b;
+            BitField<41, 2, LogicOperation> operation;
+            BitField<44, 2, u64> unk44;
+            BitField<48, 3, Pred> pred48;
+        } lop;
+        union {
            BitField<53, 2, LogicOperation> operation;
            BitField<55, 1, u64> invert_a;
            BitField<56, 1, u64> invert_b;
-        } lop;
+        } lop32i;
        float GetImm20_19() const {
            float result{};
@@ -247,7 +254,7 @@ union Instruction {
        float GetImm20_32() const {
            float result{};
-            u32 imm{static_cast<u32>(imm20_32)};
+            s32 imm{static_cast<s32>(imm20_32)};
            std::memcpy(&result, &imm, sizeof(imm));
            return result;
        }
@@ -271,6 +278,11 @@ union Instruction {
    } alu_integer;
    union {
+        BitField<54, 1, u64> saturate;
+        BitField<56, 1, u64> negate_a;
+    } iadd32i;
+    union {
        BitField<20, 8, u64> shift_position;
        BitField<28, 8, u64> shift_length;
        BitField<48, 1, u64> negate_b;
@@ -330,7 +342,17 @@ union Instruction {
    } fset;
    union {
-        BitField<10, 2, Register::Size> size;
+        BitField<39, 3, u64> pred39;
+        BitField<42, 1, u64> neg_pred;
+        BitField<44, 1, u64> bf;
+        BitField<45, 2, PredOperation> op;
+        BitField<48, 1, u64> is_signed;
+        BitField<49, 3, PredCondition> cond;
+    } iset;
+    union {
+        BitField<8, 2, Register::Size> dest_size;
+        BitField<10, 2, Register::Size> src_size;
        BitField<12, 1, u64> is_output_signed;
        BitField<13, 1, u64> is_input_signed;
        BitField<41, 2, u64> selector;
@@ -350,7 +372,7 @@ union Instruction {
        BitField<31, 4, u64> component_mask;
        bool IsComponentEnabled(size_t component) const {
-            return ((1 << component) & component_mask) != 0;
+            return ((1ull << component) & component_mask) != 0;
        }
    } tex;
@@ -369,7 +391,7 @@ union Instruction {
            ASSERT(component_mask_selector < mask.size());
-            return ((1 << component) & mask[component_mask_selector]) != 0;
+            return ((1ull << component) & mask[component_mask_selector]) != 0;
        }
    } texs;
@@ -442,6 +464,7 @@ public:
        IADD_C,
        IADD_R,
        IADD_IMM,
+        IADD32I,
        ISCADD_C, // Scale and Add
        ISCADD_R,
        ISCADD_IMM,
@@ -461,6 +484,9 @@ public:
        I2I_C,
        I2I_R,
        I2I_IMM,
+        LOP_C,
+        LOP_R,
+        LOP_IMM,
        LOP32I,
        MOV_C,
        MOV_R,
@@ -487,6 +513,9 @@ public:
        ISETP_C,
        ISETP_IMM,
        ISETP_R,
+        ISET_R,
+        ISET_C,
+        ISET_IMM,
        PSETP,
        XMAD_IMM,
        XMAD_CR,
@@ -497,15 +526,17 @@ public:
    enum class Type {
        Trivial,
        Arithmetic,
+        ArithmeticImmediate,
        ArithmeticInteger,
+        ArithmeticIntegerImmediate,
        Bfe,
-        Logic,
        Shift,
        Ffma,
        Flow,
        Memory,
        FloatSet,
        FloatSetPredicate,
+        IntegerSet,
        IntegerSetPredicate,
        PredicateSetPredicate,
        Conversion,
@@ -625,10 +656,11 @@ private:
            INST("0100110001101---", Id::FMUL_C, Type::Arithmetic, "FMUL_C"),
            INST("0101110001101---", Id::FMUL_R, Type::Arithmetic, "FMUL_R"),
            INST("0011100-01101---", Id::FMUL_IMM, Type::Arithmetic, "FMUL_IMM"),
-            INST("00011110--------", Id::FMUL32_IMM, Type::Arithmetic, "FMUL32_IMM"),
+            INST("00011110--------", Id::FMUL32_IMM, Type::ArithmeticImmediate, "FMUL32_IMM"),
            INST("0100110000010---", Id::IADD_C, Type::ArithmeticInteger, "IADD_C"),
            INST("0101110000010---", Id::IADD_R, Type::ArithmeticInteger, "IADD_R"),
            INST("0011100-00010---", Id::IADD_IMM, Type::ArithmeticInteger, "IADD_IMM"),
+            INST("0001110---------", Id::IADD32I, Type::ArithmeticIntegerImmediate, "IADD32I"),
            INST("0100110000011---", Id::ISCADD_C, Type::ArithmeticInteger, "ISCADD_C"),
            INST("0101110000011---", Id::ISCADD_R, Type::ArithmeticInteger, "ISCADD_R"),
            INST("0011100-00011---", Id::ISCADD_IMM, Type::ArithmeticInteger, "ISCADD_IMM"),
@@ -645,7 +677,7 @@ private:
            INST("0100110010011---", Id::MOV_C, Type::Arithmetic, "MOV_C"),
            INST("0101110010011---", Id::MOV_R, Type::Arithmetic, "MOV_R"),
            INST("0011100-10011---", Id::MOV_IMM, Type::Arithmetic, "MOV_IMM"),
-            INST("000000010000----", Id::MOV32_IMM, Type::Arithmetic, "MOV32_IMM"),
+            INST("000000010000----", Id::MOV32_IMM, Type::ArithmeticImmediate, "MOV32_IMM"),
            INST("0100110001100---", Id::FMNMX_C, Type::Arithmetic, "FMNMX_C"),
            INST("0101110001100---", Id::FMNMX_R, Type::Arithmetic, "FMNMX_R"),
            INST("0011100-01100---", Id::FMNMX_IMM, Type::Arithmetic, "FMNMX_IMM"),
@@ -655,7 +687,10 @@ private:
            INST("0100110000000---", Id::BFE_C, Type::Bfe, "BFE_C"),
            INST("0101110000000---", Id::BFE_R, Type::Bfe, "BFE_R"),
            INST("0011100-00000---", Id::BFE_IMM, Type::Bfe, "BFE_IMM"),
-            INST("000001----------", Id::LOP32I, Type::Logic, "LOP32I"),
+            INST("0100110001000---", Id::LOP_C, Type::ArithmeticInteger, "LOP_C"),
+            INST("0101110001000---", Id::LOP_R, Type::ArithmeticInteger, "LOP_R"),
+            INST("0011100001000---", Id::LOP_IMM, Type::ArithmeticInteger, "LOP_IMM"),
+            INST("000001----------", Id::LOP32I, Type::ArithmeticIntegerImmediate, "LOP32I"),
            INST("0100110001001---", Id::SHL_C, Type::Shift, "SHL_C"),
            INST("0101110001001---", Id::SHL_R, Type::Shift, "SHL_R"),
            INST("0011100-01001---", Id::SHL_IMM, Type::Shift, "SHL_IMM"),
@@ -677,6 +712,9 @@ private:
            INST("010010110110----", Id::ISETP_C, Type::IntegerSetPredicate, "ISETP_C"),
            INST("010110110110----", Id::ISETP_R, Type::IntegerSetPredicate, "ISETP_R"),
            INST("0011011-0110----", Id::ISETP_IMM, Type::IntegerSetPredicate, "ISETP_IMM"),
+            INST("010110110101----", Id::ISET_R, Type::IntegerSet, "ISET_R"),
+            INST("010010110101----", Id::ISET_C, Type::IntegerSet, "ISET_C"),
+            INST("0011011-0101----", Id::ISET_IMM, Type::IntegerSet, "ISET_IMM"),
            INST("0101000010010---", Id::PSETP, Type::PredicateSetPredicate, "PSETP"),
            INST("0011011-00------", Id::XMAD_IMM, Type::Arithmetic, "XMAD_IMM"),
            INST("0100111---------", Id::XMAD_CR, Type::Arithmetic, "XMAD_CR"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 66351fe6e..e36483145 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -5,6 +5,7 @@
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
+#include "video_core/engines/maxwell_dma.h"
 #include "video_core/gpu.h"
 namespace Tegra {
@@ -14,6 +15,7 @@ GPU::GPU() {
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(*memory_manager);
    fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
    maxwell_compute = std::make_unique<Engines::MaxwellCompute>();
+    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(*memory_manager);
 }
 GPU::~GPU() = default;
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 5852b9619..7b4e9b842 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -63,6 +63,7 @@ namespace Engines {
 class Fermi2D;
 class Maxwell3D;
 class MaxwellCompute;
+class MaxwellDMA;
 } // namespace Engines
 enum class EngineID {
@@ -103,6 +104,8 @@ private:
    std::unique_ptr<Engines::Fermi2D> fermi_2d;
    /// Compute engine
    std::unique_ptr<Engines::MaxwellCompute> maxwell_compute;
+    /// DMA engine
+    std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
 };
 } // namespace Tegra
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index e04966849..0f6dec60b 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -639,7 +639,7 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, GLuint progr
    state.Apply();
-    return current_bindpoint + entries.size();
+    return current_bindpoint + static_cast<u32>(entries.size());
 }
 u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, GLuint program, u32 current_unit,
@@ -685,7 +685,7 @@ u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, GLuint program,
    state.Apply();
-    return current_unit + entries.size();
+    return current_unit + static_cast<u32>(entries.size());
 }
 void RasterizerOpenGL::BindFramebufferSurfaces(const Surface& color_surface,
@@ -746,7 +746,6 @@ void RasterizerOpenGL::SyncDepthOffset() {
 void RasterizerOpenGL::SyncBlendState() {
    const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
-    ASSERT_MSG(regs.independent_blend_enable == 1, "Only independent blending is implemented");
    // TODO(Subv): Support more than just render target 0.
    state.blend.enabled = regs.blend.enable[0] != 0;
@@ -754,6 +753,7 @@ void RasterizerOpenGL::SyncBlendState() {
    if (!state.blend.enabled)
        return;
+    ASSERT_MSG(regs.independent_blend_enable == 1, "Only independent blending is implemented");
    ASSERT_MSG(!regs.independent_blend[0].separate_alpha, "Unimplemented");
    state.blend.rgb_equation = MaxwellToGL::BlendEquation(regs.independent_blend[0].equation_rgb);
    state.blend.src_rgb_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_source_rgb);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index ff48a2669..61d670dcb 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -28,6 +28,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_state.h"
+#include "video_core/textures/astc.h"
 #include "video_core/textures/decoders.h"
 #include "video_core/utils.h"
 #include "video_core/video_core.h"
@@ -55,6 +56,7 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
    {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, true}, // DXT23
    {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, true}, // DXT45
    {GL_COMPRESSED_RED_RGTC1, GL_RED, GL_UNSIGNED_INT_8_8_8_8, true},           // DXN1
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false},                               // ASTC_2D_4X4
 }};
 static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) {
@@ -86,6 +88,23 @@ static u16 GetResolutionScaleFactor() {
                                : Settings::values.resolution_factor);
 }
+/// Replaces the contents of `data` with the result of running the software ASTC decompressor
+/// over it, using the block dimensions that belong to `format`.
+static void ConvertASTCToRGBA8(std::vector<u8>& data, PixelFormat format, u32 width, u32 height) {
+    // ASTC block dimensions; both remain zero when the format is not handled below.
+    u32 block_width = 0;
+    u32 block_height = 0;
+    if (format == PixelFormat::ASTC_2D_4X4) {
+        block_width = 4;
+        block_height = 4;
+    } else {
+        NGLOG_CRITICAL(HW_GPU, "Unhandled format: {}", static_cast<u32>(format));
+        UNREACHABLE();
+    }
+    data = Tegra::Texture::ASTC::Decompress(data, width, height, block_width, block_height);
+}
 template <bool morton_to_gl, PixelFormat format>
 void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, Tegra::GPUVAddr base,
                Tegra::GPUVAddr start, Tegra::GPUVAddr end) {
@@ -97,6 +116,12 @@ void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, Tegra::
        auto data = Tegra::Texture::UnswizzleTexture(
            *gpu.memory_manager->GpuToCpuAddress(base),
            SurfaceParams::TextureFormatFromPixelFormat(format), stride, height, block_height);
+        if (SurfaceParams::IsFormatASTC(format)) {
+            // ASTC formats are converted to RGBA8 in software, as most PC GPUs do not support ASTC
+            ConvertASTCToRGBA8(data, format, stride, height);
+        }
        std::memcpy(gl_buffer, data.data(), data.size());
    } else {
        // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should check
@@ -118,7 +143,7 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra:
        MortonCopy<true, PixelFormat::R8>,           MortonCopy<true, PixelFormat::RGBA16F>,
        MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::DXT1>,
        MortonCopy<true, PixelFormat::DXT23>,        MortonCopy<true, PixelFormat::DXT45>,
-        MortonCopy<true, PixelFormat::DXN1>,
+        MortonCopy<true, PixelFormat::DXN1>,         MortonCopy<true, PixelFormat::ASTC_2D_4X4>,
 };
 static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra::GPUVAddr,
@@ -137,6 +162,7 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra:
        nullptr,
        nullptr,
        nullptr,
+        MortonCopy<false, PixelFormat::ABGR8>,
 };
 // Allocate an uninitialized texture of appropriate size and format for the surface
@@ -549,7 +575,7 @@ void CachedSurface::UploadGLTexture(const MathUtil::Rectangle<u32>& rect, GLuint
        glCompressedTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format,
                               static_cast<GLsizei>(rect.GetWidth() * GetCompresssionFactor()),
                               static_cast<GLsizei>(rect.GetHeight() * GetCompresssionFactor()), 0,
-                               size, &gl_buffer[buffer_offset]);
+                               static_cast<GLsizei>(size), &gl_buffer[buffer_offset]);
    } else {
        glTexSubImage2D(GL_TEXTURE_2D, 0, x0, y0, static_cast<GLsizei>(rect.GetWidth()),
                        static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type,
@@ -888,9 +914,6 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, ScaleMatc
    // Use GetSurfaceSubRect instead
    ASSERT(params.width == params.stride);
-    ASSERT(!params.is_tiled ||
-           (params.GetActualWidth() % 8 == 0 && params.GetActualHeight() % 8 == 0));
    // Check for an exact match in existing surfaces
    Surface surface =
        FindMatch<MatchFlags::Exact | MatchFlags::Invalid>(surface_cache, params, match_res_scale);
@@ -1048,8 +1071,13 @@ Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextu
    if (config.tic.IsTiled()) {
        params.block_height = config.tic.BlockHeight();
-        params.width = Common::AlignUp(params.width, params.block_height);
-        params.height = Common::AlignUp(params.height, params.block_height);
+        // TODO(bunnei): The align-up below is a hack. It is here because the dimensions of some
+        // compressed textures are not a multiple of their own compression factor, and rounding up
+        // to 4 accounts for that. It could potentially result in an extra row of 4px being decoded
+        // if a texture dimension is not a multiple of 4.
+        params.width = Common::AlignUp(params.width, 4);
+        params.height = Common::AlignUp(params.height, 4);
    } else {
        // Use the texture-provided stride value if the texture isn't tiled.
        params.stride = static_cast<u32>(params.PixelsInBytes(config.tic.Pitch()));
@@ -1057,26 +1085,6 @@ Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextu
    params.UpdateParams();
-    if (params.GetActualWidth() % 8 != 0 || params.GetActualHeight() % 8 != 0 ||
-        params.stride != params.width) {
-        Surface src_surface;
-        MathUtil::Rectangle<u32> rect;
-        std::tie(src_surface, rect) = GetSurfaceSubRect(params, ScaleMatch::Ignore, true);
-        rect = rect.Scale(params.GetCompresssionFactor());
-        params.res_scale = src_surface->res_scale;
-        Surface tmp_surface = CreateSurface(params);
-        auto dst_rect = tmp_surface->GetScaledRect().Scale(params.GetCompresssionFactor());
-        BlitTextures(src_surface->texture.handle, rect, tmp_surface->texture.handle, dst_rect,
-                     SurfaceParams::GetFormatType(params.pixel_format), read_framebuffer.handle,
-                     draw_framebuffer.handle);
-        remove_surfaces.emplace(tmp_surface);
-        return tmp_surface;
-    }
    return GetSurface(params, ScaleMatch::Ignore, true);
 }
@@ -1251,7 +1259,7 @@ void RasterizerCacheOpenGL::ValidateSurface(const Surface& surface, Tegra::GPUVA
        const auto interval = *it & validate_interval;
        // Look for a valid surface to copy from
-        SurfaceParams params = surface->FromInterval(interval);
+        SurfaceParams params = *surface;
        Surface copy_surface =
            FindMatch<MatchFlags::Copy>(surface_cache, params, ScaleMatch::Ignore, interval);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 0f43e863d..9da945e19 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -65,6 +65,7 @@ struct SurfaceParams {
        DXT23 = 8,
        DXT45 = 9,
        DXN1 = 10, // This is also known as BC4
+        ASTC_2D_4X4 = 11,
        Max,
        Invalid = 255,
@@ -111,6 +112,7 @@ struct SurfaceParams {
            4, // DXT23
            4, // DXT45
            4, // DXN1
+            1, // ASTC_2D_4X4
        }};
        ASSERT(static_cast<size_t>(format) < compression_factor_table.size());
@@ -136,6 +138,7 @@ struct SurfaceParams {
            128, // DXT23
            128, // DXT45
            64,  // DXN1
+            32,  // ASTC_2D_4X4
        }};
        ASSERT(static_cast<size_t>(format) < bpp_table.size());
@@ -162,6 +165,15 @@ struct SurfaceParams {
        }
    }
+    /// Returns true when the given pixel format is an ASTC format. ASTC_2D_4X4 is the only
+    /// format currently recognised.
+    static bool IsFormatASTC(PixelFormat format) {
+        return format == PixelFormat::ASTC_2D_4X4;
+    }
    static PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format) {
        switch (format) {
        case Tegra::FramebufferConfig::PixelFormat::ABGR8:
@@ -197,6 +209,8 @@ struct SurfaceParams {
            return PixelFormat::DXT45;
        case Tegra::Texture::TextureFormat::DXN1:
            return PixelFormat::DXN1;
+        case Tegra::Texture::TextureFormat::ASTC_2D_4X4:
+            return PixelFormat::ASTC_2D_4X4;
        default:
            NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
            UNREACHABLE();
@@ -228,6 +242,8 @@ struct SurfaceParams {
            return Tegra::Texture::TextureFormat::DXT45;
        case PixelFormat::DXN1:
            return Tegra::Texture::TextureFormat::DXN1;
+        case PixelFormat::ASTC_2D_4X4:
+            return Tegra::Texture::TextureFormat::ASTC_2D_4X4;
        default:
            UNREACHABLE();
        }
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 87ae47ac9..cd7569e2f 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -17,6 +17,7 @@ namespace Decompiler {
 using Tegra::Shader::Attribute;
 using Tegra::Shader::Instruction;
+using Tegra::Shader::LogicOperation;
 using Tegra::Shader::OpCode;
 using Tegra::Shader::Register;
 using Tegra::Shader::Sampler;
@@ -267,6 +268,27 @@ public:
    }
    /**
+     * Returns code that does an integer size conversion for the specified size.
+     * @param value Value to perform integer size conversion on.
+     * @param size Register size to use for conversion instructions.
+     * @returns GLSL string corresponding to the value converted to the specified size.
+     */
+    static std::string ConvertIntegerSize(const std::string& value, Register::Size size) {
+        switch (size) {
+        case Register::Size::Byte:
+            // Emit a shift pair that discards everything above the low 8 bits.
+            return "((" + value + " << 24) >> 24)";
+        case Register::Size::Short:
+            // Emit a shift pair that discards everything above the low 16 bits.
+            return "((" + value + " << 16) >> 16)";
+        case Register::Size::Word:
+            // Default - do nothing
+            return value;
+        default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented conversion size {}", static_cast<u32>(size));
+            UNREACHABLE();
+            // Every path of a value-returning function must return: should UNREACHABLE() ever
+            // come back, hand the value through unconverted instead of flowing off the end.
+            return value;
+        }
+    }
+    /**
     * Gets a register as an float.
     * @param reg The register to get.
     * @param elem The element to use for the operation.
@@ -282,15 +304,18 @@ public:
     * @param reg The register to get.
     * @param elem The element to use for the operation.
     * @param is_signed Whether to get the register as a signed (or unsigned) integer.
+     * @param size Register size to use for conversion instructions.
     * @returns GLSL string corresponding to the register as an integer.
     */
-    std::string GetRegisterAsInteger(const Register& reg, unsigned elem = 0,
+    std::string GetRegisterAsInteger(const Register& reg, unsigned elem = 0, bool is_signed = true,
-                                     bool is_signed = true) {
+                                     Register::Size size = Register::Size::Word) {
        const std::string func = GetGLSLConversionFunc(
            GLSLRegister::Type::Float,
            is_signed ? GLSLRegister::Type::Integer : GLSLRegister::Type::UnsignedInteger);
-        return func + '(' + GetRegister(reg, elem) + ')';
+        std::string value = func + '(' + GetRegister(reg, elem) + ')';
+        return ConvertIntegerSize(value, size);
    }
    /**
@@ -300,13 +325,15 @@ public:
     * @param value The code representing the value to assign.
     * @param dest_num_components Number of components in the destination.
     * @param value_num_components Number of components in the value.
-     * @param is_abs Optional, when True, applies absolute value to output.
+     * @param is_saturated Optional, when True, saturates the provided value.
     * @param dest_elem Optional, the destination element to use for the operation.
     */
    void SetRegisterToFloat(const Register& reg, u64 elem, const std::string& value,
-                            u64 dest_num_components, u64 value_num_components, bool is_abs = false,
+                            u64 dest_num_components, u64 value_num_components,
-                            u64 dest_elem = 0) {
+                            bool is_saturated = false, u64 dest_elem = 0) {
-        SetRegister(reg, elem, value, dest_num_components, value_num_components, is_abs, dest_elem);
+        SetRegister(reg, elem, is_saturated ? "clamp(" + value + ", 0.0, 1.0)" : value,
+                    dest_num_components, value_num_components, dest_elem);
    }
    /**
@@ -316,18 +343,22 @@ public:
     * @param value The code representing the value to assign.
     * @param dest_num_components Number of components in the destination.
     * @param value_num_components Number of components in the value.
-     * @param is_abs Optional, when True, applies absolute value to output.
+     * @param is_saturated Optional, when True, saturates the provided value.
     * @param dest_elem Optional, the destination element to use for the operation.
+     * @param size Register size to use for conversion instructions.
     */
    void SetRegisterToInteger(const Register& reg, bool is_signed, u64 elem,
                              const std::string& value, u64 dest_num_components,
-                              u64 value_num_components, bool is_abs = false, u64 dest_elem = 0) {
+                              u64 value_num_components, bool is_saturated = false,
+                              u64 dest_elem = 0, Register::Size size = Register::Size::Word) {
+        ASSERT_MSG(!is_saturated, "Unimplemented");
        const std::string func = GetGLSLConversionFunc(
            is_signed ? GLSLRegister::Type::Integer : GLSLRegister::Type::UnsignedInteger,
            GLSLRegister::Type::Float);
-        SetRegister(reg, elem, func + '(' + value + ')', dest_num_components, value_num_components,
+        SetRegister(reg, elem, func + '(' + ConvertIntegerSize(value, size) + ')',
-                    is_abs, dest_elem);
+                    dest_num_components, value_num_components, dest_elem);
    }
    /**
@@ -507,13 +538,11 @@ private:
     * @param value The code representing the value to assign.
     * @param dest_num_components Number of components in the destination.
     * @param value_num_components Number of components in the value.
-     * @param is_abs Optional, when True, applies absolute value to output.
     * @param dest_elem Optional, the destination element to use for the operation.
     */
    void SetRegister(const Register& reg, u64 elem, const std::string& value,
-                     u64 dest_num_components, u64 value_num_components, bool is_abs,
+                     u64 dest_num_components, u64 value_num_components, u64 dest_elem) {
-                     u64 dest_elem) {
+        std::string dest = GetRegister(reg, static_cast<u32>(dest_elem));
-        std::string dest = GetRegister(reg, dest_elem);
        if (dest_num_components > 1) {
            dest += GetSwizzle(elem);
        }
@@ -523,8 +552,6 @@ private:
            src += GetSwizzle(elem);
        }
-        src = is_abs ? "abs(" + src + ')' : src;
        shader.AddLine(dest + " = " + src + ';');
    }
@@ -545,7 +572,7 @@ private:
            // vertex shader, and what's the value of the fourth element when inside a Tess Eval
            // shader.
            ASSERT(stage == Maxwell3D::Regs::ShaderStage::Vertex);
-            return "vec4(0, 0, gl_InstanceID, gl_VertexID)";
+            return "vec4(0, 0, uintBitsToFloat(gl_InstanceID), uintBitsToFloat(gl_VertexID))";
        default:
            const u32 index{static_cast<u32>(attribute) -
                            static_cast<u32>(Attribute::Index::Attribute_0)};
@@ -740,6 +767,31 @@ private:
        return (absolute_offset % SchedPeriod) == 0;
    }
+    void WriteLogicOperation(Register dest, LogicOperation logic_op, const std::string& op_a,
+                             const std::string& op_b) {
+        // Build the GLSL expression first, then store it through a single register write.
+        std::string result;
+        switch (logic_op) {
+        case LogicOperation::And:
+            result = '(' + op_a + " & " + op_b + ')';
+            break;
+        case LogicOperation::Or:
+            result = '(' + op_a + " | " + op_b + ')';
+            break;
+        case LogicOperation::Xor:
+            result = '(' + op_a + " ^ " + op_b + ')';
+            break;
+        case LogicOperation::PassB:
+            result = op_b;
+            break;
+        default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented logic operation: {}", static_cast<u32>(logic_op));
+            UNREACHABLE();
+            return;
+        }
+        regs.SetRegisterToInteger(dest, true, 0, result, 1, 1);
+    }
    /**
     * Compiles a single instruction from Tegra to GLSL.
     * @param offset the offset of the Tegra shader instruction.
@@ -777,22 +829,25 @@ private:
        switch (opcode->GetType()) {
        case OpCode::Type::Arithmetic: {
-            std::string op_a = instr.alu.negate_a ? "-" : "";
+            std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
-            op_a += regs.GetRegisterAsFloat(instr.gpr8);
            if (instr.alu.abs_a) {
                op_a = "abs(" + op_a + ')';
            }
-            std::string op_b = instr.alu.negate_b ? "-" : "";
+            if (instr.alu.negate_a) {
+                op_a = "-(" + op_a + ')';
+            }
+            std::string op_b;
            if (instr.is_b_imm) {
-                op_b += GetImmediate19(instr);
+                op_b = GetImmediate19(instr);
            } else {
                if (instr.is_b_gpr) {
-                    op_b += regs.GetRegisterAsFloat(instr.gpr20);
+                    op_b = regs.GetRegisterAsFloat(instr.gpr20);
                } else {
-                    op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                    op_b = regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
-                                            GLSLRegister::Type::Float);
+                                           GLSLRegister::Type::Float);
                }
            }
@@ -800,6 +855,10 @@ private:
                op_b = "abs(" + op_b + ')';
            }
+            if (instr.alu.negate_b) {
+                op_b = "-(" + op_b + ')';
+            }
            switch (opcode->GetId()) {
            case OpCode::Id::MOV_C:
            case OpCode::Id::MOV_R: {
@@ -807,64 +866,49 @@ private:
                break;
            }
-            case OpCode::Id::MOV32_IMM: {
-                // mov32i doesn't have abs or neg bits.
-                regs.SetRegisterToFloat(instr.gpr0, 0, GetImmediate32(instr), 1, 1);
-                break;
-            }
            case OpCode::Id::FMUL_C:
            case OpCode::Id::FMUL_R:
            case OpCode::Id::FMUL_IMM: {
-                ASSERT_MSG(!instr.saturate_a, "Unimplemented");
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1,
+                                        instr.alu.saturate_d);
-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1, instr.alu.abs_d);
-                break;
-            }
-            case OpCode::Id::FMUL32_IMM: {
-                // fmul32i doesn't have abs or neg bits.
-                regs.SetRegisterToFloat(
-                    instr.gpr0, 0,
-                    regs.GetRegisterAsFloat(instr.gpr8) + " * " + GetImmediate32(instr), 1, 1);
                break;
            }
            case OpCode::Id::FADD_C:
            case OpCode::Id::FADD_R:
            case OpCode::Id::FADD_IMM: {
-                ASSERT_MSG(!instr.saturate_a, "Unimplemented");
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1,
+                                        instr.alu.saturate_d);
-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1, instr.alu.abs_d);
                break;
            }
            case OpCode::Id::MUFU: {
-                ASSERT_MSG(!instr.saturate_a, "Unimplemented");
                switch (instr.sub_op) {
                case SubOp::Cos:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "cos(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                    break;
                case SubOp::Sin:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "sin(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                    break;
                case SubOp::Ex2:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "exp2(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                    break;
                case SubOp::Lg2:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "log2(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                    break;
                case SubOp::Rcp:
-                    regs.SetRegisterToFloat(instr.gpr0, 0, "1.0 / " + op_a, 1, 1, instr.alu.abs_d);
+                    regs.SetRegisterToFloat(instr.gpr0, 0, "1.0 / " + op_a, 1, 1,
+                                            instr.alu.saturate_d);
                    break;
                case SubOp::Rsq:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "inversesqrt(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                    break;
                case SubOp::Min:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "min(" + op_a + "," + op_b + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                    break;
                default:
                    NGLOG_CRITICAL(HW_GPU, "Unhandled MUFU sub op: {0:x}",
@@ -901,6 +945,21 @@ private:
            }
            break;
        }
+        case OpCode::Type::ArithmeticImmediate: {
+            switch (opcode->GetId()) {
+            case OpCode::Id::MOV32_IMM: {
+                regs.SetRegisterToFloat(instr.gpr0, 0, GetImmediate32(instr), 1, 1);
+                break;
+            }
+            case OpCode::Id::FMUL32_IMM: {
+                regs.SetRegisterToFloat(
+                    instr.gpr0, 0,
+                    regs.GetRegisterAsFloat(instr.gpr8) + " * " + GetImmediate32(instr), 1, 1);
+                break;
+            }
+            }
+            break;
+        }
        case OpCode::Type::Bfe: {
            ASSERT_MSG(!instr.bfe.negate_b, "Unimplemented");
@@ -926,49 +985,6 @@ private:
            break;
        }
-        case OpCode::Type::Logic: {
-            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, true);
-            if (instr.alu.lop.invert_a)
-                op_a = "~(" + op_a + ')';
-            switch (opcode->GetId()) {
-            case OpCode::Id::LOP32I: {
-                u32 imm = static_cast<u32>(instr.alu.imm20_32.Value());
-                if (instr.alu.lop.invert_b)
-                    imm = ~imm;
-                switch (instr.alu.lop.operation) {
-                case Tegra::Shader::LogicOperation::And: {
-                    regs.SetRegisterToInteger(instr.gpr0, true, 0,
-                                              '(' + op_a + " & " + std::to_string(imm) + ')', 1, 1);
-                    break;
-                }
-                case Tegra::Shader::LogicOperation::Or: {
-                    regs.SetRegisterToInteger(instr.gpr0, true, 0,
-                                              '(' + op_a + " | " + std::to_string(imm) + ')', 1, 1);
-                    break;
-                }
-                case Tegra::Shader::LogicOperation::Xor: {
-                    regs.SetRegisterToInteger(instr.gpr0, true, 0,
-                                              '(' + op_a + " ^ " + std::to_string(imm) + ')', 1, 1);
-                    break;
-                }
-                default:
-                    NGLOG_CRITICAL(HW_GPU, "Unimplemented lop32i operation: {}",
-                                   static_cast<u32>(instr.alu.lop.operation.Value()));
-                    UNREACHABLE();
-                }
-                break;
-            }
-            default: {
-                NGLOG_CRITICAL(HW_GPU, "Unhandled logic instruction: {}", opcode->GetName());
-                UNREACHABLE();
-            }
-            }
-            break;
-        }
        case OpCode::Type::Shift: {
            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, true);
@@ -1012,14 +1028,39 @@ private:
            break;
        }
-        case OpCode::Type::ArithmeticInteger: {
+        case OpCode::Type::ArithmeticIntegerImmediate: {
            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8);
+            std::string op_b = std::to_string(instr.alu.imm20_32.Value());
+            switch (opcode->GetId()) {
+            case OpCode::Id::IADD32I:
+                if (instr.iadd32i.negate_a)
+                    op_a = "-(" + op_a + ')';
-            if (instr.alu_integer.negate_a)
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1,
-                op_a = '-' + op_a;
+                                          instr.iadd32i.saturate != 0);
+                break;
+            case OpCode::Id::LOP32I: {
+                if (instr.alu.lop32i.invert_a)
+                    op_a = "~(" + op_a + ')';
-            std::string op_b = instr.alu_integer.negate_b ? "-" : "";
+                if (instr.alu.lop32i.invert_b)
+                    op_b = "~(" + op_b + ')';
+                WriteLogicOperation(instr.gpr0, instr.alu.lop32i.operation, op_a, op_b);
+                break;
+            }
+            default: {
+                NGLOG_CRITICAL(HW_GPU, "Unhandled ArithmeticIntegerImmediate instruction: {}",
+                               opcode->GetName());
+                UNREACHABLE();
+            }
+            }
+            break;
+        }
+        case OpCode::Type::ArithmeticInteger: {
+            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8);
+            std::string op_b;
            if (instr.is_b_imm) {
                op_b += '(' + std::to_string(instr.alu.GetSignedImm20_20()) + ')';
            } else {
@@ -1035,19 +1076,46 @@ private:
            case OpCode::Id::IADD_C:
            case OpCode::Id::IADD_R:
            case OpCode::Id::IADD_IMM: {
-                ASSERT_MSG(!instr.saturate_a, "Unimplemented");
+                if (instr.alu_integer.negate_a)
-                regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1);
+                    op_a = "-(" + op_a + ')';
+                if (instr.alu_integer.negate_b)
+                    op_b = "-(" + op_b + ')';
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1,
+                                          instr.alu.saturate_d);
                break;
            }
            case OpCode::Id::ISCADD_C:
            case OpCode::Id::ISCADD_R:
            case OpCode::Id::ISCADD_IMM: {
+                if (instr.alu_integer.negate_a)
+                    op_a = "-(" + op_a + ')';
+                if (instr.alu_integer.negate_b)
+                    op_b = "-(" + op_b + ')';
                std::string shift = std::to_string(instr.alu_integer.shift_amount.Value());
                regs.SetRegisterToInteger(instr.gpr0, true, 0,
                                          "((" + op_a + " << " + shift + ") + " + op_b + ')', 1, 1);
                break;
            }
+            case OpCode::Id::LOP_C:
+            case OpCode::Id::LOP_R:
+            case OpCode::Id::LOP_IMM: {
+                ASSERT_MSG(!instr.alu.lop.unk44, "Unimplemented");
+                ASSERT_MSG(instr.alu.lop.pred48 == Pred::UnusedIndex, "Unimplemented");
+                if (instr.alu.lop.invert_a)
+                    op_a = "~(" + op_a + ')';
+                if (instr.alu.lop.invert_b)
+                    op_b = "~(" + op_b + ')';
+                WriteLogicOperation(instr.gpr0, instr.alu.lop.operation, op_a, op_b);
+                break;
+            }
            default: {
                NGLOG_CRITICAL(HW_GPU, "Unhandled ArithmeticInteger instruction: {}",
                               opcode->GetName());
@@ -1058,8 +1126,6 @@ private:
            break;
        }
        case OpCode::Type::Ffma: {
-            ASSERT_MSG(!instr.saturate_a, "Unimplemented");
            std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
            std::string op_b = instr.ffma.negate_b ? "-" : "";
            std::string op_c = instr.ffma.negate_c ? "-" : "";
@@ -1093,33 +1159,33 @@ private:
            }
            }
-            regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b + " + " + op_c, 1, 1);
+            regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b + " + " + op_c, 1, 1,
+                                    instr.alu.saturate_d);
            break;
        }
        case OpCode::Type::Conversion: {
-            ASSERT_MSG(instr.conversion.size == Register::Size::Word, "Unimplemented");
            ASSERT_MSG(!instr.conversion.negate_a, "Unimplemented");
-            ASSERT_MSG(!instr.saturate_a, "Unimplemented");
            switch (opcode->GetId()) {
            case OpCode::Id::I2I_R: {
                ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
-                std::string op_a =
+                std::string op_a = regs.GetRegisterAsInteger(
-                    regs.GetRegisterAsInteger(instr.gpr20, 0, instr.conversion.is_input_signed);
+                    instr.gpr20, 0, instr.conversion.is_input_signed, instr.conversion.src_size);
                if (instr.conversion.abs_a) {
                    op_a = "abs(" + op_a + ')';
                }
                regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1,
-                                          1);
+                                          1, instr.alu.saturate_d, 0, instr.conversion.dest_size);
                break;
            }
            case OpCode::Id::I2F_R: {
+                ASSERT_MSG(instr.conversion.dest_size == Register::Size::Word, "Unimplemented");
                ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
-                std::string op_a =
+                std::string op_a = regs.GetRegisterAsInteger(
-                    regs.GetRegisterAsInteger(instr.gpr20, 0, instr.conversion.is_input_signed);
+                    instr.gpr20, 0, instr.conversion.is_input_signed, instr.conversion.src_size);
                if (instr.conversion.abs_a) {
                    op_a = "abs(" + op_a + ')';
@@ -1129,8 +1195,8 @@ private:
                break;
            }
            case OpCode::Id::F2F_R: {
-                ASSERT_MSG(!instr.saturate_a, "Unimplemented");
+                ASSERT_MSG(instr.conversion.dest_size == Register::Size::Word, "Unimplemented");
+                ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented");
                std::string op_a = regs.GetRegisterAsFloat(instr.gpr20);
                switch (instr.conversion.f2f.rounding) {
@@ -1156,10 +1222,11 @@ private:
                    op_a = "abs(" + op_a + ')';
                }
-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1, instr.alu.saturate_d);
                break;
            }
            case OpCode::Id::F2I_R: {
+                ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented");
                std::string op_a = regs.GetRegisterAsFloat(instr.gpr20);
                if (instr.conversion.abs_a) {
@@ -1192,7 +1259,7 @@ private:
                }
                regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1,
-                                          1);
+                                          1, false, 0, instr.conversion.dest_size);
                break;
            }
            default: {
@@ -1430,8 +1497,8 @@ private:
                op_b = "abs(" + op_b + ')';
            }
-            // The fset instruction sets a register to 1.0 if the condition is true, and to 0
+            // The fset instruction sets a register to 1.0 or -1 (depending on the bf bit) if the
-            // otherwise.
+            // condition is true, and to 0 otherwise.
            std::string second_pred =
                GetPredicateCondition(instr.fset.pred39, instr.fset.neg_pred != 0);
@@ -1449,6 +1516,41 @@ private:
            }
            break;
        }
+        case OpCode::Type::IntegerSet: {
+            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, instr.iset.is_signed);
+            std::string op_b;
+            if (instr.is_b_imm) {
+                op_b = std::to_string(instr.alu.GetSignedImm20_20());
+            } else {
+                if (instr.is_b_gpr) {
+                    op_b = regs.GetRegisterAsInteger(instr.gpr20, 0, instr.iset.is_signed);
+                } else {
+                    op_b = regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                           GLSLRegister::Type::Integer);
+                }
+            }
+            // The iset instruction sets a register to 1.0 or -1 (depending on the bf bit) if the
+            // condition is true, and to 0 otherwise.
+            std::string second_pred =
+                GetPredicateCondition(instr.iset.pred39, instr.iset.neg_pred != 0);
+            std::string comparator = GetPredicateComparison(instr.iset.cond);
+            std::string combiner = GetPredicateCombiner(instr.iset.op);
+            std::string predicate = "(((" + op_a + ") " + comparator + " (" + op_b + ")) " +
+                                    combiner + " (" + second_pred + "))";
+            if (instr.iset.bf) {
+                regs.SetRegisterToFloat(instr.gpr0, 0, predicate + " ? 1.0 : 0.0", 1, 1);
+            } else {
+                regs.SetRegisterToInteger(instr.gpr0, false, 0, predicate + " ? 0xFFFFFFFF : 0", 1,
+                                          1);
+            }
+            break;
+        }
        default: {
            switch (opcode->GetId()) {
            case OpCode::Id::EXIT: {
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index b88d592b7..c1e6fac9f 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -39,6 +39,10 @@ void main() {
    // Viewport can be flipped, which is unsupported by glViewport
    position.xy *= viewport_flip.xy;
    gl_Position = position;
+    // TODO(bunnei): This is likely a hack, position.w should be interpolated as 1.0
+    // For now, this is here to bring order in lieu of proper emulation
+    position.w = 1.0;
 }
 )";
    out += program.first;
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 7c00beb33..d7167b298 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -38,8 +38,8 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& sh
    const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
    // TODO(bunnei): Support more than one viewport
-    viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0 : 1.0;
+    viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0f : 1.0f;
-    viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0 : 1.0;
+    viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0f : 1.0f;
 }
 } // namespace GLShader
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 443ce3f2b..6e5f9a789 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -196,13 +196,13 @@ void OpenGLState::Apply() const {
    }
    // Textures
-    for (size_t i = 0; i < std::size(texture_units); ++i) {
+    for (int i = 0; i < std::size(texture_units); ++i) {
        if (texture_units[i].texture_2d != cur_state.texture_units[i].texture_2d) {
            glActiveTexture(TextureUnits::MaxwellTexture(i).Enum());
            glBindTexture(GL_TEXTURE_2D, texture_units[i].texture_2d);
        }
        if (texture_units[i].sampler != cur_state.texture_units[i].sampler) {
-            glBindSampler(i, texture_units[i].sampler);
+            glBindSampler(static_cast<GLuint>(i), texture_units[i].sampler);
        }
        // Update the texture swizzle
        if (texture_units[i].swizzle.r != cur_state.texture_units[i].swizzle.r ||
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
new file mode 100644
index 000000000..3c4ad1c9d
--- /dev/null
+++ b/src/video_core/textures/astc.cpp
@@ -0,0 +1,1646 @@
+// Copyright 2016 The University of North Carolina at Chapel Hill
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Please send all BUG REPORTS to <pavel@cs.unc.edu>.
+// <http://gamma.cs.unc.edu/FasTC/>
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+#include "video_core/textures/astc.h"
+// Bit-granular cursor over a caller-owned byte buffer; each byte is consumed LSB first.
+class BitStream {
+public:
+    BitStream(unsigned char* ptr, int nBits = 0, int start_offset = 0)
+        : m_BitsWritten(0), m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8),
+          m_BitsRead(0), done(false) {}
+    int GetBitsWritten() const {
+        return m_BitsWritten;
+    }
+    // Writes the low nBits of val starting with the most significant of those bits.
+    void WriteBitsR(unsigned int val, unsigned int nBits) {
+        for (unsigned int i = 0; i < nBits; i++) {
+            WriteBit((val >> (nBits - i - 1)) & 1);
+        }
+    }
+    // Writes the low nBits of val starting with the least significant bit.
+    void WriteBits(unsigned int val, unsigned int nBits) {
+        for (unsigned int i = 0; i < nBits; i++) {
+            WriteBit((val >> i) & 1);
+        }
+    }
+    int GetBitsRead() const {
+        return m_BitsRead;
+    }
+    int ReadBit() {
+        int bit = *m_CurByte >> m_NextBit++;
+        while (m_NextBit >= 8) {
+            m_NextBit -= 8;
+            m_CurByte++;
+        }
+        m_BitsRead++;
+        return bit & 1;
+    }
+    unsigned int ReadBits(unsigned int nBits) { // The first bit read becomes bit 0
+        unsigned int ret = 0;
+        for (unsigned int i = 0; i < nBits; i++) {
+            ret |= static_cast<unsigned int>(ReadBit()) << i; // unsigned: i may reach 31
+        }
+        return ret;
+    }
+private:
+    void WriteBit(int b) {
+        if (done)
+            return;
+        const unsigned int mask = 1u << m_NextBit++;
+        // Overwrite the target bit: clear it, then set it again if b is non-zero
+        *m_CurByte &= ~mask;
+        if (b)
+            *m_CurByte |= mask;
+        if (m_NextBit >= 8) { // Next byte?
+            m_CurByte += 1;
+            m_NextBit = 0;
+        }
+        done = done || ++m_BitsWritten >= m_NumBits;
+    }
+    int m_BitsWritten;
+    const int m_NumBits; // Write budget; WriteBit turns into a no-op once it is reached
+    unsigned char* m_CurByte;
+    int m_NextBit; // Bit index inside *m_CurByte of the next bit to read or write
+    int m_BitsRead;
+    bool done;
+};
+// Read-only view of the individual bits of an integer that the caller keeps alive.
+template <typename IntType>
+class Bits {
+private:
+    const IntType& m_Bits;
+    // Not default-constructible or copyable: the view stays bound to a single integer.
+    Bits() = delete;
+    Bits(const Bits&) = delete;
+    Bits& operator=(const Bits&) = delete;
+public:
+    explicit Bits(IntType& v) : m_Bits(v) {}
+    // Returns bit bitPos (0 = least significant) as 0 or 1.
+    uint8_t operator[](uint32_t bitPos) {
+        return static_cast<uint8_t>((m_Bits >> bitPos) & 1);
+    }
+    // Returns the inclusive bit range start..end (either order), shifted down to bit 0.
+    IntType operator()(uint32_t start, uint32_t end) {
+        const uint32_t lo = start < end ? start : end;
+        const uint32_t hi = start < end ? end : start;
+        const uint32_t width = hi - lo + 1;
+        // Build the mask in 64 bits; an int shift overflows once the range reaches 31 bits.
+        const uint64_t mask = width >= 64 ? ~0ULL : (1ULL << width) - 1;
+        return static_cast<IntType>((m_Bits >> lo) & mask);
+    }
+};
+enum EIntegerEncoding { eIntegerEncoding_JustBits, eIntegerEncoding_Quint, eIntegerEncoding_Trit };
+class IntegerEncodedValue {
+private:
+    const EIntegerEncoding m_Encoding;
+    const uint32_t m_NumBits;
+    uint32_t m_BitValue;
+    union {
+        uint32_t m_QuintValue;
+        uint32_t m_TritValue;
+    };
+public:
+    // Jank, but we're not doing any heavy lifting in this class, so it's
+    // probably OK. It allows us to use these in std::vectors...
+    IntegerEncodedValue& operator=(const IntegerEncodedValue& other) {
+        new (this) IntegerEncodedValue(other);
+        return *this;
+    }
+    IntegerEncodedValue(EIntegerEncoding encoding, uint32_t numBits)
+        : m_Encoding(encoding), m_NumBits(numBits) {}
+    EIntegerEncoding GetEncoding() const {
+        return m_Encoding;
+    }
+    uint32_t BaseBitLength() const {
+        return m_NumBits;
+    }
+    uint32_t GetBitValue() const {
+        return m_BitValue;
+    }
+    void SetBitValue(uint32_t val) {
+        m_BitValue = val;
+    }
+    uint32_t GetTritValue() const {
+        return m_TritValue;
+    }
+    void SetTritValue(uint32_t val) {
+        m_TritValue = val;
+    }
+    uint32_t GetQuintValue() const {
+        return m_QuintValue;
+    }
+    void SetQuintValue(uint32_t val) {
+        m_QuintValue = val;
+    }
+    bool MatchesEncoding(const IntegerEncodedValue& other) {
+        return m_Encoding == other.m_Encoding && m_NumBits == other.m_NumBits;
+    }
+    // Returns the number of bits required to encode nVals values.
+    uint32_t GetBitLength(uint32_t nVals) {
+        uint32_t totalBits = m_NumBits * nVals;
+        if (m_Encoding == eIntegerEncoding_Trit) {
+            totalBits += (nVals * 8 + 4) / 5;
+        } else if (m_Encoding == eIntegerEncoding_Quint) {
+            totalBits += (nVals * 7 + 2) / 3;
+        }
+        return totalBits;
+    }
+    // Count the number of bits set in a number.
+    static inline uint32_t Popcnt(uint32_t n) {
+        uint32_t c;
+        for (c = 0; n; c++) {
+            n &= n - 1;
+        }
+        return c;
+    }
+    // Returns a new instance of this struct describing the largest encodable
+    // range whose maximum value is no greater than maxVal
+    static IntegerEncodedValue CreateEncoding(uint32_t maxVal) {
+        while (maxVal > 0) {
+            uint32_t check = maxVal + 1;
+            // Is maxVal a power of two?
+            if (!(check & (check - 1))) {
+                return IntegerEncodedValue(eIntegerEncoding_JustBits, Popcnt(maxVal));
+            }
+            // Is maxVal of the type 3*2^n - 1?
+            if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
+                return IntegerEncodedValue(eIntegerEncoding_Trit, Popcnt(check / 3 - 1));
+            }
+            // Is maxVal of the type 5*2^n - 1?
+            if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
+                return IntegerEncodedValue(eIntegerEncoding_Quint, Popcnt(check / 5 - 1));
+            }
+            // Apparently it can't be represented with a bounded integer sequence...
+            // just iterate.
+            maxVal--;
+        }
+        return IntegerEncodedValue(eIntegerEncoding_JustBits, 0);
+    }
+    // Decodes at least nValues integers from `bits` and appends them to `result`.
+    // maxRange is the largest value any element may take; it selects the encoding
+    // through CreateEncoding. Trit and quint data is consumed in whole blocks of
+    // five and three values respectively, so `result` may grow by more than
+    // nValues entries.
+    static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, BitStream& bits,
+                                      uint32_t maxRange, uint32_t nValues) {
+        IntegerEncodedValue encoding = IntegerEncodedValue::CreateEncoding(maxRange);
+        const uint32_t bitsPerValue = encoding.BaseBitLength();
+        for (uint32_t decoded = 0; decoded < nValues;) {
+            switch (encoding.GetEncoding()) {
+            case eIntegerEncoding_JustBits:
+                // Plain values are read one at a time into a copy of the descriptor.
+                encoding.SetBitValue(bits.ReadBits(bitsPerValue));
+                result.push_back(encoding);
+                decoded += 1;
+                break;
+            case eIntegerEncoding_Trit:
+                DecodeTritBlock(bits, result, bitsPerValue);
+                decoded += 5;
+                break;
+            case eIntegerEncoding_Quint:
+                DecodeQuintBlock(bits, result, bitsPerValue);
+                decoded += 3;
+                break;
+            }
+        }
+    }
+private:
+    // Decodes one trit block (five values) from `bits` and appends the five
+    // resulting IntegerEncodedValue entries to `result`. Each value consists of
+    // nBitsPerValue plain bits (m[i]) and one trit (t[i]); the five trits are
+    // packed together in the 8-bit field T, whose pieces are interleaved with
+    // the plain bits in the stream.
+    // NOTE(review): Tb(lo, hi) / Tb[i] presumably extract an inclusive bit range /
+    // a single bit of the wrapped integer - confirm against the Bits helper.
+    static void DecodeTritBlock(BitStream& bits, std::vector<IntegerEncodedValue>& result,
+                                uint32_t nBitsPerValue) {
+        // Implement the algorithm in section C.2.12
+        uint32_t m[5];
+        uint32_t t[5];
+        uint32_t T;
+        // Read the trit encoded block according to
+        // table C.2.14. The read order below is significant: T is assembled
+        // from chunks of 2, 2, 1, 2 and 1 bits placed at bit 0, 2, 4, 5 and 7.
+        m[0] = bits.ReadBits(nBitsPerValue);
+        T = bits.ReadBits(2);
+        m[1] = bits.ReadBits(nBitsPerValue);
+        T |= bits.ReadBits(2) << 2;
+        m[2] = bits.ReadBits(nBitsPerValue);
+        T |= bits.ReadBit() << 4;
+        m[3] = bits.ReadBits(nBitsPerValue);
+        T |= bits.ReadBits(2) << 5;
+        m[4] = bits.ReadBits(nBitsPerValue);
+        T |= bits.ReadBit() << 7;
+        // First stage: split T into t[4], t[3] and an intermediate 5-bit value C.
+        uint32_t C = 0;
+        Bits<uint32_t> Tb(T);
+        if (Tb(2, 4) == 7) {
+            C = (Tb(5, 7) << 2) | Tb(0, 1);
+            t[4] = t[3] = 2;
+        } else {
+            C = Tb(0, 4);
+            if (Tb(5, 6) == 3) {
+                t[4] = 2;
+                t[3] = Tb[7];
+            } else {
+                t[4] = Tb[7];
+                t[3] = Tb(5, 6);
+            }
+        }
+        // Second stage: split C into t[2], t[1] and t[0].
+        Bits<uint32_t> Cb(C);
+        if (Cb(0, 1) == 3) {
+            t[2] = 2;
+            t[1] = Cb[4];
+            t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]);
+        } else if (Cb(2, 3) == 3) {
+            t[2] = 2;
+            t[1] = 2;
+            t[0] = Cb(0, 1);
+        } else {
+            t[2] = Cb[4];
+            t[1] = Cb(2, 3);
+            t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]);
+        }
+        // Emit the five decoded values in stream order.
+        for (uint32_t i = 0; i < 5; i++) {
+            IntegerEncodedValue val(eIntegerEncoding_Trit, nBitsPerValue);
+            val.SetBitValue(m[i]);
+            val.SetTritValue(t[i]);
+            result.push_back(val);
+        }
+    }
+    // Decodes one quint block (three values) from `bits` and appends the three
+    // resulting IntegerEncodedValue entries to `result`. Each value consists of
+    // nBitsPerValue plain bits (m[i]) and one quint (q[i]); the three quints are
+    // packed together in the 7-bit field Q, whose pieces are interleaved with
+    // the plain bits in the stream.
+    // NOTE(review): Qb(lo, hi) / Qb[i] presumably extract an inclusive bit range /
+    // a single bit of the wrapped integer - confirm against the Bits helper.
+    static void DecodeQuintBlock(BitStream& bits, std::vector<IntegerEncodedValue>& result,
+                                 uint32_t nBitsPerValue) {
+        // Implement the algorithm in section C.2.12
+        uint32_t m[3];
+        uint32_t q[3];
+        uint32_t Q;
+        // Read the quint encoded block according to
+        // table C.2.15. The read order below is significant: Q is assembled
+        // from chunks of 3, 2 and 2 bits placed at bit 0, 3 and 5.
+        m[0] = bits.ReadBits(nBitsPerValue);
+        Q = bits.ReadBits(3);
+        m[1] = bits.ReadBits(nBitsPerValue);
+        Q |= bits.ReadBits(2) << 3;
+        m[2] = bits.ReadBits(nBitsPerValue);
+        Q |= bits.ReadBits(2) << 5;
+        Bits<uint32_t> Qb(Q);
+        if (Qb(1, 2) == 3 && Qb(5, 6) == 0) {
+            // Special case: the first two quints are both 4.
+            q[0] = q[1] = 4;
+            q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]);
+        } else {
+            // General case: derive q[2] and an intermediate 5-bit value C,
+            // then split C into q[1] and q[0].
+            uint32_t C = 0;
+            if (Qb(1, 2) == 3) {
+                q[2] = 4;
+                C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0];
+            } else {
+                q[2] = Qb(5, 6);
+                C = Qb(0, 4);
+            }
+            Bits<uint32_t> Cb(C);
+            if (Cb(0, 2) == 5) {
+                q[1] = 4;
+                q[0] = Cb(3, 4);
+            } else {
+                q[1] = Cb(3, 4);
+                q[0] = Cb(0, 2);
+            }
+        }
+        // Emit the three decoded values in stream order.
+        for (uint32_t i = 0; i < 3; i++) {
+            IntegerEncodedValue val(eIntegerEncoding_Quint, nBitsPerValue);
+            val.m_BitValue = m[i];
+            val.m_QuintValue = q[i];
+            result.push_back(val);
+        }
+    }
+};
+namespace ASTCC {
+// Parameters of the texel weight grid described by a block mode. A
+// default-constructed instance has every field zero / false; the fields are
+// initialised in place rather than by memset-ing the object.
+struct TexelWeightParams {
+    uint32_t m_Width = 0;          // weight grid width
+    uint32_t m_Height = 0;         // weight grid height
+    bool m_bDualPlane = false;     // two weights per grid point when set
+    uint32_t m_MaxWeight = 0;      // largest encodable weight value
+    bool m_bError = false;         // block mode is reserved / invalid
+    bool m_bVoidExtentLDR = false; // block is an LDR void-extent block
+    bool m_bVoidExtentHDR = false; // block is an HDR void-extent block
+    // Number of bits occupied by the packed weight data: the bit length of
+    // GetNumWeightValues() integers encoded with range [0, m_MaxWeight].
+    uint32_t GetPackedBitSize() const {
+        return IntegerEncodedValue::CreateEncoding(m_MaxWeight).GetBitLength(
+            GetNumWeightValues());
+    }
+    // Number of weights stored in the block: one per grid point, doubled in
+    // dual-plane mode.
+    uint32_t GetNumWeightValues() const {
+        uint32_t ret = m_Width * m_Height;
+        if (m_bDualPlane) {
+            ret *= 2;
+        }
+        return ret;
+    }
+};
+// Reads the 11-bit block mode from `strm` and decodes it into the weight grid
+// parameters (grid width/height, dual-plane flag, maximum weight value).
+// Void-extent blocks set m_bVoidExtentLDR / m_bVoidExtentHDR and return early;
+// reserved encodings set m_bError. One additional bit is consumed from the
+// stream only on the void-extent path (and only when bit 10 is already set).
+TexelWeightParams DecodeBlockInfo(BitStream& strm) {
+    TexelWeightParams params;
+    // Read the entire block mode all at once
+    uint16_t modeBits = strm.ReadBits(11);
+    // Does this match the void extent block mode?
+    // (bits [2-8] all set, bits [0-1] clear)
+    if ((modeBits & 0x01FF) == 0x1FC) {
+        // Bit 9 selects HDR versus LDR.
+        if (modeBits & 0x200) {
+            params.m_bVoidExtentHDR = true;
+        } else {
+            params.m_bVoidExtentLDR = true;
+        }
+        // Next two bits must be one.
+        // The second of them is read from the stream only if bit 10 is set.
+        if (!(modeBits & 0x400) || !strm.ReadBit()) {
+            params.m_bError = true;
+        }
+        return params;
+    }
+    // First check if the last four bits are zero
+    if ((modeBits & 0xF) == 0) {
+        params.m_bError = true;
+        return params;
+    }
+    // If the last two bits are zero, then if bits
+    // [6-8] are all ones, this is also reserved.
+    if ((modeBits & 0x3) == 0 && (modeBits & 0x1C0) == 0x1C0) {
+        params.m_bError = true;
+        return params;
+    }
+    // Otherwise, there is no error... Figure out the layout
+    // of the block mode. Layout is determined by a number
+    // between 0 and 9 corresponding to table C.2.8 of the
+    // ASTC spec.
+    uint32_t layout = 0;
+    if ((modeBits & 0x1) || (modeBits & 0x2)) {
+        // layout is in [0-4]
+        if (modeBits & 0x8) {
+            // layout is in [2-4]
+            if (modeBits & 0x4) {
+                // layout is in [3-4]
+                if (modeBits & 0x100) {
+                    layout = 4;
+                } else {
+                    layout = 3;
+                }
+            } else {
+                layout = 2;
+            }
+        } else {
+            // layout is in [0-1]
+            if (modeBits & 0x4) {
+                layout = 1;
+            } else {
+                layout = 0;
+            }
+        }
+    } else {
+        // layout is in [5-9]
+        if (modeBits & 0x100) {
+            // layout is in [7-9]
+            if (modeBits & 0x80) {
+                // layout is in [7-8]
+                // Bit 6 set here would be the reserved pattern rejected above.
+                assert((modeBits & 0x40) == 0U);
+                if (modeBits & 0x20) {
+                    layout = 8;
+                } else {
+                    layout = 7;
+                }
+            } else {
+                layout = 9;
+            }
+        } else {
+            // layout is in [5-6]
+            if (modeBits & 0x80) {
+                layout = 6;
+            } else {
+                layout = 5;
+            }
+        }
+    }
+    assert(layout < 10);
+    // Determine R
+    // Bit 4 is the low bit of R; the two high bits come from bits [0-1]
+    // for layouts 0-4 and from bits [2-3] for layouts 5-9.
+    uint32_t R = !!(modeBits & 0x10);
+    if (layout < 5) {
+        R |= (modeBits & 0x3) << 1;
+    } else {
+        R |= (modeBits & 0xC) >> 1;
+    }
+    // The reserved checks above guarantee the two high bits are not both zero.
+    assert(2 <= R && R <= 7);
+    // Determine width & height
+    // A is taken from bits [5-6]; B from bits [7-8] (one or two bits wide),
+    // except for layout 9 where B is bits [9-10].
+    switch (layout) {
+    case 0: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 7) & 0x3;
+        params.m_Width = B + 4;
+        params.m_Height = A + 2;
+        break;
+    }
+    case 1: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 7) & 0x3;
+        params.m_Width = B + 8;
+        params.m_Height = A + 2;
+        break;
+    }
+    case 2: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 7) & 0x3;
+        params.m_Width = A + 2;
+        params.m_Height = B + 8;
+        break;
+    }
+    case 3: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 7) & 0x1;
+        params.m_Width = A + 2;
+        params.m_Height = B + 6;
+        break;
+    }
+    case 4: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 7) & 0x1;
+        params.m_Width = B + 2;
+        params.m_Height = A + 2;
+        break;
+    }
+    case 5: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        params.m_Width = 12;
+        params.m_Height = A + 2;
+        break;
+    }
+    case 6: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        params.m_Width = A + 2;
+        params.m_Height = 12;
+        break;
+    }
+    case 7: {
+        params.m_Width = 6;
+        params.m_Height = 10;
+        break;
+    }
+    case 8: {
+        params.m_Width = 10;
+        params.m_Height = 6;
+        break;
+    }
+    case 9: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 9) & 0x3;
+        params.m_Width = A + 6;
+        params.m_Height = B + 6;
+        break;
+    }
+    default:
+        assert(!"Don't know this layout...");
+        params.m_bError = true;
+        break;
+    }
+    // Determine whether or not we're using dual planes
+    // and/or high precision layouts.
+    // Layout 9 reuses bits [9-10] for B, so both flags are forced off there.
+    bool D = (layout != 9) && (modeBits & 0x400);
+    bool H = (layout != 9) && (modeBits & 0x200);
+    // R (2..7) indexes the table of maximum weight values; H selects the
+    // high-precision table.
+    if (H) {
+        const uint32_t maxWeights[6] = {9, 11, 15, 19, 23, 31};
+        params.m_MaxWeight = maxWeights[R - 2];
+    } else {
+        const uint32_t maxWeights[6] = {1, 2, 3, 4, 5, 7};
+        params.m_MaxWeight = maxWeights[R - 2];
+    }
+    params.m_bDualPlane = D;
+    return params;
+}
+// Fills a blockWidth x blockHeight region of outBuf with the single colour of
+// an LDR void-extent block. The four 13-bit extent coordinates are read from
+// `strm` and discarded; the four 16-bit channels that follow are reduced to
+// their top byte and packed with red in the lowest byte and alpha in the highest.
+void FillVoidExtentLDR(BitStream& strm, uint32_t* const outBuf, uint32_t blockWidth,
+                       uint32_t blockHeight) {
+    // Skip the void-extent coordinates; only the colour matters here.
+    for (int coord = 0; coord < 4; ++coord) {
+        strm.ReadBits(13);
+    }
+    // Channels are stored in R, G, B, A order, 16 bits each.
+    const uint16_t red = strm.ReadBits(16);
+    const uint16_t green = strm.ReadBits(16);
+    const uint16_t blue = strm.ReadBits(16);
+    const uint16_t alpha = strm.ReadBits(16);
+    const uint32_t packed = static_cast<uint32_t>(red >> 8) |
+                            (static_cast<uint32_t>(green >> 8) << 8) |
+                            (static_cast<uint32_t>(blue >> 8) << 16) |
+                            (static_cast<uint32_t>(alpha >> 8) << 24);
+    for (uint32_t row = 0; row < blockHeight; ++row) {
+        const uint32_t rowStart = row * blockWidth;
+        for (uint32_t col = 0; col < blockWidth; ++col) {
+            outBuf[rowStart + col] = packed;
+        }
+    }
+}
+// Fills a blockWidth x blockHeight region of outBuf with the error colour
+// 0xFFFF00FF, used for blocks that could not be decoded.
+void FillError(uint32_t* outBuf, uint32_t blockWidth, uint32_t blockHeight) {
+    const uint32_t kErrorColor = 0xFFFF00FF;
+    for (uint32_t row = 0; row < blockHeight; ++row) {
+        const uint32_t rowStart = row * blockWidth;
+        for (uint32_t col = 0; col < blockWidth; ++col) {
+            outBuf[rowStart + col] = kErrorColor;
+        }
+    }
+}
+// Expands the low numBits bits of val to a toBit-wide value by repeating the
+// pattern from the most significant end downwards; the last repetition is cut
+// short (keeping its top bits) when toBit is not a multiple of numBits.
+// Returns 0 when numBits or toBit is 0. When toBit <= numBits the masked
+// pattern is returned unchanged.
+template <typename IntType>
+IntType Replicate(const IntType& val, uint32_t numBits, uint32_t toBit) {
+    if (numBits == 0 || toBit == 0) {
+        return 0;
+    }
+    const IntType pattern = val & ((1 << numBits) - 1);
+    IntType expanded = pattern;
+    for (uint32_t filled = numBits; filled < toBit;) {
+        const uint32_t remaining = toBit - filled;
+        // Append a whole copy, or only its top `remaining` bits at the end.
+        const uint32_t step = (numBits > remaining) ? remaining : numBits;
+        expanded <<= step;
+        expanded |= pattern >> (numBits - step);
+        filled += step;
+    }
+    return expanded;
+}
+// A four-channel colour with an individual bit depth per channel. Channels are
+// stored in A, R, G, B order as signed 16-bit values.
+class Pixel {
+protected:
+    using ChannelType = int16_t;
+    uint8_t m_BitDepth[4];
+    int16_t color[4];
+public:
+    // Default pixel: every channel zero, 8 bits per channel.
+    Pixel() {
+        for (uint32_t c = 0; c < 4; ++c) {
+            color[c] = 0;
+            m_BitDepth[c] = 8;
+        }
+    }
+    // Builds a pixel from explicit channel values; all channels share bitDepth.
+    Pixel(ChannelType a, ChannelType r, ChannelType g, ChannelType b, unsigned bitDepth = 8) {
+        const ChannelType channels[4] = {a, r, g, b};
+        for (uint32_t c = 0; c < 4; ++c) {
+            m_BitDepth[c] = bitDepth;
+            color[c] = channels[c];
+        }
+    }
+    // Rescales every channel to the requested depth (see the single-channel
+    // overload for the conversion rules) and records the new depths.
+    void ChangeBitDepth(const uint8_t (&depth)[4]) {
+        for (uint32_t c = 0; c < 4; ++c) {
+            color[c] = ChangeBitDepth(color[c], m_BitDepth[c], depth[c]);
+            m_BitDepth[c] = depth[c];
+        }
+    }
+    // Maps a channel value of the given depth to a float by dividing by the
+    // largest value representable in bitDepth bits.
+    template <typename IntType>
+    static float ConvertChannelToFloat(IntType channel, uint8_t bitDepth) {
+        const float maxValue = static_cast<float>((1 << bitDepth) - 1);
+        return static_cast<float>(channel) / maxValue;
+    }
+    // Converts one channel value from oldDepth to newDepth bits (both <= 8):
+    //  - equal depths: value returned unchanged;
+    //  - from depth 0: all newDepth bits set;
+    //  - widening: the existing bits are replicated into the new low bits;
+    //  - to depth 0: 0xFF;
+    //  - narrowing: rounded, shifted down and limited to the new maximum.
+    static ChannelType ChangeBitDepth(Pixel::ChannelType val, uint8_t oldDepth, uint8_t newDepth) {
+        assert(newDepth <= 8);
+        assert(oldDepth <= 8);
+        if (oldDepth == newDepth) {
+            return val;
+        }
+        if (oldDepth == 0) {
+            // newDepth is non-zero here because the depths differ.
+            return (1 << newDepth) - 1;
+        }
+        if (newDepth > oldDepth) {
+            return Replicate(val, oldDepth, newDepth);
+        }
+        // From here on oldDepth > newDepth.
+        if (newDepth == 0) {
+            return 0xFF;
+        }
+        const uint8_t droppedBits = oldDepth - newDepth;
+        uint16_t scaled = static_cast<uint16_t>(val);
+        // Add half of the dropped range so the shift rounds to nearest.
+        scaled = (scaled + (1 << (droppedBits - 1))) >> droppedBits;
+        scaled = ::std::min<uint16_t>(scaled, (1 << newDepth) - 1);
+        return static_cast<uint8_t>(scaled);
+    }
+    // Per-channel accessors; storage order is A, R, G, B.
+    const ChannelType& A() const {
+        return Component(0);
+    }
+    ChannelType& A() {
+        return Component(0);
+    }
+    const ChannelType& R() const {
+        return Component(1);
+    }
+    ChannelType& R() {
+        return Component(1);
+    }
+    const ChannelType& G() const {
+        return Component(2);
+    }
+    ChannelType& G() {
+        return Component(2);
+    }
+    const ChannelType& B() const {
+        return Component(3);
+    }
+    ChannelType& B() {
+        return Component(3);
+    }
+    // Indexed channel access (0 = A, 1 = R, 2 = G, 3 = B); idx is not checked.
+    const ChannelType& Component(uint32_t idx) const {
+        return color[idx];
+    }
+    ChannelType& Component(uint32_t idx) {
+        return color[idx];
+    }
+    // Copies the four per-channel bit depths into outDepth.
+    void GetBitDepth(uint8_t (&outDepth)[4]) const {
+        for (uint32_t c = 0; c < 4; ++c) {
+            outDepth[c] = m_BitDepth[c];
+        }
+    }
+    // Converts a copy of this pixel to 8 bits per channel and packs it into a
+    // 32-bit integer with R in the least significant byte, then G, B, and A in
+    // the most significant byte.
+    uint32_t Pack() const {
+        const uint8_t kEightBits[4] = {8, 8, 8, 8};
+        Pixel converted(*this);
+        converted.ChangeBitDepth(kEightBits);
+        // Channel indices from most to least significant byte: A, B, G, R.
+        const uint32_t kByteOrder[4] = {0, 3, 2, 1};
+        uint32_t packed = 0;
+        for (uint32_t k = 0; k < 4; ++k) {
+            packed = (packed << 8) | static_cast<uint32_t>(converted.color[kByteOrder[k]]);
+        }
+        return packed;
+    }
+    // Clamps every channel to the range [0,255].
+    void ClampByte() {
+        for (uint32_t c = 0; c < 4; ++c) {
+            if (color[c] < 0) {
+                color[c] = 0;
+            } else if (color[c] > 255) {
+                color[c] = 255;
+            }
+        }
+    }
+    // Sets the alpha channel to 255.
+    void MakeOpaque() {
+        color[0] = 255;
+    }
+};
+// Decodes the colour endpoint values of a block and dequantizes them to the
+// 0-255 range.
+//   out               - receives the dequantized values (nValues entries)
+//   data              - start of the bit-packed colour data
+//   modes             - colour endpoint mode of each partition
+//   nPartitions       - number of entries in `modes`
+//   nBitsForColorData - number of bits available for the colour data
+// Each partition contributes ((mode >> 2) + 1) * 2 values.
+void DecodeColorValues(uint32_t* out, uint8_t* data, uint32_t* modes, const uint32_t nPartitions,
+                       const uint32_t nBitsForColorData) {
+    // First figure out how many color values we have
+    uint32_t nValues = 0;
+    for (uint32_t i = 0; i < nPartitions; i++) {
+        nValues += ((modes[i] >> 2) + 1) << 1;
+    }
+    // Then based on the number of values and the remaining number of bits,
+    // figure out the max value for each of them...
+    // Ranges are tried from 255 downwards; the first encoding whose total
+    // bit length fits into nBitsForColorData is used.
+    uint32_t range = 256;
+    while (--range > 0) {
+        IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(range);
+        uint32_t bitLength = val.GetBitLength(nValues);
+        if (bitLength <= nBitsForColorData) {
+            // Find the smallest possible range that matches the given encoding
+            while (--range > 0) {
+                IntegerEncodedValue newval = IntegerEncodedValue::CreateEncoding(range);
+                if (!newval.MatchesEncoding(val)) {
+                    break;
+                }
+            }
+            // Return to last matching range.
+            range++;
+            break;
+        }
+    }
+    // We now have enough to decode our integer sequence.
+    std::vector<IntegerEncodedValue> decodedColorValues;
+    BitStream colorStream(data);
+    IntegerEncodedValue::DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
+    // Once we have the decoded values, we need to dequantize them to the 0-255 range
+    // This procedure is outlined in ASTC spec C.2.13
+    uint32_t outIdx = 0;
+    std::vector<IntegerEncodedValue>::const_iterator itr;
+    for (itr = decodedColorValues.begin(); itr != decodedColorValues.end(); itr++) {
+        // Have we already decoded all that we need?
+        // (trit/quint blocks may have produced more values than requested)
+        if (outIdx >= nValues) {
+            break;
+        }
+        const IntegerEncodedValue& val = *itr;
+        uint32_t bitlen = val.BaseBitLength();
+        uint32_t bitval = val.GetBitValue();
+        assert(bitlen >= 1);
+        uint32_t A = 0, B = 0, C = 0, D = 0;
+        // A is just the lsb replicated 9 times.
+        A = Replicate(bitval & 1, 1, 9);
+        switch (val.GetEncoding()) {
+        // Replicate bits
+        case eIntegerEncoding_JustBits:
+            out[outIdx++] = Replicate(bitval, bitlen, 8);
+            break;
+        // Use algorithm in C.2.13
+        // For trits and quints: D is the trit/quint, C a constant chosen by
+        // bitlen, and B a bit pattern built from the bits above the lsb.
+        case eIntegerEncoding_Trit: {
+            D = val.GetTritValue();
+            switch (bitlen) {
+            case 1: {
+                C = 204;
+            } break;
+            case 2: {
+                C = 93;
+                // B = b000b0bb0
+                uint32_t b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
+            } break;
+            case 3: {
+                C = 44;
+                // B = cb000cbcb
+                uint32_t cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 2) | cb;
+            } break;
+            case 4: {
+                C = 22;
+                // B = dcb000dcb
+                uint32_t dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | dcb;
+            } break;
+            case 5: {
+                C = 11;
+                // B = edcb000ed
+                uint32_t edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 2);
+            } break;
+            case 6: {
+                C = 5;
+                // B = fedcb000f
+                uint32_t fedcb = (bitval >> 1) & 0x1F;
+                B = (fedcb << 4) | (fedcb >> 4);
+            } break;
+            default:
+                assert(!"Unsupported trit encoding for color values!");
+                break;
+            } // switch(bitlen)
+        }     // case eIntegerEncoding_Trit
+        break;
+        case eIntegerEncoding_Quint: {
+            D = val.GetQuintValue();
+            switch (bitlen) {
+            case 1: {
+                C = 113;
+            } break;
+            case 2: {
+                C = 54;
+                // B = b0000bb00
+                uint32_t b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 3) | (b << 2);
+            } break;
+            case 3: {
+                C = 26;
+                // B = cb0000cbc
+                uint32_t cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 1) | (cb >> 1);
+            } break;
+            case 4: {
+                C = 13;
+                // B = dcb0000dc
+                uint32_t dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | (dcb >> 1);
+            } break;
+            case 5: {
+                C = 6;
+                // B = edcb0000e
+                uint32_t edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 3);
+            } break;
+            default:
+                assert(!"Unsupported quint encoding for color values!");
+                break;
+            } // switch(bitlen)
+        }     // case eIntegerEncoding_Quint
+        break;
+        } // switch(val.GetEncoding())
+        // Combine A, B, C and D into the final 8-bit value for trits/quints.
+        if (val.GetEncoding() != eIntegerEncoding_JustBits) {
+            uint32_t T = D * C + B;
+            T ^= A;
+            T = (A & 0x80) | (T >> 2);
+            out[outIdx++] = T;
+        }
+    }
+    // Make sure that each of our values is in the proper range...
+    for (uint32_t i = 0; i < nValues; i++) {
+        assert(out[i] <= 255);
+    }
+}
+// Converts one decoded weight to the range [0,64]. Plain-bit weights are bit
+// replicated to six bits; trit and quint weights without plain bits use a
+// fixed table, and those with plain bits are combined from the A/B/C/D terms.
+// Results above 32 are incremented by one to stretch [0,63] to [0,64].
+uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) {
+    const uint32_t bits = val.GetBitValue();
+    const uint32_t nBits = val.BaseBitLength();
+    const auto encoding = val.GetEncoding();
+    // A is the lsb replicated to seven bits.
+    const uint32_t A = Replicate(bits & 1, 1, 7);
+    uint32_t B = 0;
+    uint32_t C = 0;
+    uint32_t D = 0;
+    uint32_t weight = 0;
+    if (encoding == eIntegerEncoding_JustBits) {
+        weight = Replicate(bits, nBits, 6);
+    } else if (encoding == eIntegerEncoding_Trit) {
+        D = val.GetTritValue();
+        assert(D < 3);
+        if (nBits == 0) {
+            const uint32_t kTritWeights[3] = {0, 32, 63};
+            weight = kTritWeights[D];
+        } else if (nBits == 1) {
+            C = 50;
+        } else if (nBits == 2) {
+            C = 23;
+            const uint32_t b = (bits >> 1) & 1;
+            B = (b << 6) | (b << 2) | b;
+        } else if (nBits == 3) {
+            C = 11;
+            const uint32_t cb = (bits >> 1) & 3;
+            B = (cb << 5) | cb;
+        } else {
+            assert(!"Invalid trit encoding for texel weight");
+        }
+    } else if (encoding == eIntegerEncoding_Quint) {
+        D = val.GetQuintValue();
+        assert(D < 5);
+        if (nBits == 0) {
+            const uint32_t kQuintWeights[5] = {0, 16, 32, 47, 63};
+            weight = kQuintWeights[D];
+        } else if (nBits == 1) {
+            C = 28;
+        } else if (nBits == 2) {
+            C = 13;
+            const uint32_t b = (bits >> 1) & 1;
+            B = (b << 6) | (b << 1);
+        } else {
+            assert(!"Invalid quint encoding for texel weight");
+        }
+    }
+    // Trits/quints with plain bits: combine the terms into a six-bit value.
+    if (encoding != eIntegerEncoding_JustBits && nBits > 0) {
+        const uint32_t mixed = (D * C + B) ^ A;
+        weight = (A & 0x20) | (mixed >> 2);
+    }
+    assert(weight < 64);
+    // Change from [0,63] to [0,64]
+    return (weight > 32) ? weight + 1 : weight;
+}
+// Unquantizes the decoded weights and expands them from the weight grid
+// (params.m_Width x params.m_Height) to one weight per texel of the
+// blockWidth x blockHeight block (infill, Section C.2.18).
+//   out     - receives the per-texel weights; out[1] is written only in
+//             dual-plane mode
+//   weights - decoded weight values; in dual-plane mode the two planes are
+//             interleaved (plane 0, plane 1, plane 0, ...)
+// NOTE(review): blockWidth and blockHeight must be greater than 1, otherwise
+// the scale factors below divide by zero.
+void UnquantizeTexelWeights(uint32_t out[2][144], std::vector<IntegerEncodedValue>& weights,
+                            const TexelWeightParams& params, const uint32_t blockWidth,
+                            const uint32_t blockHeight) {
+    const uint32_t nGridWeights = params.m_Width * params.m_Height;
+    // Zero-initialised so grid points that `weights` does not cover read as 0
+    // instead of indeterminate values.
+    uint32_t unquantized[2][144] = {};
+    uint32_t weightIdx = 0;
+    for (auto itr = weights.cbegin(); itr != weights.cend(); ++itr) {
+        unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr);
+        if (params.m_bDualPlane) {
+            ++itr;
+            // Check before dereferencing: an odd number of weights would
+            // otherwise read through the end iterator.
+            if (itr == weights.cend()) {
+                break;
+            }
+            unquantized[1][weightIdx] = UnquantizeTexelWeight(*itr);
+        }
+        if (++weightIdx >= nGridWeights) {
+            break;
+        }
+    }
+    // Weight at grid index idx of a plane, or 0 when idx lies outside the grid.
+    const auto gridWeight = [&](uint32_t plane, uint32_t idx) -> uint32_t {
+        return (idx < nGridWeights) ? unquantized[plane][idx] : 0;
+    };
+    // Do infill if necessary (Section C.2.18) ...
+    const uint32_t Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1);
+    const uint32_t Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1);
+    const uint32_t kPlaneScale = params.m_bDualPlane ? 2U : 1U;
+    for (uint32_t plane = 0; plane < kPlaneScale; plane++) {
+        for (uint32_t t = 0; t < blockHeight; t++) {
+            for (uint32_t s = 0; s < blockWidth; s++) {
+                // Texel position scaled to the weight grid, 4 fractional bits.
+                const uint32_t cs = Ds * s;
+                const uint32_t ct = Dt * t;
+                const uint32_t gs = (cs * (params.m_Width - 1) + 32) >> 6;
+                const uint32_t gt = (ct * (params.m_Height - 1) + 32) >> 6;
+                const uint32_t js = gs >> 4;
+                const uint32_t fs = gs & 0xF;
+                const uint32_t jt = gt >> 4;
+                const uint32_t ft = gt & 0x0F;
+                // Bilinear interpolation factors for the four neighbours.
+                const uint32_t w11 = (fs * ft + 8) >> 4;
+                const uint32_t w10 = ft - w11;
+                const uint32_t w01 = fs - w11;
+                const uint32_t w00 = 16 - fs - ft + w11;
+                const uint32_t v0 = js + jt * params.m_Width;
+                const uint32_t p00 = gridWeight(plane, v0);
+                const uint32_t p01 = gridWeight(plane, v0 + 1);
+                const uint32_t p10 = gridWeight(plane, v0 + params.m_Width);
+                const uint32_t p11 = gridWeight(plane, v0 + params.m_Width + 1);
+                out[plane][t * blockWidth + s] =
+                    (p00 * w00 + p01 * w01 + p10 * w10 + p11 * w11 + 8) >> 4;
+            }
+        }
+    }
+}
+// Bit transfer as described in C.2.14: bit 7 of `a` becomes the top bit of `b`
+// (whose remaining bits are shifted down by one), and `a` is reduced to a
+// signed six-bit value in [-32, 31].
+static inline void BitTransferSigned(int32_t& a, int32_t& b) {
+    b = (b >> 1) | (a & 0x80);
+    a = (a >> 1) & 0x3F;
+    // Sign-extend from bit 5.
+    if ((a & 0x20) != 0) {
+        a -= 0x40;
+    }
+}
+// Blue contraction as described in C.2.14: red and green are replaced by
+// their average with blue; alpha and blue are passed through unchanged.
+static inline Pixel BlueContract(int32_t a, int32_t r, int32_t g, int32_t b) {
+    const int16_t alpha = static_cast<int16_t>(a);
+    const int16_t red = static_cast<int16_t>((r + b) >> 1);
+    const int16_t green = static_cast<int16_t>((g + b) >> 1);
+    const int16_t blue = static_cast<int16_t>(b);
+    return Pixel(alpha, red, green, blue);
+}
+// Partition selection functions as specified in
+// C.2.21
+// hash52 scrambles a 32-bit seed with a fixed sequence of shift/add/xor steps;
+// all arithmetic wraps modulo 2^32.
+static inline uint32_t hash52(uint32_t p) {
+    p = p ^ (p >> 15);
+    p = p - (p << 17);
+    p = p + (p << 7);
+    p = p + (p << 4);
+    p = p ^ (p >> 5);
+    p = p + (p << 16);
+    p = p ^ (p >> 7);
+    p = p ^ (p >> 3);
+    p = p ^ (p << 6);
+    p = p ^ (p >> 17);
+    return p;
+}
+// Returns the partition index (0..3) of the texel at (x, y, z) for the given
+// partition seed and partition count. With a single partition the result is
+// always 0. For small blocks the coordinates are doubled first.
+static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z,
+                                int32_t partitionCount, int32_t smallBlock) {
+    if (partitionCount == 1) {
+        return 0;
+    }
+    if (smallBlock) {
+        x <<= 1;
+        y <<= 1;
+        z <<= 1;
+    }
+    seed += (partitionCount - 1) * 1024;
+    const uint32_t rnum = hash52(static_cast<uint32_t>(seed));
+    // Twelve 4-bit seeds: the first eight are the successive nibbles of rnum,
+    // the last four are taken from overlapping positions.
+    uint8_t seeds[12];
+    for (uint32_t i = 0; i < 8; ++i) {
+        seeds[i] = static_cast<uint8_t>((rnum >> (4 * i)) & 0xF);
+    }
+    seeds[8] = static_cast<uint8_t>((rnum >> 18) & 0xF);
+    seeds[9] = static_cast<uint8_t>((rnum >> 22) & 0xF);
+    seeds[10] = static_cast<uint8_t>((rnum >> 26) & 0xF);
+    seeds[11] = static_cast<uint8_t>(((rnum >> 30) | (rnum << 2)) & 0xF);
+    // Shift amounts depend on the low bits of the (adjusted) seed and on
+    // whether there are exactly three partitions.
+    const int32_t shFromCount = (partitionCount == 3) ? 6 : 5;
+    const int32_t shFromSeed = (seed & 2) ? 4 : 5;
+    const bool oddSeed = (seed & 1) != 0;
+    const int32_t sh1 = oddSeed ? shFromSeed : shFromCount;
+    const int32_t sh2 = oddSeed ? shFromCount : shFromSeed;
+    const int32_t sh3 = (seed & 0x10) ? sh1 : sh2;
+    // Square each seed, then shift: even slots of the first eight use sh1,
+    // odd slots use sh2, the last four use sh3.
+    for (uint32_t i = 0; i < 12; ++i) {
+        const int32_t shift = (i >= 8) ? sh3 : ((i % 2 == 0) ? sh1 : sh2);
+        const uint8_t squared = static_cast<uint8_t>(seeds[i] * seeds[i]);
+        seeds[i] = static_cast<uint8_t>(squared >> shift);
+    }
+    int32_t a = seeds[0] * x + seeds[1] * y + seeds[10] * z + (rnum >> 14);
+    int32_t b = seeds[2] * x + seeds[3] * y + seeds[11] * z + (rnum >> 10);
+    int32_t c = seeds[4] * x + seeds[5] * y + seeds[8] * z + (rnum >> 6);
+    int32_t d = seeds[6] * x + seeds[7] * y + seeds[9] * z + (rnum >> 2);
+    a &= 0x3F;
+    b &= 0x3F;
+    c &= 0x3F;
+    d &= 0x3F;
+    // Candidates beyond the partition count are removed from the comparison.
+    if (partitionCount < 4) {
+        d = 0;
+    }
+    if (partitionCount < 3) {
+        c = 0;
+    }
+    // The largest candidate wins; ties go to the lower index.
+    if (a >= b && a >= c && a >= d) {
+        return 0;
+    }
+    if (b >= c && b >= d) {
+        return 1;
+    }
+    if (c >= d) {
+        return 2;
+    }
+    return 3;
+}
+// Two-dimensional convenience wrapper around SelectPartition: the z
+// coordinate is fixed to 0.
+static inline uint32_t Select2DPartition(int32_t seed, int32_t x, int32_t y, int32_t partitionCount,
+                                         int32_t smallBlock) {
+    const int32_t kZ = 0;
+    return SelectPartition(seed, x, y, kZ, partitionCount, smallBlock);
+}
+// Section C.2.14
+// Decodes the two color endpoints of a single partition from the decoded color values.
+//
+// ep1, ep2          - receive the decoded endpoint pair.
+// colorValues       - cursor into the color value array. It is advanced past every value
+//                     consumed here (2, 4, 6 or 8 depending on the mode), so successive calls
+//                     read the values of consecutive partitions.
+// colorEndpointMode - ASTC color endpoint mode. Only the LDR modes 0, 1, 4, 5, 6, 8, 9, 10, 12
+//                     and 13 are handled; any other mode trips the assert and leaves ep1/ep2
+//                     unmodified.
+void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValues,
+                      uint32_t colorEndpointMode) {
+// Both helpers declare a local array v[N] and fill it from the cursor, advancing the cursor.
+#define READ_UINT_VALUES(N)                                                                        \
+    uint32_t v[N];                                                                                 \
+    for (uint32_t i = 0; i < N; i++) {                                                             \
+        v[i] = *(colorValues++);                                                                   \
+    }
+#define READ_INT_VALUES(N)                                                                         \
+    int32_t v[N];                                                                                  \
+    for (uint32_t i = 0; i < N; i++) {                                                             \
+        v[i] = static_cast<int32_t>(*(colorValues++));                                             \
+    }
+    switch (colorEndpointMode) {
+    case 0: { // LDR luminance, direct
+        READ_UINT_VALUES(2)
+        ep1 = Pixel(0xFF, v[0], v[0], v[0]);
+        ep2 = Pixel(0xFF, v[1], v[1], v[1]);
+    } break;
+    case 1: { // LDR luminance, base + offset
+        READ_UINT_VALUES(2)
+        uint32_t L0 = (v[0] >> 2) | (v[1] & 0xC0);
+        // The spec saturates the second endpoint at 0xFF. The previous std::max pinned it to
+        // at least 0xFF, discarding the offset entirely.
+        uint32_t L1 = std::min(L0 + (v[1] & 0x3F), 0xFFU);
+        ep1 = Pixel(0xFF, L0, L0, L0);
+        ep2 = Pixel(0xFF, L1, L1, L1);
+    } break;
+    case 4: { // LDR luminance + alpha, direct
+        READ_UINT_VALUES(4)
+        ep1 = Pixel(v[2], v[0], v[0], v[0]);
+        ep2 = Pixel(v[3], v[1], v[1], v[1]);
+    } break;
+    case 5: { // LDR luminance + alpha, base + offset
+        READ_INT_VALUES(4)
+        BitTransferSigned(v[1], v[0]);
+        BitTransferSigned(v[3], v[2]);
+        ep1 = Pixel(v[2], v[0], v[0], v[0]);
+        ep2 = Pixel(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]);
+        // Base + offset sums can leave the byte range, so clamp both endpoints.
+        ep1.ClampByte();
+        ep2.ClampByte();
+    } break;
+    case 6: { // LDR RGB, base + scale (v[3] is the scale applied to the first endpoint)
+        READ_UINT_VALUES(4)
+        ep1 = Pixel(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep2 = Pixel(0xFF, v[0], v[1], v[2]);
+    } break;
+    case 8: { // LDR RGB, direct
+        READ_UINT_VALUES(6)
+        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+            ep1 = Pixel(0xFF, v[0], v[2], v[4]);
+            ep2 = Pixel(0xFF, v[1], v[3], v[5]);
+        } else {
+            // Second triple is the smaller one: swap the endpoints and blue-contract them.
+            ep1 = BlueContract(0xFF, v[1], v[3], v[5]);
+            ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
+        }
+    } break;
+    case 9: { // LDR RGB, base + offset
+        READ_INT_VALUES(6)
+        BitTransferSigned(v[1], v[0]);
+        BitTransferSigned(v[3], v[2]);
+        BitTransferSigned(v[5], v[4]);
+        if (v[1] + v[3] + v[5] >= 0) {
+            ep1 = Pixel(0xFF, v[0], v[2], v[4]);
+            ep2 = Pixel(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+        } else {
+            // Negative total offset: swap the endpoints and blue-contract them.
+            ep1 = BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+            ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
+        }
+        ep1.ClampByte();
+        ep2.ClampByte();
+    } break;
+    case 10: { // LDR RGB base + scale, plus two explicit alpha values (v[4], v[5])
+        READ_UINT_VALUES(6)
+        ep1 = Pixel(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep2 = Pixel(v[5], v[0], v[1], v[2]);
+    } break;
+    case 12: { // LDR RGBA, direct
+        READ_UINT_VALUES(8)
+        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+            ep1 = Pixel(v[6], v[0], v[2], v[4]);
+            ep2 = Pixel(v[7], v[1], v[3], v[5]);
+        } else {
+            ep1 = BlueContract(v[7], v[1], v[3], v[5]);
+            ep2 = BlueContract(v[6], v[0], v[2], v[4]);
+        }
+    } break;
+    case 13: { // LDR RGBA, base + offset
+        READ_INT_VALUES(8)
+        BitTransferSigned(v[1], v[0]);
+        BitTransferSigned(v[3], v[2]);
+        BitTransferSigned(v[5], v[4]);
+        BitTransferSigned(v[7], v[6]);
+        if (v[1] + v[3] + v[5] >= 0) {
+            ep1 = Pixel(v[6], v[0], v[2], v[4]);
+            ep2 = Pixel(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+        } else {
+            ep1 = BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+            ep2 = BlueContract(v[6], v[0], v[2], v[4]);
+        }
+        ep1.ClampByte();
+        ep2.ClampByte();
+    } break;
+    default:
+        assert(!"Unsupported color endpoint mode (is it HDR?)");
+        break;
+    }
+#undef READ_UINT_VALUES
+#undef READ_INT_VALUES
+}
+void DecompressBlock(uint8_t inBuf[16], const uint32_t blockWidth, const uint32_t blockHeight,
+                     uint32_t* outBuf) { // on success fills outBuf[j * blockWidth + i] per texel
+    BitStream strm(inBuf);
+    TexelWeightParams weightParams = DecodeBlockInfo(strm);
+    // Was there an error?
+    if (weightParams.m_bError) {
+        assert(!"Invalid block mode");
+        FillError(outBuf, blockWidth, blockHeight);
+        return;
+    }
+    if (weightParams.m_bVoidExtentLDR) {
+        FillVoidExtentLDR(strm, outBuf, blockWidth, blockHeight);
+        return;
+    }
+    if (weightParams.m_bVoidExtentHDR) {
+        assert(!"HDR void extent blocks are unsupported!");
+        FillError(outBuf, blockWidth, blockHeight);
+        return;
+    }
+    if (weightParams.m_Width > blockWidth) {
+        assert(!"Texel weight grid width should be smaller than block width");
+        FillError(outBuf, blockWidth, blockHeight);
+        return;
+    }
+    if (weightParams.m_Height > blockHeight) {
+        assert(!"Texel weight grid height should be smaller than block height");
+        FillError(outBuf, blockWidth, blockHeight);
+        return;
+    }
+    // Read num partitions
+    uint32_t nPartitions = strm.ReadBits(2) + 1; // the 2-bit field stores (count - 1)
+    assert(nPartitions <= 4);
+    if (nPartitions == 4 && weightParams.m_bDualPlane) {
+        assert(!"Dual plane mode is incompatible with four partition blocks");
+        FillError(outBuf, blockWidth, blockHeight);
+        return;
+    }
+    // Based on the number of partitions, read the color endpoint mode for
+    // each partition.
+    // Determine partitions, partition index, and color endpoint modes
+    int32_t planeIdx = -1;
+    uint32_t partitionIndex;
+    uint32_t colorEndpointMode[4] = {0, 0, 0, 0};
+    // Define color data.
+    uint8_t colorEndpointData[16];
+    memset(colorEndpointData, 0, sizeof(colorEndpointData));
+    BitStream colorEndpointStream(colorEndpointData, 16 * 8, 0);
+    // Read extra config data...
+    uint32_t baseCEM = 0;
+    if (nPartitions == 1) {
+        colorEndpointMode[0] = strm.ReadBits(4);
+        partitionIndex = 0;
+    } else {
+        partitionIndex = strm.ReadBits(10);
+        baseCEM = strm.ReadBits(6);
+    }
+    uint32_t baseMode = (baseCEM & 3); // stays 0 for single-partition blocks
+    // Remaining bits are color endpoint data...
+    uint32_t nWeightBits = weightParams.GetPackedBitSize();
+    int32_t remainingBits = 128 - nWeightBits - strm.GetBitsRead(); // 128 = bits in a block
+    // Consider extra bits prior to texel data...
+    uint32_t extraCEMbits = 0;
+    if (baseMode) {
+        switch (nPartitions) {
+        case 2:
+            extraCEMbits += 2;
+            break;
+        case 3:
+            extraCEMbits += 5;
+            break;
+        case 4:
+            extraCEMbits += 8;
+            break;
+        default:
+            assert(false);
+            break;
+        }
+    }
+    remainingBits -= extraCEMbits;
+    // Do we have a dual plane situation?
+    uint32_t planeSelectorBits = 0;
+    if (weightParams.m_bDualPlane) {
+        planeSelectorBits = 2;
+    }
+    remainingBits -= planeSelectorBits;
+    // Read color data...
+    uint32_t colorDataBits = remainingBits;
+    while (remainingBits > 0) {
+        uint32_t nb = std::min(remainingBits, 8); // copy at most one byte per iteration
+        uint32_t b = strm.ReadBits(nb);
+        colorEndpointStream.WriteBits(b, nb);
+        remainingBits -= 8; // goes non-positive after the final (possibly partial) byte
+    }
+    // Read the plane selection bits
+    planeIdx = strm.ReadBits(planeSelectorBits);
+    // Read the rest of the CEM
+    if (baseMode) {
+        uint32_t extraCEM = strm.ReadBits(extraCEMbits);
+        uint32_t CEM = (extraCEM << 6) | baseCEM; // extra bits sit above the 6 base bits
+        CEM >>= 2;                                 // drop the two baseMode bits
+        bool C[4] = {0};
+        for (uint32_t i = 0; i < nPartitions; i++) {
+            C[i] = CEM & 1;
+            CEM >>= 1;
+        }
+        uint8_t M[4] = {0};
+        for (uint32_t i = 0; i < nPartitions; i++) {
+            M[i] = CEM & 3;
+            CEM >>= 2;
+            assert(M[i] <= 3);
+        }
+        for (uint32_t i = 0; i < nPartitions; i++) {
+            colorEndpointMode[i] = baseMode;
+            if (!(C[i]))
+                colorEndpointMode[i] -= 1;
+            colorEndpointMode[i] <<= 2; // upper two bits come from baseMode / C[i]
+            colorEndpointMode[i] |= M[i]; // lower two bits come from M[i]
+        }
+    } else if (nPartitions > 1) {
+        uint32_t CEM = baseCEM >> 2; // baseMode == 0: every partition shares this mode
+        for (uint32_t i = 0; i < nPartitions; i++) {
+            colorEndpointMode[i] = CEM;
+        }
+    }
+    // Make sure everything up till here is sane.
+    for (uint32_t i = 0; i < nPartitions; i++) {
+        assert(colorEndpointMode[i] < 16);
+    }
+    assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
+    // Decode both color data and texel weight data
+    uint32_t colorValues[32]; // Four values, two endpoints, four maximum partitions
+    DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions,
+                      colorDataBits);
+    Pixel endpoints[4][2];
+    const uint32_t* colorValuesPtr = colorValues; // advanced by each ComputeEndpoints call
+    for (uint32_t i = 0; i < nPartitions; i++) {
+        ComputeEndpoints(endpoints[i][0], endpoints[i][1], colorValuesPtr, colorEndpointMode[i]);
+    }
+    // Read the texel weight data..
+    uint8_t texelWeightData[16];
+    memcpy(texelWeightData, inBuf, sizeof(texelWeightData));
+    // Reverse everything: swap byte i with byte 15 - i and bit-reverse each byte
+    for (uint32_t i = 0; i < 8; i++) {
+// Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
+#define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
+        unsigned char a = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[i]));
+        unsigned char b = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[15 - i]));
+#undef REVERSE_BYTE
+        texelWeightData[i] = b;
+        texelWeightData[15 - i] = a;
+    }
+    // Make sure that higher non-texel bits are set to zero
+    const uint32_t clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1;
+    texelWeightData[clearByteStart - 1] &= (1 << (weightParams.GetPackedBitSize() % 8)) - 1;
+    memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
+    std::vector<IntegerEncodedValue> texelWeightValues;
+    BitStream weightStream(texelWeightData);
+    IntegerEncodedValue::DecodeIntegerSequence(texelWeightValues, weightStream,
+                                               weightParams.m_MaxWeight,
+                                               weightParams.GetNumWeightValues());
+    // Blocks can be at most 12x12, so we can have as many as 144 weights
+    uint32_t weights[2][144];
+    UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight);
+    // Now that we have endpoints and weights, we can interpolate and generate
+    // the proper decoding...
+    for (uint32_t j = 0; j < blockHeight; j++)
+        for (uint32_t i = 0; i < blockWidth; i++) {
+            uint32_t partition = Select2DPartition(partitionIndex, i, j, nPartitions,
+                                                   (blockHeight * blockWidth) < 32);
+            assert(partition < nPartitions);
+            Pixel p;
+            for (uint32_t c = 0; c < 4; c++) {
+                uint32_t C0 = endpoints[partition][0].Component(c);
+                C0 = Replicate(C0, 8, 16);
+                uint32_t C1 = endpoints[partition][1].Component(c);
+                C1 = Replicate(C1, 8, 16);
+                uint32_t plane = 0;
+                if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {
+                    plane = 1; // this component reads the second weight plane
+                }
+                uint32_t weight = weights[plane][j * blockWidth + i];
+                uint32_t C = (C0 * (64 - weight) + C1 * weight + 32) / 64; // rounded blend
+                if (C == 65535) {
+                    p.Component(c) = 255;
+                } else {
+                    double Cf = static_cast<double>(C);
+                    p.Component(c) = static_cast<uint16_t>(255.0 * (Cf / 65536.0) + 0.5);
+                }
+            }
+            outBuf[j * blockWidth + i] = p.Pack();
+        }
+}
+} // namespace ASTCC
+namespace Tegra::Texture::ASTC {
+// Decodes an ASTC-compressed 2D image into a tightly packed buffer of 4 bytes per pixel.
+//
+// data           - compressed input. One 16-byte block is consumed per tile, walking the tiles
+//                  left-to-right within a row of tiles, then top-to-bottom.
+// width, height  - image size in pixels. Tiles that overhang the right or bottom edge are
+//                  decoded in full and cropped when copied out.
+// block_width,
+// block_height   - tile footprint in pixels.
+// Returns a vector of width * height * 4 bytes.
+std::vector<uint8_t> Decompress(std::vector<uint8_t>& data, uint32_t width, uint32_t height,
+                                uint32_t block_width, uint32_t block_height) {
+    std::vector<uint8_t> outData;
+    outData.resize(height * width * 4);
+    uint32_t blocksDone = 0;
+    for (uint32_t tileY = 0; tileY < height; tileY += block_height) {
+        const uint32_t rowsToCopy = std::min(block_height, height - tileY);
+        for (uint32_t tileX = 0; tileX < width; tileX += block_width, blocksDone++) {
+            uint8_t* const block = data.data() + blocksDone * 16;
+            // Blocks can be at most 12x12
+            uint32_t texels[144];
+            ASTCC::DecompressBlock(block, block_width, block_height, texels);
+            const uint32_t texelsPerRow = std::min(block_width, width - tileX);
+            uint8_t* const dst = outData.data() + (tileY * width + tileX) * 4;
+            // Copy the visible part of the tile one scanline at a time.
+            for (uint32_t row = 0; row < rowsToCopy; row++) {
+                memcpy(dst + row * width * 4, texels + row * block_width, texelsPerRow * 4);
+            }
+        }
+    }
+    return outData;
+}
+} // namespace Tegra::Texture::ASTC
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h
new file mode 100644
index 000000000..f0d7c0e56
--- /dev/null
+++ b/src/video_core/textures/astc.h
@@ -0,0 +1,15 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#pragma once
+#include <cstdint>
+#include <vector>
+namespace Tegra::Texture::ASTC {
+std::vector<uint8_t> Decompress(std::vector<uint8_t>& data, uint32_t width, uint32_t height,
+                                uint32_t block_width, uint32_t block_height); // ASTC -> 4 B/pixel
+} // namespace Tegra::Texture::ASTC
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 7bf9c4c4b..0db4367f1 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -53,6 +53,7 @@ u32 BytesPerPixel(TextureFormat format) {
    case TextureFormat::DXT45:
        // In this case a 'pixel' actually refers to a 4x4 tile.
        return 16;
+    case TextureFormat::ASTC_2D_4X4:
    case TextureFormat::A8R8G8B8:
    case TextureFormat::A2B10G10R10:
    case TextureFormat::BF10GF11RF11:
@@ -94,6 +95,7 @@ std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width,
    case TextureFormat::R8:
    case TextureFormat::R16_G16_B16_A16:
    case TextureFormat::BF10GF11RF11:
+    case TextureFormat::ASTC_2D_4X4:
        CopySwizzledData(width, height, bytes_per_pixel, bytes_per_pixel, data,
                         unswizzled_data.data(), true, block_height);
        break;
@@ -115,6 +117,7 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat
    case TextureFormat::DXT23:
    case TextureFormat::DXT45:
    case TextureFormat::DXN1:
+    case TextureFormat::ASTC_2D_4X4:
    case TextureFormat::A8R8G8B8:
    case TextureFormat::A2B10G10R10:
    case TextureFormat::A1B5G5R5:
diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt
index 5af3154d7..c662570d2 100644
--- a/src/yuzu/CMakeLists.txt
+++ b/src/yuzu/CMakeLists.txt
@@ -32,8 +32,6 @@ add_executable(yuzu
    debugger/graphics/graphics_surface.h
    debugger/profiler.cpp
    debugger/profiler.h
-    debugger/registers.cpp
-    debugger/registers.h
    debugger/wait_tree.cpp
    debugger/wait_tree.h
    game_list.cpp
@@ -60,7 +58,6 @@ set(UIS
    configuration/configure_graphics.ui
    configuration/configure_input.ui
    configuration/configure_system.ui
-    debugger/registers.ui
    hotkeys.ui
    main.ui
 )
diff --git a/src/yuzu/debugger/registers.cpp b/src/yuzu/debugger/registers.cpp
deleted file mode 100644
index 178cc65a7..000000000
--- a/src/yuzu/debugger/registers.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-// Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-#include <QTreeWidgetItem>
-#include "core/arm/arm_interface.h"
-#include "core/core.h"
-#include "yuzu/debugger/registers.h"
-#include "yuzu/util/util.h"
-RegistersWidget::RegistersWidget(QWidget* parent) : QDockWidget(parent) {
-    cpu_regs_ui.setupUi(this);
-    tree = cpu_regs_ui.treeWidget;
-    tree->addTopLevelItem(core_registers = new QTreeWidgetItem(QStringList(tr("Registers"))));
-    tree->addTopLevelItem(vfp_registers = new QTreeWidgetItem(QStringList(tr("VFP Registers"))));
-    tree->addTopLevelItem(vfp_system_registers =
-                              new QTreeWidgetItem(QStringList(tr("VFP System Registers"))));
-    tree->addTopLevelItem(cpsr = new QTreeWidgetItem(QStringList("CPSR")));
-    for (int i = 0; i < 16; ++i) {
-        QTreeWidgetItem* child = new QTreeWidgetItem(QStringList(QString("R[%1]").arg(i)));
-        core_registers->addChild(child);
-    }
-    for (int i = 0; i < 32; ++i) {
-        QTreeWidgetItem* child = new QTreeWidgetItem(QStringList(QString("S[%1]").arg(i)));
-        vfp_registers->addChild(child);
-    }
-    QFont font = GetMonospaceFont();
-    CreateCPSRChildren();
-    CreateVFPSystemRegisterChildren();
-    // Set Registers to display in monospace font
-    for (int i = 0; i < core_registers->childCount(); ++i)
-        core_registers->child(i)->setFont(1, font);
-    for (int i = 0; i < vfp_registers->childCount(); ++i)
-        vfp_registers->child(i)->setFont(1, font);
-    for (int i = 0; i < vfp_system_registers->childCount(); ++i) {
-        vfp_system_registers->child(i)->setFont(1, font);
-        for (int x = 0; x < vfp_system_registers->child(i)->childCount(); ++x) {
-            vfp_system_registers->child(i)->child(x)->setFont(1, font);
-        }
-    }
-    // Set CSPR to display in monospace font
-    cpsr->setFont(1, font);
-    for (int i = 0; i < cpsr->childCount(); ++i) {
-        cpsr->child(i)->setFont(1, font);
-        for (int x = 0; x < cpsr->child(i)->childCount(); ++x) {
-            cpsr->child(i)->child(x)->setFont(1, font);
-        }
-    }
-    setEnabled(false);
-}
-void RegistersWidget::OnDebugModeEntered() {
-    if (!Core::System::GetInstance().IsPoweredOn())
-        return;
-    for (int i = 0; i < core_registers->childCount(); ++i)
-        core_registers->child(i)->setText(
-            1, QString("0x%1").arg(Core::CurrentArmInterface().GetReg(i), 8, 16, QLatin1Char('0')));
-    UpdateCPSRValues();
-}
-void RegistersWidget::OnDebugModeLeft() {}
-void RegistersWidget::OnEmulationStarting(EmuThread* emu_thread) {
-    setEnabled(true);
-}
-void RegistersWidget::OnEmulationStopping() {
-    // Reset widget text
-    for (int i = 0; i < core_registers->childCount(); ++i)
-        core_registers->child(i)->setText(1, QString(""));
-    for (int i = 0; i < vfp_registers->childCount(); ++i)
-        vfp_registers->child(i)->setText(1, QString(""));
-    for (int i = 0; i < cpsr->childCount(); ++i)
-        cpsr->child(i)->setText(1, QString(""));
-    cpsr->setText(1, QString(""));
-    // FPSCR
-    for (int i = 0; i < vfp_system_registers->child(0)->childCount(); ++i)
-        vfp_system_registers->child(0)->child(i)->setText(1, QString(""));
-    // FPEXC
-    for (int i = 0; i < vfp_system_registers->child(1)->childCount(); ++i)
-        vfp_system_registers->child(1)->child(i)->setText(1, QString(""));
-    vfp_system_registers->child(0)->setText(1, QString(""));
-    vfp_system_registers->child(1)->setText(1, QString(""));
-    vfp_system_registers->child(2)->setText(1, QString(""));
-    vfp_system_registers->child(3)->setText(1, QString(""));
-    setEnabled(false);
-}
-void RegistersWidget::CreateCPSRChildren() {
-    cpsr->addChild(new QTreeWidgetItem(QStringList("M")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("T")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("F")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("I")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("A")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("E")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("IT")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("GE")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("DNM")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("J")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("Q")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("V")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("C")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("Z")));
-    cpsr->addChild(new QTreeWidgetItem(QStringList("N")));
-}
-void RegistersWidget::UpdateCPSRValues() {
-    const u32 cpsr_val = Core::CurrentArmInterface().GetCPSR();
-    cpsr->setText(1, QString("0x%1").arg(cpsr_val, 8, 16, QLatin1Char('0')));
-    cpsr->child(0)->setText(
-        1, QString("b%1").arg(cpsr_val & 0x1F, 5, 2, QLatin1Char('0'))); // M - Mode
-    cpsr->child(1)->setText(1, QString::number((cpsr_val >> 5) & 1));    // T - State
-    cpsr->child(2)->setText(1, QString::number((cpsr_val >> 6) & 1));    // F - FIQ disable
-    cpsr->child(3)->setText(1, QString::number((cpsr_val >> 7) & 1));    // I - IRQ disable
-    cpsr->child(4)->setText(1, QString::number((cpsr_val >> 8) & 1));    // A - Imprecise abort
-    cpsr->child(5)->setText(1, QString::number((cpsr_val >> 9) & 1));    // E - Data endianness
-    cpsr->child(6)->setText(1,
-                            QString::number((cpsr_val >> 10) & 0x3F)); // IT - If-Then state (DNM)
-    cpsr->child(7)->setText(1,
-                            QString::number((cpsr_val >> 16) & 0xF)); // GE - Greater-than-or-Equal
-    cpsr->child(8)->setText(1, QString::number((cpsr_val >> 20) & 0xF)); // DNM - Do not modify
-    cpsr->child(9)->setText(1, QString::number((cpsr_val >> 24) & 1));   // J - Jazelle
-    cpsr->child(10)->setText(1, QString::number((cpsr_val >> 27) & 1));  // Q - Saturation
-    cpsr->child(11)->setText(1, QString::number((cpsr_val >> 28) & 1));  // V - Overflow
-    cpsr->child(12)->setText(1, QString::number((cpsr_val >> 29) & 1));  // C - Carry/Borrow/Extend
-    cpsr->child(13)->setText(1, QString::number((cpsr_val >> 30) & 1));  // Z - Zero
-    cpsr->child(14)->setText(1, QString::number((cpsr_val >> 31) & 1));  // N - Negative/Less than
-}
-void RegistersWidget::CreateVFPSystemRegisterChildren() {
-    QTreeWidgetItem* const fpscr = new QTreeWidgetItem(QStringList("FPSCR"));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("IOC")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("DZC")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("OFC")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("UFC")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("IXC")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("IDC")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("IOE")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("DZE")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("OFE")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("UFE")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("IXE")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("IDE")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList(tr("Vector Length"))));
-    fpscr->addChild(new QTreeWidgetItem(QStringList(tr("Vector Stride"))));
-    fpscr->addChild(new QTreeWidgetItem(QStringList(tr("Rounding Mode"))));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("FZ")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("DN")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("V")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("C")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("Z")));
-    fpscr->addChild(new QTreeWidgetItem(QStringList("N")));
-    QTreeWidgetItem* const fpexc = new QTreeWidgetItem(QStringList("FPEXC"));
-    fpexc->addChild(new QTreeWidgetItem(QStringList("IOC")));
-    fpexc->addChild(new QTreeWidgetItem(QStringList("OFC")));
-    fpexc->addChild(new QTreeWidgetItem(QStringList("UFC")));
-    fpexc->addChild(new QTreeWidgetItem(QStringList("INV")));
-    fpexc->addChild(new QTreeWidgetItem(QStringList(tr("Vector Iteration Count"))));
-    fpexc->addChild(new QTreeWidgetItem(QStringList("FP2V")));
-    fpexc->addChild(new QTreeWidgetItem(QStringList("EN")));
-    fpexc->addChild(new QTreeWidgetItem(QStringList("EX")));
-    vfp_system_registers->addChild(fpscr);
-    vfp_system_registers->addChild(fpexc);
-    vfp_system_registers->addChild(new QTreeWidgetItem(QStringList("FPINST")));
-    vfp_system_registers->addChild(new QTreeWidgetItem(QStringList("FPINST2")));
-}
-void RegistersWidget::UpdateVFPSystemRegisterValues() {
-    UNIMPLEMENTED();
-}
diff --git a/src/yuzu/debugger/registers.h b/src/yuzu/debugger/registers.h
deleted file mode 100644
index 55bda5b59..000000000
--- a/src/yuzu/debugger/registers.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-#pragma once
-#include <QDockWidget>
-#include "ui_registers.h"
-class QTreeWidget;
-class QTreeWidgetItem;
-class EmuThread;
-class RegistersWidget : public QDockWidget {
-    Q_OBJECT
-public:
-    explicit RegistersWidget(QWidget* parent = nullptr);
-public slots:
-    void OnDebugModeEntered();
-    void OnDebugModeLeft();
-    void OnEmulationStarting(EmuThread* emu_thread);
-    void OnEmulationStopping();
-private:
-    void CreateCPSRChildren();
-    void UpdateCPSRValues();
-    void CreateVFPSystemRegisterChildren();
-    void UpdateVFPSystemRegisterValues();
-    Ui::ARMRegisters cpu_regs_ui;
-    QTreeWidget* tree;
-    QTreeWidgetItem* core_registers;
-    QTreeWidgetItem* vfp_registers;
-    QTreeWidgetItem* vfp_system_registers;
-    QTreeWidgetItem* cpsr;
-};
diff --git a/src/yuzu/debugger/registers.ui b/src/yuzu/debugger/registers.ui
deleted file mode 100644
index c81ae03f9..000000000
--- a/src/yuzu/debugger/registers.ui
+++ /dev/null
@@ -1,40 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<ui version="4.0">
- <class>ARMRegisters</class>
- <widget class="QDockWidget" name="ARMRegisters">
-  <property name="geometry">
-   <rect>
-    <x>0</x>
-    <y>0</y>
-    <width>400</width>
-    <height>300</height>
-   </rect>
-  </property>
-  <property name="windowTitle">
-   <string>ARM Registers</string>
-  </property>
-  <widget class="QWidget" name="dockWidgetContents">
-   <layout class="QVBoxLayout" name="verticalLayout">
-    <item>
-     <widget class="QTreeWidget" name="treeWidget">
-      <property name="alternatingRowColors">
-       <bool>true</bool>
-      </property>
-      <column>
-       <property name="text">
-        <string>Register</string>
-       </property>
-      </column>
-      <column>
-       <property name="text">
-        <string>Value</string>
-       </property>
-      </column>
-     </widget>
-    </item>
-   </layout>
-  </widget>
- </widget>
- <resources/>
- <connections/>
-</ui>
diff --git a/src/yuzu/debugger/wait_tree.cpp b/src/yuzu/debugger/wait_tree.cpp
index 017bef13c..7101b381e 100644
--- a/src/yuzu/debugger/wait_tree.cpp
+++ b/src/yuzu/debugger/wait_tree.cpp
@@ -213,6 +213,9 @@ QString WaitTreeThread::GetText() const {
    case THREADSTATUS_WAIT_MUTEX:
        status = tr("waiting for mutex");
        break;
+    case THREADSTATUS_WAIT_ARB:
+        status = tr("waiting for address arbiter");
+        break;
    case THREADSTATUS_DORMANT:
        status = tr("dormant");
        break;
@@ -240,6 +243,7 @@ QColor WaitTreeThread::GetColor() const {
    case THREADSTATUS_WAIT_SYNCH_ALL:
    case THREADSTATUS_WAIT_SYNCH_ANY:
    case THREADSTATUS_WAIT_MUTEX:
+    case THREADSTATUS_WAIT_ARB:
        return QColor(Qt::GlobalColor::red);
    case THREADSTATUS_DORMANT:
        return QColor(Qt::GlobalColor::darkCyan);
diff --git a/src/yuzu/game_list.cpp b/src/yuzu/game_list.cpp
index bbd681eae..55dce6d47 100644
--- a/src/yuzu/game_list.cpp
+++ b/src/yuzu/game_list.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 #include <QApplication>
+#include <QDir>
 #include <QFileInfo>
 #include <QHeaderView>
 #include <QKeyEvent>
@@ -264,8 +265,17 @@ void GameList::ValidateEntry(const QModelIndex& item) {
    if (file_path.isEmpty())
        return;
    std::string std_file_path(file_path.toStdString());
-    if (!FileUtil::Exists(std_file_path) || FileUtil::IsDirectory(std_file_path))
+    if (!FileUtil::Exists(std_file_path))
        return;
+    if (FileUtil::IsDirectory(std_file_path)) {
+        QDir dir(std_file_path.c_str());
+        QStringList matching_main = dir.entryList(QStringList("main"), QDir::Files);
+        if (matching_main.size() == 1) {
+            emit GameChosen(dir.path() + DIR_SEP + matching_main[0]);
+        }
+        return;
+    }
    // Users usually want to run a diffrent game after closing one
    search_field->clear();
    emit GameChosen(file_path);
@@ -356,13 +366,26 @@ void GameList::LoadInterfaceLayout() {
    item_model->sort(header->sortIndicatorSection(), header->sortIndicatorOrder());
 }
-const QStringList GameList::supported_file_extensions = {"nso", "nro"};
+const QStringList GameList::supported_file_extensions = {"nso", "nro", "nca"};
 static bool HasSupportedFileExtension(const std::string& file_name) {
    QFileInfo file = QFileInfo(file_name.c_str());
    return GameList::supported_file_extensions.contains(file.suffix(), Qt::CaseInsensitive);
 }
+// Returns true when the last path component of file_name is exactly "main".
+static bool IsExtractedNCAMain(const std::string& file_name) {
+    const QFileInfo info(file_name.c_str());
+    const QString last_component = info.fileName();
+    return last_component == "main";
+}
+// Chooses the path shown in the game list for an entry: a file named "main" is represented by
+// the directory that contains it, every other file by its own path.
+static QString FormatGameName(const std::string& physical_name) {
+    if (!IsExtractedNCAMain(physical_name)) {
+        return QString::fromStdString(physical_name);
+    }
+    const QFileInfo main_file(physical_name.c_str());
+    return main_file.dir().path();
+}
 void GameList::RefreshGameDirectory() {
    if (!UISettings::values.gamedir.isEmpty() && current_worker != nullptr) {
        NGLOG_INFO(Frontend, "Change detected in the games directory. Reloading game list.");
@@ -380,7 +403,8 @@ void GameListWorker::AddFstEntriesToGameList(const std::string& dir_path, unsign
            return false; // Breaks the callback loop.
        bool is_dir = FileUtil::IsDirectory(physical_name);
-        if (!is_dir && HasSupportedFileExtension(physical_name)) {
+        if (!is_dir &&
+            (HasSupportedFileExtension(physical_name) || IsExtractedNCAMain(physical_name))) {
            std::unique_ptr<Loader::AppLoader> loader = Loader::GetLoader(physical_name);
            if (!loader)
                return true;
@@ -392,7 +416,7 @@ void GameListWorker::AddFstEntriesToGameList(const std::string& dir_path, unsign
            loader->ReadProgramId(program_id);
            emit EntryReady({
-                new GameListItemPath(QString::fromStdString(physical_name), smdh, program_id),
+                new GameListItemPath(FormatGameName(physical_name), smdh, program_id),
                new GameListItem(
                    QString::fromStdString(Loader::GetFileTypeString(loader->GetFileType()))),
                new GameListItemSize(FileUtil::GetSize(physical_name)),
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index 3038bd6da..97be548d7 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -13,6 +13,7 @@
 #include <QMessageBox>
 #include <QtGui>
 #include <QtWidgets>
+#include "common/common_paths.h"
 #include "common/logging/backend.h"
 #include "common/logging/filter.h"
 #include "common/logging/log.h"
@@ -33,7 +34,6 @@
 #include "yuzu/debugger/graphics/graphics_breakpoints.h"
 #include "yuzu/debugger/graphics/graphics_surface.h"
 #include "yuzu/debugger/profiler.h"
-#include "yuzu/debugger/registers.h"
 #include "yuzu/debugger/wait_tree.h"
 #include "yuzu/game_list.h"
 #include "yuzu/hotkeys.h"
@@ -169,15 +169,6 @@ void GMainWindow::InitializeDebugWidgets() {
    debug_menu->addAction(microProfileDialog->toggleViewAction());
 #endif
-    registersWidget = new RegistersWidget(this);
-    addDockWidget(Qt::RightDockWidgetArea, registersWidget);
-    registersWidget->hide();
-    debug_menu->addAction(registersWidget->toggleViewAction());
-    connect(this, &GMainWindow::EmulationStarting, registersWidget,
-            &RegistersWidget::OnEmulationStarting);
-    connect(this, &GMainWindow::EmulationStopping, registersWidget,
-            &RegistersWidget::OnEmulationStopping);
    graphicsBreakpointsWidget = new GraphicsBreakPointsWidget(debug_context, this);
    addDockWidget(Qt::RightDockWidgetArea, graphicsBreakpointsWidget);
    graphicsBreakpointsWidget->hide();
@@ -288,6 +279,7 @@ void GMainWindow::ConnectWidgetEvents() {
 void GMainWindow::ConnectMenuEvents() {
    // File
    connect(ui.action_Load_File, &QAction::triggered, this, &GMainWindow::OnMenuLoadFile);
+    connect(ui.action_Load_Folder, &QAction::triggered, this, &GMainWindow::OnMenuLoadFolder);
    connect(ui.action_Select_Game_List_Root, &QAction::triggered, this,
            &GMainWindow::OnMenuSelectGameListRoot);
    connect(ui.action_Exit, &QAction::triggered, this, &QMainWindow::close);
@@ -460,17 +452,12 @@ void GMainWindow::BootGame(const QString& filename) {
    connect(render_window, &GRenderWindow::Closed, this, &GMainWindow::OnStopGame);
    // BlockingQueuedConnection is important here, it makes sure we've finished refreshing our views
    // before the CPU continues
-    connect(emu_thread.get(), &EmuThread::DebugModeEntered, registersWidget,
-            &RegistersWidget::OnDebugModeEntered, Qt::BlockingQueuedConnection);
    connect(emu_thread.get(), &EmuThread::DebugModeEntered, waitTreeWidget,
            &WaitTreeWidget::OnDebugModeEntered, Qt::BlockingQueuedConnection);
-    connect(emu_thread.get(), &EmuThread::DebugModeLeft, registersWidget,
-            &RegistersWidget::OnDebugModeLeft, Qt::BlockingQueuedConnection);
    connect(emu_thread.get(), &EmuThread::DebugModeLeft, waitTreeWidget,
            &WaitTreeWidget::OnDebugModeLeft, Qt::BlockingQueuedConnection);
    // Update the GUI
-    registersWidget->OnDebugModeEntered();
    if (ui.action_Single_Window_Mode->isChecked()) {
        game_list->hide();
    }
@@ -565,6 +552,8 @@ void GMainWindow::OnMenuLoadFile() {
    for (const auto& piece : game_list->supported_file_extensions)
        extensions += "*." + piece + " ";
+    extensions += "main ";
    QString file_filter = tr("Switch Executable") + " (" + extensions + ")";
    file_filter += ";;" + tr("All Files (*.*)");
@@ -577,6 +566,18 @@ void GMainWindow::OnMenuLoadFile() {
    }
 }
+/// Handles the "File -> Load Folder..." menu action: asks the user for a directory and boots
+/// the file named 'main' found directly inside it, or warns if there is no such file.
+void GMainWindow::OnMenuLoadFolder() {
+    const QString dir_path =
+        QFileDialog::getExistingDirectory(this, tr("Open Extracted ROM Directory"));
+    // An empty result means the dialog was cancelled. QDir treats an empty path as the current
+    // working directory, so return here instead of probing (and possibly booting from) it.
+    if (dir_path.isEmpty()) {
+        return;
+    }
+    const QDir dir(dir_path);
+    // Only regular files named exactly 'main' are considered.
+    const QStringList matching_main = dir.entryList(QStringList("main"), QDir::Files);
+    if (matching_main.size() == 1) {
+        BootGame(dir.path() + DIR_SEP + matching_main[0]);
+    } else {
+        QMessageBox::warning(this, tr("Invalid Directory Selected"),
+                             tr("The directory you have selected does not contain a 'main' file."));
+    }
+}
 void GMainWindow::OnMenuSelectGameListRoot() {
    QString dir_path = QFileDialog::getExistingDirectory(this, tr("Select Directory"));
    if (!dir_path.isEmpty()) {
diff --git a/src/yuzu/main.h b/src/yuzu/main.h
index ac3024d8a..074bba3f9 100644
--- a/src/yuzu/main.h
+++ b/src/yuzu/main.h
@@ -19,7 +19,6 @@ class GraphicsSurfaceWidget;
 class GRenderWindow;
 class MicroProfileDialog;
 class ProfilerWidget;
-class RegistersWidget;
 class WaitTreeWidget;
 namespace Tegra {
@@ -124,6 +123,7 @@ private slots:
    void OnGameListLoadFile(QString game_path);
    void OnGameListOpenSaveFolder(u64 program_id);
    void OnMenuLoadFile();
+    void OnMenuLoadFolder();
    /// Called whenever a user selects the "File->Select Game List Root" menu item
    void OnMenuSelectGameListRoot();
    void OnMenuRecentFile();
@@ -163,7 +163,6 @@ private:
    // Debugger panes
    ProfilerWidget* profilerWidget;
    MicroProfileDialog* microProfileDialog;
-    RegistersWidget* registersWidget;
    GraphicsBreakPointsWidget* graphicsBreakpointsWidget;
    GraphicsSurfaceWidget* graphicsSurfaceWidget;
    WaitTreeWidget* waitTreeWidget;
diff --git a/src/yuzu/main.ui b/src/yuzu/main.ui
index 0fcd93cc2..22c4cad08 100644
--- a/src/yuzu/main.ui
+++ b/src/yuzu/main.ui
@@ -58,6 +58,7 @@
     </property>
    </widget>
    <addaction name="action_Load_File"/>
+    <addaction name="action_Load_Folder"/>
    <addaction name="separator"/>
    <addaction name="action_Select_Game_List_Root"/>
    <addaction name="menu_recent_files"/>
@@ -106,6 +107,11 @@
    <string>Load File...</string>
   </property>
  </action>
+  <action name="action_Load_Folder">
+   <property name="text">
+    <string>Load Folder...</string>
+   </property>
+  </action>
  <action name="action_Load_Symbol_Map">
   <property name="text">
    <string>Load Symbol Map...</string>