44 files changed, 2151 insertions, 354 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cb09f3cd1..2bb411492 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -4,6 +4,7 @@ include_directories(.)
 add_subdirectory(common)
 add_subdirectory(core)
 add_subdirectory(video_core)
+add_subdirectory(audio_core)
 if (ENABLE_GLFW)
    add_subdirectory(citra)
 endif()
diff --git a/src/audio_core/CMakeLists.txt b/src/audio_core/CMakeLists.txt
new file mode 100644
index 000000000..b0d1c7eb6
--- /dev/null
+++ b/src/audio_core/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(SRCS
+            audio_core.cpp
+            hle/dsp.cpp
+            hle/pipe.cpp
+            )
+set(HEADERS
+            audio_core.h
+            hle/dsp.h
+            hle/pipe.h
+            sink.h
+            )
+create_directory_groups(${SRCS} ${HEADERS})
+add_library(audio_core STATIC ${SRCS} ${HEADERS})
+\ No newline at end of file
diff --git a/src/audio_core/audio_core.cpp b/src/audio_core/audio_core.cpp
new file mode 100644
index 000000000..894f46990
--- /dev/null
+++ b/src/audio_core/audio_core.cpp
@@ -0,0 +1,53 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#include "audio_core/audio_core.h"
+#include "audio_core/hle/dsp.h"
+#include "core/core_timing.h"
+#include "core/hle/kernel/vm_manager.h"
+#include "core/hle/service/dsp_dsp.h"
+namespace AudioCore {
+// Audio ticks occur about every 5 milliseconds.
+static int tick_event;                               ///< CoreTiming event id, assigned by RegisterEvent in Init()
+static constexpr u64 audio_frame_ticks = 1310252ull; ///< Interval between audio ticks. Units: ARM11 cycles
+/// CoreTiming callback for the audio tick: runs one HLE DSP tick, signals the DSP interrupts if the
+/// tick requests it, then schedules the next tick.
+static void AudioTickCallback(u64 /*userdata*/, int cycles_late) {
+    const bool interrupt_requested = DSP::HLE::Tick();
+    if (interrupt_requested) {
+        // HACK: We're not signaling the interrupts when they should be, but just firing them all off together.
+        // It should be only (interrupt_id = 2, channel_id = 2) that's signalled here.
+        // TODO(merry): Understand when the other interrupts are fired.
+        DSP_DSP::SignalAllInterrupts();
+    }
+    // Recurrent event: the next tick is shortened by however late this one was delivered.
+    const auto cycles_until_next_tick = audio_frame_ticks - cycles_late;
+    CoreTiming::ScheduleEvent(cycles_until_next_tick, tick_event);
+}
+/// Initialise Audio: initialises the HLE DSP, registers AudioTickCallback with CoreTiming and
+/// schedules the first audio tick.
+void Init() {
+    DSP::HLE::Init();
+    tick_event = CoreTiming::RegisterEvent("AudioCore::tick_event", AudioTickCallback);
+    // Only the first tick is scheduled here; AudioTickCallback reschedules itself afterwards.
+    CoreTiming::ScheduleEvent(audio_frame_ticks, tick_event);
+}
+/// Maps both DSP shared memory regions into the given address space as IO memory and marks
+/// them readable and writable.
+void AddAddressSpace(Kernel::VMManager& address_space) {
+    const auto region_size = sizeof(DSP::HLE::SharedMemory);
+
+    // First region
+    u8* const region0_backing = reinterpret_cast<u8*>(&DSP::HLE::g_region0);
+    auto region0_vma = address_space.MapBackingMemory(DSP::HLE::region0_base, region0_backing,
+                                                      region_size, Kernel::MemoryState::IO).MoveFrom();
+    address_space.Reprotect(region0_vma, Kernel::VMAPermission::ReadWrite);
+
+    // Second region
+    u8* const region1_backing = reinterpret_cast<u8*>(&DSP::HLE::g_region1);
+    auto region1_vma = address_space.MapBackingMemory(DSP::HLE::region1_base, region1_backing,
+                                                      region_size, Kernel::MemoryState::IO).MoveFrom();
+    address_space.Reprotect(region1_vma, Kernel::VMAPermission::ReadWrite);
+}
+/// Shutdown Audio: cancels the pending audio tick event, then shuts down the HLE DSP.
+void Shutdown() {
+    CoreTiming::UnscheduleEvent(tick_event, 0);
+    DSP::HLE::Shutdown();
+}
+} //namespace
diff --git a/src/audio_core/audio_core.h b/src/audio_core/audio_core.h
new file mode 100644
index 000000000..64c330914
--- /dev/null
+++ b/src/audio_core/audio_core.h
@@ -0,0 +1,26 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#pragma once
+namespace Kernel {
+class VMManager;
+}
+namespace AudioCore {
+constexpr int num_sources = 24;            ///< Number of audio sources; sizes the per-source arrays in the DSP structures
+constexpr int samples_per_frame = 160;     ///< Samples per audio frame at native sample rate
+constexpr int native_sample_rate = 32728;  ///< Approximately 32kHz (the exact value is 32728)
+/// Initialise Audio Core
+void Init();
+/// Add DSP address spaces to a Process.
+void AddAddressSpace(Kernel::VMManager& vm_manager);
+/// Shutdown Audio Core
+void Shutdown();
+} // namespace
diff --git a/src/audio_core/hle/dsp.cpp b/src/audio_core/hle/dsp.cpp
new file mode 100644
index 000000000..c89356edc
--- /dev/null
+++ b/src/audio_core/hle/dsp.cpp
@@ -0,0 +1,42 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#include "audio_core/hle/dsp.h"
+#include "audio_core/hle/pipe.h"
+namespace DSP {
+namespace HLE {
+SharedMemory g_region0; ///< Backing storage for the first shared memory region (see region0_base)
+SharedMemory g_region1; ///< Backing storage for the second shared memory region (see region1_base)
+/// Initialises the HLE DSP. Currently this only resets the pipe read positions.
+void Init() {
+    DSP::HLE::ResetPipes();
+}
+/// Shuts down the HLE DSP. Currently there is nothing to release, so this does nothing.
+void Shutdown() {
+}
+/// Placeholder audio tick: performs no processing yet and always reports true.
+bool Tick() {
+    return true;
+}
+/// Picks the shared memory region to treat as current by comparing the two frame counters.
+SharedMemory& CurrentRegion() {
+    const u16 counter0 = g_region0.frame_counter;
+    const u16 counter1 = g_region1.frame_counter;
+
+    // The region with the higher frame counter is chosen unless there is wraparound.
+    // A counter at 0xFFFF only counts as the higher one when the other counter is 0xFFFE.
+    const bool region0_at_max = (counter0 == 0xFFFFu);
+    const bool region1_at_max = (counter1 == 0xFFFFu);
+
+    if (region0_at_max && counter1 != 0xFFFEu) {
+        // Wraparound has occurred.
+        return g_region1;
+    }
+    if (region1_at_max && counter0 != 0xFFFEu) {
+        // Wraparound has occurred.
+        return g_region0;
+    }
+
+    if (counter0 > counter1) {
+        return g_region0;
+    }
+    return g_region1;
+}
+} // namespace HLE
+} // namespace DSP
diff --git a/src/audio_core/hle/dsp.h b/src/audio_core/hle/dsp.h
new file mode 100644
index 000000000..14c4000c6
--- /dev/null
+++ b/src/audio_core/hle/dsp.h
@@ -0,0 +1,502 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#pragma once
+#include <cstddef>
+#include <type_traits>
+#include "audio_core/audio_core.h"
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "common/swap.h"
+namespace DSP {
+namespace HLE {
+// The application-accessible region of DSP memory consists of two parts.
+// Both are marked as IO and have Read/Write permissions.
+//
+// First Region:  0x1FF50000 (Size: 0x8000)
+// Second Region: 0x1FF70000 (Size: 0x8000)
+//
+// The DSP reads from each region alternately based on the frame counter for each region much like a
+// double-buffer. The frame counter is located as the very last u16 of each region and is incremented
+// each audio tick.
+struct SharedMemory;
+constexpr VAddr region0_base = 0x1FF50000;
+extern SharedMemory g_region0;
+constexpr VAddr region1_base = 0x1FF70000;
+extern SharedMemory g_region1;
+/**
+ * The DSP is native 16-bit. The DSP also appears to be big-endian. When reading 32-bit numbers from
+ * its memory regions, the higher and lower 16-bit halves are swapped compared to the little-endian
+ * layout of the ARM11. Hence from the ARM11's point of view the memory space appears to be
+ * middle-endian.
+ *
+ * Unusually this does not appear to be an issue for floating point numbers. The DSP makes the more
+ * sensible choice of keeping that little-endian. There are also some exceptions such as the
+ * IntermediateMixSamples structure, which is little-endian.
+ *
+ * This struct implements the conversion to and from this middle-endianness.
+ */
+struct u32_dsp {
+    u32_dsp() = default;
+    /// Reads the value, swapping the stored 16-bit halves back into normal order.
+    operator u32() const {
+        return Convert(storage);
+    }
+    /// Stores new_value with its 16-bit halves swapped.
+    /// Returns *this, following the usual assignment operator convention, so assignments can be chained.
+    u32_dsp& operator=(u32 new_value) {
+        storage = Convert(new_value);
+        return *this;
+    }
+private:
+    /// Swaps the upper and lower 16-bit halves. Applying it twice yields the original value,
+    /// so the same function serves both reads and writes.
+    static constexpr u32 Convert(u32 value) {
+        return (value << 16) | (value >> 16);
+    }
+    u32_le storage; ///< Value as laid out in DSP memory (halves swapped)
+};
+// The trait check is skipped on GCC versions older than 5, which lack std::is_trivially_copyable.
+#if (__GNUC__ >= 5) || defined(__clang__) || defined(_MSC_VER)
+static_assert(std::is_trivially_copyable<u32_dsp>::value, "u32_dsp isn't trivially copyable");
+#endif
+// There are 15 structures in each memory region. A table of them in the order they appear in memory
+// is presented below
+//
+//       Pipe 2 #    First Region DSP Address   Purpose                               Control
+//       5           0x8400                     DSP Status                            DSP
+//       9           0x8410                     DSP Debug Info                        DSP
+//       6           0x8540                     Final Mix Samples                     DSP
+//       2           0x8680                     Source Status [24]                    DSP
+//       8           0x8710                     Compressor Table                      Application
+//       4           0x9430                     DSP Configuration                     Application
+//       7           0x9492                     Intermediate Mix Samples              DSP + App
+//       1           0x9E92                     Source Configuration [24]             Application
+//       3           0xA792                     Source ADPCM Coefficients [24]        Application
+//       10          0xA912                     Surround Sound Related
+//       11          0xAA12                     Surround Sound Related
+//       12          0xAAD2                     Surround Sound Related
+//       13          0xAC52                     Surround Sound Related
+//       14          0xAC5C                     Surround Sound Related
+//       0           0xBFFF                     Frame Counter                         Application
+//
+// Note that the above addresses do vary slightly between audio firmwares observed; the addresses are
+// not fixed in stone. The addresses above are only an exemplar; they're what this implementation
+// does and provides to applications.
+//
+// Application requests the DSP service to convert DSP addresses into ARM11 virtual addresses using the
+// ConvertProcessAddressFromDspDram service call. Applications seem to derive the addresses for the
+// second region via:
+//     second_region_dsp_addr = first_region_dsp_addr | 0x10000
+//
+// Applications maintain most of their own audio state; the memory region is used mainly for
+// communication and not storage of state.
+//
+// In the documentation below, filter and effect transfer functions are specified in the z domain.
+// (If you are more familiar with the Laplace transform, z = exp(sT). The z domain is the digital
+//  frequency domain, just like how the s domain is the analog frequency domain.)
+/// Inserts padding measured in 16-bit DSP words (two bytes per word).
+#define INSERT_PADDING_DSPWORDS(num_words) INSERT_PADDING_BYTES(2 * (num_words))
+// ASSERT_DSP_STRUCT(name, size) statically checks that `name` uses standard layout and is exactly
+// `size` bytes; where the trait is usable it also checks that `name` is trivially copyable.
+// GCC versions < 5.0 do not implement std::is_trivially_copyable.
+// Excluding MSVC because it has weird behaviour for std::is_trivially_copyable.
+#if (__GNUC__ >= 5) || defined(__clang__)
+    #define ASSERT_DSP_STRUCT(name, size) \
+        static_assert(std::is_standard_layout<name>::value, "DSP structure " #name " doesn't use standard layout"); \
+        static_assert(std::is_trivially_copyable<name>::value, "DSP structure " #name " isn't trivially copyable"); \
+        static_assert(sizeof(name) == (size), "Unexpected struct size for DSP structure " #name)
+#else
+    #define ASSERT_DSP_STRUCT(name, size) \
+        static_assert(std::is_standard_layout<name>::value, "DSP structure " #name " doesn't use standard layout"); \
+        static_assert(sizeof(name) == (size), "Unexpected struct size for DSP structure " #name)
+#endif
+/// Per-source configuration, one entry for each of the AudioCore::num_sources sources.
+/// Field order and sizes are fixed by the shared memory layout; the asserts after the struct pin them.
+struct SourceConfiguration {
+    struct Configuration {
+        /// These dirty flags are set by the application when it updates the fields in this struct.
+        /// The DSP clears these each audio frame.
+        union {
+            u32_le dirty_raw;
+            BitField<2, 1, u32_le> adpcm_coefficients_dirty;
+            BitField<3, 1, u32_le> partial_embedded_buffer_dirty; ///< Tends to be set when a looped buffer is queued.
+            BitField<16, 1, u32_le> enable_dirty;
+            BitField<17, 1, u32_le> interpolation_dirty;
+            BitField<18, 1, u32_le> rate_multiplier_dirty;
+            BitField<19, 1, u32_le> buffer_queue_dirty;
+            BitField<20, 1, u32_le> loop_related_dirty;
+            BitField<21, 1, u32_le> play_position_dirty; ///< Tends to also be set when embedded buffer is updated.
+            BitField<22, 1, u32_le> filters_enabled_dirty;
+            BitField<23, 1, u32_le> simple_filter_dirty;
+            BitField<24, 1, u32_le> biquad_filter_dirty;
+            BitField<25, 1, u32_le> gain_0_dirty;
+            BitField<26, 1, u32_le> gain_1_dirty;
+            BitField<27, 1, u32_le> gain_2_dirty;
+            BitField<28, 1, u32_le> sync_dirty;
+            BitField<29, 1, u32_le> reset_flag;
+            BitField<31, 1, u32_le> embedded_buffer_dirty;
+        };
+        // Gain control
+        /**
+         * Gain is between 0.0-1.0. This determines how much of this source appears on
+         * each of the 12 channels that feed into the intermediate mixers.
+         * Each of the three intermediate mixers is fed two left and two right channels.
+         */
+        float_le gain[3][4];
+        // Interpolation
+        /// Multiplier for sample rate. Resampling occurs with the selected interpolation method.
+        float_le rate_multiplier;
+        enum class InterpolationMode : u8 {
+            None = 0,
+            Linear = 1,
+            Polyphase = 2
+        };
+        InterpolationMode interpolation_mode;
+        INSERT_PADDING_BYTES(1); ///< Interpolation related
+        // Filters
+        /**
+         * This is the simplest normalized first-order digital recursive filter.
+         * The transfer function of this filter is:
+         *     H(z) = b0 / (1 + a1 z^-1)
+         * Values are signed fixed point with 15 fractional bits.
+         */
+        struct SimpleFilter {
+            s16_le b0;
+            s16_le a1;
+        };
+        /**
+         * This is a normalised biquad filter (second-order).
+         * The transfer function of this filter is:
+         *     H(z) = (b0 + b1 z^-1 + b2 z^-2) / (1 - a1 z^-1 - a2 z^-2)
+         * Nintendo chose to negate the feedbackward coefficients. This differs from standard notation
+         * as in: https://ccrma.stanford.edu/~jos/filters/Direct_Form_I.html
+         * Values are signed fixed point with 14 fractional bits.
+         */
+        struct BiquadFilter {
+            s16_le b0;
+            s16_le b1;
+            s16_le b2;
+            s16_le a1;
+            s16_le a2;
+        };
+        union {
+            u16_le filters_enabled;
+            BitField<0, 1, u16_le> simple_filter_enabled;
+            BitField<1, 1, u16_le> biquad_filter_enabled;
+        };
+        SimpleFilter simple_filter;
+        BiquadFilter biquad_filter;
+        // Buffer Queue
+        /// A buffer of audio data from the application, along with metadata about it.
+        struct Buffer {
+            /// Physical memory address of the start of the buffer
+            u32_dsp physical_address;
+            /// This is length in terms of samples.
+            /// Note that in different buffer formats a sample takes up different number of bytes.
+            u32_dsp length;
+            /// ADPCM Predictor (4 bits) and Scale (4 bits)
+            union {
+                u16_le adpcm_ps;
+                BitField<0, 4, u16_le> adpcm_scale;
+                BitField<4, 4, u16_le> adpcm_predictor;
+            };
+            /// ADPCM Historical Samples (y[n-1] and y[n-2])
+            u16_le adpcm_yn[2];
+            /// This is non-zero when the ADPCM values above are to be updated.
+            u8 adpcm_dirty;
+            /// Is a looping buffer.
+            u8 is_looping;
+            /// This value is shown in SourceStatus::previous_buffer_id when this buffer has finished.
+            /// This allows the emulated application to tell what buffer is currently playing
+            u16_le buffer_id;
+            INSERT_PADDING_DSPWORDS(1);
+        };
+        u16_le buffers_dirty;             ///< Bitmap indicating which buffers are dirty (bit i -> buffers[i])
+        Buffer buffers[4];                ///< Queued Buffers
+        // Playback controls
+        u32_dsp loop_related;
+        u8 enable;
+        INSERT_PADDING_BYTES(1);
+        u16_le sync;                      ///< Application-side sync (See also: SourceStatus::sync)
+        u32_dsp play_position;            ///< Position. (Units: number of samples)
+        INSERT_PADDING_DSPWORDS(2);
+        // Embedded Buffer
+        // This buffer is often the first buffer to be used when initiating audio playback,
+        // after which the buffer queue is used.
+        u32_dsp physical_address;
+        /// This is length in terms of samples.
+        /// Note a sample takes up different number of bytes in different buffer formats.
+        u32_dsp length;
+        enum class MonoOrStereo : u16_le {
+            Mono = 1,
+            Stereo = 2
+        };
+        enum class Format : u16_le {
+            PCM8 = 0,
+            PCM16 = 1,
+            ADPCM = 2
+        };
+        union {
+            u16_le flags1_raw;
+            BitField<0, 2, MonoOrStereo> mono_or_stereo;
+            BitField<2, 2, Format> format;
+            BitField<5, 1, u16_le> fade_in;
+        };
+        /// ADPCM Predictor (4 bit) and Scale (4 bit)
+        union {
+            u16_le adpcm_ps;
+            BitField<0, 4, u16_le> adpcm_scale;
+            BitField<4, 4, u16_le> adpcm_predictor;
+        };
+        /// ADPCM Historical Samples (y[n-1] and y[n-2])
+        u16_le adpcm_yn[2];
+        union {
+            u16_le flags2_raw;
+            BitField<0, 1, u16_le> adpcm_dirty; ///< Has the ADPCM info above been changed?
+            BitField<1, 1, u16_le> is_looping; ///< Is this a looping buffer?
+        };
+        /// Buffer id of embedded buffer (used as a buffer id in SourceStatus to reference this buffer).
+        u16_le buffer_id;
+    };
+    Configuration config[AudioCore::num_sources];
+};
+ASSERT_DSP_STRUCT(SourceConfiguration::Configuration, 192);
+ASSERT_DSP_STRUCT(SourceConfiguration::Configuration::Buffer, 20);
+/// Per-source status, one entry for each of the AudioCore::num_sources sources.
+struct SourceStatus {
+    struct Status {
+        u8 is_enabled;               ///< Is this channel enabled? (Doesn't have to be playing anything.)
+        u8 previous_buffer_id_dirty; ///< Non-zero when previous_buffer_id changes
+        u16_le sync;                 ///< Is set by the DSP to the value of SourceConfiguration::sync
+        u32_dsp buffer_position;     ///< Number of samples into the current buffer
+        u16_le previous_buffer_id;   ///< Updated when a buffer finishes playing
+        INSERT_PADDING_DSPWORDS(1);
+    };
+    Status status[AudioCore::num_sources];
+};
+ASSERT_DSP_STRUCT(SourceStatus::Status, 12);
+/// Global (not per-source) DSP configuration: mixer volumes, output format and effect settings.
+/// Field order and sizes are fixed by the shared memory layout; the asserts after the struct pin them.
+struct DspConfiguration {
+    /// These dirty flags are set by the application when it updates the fields in this struct.
+    /// The DSP clears these each audio frame.
+    union {
+        u32_le dirty_raw;
+        BitField<8, 1, u32_le> mixer1_enabled_dirty;
+        BitField<9, 1, u32_le> mixer2_enabled_dirty;
+        BitField<10, 1, u32_le> delay_effect_0_dirty;
+        BitField<11, 1, u32_le> delay_effect_1_dirty;
+        BitField<12, 1, u32_le> reverb_effect_0_dirty;
+        BitField<13, 1, u32_le> reverb_effect_1_dirty;
+        BitField<16, 1, u32_le> volume_0_dirty;
+        BitField<24, 1, u32_le> volume_1_dirty;
+        BitField<25, 1, u32_le> volume_2_dirty;
+        BitField<26, 1, u32_le> output_format_dirty;
+        BitField<27, 1, u32_le> limiter_enabled_dirty;
+        BitField<28, 1, u32_le> headphones_connected_dirty;
+    };
+    /// The DSP has three intermediate audio mixers. This controls the volume level (0.0-1.0) for each at the final mixer
+    float_le volume[3];
+    INSERT_PADDING_DSPWORDS(3);
+    enum class OutputFormat : u16_le {
+        Mono = 0,
+        Stereo = 1,
+        Surround = 2
+    };
+    OutputFormat output_format;
+    u16_le limiter_enabled;      ///< Not sure of the exact gain equation for the limiter.
+    u16_le headphones_connected; ///< Application updates the DSP on headphone status.
+    INSERT_PADDING_DSPWORDS(4);  ///< TODO: Surround sound related
+    INSERT_PADDING_DSPWORDS(2);  ///< TODO: Intermediate mixer 1/2 related
+    u16_le mixer1_enabled;
+    u16_le mixer2_enabled;
+    /**
+     * This is delay with feedback.
+     * Transfer function:
+     *     H(z) = a z^-N / (1 - b z^-1 + a g z^-N)
+     *   where
+     *     N = frame_count * samples_per_frame
+     * g, a and b are fixed point with 7 fractional bits
+     */
+    struct DelayEffect {
+        /// These dirty flags are set by the application when it updates the fields in this struct.
+        /// The DSP clears these each audio frame.
+        union {
+            u16_le dirty_raw;
+            BitField<0, 1, u16_le> enable_dirty;
+            BitField<1, 1, u16_le> work_buffer_address_dirty;
+            BitField<2, 1, u16_le> other_dirty; ///< Set when anything else has been changed
+        };
+        u16_le enable;
+        INSERT_PADDING_DSPWORDS(1);
+        u16_le outputs;
+        u32_dsp work_buffer_address; ///< The application allocates a block of memory for the DSP to use as a work buffer.
+        u16_le frame_count;  ///< Frames to delay by
+        // Coefficients
+        s16_le g; ///< Fixed point with 7 fractional bits
+        s16_le a; ///< Fixed point with 7 fractional bits
+        s16_le b; ///< Fixed point with 7 fractional bits
+    };
+    DelayEffect delay_effect[2];
+    /// Contents not yet known; reserved as 26 DSP words of padding.
+    struct ReverbEffect {
+        INSERT_PADDING_DSPWORDS(26); ///< TODO
+    };
+    ReverbEffect reverb_effect[2];
+    INSERT_PADDING_DSPWORDS(4);
+};
+ASSERT_DSP_STRUCT(DspConfiguration, 196);
+ASSERT_DSP_STRUCT(DspConfiguration::DelayEffect, 20);
+ASSERT_DSP_STRUCT(DspConfiguration::ReverbEffect, 52);
+/// ADPCM coefficient table covering all AudioCore::num_sources sources.
+struct AdpcmCoefficients {
+    /// Coefficients are signed fixed point with 11 fractional bits.
+    /// Each source has 16 coefficients associated with it.
+    s16_le coeff[AudioCore::num_sources][16];
+};
+ASSERT_DSP_STRUCT(AdpcmCoefficients, 768);
+/// DSP status block. Only two fields are named; the remaining 0xE DSP words are padding.
+struct DspStatus {
+    u16_le unknown;
+    u16_le dropped_frames;
+    INSERT_PADDING_DSPWORDS(0xE);
+};
+ASSERT_DSP_STRUCT(DspStatus, 32);
+/// Final mixed output in PCM16 stereo format, what you hear out of the speakers.
+/// When the application writes to this region it has no effect.
+struct FinalMixSamples {
+    s16_le pcm16[2 * AudioCore::samples_per_frame]; ///< Two values (stereo) per sample of one audio frame
+};
+ASSERT_DSP_STRUCT(FinalMixSamples, 640);
+/// DSP writes output of intermediate mixers 1 and 2 here.
+/// Writes to this region by the application edit the output of the intermediate mixers.
+/// This seems to be intended to allow the application to do custom effects on the ARM11.
+/// Values that exceed s16 range will be clipped by the DSP after further processing.
+struct IntermediateMixSamples {
+    struct Samples {
+        s32_le pcm32[4][AudioCore::samples_per_frame]; ///< Little-endian as opposed to DSP middle-endian.
+    };
+    Samples mix1; ///< Output of intermediate mixer 1
+    Samples mix2; ///< Output of intermediate mixer 2
+};
+ASSERT_DSP_STRUCT(IntermediateMixSamples, 5120);
+/// Compressor table
+/// Contents are not yet understood, so the whole table is reserved as padding.
+struct Compressor {
+    INSERT_PADDING_DSPWORDS(0xD20); ///< TODO
+};
+// 0xD20 DSP words of two bytes each. Checked here like the other structures so that a layout
+// mistake is reported against this struct instead of only against SharedMemory as a whole.
+ASSERT_DSP_STRUCT(Compressor, 0x1A40);
+/// DSP debug information. There is no easy way to implement this in a HLE implementation,
+/// so the block is reserved as padding only.
+struct DspDebug {
+    INSERT_PADDING_DSPWORDS(0x130);
+};
+ASSERT_DSP_STRUCT(DspDebug, 0x260);
+/// Layout of one application-accessible DSP memory region (0x8000 bytes, asserted below).
+/// Members must stay in this order: their offsets are part of the layout shared with applications.
+struct SharedMemory {
+    /// Padding
+    INSERT_PADDING_DSPWORDS(0x400);
+    DspStatus dsp_status;
+    DspDebug dsp_debug;
+    FinalMixSamples final_samples;
+    SourceStatus source_statuses;
+    Compressor compressor;
+    DspConfiguration dsp_configuration;
+    IntermediateMixSamples intermediate_mix_samples;
+    SourceConfiguration source_configurations;
+    AdpcmCoefficients adpcm_coefficients;
+    /// Unknown 10-14 (Surround sound related)
+    INSERT_PADDING_DSPWORDS(0x16ED);
+    u16_le frame_counter; ///< Very last u16 of the region; CurrentRegion() compares it between the two regions
+};
+ASSERT_DSP_STRUCT(SharedMemory, 0x8000);
+#undef INSERT_PADDING_DSPWORDS
+#undef ASSERT_DSP_STRUCT
+/// Initialize DSP hardware
+void Init();
+/// Shutdown DSP hardware
+void Shutdown();
+/**
+ * Perform processing and updates state of current shared memory buffer.
+ * This function is called every audio tick before triggering the audio interrupt.
+ * @return Whether an audio interrupt should be triggered this frame.
+ */
+bool Tick();
+/// Returns a mutable reference to the current region. Current region is selected based on the frame counter.
+SharedMemory& CurrentRegion();
+} // namespace HLE
+} // namespace DSP
diff --git a/src/audio_core/hle/pipe.cpp b/src/audio_core/hle/pipe.cpp
new file mode 100644
index 000000000..6542c760c
--- /dev/null
+++ b/src/audio_core/hle/pipe.cpp
@@ -0,0 +1,55 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#include <array>
+#include <vector>
+#include "audio_core/hle/pipe.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+namespace DSP {
+namespace HLE {
+static size_t pipe2position = 0; ///< Read offset into the canned response returned for pipe 2
+/// Rewinds the pipe 2 read position to the start of the canned response.
+void ResetPipes() {
+    pipe2position = 0;
+}
+/// Reads `length` bytes from a DSP pipe. Only pipe 2 (audio) is handled; it serves a canned
+/// response sequentially, advancing an internal read position that ResetPipes() rewinds.
+/// Returns an empty vector for any other pipe, or when the request would read past the end
+/// of the remaining data (in which case the read position is left unchanged).
+std::vector<u8> PipeRead(u32 pipe_number, u32 length) {
+    if (pipe_number != 2) {
+        LOG_WARNING(Audio_DSP, "pipe_number = %u (!= 2), unimplemented", pipe_number);
+        return {}; // We currently don't handle anything other than the audio pipe.
+    }
+    // Canned DSP responses that games expect. These were taken from HW by 3dmoo team.
+    // TODO: Our implementation will actually use a slightly different response than this one.
+    // TODO: Use offsetof on DSP structures instead for a proper response.
+    static const std::array<u8, 32> canned_response {{
+        0x0F, 0x00, 0xFF, 0xBF, 0x8E, 0x9E, 0x80, 0x86, 0x8E, 0xA7, 0x30, 0x94, 0x00, 0x84, 0x40, 0x85,
+        0x8E, 0x94, 0x10, 0x87, 0x10, 0x84, 0x0E, 0xA9, 0x0E, 0xAA, 0xCE, 0xAA, 0x4E, 0xAC, 0x58, 0xAC
+    }};
+    // TODO: Move this into dsp::DSP service since it happens on the service side.
+    // Hardware observation: No data is returned if requested length reads beyond the end of the data in-pipe.
+    // The check compares against the bytes remaining rather than computing pipe2position + length,
+    // because that sum can wrap around where size_t is 32 bits wide and let an oversized read through.
+    // pipe2position never exceeds canned_response.size(), so the subtraction cannot underflow.
+    const size_t remaining = canned_response.size() - pipe2position;
+    if (length > remaining) {
+        return {};
+    }
+    // Copy the requested slice in one go, then advance the read position past it.
+    const auto first = canned_response.begin() + pipe2position;
+    std::vector<u8> ret(first, first + length);
+    pipe2position += length;
+    return ret;
+}
+/// Stub: written data is currently discarded for every pipe; both parameters are unused.
+void PipeWrite(u32 pipe_number, const std::vector<u8>& buffer) {
+    // TODO: proper pipe behaviour
+}
+} // namespace HLE
+} // namespace DSP
diff --git a/src/audio_core/hle/pipe.h b/src/audio_core/hle/pipe.h
new file mode 100644
index 000000000..ff6536950
--- /dev/null
+++ b/src/audio_core/hle/pipe.h
@@ -0,0 +1,38 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#pragma once
+#include <vector>
+#include "common/common_types.h"
+namespace DSP {
+namespace HLE {
+/// Reset the pipes by setting pipe positions back to the beginning.
+void ResetPipes();
+/**
+ * Read a DSP pipe.
+ * Pipe IDs:
+ *   pipe_number = 0: Debug
+ *   pipe_number = 1: P-DMA
+ *   pipe_number = 2: Audio
+ *   pipe_number = 3: Binary
+ * @param pipe_number The Pipe ID
+ * @param length How much data to request.
+ * @return The data read from the pipe. The size of this vector can be less than the length requested.
+ */
+std::vector<u8> PipeRead(u32 pipe_number, u32 length);
+/**
+ * Write to a DSP pipe.
+ * @param pipe_number The Pipe ID
+ * @param buffer The data to write to the pipe.
+ */
+void PipeWrite(u32 pipe_number, const std::vector<u8>& buffer);
+} // namespace HLE
+} // namespace DSP
diff --git a/src/audio_core/sink.h b/src/audio_core/sink.h
new file mode 100644
index 000000000..cad21a85e
--- /dev/null
+++ b/src/audio_core/sink.h
@@ -0,0 +1,34 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#pragma once
+#include <vector>
+#include "common/common_types.h"
+namespace AudioCore {
+/**
+ * This class is an interface for an audio sink. An audio sink accepts samples in stereo signed PCM16 format to be output.
+ * Sinks *do not* handle resampling and expect the correct sample rate. They are dumb outputs.
+ */
+class Sink {
+public:
+    /// Virtual so that implementations are destroyed correctly through a Sink pointer.
+    virtual ~Sink() = default;
+    /// The native rate of this sink. The sink expects to be fed samples that respect this. (Units: samples/sec)
+    virtual unsigned GetNativeSampleRate() const = 0;
+    /**
+     * Feed stereo samples to sink.
+     * @param samples Samples in interleaved stereo PCM16 format. Size of vector must be multiple of two.
+     */
+    virtual void EnqueueSamples(const std::vector<s16>& samples) = 0;
+    /// Samples enqueued that have not been played yet.
+    /// NOTE(review): unclear whether this counts individual s16 values or stereo pairs - confirm with implementers.
+    virtual std::size_t SamplesInQueue() const = 0;
+};
+} // namespace
diff --git a/src/citra/CMakeLists.txt b/src/citra/CMakeLists.txt
index e7f8a17f9..b9abb818e 100644
--- a/src/citra/CMakeLists.txt
+++ b/src/citra/CMakeLists.txt
@@ -17,7 +17,7 @@ include_directories(${GLFW_INCLUDE_DIRS})
 link_directories(${GLFW_LIBRARY_DIRS})
 add_executable(citra ${SRCS} ${HEADERS})
-target_link_libraries(citra core video_core common)
+target_link_libraries(citra core video_core audio_core common)
 target_link_libraries(citra ${GLFW_LIBRARIES} ${OPENGL_gl_LIBRARY} inih glad)
 if (MSVC)
    target_link_libraries(citra getopt)
diff --git a/src/citra_qt/CMakeLists.txt b/src/citra_qt/CMakeLists.txt
index bbf6ae001..b3d1205a4 100644
--- a/src/citra_qt/CMakeLists.txt
+++ b/src/citra_qt/CMakeLists.txt
@@ -79,7 +79,7 @@ if (APPLE)
 else()
    add_executable(citra-qt ${SRCS} ${HEADERS} ${UI_HDRS})
 endif()
-target_link_libraries(citra-qt core video_core common qhexedit)
+target_link_libraries(citra-qt core video_core audio_core common qhexedit)
 target_link_libraries(citra-qt ${OPENGL_gl_LIBRARY} ${CITRA_QT_LIBS})
 target_link_libraries(citra-qt ${PLATFORM_LIBRARIES})
diff --git a/src/common/bit_field.h b/src/common/bit_field.h
index 66689f398..371eb17a1 100644
--- a/src/common/bit_field.h
+++ b/src/common/bit_field.h
@@ -115,29 +115,24 @@ template<std::size_t position, std::size_t bits, typename T>
 struct BitField
 {
 private:
-    // This constructor might be considered ambiguous:
+    // We hide the copy assignment operator here, because the default copy
-    // Would it initialize the storage or just the bitfield?
+    // assignment would copy the full storage value, rather than just the bits
-    // Hence, delete it. Use the assignment operator to set bitfield values!
+    // relevant to this particular bit field.
-    BitField(T val) = delete;
+    // We don't delete it because we want BitField to be trivially copyable.
+    BitField& operator=(const BitField&) = default;
 public:
+    // This constructor and assignment operator might be considered ambiguous:
+    // Would they initialize the storage or just the bitfield?
+    // Hence, delete them. Use the Assign method to set bitfield values!
+    BitField(T val) = delete;
+    BitField& operator=(T val) = delete;
    // Force default constructor to be created
    // so that we can use this within unions
    BitField() = default;
-    // We explicitly delete the copy assigment operator here, because the
+    FORCE_INLINE operator T() const {
-    // default copy assignment would copy the full storage value, rather than
-    // just the bits relevant to this particular bit field.
-    BitField& operator=(const BitField&) = delete;
-    FORCE_INLINE BitField& operator=(T val)
-    {
-        Assign(val);
-        return *this;
-    }
-    FORCE_INLINE operator T() const
-    {
        return Value();
    }
@@ -145,8 +140,7 @@ public:
        storage = (storage & ~GetMask()) | (((StorageType)value << position) & GetMask());
    }
-    FORCE_INLINE T Value() const
+    FORCE_INLINE T Value() const {
-    {
        if (std::numeric_limits<T>::is_signed)
        {
            std::size_t shift = 8 * sizeof(T)-bits;
@@ -159,8 +153,7 @@ public:
    }
    // TODO: we may want to change this to explicit operator bool() if it's bug-free in VS2015
-    FORCE_INLINE bool ToBool() const
+    FORCE_INLINE bool ToBool() const {
-    {
        return Value() != 0;
    }
@@ -176,8 +169,7 @@ private:
    // Unsigned version of StorageType
    typedef typename std::make_unsigned<StorageType>::type StorageTypeU;
-    FORCE_INLINE StorageType GetMask() const
+    FORCE_INLINE StorageType GetMask() const {
-    {
        return (((StorageTypeU)~0) >> (8 * sizeof(T)-bits)) << position;
    }
@@ -189,6 +181,10 @@ private:
    static_assert(position < 8 * sizeof(T), "Invalid position");
    static_assert(bits <= 8 * sizeof(T), "Invalid number of bits");
    static_assert(bits > 0, "Invalid number of bits");
-    static_assert(std::is_standard_layout<T>::value, "Invalid base type");
+    static_assert(std::is_pod<T>::value, "Invalid base type");
 };
 #pragma pack()
+#if (__GNUC__ >= 5) || defined(__clang__) || defined(_MSC_VER)
+static_assert(std::is_trivially_copyable<BitField<0, 1, u32>>::value, "BitField must be trivially copyable");
+#endif
diff --git a/src/common/emu_window.cpp b/src/common/emu_window.cpp
index b69b05cb9..b2807354a 100644
--- a/src/common/emu_window.cpp
+++ b/src/common/emu_window.cpp
@@ -55,14 +55,14 @@ void EmuWindow::TouchPressed(unsigned framebuffer_x, unsigned framebuffer_y) {
        (framebuffer_layout.bottom_screen.bottom - framebuffer_layout.bottom_screen.top);
    touch_pressed = true;
-    pad_state.touch = 1;
+    pad_state.touch.Assign(1);
 }
 void EmuWindow::TouchReleased() {
    touch_pressed = false;
    touch_x = 0;
    touch_y = 0;
-    pad_state.touch = 0;
+    pad_state.touch.Assign(0);
 }
 void EmuWindow::TouchMoved(unsigned framebuffer_x, unsigned framebuffer_y) {
diff --git a/src/common/logging/backend.cpp b/src/common/logging/backend.cpp
index d186ba8f8..58819012d 100644
--- a/src/common/logging/backend.cpp
+++ b/src/common/logging/backend.cpp
@@ -58,6 +58,8 @@ namespace Log {
        CLS(Render) \
        SUB(Render, Software) \
        SUB(Render, OpenGL) \
+        CLS(Audio) \
+        SUB(Audio, DSP) \
        CLS(Loader)
 // GetClassName is a macro defined by Windows.h, grrr...
diff --git a/src/common/logging/backend.h b/src/common/logging/backend.h
index c1f4d08e4..795d42ebd 100644
--- a/src/common/logging/backend.h
+++ b/src/common/logging/backend.h
@@ -27,25 +27,9 @@ struct Entry {
    std::string message;
    Entry() = default;
+    Entry(Entry&& o) = default;
-    // TODO(yuriks) Use defaulted move constructors once MSVC supports them
+    Entry& operator=(Entry&& o) = default;
-#define MOVE(member) member(std::move(o.member))
-    Entry(Entry&& o)
-        : MOVE(timestamp), MOVE(log_class), MOVE(log_level),
-        MOVE(location), MOVE(message)
-    {}
-#undef MOVE
-    Entry& operator=(const Entry&& o) {
-#define MOVE(member) member = std::move(o.member)
-        MOVE(timestamp);
-        MOVE(log_class);
-        MOVE(log_level);
-        MOVE(location);
-        MOVE(message);
-#undef MOVE
-        return *this;
-    }
 };
 /**
diff --git a/src/common/logging/log.h b/src/common/logging/log.h
index 2d9323a7b..ec7bb00b8 100644
--- a/src/common/logging/log.h
+++ b/src/common/logging/log.h
@@ -73,6 +73,8 @@ enum class Class : ClassType {
    Render,                     ///< Emulator video output and hardware acceleration
    Render_Software,            ///< Software renderer backend
    Render_OpenGL,              ///< OpenGL backend
+    Audio,                      ///< Emulator audio output
+    Audio_DSP,                  ///< The HLE implementation of the DSP
    Loader,                     ///< ROM loader
    Count ///< Total number of logging classes
diff --git a/src/core/hle/kernel/memory.cpp b/src/core/hle/kernel/memory.cpp
index 0cfb43fc7..862643448 100644
--- a/src/core/hle/kernel/memory.cpp
+++ b/src/core/hle/kernel/memory.cpp
@@ -7,6 +7,8 @@
 #include <utility>
 #include <vector>
+#include "audio_core/audio_core.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
@@ -107,7 +109,6 @@ struct MemoryArea {
 static MemoryArea memory_areas[] = {
    {SHARED_MEMORY_VADDR, SHARED_MEMORY_SIZE,     "Shared Memory"}, // Shared memory
    {VRAM_VADDR,          VRAM_SIZE,              "VRAM"},          // Video memory (VRAM)
-    {DSP_RAM_VADDR,       DSP_RAM_SIZE,           "DSP RAM"},       // DSP memory
    {TLS_AREA_VADDR,      TLS_AREA_SIZE,          "TLS Area"},      // TLS memory
 };
@@ -133,6 +134,8 @@ void InitLegacyAddressSpace(Kernel::VMManager& address_space) {
    auto shared_page_vma = address_space.MapBackingMemory(SHARED_PAGE_VADDR,
            (u8*)&SharedPage::shared_page, SHARED_PAGE_SIZE, MemoryState::Shared).MoveFrom();
    address_space.Reprotect(shared_page_vma, VMAPermission::Read);
+    AudioCore::AddAddressSpace(address_space);
 }
 } // namespace
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index d148efde2..16eb972fb 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -35,7 +35,7 @@ SharedPtr<Process> Process::Create(SharedPtr<CodeSet> code_set) {
    process->codeset = std::move(code_set);
    process->flags.raw = 0;
-    process->flags.memory_region = MemoryRegion::APPLICATION;
+    process->flags.memory_region.Assign(MemoryRegion::APPLICATION);
    Memory::InitLegacyAddressSpace(process->vm_manager);
    return process;
diff --git a/src/core/hle/result.h b/src/core/hle/result.h
index ea3abb5f6..0fce5988b 100644
--- a/src/core/hle/result.h
+++ b/src/core/hle/result.h
@@ -193,10 +193,10 @@ union ResultCode {
    explicit ResultCode(u32 raw) : raw(raw) {}
    ResultCode(ErrorDescription description_, ErrorModule module_,
            ErrorSummary summary_, ErrorLevel level_) : raw(0) {
-        description = description_;
+        description.Assign(description_);
-        module = module_;
+        module.Assign(module_);
-        summary = summary_;
+        summary.Assign(summary_);
-        level = level_;
+        level.Assign(level_);
    }
    ResultCode& operator=(const ResultCode& o) { raw = o.raw; return *this; }
diff --git a/src/core/hle/service/cfg/cfg.cpp b/src/core/hle/service/cfg/cfg.cpp
index 633fe19eb..7556aa6a5 100644
--- a/src/core/hle/service/cfg/cfg.cpp
+++ b/src/core/hle/service/cfg/cfg.cpp
@@ -293,8 +293,8 @@ ResultCode DeleteConfigNANDSaveFile() {
 ResultCode UpdateConfigNANDSavegame() {
    FileSys::Mode mode = {};
-    mode.write_flag = 1;
+    mode.write_flag.Assign(1);
-    mode.create_flag = 1;
+    mode.create_flag.Assign(1);
    FileSys::Path path("config");
@@ -405,7 +405,7 @@ void Init() {
    FileSys::Path config_path("config");
    FileSys::Mode open_mode = {};
-    open_mode.read_flag = 1;
+    open_mode.read_flag.Assign(1);
    auto config_result = Service::FS::OpenFileFromArchive(*archive_result, config_path, open_mode);
diff --git a/src/core/hle/service/dsp_dsp.cpp b/src/core/hle/service/dsp_dsp.cpp
index f9f931f6d..15d3274ec 100644
--- a/src/core/hle/service/dsp_dsp.cpp
+++ b/src/core/hle/service/dsp_dsp.cpp
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
+#include "audio_core/hle/pipe.h"
 #include "common/logging/log.h"
 #include "core/hle/kernel/event.h"
@@ -14,17 +16,30 @@ namespace DSP_DSP {
 static u32 read_pipe_count;
 static Kernel::SharedPtr<Kernel::Event> semaphore_event;
-static Kernel::SharedPtr<Kernel::Event> interrupt_event;
-void SignalInterrupt() {
+struct PairHash {
-    // TODO(bunnei): This is just a stub, it does not do anything other than signal to the emulated
+    template <typename T, typename U>
-    // application that a DSP interrupt occurred, without specifying which one. Since we do not
+    std::size_t operator()(const std::pair<T, U> &x) const {
-    // emulate the DSP yet (and how it works is largely unknown), this is a work around to get games
+        // TODO(yuriks): Replace with better hash combining function.
-    // that check the DSP interrupt signal event to run. We should figure out the different types of
+        return std::hash<T>()(x.first) ^ std::hash<U>()(x.second);
-    // DSP interrupts, and trigger them at the appropriate times.
+    }
+};
+/// Map of (audio interrupt number, channel number) to Kernel::Events. See: RegisterInterruptEvents
+static std::unordered_map<std::pair<u32, u32>, Kernel::SharedPtr<Kernel::Event>, PairHash> interrupt_events;
+// DSP Interrupts:
+// Interrupt #2 occurs every frame tick. Userland programs normally have a thread that's waiting
+// for an interrupt event. Immediately after this interrupt event, userland normally updates the
+// state in the next region and increments the relevant frame counter by two.
+void SignalAllInterrupts() {
+    // HACK: The purpose of the other interrupts is currently unknown, so we signal all of them on every tick.
+    for (auto& interrupt_event : interrupt_events)
+        interrupt_event.second->Signal();
+}
-    if (interrupt_event != 0)
+void SignalInterrupt(u32 interrupt, u32 channel) {
-        interrupt_event->Signal();
+    interrupt_events[std::make_pair(interrupt, channel)]->Signal();
 }
 /**
@@ -43,7 +58,7 @@ static void ConvertProcessAddressFromDspDram(Service::Interface* self) {
    cmd_buff[1] = 0; // No error
    cmd_buff[2] = (addr << 1) + (Memory::DSP_RAM_VADDR + 0x40000);
-    LOG_WARNING(Service_DSP, "(STUBBED) called with address 0x%08X", addr);
+    LOG_TRACE(Service_DSP, "addr=0x%08X", addr);
 }
 /**
@@ -121,8 +136,8 @@ static void FlushDataCache(Service::Interface* self) {
 /**
 * DSP_DSP::RegisterInterruptEvents service function
 *  Inputs:
- *      1 : Parameter 0 (purpose unknown)
+ *      1 : Interrupt Number
- *      2 : Parameter 1 (purpose unknown)
+ *      2 : Channel Number
 *      4 : Interrupt event handle
 *  Outputs:
 *      1 : Result of function, 0 on success, otherwise error code
@@ -130,22 +145,24 @@ static void FlushDataCache(Service::Interface* self) {
 static void RegisterInterruptEvents(Service::Interface* self) {
    u32* cmd_buff = Kernel::GetCommandBuffer();
-    u32 param0 = cmd_buff[1];
+    u32 interrupt = cmd_buff[1];
-    u32 param1 = cmd_buff[2];
+    u32 channel = cmd_buff[2];
    u32 event_handle = cmd_buff[4];
-    auto evt = Kernel::g_handle_table.Get<Kernel::Event>(cmd_buff[4]);
+    if (event_handle) {
-    if (evt != nullptr) {
+        auto evt = Kernel::g_handle_table.Get<Kernel::Event>(cmd_buff[4]);
-        interrupt_event = evt;
+        if (evt) {
-        cmd_buff[1] = 0; // No error
+            interrupt_events[std::make_pair(interrupt, channel)] = evt;
+            cmd_buff[1] = RESULT_SUCCESS.raw;
+            LOG_WARNING(Service_DSP, "Registered interrupt=%u, channel=%u, event_handle=0x%08X", interrupt, channel, event_handle);
+        } else {
+            cmd_buff[1] = -1;
+            LOG_ERROR(Service_DSP, "Invalid event handle! interrupt=%u, channel=%u, event_handle=0x%08X", interrupt, channel, event_handle);
+        }
    } else {
-        LOG_ERROR(Service_DSP, "called with invalid handle=%08X", cmd_buff[4]);
+        interrupt_events.erase(std::make_pair(interrupt, channel));
+        LOG_WARNING(Service_DSP, "Unregistered interrupt=%u, channel=%u, event_handle=0x%08X", interrupt, channel, event_handle);
-        // TODO(yuriks): An error should be returned from SendSyncRequest, not in the cmdbuf
-        cmd_buff[1] = -1;
    }
-    LOG_WARNING(Service_DSP, "(STUBBED) called param0=%u, param1=%u, event_handle=0x%08X", param0, param1, event_handle);
 }
 /**
@@ -158,8 +175,6 @@ static void RegisterInterruptEvents(Service::Interface* self) {
 static void SetSemaphore(Service::Interface* self) {
    u32* cmd_buff = Kernel::GetCommandBuffer();
-    SignalInterrupt();
    cmd_buff[1] = 0; // No error
    LOG_WARNING(Service_DSP, "(STUBBED) called");
@@ -168,9 +183,9 @@ static void SetSemaphore(Service::Interface* self) {
 /**
 * DSP_DSP::WriteProcessPipe service function
 *  Inputs:
- *      1 : Number
+ *      1 : Channel
 *      2 : Size
- *      3 : (size <<14) | 0x402
+ *      3 : (size << 14) | 0x402
 *      4 : Buffer
 *  Outputs:
 *      0 : Return header
@@ -179,21 +194,42 @@ static void SetSemaphore(Service::Interface* self) {
 static void WriteProcessPipe(Service::Interface* self) {
    u32* cmd_buff = Kernel::GetCommandBuffer();
-    u32 number   = cmd_buff[1];
+    u32 channel  = cmd_buff[1];
    u32 size     = cmd_buff[2];
-    u32 new_size = cmd_buff[3];
    u32 buffer   = cmd_buff[4];
+    if (IPC::StaticBufferDesc(size, 1) != cmd_buff[3]) {
+        LOG_ERROR(Service_DSP, "IPC static buffer descriptor failed validation (0x%X). channel=%u, size=0x%X, buffer=0x%08X", cmd_buff[3], channel, size, buffer);
+        cmd_buff[1] = -1; // TODO
+        return;
+    }
+    if (!Memory::GetPointer(buffer)) {
+        LOG_ERROR(Service_DSP, "Invalid Buffer: channel=%u, size=0x%X, buffer=0x%08X", channel, size, buffer);
+        cmd_buff[1] = -1; // TODO
+        return;
+    }
+    std::vector<u8> message(size);
+    for (size_t i = 0; i < size; i++) {
+        message[i] = Memory::Read8(buffer + i);
+    }
+    DSP::HLE::PipeWrite(channel, message);
    cmd_buff[1] = RESULT_SUCCESS.raw; // No error
-    LOG_WARNING(Service_DSP, "(STUBBED) called number=%u, size=0x%X, new_size=0x%X, buffer=0x%08X",
+    LOG_TRACE(Service_DSP, "channel=%u, size=0x%X, buffer=0x%08X", channel, size, buffer);
-                number, size, new_size, buffer);
 }
 /**
 * DSP_DSP::ReadPipeIfPossible service function
+ *      A pipe is a means of communication between the ARM11 and DSP that occurs on
+ *      hardware by writing to/reading from the DSP registers at 0x10203000.
+ *      Pipes are used for initialisation. See also DSP::HLE::PipeRead.
 *  Inputs:
- *      1 : Unknown
+ *      1 : Pipe Number
 *      2 : Unknown
 *      3 : Size in bytes of read (observed only lower half word used)
 *      0x41 : Virtual address to read from DSP pipe to in memory
@@ -204,35 +240,25 @@ static void WriteProcessPipe(Service::Interface* self) {
 static void ReadPipeIfPossible(Service::Interface* self) {
    u32* cmd_buff = Kernel::GetCommandBuffer();
-    u32 unk1 = cmd_buff[1];
+    u32 pipe = cmd_buff[1];
    u32 unk2 = cmd_buff[2];
    u32 size = cmd_buff[3] & 0xFFFF;// Lower 16 bits are size
    VAddr addr = cmd_buff[0x41];
-    // Canned DSP responses that games expect. These were taken from HW by 3dmoo team.
+    if (!Memory::GetPointer(addr)) {
-    // TODO: Remove this hack :)
+        LOG_ERROR(Service_DSP, "Invalid addr: pipe=0x%08X, unk2=0x%08X, size=0x%X, buffer=0x%08X", pipe, unk2, size, addr);
-    static const std::array<u16, 16> canned_read_pipe = {{
+        cmd_buff[1] = -1; // TODO
-        0x000F, 0xBFFF, 0x9E8E, 0x8680, 0xA78E, 0x9430, 0x8400, 0x8540,
+        return;
-        0x948E, 0x8710, 0x8410, 0xA90E, 0xAA0E, 0xAACE, 0xAC4E, 0xAC58
+    }
-    }};
-    u32 initial_size = read_pipe_count;
+    std::vector<u8> response = DSP::HLE::PipeRead(pipe, size);
-    for (unsigned offset = 0; offset < size; offset += sizeof(u16)) {
+    Memory::WriteBlock(addr, response.data(), response.size());
-        if (read_pipe_count < canned_read_pipe.size()) {
-            Memory::Write16(addr + offset, canned_read_pipe[read_pipe_count]);
-            read_pipe_count++;
-        } else {
-            LOG_ERROR(Service_DSP, "canned read pipe log exceeded!");
-            break;
-        }
-    }
    cmd_buff[1] = 0; // No error
-    cmd_buff[2] = (read_pipe_count - initial_size) * sizeof(u16);
+    cmd_buff[2] = (u32)response.size();
-    LOG_WARNING(Service_DSP, "(STUBBED) called unk1=0x%08X, unk2=0x%08X, size=0x%X, buffer=0x%08X",
+    LOG_TRACE(Service_DSP, "pipe=0x%08X, unk2=0x%08X, size=0x%X, buffer=0x%08X", pipe, unk2, size, addr);
-                unk1, unk2, size, addr);
 }
 /**
@@ -311,7 +337,6 @@ const Interface::FunctionInfo FunctionTable[] = {
 Interface::Interface() {
    semaphore_event = Kernel::Event::Create(RESETTYPE_ONESHOT, "DSP_DSP::semaphore_event");
-    interrupt_event = nullptr;
    read_pipe_count = 0;
    Register(FunctionTable);
@@ -319,7 +344,7 @@ Interface::Interface() {
 Interface::~Interface() {
    semaphore_event = nullptr;
-    interrupt_event = nullptr;
+    interrupt_events.clear();
 }
 } // namespace
diff --git a/src/core/hle/service/dsp_dsp.h b/src/core/hle/service/dsp_dsp.h
index b6f611db5..32b89e9bb 100644
--- a/src/core/hle/service/dsp_dsp.h
+++ b/src/core/hle/service/dsp_dsp.h
@@ -23,7 +23,15 @@ public:
    }
 };
-/// Signals that a DSP interrupt has occurred to userland code
+/// Signal all audio related interrupts.
-void SignalInterrupt();
+void SignalAllInterrupts();
+/**
+ * Signal a specific audio related interrupt based on interrupt id and channel id.
+ * @param interrupt_id The interrupt id
+ * @param channel_id The channel id
+ * The significance of various values of interrupt_id and channel_id is not yet known.
+ */
+void SignalInterrupt(u32 interrupt_id, u32 channel_id);
 } // namespace
diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp
index 98b11c798..5838b6d71 100644
--- a/src/core/hle/service/gsp_gpu.cpp
+++ b/src/core/hle/service/gsp_gpu.cpp
@@ -347,7 +347,7 @@ void SignalInterrupt(InterruptId interrupt_id) {
            FrameBufferUpdate* info = GetFrameBufferInfo(thread_id, screen_id);
            if (info->is_dirty) {
                SetBufferSwap(screen_id, info->framebuffer_info[info->index]);
-                info->is_dirty = false;
+                info->is_dirty.Assign(false);
            }
        }
    }
@@ -499,7 +499,7 @@ static void SetLcdForceBlack(Service::Interface* self) {
    // Since data is already zeroed, there is no need to explicitly set
    // the color to black (all zero).
-    data.is_enabled = enable_black;
+    data.is_enabled.Assign(enable_black);
    LCD::Write(HW::VADDR_LCD + 4 * LCD_REG_INDEX(color_fill_top), data.raw); // Top LCD
    LCD::Write(HW::VADDR_LCD + 4 * LCD_REG_INDEX(color_fill_bottom), data.raw); // Bottom LCD
@@ -521,7 +521,7 @@ static void TriggerCmdReqQueue(Service::Interface* self) {
            ExecuteCommand(command_buffer->commands[i], thread_id);
            // Indicates that command has completed
-            command_buffer->number_commands = command_buffer->number_commands - 1;
+            command_buffer->number_commands.Assign(command_buffer->number_commands - 1);
        }
    }
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index 0bed0ce36..11d7e69a1 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -105,7 +105,7 @@ void Update() {
    bool pressed = false;
    std::tie(touch_entry->x, touch_entry->y, pressed) = VideoCore::g_emu_window->GetTouchState();
-    touch_entry->valid = pressed ? 1 : 0;
+    touch_entry->valid.Assign(pressed ? 1 : 0);
    // TODO(bunnei): We're not doing anything with offset 0xA8 + 0x18 of HID SharedMemory, which
    // supposedly is "Touch-screen entry, which contains the raw coordinate data prior to being
diff --git a/src/core/hle/service/ptm/ptm.cpp b/src/core/hle/service/ptm/ptm.cpp
index 22c1093ff..6bdee4d9e 100644
--- a/src/core/hle/service/ptm/ptm.cpp
+++ b/src/core/hle/service/ptm/ptm.cpp
@@ -110,8 +110,8 @@ void Init() {
        FileSys::Path gamecoin_path("gamecoin.dat");
        FileSys::Mode open_mode = {};
-        open_mode.write_flag = 1;
+        open_mode.write_flag.Assign(1);
-        open_mode.create_flag = 1;
+        open_mode.create_flag.Assign(1);
        // Open the file and write the default gamecoin information
        auto gamecoin_result = Service::FS::OpenFileFromArchive(*archive_result, gamecoin_path, open_mode);
        if (gamecoin_result.Succeeded()) {
diff --git a/src/core/hle/service/soc_u.cpp b/src/core/hle/service/soc_u.cpp
index 822b093f4..e603bf794 100644
--- a/src/core/hle/service/soc_u.cpp
+++ b/src/core/hle/service/soc_u.cpp
@@ -178,17 +178,17 @@ struct CTRPollFD {
        static Events TranslateTo3DS(u32 input_event) {
            Events ev = {};
            if (input_event & POLLIN)
-                ev.pollin = 1;
+                ev.pollin.Assign(1);
            if (input_event & POLLPRI)
-                ev.pollpri = 1;
+                ev.pollpri.Assign(1);
            if (input_event & POLLHUP)
-                ev.pollhup = 1;
+                ev.pollhup.Assign(1);
            if (input_event & POLLERR)
-                ev.pollerr = 1;
+                ev.pollerr.Assign(1);
            if (input_event & POLLOUT)
-                ev.pollout = 1;
+                ev.pollout.Assign(1);
            if (input_event & POLLNVAL)
-                ev.pollnval = 1;
+                ev.pollnval.Assign(1);
            return ev;
        }
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index 4bd3a632d..5312baa83 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -17,7 +17,6 @@
 #include "core/core_timing.h"
 #include "core/hle/service/gsp_gpu.h"
-#include "core/hle/service/dsp_dsp.h"
 #include "core/hle/service/hid/hid.h"
 #include "core/hw/hw.h"
@@ -146,8 +145,8 @@ inline void Write(u32 addr, const T data) {
            // Reset "trigger" flag and set the "finish" flag
            // NOTE: This was confirmed to happen on hardware even if "address_start" is zero.
-            config.trigger = 0;
+            config.trigger.Assign(0);
-            config.finished = 1;
+            config.finished.Assign(1);
        }
        break;
    }
@@ -414,11 +413,6 @@ static void VBlankCallback(u64 userdata, int cycles_late) {
    GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PDC0);
    GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PDC1);
-    // TODO(bunnei): Fake a DSP interrupt on each frame. This does not belong here, but
-    // until we can emulate DSP interrupts, this is probably the only reasonable place to do
-    // this. Certain games expect this to be periodically signaled.
-    DSP_DSP::SignalInterrupt();
    // Check for user input updates
    Service::HID::Update();
@@ -444,16 +438,16 @@ void Init() {
    framebuffer_sub.address_left1  = 0x1848F000;
    framebuffer_sub.address_left2  = 0x184C7800;
-    framebuffer_top.width = 240;
+    framebuffer_top.width.Assign(240);
-    framebuffer_top.height = 400;
+    framebuffer_top.height.Assign(400);
    framebuffer_top.stride = 3 * 240;
-    framebuffer_top.color_format = Regs::PixelFormat::RGB8;
+    framebuffer_top.color_format.Assign(Regs::PixelFormat::RGB8);
    framebuffer_top.active_fb = 0;
-    framebuffer_sub.width = 240;
+    framebuffer_sub.width.Assign(240);
-    framebuffer_sub.height = 320;
+    framebuffer_sub.height.Assign(320);
    framebuffer_sub.stride = 3 * 240;
-    framebuffer_sub.color_format = Regs::PixelFormat::RGB8;
+    framebuffer_sub.color_format.Assign(Regs::PixelFormat::RGB8);
    framebuffer_sub.active_fb = 0;
    last_skip_frame = false;
diff --git a/src/core/system.cpp b/src/core/system.cpp
index 7e9c56538..b62ebf69e 100644
--- a/src/core/system.cpp
+++ b/src/core/system.cpp
@@ -2,9 +2,12 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
+#include "audio_core/audio_core.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/system.h"
+#include "core/gdbstub/gdbstub.h"
 #include "core/hw/hw.h"
 #include "core/hle/hle.h"
 #include "core/hle/kernel/kernel.h"
@@ -12,8 +15,6 @@
 #include "video_core/video_core.h"
-#include "core/gdbstub/gdbstub.h"
 namespace System {
 void Init(EmuWindow* emu_window) {
@@ -24,11 +25,13 @@ void Init(EmuWindow* emu_window) {
    Kernel::Init();
    HLE::Init();
    VideoCore::Init(emu_window);
+    AudioCore::Init();
    GDBStub::Init();
 }
 void Shutdown() {
    GDBStub::Shutdown();
+    AudioCore::Shutdown();
    VideoCore::Shutdown();
    HLE::Shutdown();
    Kernel::Shutdown();
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index c3d7294d5..4b5d298f3 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -33,6 +33,7 @@ set(HEADERS
            command_processor.h
            gpu_debugger.h
            pica.h
+            pica_types.h
            primitive_assembly.h
            rasterizer.h
            rasterizer_interface.h
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 5d609da06..a385589d2 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -59,15 +59,17 @@ static void InitScreenCoordinates(OutputVertex& vtx)
    } viewport;
    const auto& regs = g_state.regs;
-    viewport.halfsize_x = float24::FromRawFloat24(regs.viewport_size_x);
+    viewport.halfsize_x = float24::FromRaw(regs.viewport_size_x);
-    viewport.halfsize_y = float24::FromRawFloat24(regs.viewport_size_y);
+    viewport.halfsize_y = float24::FromRaw(regs.viewport_size_y);
    viewport.offset_x   = float24::FromFloat32(static_cast<float>(regs.viewport_corner.x));
    viewport.offset_y   = float24::FromFloat32(static_cast<float>(regs.viewport_corner.y));
-    viewport.zscale     = float24::FromRawFloat24(regs.viewport_depth_range);
+    viewport.zscale     = float24::FromRaw(regs.viewport_depth_range);
-    viewport.offset_z   = float24::FromRawFloat24(regs.viewport_depth_far_plane);
+    viewport.offset_z   = float24::FromRaw(regs.viewport_depth_far_plane);
    float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w;
    vtx.color *= inv_w;
+    vtx.view *= inv_w;
+    vtx.quat *= inv_w;
    vtx.tc0 *= inv_w;
    vtx.tc1 *= inv_w;
    vtx.tc2 *= inv_w;
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 59c75042c..73fdfbe9c 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -98,10 +98,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                Math::Vec4<float24>& attribute = g_state.vs.default_attributes[setup.index];
                // NOTE: The destination component order indeed is "backwards"
-                attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
+                attribute.w = float24::FromRaw(default_attr_write_buffer[0] >> 8);
-                attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
+                attribute.z = float24::FromRaw(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
-                attribute.y = float24::FromRawFloat24(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
+                attribute.y = float24::FromRaw(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
-                attribute.x = float24::FromRawFloat24(default_attr_write_buffer[2] & 0xFFFFFF);
+                attribute.x = float24::FromRaw(default_attr_write_buffer[2] & 0xFFFFFF);
                LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
                          attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
@@ -157,15 +157,25 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                // TODO: What happens if a loader overwrites a previous one's data?
                for (unsigned component = 0; component < loader_config.component_count; ++component) {
-                    if (component >= 12)
+                    if (component >= 12) {
                        LOG_ERROR(HW_GPU, "Overflow in the vertex attribute loader %u trying to load component %u", loader, component);
+                        continue;
+                    }
                    u32 attribute_index = loader_config.GetComponent(component);
-                    vertex_attribute_sources[attribute_index] = load_address;
+                    if (attribute_index < 12) {
-                    vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count);
+                        vertex_attribute_sources[attribute_index] = load_address;
-                    vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index);
+                        vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count);
-                    vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
+                        vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index);
-                    vertex_attribute_element_size[attribute_index] = attribute_config.GetElementSizeInBytes(attribute_index);
+                        vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
-                    load_address += attribute_config.GetStride(attribute_index);
+                        vertex_attribute_element_size[attribute_index] = attribute_config.GetElementSizeInBytes(attribute_index);
+                        load_address += attribute_config.GetStride(attribute_index);
+                    } else if (attribute_index < 16) {
+                        // Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings, respectively
+                        load_address += (attribute_index - 11) * 4;
+                    } else {
+                        UNREACHABLE(); // This is truly unreachable due to the number of bits for each component
+                    }
                }
            }
@@ -418,10 +428,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                        uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i]));
                } else {
                    // TODO: Untested
-                    uniform.w = float24::FromRawFloat24(uniform_write_buffer[0] >> 8);
+                    uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8);
-                    uniform.z = float24::FromRawFloat24(((uniform_write_buffer[0] & 0xFF)<<16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF));
+                    uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF));
-                    uniform.y = float24::FromRawFloat24(((uniform_write_buffer[1] & 0xFFFF)<<8) | ((uniform_write_buffer[2] >> 24) & 0xFF));
+                    uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | ((uniform_write_buffer[2] >> 24) & 0xFF));
-                    uniform.x = float24::FromRawFloat24(uniform_write_buffer[2] & 0xFFFFFF);
+                    uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF);
                }
                LOG_TRACE(HW_GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index,
@@ -429,7 +439,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                          uniform.w.ToFloat32());
                // TODO: Verify that this actually modifies the register!
-                uniform_setup.index = uniform_setup.index + 1;
+                uniform_setup.index.Assign(uniform_setup.index + 1);
            }
            break;
        }
@@ -464,6 +474,24 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
            break;
        }
+        case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[0], 0x1c8):
+        case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[1], 0x1c9):
+        case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[2], 0x1ca):
+        case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[3], 0x1cb):
+        case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[4], 0x1cc):
+        case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[5], 0x1cd):
+        case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[6], 0x1ce):
+        case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[7], 0x1cf):
+        {
+            auto& lut_config = regs.lighting.lut_config;
+            ASSERT_MSG(lut_config.index < 256, "lut_config.index exceeded maximum value of 255!");
+            g_state.lighting.luts[lut_config.type][lut_config.index].raw = value;
+            lut_config.index.Assign(lut_config.index + 1);
+            break;
+        }
        default:
            break;
    }
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 4f66dbd65..6e6fd7335 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -201,11 +201,11 @@ void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, c
                    if (it == output_info_table.end()) {
                        output_info_table.emplace_back();
-                        output_info_table.back().type = type;
+                        output_info_table.back().type.Assign(type);
-                        output_info_table.back().component_mask = component_mask;
+                        output_info_table.back().component_mask.Assign(component_mask);
-                        output_info_table.back().id = i;
+                        output_info_table.back().id.Assign(i);
                    } else {
-                        it->component_mask = it->component_mask | component_mask;
+                        it->component_mask.Assign(it->component_mask | component_mask);
                    }
                } catch (const std::out_of_range& ) {
                    DEBUG_ASSERT_MSG(false, "Unknown output attribute mapping");
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 2f1b2dec4..9077b1725 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -16,6 +16,8 @@
 #include "common/vector_math.h"
 #include "common/logging/log.h"
+#include "pica_types.h"
 namespace Pica {
 // Returns index corresponding to the Regs member labeled by field_name
@@ -239,7 +241,8 @@ struct Regs {
    TextureConfig texture0;
    INSERT_PADDING_WORDS(0x8);
    BitField<0, 4, TextureFormat> texture0_format;
-    INSERT_PADDING_WORDS(0x2);
+    BitField<0, 1, u32> fragment_lighting_enable;
+    INSERT_PADDING_WORDS(0x1);
    TextureConfig texture1;
    BitField<0, 4, TextureFormat> texture1_format;
    INSERT_PADDING_WORDS(0x2);
@@ -641,7 +644,268 @@ struct Regs {
        }
    }
-    INSERT_PADDING_WORDS(0xe0);
+    INSERT_PADDING_WORDS(0x20);
+    /// Identifies one of the lighting look-up table (LUT) samplers.
+    /// NOTE(review): the numbering has gaps (2, 7 and 9-15 have no enumerator); the values
+    /// presumably mirror the hardware LUT type written to `lut_config.type` - confirm against HW
+    /// documentation.
+    enum class LightingSampler {
+        Distribution0 = 0,
+        Distribution1 = 1,
+        Fresnel = 3,
+        ReflectBlue = 4,
+        ReflectGreen = 5,
+        ReflectRed = 6,
+        SpotlightAttenuation = 8,
+        DistanceAttenuation = 16,
+    };
+    /**
+     * Pica fragment lighting supports using different LUTs for each lighting component:
+     * Reflectance R, G, and B channels, distribution function for specular components 0 and 1,
+     * fresnel factor, and spotlight attenuation. Furthermore, which LUTs are used for each channel
+     * (or whether a channel is enabled at all) is specified by various pre-defined lighting
+     * configurations. With configurations that require more LUTs, more cycles are required on HW to
+     * perform lighting computations.
+     */
+    enum class LightingConfig {
+        Config0 = 0, ///< Reflect Red, Distribution 0, Spotlight
+        Config1 = 1, ///< Reflect Red, Fresnel, Spotlight
+        Config2 = 2, ///< Reflect Red, Distribution 0/1
+        Config3 = 3, ///< Distribution 0/1, Fresnel
+        Config4 = 4, ///< Reflect Red/Green/Blue, Distribution 0/1, Spotlight
+        Config5 = 5, ///< Reflect Red/Green/Blue, Distribution 0, Fresnel, Spotlight
+        Config6 = 6, ///< Reflect Red, Distribution 0/1, Fresnel, Spotlight
+        Config7 = 8, ///< Reflect Red/Green/Blue, Distribution 0/1, Fresnel, Spotlight
+                     ///< NOTE: '8' is intentional, '7' does not appear to be a valid configuration
+    };
+    /// Selects which lighting components are affected by fresnel
+    enum class LightingFresnelSelector {
+        None = 0,                             ///< Fresnel is disabled
+        PrimaryAlpha = 1,                     ///< Primary (diffuse) lighting alpha is affected by fresnel
+        SecondaryAlpha = 2,                   ///< Secondary (specular) lighting alpha is affected by fresnel
+        Both = PrimaryAlpha | SecondaryAlpha, ///< Both primary and secondary lighting alphas are affected by fresnel
+    };
+    /// Factor used to scale the output of a lighting LUT.
+    /// The encodings 4 and 5 have no enumerator here; `lut_scale.GetScale` returns 0.0f for any
+    /// value that is not listed below.
+    enum class LightingScale {
+        Scale1 = 0,   ///< Scale is 1x
+        Scale2 = 1,   ///< Scale is 2x
+        Scale4 = 2,   ///< Scale is 4x
+        Scale8 = 3,   ///< Scale is 8x
+        Scale1_4 = 6, ///< Scale is 0.25x
+        Scale1_2 = 7, ///< Scale is 0.5x
+    };
+    /// Selects which cosine term is used as the input of a lighting LUT (see `lut_input`)
+    enum class LightingLutInput {
+        NH = 0, ///< Cosine of the angle between the normal and half-angle vectors
+        VH = 1, ///< Cosine of the angle between the view and half-angle vectors
+        NV = 2, ///< Cosine of the angle between the normal and the view vector
+        LN = 3, ///< Cosine of the angle between the light and the normal vectors
+    };
+    /// Bump mapping mode, stored in the 2-bit `bump_mode` field of the lighting registers.
+    /// NOTE(review): presumably NormalMap/TangentMap select whether the bump texture perturbs the
+    /// normal or the tangent - confirm against HW documentation.
+    enum class LightingBumpMode : u32 {
+        None = 0,
+        NormalMap = 1,
+        TangentMap = 2,
+    };
+    /// Packed light color register: three 10-bit channels (blue lowest, red highest) in one word
+    union LightColor {
+        BitField< 0, 10, u32> b;
+        BitField<10, 10, u32> g;
+        BitField<20, 10, u32> r;
+        /// Converts the packed channels into a floating-point (r, g, b) vector
+        Math::Vec3f ToVec3f() const {
+            // Although every channel is 10 bits wide, a raw value of 255 corresponds to 1.0f
+            const f32 red = static_cast<f32>(r) / 255.f;
+            const f32 green = static_cast<f32>(g) / 255.f;
+            const f32 blue = static_cast<f32>(b) / 255.f;
+            return Math::MakeVec(red, green, blue);
+        }
+    };
+    /**
+     * Returns true if the specified lighting sampler is supported by the current Pica lighting
+     * configuration.
+     *
+     * The per-sampler rules follow the component lists documented on each `LightingConfig`
+     * enumerator.
+     *
+     * @param config Lighting configuration currently selected
+     * @param sampler LUT sampler to query
+     * @return true if `config` makes use of `sampler`, false otherwise
+     */
+    static bool IsLightingSamplerSupported(LightingConfig config, LightingSampler sampler) {
+        switch (sampler) {
+        case LightingSampler::Distribution0:
+            return (config != LightingConfig::Config1);
+        case LightingSampler::Distribution1:
+            return (config != LightingConfig::Config0) && (config != LightingConfig::Config1) && (config != LightingConfig::Config5);
+        case LightingSampler::SpotlightAttenuation:
+            // Config2 and Config3 are the only configurations that do not list the spotlight LUT
+            return (config != LightingConfig::Config2) && (config != LightingConfig::Config3);
+        case LightingSampler::Fresnel:
+            return (config != LightingConfig::Config0) && (config != LightingConfig::Config2) && (config != LightingConfig::Config4);
+        case LightingSampler::ReflectRed:
+            return (config != LightingConfig::Config3);
+        case LightingSampler::ReflectGreen:
+        case LightingSampler::ReflectBlue:
+            return (config == LightingConfig::Config4) || (config == LightingConfig::Config5) || (config == LightingConfig::Config7);
+        case LightingSampler::DistanceAttenuation:
+            // Not governed by the lighting configuration: distance attenuation is toggled per
+            // light through the disable_dist_atten_light_N bits, so it is reported as unsupported
+            break;
+        }
+        return false;
+    }
+    /// Fragment lighting registers. The layout is hardware-defined: the block is asserted to start
+    /// at register 0x140 and every LightSrc must occupy exactly 0x10 words.
+    struct {
+        /// Registers describing a single light source
+        struct LightSrc {
+            LightColor specular_0;  // material.specular_0 * light.specular_0
+            LightColor specular_1;  // material.specular_1 * light.specular_1
+            LightColor diffuse;     // material.diffuse * light.diffuse
+            LightColor ambient;     // material.ambient * light.ambient
+            struct {
+                // Encoded as 16-bit floating point
+                union {
+                    BitField< 0, 16, u32> x;
+                    BitField<16, 16, u32> y;
+                };
+                union {
+                    BitField< 0, 16, u32> z;
+                };
+                INSERT_PADDING_WORDS(0x3);
+                union {
+                    BitField<0, 1, u32> directional;
+                    BitField<1, 1, u32> two_sided_diffuse; // When disabled, clamp dot-product to 0
+                };
+            };
+            // NOTE(review): 20-bit fields, presumably encoded as float20 - confirm
+            BitField<0, 20, u32> dist_atten_bias;
+            BitField<0, 20, u32> dist_atten_scale;
+            INSERT_PADDING_WORDS(0x4);
+        };
+        static_assert(sizeof(LightSrc) == 0x10 * sizeof(u32), "LightSrc structure must be 0x10 words");
+        LightSrc light[8];
+        LightColor global_ambient; // Emission + (material.ambient * lighting.ambient)
+        INSERT_PADDING_WORDS(0x1);
+        BitField<0, 3, u32> num_lights; // Number of enabled lights - 1
+        union {
+            BitField< 2, 2, LightingFresnelSelector> fresnel_selector;
+            BitField< 4, 4, LightingConfig> config;
+            BitField<22, 2, u32> bump_selector; // 0: Texture 0, 1: Texture 1, 2: Texture 2
+            BitField<27, 1, u32> clamp_highlights;
+            BitField<28, 2, LightingBumpMode> bump_mode;
+            BitField<30, 1, u32> disable_bump_renorm;
+        };
+        union {
+            BitField<16, 1, u32> disable_lut_d0;
+            BitField<17, 1, u32> disable_lut_d1;
+            BitField<19, 1, u32> disable_lut_fr;
+            BitField<20, 1, u32> disable_lut_rr;
+            BitField<21, 1, u32> disable_lut_rg;
+            BitField<22, 1, u32> disable_lut_rb;
+            // Each bit, when set, disables distance attenuation for the corresponding light
+            BitField<24, 1, u32> disable_dist_atten_light_0;
+            BitField<25, 1, u32> disable_dist_atten_light_1;
+            BitField<26, 1, u32> disable_dist_atten_light_2;
+            BitField<27, 1, u32> disable_dist_atten_light_3;
+            BitField<28, 1, u32> disable_dist_atten_light_4;
+            BitField<29, 1, u32> disable_dist_atten_light_5;
+            BitField<30, 1, u32> disable_dist_atten_light_6;
+            BitField<31, 1, u32> disable_dist_atten_light_7;
+        };
+        /// Returns true if the distance attenuation disable bit of light `index` is set.
+        /// `index` must be in the range 0-7; it is used as an array index without a bounds check.
+        bool IsDistAttenDisabled(unsigned index) const {
+            const unsigned disable[] = { disable_dist_atten_light_0, disable_dist_atten_light_1,
+                                         disable_dist_atten_light_2, disable_dist_atten_light_3,
+                                         disable_dist_atten_light_4, disable_dist_atten_light_5,
+                                         disable_dist_atten_light_6, disable_dist_atten_light_7 };
+            return disable[index] != 0;
+        }
+        union {
+            BitField<0, 8, u32> index;      ///< Index at which to set data in the LUT
+            BitField<8, 5, u32> type;       ///< Type of LUT for which to set data
+        } lut_config;
+        BitField<0, 1, u32> disable;
+        INSERT_PADDING_WORDS(0x1);
+        // When data is written to any of these registers, it gets written to the lookup table of
+        // the selected type at the selected index, specified above in the `lut_config` register.
+        // With each write, `lut_config.index` is incremented. It does not matter which of these
+        // registers is written to, the behavior will be the same.
+        u32 lut_data[8];
+        // These are used to specify if absolute (abs) value should be used for each LUT index. When
+        // abs mode is disabled, LUT indexes are in the range of (-1.0, 1.0). Otherwise, they are in
+        // the range of (0.0, 1.0).
+        union {
+            BitField< 1, 1, u32> disable_d0;
+            BitField< 5, 1, u32> disable_d1;
+            BitField< 9, 1, u32> disable_sp;
+            BitField<13, 1, u32> disable_fr;
+            BitField<17, 1, u32> disable_rb;
+            BitField<21, 1, u32> disable_rg;
+            BitField<25, 1, u32> disable_rr;
+        } abs_lut_input;
+        // Input selection for each LUT, see LightingLutInput
+        union {
+            BitField< 0, 3, LightingLutInput> d0;
+            BitField< 4, 3, LightingLutInput> d1;
+            BitField< 8, 3, LightingLutInput> sp;
+            BitField<12, 3, LightingLutInput> fr;
+            BitField<16, 3, LightingLutInput> rb;
+            BitField<20, 3, LightingLutInput> rg;
+            BitField<24, 3, LightingLutInput> rr;
+        } lut_input;
+        // Output scale for each LUT, see LightingScale
+        union {
+            BitField< 0, 3, LightingScale> d0;
+            BitField< 4, 3, LightingScale> d1;
+            BitField< 8, 3, LightingScale> sp;
+            BitField<12, 3, LightingScale> fr;
+            BitField<16, 3, LightingScale> rb;
+            BitField<20, 3, LightingScale> rg;
+            BitField<24, 3, LightingScale> rr;
+            /// Converts a LightingScale encoding into its multiplier. Values without an
+            /// enumerator fall out of the switch and yield 0.0f.
+            static float GetScale(LightingScale scale) {
+                switch (scale) {
+                case LightingScale::Scale1:
+                    return 1.0f;
+                case LightingScale::Scale2:
+                    return 2.0f;
+                case LightingScale::Scale4:
+                    return 4.0f;
+                case LightingScale::Scale8:
+                    return 8.0f;
+                case LightingScale::Scale1_4:
+                    return 0.25f;
+                case LightingScale::Scale1_2:
+                    return 0.5f;
+                }
+                return 0.0f;
+            }
+        } lut_scale;
+        INSERT_PADDING_WORDS(0x6);
+        union {
+            // There are 8 light enable "slots", corresponding to the total number of lights
+            // supported by Pica. For N enabled lights (specified by register 0x1c2, `num_lights`
+            // above, which stores N - 1), the first N slots below will be set to integers within
+            // the range of 0-7, corresponding to the actual light that is enabled for each slot.
+            BitField< 0, 3, u32> slot_0;
+            BitField< 4, 3, u32> slot_1;
+            BitField< 8, 3, u32> slot_2;
+            BitField<12, 3, u32> slot_3;
+            BitField<16, 3, u32> slot_4;
+            BitField<20, 3, u32> slot_5;
+            BitField<24, 3, u32> slot_6;
+            BitField<28, 3, u32> slot_7;
+            /// Returns the light number stored in slot `index`.
+            /// `index` must be in the range 0-7; it is used as an array index without a bounds check.
+            unsigned GetNum(unsigned index) const {
+                const unsigned enable_slots[] = { slot_0, slot_1, slot_2, slot_3, slot_4, slot_5, slot_6, slot_7 };
+                return enable_slots[index];
+            }
+        } light_enable;
+    } lighting;
+    INSERT_PADDING_WORDS(0x26);
    enum class VertexAttributeFormat : u64 {
        BYTE = 0,
@@ -990,6 +1254,7 @@ ASSERT_REG_POSITION(viewport_corner, 0x68);
 ASSERT_REG_POSITION(texture0_enable, 0x80);
 ASSERT_REG_POSITION(texture0, 0x81);
 ASSERT_REG_POSITION(texture0_format, 0x8e);
+ASSERT_REG_POSITION(fragment_lighting_enable, 0x8f);
 ASSERT_REG_POSITION(texture1, 0x91);
 ASSERT_REG_POSITION(texture1_format, 0x96);
 ASSERT_REG_POSITION(texture2, 0x99);
@@ -1004,6 +1269,7 @@ ASSERT_REG_POSITION(tev_stage5, 0xf8);
 ASSERT_REG_POSITION(tev_combiner_buffer_color, 0xfd);
 ASSERT_REG_POSITION(output_merger, 0x100);
 ASSERT_REG_POSITION(framebuffer, 0x110);
+ASSERT_REG_POSITION(lighting, 0x140);
 ASSERT_REG_POSITION(vertex_attributes, 0x200);
 ASSERT_REG_POSITION(index_array, 0x227);
 ASSERT_REG_POSITION(num_vertices, 0x228);
@@ -1026,118 +1292,6 @@ static_assert(sizeof(Regs::ShaderConfig) == 0x30 * sizeof(u32), "ShaderConfig st
 static_assert(sizeof(Regs) <= 0x300 * sizeof(u32), "Register set structure larger than it should be");
 static_assert(sizeof(Regs) >= 0x300 * sizeof(u32), "Register set structure smaller than it should be");
-struct float24 {
-    static float24 FromFloat32(float val) {
-        float24 ret;
-        ret.value = val;
-        return ret;
-    }
-    // 16 bit mantissa, 7 bit exponent, 1 bit sign
-    // TODO: No idea if this works as intended
-    static float24 FromRawFloat24(u32 hex) {
-        float24 ret;
-        if ((hex & 0xFFFFFF) == 0) {
-            ret.value = 0;
-        } else {
-            u32 mantissa = hex & 0xFFFF;
-            u32 exponent = (hex >> 16) & 0x7F;
-            u32 sign = hex >> 23;
-            ret.value = std::pow(2.0f, (float)exponent-63.0f) * (1.0f + mantissa * std::pow(2.0f, -16.f));
-            if (sign)
-                ret.value = -ret.value;
-        }
-        return ret;
-    }
-    static float24 Zero() {
-        return FromFloat32(0.f);
-    }
-    // Not recommended for anything but logging
-    float ToFloat32() const {
-        return value;
-    }
-    float24 operator * (const float24& flt) const {
-        if ((this->value == 0.f && !std::isnan(flt.value)) ||
-            (flt.value == 0.f && !std::isnan(this->value)))
-            // PICA gives 0 instead of NaN when multiplying by inf
-            return Zero();
-        return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
-    }
-    float24 operator / (const float24& flt) const {
-        return float24::FromFloat32(ToFloat32() / flt.ToFloat32());
-    }
-    float24 operator + (const float24& flt) const {
-        return float24::FromFloat32(ToFloat32() + flt.ToFloat32());
-    }
-    float24 operator - (const float24& flt) const {
-        return float24::FromFloat32(ToFloat32() - flt.ToFloat32());
-    }
-    float24& operator *= (const float24& flt) {
-        if ((this->value == 0.f && !std::isnan(flt.value)) ||
-            (flt.value == 0.f && !std::isnan(this->value)))
-            // PICA gives 0 instead of NaN when multiplying by inf
-            *this = Zero();
-        else value *= flt.ToFloat32();
-        return *this;
-    }
-    float24& operator /= (const float24& flt) {
-        value /= flt.ToFloat32();
-        return *this;
-    }
-    float24& operator += (const float24& flt) {
-        value += flt.ToFloat32();
-        return *this;
-    }
-    float24& operator -= (const float24& flt) {
-        value -= flt.ToFloat32();
-        return *this;
-    }
-    float24 operator - () const {
-        return float24::FromFloat32(-ToFloat32());
-    }
-    bool operator < (const float24& flt) const {
-        return ToFloat32() < flt.ToFloat32();
-    }
-    bool operator > (const float24& flt) const {
-        return ToFloat32() > flt.ToFloat32();
-    }
-    bool operator >= (const float24& flt) const {
-        return ToFloat32() >= flt.ToFloat32();
-    }
-    bool operator <= (const float24& flt) const {
-        return ToFloat32() <= flt.ToFloat32();
-    }
-    bool operator == (const float24& flt) const {
-        return ToFloat32() == flt.ToFloat32();
-    }
-    bool operator != (const float24& flt) const {
-        return ToFloat32() != flt.ToFloat32();
-    }
-private:
-    // Stored as a regular float, merely for convenience
-    // TODO: Perform proper arithmetic on this!
-    float value;
-};
-static_assert(sizeof(float24) == sizeof(float), "Shader JIT assumes float24 is implemented as a 32-bit float");
 /// Struct used to describe current Pica state
 struct State {
    /// Pica registers
@@ -1163,6 +1317,25 @@ struct State {
    ShaderSetup vs;
    ShaderSetup gs;
+    /// Fragment lighting state that is kept outside of the register file
+    struct {
+        /// One entry of a lighting look-up table, stored exactly as written to the LUT data
+        /// registers
+        union LutEntry {
+            // Used for raw access
+            u32 raw;
+            // LUT value, encoded as 12-bit fixed point, with 12 fraction bits
+            BitField< 0, 12, u32> value;
+            // Used by HW for efficient interpolation, Citra does not use these
+            BitField<12, 12, u32> difference;
+            /// Returns `value` scaled so that the largest 12-bit value (4095) maps to 1.0f.
+            /// Declared const because it only reads the entry, so it can be used on a const State.
+            float ToFloat() const {
+                return static_cast<float>(value) / 4095.f;
+            }
+        };
+        /// 24 tables of 256 entries each, addressed as luts[type][index]
+        std::array<std::array<LutEntry, 256>, 24> luts;
+    } lighting;
    /// Current Pica command list
    struct {
        const u32* head_ptr;
diff --git a/src/video_core/pica_types.h b/src/video_core/pica_types.h
new file mode 100644
index 000000000..ecf45654b
--- /dev/null
+++ b/src/video_core/pica_types.h
@@ -0,0 +1,146 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#pragma once
+#include <cmath>
+#include <cstring>
+#include "common/common_types.h"
+namespace Pica {
+/**
+ * Template class for converting arbitrary Pica float types to IEEE 754 32-bit single-precision
+ * floating point.
+ *
+ * When decoding, format is as follows:
+ *  - The first `M` bits are the mantissa
+ *  - The next `E` bits are the exponent
+ *  - The last bit is the sign bit
+ *
+ * The decoded number is kept in an ordinary `float`, so every operator below works with
+ * single-precision arithmetic rather than with the precision of the Pica format.
+ *
+ * @todo Verify on HW if this conversion is sufficiently accurate.
+ */
+template<unsigned M, unsigned E>
+struct Float {
+    static_assert(M <= 23, "Pica mantissa must fit into the 23-bit IEEE 754 single-precision mantissa");
+public:
+    /// Wraps an ordinary 32-bit float; no conversion or rounding is applied
+    static Float FromFloat32(float val) {
+        Float ret;
+        ret.value = val;
+        return ret;
+    }
+    /**
+     * Decodes a raw Pica float into its 32-bit float representation.
+     * @param hex Raw encoding; only the low `M + E + 1` bits are interpreted
+     * @return The decoded number. An encoding whose exponent and mantissa are both zero decodes
+     *         to zero, carrying the sign of the encoding.
+     */
+    static Float FromRaw(u32 hex) {
+        // Offset that moves the Pica exponent (bias 2^(E-1) - 1) onto the IEEE 754 bias of 127
+        const u32 bias = 128 - (1 << (E - 1));
+        // Shifting by 31 discards everything above the sign bit
+        const u32 sign = (hex >> (E + M)) << 31;
+        const u32 exponent = (hex >> M) & EXPONENT_MASK;
+        const u32 mantissa = hex & MANTISSA_MASK;
+        u32 ieee = sign;
+        if (exponent != 0 || mantissa != 0)
+            ieee |= (mantissa << (23 - M)) | ((exponent + bias) << 23);
+        Float res;
+        // memcpy reinterprets the bit pattern without violating strict aliasing
+        std::memcpy(&res.value, &ieee, sizeof(float));
+        return res;
+    }
+    static Float Zero() {
+        return FromFloat32(0.f);
+    }
+    // Not recommended for anything but logging
+    float ToFloat32() const {
+        return value;
+    }
+    /// Multiplication; shares the zero-handling rule implemented by operator *=
+    Float operator * (const Float& flt) const {
+        Float product = *this;
+        product *= flt;
+        return product;
+    }
+    Float operator / (const Float& flt) const {
+        return FromFloat32(ToFloat32() / flt.ToFloat32());
+    }
+    Float operator + (const Float& flt) const {
+        return FromFloat32(ToFloat32() + flt.ToFloat32());
+    }
+    Float operator - (const Float& flt) const {
+        return FromFloat32(ToFloat32() - flt.ToFloat32());
+    }
+    Float& operator *= (const Float& flt) {
+        if ((this->value == 0.f && !std::isnan(flt.value)) ||
+            (flt.value == 0.f && !std::isnan(this->value)))
+            // PICA gives 0 instead of NaN when multiplying by inf
+            *this = Zero();
+        else
+            value *= flt.ToFloat32();
+        return *this;
+    }
+    Float& operator /= (const Float& flt) {
+        value /= flt.ToFloat32();
+        return *this;
+    }
+    Float& operator += (const Float& flt) {
+        value += flt.ToFloat32();
+        return *this;
+    }
+    Float& operator -= (const Float& flt) {
+        value -= flt.ToFloat32();
+        return *this;
+    }
+    Float operator - () const {
+        return FromFloat32(-ToFloat32());
+    }
+    bool operator < (const Float& flt) const {
+        return ToFloat32() < flt.ToFloat32();
+    }
+    bool operator > (const Float& flt) const {
+        return ToFloat32() > flt.ToFloat32();
+    }
+    bool operator >= (const Float& flt) const {
+        return ToFloat32() >= flt.ToFloat32();
+    }
+    bool operator <= (const Float& flt) const {
+        return ToFloat32() <= flt.ToFloat32();
+    }
+    bool operator == (const Float& flt) const {
+        return ToFloat32() == flt.ToFloat32();
+    }
+    bool operator != (const Float& flt) const {
+        return ToFloat32() != flt.ToFloat32();
+    }
+private:
+    static const unsigned MASK = (1 << (M + E + 1)) - 1;
+    static const unsigned MANTISSA_MASK = (1 << M) - 1;
+    static const unsigned EXPONENT_MASK = (1 << E) - 1;
+    // Stored as a regular float, merely for convenience
+    // TODO: Perform proper arithmetic on this!
+    float value;
+};
+/// 24-bit Pica float: 16-bit mantissa, 7-bit exponent, 1 sign bit
+using float24 = Float<16, 7>;
+/// 20-bit Pica float: 12-bit mantissa, 7-bit exponent, 1 sign bit
+using float20 = Float<12, 7>;
+/// 16-bit Pica float: 10-bit mantissa, 5-bit exponent, 1 sign bit
+using float16 = Float<10, 5>;
+} // namespace Pica
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 291ef737d..b7d19bf94 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -36,7 +36,7 @@ static bool IsPassThroughTevStage(const Pica::Regs::TevStageConfig& stage) {
            stage.GetAlphaMultiplier() == 1);
 }
-RasterizerOpenGL::RasterizerOpenGL() : last_fb_color_addr(0), last_fb_depth_addr(0) { }
+RasterizerOpenGL::RasterizerOpenGL() : cached_fb_color_addr(0), cached_fb_depth_addr(0) { }
 RasterizerOpenGL::~RasterizerOpenGL() { }
 void RasterizerOpenGL::InitObjects() {
@@ -75,6 +75,12 @@ void RasterizerOpenGL::InitObjects() {
    glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD1);
    glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD2);
+    glVertexAttribPointer(GLShader::ATTRIBUTE_NORMQUAT, 4, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, normquat));
+    glEnableVertexAttribArray(GLShader::ATTRIBUTE_NORMQUAT);
+    glVertexAttribPointer(GLShader::ATTRIBUTE_VIEW, 3, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, view));
+    glEnableVertexAttribArray(GLShader::ATTRIBUTE_VIEW);
    SetShader();
    // Create textures for OGL framebuffer that will be rendered to, initially 1x1 to succeed in framebuffer creation
@@ -120,6 +126,19 @@ void RasterizerOpenGL::InitObjects() {
    glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, fb_color_texture.texture.handle, 0);
    glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, fb_depth_texture.texture.handle, 0);
+    for (size_t i = 0; i < lighting_lut.size(); ++i) {
+        lighting_lut[i].Create();
+        state.lighting_lut[i].texture_1d = lighting_lut[i].handle;
+        glActiveTexture(GL_TEXTURE3 + i);
+        glBindTexture(GL_TEXTURE_1D, state.lighting_lut[i].texture_1d);
+        glTexImage1D(GL_TEXTURE_1D, 0, GL_RGBA32F, 256, 0, GL_RGBA, GL_FLOAT, nullptr);
+        glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+        glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+    }
+    state.Apply();
    ASSERT_MSG(glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE,
               "OpenGL rasterizer framebuffer setup failed, status %X", glCheckFramebufferStatus(GL_FRAMEBUFFER));
 }
@@ -139,12 +158,34 @@ void RasterizerOpenGL::Reset() {
    res_cache.InvalidateAll();
 }
+/**
+ * Helper that detects a pair of "opposite" quaternions, which would otherwise be interpolated
+ * incorrectly by OpenGL. Background (explanation by yuriks):
+ *
+ * Every rotation is represented by two quaternions, Q and -Q. Interpolating between two
+ * quaternions that are opposite does not follow the shortest path from one rotation to the other
+ * but the longest one. Two quaternions are opposite when Dot(Q1, Q2) < 0; negating either of them
+ * makes the dot product positive again.
+ *
+ * NOTE: The correction is applied per-vertex, before the quaternions are handed to OpenGL. This
+ * should be correct for nearly all cases; a more correct (but less trivial, and perhaps
+ * unnecessary) implementation would work per-fragment, interpolating the quaternions manually
+ * with two Lerps and applying this correction before each Lerp.
+ *
+ * @param qa First quaternion
+ * @param qb Second quaternion
+ * @return true if the dot product of the two quaternions is negative
+ */
+static bool AreQuaternionsOpposite(Math::Vec4<Pica::float24> qa, Math::Vec4<Pica::float24> qb) {
+    // Unpack both quaternions into plain float vectors so that the dot product can be taken
+    Math::Vec4f lhs{ qa.x.ToFloat32(), qa.y.ToFloat32(), qa.z.ToFloat32(), qa.w.ToFloat32() };
+    Math::Vec4f rhs{ qb.x.ToFloat32(), qb.y.ToFloat32(), qb.z.ToFloat32(), qb.w.ToFloat32() };
+    return Math::Dot(lhs, rhs) < 0.f;
+}
 void RasterizerOpenGL::AddTriangle(const Pica::Shader::OutputVertex& v0,
                                   const Pica::Shader::OutputVertex& v1,
                                   const Pica::Shader::OutputVertex& v2) {
-    vertex_batch.emplace_back(v0);
+    vertex_batch.emplace_back(v0, false);
-    vertex_batch.emplace_back(v1);
+    vertex_batch.emplace_back(v1, AreQuaternionsOpposite(v0.quat, v1.quat));
-    vertex_batch.emplace_back(v2);
+    vertex_batch.emplace_back(v2, AreQuaternionsOpposite(v0.quat, v2.quat));
 }
 void RasterizerOpenGL::DrawTriangles() {
@@ -156,6 +197,13 @@ void RasterizerOpenGL::DrawTriangles() {
        state.draw.shader_dirty = false;
    }
+    for (unsigned index = 0; index < lighting_lut.size(); index++) {
+        if (uniform_block_data.lut_dirty[index]) {
+            SyncLightingLUT(index);
+            uniform_block_data.lut_dirty[index] = false;
+        }
+    }
    if (uniform_block_data.dirty) {
        glBufferData(GL_UNIFORM_BUFFER, sizeof(UniformData), &uniform_block_data.data, GL_STATIC_DRAW);
        uniform_block_data.dirty = false;
@@ -169,16 +217,14 @@ void RasterizerOpenGL::DrawTriangles() {
    // Flush the resource cache at the current depth and color framebuffer addresses for render-to-texture
    const auto& regs = Pica::g_state.regs;
-    PAddr cur_fb_color_addr = regs.framebuffer.GetColorBufferPhysicalAddress();
+    u32 cached_fb_color_size = Pica::Regs::BytesPerColorPixel(fb_color_texture.format)
-    u32 cur_fb_color_size = Pica::Regs::BytesPerColorPixel(regs.framebuffer.color_format)
+                               * fb_color_texture.width * fb_color_texture.height;
-                            * regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight();
-    PAddr cur_fb_depth_addr = regs.framebuffer.GetDepthBufferPhysicalAddress();
+    u32 cached_fb_depth_size = Pica::Regs::BytesPerDepthPixel(fb_depth_texture.format)
-    u32 cur_fb_depth_size = Pica::Regs::BytesPerDepthPixel(regs.framebuffer.depth_format)
+                               * fb_depth_texture.width * fb_depth_texture.height;
-                            * regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight();
-    res_cache.InvalidateInRange(cur_fb_color_addr, cur_fb_color_size, true);
+    res_cache.InvalidateInRange(cached_fb_color_addr, cached_fb_color_size, true);
-    res_cache.InvalidateInRange(cur_fb_depth_addr, cur_fb_depth_size, true);
+    res_cache.InvalidateInRange(cached_fb_depth_addr, cached_fb_depth_size, true);
 }
 void RasterizerOpenGL::FlushFramebuffer() {
@@ -285,44 +331,199 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
    case PICA_REG_INDEX(tev_combiner_buffer_color):
        SyncCombinerColor();
        break;
+    // Fragment lighting specular 0 color
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[0].specular_0, 0x140 + 0 * 0x10):
+        SyncLightSpecular0(0);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[1].specular_0, 0x140 + 1 * 0x10):
+        SyncLightSpecular0(1);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[2].specular_0, 0x140 + 2 * 0x10):
+        SyncLightSpecular0(2);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[3].specular_0, 0x140 + 3 * 0x10):
+        SyncLightSpecular0(3);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[4].specular_0, 0x140 + 4 * 0x10):
+        SyncLightSpecular0(4);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[5].specular_0, 0x140 + 5 * 0x10):
+        SyncLightSpecular0(5);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[6].specular_0, 0x140 + 6 * 0x10):
+        SyncLightSpecular0(6);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[7].specular_0, 0x140 + 7 * 0x10):
+        SyncLightSpecular0(7);
+        break;
+    // Fragment lighting specular 1 color
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[0].specular_1, 0x141 + 0 * 0x10):
+        SyncLightSpecular1(0);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[1].specular_1, 0x141 + 1 * 0x10):
+        SyncLightSpecular1(1);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[2].specular_1, 0x141 + 2 * 0x10):
+        SyncLightSpecular1(2);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[3].specular_1, 0x141 + 3 * 0x10):
+        SyncLightSpecular1(3);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[4].specular_1, 0x141 + 4 * 0x10):
+        SyncLightSpecular1(4);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[5].specular_1, 0x141 + 5 * 0x10):
+        SyncLightSpecular1(5);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[6].specular_1, 0x141 + 6 * 0x10):
+        SyncLightSpecular1(6);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[7].specular_1, 0x141 + 7 * 0x10):
+        SyncLightSpecular1(7);
+        break;
+    // Fragment lighting diffuse color
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[0].diffuse, 0x142 + 0 * 0x10):
+        SyncLightDiffuse(0);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[1].diffuse, 0x142 + 1 * 0x10):
+        SyncLightDiffuse(1);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[2].diffuse, 0x142 + 2 * 0x10):
+        SyncLightDiffuse(2);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[3].diffuse, 0x142 + 3 * 0x10):
+        SyncLightDiffuse(3);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[4].diffuse, 0x142 + 4 * 0x10):
+        SyncLightDiffuse(4);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[5].diffuse, 0x142 + 5 * 0x10):
+        SyncLightDiffuse(5);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[6].diffuse, 0x142 + 6 * 0x10):
+        SyncLightDiffuse(6);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[7].diffuse, 0x142 + 7 * 0x10):
+        SyncLightDiffuse(7);
+        break;
+    // Fragment lighting ambient color
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[0].ambient, 0x143 + 0 * 0x10):
+        SyncLightAmbient(0);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[1].ambient, 0x143 + 1 * 0x10):
+        SyncLightAmbient(1);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[2].ambient, 0x143 + 2 * 0x10):
+        SyncLightAmbient(2);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[3].ambient, 0x143 + 3 * 0x10):
+        SyncLightAmbient(3);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[4].ambient, 0x143 + 4 * 0x10):
+        SyncLightAmbient(4);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[5].ambient, 0x143 + 5 * 0x10):
+        SyncLightAmbient(5);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[6].ambient, 0x143 + 6 * 0x10):
+        SyncLightAmbient(6);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[7].ambient, 0x143 + 7 * 0x10):
+        SyncLightAmbient(7);
+        break;
+    // Fragment lighting position
+    // Each light only has cases for its x and z registers; SyncLightPosition() also reads y,
+    // which presumably shares a register with x (TODO confirm against the register layout).
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[0].x, 0x144 + 0 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[0].z, 0x145 + 0 * 0x10):
+        SyncLightPosition(0);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[1].x, 0x144 + 1 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[1].z, 0x145 + 1 * 0x10):
+        SyncLightPosition(1);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[2].x, 0x144 + 2 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[2].z, 0x145 + 2 * 0x10):
+        SyncLightPosition(2);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[3].x, 0x144 + 3 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[3].z, 0x145 + 3 * 0x10):
+        SyncLightPosition(3);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[4].x, 0x144 + 4 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[4].z, 0x145 + 4 * 0x10):
+        SyncLightPosition(4);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[5].x, 0x144 + 5 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[5].z, 0x145 + 5 * 0x10):
+        SyncLightPosition(5);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[6].x, 0x144 + 6 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[6].z, 0x145 + 6 * 0x10):
+        SyncLightPosition(6);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[7].x, 0x144 + 7 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[7].z, 0x145 + 7 * 0x10):
+        SyncLightPosition(7);
+        break;
+    // Fragment lighting global ambient color (emission + ambient * ambient)
+    case PICA_REG_INDEX_WORKAROUND(lighting.global_ambient, 0x1c0):
+        SyncGlobalAmbient();
+        break;
+    // Fragment lighting lookup tables
+    case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[0], 0x1c8):
+    case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[1], 0x1c9):
+    case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[2], 0x1ca):
+    case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[3], 0x1cb):
+    case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[4], 0x1cc):
+    case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[5], 0x1cd):
+    case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[6], 0x1ce):
+    case PICA_REG_INDEX_WORKAROUND(lighting.lut_data[7], 0x1cf):
+    {
+        // Four LUTs are packed into each cached table (see SyncLightingLUT), so the selected LUT
+        // type maps to table index type / 4. Only the dirty flag is set here; the actual upload
+        // happens when DrawTriangles() processes the dirty flags.
+        const auto& lut_config = regs.lighting.lut_config;
+        const unsigned lut_index = lut_config.type / 4;
+        // Ignore LUT types beyond the cached tables instead of writing past the end of lut_dirty
+        if (lut_index < lighting_lut.size())
+            uniform_block_data.lut_dirty[lut_index] = true;
+        break;
+    }
    }
 }
 void RasterizerOpenGL::FlushRegion(PAddr addr, u32 size) {
-    const auto& regs = Pica::g_state.regs;
-    PAddr cur_fb_color_addr = regs.framebuffer.GetColorBufferPhysicalAddress();
-    u32 cur_fb_color_size = Pica::Regs::BytesPerColorPixel(regs.framebuffer.color_format)
-                            * regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight();
-    PAddr cur_fb_depth_addr = regs.framebuffer.GetDepthBufferPhysicalAddress();
-    u32 cur_fb_depth_size = Pica::Regs::BytesPerDepthPixel(regs.framebuffer.depth_format)
-                            * regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight();
+    // Sizes are computed from the cached framebuffer textures (format and dimensions), so they
+    // describe the same buffers as cached_fb_color_addr / cached_fb_depth_addr. The PICA
+    // registers are no longer read here, which is why the local 'regs' reference is dropped.
+    u32 cached_fb_color_size = Pica::Regs::BytesPerColorPixel(fb_color_texture.format)
+                               * fb_color_texture.width * fb_color_texture.height;
+    u32 cached_fb_depth_size = Pica::Regs::BytesPerDepthPixel(fb_depth_texture.format)
+                               * fb_depth_texture.width * fb_depth_texture.height;
    // If source memory region overlaps 3DS framebuffers, commit them before the copy happens
-    if (MathUtil::IntervalsIntersect(addr, size, cur_fb_color_addr, cur_fb_color_size))
+    if (MathUtil::IntervalsIntersect(addr, size, cached_fb_color_addr, cached_fb_color_size))
        CommitColorBuffer();
-    if (MathUtil::IntervalsIntersect(addr, size, cur_fb_depth_addr, cur_fb_depth_size))
+    if (MathUtil::IntervalsIntersect(addr, size, cached_fb_depth_addr, cached_fb_depth_size))
        CommitDepthBuffer();
 }
 void RasterizerOpenGL::InvalidateRegion(PAddr addr, u32 size) {
    const auto& regs = Pica::g_state.regs;
-    PAddr cur_fb_color_addr = regs.framebuffer.GetColorBufferPhysicalAddress();
+    u32 cached_fb_color_size = Pica::Regs::BytesPerColorPixel(fb_color_texture.format)
-    u32 cur_fb_color_size = Pica::Regs::BytesPerColorPixel(regs.framebuffer.color_format)
+                               * fb_color_texture.width * fb_color_texture.height;
-                            * regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight();
-    PAddr cur_fb_depth_addr = regs.framebuffer.GetDepthBufferPhysicalAddress();
+    u32 cached_fb_depth_size = Pica::Regs::BytesPerDepthPixel(fb_depth_texture.format)
-    u32 cur_fb_depth_size = Pica::Regs::BytesPerDepthPixel(regs.framebuffer.depth_format)
+                               * fb_depth_texture.width * fb_depth_texture.height;
-                            * regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight();
    // If modified memory region overlaps 3DS framebuffers, reload their contents into OpenGL
-    if (MathUtil::IntervalsIntersect(addr, size, cur_fb_color_addr, cur_fb_color_size))
+    if (MathUtil::IntervalsIntersect(addr, size, cached_fb_color_addr, cached_fb_color_size))
        ReloadColorBuffer();
-    if (MathUtil::IntervalsIntersect(addr, size, cur_fb_depth_addr, cur_fb_depth_size))
+    if (MathUtil::IntervalsIntersect(addr, size, cached_fb_depth_addr, cached_fb_depth_size))
        ReloadDepthBuffer();
    // Notify cache of flush in case the region touches a cached resource
@@ -497,27 +698,48 @@ void RasterizerOpenGL::SetShader() {
        uniform_tex = glGetUniformLocation(shader->shader.handle, "tex[2]");
        if (uniform_tex != -1) { glUniform1i(uniform_tex, 2); }
+        // Set the texture samplers to correspond to different lookup table texture units:
+        // sampler lut[i] reads from texture unit 3 + i, matching the GL_TEXTURE3 + lut_index
+        // unit selected in SyncLightingLUT(). glGetUniformLocation returns a signed GLint and
+        // reports a missing uniform as -1, so the location is kept signed.
+        static const char* const lut_sampler_names[] = {
+            "lut[0]", "lut[1]", "lut[2]", "lut[3]", "lut[4]", "lut[5]",
+        };
+        for (GLint lut_index = 0; lut_index < 6; ++lut_index) {
+            const GLint uniform_lut = glGetUniformLocation(shader->shader.handle, lut_sampler_names[lut_index]);
+            if (uniform_lut != -1) { glUniform1i(uniform_lut, 3 + lut_index); }
+        }
        current_shader = shader_cache.emplace(config, std::move(shader)).first->second.get();
        unsigned int block_index = glGetUniformBlockIndex(current_shader->shader.handle, "shader_data");
        glUniformBlockBinding(current_shader->shader.handle, block_index, 0);
-    }
-    // Update uniforms
+        // Update uniforms
-    SyncAlphaTest();
+        SyncAlphaTest();
-    SyncCombinerColor();
+        SyncCombinerColor();
-    auto& tev_stages = Pica::g_state.regs.GetTevStages();
+        auto& tev_stages = Pica::g_state.regs.GetTevStages();
-    for (int index = 0; index < tev_stages.size(); ++index)
+        for (int index = 0; index < tev_stages.size(); ++index)
-        SyncTevConstColor(index, tev_stages[index]);
+            SyncTevConstColor(index, tev_stages[index]);
+        SyncGlobalAmbient();
+        // Sync every per-light uniform for the new shader. Both specular colors are included so
+        // that all light_src fields are refreshed from the registers, not only the ones that
+        // happen to be rewritten later through NotifyPicaRegisterChanged().
+        for (int light_index = 0; light_index < 8; light_index++) {
+            SyncLightSpecular0(light_index);
+            SyncLightSpecular1(light_index);
+            SyncLightDiffuse(light_index);
+            SyncLightAmbient(light_index);
+            SyncLightPosition(light_index);
+        }
+    }
 }
 void RasterizerOpenGL::SyncFramebuffer() {
    const auto& regs = Pica::g_state.regs;
-    PAddr cur_fb_color_addr = regs.framebuffer.GetColorBufferPhysicalAddress();
+    PAddr new_fb_color_addr = regs.framebuffer.GetColorBufferPhysicalAddress();
    Pica::Regs::ColorFormat new_fb_color_format = regs.framebuffer.color_format;
-    PAddr cur_fb_depth_addr = regs.framebuffer.GetDepthBufferPhysicalAddress();
+    PAddr new_fb_depth_addr = regs.framebuffer.GetDepthBufferPhysicalAddress();
    Pica::Regs::DepthFormat new_fb_depth_format = regs.framebuffer.depth_format;
    bool fb_size_changed = fb_color_texture.width != static_cast<GLsizei>(regs.framebuffer.GetWidth()) ||
@@ -529,10 +751,10 @@ void RasterizerOpenGL::SyncFramebuffer() {
    bool depth_fb_prop_changed = fb_depth_texture.format != new_fb_depth_format ||
                                 fb_size_changed;
-    bool color_fb_modified = last_fb_color_addr != cur_fb_color_addr ||
+    bool color_fb_modified = cached_fb_color_addr != new_fb_color_addr ||
                             color_fb_prop_changed;
-    bool depth_fb_modified = last_fb_depth_addr != cur_fb_depth_addr ||
+    bool depth_fb_modified = cached_fb_depth_addr != new_fb_depth_addr ||
                             depth_fb_prop_changed;
    // Commit if framebuffer modified in any way
@@ -572,13 +794,13 @@ void RasterizerOpenGL::SyncFramebuffer() {
    // Load buffer data again if fb modified in any way
    if (color_fb_modified) {
-        last_fb_color_addr = cur_fb_color_addr;
+        cached_fb_color_addr = new_fb_color_addr;
        ReloadColorBuffer();
    }
    if (depth_fb_modified) {
-        last_fb_depth_addr = cur_fb_depth_addr;
+        cached_fb_depth_addr = new_fb_depth_addr;
        ReloadDepthBuffer();
    }
@@ -610,8 +832,8 @@ void RasterizerOpenGL::SyncCullMode() {
 }
 void RasterizerOpenGL::SyncDepthModifiers() {
-    float depth_scale = -Pica::float24::FromRawFloat24(Pica::g_state.regs.viewport_depth_range).ToFloat32();
+    float depth_scale = -Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_range).ToFloat32();
-    float depth_offset = Pica::float24::FromRawFloat24(Pica::g_state.regs.viewport_depth_far_plane).ToFloat32() / 2.0f;
+    float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_far_plane).ToFloat32() / 2.0f;
    // TODO: Implement scale modifier
    uniform_block_data.data.depth_offset = depth_offset;
@@ -689,12 +911,81 @@ void RasterizerOpenGL::SyncTevConstColor(int stage_index, const Pica::Regs::TevS
    }
 }
+void RasterizerOpenGL::SyncGlobalAmbient() {
+    // Convert the register value and only touch the uniform block when it actually changed
+    auto& cached_ambient = uniform_block_data.data.lighting_global_ambient;
+    auto new_ambient = PicaToGL::LightColor(Pica::g_state.regs.lighting.global_ambient);
+    if (new_ambient != cached_ambient) {
+        cached_ambient = new_ambient;
+        uniform_block_data.dirty = true;
+    }
+}
+void RasterizerOpenGL::SyncLightingLUT(unsigned lut_index) {
+    // Pack four consecutive PICA LUTs into the four components of each of the 256 entries
+    std::array<GLvec4, 256> packed;
+    for (unsigned entry = 0; entry < packed.size(); ++entry) {
+        for (unsigned component = 0; component < 4; ++component) {
+            packed[entry][component] = Pica::g_state.lighting.luts[(lut_index * 4) + component][entry].ToFloat();
+        }
+    }
+    // Skip the texture upload when the packed table matches the cached copy
+    auto& cached = lighting_lut_data[lut_index];
+    if (packed != cached) {
+        cached = packed;
+        // NOTE(review): assumes the 1D LUT texture is already bound on this texture unit - confirm
+        glActiveTexture(GL_TEXTURE3 + lut_index);
+        glTexSubImage1D(GL_TEXTURE_1D, 0, 0, 256, GL_RGBA, GL_FLOAT, cached.data());
+    }
+}
+void RasterizerOpenGL::SyncLightSpecular0(int light_index) {
+    // Update the cached specular 0 color of this light and flag the uniform block on change
+    auto& cached_color = uniform_block_data.data.light_src[light_index].specular_0;
+    auto new_color = PicaToGL::LightColor(Pica::g_state.regs.lighting.light[light_index].specular_0);
+    if (new_color != cached_color) {
+        cached_color = new_color;
+        uniform_block_data.dirty = true;
+    }
+}
+void RasterizerOpenGL::SyncLightSpecular1(int light_index) {
+    // Update the cached specular 1 color of this light and flag the uniform block on change
+    auto& cached_color = uniform_block_data.data.light_src[light_index].specular_1;
+    auto new_color = PicaToGL::LightColor(Pica::g_state.regs.lighting.light[light_index].specular_1);
+    if (new_color != cached_color) {
+        cached_color = new_color;
+        uniform_block_data.dirty = true;
+    }
+}
+void RasterizerOpenGL::SyncLightDiffuse(int light_index) {
+    // Update the cached diffuse color of this light and flag the uniform block on change
+    auto& cached_color = uniform_block_data.data.light_src[light_index].diffuse;
+    auto new_color = PicaToGL::LightColor(Pica::g_state.regs.lighting.light[light_index].diffuse);
+    if (new_color != cached_color) {
+        cached_color = new_color;
+        uniform_block_data.dirty = true;
+    }
+}
+void RasterizerOpenGL::SyncLightAmbient(int light_index) {
+    // Update the cached ambient color of this light and flag the uniform block on change
+    auto& cached_color = uniform_block_data.data.light_src[light_index].ambient;
+    auto new_color = PicaToGL::LightColor(Pica::g_state.regs.lighting.light[light_index].ambient);
+    if (new_color != cached_color) {
+        cached_color = new_color;
+        uniform_block_data.dirty = true;
+    }
+}
+void RasterizerOpenGL::SyncLightPosition(int light_index) {
+    // Decode the three float16 position components of this light from the PICA registers
+    const auto& light = Pica::g_state.regs.lighting.light[light_index];
+    GLvec3 new_position = {
+        Pica::float16::FromRaw(light.x).ToFloat32(),
+        Pica::float16::FromRaw(light.y).ToFloat32(),
+        Pica::float16::FromRaw(light.z).ToFloat32() };
+    // Only flag the uniform block for re-upload when the position actually changed
+    auto& cached_position = uniform_block_data.data.light_src[light_index].position;
+    if (new_position != cached_position) {
+        cached_position = new_position;
+        uniform_block_data.dirty = true;
+    }
+}
 void RasterizerOpenGL::SyncDrawState() {
    const auto& regs = Pica::g_state.regs;
    // Sync the viewport
-    GLsizei viewport_width = (GLsizei)Pica::float24::FromRawFloat24(regs.viewport_size_x).ToFloat32() * 2;
+    GLsizei viewport_width = (GLsizei)Pica::float24::FromRaw(regs.viewport_size_x).ToFloat32() * 2;
-    GLsizei viewport_height = (GLsizei)Pica::float24::FromRawFloat24(regs.viewport_size_y).ToFloat32() * 2;
+    GLsizei viewport_height = (GLsizei)Pica::float24::FromRaw(regs.viewport_size_y).ToFloat32() * 2;
    // OpenGL uses different y coordinates, so negate corner offset and flip origin
    // TODO: Ensure viewport_corner.x should not be negated or origin flipped
@@ -723,7 +1014,7 @@ void RasterizerOpenGL::SyncDrawState() {
 MICROPROFILE_DEFINE(OpenGL_FramebufferReload, "OpenGL", "FB Reload", MP_RGB(70, 70, 200));
 void RasterizerOpenGL::ReloadColorBuffer() {
-    u8* color_buffer = Memory::GetPhysicalPointer(Pica::g_state.regs.framebuffer.GetColorBufferPhysicalAddress());
+    u8* color_buffer = Memory::GetPhysicalPointer(cached_fb_color_addr);
    if (color_buffer == nullptr)
        return;
@@ -758,13 +1049,11 @@ void RasterizerOpenGL::ReloadColorBuffer() {
 }
 void RasterizerOpenGL::ReloadDepthBuffer() {
-    PAddr depth_buffer_addr = Pica::g_state.regs.framebuffer.GetDepthBufferPhysicalAddress();
+    if (cached_fb_depth_addr == 0)
-    if (depth_buffer_addr == 0)
        return;
    // TODO: Appears to work, but double-check endianness of depth values and order of depth-stencil
-    u8* depth_buffer = Memory::GetPhysicalPointer(depth_buffer_addr);
+    u8* depth_buffer = Memory::GetPhysicalPointer(cached_fb_depth_addr);
    if (depth_buffer == nullptr)
        return;
@@ -827,8 +1116,8 @@ Common::Profiling::TimingCategory buffer_commit_category("Framebuffer Commit");
 MICROPROFILE_DEFINE(OpenGL_FramebufferCommit, "OpenGL", "FB Commit", MP_RGB(70, 70, 200));
 void RasterizerOpenGL::CommitColorBuffer() {
-    if (last_fb_color_addr != 0) {
+    if (cached_fb_color_addr != 0) {
-        u8* color_buffer = Memory::GetPhysicalPointer(last_fb_color_addr);
+        u8* color_buffer = Memory::GetPhysicalPointer(cached_fb_color_addr);
        if (color_buffer != nullptr) {
            Common::Profiling::ScopeTimer timer(buffer_commit_category);
@@ -863,9 +1152,9 @@ void RasterizerOpenGL::CommitColorBuffer() {
 }
 void RasterizerOpenGL::CommitDepthBuffer() {
-    if (last_fb_depth_addr != 0) {
+    if (cached_fb_depth_addr != 0) {
        // TODO: Output seems correct visually, but doesn't quite match sw renderer output. One of them is wrong.
-        u8* depth_buffer = Memory::GetPhysicalPointer(last_fb_depth_addr);
+        u8* depth_buffer = Memory::GetPhysicalPointer(cached_fb_depth_addr);
        if (depth_buffer != nullptr) {
            Common::Profiling::ScopeTimer timer(buffer_commit_category);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index c8a2d8f16..fef5f5331 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -17,6 +17,7 @@
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_state.h"
+#include "video_core/renderer_opengl/pica_to_gl.h"
 #include "video_core/shader/shader_interpreter.h"
 /**
@@ -71,6 +72,59 @@ struct PicaShaderConfig {
            regs.tev_combiner_buffer_input.update_mask_rgb.Value() |
            regs.tev_combiner_buffer_input.update_mask_a.Value() << 4;
+        // Fragment lighting
+        // The registers store a 'disable' flag and a light count minus one; convert both here
+        res.lighting.enable = !regs.lighting.disable;
+        res.lighting.src_num = regs.lighting.num_lights + 1;
+        // For each active light slot, look up which hardware light it refers to and record the
+        // per-light settings that are part of the shader configuration
+        for (unsigned light_index = 0; light_index < res.lighting.src_num; ++light_index) {
+            unsigned num = regs.lighting.light_enable.GetNum(light_index);
+            const auto& light = regs.lighting.light[num];
+            res.lighting.light[light_index].num = num;
+            res.lighting.light[light_index].directional = light.directional != 0;
+            res.lighting.light[light_index].two_sided_diffuse = light.two_sided_diffuse != 0;
+            res.lighting.light[light_index].dist_atten_enable = !regs.lighting.IsDistAttenDisabled(num);
+            res.lighting.light[light_index].dist_atten_bias = Pica::float20::FromRaw(light.dist_atten_bias).ToFloat32();
+            res.lighting.light[light_index].dist_atten_scale = Pica::float20::FromRaw(light.dist_atten_scale).ToFloat32();
+        }
+        // Per-LUT settings (d0, d1, fr, rr, rg, rb). The enable and abs_input values are derived
+        // from 'disable' register bits, hence the comparisons against zero.
+        res.lighting.lut_d0.enable = regs.lighting.disable_lut_d0 == 0;
+        res.lighting.lut_d0.abs_input = regs.lighting.abs_lut_input.disable_d0 == 0;
+        res.lighting.lut_d0.type = regs.lighting.lut_input.d0.Value();
+        res.lighting.lut_d0.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d0);
+        res.lighting.lut_d1.enable = regs.lighting.disable_lut_d1 == 0;
+        res.lighting.lut_d1.abs_input = regs.lighting.abs_lut_input.disable_d1 == 0;
+        res.lighting.lut_d1.type = regs.lighting.lut_input.d1.Value();
+        res.lighting.lut_d1.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d1);
+        res.lighting.lut_fr.enable = regs.lighting.disable_lut_fr == 0;
+        res.lighting.lut_fr.abs_input = regs.lighting.abs_lut_input.disable_fr == 0;
+        res.lighting.lut_fr.type = regs.lighting.lut_input.fr.Value();
+        res.lighting.lut_fr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.fr);
+        res.lighting.lut_rr.enable = regs.lighting.disable_lut_rr == 0;
+        res.lighting.lut_rr.abs_input = regs.lighting.abs_lut_input.disable_rr == 0;
+        res.lighting.lut_rr.type = regs.lighting.lut_input.rr.Value();
+        res.lighting.lut_rr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rr);
+        res.lighting.lut_rg.enable = regs.lighting.disable_lut_rg == 0;
+        res.lighting.lut_rg.abs_input = regs.lighting.abs_lut_input.disable_rg == 0;
+        res.lighting.lut_rg.type = regs.lighting.lut_input.rg.Value();
+        res.lighting.lut_rg.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rg);
+        res.lighting.lut_rb.enable = regs.lighting.disable_lut_rb == 0;
+        res.lighting.lut_rb.abs_input = regs.lighting.abs_lut_input.disable_rb == 0;
+        res.lighting.lut_rb.type = regs.lighting.lut_input.rb.Value();
+        res.lighting.lut_rb.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rb);
+        // Remaining global lighting settings, copied (or inverted from 'disable' bits) as-is
+        res.lighting.config = regs.lighting.config;
+        res.lighting.fresnel_selector = regs.lighting.fresnel_selector;
+        res.lighting.bump_mode = regs.lighting.bump_mode;
+        res.lighting.bump_selector = regs.lighting.bump_selector;
+        res.lighting.bump_renorm = regs.lighting.disable_bump_renorm == 0;
+        res.lighting.clamp_highlights = regs.lighting.clamp_highlights != 0;
        return res;
    }
@@ -86,9 +140,37 @@ struct PicaShaderConfig {
        return std::memcmp(this, &o, sizeof(PicaShaderConfig)) == 0;
    };
-    Pica::Regs::CompareFunc alpha_test_func;
+    Pica::Regs::CompareFunc alpha_test_func = Pica::Regs::CompareFunc::Never;
    std::array<Pica::Regs::TevStageConfig, 6> tev_stages = {};
-    u8 combiner_buffer_input;
+    u8 combiner_buffer_input = 0;
+    // Fragment lighting state that is part of the shader configuration.
+    // NOTE(review): this config is compared with std::memcmp over the whole object, and the bool
+    // members followed by wider members below will normally leave padding bytes in the layout -
+    // confirm that instances are zero-filled before their fields are assigned.
+    struct {
+        // Per-slot light settings; 'num' is the index of the hardware light used by the slot
+        struct {
+            unsigned num = 0;
+            bool directional = false;
+            bool two_sided_diffuse = false;
+            bool dist_atten_enable = false;
+            GLfloat dist_atten_scale = 0.0f;
+            GLfloat dist_atten_bias = 0.0f;
+        } light[8];
+        bool enable = false;
+        // Number of entries of 'light' that are filled in
+        unsigned src_num = 0;
+        Pica::Regs::LightingBumpMode bump_mode = Pica::Regs::LightingBumpMode::None;
+        // Texture unit index that bump mapping samples from
+        unsigned bump_selector = 0;
+        bool bump_renorm = false;
+        bool clamp_highlights = false;
+        Pica::Regs::LightingConfig config = Pica::Regs::LightingConfig::Config0;
+        Pica::Regs::LightingFresnelSelector fresnel_selector = Pica::Regs::LightingFresnelSelector::None;
+        // Settings for each of the six lookup tables
+        struct {
+            bool enable = false;
+            bool abs_input = false;
+            Pica::Regs::LightingLutInput type = Pica::Regs::LightingLutInput::NH;
+            float scale = 1.0f;
+        } lut_d0, lut_d1, lut_fr, lut_rr, lut_rg, lut_rb;
+    } lighting;
 };
 namespace std {
@@ -167,7 +249,7 @@ private:
    /// Structure that the hardware rendered vertices are composed of
    struct HardwareVertex {
-        HardwareVertex(const Pica::Shader::OutputVertex& v) {
+        HardwareVertex(const Pica::Shader::OutputVertex& v, bool flip_quaternion) {
            position[0] = v.pos.x.ToFloat32();
            position[1] = v.pos.y.ToFloat32();
            position[2] = v.pos.z.ToFloat32();
@@ -182,6 +264,19 @@ private:
            tex_coord1[1] = v.tc1.y.ToFloat32();
            tex_coord2[0] = v.tc2.x.ToFloat32();
            tex_coord2[1] = v.tc2.y.ToFloat32();
+            // Copy the normal quaternion and view vector of the vertex as 32-bit floats
+            normquat[0] = v.quat.x.ToFloat32();
+            normquat[1] = v.quat.y.ToFloat32();
+            normquat[2] = v.quat.z.ToFloat32();
+            normquat[3] = v.quat.w.ToFloat32();
+            view[0] = v.view.x.ToFloat32();
+            view[1] = v.view.y.ToFloat32();
+            view[2] = v.view.z.ToFloat32();
+            // Negate all four quaternion components when requested by the caller.
+            // NOTE(review): q and -q describe the same rotation; presumably this keeps
+            // interpolation between the vertices of a triangle consistent - confirm at the call site.
+            if (flip_quaternion) {
+                for (float& x : normquat) {
+                    x = -x;
+                }
+            }
        }
        GLfloat position[4];
@@ -189,20 +284,31 @@ private:
        GLfloat tex_coord0[2];
        GLfloat tex_coord1[2];
        GLfloat tex_coord2[2];
+        GLfloat normquat[4];
+        GLfloat view[3];
+    };
+    /// Per-light data stored in the uniform block. Every vec3 member starts on a 16-byte
+    /// boundary; presumably this mirrors the layout of the matching structure in the shader
+    /// (the size of the enclosing UniformData is pinned by a static_assert).
+    struct LightSrc {
+        alignas(16) GLvec3 specular_0;
+        alignas(16) GLvec3 specular_1;
+        alignas(16) GLvec3 diffuse;
+        alignas(16) GLvec3 ambient;
+        alignas(16) GLvec3 position;
    };
    /// Uniform structure for the Uniform Buffer Object, all members must be 16-byte aligned
    struct UniformData {
        // A vec4 color for each of the six tev stages
-        std::array<GLfloat, 4> const_color[6];
+        GLvec4 const_color[6];
-        std::array<GLfloat, 4> tev_combiner_buffer_color;
+        GLvec4 tev_combiner_buffer_color;
        GLint alphatest_ref;
        GLfloat depth_offset;
-        INSERT_PADDING_BYTES(8);
+        alignas(16) GLvec3 lighting_global_ambient;
+        LightSrc light_src[8];
    };
-    static_assert(sizeof(UniformData) == 0x80, "The size of the UniformData structure has changed, update the structure in the shader");
+    static_assert(sizeof(UniformData) == 0x310, "The size of the UniformData structure has changed, update the structure in the shader");
-    static_assert(sizeof(UniformData) < 16000, "UniformData structure must be less than 16kb as per the OpenGL spec");
+    static_assert(sizeof(UniformData) < 16384, "UniformData structure must be less than 16kb as per the OpenGL spec");
    /// Reconfigure the OpenGL color texture to use the given format and dimensions
    void ReconfigureColorTexture(TextureInfo& texture, Pica::Regs::ColorFormat format, u32 width, u32 height);
@@ -249,6 +355,27 @@ private:
    /// Syncs the TEV combiner color buffer to match the PICA register
    void SyncCombinerColor();
+    /// Syncs the lighting global ambient color to match the PICA register
+    void SyncGlobalAmbient();
+    /// Syncs the lighting lookup tables
+    void SyncLightingLUT(unsigned index);
+    /// Syncs the specified light's diffuse color to match the PICA register
+    void SyncLightDiffuse(int light_index);
+    /// Syncs the specified light's ambient color to match the PICA register
+    void SyncLightAmbient(int light_index);
+    /// Syncs the specified light's position to match the PICA register
+    void SyncLightPosition(int light_index);
+    /// Syncs the specified light's specular 0 color to match the PICA register
+    void SyncLightSpecular0(int light_index);
+    /// Syncs the specified light's specular 1 color to match the PICA register
+    void SyncLightSpecular1(int light_index);
    /// Syncs the remaining OpenGL drawing state to match the current PICA state
    void SyncDrawState();
@@ -278,8 +405,8 @@ private:
    OpenGLState state;
-    PAddr last_fb_color_addr;
+    PAddr cached_fb_color_addr;
-    PAddr last_fb_depth_addr;
+    PAddr cached_fb_depth_addr;
    // Hardware rasterizer
    std::array<SamplerInfo, 3> texture_samplers;
@@ -291,6 +418,7 @@ private:
    struct {
        UniformData data;
+        bool lut_dirty[6];
        bool dirty;
    } uniform_block_data;
@@ -298,4 +426,7 @@ private:
    OGLBuffer vertex_buffer;
    OGLBuffer uniform_buffer;
    OGLFramebuffer framebuffer;
+    std::array<OGLTexture, 6> lighting_lut;
+    std::array<std::array<GLvec4, 256>, 6> lighting_lut_data;
 };
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 22022f7f4..ee4b54ab9 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -32,12 +32,10 @@ static void AppendSource(std::string& out, TevStageConfig::Source source,
        out += "primary_color";
        break;
    case Source::PrimaryFragmentColor:
-        // HACK: Until we implement fragment lighting, use primary_color
+        out += "primary_fragment_color";
-        out += "primary_color";
        break;
    case Source::SecondaryFragmentColor:
-        // HACK: Until we implement fragment lighting, use zero
+        out += "secondary_fragment_color";
-        out += "vec4(0.0)";
        break;
    case Source::Texture0:
        out += "texture(tex[0], texcoord[0])";
@@ -320,26 +318,229 @@ static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsi
        out += "next_combiner_buffer.a = last_tex_env_out.a;\n";
 }
+/**
+ * Writes the GLSL code that emulates fragment lighting to the end of the shader source.
+ *
+ * The emitted code accumulates the contribution of every enabled light into diffuse_sum and
+ * specular_sum, then stores the clamped results in primary_fragment_color and
+ * secondary_fragment_color. It references normquat, view, tex, texcoord, lut, light_src,
+ * lighting_global_ambient, FLOAT_255 and quaternion_rotate, so those must already be declared
+ * earlier in the generated shader.
+ *
+ * @param out Shader source string that the generated code is appended to
+ * @param config Shader configuration; only config.lighting is read here
+ */
+static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
+    // Define lighting globals
+    out += "vec4 diffuse_sum = vec4(0.0, 0.0, 0.0, 1.0);\n"
+           "vec4 specular_sum = vec4(0.0, 0.0, 0.0, 1.0);\n"
+           "vec3 light_vector = vec3(0.0);\n"
+           "vec3 refl_value = vec3(0.0);\n";
+    // Compute fragment normals
+    if (config.lighting.bump_mode == Pica::Regs::LightingBumpMode::NormalMap) {
+        // Bump mapping is enabled using a normal map, read perturbation vector from the selected texture
+        std::string bump_selector = std::to_string(config.lighting.bump_selector);
+        out += "vec3 surface_normal = 2.0 * texture(tex[" + bump_selector + "], texcoord[" + bump_selector + "]).rgb - 1.0;\n";
+        // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher precision result
+        if (config.lighting.bump_renorm) {
+            std::string val = "(1.0 - (surface_normal.x*surface_normal.x + surface_normal.y*surface_normal.y))";
+            out += "surface_normal.z = sqrt(max(" + val + ", 0.0));\n";
+        }
+    } else if (config.lighting.bump_mode == Pica::Regs::LightingBumpMode::TangentMap) {
+        // Bump mapping is enabled using a tangent map
+        LOG_CRITICAL(HW_GPU, "unimplemented bump mapping mode (tangent mapping)");
+        UNIMPLEMENTED();
+    } else {
+        // No bump mapping - surface local normal is just a unit normal
+        out += "vec3 surface_normal = vec3(0.0, 0.0, 1.0);\n";
+    }
+    // Rotate the surface-local normal by the interpolated normal quaternion to convert it to eyespace
+    out += "vec3 normal = normalize(quaternion_rotate(normquat, surface_normal));\n";
+    // Gets the index into the specified lookup table for specular lighting.
+    // 'light_index' is the position of the light in config.lighting.light (the same index the
+    // per-light loop below iterates over), NOT the hardware light number stored in its 'num' field.
+    // The config is captured by reference: the lambda never outlives this function, so there is
+    // no need to copy the whole shader configuration.
+    auto GetLutIndex = [&config](unsigned light_index, Regs::LightingLutInput input, bool abs) {
+        const std::string half_angle = "normalize(normalize(view) + light_vector)";
+        std::string index;
+        switch (input) {
+        case Regs::LightingLutInput::NH:
+            index = "dot(normal, " + half_angle + ")";
+            break;
+        case Regs::LightingLutInput::VH:
+            index = std::string("dot(normalize(view), " + half_angle + ")");
+            break;
+        case Regs::LightingLutInput::NV:
+            index = std::string("dot(normal, normalize(view))");
+            break;
+        case Regs::LightingLutInput::LN:
+            index = std::string("dot(light_vector, normal)");
+            break;
+        default:
+            LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %d\n", (int)input);
+            UNIMPLEMENTED();
+            break;
+        }
+        if (abs) {
+            // LUT index is in the range of (0.0, 1.0)
+            index = config.lighting.light[light_index].two_sided_diffuse ? "abs(" + index + ")" : "max(" + index + ", 0.f)";
+            return "(FLOAT_255 * clamp(" + index + ", 0.0, 1.0))";
+        } else {
+            // LUT index is in the range of (-1.0, 1.0); negative inputs are shifted up by 2.0 so
+            // that they land in the upper half of the table
+            index = "clamp(" + index + ", -1.0, 1.0)";
+            return "(FLOAT_255 * ((" + index + " < 0) ? " + index + " + 2.0 : " + index + ") / 2.0)";
+        }
+    };
+    // Gets the lighting lookup table value given the specified sampler and index.
+    // Four samplers are packed into the components of each LUT texture: sampler / 4 selects the
+    // texture and sampler & 3 selects the component.
+    auto GetLutValue = [](Regs::LightingSampler sampler, const std::string& lut_index) {
+        return std::string("texture(lut[" + std::to_string((unsigned)sampler / 4) + "], " +
+                           lut_index + ")[" + std::to_string((unsigned)sampler & 3) + "]");
+    };
+    // Write the code to emulate each enabled light
+    for (unsigned light_index = 0; light_index < config.lighting.src_num; ++light_index) {
+        const auto& light_config = config.lighting.light[light_index];
+        std::string light_src = "light_src[" + std::to_string(light_config.num) + "]";
+        // Compute light vector (directional or positional)
+        if (light_config.directional)
+            out += "light_vector = normalize(" + light_src + ".position);\n";
+        else
+            out += "light_vector = normalize(" + light_src + ".position + view);\n";
+        // Compute dot product of light_vector and normal, adjust if lighting is one-sided or two-sided
+        std::string dot_product = light_config.two_sided_diffuse ? "abs(dot(light_vector, normal))" : "max(dot(light_vector, normal), 0.0)";
+        // If enabled, compute distance attenuation value
+        std::string dist_atten = "1.0";
+        if (light_config.dist_atten_enable) {
+            std::string scale = std::to_string(light_config.dist_atten_scale);
+            std::string bias = std::to_string(light_config.dist_atten_bias);
+            std::string index = "(" + scale + " * length(-view - " + light_src + ".position) + " + bias + ")";
+            index = "((clamp(" + index + ", 0.0, FLOAT_255)))";
+            const unsigned lut_num = ((unsigned)Regs::LightingSampler::DistanceAttenuation + light_config.num);
+            dist_atten = GetLutValue((Regs::LightingSampler)lut_num, index);
+        }
+        // If enabled, clamp specular component if lighting result is negative
+        std::string clamp_highlights = config.lighting.clamp_highlights ? "(dot(light_vector, normal) <= 0.0 ? 0.0 : 1.0)" : "1.0";
+        // Specular 0 component
+        std::string d0_lut_value = "1.0";
+        if (config.lighting.lut_d0.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Distribution0)) {
+            // Lookup specular "distribution 0" LUT value
+            std::string index = GetLutIndex(light_index, config.lighting.lut_d0.type, config.lighting.lut_d0.abs_input);
+            d0_lut_value = "(" + std::to_string(config.lighting.lut_d0.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution0, index) + ")";
+        }
+        std::string specular_0 = "(" + d0_lut_value + " * " + light_src + ".specular_0)";
+        // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
+        if (config.lighting.lut_rr.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectRed)) {
+            std::string index = GetLutIndex(light_index, config.lighting.lut_rr.type, config.lighting.lut_rr.abs_input);
+            std::string value = "(" + std::to_string(config.lighting.lut_rr.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectRed, index) + ")";
+            out += "refl_value.r = " + value + ";\n";
+        } else {
+            out += "refl_value.r = 1.0;\n";
+        }
+        // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
+        if (config.lighting.lut_rg.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectGreen)) {
+            std::string index = GetLutIndex(light_index, config.lighting.lut_rg.type, config.lighting.lut_rg.abs_input);
+            std::string value = "(" + std::to_string(config.lighting.lut_rg.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectGreen, index) + ")";
+            out += "refl_value.g = " + value + ";\n";
+        } else {
+            out += "refl_value.g = refl_value.r;\n";
+        }
+        // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
+        if (config.lighting.lut_rb.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectBlue)) {
+            std::string index = GetLutIndex(light_index, config.lighting.lut_rb.type, config.lighting.lut_rb.abs_input);
+            std::string value = "(" + std::to_string(config.lighting.lut_rb.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectBlue, index) + ")";
+            out += "refl_value.b = " + value + ";\n";
+        } else {
+            out += "refl_value.b = refl_value.r;\n";
+        }
+        // Specular 1 component
+        std::string d1_lut_value = "1.0";
+        if (config.lighting.lut_d1.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Distribution1)) {
+            // Lookup specular "distribution 1" LUT value
+            std::string index = GetLutIndex(light_index, config.lighting.lut_d1.type, config.lighting.lut_d1.abs_input);
+            d1_lut_value = "(" + std::to_string(config.lighting.lut_d1.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution1, index) + ")";
+        }
+        std::string specular_1 = "(" + d1_lut_value + " * refl_value * " + light_src + ".specular_1)";
+        // Fresnel
+        if (config.lighting.lut_fr.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Fresnel)) {
+            // Lookup fresnel LUT value
+            std::string index = GetLutIndex(light_index, config.lighting.lut_fr.type, config.lighting.lut_fr.abs_input);
+            std::string value = "(" + std::to_string(config.lighting.lut_fr.scale) + " * " + GetLutValue(Regs::LightingSampler::Fresnel, index) + ")";
+            // Enabled for diffuse lighting alpha component
+            if (config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::PrimaryAlpha ||
+                config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
+                out += "diffuse_sum.a  *= " + value + ";\n";
+            // Enabled for the specular lighting alpha component
+            if (config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::SecondaryAlpha ||
+                config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
+                out += "specular_sum.a *= " + value + ";\n";
+        }
+        // Compute primary fragment color (diffuse lighting) function
+        out += "diffuse_sum.rgb += ((" + light_src + ".diffuse * " + dot_product + ") + " + light_src + ".ambient) * " + dist_atten + ";\n";
+        // Compute secondary fragment color (specular lighting) function
+        out += "specular_sum.rgb += (" + specular_0 + " + " + specular_1 + ") * " + clamp_highlights + " * " + dist_atten + ";\n";
+    }
+    // Sum final lighting result
+    out += "diffuse_sum.rgb += lighting_global_ambient;\n";
+    out += "primary_fragment_color = clamp(diffuse_sum, vec4(0.0), vec4(1.0));\n";
+    out += "secondary_fragment_color = clamp(specular_sum, vec4(0.0), vec4(1.0));\n";
+}
 std::string GenerateFragmentShader(const PicaShaderConfig& config) {
    std::string out = R"(
 #version 330 core
 #define NUM_TEV_STAGES 6
+#define NUM_LIGHTS 8
+#define LIGHTING_LUT_SIZE 256
+#define FLOAT_255 (255.0 / 256.0)
 in vec4 primary_color;
 in vec2 texcoord[3];
+in vec4 normquat;
+in vec3 view;
 out vec4 color;
+struct LightSrc {
+    vec3 specular_0;
+    vec3 specular_1;
+    vec3 diffuse;
+    vec3 ambient;
+    vec3 position;
+};
 layout (std140) uniform shader_data {
    vec4 const_color[NUM_TEV_STAGES];
    vec4 tev_combiner_buffer_color;
    int alphatest_ref;
    float depth_offset;
+    vec3 lighting_global_ambient;
+    LightSrc light_src[NUM_LIGHTS];
 };
 uniform sampler2D tex[3];
+uniform sampler1D lut[6];
+// Rotate the vector v by the quaternion q
+vec3 quaternion_rotate(vec4 q, vec3 v) {
+    return v + 2.0 * cross(q.xyz, cross(q.xyz, v) + q.w * v);
+}
 void main() {
+vec4 primary_fragment_color = vec4(0.0);
+vec4 secondary_fragment_color = vec4(0.0);
 )";
    // Do not do any sort of processing if it's obvious we're not going to pass the alpha test
@@ -348,6 +549,9 @@ void main() {
        return out;
    }
+    if (config.lighting.enable)
+        WriteLighting(out, config);
    out += "vec4 combiner_buffer = vec4(0.0);\n";
    out += "vec4 next_combiner_buffer = tev_combiner_buffer_color;\n";
    out += "vec4 last_tex_env_out = vec4(0.0);\n";
@@ -369,21 +573,28 @@ void main() {
 std::string GenerateVertexShader() {
    std::string out = "#version 330 core\n";
    out += "layout(location = " + std::to_string((int)ATTRIBUTE_POSITION)  + ") in vec4 vert_position;\n";
    out += "layout(location = " + std::to_string((int)ATTRIBUTE_COLOR)     + ") in vec4 vert_color;\n";
    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0) + ") in vec2 vert_texcoord0;\n";
    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD1) + ") in vec2 vert_texcoord1;\n";
    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD2) + ") in vec2 vert_texcoord2;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_NORMQUAT)  + ") in vec4 vert_normquat;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_VIEW)      + ") in vec3 vert_view;\n";
    out += R"(
 out vec4 primary_color;
 out vec2 texcoord[3];
+out vec4 normquat;
+out vec3 view;
 void main() {
    primary_color = vert_color;
    texcoord[0] = vert_texcoord0;
    texcoord[1] = vert_texcoord1;
    texcoord[2] = vert_texcoord2;
+    normquat = vert_normquat;
+    view = vert_view;
    gl_Position = vec4(vert_position.x, vert_position.y, -vert_position.z, vert_position.w);
 }
 )";
diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h
index 046aae14f..097242f6f 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.h
+++ b/src/video_core/renderer_opengl/gl_shader_util.h
@@ -14,6 +14,8 @@ enum Attributes {
    ATTRIBUTE_TEXCOORD0,
    ATTRIBUTE_TEXCOORD1,
    ATTRIBUTE_TEXCOORD2,
+    ATTRIBUTE_NORMQUAT,
+    ATTRIBUTE_VIEW,
 };
 /**
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index a82372995..08e4d0b54 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -48,6 +48,10 @@ OpenGLState::OpenGLState() {
        texture_unit.sampler = 0;
    }
+    for (auto& lut : lighting_lut) {
+        lut.texture_1d = 0;
+    }
    draw.framebuffer = 0;
    draw.vertex_array = 0;
    draw.vertex_buffer = 0;
@@ -170,6 +174,14 @@ void OpenGLState::Apply() {
        }
    }
+    // Lighting LUTs
+    for (unsigned i = 0; i < ARRAY_SIZE(lighting_lut); ++i) {
+        if (lighting_lut[i].texture_1d != cur_state.lighting_lut[i].texture_1d) {
+            glActiveTexture(GL_TEXTURE3 + i);
+            glBindTexture(GL_TEXTURE_1D, lighting_lut[i].texture_1d);
+        }
+    }
    // Framebuffer
    if (draw.framebuffer != cur_state.draw.framebuffer) {
        glBindFramebuffer(GL_FRAMEBUFFER, draw.framebuffer);
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index b8ab45bb8..e848058d7 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -62,6 +62,10 @@ public:
    } texture_units[3];
    struct {
+        GLuint texture_1d; // GL_TEXTURE_BINDING_1D
+    } lighting_lut[6];
+    struct {
        GLuint framebuffer; // GL_DRAW_FRAMEBUFFER_BINDING
        GLuint vertex_array; // GL_VERTEX_ARRAY_BINDING
        GLuint vertex_buffer; // GL_ARRAY_BUFFER_BINDING
diff --git a/src/video_core/renderer_opengl/pica_to_gl.h b/src/video_core/renderer_opengl/pica_to_gl.h
index 04c1d1a34..3d6c4e9e5 100644
--- a/src/video_core/renderer_opengl/pica_to_gl.h
+++ b/src/video_core/renderer_opengl/pica_to_gl.h
@@ -10,6 +10,9 @@
 #include "video_core/pica.h"
+using GLvec3 = std::array<GLfloat, 3>;
+using GLvec4 = std::array<GLfloat, 4>;
 namespace PicaToGL {
 inline GLenum TextureFilterMode(Pica::Regs::TextureConfig::TextureFilter mode) {
@@ -175,7 +178,7 @@ inline GLenum StencilOp(Pica::Regs::StencilAction action) {
    return stencil_op_table[(unsigned)action];
 }
-inline std::array<GLfloat, 4> ColorRGBA8(const u32 color) {
+inline GLvec4 ColorRGBA8(const u32 color) {
    return { { (color >>  0 & 0xFF) / 255.0f,
               (color >>  8 & 0xFF) / 255.0f,
               (color >> 16 & 0xFF) / 255.0f,
@@ -183,4 +186,11 @@ inline std::array<GLfloat, 4> ColorRGBA8(const u32 color) {
           } };
 }
+/// Converts a PICA light color to a floating-point RGB triple by dividing each component by 255
+inline GLvec3 LightColor(const Pica::Regs::LightColor& color) {
+    return { { color.r / 255.0f,
+               color.g / 255.0f,
+               color.b / 255.0f
+           } };
+}
 } // namespace
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index a6a38f0af..ca3a6a6b4 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -81,8 +81,8 @@ struct ScreenRectVertex {
 * The projection part of the matrix is trivial, hence these operations are represented
 * by a 3x2 matrix.
 */
-static std::array<GLfloat, 3*2> MakeOrthographicMatrix(const float width, const float height) {
+static std::array<GLfloat, 3 * 2> MakeOrthographicMatrix(const float width, const float height) {
-    std::array<GLfloat, 3*2> matrix;
+    std::array<GLfloat, 3 * 2> matrix;
    matrix[0] = 2.f / width; matrix[2] = 0.f;           matrix[4] = -1.f;
    matrix[1] = 0.f;         matrix[3] = -2.f / height; matrix[5] = 1.f;
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 59f54236b..44c234ed8 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -134,11 +134,13 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr
            std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f));
    }
-    LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), quat (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
+    LOG_TRACE(Render_Software, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), "
+        "col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f), view(%.2f, %.2f, %.2f)",
        ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
        ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(), ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(),
        ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
-        ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32());
+        ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32(),
+        ret.view.x.ToFloat32(), ret.view.y.ToFloat32(), ret.view.z.ToFloat32());
    return ret;
 }
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 1c6fa592c..f068cd93f 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -37,17 +37,19 @@ struct OutputVertex {
    Math::Vec4<float24> color;
    Math::Vec2<float24> tc0;
    Math::Vec2<float24> tc1;
-    float24 pad[6];
+    INSERT_PADDING_WORDS(2);
+    Math::Vec3<float24> view;
+    INSERT_PADDING_WORDS(1);
    Math::Vec2<float24> tc2;
    // Padding for optimal alignment
-    float24 pad2[4];
+    INSERT_PADDING_WORDS(4);
    // Attributes used to store intermediate results
    // position after perspective divide
    Math::Vec3<float24> screenpos;
-    float24 pad3;
+    INSERT_PADDING_WORDS(1);
    // Linear interpolation
    // factor: 0=this, 1=vtx