summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/audio_core/audio_core.cpp16
-rw-r--r--src/audio_core/null_sink.h6
-rw-r--r--src/audio_core/sdl2_sink.cpp30
-rw-r--r--src/audio_core/sdl2_sink.h5
-rw-r--r--src/audio_core/sink.h9
-rw-r--r--src/audio_core/sink_details.cpp19
-rw-r--r--src/audio_core/sink_details.h2
-rw-r--r--src/citra/citra.cpp20
-rw-r--r--src/citra/config.cpp4
-rw-r--r--src/citra/default_ini.h4
-rw-r--r--src/citra_qt/CMakeLists.txt4
-rw-r--r--src/citra_qt/config.cpp3
-rw-r--r--src/citra_qt/configure_audio.cpp33
-rw-r--r--src/citra_qt/configure_audio.h3
-rw-r--r--src/citra_qt/configure_audio.ui15
-rw-r--r--src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp7
-rw-r--r--src/citra_qt/debugger/graphics/graphics_vertex_shader.h1
-rw-r--r--src/citra_qt/debugger/ramview.cpp12
-rw-r--r--src/citra_qt/debugger/ramview.h17
-rw-r--r--src/citra_qt/main.cpp3
-rw-r--r--src/common/hash.cpp8
-rw-r--r--src/common/hash.h5
-rw-r--r--src/core/core.h2
-rw-r--r--src/core/core_timing.cpp2
-rw-r--r--src/core/core_timing.h1
-rw-r--r--src/core/file_sys/archive_extsavedata.cpp2
-rw-r--r--src/core/file_sys/archive_sdmc.cpp12
-rw-r--r--src/core/file_sys/savedata_archive.cpp12
-rw-r--r--src/core/frontend/emu_window.cpp12
-rw-r--r--src/core/hle/service/err_f.cpp2
-rw-r--r--src/core/hle/service/gsp_gpu.cpp2
-rw-r--r--src/core/hle/service/hid/hid.cpp150
-rw-r--r--src/core/hle/service/hid/hid.h3
-rw-r--r--src/core/hle/service/mic_u.cpp8
-rw-r--r--src/core/hle/service/nfc/nfc.cpp105
-rw-r--r--src/core/hle/service/nfc/nfc.h122
-rw-r--r--src/core/hle/service/nfc/nfc_m.cpp21
-rw-r--r--src/core/hle/service/nfc/nfc_u.cpp21
-rw-r--r--src/core/hw/gpu.cpp6
-rw-r--r--src/core/loader/3dsx.cpp34
-rw-r--r--src/core/settings.h1
-rw-r--r--src/video_core/CMakeLists.txt6
-rw-r--r--src/video_core/command_processor.cpp22
-rw-r--r--src/video_core/pica.cpp2
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp5
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h2
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer_cache.cpp1
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer_cache.h7
-rw-r--r--src/video_core/shader/shader.cpp102
-rw-r--r--src/video_core/shader/shader.h70
-rw-r--r--src/video_core/shader/shader_interpreter.cpp49
-rw-r--r--src/video_core/shader/shader_interpreter.h26
-rw-r--r--src/video_core/shader/shader_jit_x64.cpp890
-rw-r--r--src/video_core/shader/shader_jit_x64.h115
-rw-r--r--src/video_core/shader/shader_jit_x64_compiler.cpp884
-rw-r--r--src/video_core/shader/shader_jit_x64_compiler.h125
56 files changed, 1771 insertions, 1279 deletions
diff --git a/src/audio_core/audio_core.cpp b/src/audio_core/audio_core.cpp
index ba6acf28e..84f9c03a7 100644
--- a/src/audio_core/audio_core.cpp
+++ b/src/audio_core/audio_core.cpp
@@ -56,20 +56,8 @@ void AddAddressSpace(Kernel::VMManager& address_space) {
56} 56}
57 57
58void SelectSink(std::string sink_id) { 58void SelectSink(std::string sink_id) {
59 auto iter = 59 const SinkDetails& sink_details = GetSinkDetails(sink_id);
60 std::find_if(g_sink_details.begin(), g_sink_details.end(), 60 DSP::HLE::SetSink(sink_details.factory());
61 [sink_id](const auto& sink_detail) { return sink_detail.id == sink_id; });
62
63 if (sink_id == "auto" || iter == g_sink_details.end()) {
64 if (sink_id != "auto") {
65 LOG_ERROR(Audio, "AudioCore::SelectSink given invalid sink_id %s", sink_id.c_str());
66 }
67 // Auto-select.
68 // g_sink_details is ordered in terms of desirability, with the best choice at the front.
69 iter = g_sink_details.begin();
70 }
71
72 DSP::HLE::SetSink(iter->factory());
73} 61}
74 62
75void EnableStretching(bool enable) { 63void EnableStretching(bool enable) {
diff --git a/src/audio_core/null_sink.h b/src/audio_core/null_sink.h
index e7668438c..c732926a2 100644
--- a/src/audio_core/null_sink.h
+++ b/src/audio_core/null_sink.h
@@ -23,6 +23,12 @@ public:
23 size_t SamplesInQueue() const override { 23 size_t SamplesInQueue() const override {
24 return 0; 24 return 0;
25 } 25 }
26
27 void SetDevice(int device_id) override {}
28
29 std::vector<std::string> GetDeviceList() const override {
30 return {};
31 }
26}; 32};
27 33
28} // namespace AudioCore 34} // namespace AudioCore
diff --git a/src/audio_core/sdl2_sink.cpp b/src/audio_core/sdl2_sink.cpp
index 4b66cd826..933c5f16d 100644
--- a/src/audio_core/sdl2_sink.cpp
+++ b/src/audio_core/sdl2_sink.cpp
@@ -4,12 +4,12 @@
4 4
5#include <list> 5#include <list>
6#include <numeric> 6#include <numeric>
7#include <vector>
8#include <SDL.h> 7#include <SDL.h>
9#include "audio_core/audio_core.h" 8#include "audio_core/audio_core.h"
10#include "audio_core/sdl2_sink.h" 9#include "audio_core/sdl2_sink.h"
11#include "common/assert.h" 10#include "common/assert.h"
12#include "common/logging/log.h" 11#include "common/logging/log.h"
12#include "core/settings.h"
13 13
14namespace AudioCore { 14namespace AudioCore {
15 15
@@ -42,10 +42,24 @@ SDL2Sink::SDL2Sink() : impl(std::make_unique<Impl>()) {
42 SDL_AudioSpec obtained_audiospec; 42 SDL_AudioSpec obtained_audiospec;
43 SDL_zero(obtained_audiospec); 43 SDL_zero(obtained_audiospec);
44 44
45 impl->audio_device_id = 45 int device_count = SDL_GetNumAudioDevices(0);
46 SDL_OpenAudioDevice(nullptr, false, &desired_audiospec, &obtained_audiospec, 0); 46 device_list.clear();
47 for (int i = 0; i < device_count; ++i) {
48 device_list.push_back(SDL_GetAudioDeviceName(i, 0));
49 }
50
51 const char* device = nullptr;
52
53 if (device_count >= 1 && Settings::values.audio_device_id != "auto" &&
54 !Settings::values.audio_device_id.empty()) {
55 device = Settings::values.audio_device_id.c_str();
56 }
57
58 impl->audio_device_id = SDL_OpenAudioDevice(device, false, &desired_audiospec,
59 &obtained_audiospec, SDL_AUDIO_ALLOW_ANY_CHANGE);
47 if (impl->audio_device_id <= 0) { 60 if (impl->audio_device_id <= 0) {
48 LOG_CRITICAL(Audio_Sink, "SDL_OpenAudioDevice failed with: %s", SDL_GetError()); 61 LOG_CRITICAL(Audio_Sink, "SDL_OpenAudioDevice failed with code %d for device \"%s\"",
62 impl->audio_device_id, Settings::values.audio_device_id.c_str());
49 return; 63 return;
50 } 64 }
51 65
@@ -69,6 +83,10 @@ unsigned int SDL2Sink::GetNativeSampleRate() const {
69 return impl->sample_rate; 83 return impl->sample_rate;
70} 84}
71 85
86std::vector<std::string> SDL2Sink::GetDeviceList() const {
87 return device_list;
88}
89
72void SDL2Sink::EnqueueSamples(const s16* samples, size_t sample_count) { 90void SDL2Sink::EnqueueSamples(const s16* samples, size_t sample_count) {
73 if (impl->audio_device_id <= 0) 91 if (impl->audio_device_id <= 0)
74 return; 92 return;
@@ -96,6 +114,10 @@ size_t SDL2Sink::SamplesInQueue() const {
96 return total_size; 114 return total_size;
97} 115}
98 116
117void SDL2Sink::SetDevice(int device_id) {
118 this->device_id = device_id;
119}
120
99void SDL2Sink::Impl::Callback(void* impl_, u8* buffer, int buffer_size_in_bytes) { 121void SDL2Sink::Impl::Callback(void* impl_, u8* buffer, int buffer_size_in_bytes) {
100 Impl* impl = reinterpret_cast<Impl*>(impl_); 122 Impl* impl = reinterpret_cast<Impl*>(impl_);
101 123
diff --git a/src/audio_core/sdl2_sink.h b/src/audio_core/sdl2_sink.h
index ccd0f7c7e..bcc725369 100644
--- a/src/audio_core/sdl2_sink.h
+++ b/src/audio_core/sdl2_sink.h
@@ -21,9 +21,14 @@ public:
21 21
22 size_t SamplesInQueue() const override; 22 size_t SamplesInQueue() const override;
23 23
24 std::vector<std::string> GetDeviceList() const override;
25 void SetDevice(int device_id) override;
26
24private: 27private:
25 struct Impl; 28 struct Impl;
26 std::unique_ptr<Impl> impl; 29 std::unique_ptr<Impl> impl;
30 int device_id;
31 std::vector<std::string> device_list;
27}; 32};
28 33
29} // namespace AudioCore 34} // namespace AudioCore
diff --git a/src/audio_core/sink.h b/src/audio_core/sink.h
index 08f3bab5b..558c8c0fe 100644
--- a/src/audio_core/sink.h
+++ b/src/audio_core/sink.h
@@ -31,6 +31,15 @@ public:
31 31
32 /// Samples enqueued that have not been played yet. 32 /// Samples enqueued that have not been played yet.
33 virtual std::size_t SamplesInQueue() const = 0; 33 virtual std::size_t SamplesInQueue() const = 0;
34
35 /**
36 * Sets the desired output device.
37 * @param device_id Id of the desired device.
38 */
39 virtual void SetDevice(int device_id) = 0;
40
41 /// Returns the list of available devices.
42 virtual std::vector<std::string> GetDeviceList() const = 0;
34}; 43};
35 44
36} // namespace 45} // namespace
diff --git a/src/audio_core/sink_details.cpp b/src/audio_core/sink_details.cpp
index 95ccc9e9d..6972395af 100644
--- a/src/audio_core/sink_details.cpp
+++ b/src/audio_core/sink_details.cpp
@@ -2,6 +2,7 @@
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <algorithm>
5#include <memory> 6#include <memory>
6#include <vector> 7#include <vector>
7#include "audio_core/null_sink.h" 8#include "audio_core/null_sink.h"
@@ -9,6 +10,7 @@
9#ifdef HAVE_SDL2 10#ifdef HAVE_SDL2
10#include "audio_core/sdl2_sink.h" 11#include "audio_core/sdl2_sink.h"
11#endif 12#endif
13#include "common/logging/log.h"
12 14
13namespace AudioCore { 15namespace AudioCore {
14 16
@@ -20,4 +22,21 @@ const std::vector<SinkDetails> g_sink_details = {
20 {"null", []() { return std::make_unique<NullSink>(); }}, 22 {"null", []() { return std::make_unique<NullSink>(); }},
21}; 23};
22 24
25const SinkDetails& GetSinkDetails(std::string sink_id) {
26 auto iter =
27 std::find_if(g_sink_details.begin(), g_sink_details.end(),
28 [sink_id](const auto& sink_detail) { return sink_detail.id == sink_id; });
29
30 if (sink_id == "auto" || iter == g_sink_details.end()) {
31 if (sink_id != "auto") {
32 LOG_ERROR(Audio, "AudioCore::SelectSink given invalid sink_id %s", sink_id.c_str());
33 }
34 // Auto-select.
35 // g_sink_details is ordered in terms of desirability, with the best choice at the front.
36 iter = g_sink_details.begin();
37 }
38
39 return *iter;
40}
41
23} // namespace AudioCore 42} // namespace AudioCore
diff --git a/src/audio_core/sink_details.h b/src/audio_core/sink_details.h
index 4b30cf835..9d3735171 100644
--- a/src/audio_core/sink_details.h
+++ b/src/audio_core/sink_details.h
@@ -24,4 +24,6 @@ struct SinkDetails {
24 24
25extern const std::vector<SinkDetails> g_sink_details; 25extern const std::vector<SinkDetails> g_sink_details;
26 26
27const SinkDetails& GetSinkDetails(std::string sink_id);
28
27} // namespace AudioCore 29} // namespace AudioCore
diff --git a/src/citra/citra.cpp b/src/citra/citra.cpp
index 99c096ac7..76f5caeb1 100644
--- a/src/citra/citra.cpp
+++ b/src/citra/citra.cpp
@@ -141,6 +141,26 @@ int main(int argc, char** argv) {
141 case Core::System::ResultStatus::ErrorLoader: 141 case Core::System::ResultStatus::ErrorLoader:
142 LOG_CRITICAL(Frontend, "Failed to load ROM!"); 142 LOG_CRITICAL(Frontend, "Failed to load ROM!");
143 return -1; 143 return -1;
144 case Core::System::ResultStatus::ErrorLoader_ErrorEncrypted:
145 LOG_CRITICAL(Frontend, "The game that you are trying to load must be decrypted before "
146 "being used with Citra. \n\n For more information on dumping and "
147 "decrypting games, please refer to: "
148 "https://citra-emu.org/wiki/Dumping-Game-Cartridges");
149 return -1;
150 case Core::System::ResultStatus::ErrorLoader_ErrorInvalidFormat:
151 LOG_CRITICAL(Frontend, "Error while loading ROM: The ROM format is not supported.");
152 return -1;
153 case Core::System::ResultStatus::ErrorNotInitialized:
154 LOG_CRITICAL(Frontend, "CPUCore not initialized");
155 return -1;
156 case Core::System::ResultStatus::ErrorSystemMode:
157 LOG_CRITICAL(Frontend, "Failed to determine system mode!");
158 return -1;
159 case Core::System::ResultStatus::ErrorVideoCore:
160 LOG_CRITICAL(Frontend, "VideoCore not initialized");
161 return -1;
162 case Core::System::ResultStatus::Success:
163 break; // Expected case
144 } 164 }
145 165
146 while (emu_window->IsOpen()) { 166 while (emu_window->IsOpen()) {
diff --git a/src/citra/config.cpp b/src/citra/config.cpp
index bd8ac563b..827c90e55 100644
--- a/src/citra/config.cpp
+++ b/src/citra/config.cpp
@@ -63,7 +63,8 @@ void Config::ReadValues() {
63 // Renderer 63 // Renderer
64 Settings::values.use_hw_renderer = sdl2_config->GetBoolean("Renderer", "use_hw_renderer", true); 64 Settings::values.use_hw_renderer = sdl2_config->GetBoolean("Renderer", "use_hw_renderer", true);
65 Settings::values.use_shader_jit = sdl2_config->GetBoolean("Renderer", "use_shader_jit", true); 65 Settings::values.use_shader_jit = sdl2_config->GetBoolean("Renderer", "use_shader_jit", true);
66 Settings::values.resolution_factor = sdl2_config->GetReal("Renderer", "resolution_factor", 1.0); 66 Settings::values.resolution_factor =
67 (float)sdl2_config->GetReal("Renderer", "resolution_factor", 1.0);
67 Settings::values.use_vsync = sdl2_config->GetBoolean("Renderer", "use_vsync", false); 68 Settings::values.use_vsync = sdl2_config->GetBoolean("Renderer", "use_vsync", false);
68 Settings::values.toggle_framelimit = 69 Settings::values.toggle_framelimit =
69 sdl2_config->GetBoolean("Renderer", "toggle_framelimit", true); 70 sdl2_config->GetBoolean("Renderer", "toggle_framelimit", true);
@@ -81,6 +82,7 @@ void Config::ReadValues() {
81 Settings::values.sink_id = sdl2_config->Get("Audio", "output_engine", "auto"); 82 Settings::values.sink_id = sdl2_config->Get("Audio", "output_engine", "auto");
82 Settings::values.enable_audio_stretching = 83 Settings::values.enable_audio_stretching =
83 sdl2_config->GetBoolean("Audio", "enable_audio_stretching", true); 84 sdl2_config->GetBoolean("Audio", "enable_audio_stretching", true);
85 Settings::values.audio_device_id = sdl2_config->Get("Audio", "output_device", "auto");
84 86
85 // Data Storage 87 // Data Storage
86 Settings::values.use_virtual_sd = 88 Settings::values.use_virtual_sd =
diff --git a/src/citra/default_ini.h b/src/citra/default_ini.h
index 7996813b4..d728fb9e8 100644
--- a/src/citra/default_ini.h
+++ b/src/citra/default_ini.h
@@ -91,6 +91,10 @@ output_engine =
91# 0: No, 1 (default): Yes 91# 0: No, 1 (default): Yes
92enable_audio_stretching = 92enable_audio_stretching =
93 93
94# Which audio device to use.
95# auto (default): Auto-select
96output_device =
97
94[Data Storage] 98[Data Storage]
95# Whether to create a virtual SD card. 99# Whether to create a virtual SD card.
96# 1 (default): Yes, 0: No 100# 1 (default): Yes, 0: No
diff --git a/src/citra_qt/CMakeLists.txt b/src/citra_qt/CMakeLists.txt
index 93f1c339d..d4460bf01 100644
--- a/src/citra_qt/CMakeLists.txt
+++ b/src/citra_qt/CMakeLists.txt
@@ -14,7 +14,6 @@ set(SRCS
14 debugger/graphics/graphics_tracing.cpp 14 debugger/graphics/graphics_tracing.cpp
15 debugger/graphics/graphics_vertex_shader.cpp 15 debugger/graphics/graphics_vertex_shader.cpp
16 debugger/profiler.cpp 16 debugger/profiler.cpp
17 debugger/ramview.cpp
18 debugger/registers.cpp 17 debugger/registers.cpp
19 debugger/wait_tree.cpp 18 debugger/wait_tree.cpp
20 util/spinbox.cpp 19 util/spinbox.cpp
@@ -48,7 +47,6 @@ set(HEADERS
48 debugger/graphics/graphics_tracing.h 47 debugger/graphics/graphics_tracing.h
49 debugger/graphics/graphics_vertex_shader.h 48 debugger/graphics/graphics_vertex_shader.h
50 debugger/profiler.h 49 debugger/profiler.h
51 debugger/ramview.h
52 debugger/registers.h 50 debugger/registers.h
53 debugger/wait_tree.h 51 debugger/wait_tree.h
54 util/spinbox.h 52 util/spinbox.h
@@ -100,7 +98,7 @@ if (APPLE)
100else() 98else()
101 add_executable(citra-qt ${SRCS} ${HEADERS} ${UI_HDRS}) 99 add_executable(citra-qt ${SRCS} ${HEADERS} ${UI_HDRS})
102endif() 100endif()
103target_link_libraries(citra-qt core video_core audio_core common qhexedit) 101target_link_libraries(citra-qt core video_core audio_core common)
104target_link_libraries(citra-qt ${OPENGL_gl_LIBRARY} ${CITRA_QT_LIBS}) 102target_link_libraries(citra-qt ${OPENGL_gl_LIBRARY} ${CITRA_QT_LIBS})
105target_link_libraries(citra-qt ${PLATFORM_LIBRARIES} Threads::Threads) 103target_link_libraries(citra-qt ${PLATFORM_LIBRARIES} Threads::Threads)
106 104
diff --git a/src/citra_qt/config.cpp b/src/citra_qt/config.cpp
index 8021667d0..f776e16b2 100644
--- a/src/citra_qt/config.cpp
+++ b/src/citra_qt/config.cpp
@@ -63,6 +63,8 @@ void Config::ReadValues() {
63 Settings::values.sink_id = qt_config->value("output_engine", "auto").toString().toStdString(); 63 Settings::values.sink_id = qt_config->value("output_engine", "auto").toString().toStdString();
64 Settings::values.enable_audio_stretching = 64 Settings::values.enable_audio_stretching =
65 qt_config->value("enable_audio_stretching", true).toBool(); 65 qt_config->value("enable_audio_stretching", true).toBool();
66 Settings::values.audio_device_id =
67 qt_config->value("output_device", "auto").toString().toStdString();
66 qt_config->endGroup(); 68 qt_config->endGroup();
67 69
68 qt_config->beginGroup("Data Storage"); 70 qt_config->beginGroup("Data Storage");
@@ -169,6 +171,7 @@ void Config::SaveValues() {
169 qt_config->beginGroup("Audio"); 171 qt_config->beginGroup("Audio");
170 qt_config->setValue("output_engine", QString::fromStdString(Settings::values.sink_id)); 172 qt_config->setValue("output_engine", QString::fromStdString(Settings::values.sink_id));
171 qt_config->setValue("enable_audio_stretching", Settings::values.enable_audio_stretching); 173 qt_config->setValue("enable_audio_stretching", Settings::values.enable_audio_stretching);
174 qt_config->setValue("output_device", QString::fromStdString(Settings::values.audio_device_id));
172 qt_config->endGroup(); 175 qt_config->endGroup();
173 176
174 qt_config->beginGroup("Data Storage"); 177 qt_config->beginGroup("Data Storage");
diff --git a/src/citra_qt/configure_audio.cpp b/src/citra_qt/configure_audio.cpp
index 3cdd4c780..3ddcf9232 100644
--- a/src/citra_qt/configure_audio.cpp
+++ b/src/citra_qt/configure_audio.cpp
@@ -2,6 +2,9 @@
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <memory>
6#include "audio_core/audio_core.h"
7#include "audio_core/sink.h"
5#include "audio_core/sink_details.h" 8#include "audio_core/sink_details.h"
6#include "citra_qt/configure_audio.h" 9#include "citra_qt/configure_audio.h"
7#include "core/settings.h" 10#include "core/settings.h"
@@ -18,6 +21,8 @@ ConfigureAudio::ConfigureAudio(QWidget* parent)
18 } 21 }
19 22
20 this->setConfiguration(); 23 this->setConfiguration();
24 connect(ui->output_sink_combo_box, SIGNAL(currentIndexChanged(int)), this,
25 SLOT(updateAudioDevices(int)));
21} 26}
22 27
23ConfigureAudio::~ConfigureAudio() {} 28ConfigureAudio::~ConfigureAudio() {}
@@ -33,6 +38,19 @@ void ConfigureAudio::setConfiguration() {
33 ui->output_sink_combo_box->setCurrentIndex(new_sink_index); 38 ui->output_sink_combo_box->setCurrentIndex(new_sink_index);
34 39
35 ui->toggle_audio_stretching->setChecked(Settings::values.enable_audio_stretching); 40 ui->toggle_audio_stretching->setChecked(Settings::values.enable_audio_stretching);
41
42 // The device list cannot be pre-populated (nor listed) until the output sink is known.
43 updateAudioDevices(new_sink_index);
44
45 int new_device_index = -1;
46 for (int index = 0; index < ui->audio_device_combo_box->count(); index++) {
47 if (ui->audio_device_combo_box->itemText(index).toStdString() ==
48 Settings::values.audio_device_id) {
49 new_device_index = index;
50 break;
51 }
52 }
53 ui->audio_device_combo_box->setCurrentIndex(new_device_index);
36} 54}
37 55
38void ConfigureAudio::applyConfiguration() { 56void ConfigureAudio::applyConfiguration() {
@@ -40,5 +58,20 @@ void ConfigureAudio::applyConfiguration() {
40 ui->output_sink_combo_box->itemText(ui->output_sink_combo_box->currentIndex()) 58 ui->output_sink_combo_box->itemText(ui->output_sink_combo_box->currentIndex())
41 .toStdString(); 59 .toStdString();
42 Settings::values.enable_audio_stretching = ui->toggle_audio_stretching->isChecked(); 60 Settings::values.enable_audio_stretching = ui->toggle_audio_stretching->isChecked();
61 Settings::values.audio_device_id =
62 ui->audio_device_combo_box->itemText(ui->audio_device_combo_box->currentIndex())
63 .toStdString();
43 Settings::Apply(); 64 Settings::Apply();
44} 65}
66
67void ConfigureAudio::updateAudioDevices(int sink_index) {
68 ui->audio_device_combo_box->clear();
69 ui->audio_device_combo_box->addItem("auto");
70
71 std::string sink_id = ui->output_sink_combo_box->itemText(sink_index).toStdString();
72 std::vector<std::string> device_list =
73 AudioCore::GetSinkDetails(sink_id).factory()->GetDeviceList();
74 for (const auto& device : device_list) {
75 ui->audio_device_combo_box->addItem(device.c_str());
76 }
77}
diff --git a/src/citra_qt/configure_audio.h b/src/citra_qt/configure_audio.h
index 51df2e27b..8190e694f 100644
--- a/src/citra_qt/configure_audio.h
+++ b/src/citra_qt/configure_audio.h
@@ -20,6 +20,9 @@ public:
20 20
21 void applyConfiguration(); 21 void applyConfiguration();
22 22
23public slots:
24 void updateAudioDevices(int sink_index);
25
23private: 26private:
24 void setConfiguration(); 27 void setConfiguration();
25 28
diff --git a/src/citra_qt/configure_audio.ui b/src/citra_qt/configure_audio.ui
index 3e2b4635f..dd870eb61 100644
--- a/src/citra_qt/configure_audio.ui
+++ b/src/citra_qt/configure_audio.ui
@@ -35,6 +35,21 @@
35 </property> 35 </property>
36 </widget> 36 </widget>
37 </item> 37 </item>
38 <item>
39 <layout class="QHBoxLayout">
40 <item>
41 <widget class="QLabel">
42 <property name="text">
43 <string>Audio Device:</string>
44 </property>
45 </widget>
46 </item>
47 <item>
48 <widget class="QComboBox" name="audio_device_combo_box">
49 </widget>
50 </item>
51 </layout>
52 </item>
38 </layout> 53 </layout>
39 </widget> 54 </widget>
40 </item> 55 </item>
diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp
index ff2e7e363..f37524190 100644
--- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp
+++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp
@@ -18,7 +18,9 @@
18#include "citra_qt/util/util.h" 18#include "citra_qt/util/util.h"
19#include "video_core/pica.h" 19#include "video_core/pica.h"
20#include "video_core/pica_state.h" 20#include "video_core/pica_state.h"
21#include "video_core/shader/debug_data.h"
21#include "video_core/shader/shader.h" 22#include "video_core/shader/shader.h"
23#include "video_core/shader/shader_interpreter.h"
22 24
23using nihstro::OpCode; 25using nihstro::OpCode;
24using nihstro::Instruction; 26using nihstro::Instruction;
@@ -518,8 +520,9 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d
518 info.labels.insert({entry_point, "main"}); 520 info.labels.insert({entry_point, "main"});
519 521
520 // Generate debug information 522 // Generate debug information
521 debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config, 523 Pica::Shader::InterpreterEngine shader_engine;
522 shader_setup); 524 shader_engine.SetupBatch(shader_setup, entry_point);
525 debug_data = shader_engine.ProduceDebugInfo(shader_setup, input_vertex, num_attributes);
523 526
524 // Reload widget state 527 // Reload widget state
525 for (int attr = 0; attr < num_attributes; ++attr) { 528 for (int attr = 0; attr < num_attributes; ++attr) {
diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h
index bedea0bed..3292573f3 100644
--- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h
+++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h
@@ -8,6 +8,7 @@
8#include <QTreeView> 8#include <QTreeView>
9#include "citra_qt/debugger/graphics/graphics_breakpoint_observer.h" 9#include "citra_qt/debugger/graphics/graphics_breakpoint_observer.h"
10#include "nihstro/parser_shbin.h" 10#include "nihstro/parser_shbin.h"
11#include "video_core/shader/debug_data.h"
11#include "video_core/shader/shader.h" 12#include "video_core/shader/shader.h"
12 13
13class QLabel; 14class QLabel;
diff --git a/src/citra_qt/debugger/ramview.cpp b/src/citra_qt/debugger/ramview.cpp
deleted file mode 100644
index 10a09dda8..000000000
--- a/src/citra_qt/debugger/ramview.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
1// Copyright 2014 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "citra_qt/debugger/ramview.h"
6
7GRamView::GRamView(QWidget* parent) : QHexEdit(parent) {}
8
9void GRamView::OnCPUStepped() {
10 // TODO: QHexEdit doesn't show vertical scroll bars for > 10MB data streams...
11 // setData(QByteArray((const char*)Mem_RAM,sizeof(Mem_RAM)/8));
12}
diff --git a/src/citra_qt/debugger/ramview.h b/src/citra_qt/debugger/ramview.h
deleted file mode 100644
index d01cea93b..000000000
--- a/src/citra_qt/debugger/ramview.h
+++ /dev/null
@@ -1,17 +0,0 @@
1// Copyright 2014 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include "qhexedit.h"
8
9class GRamView : public QHexEdit {
10 Q_OBJECT
11
12public:
13 explicit GRamView(QWidget* parent = nullptr);
14
15public slots:
16 void OnCPUStepped();
17};
diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp
index 6d59cf640..f765c0147 100644
--- a/src/citra_qt/main.cpp
+++ b/src/citra_qt/main.cpp
@@ -12,6 +12,7 @@
12#include <QFileDialog> 12#include <QFileDialog>
13#include <QMessageBox> 13#include <QMessageBox>
14#include <QtGui> 14#include <QtGui>
15#include <QtWidgets>
15#include "citra_qt/bootmanager.h" 16#include "citra_qt/bootmanager.h"
16#include "citra_qt/config.h" 17#include "citra_qt/config.h"
17#include "citra_qt/configure_dialog.h" 18#include "citra_qt/configure_dialog.h"
@@ -24,7 +25,6 @@
24#include "citra_qt/debugger/graphics/graphics_tracing.h" 25#include "citra_qt/debugger/graphics/graphics_tracing.h"
25#include "citra_qt/debugger/graphics/graphics_vertex_shader.h" 26#include "citra_qt/debugger/graphics/graphics_vertex_shader.h"
26#include "citra_qt/debugger/profiler.h" 27#include "citra_qt/debugger/profiler.h"
27#include "citra_qt/debugger/ramview.h"
28#include "citra_qt/debugger/registers.h" 28#include "citra_qt/debugger/registers.h"
29#include "citra_qt/debugger/wait_tree.h" 29#include "citra_qt/debugger/wait_tree.h"
30#include "citra_qt/game_list.h" 30#include "citra_qt/game_list.h"
@@ -46,7 +46,6 @@
46#include "core/gdbstub/gdbstub.h" 46#include "core/gdbstub/gdbstub.h"
47#include "core/loader/loader.h" 47#include "core/loader/loader.h"
48#include "core/settings.h" 48#include "core/settings.h"
49#include "qhexedit.h"
50#include "video_core/video_core.h" 49#include "video_core/video_core.h"
51 50
52#ifdef QT_STATICPLUGIN 51#ifdef QT_STATICPLUGIN
diff --git a/src/common/hash.cpp b/src/common/hash.cpp
index 2309320bb..f3d390dc5 100644
--- a/src/common/hash.cpp
+++ b/src/common/hash.cpp
@@ -16,7 +16,7 @@ namespace Common {
16 16
17// Block read - if your platform needs to do endian-swapping or can only handle aligned reads, do 17// Block read - if your platform needs to do endian-swapping or can only handle aligned reads, do
18// the conversion here 18// the conversion here
19static FORCE_INLINE u64 getblock64(const u64* p, int i) { 19static FORCE_INLINE u64 getblock64(const u64* p, size_t i) {
20 return p[i]; 20 return p[i];
21} 21}
22 22
@@ -34,9 +34,9 @@ static FORCE_INLINE u64 fmix64(u64 k) {
34// This is the 128-bit variant of the MurmurHash3 hash function that is targeted for 64-bit 34// This is the 128-bit variant of the MurmurHash3 hash function that is targeted for 64-bit
35// platforms (MurmurHash3_x64_128). It was taken from: 35// platforms (MurmurHash3_x64_128). It was taken from:
36// https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp 36// https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
37void MurmurHash3_128(const void* key, int len, u32 seed, void* out) { 37void MurmurHash3_128(const void* key, size_t len, u32 seed, void* out) {
38 const u8* data = (const u8*)key; 38 const u8* data = (const u8*)key;
39 const int nblocks = len / 16; 39 const size_t nblocks = len / 16;
40 40
41 u64 h1 = seed; 41 u64 h1 = seed;
42 u64 h2 = seed; 42 u64 h2 = seed;
@@ -48,7 +48,7 @@ void MurmurHash3_128(const void* key, int len, u32 seed, void* out) {
48 48
49 const u64* blocks = (const u64*)(data); 49 const u64* blocks = (const u64*)(data);
50 50
51 for (int i = 0; i < nblocks; i++) { 51 for (size_t i = 0; i < nblocks; i++) {
52 u64 k1 = getblock64(blocks, i * 2 + 0); 52 u64 k1 = getblock64(blocks, i * 2 + 0);
53 u64 k2 = getblock64(blocks, i * 2 + 1); 53 u64 k2 = getblock64(blocks, i * 2 + 1);
54 54
diff --git a/src/common/hash.h b/src/common/hash.h
index a3850be68..ee2560dad 100644
--- a/src/common/hash.h
+++ b/src/common/hash.h
@@ -4,11 +4,12 @@
4 4
5#pragma once 5#pragma once
6 6
7#include <cstddef>
7#include "common/common_types.h" 8#include "common/common_types.h"
8 9
9namespace Common { 10namespace Common {
10 11
11void MurmurHash3_128(const void* key, int len, u32 seed, void* out); 12void MurmurHash3_128(const void* key, size_t len, u32 seed, void* out);
12 13
13/** 14/**
14 * Computes a 64-bit hash over the specified block of data 15 * Computes a 64-bit hash over the specified block of data
@@ -16,7 +17,7 @@ void MurmurHash3_128(const void* key, int len, u32 seed, void* out);
16 * @param len Length of data (in bytes) to compute hash over 17 * @param len Length of data (in bytes) to compute hash over
17 * @returns 64-bit hash value that was computed over the data block 18 * @returns 64-bit hash value that was computed over the data block
18 */ 19 */
19static inline u64 ComputeHash64(const void* data, int len) { 20static inline u64 ComputeHash64(const void* data, size_t len) {
20 u64 res[2]; 21 u64 res[2];
21 MurmurHash3_128(data, len, 0, res); 22 MurmurHash3_128(data, len, 0, res);
22 return res[0]; 23 return res[0];
diff --git a/src/core/core.h b/src/core/core.h
index 1015e8847..17572a74f 100644
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -115,7 +115,7 @@ private:
115 static System s_instance; 115 static System s_instance;
116}; 116};
117 117
118static ARM_Interface& CPU() { 118inline ARM_Interface& CPU() {
119 return System::GetInstance().CPU(); 119 return System::GetInstance().CPU();
120} 120}
121 121
diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index a437d0823..276ecfdf6 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -13,7 +13,7 @@
13#include "core/core.h" 13#include "core/core.h"
14#include "core/core_timing.h" 14#include "core/core_timing.h"
15 15
16int g_clock_rate_arm11 = 268123480; 16int g_clock_rate_arm11 = BASE_CLOCK_RATE_ARM11;
17 17
18// is this really necessary? 18// is this really necessary?
19#define INITIAL_SLICE_LENGTH 20000 19#define INITIAL_SLICE_LENGTH 20000
diff --git a/src/core/core_timing.h b/src/core/core_timing.h
index b72a1b500..d2f85cd4d 100644
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -21,6 +21,7 @@
21// inside callback: 21// inside callback:
22// ScheduleEvent(periodInCycles - cycles_late, callback, "whatever") 22// ScheduleEvent(periodInCycles - cycles_late, callback, "whatever")
23 23
24constexpr int BASE_CLOCK_RATE_ARM11 = 268123480;
24extern int g_clock_rate_arm11; 25extern int g_clock_rate_arm11;
25 26
26inline s64 msToCycles(int ms) { 27inline s64 msToCycles(int ms) {
diff --git a/src/core/file_sys/archive_extsavedata.cpp b/src/core/file_sys/archive_extsavedata.cpp
index 51ce78435..dd2fb167f 100644
--- a/src/core/file_sys/archive_extsavedata.cpp
+++ b/src/core/file_sys/archive_extsavedata.cpp
@@ -107,6 +107,8 @@ public:
107 case PathParser::NotFound: 107 case PathParser::NotFound:
108 LOG_ERROR(Service_FS, "%s not found", full_path.c_str()); 108 LOG_ERROR(Service_FS, "%s not found", full_path.c_str());
109 return ERROR_FILE_NOT_FOUND; 109 return ERROR_FILE_NOT_FOUND;
110 case PathParser::FileFound:
111 break; // Expected 'success' case
110 } 112 }
111 113
112 FileUtil::IOFile file(full_path, "r+b"); 114 FileUtil::IOFile file(full_path, "r+b");
diff --git a/src/core/file_sys/archive_sdmc.cpp b/src/core/file_sys/archive_sdmc.cpp
index 333dfb92e..72ff05c65 100644
--- a/src/core/file_sys/archive_sdmc.cpp
+++ b/src/core/file_sys/archive_sdmc.cpp
@@ -72,6 +72,8 @@ ResultVal<std::unique_ptr<FileBackend>> SDMCArchive::OpenFileBase(const Path& pa
72 FileUtil::CreateEmptyFile(full_path); 72 FileUtil::CreateEmptyFile(full_path);
73 } 73 }
74 break; 74 break;
75 case PathParser::FileFound:
76 break; // Expected 'success' case
75 } 77 }
76 78
77 FileUtil::IOFile file(full_path, mode.write_flag ? "r+b" : "rb"); 79 FileUtil::IOFile file(full_path, mode.write_flag ? "r+b" : "rb");
@@ -106,6 +108,8 @@ ResultCode SDMCArchive::DeleteFile(const Path& path) const {
106 case PathParser::DirectoryFound: 108 case PathParser::DirectoryFound:
107 LOG_ERROR(Service_FS, "%s is not a file", full_path.c_str()); 109 LOG_ERROR(Service_FS, "%s is not a file", full_path.c_str());
108 return ERROR_UNEXPECTED_FILE_OR_DIRECTORY_SDMC; 110 return ERROR_UNEXPECTED_FILE_OR_DIRECTORY_SDMC;
111 case PathParser::FileFound:
112 break; // Expected 'success' case
109 } 113 }
110 114
111 if (FileUtil::Delete(full_path)) { 115 if (FileUtil::Delete(full_path)) {
@@ -154,6 +158,8 @@ static ResultCode DeleteDirectoryHelper(const Path& path, const std::string& mou
154 case PathParser::FileFound: 158 case PathParser::FileFound:
155 LOG_ERROR(Service_FS, "Unexpected file in path %s", full_path.c_str()); 159 LOG_ERROR(Service_FS, "Unexpected file in path %s", full_path.c_str());
156 return ERROR_UNEXPECTED_FILE_OR_DIRECTORY_SDMC; 160 return ERROR_UNEXPECTED_FILE_OR_DIRECTORY_SDMC;
161 case PathParser::DirectoryFound:
162 break; // Expected 'success' case
157 } 163 }
158 164
159 if (deleter(full_path)) { 165 if (deleter(full_path)) {
@@ -197,6 +203,8 @@ ResultCode SDMCArchive::CreateFile(const FileSys::Path& path, u64 size) const {
197 case PathParser::FileFound: 203 case PathParser::FileFound:
198 LOG_ERROR(Service_FS, "%s already exists", full_path.c_str()); 204 LOG_ERROR(Service_FS, "%s already exists", full_path.c_str());
199 return ERROR_ALREADY_EXISTS; 205 return ERROR_ALREADY_EXISTS;
206 case PathParser::NotFound:
207 break; // Expected 'success' case
200 } 208 }
201 209
202 if (size == 0) { 210 if (size == 0) {
@@ -238,6 +246,8 @@ ResultCode SDMCArchive::CreateDirectory(const Path& path) const {
238 case PathParser::FileFound: 246 case PathParser::FileFound:
239 LOG_ERROR(Service_FS, "%s already exists", full_path.c_str()); 247 LOG_ERROR(Service_FS, "%s already exists", full_path.c_str());
240 return ERROR_ALREADY_EXISTS; 248 return ERROR_ALREADY_EXISTS;
249 case PathParser::NotFound:
250 break; // Expected 'success' case
241 } 251 }
242 252
243 if (FileUtil::CreateDir(mount_point + path.AsString())) { 253 if (FileUtil::CreateDir(mount_point + path.AsString())) {
@@ -281,6 +291,8 @@ ResultVal<std::unique_ptr<DirectoryBackend>> SDMCArchive::OpenDirectory(const Pa
281 case PathParser::FileInPath: 291 case PathParser::FileInPath:
282 LOG_ERROR(Service_FS, "Unexpected file in path %s", full_path.c_str()); 292 LOG_ERROR(Service_FS, "Unexpected file in path %s", full_path.c_str());
283 return ERROR_UNEXPECTED_FILE_OR_DIRECTORY_SDMC; 293 return ERROR_UNEXPECTED_FILE_OR_DIRECTORY_SDMC;
294 case PathParser::DirectoryFound:
295 break; // Expected 'success' case
284 } 296 }
285 297
286 auto directory = std::make_unique<DiskDirectory>(full_path); 298 auto directory = std::make_unique<DiskDirectory>(full_path);
diff --git a/src/core/file_sys/savedata_archive.cpp b/src/core/file_sys/savedata_archive.cpp
index f2e6a06bc..f540c4a93 100644
--- a/src/core/file_sys/savedata_archive.cpp
+++ b/src/core/file_sys/savedata_archive.cpp
@@ -57,6 +57,8 @@ ResultVal<std::unique_ptr<FileBackend>> SaveDataArchive::OpenFile(const Path& pa
57 FileUtil::CreateEmptyFile(full_path); 57 FileUtil::CreateEmptyFile(full_path);
58 } 58 }
59 break; 59 break;
60 case PathParser::FileFound:
61 break; // Expected 'success' case
60 } 62 }
61 63
62 FileUtil::IOFile file(full_path, mode.write_flag ? "r+b" : "rb"); 64 FileUtil::IOFile file(full_path, mode.write_flag ? "r+b" : "rb");
@@ -91,6 +93,8 @@ ResultCode SaveDataArchive::DeleteFile(const Path& path) const {
91 case PathParser::NotFound: 93 case PathParser::NotFound:
92 LOG_ERROR(Service_FS, "File not found %s", full_path.c_str()); 94 LOG_ERROR(Service_FS, "File not found %s", full_path.c_str());
93 return ERROR_FILE_NOT_FOUND; 95 return ERROR_FILE_NOT_FOUND;
96 case PathParser::FileFound:
97 break; // Expected 'success' case
94 } 98 }
95 99
96 if (FileUtil::Delete(full_path)) { 100 if (FileUtil::Delete(full_path)) {
@@ -139,6 +143,8 @@ static ResultCode DeleteDirectoryHelper(const Path& path, const std::string& mou
139 case PathParser::FileFound: 143 case PathParser::FileFound:
140 LOG_ERROR(Service_FS, "Unexpected file or directory %s", full_path.c_str()); 144 LOG_ERROR(Service_FS, "Unexpected file or directory %s", full_path.c_str());
141 return ERROR_UNEXPECTED_FILE_OR_DIRECTORY; 145 return ERROR_UNEXPECTED_FILE_OR_DIRECTORY;
146 case PathParser::DirectoryFound:
147 break; // Expected 'success' case
142 } 148 }
143 149
144 if (deleter(full_path)) { 150 if (deleter(full_path)) {
@@ -182,6 +188,8 @@ ResultCode SaveDataArchive::CreateFile(const FileSys::Path& path, u64 size) cons
182 case PathParser::FileFound: 188 case PathParser::FileFound:
183 LOG_ERROR(Service_FS, "%s already exists", full_path.c_str()); 189 LOG_ERROR(Service_FS, "%s already exists", full_path.c_str());
184 return ERROR_FILE_ALREADY_EXISTS; 190 return ERROR_FILE_ALREADY_EXISTS;
191 case PathParser::NotFound:
192 break; // Expected 'success' case
185 } 193 }
186 194
187 if (size == 0) { 195 if (size == 0) {
@@ -225,6 +233,8 @@ ResultCode SaveDataArchive::CreateDirectory(const Path& path) const {
225 case PathParser::FileFound: 233 case PathParser::FileFound:
226 LOG_ERROR(Service_FS, "%s already exists", full_path.c_str()); 234 LOG_ERROR(Service_FS, "%s already exists", full_path.c_str());
227 return ERROR_DIRECTORY_ALREADY_EXISTS; 235 return ERROR_DIRECTORY_ALREADY_EXISTS;
236 case PathParser::NotFound:
237 break; // Expected 'success' case
228 } 238 }
229 239
230 if (FileUtil::CreateDir(mount_point + path.AsString())) { 240 if (FileUtil::CreateDir(mount_point + path.AsString())) {
@@ -269,6 +279,8 @@ ResultVal<std::unique_ptr<DirectoryBackend>> SaveDataArchive::OpenDirectory(
269 case PathParser::FileFound: 279 case PathParser::FileFound:
270 LOG_ERROR(Service_FS, "Unexpected file in path %s", full_path.c_str()); 280 LOG_ERROR(Service_FS, "Unexpected file in path %s", full_path.c_str());
271 return ERROR_UNEXPECTED_FILE_OR_DIRECTORY; 281 return ERROR_UNEXPECTED_FILE_OR_DIRECTORY;
282 case PathParser::DirectoryFound:
283 break; // Expected 'success' case
272 } 284 }
273 285
274 auto directory = std::make_unique<DiskDirectory>(full_path); 286 auto directory = std::make_unique<DiskDirectory>(full_path);
diff --git a/src/core/frontend/emu_window.cpp b/src/core/frontend/emu_window.cpp
index 1541cc39d..4f0f786ce 100644
--- a/src/core/frontend/emu_window.cpp
+++ b/src/core/frontend/emu_window.cpp
@@ -98,9 +98,9 @@ void EmuWindow::AccelerometerChanged(float x, float y, float z) {
98 // TODO(wwylele): do a time stretch as it in GyroscopeChanged 98 // TODO(wwylele): do a time stretch as it in GyroscopeChanged
99 // The time stretch formula should be like 99 // The time stretch formula should be like
100 // stretched_vector = (raw_vector - gravity) * stretch_ratio + gravity 100 // stretched_vector = (raw_vector - gravity) * stretch_ratio + gravity
101 accel_x = x * coef; 101 accel_x = static_cast<s16>(x * coef);
102 accel_y = y * coef; 102 accel_y = static_cast<s16>(y * coef);
103 accel_z = z * coef; 103 accel_z = static_cast<s16>(z * coef);
104} 104}
105 105
106void EmuWindow::GyroscopeChanged(float x, float y, float z) { 106void EmuWindow::GyroscopeChanged(float x, float y, float z) {
@@ -109,9 +109,9 @@ void EmuWindow::GyroscopeChanged(float x, float y, float z) {
109 float stretch = 109 float stretch =
110 FULL_FPS / Common::Profiling::GetTimingResultsAggregator()->GetAggregatedResults().fps; 110 FULL_FPS / Common::Profiling::GetTimingResultsAggregator()->GetAggregatedResults().fps;
111 std::lock_guard<std::mutex> lock(gyro_mutex); 111 std::lock_guard<std::mutex> lock(gyro_mutex);
112 gyro_x = x * coef * stretch; 112 gyro_x = static_cast<s16>(x * coef * stretch);
113 gyro_y = y * coef * stretch; 113 gyro_y = static_cast<s16>(y * coef * stretch);
114 gyro_z = z * coef * stretch; 114 gyro_z = static_cast<s16>(z * coef * stretch);
115} 115}
116 116
117void EmuWindow::UpdateCurrentFramebufferLayout(unsigned width, unsigned height) { 117void EmuWindow::UpdateCurrentFramebufferLayout(unsigned width, unsigned height) {
diff --git a/src/core/hle/service/err_f.cpp b/src/core/hle/service/err_f.cpp
index cd0a1a598..9da55f328 100644
--- a/src/core/hle/service/err_f.cpp
+++ b/src/core/hle/service/err_f.cpp
@@ -227,6 +227,8 @@ static void ThrowFatalError(Interface* self) {
227 LOG_CRITICAL(Service_ERR, "FINST2: 0x%08X", 227 LOG_CRITICAL(Service_ERR, "FINST2: 0x%08X",
228 errtype.exception_data.exception_info.fpinst2); 228 errtype.exception_data.exception_info.fpinst2);
229 break; 229 break;
230 case ExceptionType::Undefined:
231 break; // Not logging exception_info for this case
230 } 232 }
231 LOG_CRITICAL(Service_ERR, "Datetime: %s", GetCurrentSystemTime().c_str()); 233 LOG_CRITICAL(Service_ERR, "Datetime: %s", GetCurrentSystemTime().c_str());
232 break; 234 break;
diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp
index 947958703..a8c1331ed 100644
--- a/src/core/hle/service/gsp_gpu.cpp
+++ b/src/core/hle/service/gsp_gpu.cpp
@@ -149,7 +149,7 @@ static ResultCode WriteHWRegsWithMask(u32 base_address, u32 size_in_bytes, VAddr
149 u32 mask = Memory::Read32(masks_vaddr); 149 u32 mask = Memory::Read32(masks_vaddr);
150 150
151 // Update the current value of the register only for set mask bits 151 // Update the current value of the register only for set mask bits
152 reg_value = (reg_value & ~mask) | (data | mask); 152 reg_value = (reg_value & ~mask) | (data & mask);
153 153
154 WriteSingleHWReg(base_address, reg_value); 154 WriteSingleHWReg(base_address, reg_value);
155 155
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index 676154bd4..f14ab3811 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -35,6 +35,15 @@ static u32 next_gyroscope_index;
35static int enable_accelerometer_count = 0; // positive means enabled 35static int enable_accelerometer_count = 0; // positive means enabled
36static int enable_gyroscope_count = 0; // positive means enabled 36static int enable_gyroscope_count = 0; // positive means enabled
37 37
38static int pad_update_event;
39static int accelerometer_update_event;
40static int gyroscope_update_event;
41
42// Updating period for each HID device. These empirical values are measured from a 11.2 3DS.
43constexpr u64 pad_update_ticks = BASE_CLOCK_RATE_ARM11 / 234;
44constexpr u64 accelerometer_update_ticks = BASE_CLOCK_RATE_ARM11 / 104;
45constexpr u64 gyroscope_update_ticks = BASE_CLOCK_RATE_ARM11 / 101;
46
38static PadState GetCirclePadDirectionState(s16 circle_pad_x, s16 circle_pad_y) { 47static PadState GetCirclePadDirectionState(s16 circle_pad_x, s16 circle_pad_y) {
39 // 30 degree and 60 degree are angular thresholds for directions 48 // 30 degree and 60 degree are angular thresholds for directions
40 constexpr float TAN30 = 0.577350269f; 49 constexpr float TAN30 = 0.577350269f;
@@ -65,14 +74,9 @@ static PadState GetCirclePadDirectionState(s16 circle_pad_x, s16 circle_pad_y) {
65 return state; 74 return state;
66} 75}
67 76
68void Update() { 77static void UpdatePadCallback(u64 userdata, int cycles_late) {
69 SharedMem* mem = reinterpret_cast<SharedMem*>(shared_mem->GetPointer()); 78 SharedMem* mem = reinterpret_cast<SharedMem*>(shared_mem->GetPointer());
70 79
71 if (mem == nullptr) {
72 LOG_DEBUG(Service_HID, "Cannot update HID prior to mapping shared memory!");
73 return;
74 }
75
76 PadState state = VideoCore::g_emu_window->GetPadState(); 80 PadState state = VideoCore::g_emu_window->GetPadState();
77 81
78 // Get current circle pad position and update circle pad direction 82 // Get current circle pad position and update circle pad direction
@@ -131,59 +135,68 @@ void Update() {
131 event_pad_or_touch_1->Signal(); 135 event_pad_or_touch_1->Signal();
132 event_pad_or_touch_2->Signal(); 136 event_pad_or_touch_2->Signal();
133 137
134 // Update accelerometer 138 // Reschedule recurrent event
135 if (enable_accelerometer_count > 0) { 139 CoreTiming::ScheduleEvent(pad_update_ticks - cycles_late, pad_update_event);
136 mem->accelerometer.index = next_accelerometer_index; 140}
137 next_accelerometer_index = 141
138 (next_accelerometer_index + 1) % mem->accelerometer.entries.size(); 142static void UpdateAccelerometerCallback(u64 userdata, int cycles_late) {
139 143 SharedMem* mem = reinterpret_cast<SharedMem*>(shared_mem->GetPointer());
140 AccelerometerDataEntry& accelerometer_entry = 144
141 mem->accelerometer.entries[mem->accelerometer.index]; 145 mem->accelerometer.index = next_accelerometer_index;
142 std::tie(accelerometer_entry.x, accelerometer_entry.y, accelerometer_entry.z) = 146 next_accelerometer_index = (next_accelerometer_index + 1) % mem->accelerometer.entries.size();
143 VideoCore::g_emu_window->GetAccelerometerState();
144
145 // Make up "raw" entry
146 // TODO(wwylele):
147 // From hardware testing, the raw_entry values are approximately,
148 // but not exactly, as twice as corresponding entries (or with a minus sign).
149 // It may caused by system calibration to the accelerometer.
150 // Figure out how it works, or, if no game reads raw_entry,
151 // the following three lines can be removed and leave raw_entry unimplemented.
152 mem->accelerometer.raw_entry.x = -2 * accelerometer_entry.x;
153 mem->accelerometer.raw_entry.z = 2 * accelerometer_entry.y;
154 mem->accelerometer.raw_entry.y = -2 * accelerometer_entry.z;
155
156 // If we just updated index 0, provide a new timestamp
157 if (mem->accelerometer.index == 0) {
158 mem->accelerometer.index_reset_ticks_previous = mem->accelerometer.index_reset_ticks;
159 mem->accelerometer.index_reset_ticks = (s64)CoreTiming::GetTicks();
160 }
161 147
162 event_accelerometer->Signal(); 148 AccelerometerDataEntry& accelerometer_entry =
149 mem->accelerometer.entries[mem->accelerometer.index];
150 std::tie(accelerometer_entry.x, accelerometer_entry.y, accelerometer_entry.z) =
151 VideoCore::g_emu_window->GetAccelerometerState();
152
153 // Make up "raw" entry
154 // TODO(wwylele):
155 // From hardware testing, the raw_entry values are approximately, but not exactly, as twice as
156 // corresponding entries (or with a minus sign). It may caused by system calibration to the
157 // accelerometer. Figure out how it works, or, if no game reads raw_entry, the following three
158 // lines can be removed and leave raw_entry unimplemented.
159 mem->accelerometer.raw_entry.x = -2 * accelerometer_entry.x;
160 mem->accelerometer.raw_entry.z = 2 * accelerometer_entry.y;
161 mem->accelerometer.raw_entry.y = -2 * accelerometer_entry.z;
162
163 // If we just updated index 0, provide a new timestamp
164 if (mem->accelerometer.index == 0) {
165 mem->accelerometer.index_reset_ticks_previous = mem->accelerometer.index_reset_ticks;
166 mem->accelerometer.index_reset_ticks = (s64)CoreTiming::GetTicks();
163 } 167 }
164 168
165 // Update gyroscope 169 event_accelerometer->Signal();
166 if (enable_gyroscope_count > 0) {
167 mem->gyroscope.index = next_gyroscope_index;
168 next_gyroscope_index = (next_gyroscope_index + 1) % mem->gyroscope.entries.size();
169 170
170 GyroscopeDataEntry& gyroscope_entry = mem->gyroscope.entries[mem->gyroscope.index]; 171 // Reschedule recurrent event
171 std::tie(gyroscope_entry.x, gyroscope_entry.y, gyroscope_entry.z) = 172 CoreTiming::ScheduleEvent(accelerometer_update_ticks - cycles_late, accelerometer_update_event);
172 VideoCore::g_emu_window->GetGyroscopeState(); 173}
173 174
174 // Make up "raw" entry 175static void UpdateGyroscopeCallback(u64 userdata, int cycles_late) {
175 mem->gyroscope.raw_entry.x = gyroscope_entry.x; 176 SharedMem* mem = reinterpret_cast<SharedMem*>(shared_mem->GetPointer());
176 mem->gyroscope.raw_entry.z = -gyroscope_entry.y;
177 mem->gyroscope.raw_entry.y = gyroscope_entry.z;
178 177
179 // If we just updated index 0, provide a new timestamp 178 mem->gyroscope.index = next_gyroscope_index;
180 if (mem->gyroscope.index == 0) { 179 next_gyroscope_index = (next_gyroscope_index + 1) % mem->gyroscope.entries.size();
181 mem->gyroscope.index_reset_ticks_previous = mem->gyroscope.index_reset_ticks; 180
182 mem->gyroscope.index_reset_ticks = (s64)CoreTiming::GetTicks(); 181 GyroscopeDataEntry& gyroscope_entry = mem->gyroscope.entries[mem->gyroscope.index];
183 } 182 std::tie(gyroscope_entry.x, gyroscope_entry.y, gyroscope_entry.z) =
183 VideoCore::g_emu_window->GetGyroscopeState();
184
185 // Make up "raw" entry
186 mem->gyroscope.raw_entry.x = gyroscope_entry.x;
187 mem->gyroscope.raw_entry.z = -gyroscope_entry.y;
188 mem->gyroscope.raw_entry.y = gyroscope_entry.z;
184 189
185 event_gyroscope->Signal(); 190 // If we just updated index 0, provide a new timestamp
191 if (mem->gyroscope.index == 0) {
192 mem->gyroscope.index_reset_ticks_previous = mem->gyroscope.index_reset_ticks;
193 mem->gyroscope.index_reset_ticks = (s64)CoreTiming::GetTicks();
186 } 194 }
195
196 event_gyroscope->Signal();
197
198 // Reschedule recurrent event
199 CoreTiming::ScheduleEvent(gyroscope_update_ticks - cycles_late, gyroscope_update_event);
187} 200}
188 201
189void GetIPCHandles(Service::Interface* self) { 202void GetIPCHandles(Service::Interface* self) {
@@ -204,7 +217,11 @@ void EnableAccelerometer(Service::Interface* self) {
204 u32* cmd_buff = Kernel::GetCommandBuffer(); 217 u32* cmd_buff = Kernel::GetCommandBuffer();
205 218
206 ++enable_accelerometer_count; 219 ++enable_accelerometer_count;
207 event_accelerometer->Signal(); 220
221 // Schedules the accelerometer update event if the accelerometer was just enabled
222 if (enable_accelerometer_count == 1) {
223 CoreTiming::ScheduleEvent(accelerometer_update_ticks, accelerometer_update_event);
224 }
208 225
209 cmd_buff[1] = RESULT_SUCCESS.raw; 226 cmd_buff[1] = RESULT_SUCCESS.raw;
210 227
@@ -215,7 +232,11 @@ void DisableAccelerometer(Service::Interface* self) {
215 u32* cmd_buff = Kernel::GetCommandBuffer(); 232 u32* cmd_buff = Kernel::GetCommandBuffer();
216 233
217 --enable_accelerometer_count; 234 --enable_accelerometer_count;
218 event_accelerometer->Signal(); 235
236 // Unschedules the accelerometer update event if the accelerometer was just disabled
237 if (enable_accelerometer_count == 0) {
238 CoreTiming::UnscheduleEvent(accelerometer_update_event, 0);
239 }
219 240
220 cmd_buff[1] = RESULT_SUCCESS.raw; 241 cmd_buff[1] = RESULT_SUCCESS.raw;
221 242
@@ -226,7 +247,11 @@ void EnableGyroscopeLow(Service::Interface* self) {
226 u32* cmd_buff = Kernel::GetCommandBuffer(); 247 u32* cmd_buff = Kernel::GetCommandBuffer();
227 248
228 ++enable_gyroscope_count; 249 ++enable_gyroscope_count;
229 event_gyroscope->Signal(); 250
251 // Schedules the gyroscope update event if the gyroscope was just enabled
252 if (enable_gyroscope_count == 1) {
253 CoreTiming::ScheduleEvent(gyroscope_update_ticks, gyroscope_update_event);
254 }
230 255
231 cmd_buff[1] = RESULT_SUCCESS.raw; 256 cmd_buff[1] = RESULT_SUCCESS.raw;
232 257
@@ -237,7 +262,11 @@ void DisableGyroscopeLow(Service::Interface* self) {
237 u32* cmd_buff = Kernel::GetCommandBuffer(); 262 u32* cmd_buff = Kernel::GetCommandBuffer();
238 263
239 --enable_gyroscope_count; 264 --enable_gyroscope_count;
240 event_gyroscope->Signal(); 265
266 // Unschedules the gyroscope update event if the gyroscope was just disabled
267 if (enable_gyroscope_count == 0) {
268 CoreTiming::UnscheduleEvent(gyroscope_update_event, 0);
269 }
241 270
242 cmd_buff[1] = RESULT_SUCCESS.raw; 271 cmd_buff[1] = RESULT_SUCCESS.raw;
243 272
@@ -291,6 +320,8 @@ void Init() {
291 320
292 next_pad_index = 0; 321 next_pad_index = 0;
293 next_touch_index = 0; 322 next_touch_index = 0;
323 next_accelerometer_index = 0;
324 next_gyroscope_index = 0;
294 325
295 // Create event handles 326 // Create event handles
296 event_pad_or_touch_1 = Event::Create(ResetType::OneShot, "HID:EventPadOrTouch1"); 327 event_pad_or_touch_1 = Event::Create(ResetType::OneShot, "HID:EventPadOrTouch1");
@@ -298,6 +329,15 @@ void Init() {
298 event_accelerometer = Event::Create(ResetType::OneShot, "HID:EventAccelerometer"); 329 event_accelerometer = Event::Create(ResetType::OneShot, "HID:EventAccelerometer");
299 event_gyroscope = Event::Create(ResetType::OneShot, "HID:EventGyroscope"); 330 event_gyroscope = Event::Create(ResetType::OneShot, "HID:EventGyroscope");
300 event_debug_pad = Event::Create(ResetType::OneShot, "HID:EventDebugPad"); 331 event_debug_pad = Event::Create(ResetType::OneShot, "HID:EventDebugPad");
332
333 // Register update callbacks
334 pad_update_event = CoreTiming::RegisterEvent("HID::UpdatePadCallback", UpdatePadCallback);
335 accelerometer_update_event =
336 CoreTiming::RegisterEvent("HID::UpdateAccelerometerCallback", UpdateAccelerometerCallback);
337 gyroscope_update_event =
338 CoreTiming::RegisterEvent("HID::UpdateGyroscopeCallback", UpdateGyroscopeCallback);
339
340 CoreTiming::ScheduleEvent(pad_update_ticks, pad_update_event);
301} 341}
302 342
303void Shutdown() { 343void Shutdown() {
diff --git a/src/core/hle/service/hid/hid.h b/src/core/hle/service/hid/hid.h
index 7904e7355..21e66dfe0 100644
--- a/src/core/hle/service/hid/hid.h
+++ b/src/core/hle/service/hid/hid.h
@@ -296,9 +296,6 @@ void GetGyroscopeLowRawToDpsCoefficient(Service::Interface* self);
296 */ 296 */
297void GetGyroscopeLowCalibrateParam(Service::Interface* self); 297void GetGyroscopeLowCalibrateParam(Service::Interface* self);
298 298
299/// Checks for user input updates
300void Update();
301
302/// Initialize HID service 299/// Initialize HID service
303void Init(); 300void Init();
304 301
diff --git a/src/core/hle/service/mic_u.cpp b/src/core/hle/service/mic_u.cpp
index c62f8afc6..e98388560 100644
--- a/src/core/hle/service/mic_u.cpp
+++ b/src/core/hle/service/mic_u.cpp
@@ -93,7 +93,7 @@ static void StartSampling(Interface* self) {
93 sample_rate = static_cast<SampleRate>(cmd_buff[2] & 0xFF); 93 sample_rate = static_cast<SampleRate>(cmd_buff[2] & 0xFF);
94 audio_buffer_offset = cmd_buff[3]; 94 audio_buffer_offset = cmd_buff[3];
95 audio_buffer_size = cmd_buff[4]; 95 audio_buffer_size = cmd_buff[4];
96 audio_buffer_loop = static_cast<bool>(cmd_buff[5] & 0xFF); 96 audio_buffer_loop = (cmd_buff[5] & 0xFF) != 0;
97 97
98 cmd_buff[1] = RESULT_SUCCESS.raw; // No error 98 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
99 is_sampling = true; 99 is_sampling = true;
@@ -202,7 +202,7 @@ static void GetGain(Interface* self) {
202 */ 202 */
203static void SetPower(Interface* self) { 203static void SetPower(Interface* self) {
204 u32* cmd_buff = Kernel::GetCommandBuffer(); 204 u32* cmd_buff = Kernel::GetCommandBuffer();
205 mic_power = static_cast<bool>(cmd_buff[1] & 0xFF); 205 mic_power = (cmd_buff[1] & 0xFF) != 0;
206 cmd_buff[1] = RESULT_SUCCESS.raw; // No error 206 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
207 LOG_WARNING(Service_MIC, "(STUBBED) called, mic_power=%u", mic_power); 207 LOG_WARNING(Service_MIC, "(STUBBED) called, mic_power=%u", mic_power);
208} 208}
@@ -252,7 +252,7 @@ static void SetIirFilterMic(Interface* self) {
252 */ 252 */
253static void SetClamp(Interface* self) { 253static void SetClamp(Interface* self) {
254 u32* cmd_buff = Kernel::GetCommandBuffer(); 254 u32* cmd_buff = Kernel::GetCommandBuffer();
255 clamp = static_cast<bool>(cmd_buff[1] & 0xFF); 255 clamp = (cmd_buff[1] & 0xFF) != 0;
256 cmd_buff[1] = RESULT_SUCCESS.raw; // No error 256 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
257 LOG_WARNING(Service_MIC, "(STUBBED) called, clamp=%u", clamp); 257 LOG_WARNING(Service_MIC, "(STUBBED) called, clamp=%u", clamp);
258} 258}
@@ -282,7 +282,7 @@ static void GetClamp(Interface* self) {
282 */ 282 */
283static void SetAllowShellClosed(Interface* self) { 283static void SetAllowShellClosed(Interface* self) {
284 u32* cmd_buff = Kernel::GetCommandBuffer(); 284 u32* cmd_buff = Kernel::GetCommandBuffer();
285 allow_shell_closed = static_cast<bool>(cmd_buff[1] & 0xFF); 285 allow_shell_closed = (cmd_buff[1] & 0xFF) != 0;
286 cmd_buff[1] = RESULT_SUCCESS.raw; // No error 286 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
287 LOG_WARNING(Service_MIC, "(STUBBED) called, allow_shell_closed=%u", allow_shell_closed); 287 LOG_WARNING(Service_MIC, "(STUBBED) called, allow_shell_closed=%u", allow_shell_closed);
288} 288}
diff --git a/src/core/hle/service/nfc/nfc.cpp b/src/core/hle/service/nfc/nfc.cpp
index e248285f9..fd3c7d9c2 100644
--- a/src/core/hle/service/nfc/nfc.cpp
+++ b/src/core/hle/service/nfc/nfc.cpp
@@ -11,6 +11,81 @@ namespace Service {
11namespace NFC { 11namespace NFC {
12 12
13static Kernel::SharedPtr<Kernel::Event> tag_in_range_event; 13static Kernel::SharedPtr<Kernel::Event> tag_in_range_event;
14static Kernel::SharedPtr<Kernel::Event> tag_out_of_range_event;
15static TagState nfc_tag_state = TagState::NotInitialized;
16static CommunicationStatus nfc_status = CommunicationStatus::NfcInitialized;
17
18void Initialize(Interface* self) {
19 u32* cmd_buff = Kernel::GetCommandBuffer();
20
21 u8 param = static_cast<u8>(cmd_buff[1] & 0xFF);
22
23 nfc_tag_state = TagState::NotScanning;
24
25 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
26 LOG_WARNING(Service_NFC, "(STUBBED) called, param=%u", param);
27}
28
29void Shutdown(Interface* self) {
30 u32* cmd_buff = Kernel::GetCommandBuffer();
31
32 u8 param = static_cast<u8>(cmd_buff[1] & 0xFF);
33 nfc_tag_state = TagState::NotInitialized;
34
35 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
36 LOG_WARNING(Service_NFC, "(STUBBED) called, param=%u", param);
37}
38
39void StartCommunication(Interface* self) {
40 u32* cmd_buff = Kernel::GetCommandBuffer();
41
42 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
43 LOG_WARNING(Service_NFC, "(STUBBED) called");
44}
45
46void StopCommunication(Interface* self) {
47 u32* cmd_buff = Kernel::GetCommandBuffer();
48
49 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
50 LOG_WARNING(Service_NFC, "(STUBBED) called");
51}
52
53void StartTagScanning(Interface* self) {
54 u32* cmd_buff = Kernel::GetCommandBuffer();
55
56 nfc_tag_state = TagState::TagInRange;
57 tag_in_range_event->Signal();
58
59 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
60 LOG_WARNING(Service_NFC, "(STUBBED) called");
61}
62
63void StopTagScanning(Interface* self) {
64 u32* cmd_buff = Kernel::GetCommandBuffer();
65
66 nfc_tag_state = TagState::NotScanning;
67
68 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
69 LOG_WARNING(Service_NFC, "(STUBBED) called");
70}
71
72void LoadAmiiboData(Interface* self) {
73 u32* cmd_buff = Kernel::GetCommandBuffer();
74
75 nfc_tag_state = TagState::TagDataLoaded;
76
77 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
78 LOG_WARNING(Service_NFC, "(STUBBED) called");
79}
80
81void ResetTagScanState(Interface* self) {
82 u32* cmd_buff = Kernel::GetCommandBuffer();
83
84 nfc_tag_state = TagState::NotScanning;
85
86 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
87 LOG_WARNING(Service_NFC, "(STUBBED) called");
88}
14 89
15void GetTagInRangeEvent(Interface* self) { 90void GetTagInRangeEvent(Interface* self) {
16 u32* cmd_buff = Kernel::GetCommandBuffer(); 91 u32* cmd_buff = Kernel::GetCommandBuffer();
@@ -22,16 +97,46 @@ void GetTagInRangeEvent(Interface* self) {
22 LOG_WARNING(Service_NFC, "(STUBBED) called"); 97 LOG_WARNING(Service_NFC, "(STUBBED) called");
23} 98}
24 99
100void GetTagOutOfRangeEvent(Interface* self) {
101 u32* cmd_buff = Kernel::GetCommandBuffer();
102
103 cmd_buff[0] = IPC::MakeHeader(0xC, 1, 2);
104 cmd_buff[1] = RESULT_SUCCESS.raw;
105 cmd_buff[2] = IPC::CopyHandleDesc();
106 cmd_buff[3] = Kernel::g_handle_table.Create(tag_out_of_range_event).MoveFrom();
107 LOG_WARNING(Service_NFC, "(STUBBED) called");
108}
109
110void GetTagState(Interface* self) {
111 u32* cmd_buff = Kernel::GetCommandBuffer();
112
113 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
114 cmd_buff[2] = static_cast<u8>(nfc_tag_state);
115 LOG_DEBUG(Service_NFC, "(STUBBED) called");
116}
117
118void CommunicationGetStatus(Interface* self) {
119 u32* cmd_buff = Kernel::GetCommandBuffer();
120
121 cmd_buff[1] = RESULT_SUCCESS.raw; // No error
122 cmd_buff[2] = static_cast<u8>(nfc_status);
123 LOG_DEBUG(Service_NFC, "(STUBBED) called");
124}
125
25void Init() { 126void Init() {
26 AddService(new NFC_M()); 127 AddService(new NFC_M());
27 AddService(new NFC_U()); 128 AddService(new NFC_U());
28 129
29 tag_in_range_event = 130 tag_in_range_event =
30 Kernel::Event::Create(Kernel::ResetType::OneShot, "NFC::tag_in_range_event"); 131 Kernel::Event::Create(Kernel::ResetType::OneShot, "NFC::tag_in_range_event");
132 tag_out_of_range_event =
133 Kernel::Event::Create(Kernel::ResetType::OneShot, "NFC::tag_out_range_event");
134 nfc_tag_state = TagState::NotInitialized;
31} 135}
32 136
33void Shutdown() { 137void Shutdown() {
34 tag_in_range_event = nullptr; 138 tag_in_range_event = nullptr;
139 tag_out_of_range_event = nullptr;
35} 140}
36 141
37} // namespace NFC 142} // namespace NFC
diff --git a/src/core/hle/service/nfc/nfc.h b/src/core/hle/service/nfc/nfc.h
index b02354201..a013bdae7 100644
--- a/src/core/hle/service/nfc/nfc.h
+++ b/src/core/hle/service/nfc/nfc.h
@@ -4,12 +4,103 @@
4 4
5#pragma once 5#pragma once
6 6
7#include "common/common_types.h"
8
7namespace Service { 9namespace Service {
8 10
9class Interface; 11class Interface;
10 12
11namespace NFC { 13namespace NFC {
12 14
15enum class TagState : u8 {
16 NotInitialized = 0,
17 NotScanning = 1,
18 Scanning = 2,
19 TagInRange = 3,
20 TagOutOfRange = 4,
21 TagDataLoaded = 5,
22};
23
24enum class CommunicationStatus : u8 {
25 AttemptInitialize = 1,
26 NfcInitialized = 2,
27};
28
29/**
30 * NFC::Initialize service function
31 * Inputs:
32 * 0 : Header code [0x00010040]
33 * 1 : (u8) unknown parameter. Can be either value 0x1 or 0x2
34 * Outputs:
35 * 1 : Result of function, 0 on success, otherwise error code
36 */
37void Initialize(Interface* self);
38
39/**
40 * NFC::Shutdown service function
41 * Inputs:
42 * 0 : Header code [0x00020040]
43 * 1 : (u8) unknown parameter
44 * Outputs:
45 * 1 : Result of function, 0 on success, otherwise error code
46 */
47void Shutdown(Interface* self);
48
49/**
50 * NFC::StartCommunication service function
51 * Inputs:
52 * 0 : Header code [0x00030000]
53 * Outputs:
54 * 1 : Result of function, 0 on success, otherwise error code
55 */
56void StartCommunication(Interface* self);
57
58/**
59 * NFC::StopCommunication service function
60 * Inputs:
61 * 0 : Header code [0x00040000]
62 * Outputs:
63 * 1 : Result of function, 0 on success, otherwise error code
64 */
65void StopCommunication(Interface* self);
66
67/**
68 * NFC::StartTagScanning service function
69 * Inputs:
70 * 0 : Header code [0x00050040]
71 * 1 : (u16) unknown. This is normally 0x0
72 * Outputs:
73 * 1 : Result of function, 0 on success, otherwise error code
74 */
75void StartTagScanning(Interface* self);
76
77/**
78 * NFC::StopTagScanning service function
79 * Inputs:
80 * 0 : Header code [0x00060000]
81 * Outputs:
82 * 1 : Result of function, 0 on success, otherwise error code
83 */
84void StopTagScanning(Interface* self);
85
86/**
87 * NFC::LoadAmiiboData service function
88 * Inputs:
89 * 0 : Header code [0x00070000]
90 * Outputs:
91 * 1 : Result of function, 0 on success, otherwise error code
92 */
93void LoadAmiiboData(Interface* self);
94
95/**
96 * NFC::ResetTagScanState service function
97 * Inputs:
98 * 0 : Header code [0x00080000]
99 * Outputs:
100 * 1 : Result of function, 0 on success, otherwise error code
101 */
102void ResetTagScanState(Interface* self);
103
13/** 104/**
14 * NFC::GetTagInRangeEvent service function 105 * NFC::GetTagInRangeEvent service function
15 * Inputs: 106 * Inputs:
@@ -21,6 +112,37 @@ namespace NFC {
21 */ 112 */
22void GetTagInRangeEvent(Interface* self); 113void GetTagInRangeEvent(Interface* self);
23 114
115/**
116 * NFC::GetTagOutOfRangeEvent service function
117 * Inputs:
118 * 0 : Header code [0x000C0000]
119 * Outputs:
120 * 1 : Result of function, 0 on success, otherwise error code
121 * 2 : Copy handle descriptor
122 * 3 : Event Handle
123 */
124void GetTagOutOfRangeEvent(Interface* self);
125
126/**
127 * NFC::GetTagState service function
128 * Inputs:
129 * 0 : Header code [0x000D0000]
130 * Outputs:
131 * 1 : Result of function, 0 on success, otherwise error code
132 * 2 : (u8) Tag state
133 */
134void GetTagState(Interface* self);
135
136/**
137 * NFC::CommunicationGetStatus service function
138 * Inputs:
139 * 0 : Header code [0x000F0000]
140 * Outputs:
141 * 1 : Result of function, 0 on success, otherwise error code
142 * 2 : (u8) Communication state
143 */
144void CommunicationGetStatus(Interface* self);
145
24/// Initialize all NFC services. 146/// Initialize all NFC services.
25void Init(); 147void Init();
26 148
diff --git a/src/core/hle/service/nfc/nfc_m.cpp b/src/core/hle/service/nfc/nfc_m.cpp
index f43b4029a..ebe637650 100644
--- a/src/core/hle/service/nfc/nfc_m.cpp
+++ b/src/core/hle/service/nfc/nfc_m.cpp
@@ -11,18 +11,19 @@ namespace NFC {
11const Interface::FunctionInfo FunctionTable[] = { 11const Interface::FunctionInfo FunctionTable[] = {
12 // clang-format off 12 // clang-format off
13 // nfc:u shared commands 13 // nfc:u shared commands
14 {0x00010040, nullptr, "Initialize"}, 14 {0x00010040, Initialize, "Initialize"},
15 {0x00020040, nullptr, "Shutdown"}, 15 {0x00020040, Shutdown, "Shutdown"},
16 {0x00030000, nullptr, "StartCommunication"}, 16 {0x00030000, StartCommunication, "StartCommunication"},
17 {0x00040000, nullptr, "StopCommunication"}, 17 {0x00040000, StopCommunication, "StopCommunication"},
18 {0x00050040, nullptr, "StartTagScanning"}, 18 {0x00050040, StartTagScanning, "StartTagScanning"},
19 {0x00060000, nullptr, "StopTagScanning"}, 19 {0x00060000, StopTagScanning, "StopTagScanning"},
20 {0x00070000, nullptr, "LoadAmiiboData"}, 20 {0x00070000, LoadAmiiboData, "LoadAmiiboData"},
21 {0x00080000, nullptr, "ResetTagScanState"}, 21 {0x00080000, ResetTagScanState, "ResetTagScanState"},
22 {0x00090002, nullptr, "UpdateStoredAmiiboData"}, 22 {0x00090002, nullptr, "UpdateStoredAmiiboData"},
23 {0x000B0000, GetTagInRangeEvent, "GetTagInRangeEvent"}, 23 {0x000B0000, GetTagInRangeEvent, "GetTagInRangeEvent"},
24 {0x000D0000, nullptr, "GetTagState"}, 24 {0x000C0000, GetTagOutOfRangeEvent, "GetTagOutOfRangeEvent"},
25 {0x000F0000, nullptr, "CommunicationGetStatus"}, 25 {0x000D0000, GetTagState, "GetTagState"},
26 {0x000F0000, CommunicationGetStatus, "CommunicationGetStatus"},
26 {0x00100000, nullptr, "GetTagInfo2"}, 27 {0x00100000, nullptr, "GetTagInfo2"},
27 {0x00110000, nullptr, "GetTagInfo"}, 28 {0x00110000, nullptr, "GetTagInfo"},
28 {0x00120000, nullptr, "CommunicationGetResult"}, 29 {0x00120000, nullptr, "CommunicationGetResult"},
diff --git a/src/core/hle/service/nfc/nfc_u.cpp b/src/core/hle/service/nfc/nfc_u.cpp
index 4b5200ae8..5a40c7874 100644
--- a/src/core/hle/service/nfc/nfc_u.cpp
+++ b/src/core/hle/service/nfc/nfc_u.cpp
@@ -10,18 +10,19 @@ namespace NFC {
10 10
11const Interface::FunctionInfo FunctionTable[] = { 11const Interface::FunctionInfo FunctionTable[] = {
12 // clang-format off 12 // clang-format off
13 {0x00010040, nullptr, "Initialize"}, 13 {0x00010040, Initialize, "Initialize"},
14 {0x00020040, nullptr, "Shutdown"}, 14 {0x00020040, Shutdown, "Shutdown"},
15 {0x00030000, nullptr, "StartCommunication"}, 15 {0x00030000, StartCommunication, "StartCommunication"},
16 {0x00040000, nullptr, "StopCommunication"}, 16 {0x00040000, StopCommunication, "StopCommunication"},
17 {0x00050040, nullptr, "StartTagScanning"}, 17 {0x00050040, StartTagScanning, "StartTagScanning"},
18 {0x00060000, nullptr, "StopTagScanning"}, 18 {0x00060000, StopTagScanning, "StopTagScanning"},
19 {0x00070000, nullptr, "LoadAmiiboData"}, 19 {0x00070000, LoadAmiiboData, "LoadAmiiboData"},
20 {0x00080000, nullptr, "ResetTagScanState"}, 20 {0x00080000, ResetTagScanState, "ResetTagScanState"},
21 {0x00090002, nullptr, "UpdateStoredAmiiboData"}, 21 {0x00090002, nullptr, "UpdateStoredAmiiboData"},
22 {0x000B0000, GetTagInRangeEvent, "GetTagInRangeEvent"}, 22 {0x000B0000, GetTagInRangeEvent, "GetTagInRangeEvent"},
23 {0x000D0000, nullptr, "GetTagState"}, 23 {0x000C0000, GetTagOutOfRangeEvent, "GetTagOutOfRangeEvent"},
24 {0x000F0000, nullptr, "CommunicationGetStatus"}, 24 {0x000D0000, GetTagState, "GetTagState"},
25 {0x000F0000, CommunicationGetStatus, "CommunicationGetStatus"},
25 {0x00100000, nullptr, "GetTagInfo2"}, 26 {0x00100000, nullptr, "GetTagInfo2"},
26 {0x00110000, nullptr, "GetTagInfo"}, 27 {0x00110000, nullptr, "GetTagInfo"},
27 {0x00120000, nullptr, "CommunicationGetResult"}, 28 {0x00120000, nullptr, "CommunicationGetResult"},
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index 1a1ee90b2..fa8c13d36 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -15,7 +15,6 @@
15#include "common/vector_math.h" 15#include "common/vector_math.h"
16#include "core/core_timing.h" 16#include "core/core_timing.h"
17#include "core/hle/service/gsp_gpu.h" 17#include "core/hle/service/gsp_gpu.h"
18#include "core/hle/service/hid/hid.h"
19#include "core/hw/gpu.h" 18#include "core/hw/gpu.h"
20#include "core/hw/hw.h" 19#include "core/hw/hw.h"
21#include "core/memory.h" 20#include "core/memory.h"
@@ -33,7 +32,7 @@ namespace GPU {
33Regs g_regs; 32Regs g_regs;
34 33
35/// 268MHz CPU clocks / 60Hz frames per second 34/// 268MHz CPU clocks / 60Hz frames per second
36const u64 frame_ticks = 268123480ull / 60; 35const u64 frame_ticks = BASE_CLOCK_RATE_ARM11 / 60;
37/// Event id for CoreTiming 36/// Event id for CoreTiming
38static int vblank_event; 37static int vblank_event;
39/// Total number of frames drawn 38/// Total number of frames drawn
@@ -551,9 +550,6 @@ static void VBlankCallback(u64 userdata, int cycles_late) {
551 Service::GSP::SignalInterrupt(Service::GSP::InterruptId::PDC0); 550 Service::GSP::SignalInterrupt(Service::GSP::InterruptId::PDC0);
552 Service::GSP::SignalInterrupt(Service::GSP::InterruptId::PDC1); 551 Service::GSP::SignalInterrupt(Service::GSP::InterruptId::PDC1);
553 552
554 // Check for user input updates
555 Service::HID::Update();
556
557 if (!Settings::values.use_vsync && Settings::values.toggle_framelimit) { 553 if (!Settings::values.use_vsync && Settings::values.toggle_framelimit) {
558 FrameLimiter(); 554 FrameLimiter();
559 } 555 }
diff --git a/src/core/loader/3dsx.cpp b/src/core/loader/3dsx.cpp
index 1c10740a0..09266e8b0 100644
--- a/src/core/loader/3dsx.cpp
+++ b/src/core/loader/3dsx.cpp
@@ -177,18 +177,34 @@ static THREEDSX_Error Load3DSXFile(FileUtil::IOFile& file, u32 base_addr,
177 pos += table.skip; 177 pos += table.skip;
178 s32 num_patches = table.patch; 178 s32 num_patches = table.patch;
179 while (0 < num_patches && pos < end_pos) { 179 while (0 < num_patches && pos < end_pos) {
180 u32 in_addr = 180 u32 in_addr = base_addr + static_cast<u32>(reinterpret_cast<u8*>(pos) -
181 static_cast<u32>(reinterpret_cast<u8*>(pos) - program_image.data()); 181 program_image.data());
182 u32 addr = TranslateAddr(*pos, &loadinfo, offsets); 182 u32 orig_data = *pos;
183 LOG_TRACE(Loader, "Patching %08X <-- rel(%08X,%d) (%08X)", 183 u32 sub_type = orig_data >> (32 - 4);
184 base_addr + in_addr, addr, current_segment_reloc_table, *pos); 184 u32 addr = TranslateAddr(orig_data & ~0xF0000000, &loadinfo, offsets);
185 LOG_TRACE(Loader, "Patching %08X <-- rel(%08X,%d) (%08X)", in_addr, addr,
186 current_segment_reloc_table, *pos);
185 switch (current_segment_reloc_table) { 187 switch (current_segment_reloc_table) {
186 case 0: 188 case 0: {
187 *pos = (addr); 189 if (sub_type != 0)
190 return ERROR_READ;
191 *pos = addr;
188 break; 192 break;
189 case 1: 193 }
190 *pos = static_cast<u32>(addr - in_addr); 194 case 1: {
195 u32 data = addr - in_addr;
196 switch (sub_type) {
197 case 0: // 32-bit signed offset
198 *pos = data;
199 break;
200 case 1: // 31-bit signed offset
201 *pos = data & ~(1U << 31);
202 break;
203 default:
204 return ERROR_READ;
205 }
191 break; 206 break;
207 }
192 default: 208 default:
193 break; // this should never happen 209 break; // this should never happen
194 } 210 }
diff --git a/src/core/settings.h b/src/core/settings.h
index 8dbda653a..e22ce0f16 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -104,6 +104,7 @@ struct Values {
104 // Audio 104 // Audio
105 std::string sink_id; 105 std::string sink_id;
106 bool enable_audio_stretching; 106 bool enable_audio_stretching;
107 std::string audio_device_id;
107 108
108 // Debugging 109 // Debugging
109 bool use_gdbstub; 110 bool use_gdbstub;
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 6ca319b59..d55b84ce0 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -50,10 +50,12 @@ set(HEADERS
50 50
51if(ARCHITECTURE_x86_64) 51if(ARCHITECTURE_x86_64)
52 set(SRCS ${SRCS} 52 set(SRCS ${SRCS}
53 shader/shader_jit_x64.cpp) 53 shader/shader_jit_x64.cpp
54 shader/shader_jit_x64_compiler.cpp)
54 55
55 set(HEADERS ${HEADERS} 56 set(HEADERS ${HEADERS}
56 shader/shader_jit_x64.h) 57 shader/shader_jit_x64.h
58 shader/shader_jit_x64_compiler.h)
57endif() 59endif()
58 60
59create_directory_groups(${SRCS} ${HEADERS}) 61create_directory_groups(${SRCS} ${HEADERS})
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index ea58e9f54..eb79974a8 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -142,16 +142,18 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
142 MICROPROFILE_SCOPE(GPU_Drawing); 142 MICROPROFILE_SCOPE(GPU_Drawing);
143 immediate_attribute_id = 0; 143 immediate_attribute_id = 0;
144 144
145 Shader::UnitState shader_unit; 145 auto* shader_engine = Shader::GetEngine();
146 g_state.vs.Setup(); 146 shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
147 147
148 // Send to vertex shader 148 // Send to vertex shader
149 if (g_debug_context) 149 if (g_debug_context)
150 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, 150 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
151 static_cast<void*>(&immediate_input)); 151 static_cast<void*>(&immediate_input));
152 g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes + 1); 152 Shader::UnitState shader_unit;
153 Shader::OutputVertex output_vertex = 153 shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1);
154 shader_unit.output_registers.ToVertex(regs.vs); 154 shader_engine->Run(g_state.vs, shader_unit);
155 auto output_vertex = Shader::OutputVertex::FromRegisters(
156 shader_unit.registers.output, regs, regs.vs.output_mask);
155 157
156 // Send to renderer 158 // Send to renderer
157 using Pica::Shader::OutputVertex; 159 using Pica::Shader::OutputVertex;
@@ -243,8 +245,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
243 unsigned int vertex_cache_pos = 0; 245 unsigned int vertex_cache_pos = 0;
244 vertex_cache_ids.fill(-1); 246 vertex_cache_ids.fill(-1);
245 247
248 auto* shader_engine = Shader::GetEngine();
246 Shader::UnitState shader_unit; 249 Shader::UnitState shader_unit;
247 g_state.vs.Setup(); 250
251 shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
248 252
249 for (unsigned int index = 0; index < regs.num_vertices; ++index) { 253 for (unsigned int index = 0; index < regs.num_vertices; ++index) {
250 // Indexed rendering doesn't use the start offset 254 // Indexed rendering doesn't use the start offset
@@ -283,10 +287,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
283 if (g_debug_context) 287 if (g_debug_context)
284 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, 288 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
285 (void*)&input); 289 (void*)&input);
286 g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes()); 290 shader_unit.LoadInputVertex(input, loader.GetNumTotalAttributes());
291 shader_engine->Run(g_state.vs, shader_unit);
287 292
288 // Retrieve vertex from register data 293 // Retrieve vertex from register data
289 output_vertex = shader_unit.output_registers.ToVertex(regs.vs); 294 output_vertex = Shader::OutputVertex::FromRegisters(shader_unit.registers.output,
295 regs, regs.vs.output_mask);
290 296
291 if (is_indexed) { 297 if (is_indexed) {
292 vertex_cache[vertex_cache_pos] = output_vertex; 298 vertex_cache[vertex_cache_pos] = output_vertex;
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index ce2bd455e..b4a77c632 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -499,7 +499,7 @@ void Init() {
499} 499}
500 500
501void Shutdown() { 501void Shutdown() {
502 Shader::ClearCache(); 502 Shader::Shutdown();
503} 503}
504 504
505template <typename T> 505template <typename T>
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 5a306a5c8..f3674e965 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -716,8 +716,6 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(PAddr addr, u32 size) {
716 716
717bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransferConfig& config) { 717bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransferConfig& config) {
718 MICROPROFILE_SCOPE(OpenGL_Blits); 718 MICROPROFILE_SCOPE(OpenGL_Blits);
719 using PixelFormat = CachedSurface::PixelFormat;
720 using SurfaceType = CachedSurface::SurfaceType;
721 719
722 CachedSurface src_params; 720 CachedSurface src_params;
723 src_params.addr = config.GetPhysicalInputAddress(); 721 src_params.addr = config.GetPhysicalInputAddress();
@@ -748,7 +746,8 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe
748 746
749 // Adjust the source rectangle to take into account parts of the input lines being cropped 747 // Adjust the source rectangle to take into account parts of the input lines being cropped
750 if (config.input_width > config.output_width) { 748 if (config.input_width > config.output_width) {
751 src_rect.right -= (config.input_width - config.output_width) * src_surface->res_scale_width; 749 src_rect.right -= static_cast<int>((config.input_width - config.output_width) *
750 src_surface->res_scale_width);
752 } 751 }
753 752
754 // Require destination surface to have same resolution scale as source to preserve scaling 753 // Require destination surface to have same resolution scale as source to preserve scaling
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index e1a9cb361..cc3e4bed5 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -76,7 +76,7 @@ union PicaShaderConfig {
76 } 76 }
77 77
78 state.fog_mode = regs.fog_mode; 78 state.fog_mode = regs.fog_mode;
79 state.fog_flip = regs.fog_flip; 79 state.fog_flip = regs.fog_flip != 0;
80 80
81 state.combiner_buffer_input = regs.tev_combiner_buffer_input.update_mask_rgb.Value() | 81 state.combiner_buffer_input = regs.tev_combiner_buffer_input.update_mask_rgb.Value() |
82 regs.tev_combiner_buffer_input.update_mask_a.Value() << 4; 82 regs.tev_combiner_buffer_input.update_mask_a.Value() << 4;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index ef3b06a7b..1e7eedecb 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -172,7 +172,6 @@ bool RasterizerCacheOpenGL::TryBlitSurfaces(CachedSurface* src_surface,
172 const MathUtil::Rectangle<int>& src_rect, 172 const MathUtil::Rectangle<int>& src_rect,
173 CachedSurface* dst_surface, 173 CachedSurface* dst_surface,
174 const MathUtil::Rectangle<int>& dst_rect) { 174 const MathUtil::Rectangle<int>& dst_rect) {
175 using SurfaceType = CachedSurface::SurfaceType;
176 175
177 if (!CachedSurface::CheckFormatsBlittable(src_surface->pixel_format, 176 if (!CachedSurface::CheckFormatsBlittable(src_surface->pixel_format,
178 dst_surface->pixel_format)) { 177 dst_surface->pixel_format)) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index b50e8292b..f57fdb3cc 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -8,7 +8,14 @@
8#include <memory> 8#include <memory>
9#include <set> 9#include <set>
10#include <tuple> 10#include <tuple>
11#ifdef __GNUC__
12#pragma GCC diagnostic push
13#pragma GCC diagnostic ignored "-Wunused-local-typedef"
14#endif
11#include <boost/icl/interval_map.hpp> 15#include <boost/icl/interval_map.hpp>
16#ifdef __GNUC__
17#pragma GCC diagnostic pop
18#endif
12#include <glad/glad.h> 19#include <glad/glad.h>
13#include "common/assert.h" 20#include "common/assert.h"
14#include "common/common_funcs.h" 21#include "common/common_funcs.h"
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index a4aa3c9e0..2da50bd62 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -2,14 +2,8 @@
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <atomic>
6#include <cmath> 5#include <cmath>
7#include <cstring> 6#include <cstring>
8#include <unordered_map>
9#include <utility>
10#include <boost/range/algorithm/fill.hpp>
11#include "common/bit_field.h"
12#include "common/hash.h"
13#include "common/logging/log.h" 7#include "common/logging/log.h"
14#include "common/microprofile.h" 8#include "common/microprofile.h"
15#include "video_core/pica.h" 9#include "video_core/pica.h"
@@ -25,7 +19,8 @@ namespace Pica {
25 19
26namespace Shader { 20namespace Shader {
27 21
28OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { 22OutputVertex OutputVertex::FromRegisters(Math::Vec4<float24> output_regs[16], const Regs& regs,
23 u32 output_mask) {
29 // Setup output data 24 // Setup output data
30 OutputVertex ret; 25 OutputVertex ret;
31 // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to 26 // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
@@ -33,13 +28,13 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const {
33 unsigned index = 0; 28 unsigned index = 0;
34 for (unsigned i = 0; i < 7; ++i) { 29 for (unsigned i = 0; i < 7; ++i) {
35 30
36 if (index >= g_state.regs.vs_output_total) 31 if (index >= regs.vs_output_total)
37 break; 32 break;
38 33
39 if ((config.output_mask & (1 << i)) == 0) 34 if ((output_mask & (1 << i)) == 0)
40 continue; 35 continue;
41 36
42 const auto& output_register_map = g_state.regs.vs_output_attributes[index]; 37 const auto& output_register_map = regs.vs_output_attributes[index];
43 38
44 u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y, 39 u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y,
45 output_register_map.map_z, output_register_map.map_w}; 40 output_register_map.map_z, output_register_map.map_w};
@@ -47,7 +42,7 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const {
47 for (unsigned comp = 0; comp < 4; ++comp) { 42 for (unsigned comp = 0; comp < 4; ++comp) {
48 float24* out = ((float24*)&ret) + semantics[comp]; 43 float24* out = ((float24*)&ret) + semantics[comp];
49 if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { 44 if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
50 *out = value[i][comp]; 45 *out = output_regs[i][comp];
51 } else { 46 } else {
52 // Zero output so that attributes which aren't output won't have denormals in them, 47 // Zero output so that attributes which aren't output won't have denormals in them,
53 // which would slow us down later. 48 // which would slow us down later.
@@ -76,86 +71,41 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const {
76 return ret; 71 return ret;
77} 72}
78 73
79#ifdef ARCHITECTURE_x86_64 74void UnitState::LoadInputVertex(const InputVertex& input, int num_attributes) {
80static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map; 75 // Setup input register table
81static const JitShader* jit_shader; 76 const auto& attribute_register_map = g_state.regs.vs.input_register_map;
82#endif // ARCHITECTURE_x86_64 77
78 for (int i = 0; i < num_attributes; i++)
79 registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
80}
81
82MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
83 83
84void ClearCache() {
85#ifdef ARCHITECTURE_x86_64 84#ifdef ARCHITECTURE_x86_64
86 shader_map.clear(); 85static std::unique_ptr<JitX64Engine> jit_engine;
87#endif // ARCHITECTURE_x86_64 86#endif // ARCHITECTURE_x86_64
88} 87static InterpreterEngine interpreter_engine;
89 88
90void ShaderSetup::Setup() { 89ShaderEngine* GetEngine() {
91#ifdef ARCHITECTURE_x86_64 90#ifdef ARCHITECTURE_x86_64
91 // TODO(yuriks): Re-initialize on each change rather than being persistent
92 if (VideoCore::g_shader_jit_enabled) { 92 if (VideoCore::g_shader_jit_enabled) {
93 u64 cache_key = 93 if (jit_engine == nullptr) {
94 Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ 94 jit_engine = std::make_unique<JitX64Engine>();
95 Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data));
96
97 auto iter = shader_map.find(cache_key);
98 if (iter != shader_map.end()) {
99 jit_shader = iter->second.get();
100 } else {
101 auto shader = std::make_unique<JitShader>();
102 shader->Compile();
103 jit_shader = shader.get();
104 shader_map[cache_key] = std::move(shader);
105 } 95 }
96 return jit_engine.get();
106 } 97 }
107#endif // ARCHITECTURE_x86_64 98#endif // ARCHITECTURE_x86_64
108}
109
110MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
111
112void ShaderSetup::Run(UnitState& state, const InputVertex& input, int num_attributes) {
113 auto& config = g_state.regs.vs;
114 auto& setup = g_state.vs;
115
116 MICROPROFILE_SCOPE(GPU_Shader);
117 99
118 // Setup input register table 100 return &interpreter_engine;
119 const auto& attribute_register_map = config.input_register_map; 101}
120
121 for (unsigned i = 0; i < num_attributes; i++)
122 state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
123
124 state.conditional_code[0] = false;
125 state.conditional_code[1] = false;
126 102
103void Shutdown() {
127#ifdef ARCHITECTURE_x86_64 104#ifdef ARCHITECTURE_x86_64
128 if (VideoCore::g_shader_jit_enabled) { 105 jit_engine = nullptr;
129 jit_shader->Run(setup, state, config.main_offset);
130 } else {
131 DebugData<false> dummy_debug_data;
132 RunInterpreter(setup, state, dummy_debug_data, config.main_offset);
133 }
134#else
135 DebugData<false> dummy_debug_data;
136 RunInterpreter(setup, state, dummy_debug_data, config.main_offset);
137#endif // ARCHITECTURE_x86_64 106#endif // ARCHITECTURE_x86_64
138} 107}
139 108
140DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes,
141 const Regs::ShaderConfig& config,
142 const ShaderSetup& setup) {
143 UnitState state;
144 DebugData<true> debug_data;
145
146 // Setup input register table
147 boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero()));
148 const auto& attribute_register_map = config.input_register_map;
149 for (unsigned i = 0; i < num_attributes; i++)
150 state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
151
152 state.conditional_code[0] = false;
153 state.conditional_code[1] = false;
154
155 RunInterpreter(setup, state, debug_data, config.main_offset);
156 return debug_data;
157}
158
159} // namespace Shader 109} // namespace Shader
160 110
161} // namespace Pica 111} // namespace Pica
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 2b07759b9..44d9f76c3 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -6,7 +6,6 @@
6 6
7#include <array> 7#include <array>
8#include <cstddef> 8#include <cstddef>
9#include <memory>
10#include <type_traits> 9#include <type_traits>
11#include <nihstro/shader_bytecode.h> 10#include <nihstro/shader_bytecode.h>
12#include "common/assert.h" 11#include "common/assert.h"
@@ -15,7 +14,6 @@
15#include "common/vector_math.h" 14#include "common/vector_math.h"
16#include "video_core/pica.h" 15#include "video_core/pica.h"
17#include "video_core/pica_types.h" 16#include "video_core/pica_types.h"
18#include "video_core/shader/debug_data.h"
19 17
20using nihstro::RegisterType; 18using nihstro::RegisterType;
21using nihstro::SourceRegister; 19using nihstro::SourceRegister;
@@ -75,19 +73,13 @@ struct OutputVertex {
75 ret.Lerp(factor, v1); 73 ret.Lerp(factor, v1);
76 return ret; 74 return ret;
77 } 75 }
76
77 static OutputVertex FromRegisters(Math::Vec4<float24> output_regs[16], const Regs& regs,
78 u32 output_mask);
78}; 79};
79static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); 80static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
80static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); 81static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
81 82
82struct OutputRegisters {
83 OutputRegisters() = default;
84
85 alignas(16) Math::Vec4<float24> value[16];
86
87 OutputVertex ToVertex(const Regs::ShaderConfig& config) const;
88};
89static_assert(std::is_pod<OutputRegisters>::value, "Structure is not POD");
90
91/** 83/**
92 * This structure contains the state information that needs to be unique for a shader unit. The 3DS 84 * This structure contains the state information that needs to be unique for a shader unit. The 3DS
93 * has four shader units that process shaders in parallel. At the present, Citra only implements a 85 * has four shader units that process shaders in parallel. At the present, Citra only implements a
@@ -100,11 +92,10 @@ struct UnitState {
100 // required to be 16-byte aligned. 92 // required to be 16-byte aligned.
101 alignas(16) Math::Vec4<float24> input[16]; 93 alignas(16) Math::Vec4<float24> input[16];
102 alignas(16) Math::Vec4<float24> temporary[16]; 94 alignas(16) Math::Vec4<float24> temporary[16];
95 alignas(16) Math::Vec4<float24> output[16];
103 } registers; 96 } registers;
104 static_assert(std::is_pod<Registers>::value, "Structure is not POD"); 97 static_assert(std::is_pod<Registers>::value, "Structure is not POD");
105 98
106 OutputRegisters output_registers;
107
108 bool conditional_code[2]; 99 bool conditional_code[2];
109 100
110 // Two Address registers and one loop counter 101 // Two Address registers and one loop counter
@@ -130,7 +121,7 @@ struct UnitState {
130 static size_t OutputOffset(const DestRegister& reg) { 121 static size_t OutputOffset(const DestRegister& reg) {
131 switch (reg.GetRegisterType()) { 122 switch (reg.GetRegisterType()) {
132 case RegisterType::Output: 123 case RegisterType::Output:
133 return offsetof(UnitState, output_registers.value) + 124 return offsetof(UnitState, registers.output) +
134 reg.GetIndex() * sizeof(Math::Vec4<float24>); 125 reg.GetIndex() * sizeof(Math::Vec4<float24>);
135 126
136 case RegisterType::Temporary: 127 case RegisterType::Temporary:
@@ -142,13 +133,17 @@ struct UnitState {
142 return 0; 133 return 0;
143 } 134 }
144 } 135 }
145};
146 136
147/// Clears the shader cache 137 /**
148void ClearCache(); 138 * Loads the unit state with an input vertex.
139 *
140 * @param input Input vertex into the shader
141 * @param num_attributes The number of vertex shader attributes to load
142 */
143 void LoadInputVertex(const InputVertex& input, int num_attributes);
144};
149 145
150struct ShaderSetup { 146struct ShaderSetup {
151
152 struct { 147 struct {
153 // The float uniforms are accessed by the shader JIT using SSE instructions, and are 148 // The float uniforms are accessed by the shader JIT using SSE instructions, and are
154 // therefore required to be 16-byte aligned. 149 // therefore required to be 16-byte aligned.
@@ -173,32 +168,37 @@ struct ShaderSetup {
173 std::array<u32, 1024> program_code; 168 std::array<u32, 1024> program_code;
174 std::array<u32, 1024> swizzle_data; 169 std::array<u32, 1024> swizzle_data;
175 170
171 /// Data private to ShaderEngines
172 struct EngineData {
173 unsigned int entry_point;
174 /// Used by the JIT, points to a compiled shader object.
175 const void* cached_shader = nullptr;
176 } engine_data;
177};
178
179class ShaderEngine {
180public:
181 virtual ~ShaderEngine() = default;
182
176 /** 183 /**
177 * Performs any shader unit setup that only needs to happen once per shader (as opposed to once 184 * Performs any shader unit setup that only needs to happen once per shader (as opposed to once
178 * per vertex, which would happen within the `Run` function). 185 * per vertex, which would happen within the `Run` function).
179 */ 186 */
180 void Setup(); 187 virtual void SetupBatch(ShaderSetup& setup, unsigned int entry_point) = 0;
181
182 /**
183 * Runs the currently setup shader
184 * @param state Shader unit state, must be setup per shader and per shader unit
185 * @param input Input vertex into the shader
186 * @param num_attributes The number of vertex shader attributes
187 */
188 void Run(UnitState& state, const InputVertex& input, int num_attributes);
189 188
190 /** 189 /**
191 * Produce debug information based on the given shader and input vertex 190 * Runs the currently setup shader.
192 * @param input Input vertex into the shader 191 *
193 * @param num_attributes The number of vertex shader attributes 192 * @param setup Shader engine state, must be setup with SetupBatch on each shader change.
194 * @param config Configuration object for the shader pipeline 193 * @param state Shader unit state, must be setup with input data before each shader invocation.
195 * @param setup Setup object for the shader pipeline
196 * @return Debug information for this shader with regards to the given vertex
197 */ 194 */
198 DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, 195 virtual void Run(const ShaderSetup& setup, UnitState& state) const = 0;
199 const Regs::ShaderConfig& config, const ShaderSetup& setup);
200}; 196};
201 197
198// TODO(yuriks): Remove and make it non-global state somewhere
199ShaderEngine* GetEngine();
200void Shutdown();
201
202} // namespace Shader 202} // namespace Shader
203 203
204} // namespace Pica 204} // namespace Pica
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index 20fb9754b..c0c89b857 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -7,10 +7,12 @@
7#include <cmath> 7#include <cmath>
8#include <numeric> 8#include <numeric>
9#include <boost/container/static_vector.hpp> 9#include <boost/container/static_vector.hpp>
10#include <boost/range/algorithm/fill.hpp>
10#include <nihstro/shader_bytecode.h> 11#include <nihstro/shader_bytecode.h>
11#include "common/assert.h" 12#include "common/assert.h"
12#include "common/common_types.h" 13#include "common/common_types.h"
13#include "common/logging/log.h" 14#include "common/logging/log.h"
15#include "common/microprofile.h"
14#include "common/vector_math.h" 16#include "common/vector_math.h"
15#include "video_core/pica_state.h" 17#include "video_core/pica_state.h"
16#include "video_core/pica_types.h" 18#include "video_core/pica_types.h"
@@ -37,12 +39,15 @@ struct CallStackElement {
37}; 39};
38 40
39template <bool Debug> 41template <bool Debug>
40void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, 42static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data,
41 unsigned offset) { 43 unsigned offset) {
42 // TODO: Is there a maximal size for this? 44 // TODO: Is there a maximal size for this?
43 boost::container::static_vector<CallStackElement, 16> call_stack; 45 boost::container::static_vector<CallStackElement, 16> call_stack;
44 u32 program_counter = offset; 46 u32 program_counter = offset;
45 47
48 state.conditional_code[0] = false;
49 state.conditional_code[1] = false;
50
46 auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset, 51 auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset,
47 u8 repeat_count, u8 loop_increment) { 52 u8 repeat_count, u8 loop_increment) {
48 // -1 to make sure when incrementing the PC we end up at the correct offset 53 // -1 to make sure when incrementing the PC we end up at the correct offset
@@ -73,9 +78,9 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
73 } 78 }
74 }; 79 };
75 80
76 const auto& uniforms = g_state.vs.uniforms; 81 const auto& uniforms = setup.uniforms;
77 const auto& swizzle_data = g_state.vs.swizzle_data; 82 const auto& swizzle_data = setup.swizzle_data;
78 const auto& program_code = g_state.vs.program_code; 83 const auto& program_code = setup.program_code;
79 84
80 // Placeholder for invalid inputs 85 // Placeholder for invalid inputs
81 static float24 dummy_vec4_float24[4]; 86 static float24 dummy_vec4_float24[4];
@@ -170,7 +175,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
170 175
171 float24* dest = 176 float24* dest =
172 (instr.common.dest.Value() < 0x10) 177 (instr.common.dest.Value() < 0x10)
173 ? &state.output_registers.value[instr.common.dest.Value().GetIndex()][0] 178 ? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
174 : (instr.common.dest.Value() < 0x20) 179 : (instr.common.dest.Value() < 0x20)
175 ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] 180 ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
176 : dummy_vec4_float24; 181 : dummy_vec4_float24;
@@ -513,7 +518,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
513 518
514 float24* dest = 519 float24* dest =
515 (instr.mad.dest.Value() < 0x10) 520 (instr.mad.dest.Value() < 0x10)
516 ? &state.output_registers.value[instr.mad.dest.Value().GetIndex()][0] 521 ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
517 : (instr.mad.dest.Value() < 0x20) 522 : (instr.mad.dest.Value() < 0x20)
518 ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] 523 ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
519 : dummy_vec4_float24; 524 : dummy_vec4_float24;
@@ -647,9 +652,33 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
647 } 652 }
648} 653}
649 654
650// Explicit instantiation 655void InterpreterEngine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) {
651template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData<false>&, unsigned offset); 656 ASSERT(entry_point < 1024);
652template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData<true>&, unsigned offset); 657 setup.engine_data.entry_point = entry_point;
658}
659
660MICROPROFILE_DECLARE(GPU_Shader);
661
662void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state) const {
663
664 MICROPROFILE_SCOPE(GPU_Shader);
665
666 DebugData<false> dummy_debug_data;
667 RunInterpreter(setup, state, dummy_debug_data, setup.engine_data.entry_point);
668}
669
670DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup,
671 const InputVertex& input,
672 int num_attributes) const {
673 UnitState state;
674 DebugData<true> debug_data;
675
676 // Setup input register table
677 boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero()));
678 state.LoadInputVertex(input, num_attributes);
679 RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point);
680 return debug_data;
681}
653 682
654} // namespace 683} // namespace
655 684
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
index d31dcd7a6..d6c0e2d8c 100644
--- a/src/video_core/shader/shader_interpreter.h
+++ b/src/video_core/shader/shader_interpreter.h
@@ -4,18 +4,28 @@
4 4
5#pragma once 5#pragma once
6 6
7#include "video_core/shader/debug_data.h"
8#include "video_core/shader/shader.h"
9
7namespace Pica { 10namespace Pica {
8 11
9namespace Shader { 12namespace Shader {
10 13
11struct UnitState; 14class InterpreterEngine final : public ShaderEngine {
12 15public:
13template <bool Debug> 16 void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override;
14struct DebugData; 17 void Run(const ShaderSetup& setup, UnitState& state) const override;
15 18
16template <bool Debug> 19 /**
17void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, 20 * Produce debug information based on the given shader and input vertex
18 unsigned offset); 21 * @param input Input vertex into the shader
22 * @param num_attributes The number of vertex shader attributes
23 * @param config Configuration object for the shader pipeline
24 * @return Debug information for this shader with regards to the given vertex
25 */
26 DebugData<true> ProduceDebugInfo(const ShaderSetup& setup, const InputVertex& input,
27 int num_attributes) const;
28};
19 29
20} // namespace 30} // namespace
21 31
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index c588b778b..0ee0dd9ef 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -1,888 +1,48 @@
1// Copyright 2015 Citra Emulator Project 1// Copyright 2016 Citra Emulator Project
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <algorithm> 5#include "common/hash.h"
6#include <cmath> 6#include "common/microprofile.h"
7#include <cstdint>
8#include <nihstro/shader_bytecode.h>
9#include <smmintrin.h>
10#include <xmmintrin.h>
11#include "common/assert.h"
12#include "common/logging/log.h"
13#include "common/vector_math.h"
14#include "common/x64/cpu_detect.h"
15#include "common/x64/xbyak_abi.h"
16#include "common/x64/xbyak_util.h"
17#include "video_core/pica_state.h"
18#include "video_core/pica_types.h"
19#include "video_core/shader/shader.h" 7#include "video_core/shader/shader.h"
20#include "video_core/shader/shader_jit_x64.h" 8#include "video_core/shader/shader_jit_x64.h"
21 9#include "video_core/shader/shader_jit_x64_compiler.h"
22using namespace Common::X64;
23using namespace Xbyak::util;
24using Xbyak::Label;
25using Xbyak::Reg32;
26using Xbyak::Reg64;
27using Xbyak::Xmm;
28 10
29namespace Pica { 11namespace Pica {
30
31namespace Shader { 12namespace Shader {
32 13
33typedef void (JitShader::*JitFunction)(Instruction instr); 14JitX64Engine::JitX64Engine() = default;
34 15JitX64Engine::~JitX64Engine() = default;
35const JitFunction instr_table[64] = {
36 &JitShader::Compile_ADD, // add
37 &JitShader::Compile_DP3, // dp3
38 &JitShader::Compile_DP4, // dp4
39 &JitShader::Compile_DPH, // dph
40 nullptr, // unknown
41 &JitShader::Compile_EX2, // ex2
42 &JitShader::Compile_LG2, // lg2
43 nullptr, // unknown
44 &JitShader::Compile_MUL, // mul
45 &JitShader::Compile_SGE, // sge
46 &JitShader::Compile_SLT, // slt
47 &JitShader::Compile_FLR, // flr
48 &JitShader::Compile_MAX, // max
49 &JitShader::Compile_MIN, // min
50 &JitShader::Compile_RCP, // rcp
51 &JitShader::Compile_RSQ, // rsq
52 nullptr, // unknown
53 nullptr, // unknown
54 &JitShader::Compile_MOVA, // mova
55 &JitShader::Compile_MOV, // mov
56 nullptr, // unknown
57 nullptr, // unknown
58 nullptr, // unknown
59 nullptr, // unknown
60 &JitShader::Compile_DPH, // dphi
61 nullptr, // unknown
62 &JitShader::Compile_SGE, // sgei
63 &JitShader::Compile_SLT, // slti
64 nullptr, // unknown
65 nullptr, // unknown
66 nullptr, // unknown
67 nullptr, // unknown
68 nullptr, // unknown
69 &JitShader::Compile_NOP, // nop
70 &JitShader::Compile_END, // end
71 nullptr, // break
72 &JitShader::Compile_CALL, // call
73 &JitShader::Compile_CALLC, // callc
74 &JitShader::Compile_CALLU, // callu
75 &JitShader::Compile_IF, // ifu
76 &JitShader::Compile_IF, // ifc
77 &JitShader::Compile_LOOP, // loop
78 nullptr, // emit
79 nullptr, // sete
80 &JitShader::Compile_JMP, // jmpc
81 &JitShader::Compile_JMP, // jmpu
82 &JitShader::Compile_CMP, // cmp
83 &JitShader::Compile_CMP, // cmp
84 &JitShader::Compile_MAD, // madi
85 &JitShader::Compile_MAD, // madi
86 &JitShader::Compile_MAD, // madi
87 &JitShader::Compile_MAD, // madi
88 &JitShader::Compile_MAD, // madi
89 &JitShader::Compile_MAD, // madi
90 &JitShader::Compile_MAD, // madi
91 &JitShader::Compile_MAD, // madi
92 &JitShader::Compile_MAD, // mad
93 &JitShader::Compile_MAD, // mad
94 &JitShader::Compile_MAD, // mad
95 &JitShader::Compile_MAD, // mad
96 &JitShader::Compile_MAD, // mad
97 &JitShader::Compile_MAD, // mad
98 &JitShader::Compile_MAD, // mad
99 &JitShader::Compile_MAD, // mad
100};
101
102// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
103// be used as scratch registers within a compiler function. The other registers have designated
104// purposes, as documented below:
105 16
106/// Pointer to the uniform memory 17void JitX64Engine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) {
107static const Reg64 SETUP = r9; 18 ASSERT(entry_point < 1024);
108/// The two 32-bit VS address offset registers set by the MOVA instruction 19 setup.engine_data.entry_point = entry_point;
109static const Reg64 ADDROFFS_REG_0 = r10;
110static const Reg64 ADDROFFS_REG_1 = r11;
111/// VS loop count register (Multiplied by 16)
112static const Reg32 LOOPCOUNT_REG = r12d;
113/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this is quicker)
114static const Reg32 LOOPCOUNT = esi;
115/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16)
116static const Reg32 LOOPINC = edi;
117/// Result of the previous CMP instruction for the X-component comparison
118static const Reg64 COND0 = r13;
119/// Result of the previous CMP instruction for the Y-component comparison
120static const Reg64 COND1 = r14;
121/// Pointer to the UnitState instance for the current VS unit
122static const Reg64 STATE = r15;
123/// SIMD scratch register
124static const Xmm SCRATCH = xmm0;
125/// Loaded with the first swizzled source register, otherwise can be used as a scratch register
126static const Xmm SRC1 = xmm1;
127/// Loaded with the second swizzled source register, otherwise can be used as a scratch register
128static const Xmm SRC2 = xmm2;
129/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
130static const Xmm SRC3 = xmm3;
131/// Additional scratch register
132static const Xmm SCRATCH2 = xmm4;
133/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
134static const Xmm ONE = xmm14;
135/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
136static const Xmm NEGBIT = xmm15;
137 20
138// State registers that must not be modified by external functions calls 21 u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code));
139// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed 22 u64 swizzle_hash = Common::ComputeHash64(&setup.swizzle_data, sizeof(setup.swizzle_data));
140static const BitSet32 persistent_regs = BuildRegSet({
141 // Pointers to register blocks
142 SETUP, STATE,
143 // Cached registers
144 ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1,
145 // Constants
146 ONE, NEGBIT,
147});
148 23
149/// Raw constant for the source register selector that indicates no swizzling is performed 24 u64 cache_key = code_hash ^ swizzle_hash;
150static const u8 NO_SRC_REG_SWIZZLE = 0x1b; 25 auto iter = cache.find(cache_key);
151/// Raw constant for the destination register enable mask that indicates all components are enabled 26 if (iter != cache.end()) {
152static const u8 NO_DEST_REG_MASK = 0xf; 27 setup.engine_data.cached_shader = iter->second.get();
153
154/**
155 * Get the vertex shader instruction for a given offset in the current shader program
156 * @param offset Offset in the current shader program of the instruction
157 * @return Instruction at the specified offset
158 */
159static Instruction GetVertexShaderInstruction(size_t offset) {
160 return {g_state.vs.program_code[offset]};
161}
162
163static void LogCritical(const char* msg) {
164 LOG_CRITICAL(HW_GPU, "%s", msg);
165}
166
167void JitShader::Compile_Assert(bool condition, const char* msg) {
168 if (!condition) {
169 mov(ABI_PARAM1, reinterpret_cast<size_t>(msg));
170 CallFarFunction(*this, LogCritical);
171 }
172}
173
174/**
175 * Loads and swizzles a source register into the specified XMM register.
176 * @param instr VS instruction, used for determining how to load the source register
177 * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3)
178 * @param src_reg SourceRegister object corresponding to the source register to load
179 * @param dest Destination XMM register to store the loaded, swizzled source register
180 */
181void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
182 Xmm dest) {
183 Reg64 src_ptr;
184 size_t src_offset;
185
186 if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
187 src_ptr = SETUP;
188 src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex());
189 } else { 28 } else {
190 src_ptr = STATE; 29 auto shader = std::make_unique<JitShader>();
191 src_offset = UnitState::InputOffset(src_reg); 30 shader->Compile(&setup.program_code, &setup.swizzle_data);
192 } 31 setup.engine_data.cached_shader = shader.get();
193 32 cache.emplace_hint(iter, cache_key, std::move(shader));
194 int src_offset_disp = (int)src_offset;
195 ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type");
196
197 unsigned operand_desc_id;
198
199 const bool is_inverted =
200 (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
201
202 unsigned address_register_index;
203 unsigned offset_src;
204
205 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
206 instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
207 operand_desc_id = instr.mad.operand_desc_id;
208 offset_src = is_inverted ? 3 : 2;
209 address_register_index = instr.mad.address_register_index;
210 } else {
211 operand_desc_id = instr.common.operand_desc_id;
212 offset_src = is_inverted ? 2 : 1;
213 address_register_index = instr.common.address_register_index;
214 }
215
216 if (src_num == offset_src && address_register_index != 0) {
217 switch (address_register_index) {
218 case 1: // address offset 1
219 movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]);
220 break;
221 case 2: // address offset 2
222 movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]);
223 break;
224 case 3: // address offset 3
225 movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]);
226 break;
227 default:
228 UNREACHABLE();
229 break;
230 }
231 } else {
232 // Load the source
233 movaps(dest, xword[src_ptr + src_offset_disp]);
234 }
235
236 SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]};
237
238 // Generate instructions for source register swizzling as needed
239 u8 sel = swiz.GetRawSelector(src_num);
240 if (sel != NO_SRC_REG_SWIZZLE) {
241 // Selector component order needs to be reversed for the SHUFPS instruction
242 sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2);
243
244 // Shuffle inputs for swizzle
245 shufps(dest, dest, sel);
246 }
247
248 // If the source register should be negated, flip the negative bit using XOR
249 const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3};
250 if (negate[src_num - 1]) {
251 xorps(dest, NEGBIT);
252 } 33 }
253} 34}
254 35
255void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { 36MICROPROFILE_DECLARE(GPU_Shader);
256 DestRegister dest;
257 unsigned operand_desc_id;
258 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
259 instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
260 operand_desc_id = instr.mad.operand_desc_id;
261 dest = instr.mad.dest.Value();
262 } else {
263 operand_desc_id = instr.common.operand_desc_id;
264 dest = instr.common.dest.Value();
265 }
266
267 SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]};
268
269 size_t dest_offset_disp = UnitState::OutputOffset(dest);
270
271 // If all components are enabled, write the result to the destination register
272 if (swiz.dest_mask == NO_DEST_REG_MASK) {
273 // Store dest back to memory
274 movaps(xword[STATE + dest_offset_disp], src);
275
276 } else {
277 // Not all components are enabled, so mask the result when storing to the destination
278 // register...
279 movaps(SCRATCH, xword[STATE + dest_offset_disp]);
280
281 if (Common::GetCPUCaps().sse4_1) {
282 u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) |
283 ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
284 blendps(SCRATCH, src, mask);
285 } else {
286 movaps(SCRATCH2, src);
287 unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination
288 unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination
289
290 // Compute selector to selectively copy source components to destination for SHUFPS
291 // instruction
292 u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) |
293 ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
294 ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
295 ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
296 shufps(SCRATCH, SCRATCH2, sel);
297 }
298
299 // Store dest back to memory
300 movaps(xword[STATE + dest_offset_disp], SCRATCH);
301 }
302}
303
304void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) {
305 movaps(scratch, src1);
306 cmpordps(scratch, src2);
307
308 mulps(src1, src2);
309 37
310 movaps(src2, src1); 38void JitX64Engine::Run(const ShaderSetup& setup, UnitState& state) const {
311 cmpunordps(src2, src2); 39 ASSERT(setup.engine_data.cached_shader != nullptr);
312 40
313 xorps(scratch, src2); 41 MICROPROFILE_SCOPE(GPU_Shader);
314 andps(src1, scratch);
315}
316
317void JitShader::Compile_EvaluateCondition(Instruction instr) {
318 // Note: NXOR is used below to check for equality
319 switch (instr.flow_control.op) {
320 case Instruction::FlowControlType::Or:
321 mov(eax, COND0);
322 mov(ebx, COND1);
323 xor(eax, (instr.flow_control.refx.Value() ^ 1));
324 xor(ebx, (instr.flow_control.refy.Value() ^ 1));
325 or (eax, ebx);
326 break;
327
328 case Instruction::FlowControlType::And:
329 mov(eax, COND0);
330 mov(ebx, COND1);
331 xor(eax, (instr.flow_control.refx.Value() ^ 1));
332 xor(ebx, (instr.flow_control.refy.Value() ^ 1));
333 and(eax, ebx);
334 break;
335
336 case Instruction::FlowControlType::JustX:
337 mov(eax, COND0);
338 xor(eax, (instr.flow_control.refx.Value() ^ 1));
339 break;
340
341 case Instruction::FlowControlType::JustY:
342 mov(eax, COND1);
343 xor(eax, (instr.flow_control.refy.Value() ^ 1));
344 break;
345 }
346}
347 42
348void JitShader::Compile_UniformCondition(Instruction instr) { 43 const JitShader* shader = static_cast<const JitShader*>(setup.engine_data.cached_shader);
349 size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id); 44 shader->Run(setup, state, setup.engine_data.entry_point);
350 cmp(byte[SETUP + offset], 0);
351} 45}
352 46
353BitSet32 JitShader::PersistentCallerSavedRegs() {
354 return persistent_regs & ABI_ALL_CALLER_SAVED;
355}
356
357void JitShader::Compile_ADD(Instruction instr) {
358 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
359 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
360 addps(SRC1, SRC2);
361 Compile_DestEnable(instr, SRC1);
362}
363
364void JitShader::Compile_DP3(Instruction instr) {
365 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
366 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
367
368 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
369
370 movaps(SRC2, SRC1);
371 shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1));
372
373 movaps(SRC3, SRC1);
374 shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2));
375
376 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0));
377 addps(SRC1, SRC2);
378 addps(SRC1, SRC3);
379
380 Compile_DestEnable(instr, SRC1);
381}
382
383void JitShader::Compile_DP4(Instruction instr) {
384 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
385 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
386
387 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
388
389 movaps(SRC2, SRC1);
390 shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
391 addps(SRC1, SRC2);
392
393 movaps(SRC2, SRC1);
394 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
395 addps(SRC1, SRC2);
396
397 Compile_DestEnable(instr, SRC1);
398}
399
400void JitShader::Compile_DPH(Instruction instr) {
401 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) {
402 Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
403 Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
404 } else {
405 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
406 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
407 }
408
409 if (Common::GetCPUCaps().sse4_1) {
410 // Set 4th component to 1.0
411 blendps(SRC1, ONE, 0b1000);
412 } else {
413 // Set 4th component to 1.0
414 movaps(SCRATCH, SRC1);
415 unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__
416 unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1
417 }
418
419 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
420
421 movaps(SRC2, SRC1);
422 shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
423 addps(SRC1, SRC2);
424
425 movaps(SRC2, SRC1);
426 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
427 addps(SRC1, SRC2);
428
429 Compile_DestEnable(instr, SRC1);
430}
431
432void JitShader::Compile_EX2(Instruction instr) {
433 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
434 movss(xmm0, SRC1); // ABI_PARAM1
435
436 ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
437 CallFarFunction(*this, exp2f);
438 ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
439
440 shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
441 movaps(SRC1, xmm0);
442 Compile_DestEnable(instr, SRC1);
443}
444
445void JitShader::Compile_LG2(Instruction instr) {
446 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
447 movss(xmm0, SRC1); // ABI_PARAM1
448
449 ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
450 CallFarFunction(*this, log2f);
451 ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
452
453 shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
454 movaps(SRC1, xmm0);
455 Compile_DestEnable(instr, SRC1);
456}
457
458void JitShader::Compile_MUL(Instruction instr) {
459 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
460 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
461 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
462 Compile_DestEnable(instr, SRC1);
463}
464
465void JitShader::Compile_SGE(Instruction instr) {
466 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) {
467 Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
468 Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
469 } else {
470 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
471 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
472 }
473
474 cmpleps(SRC2, SRC1);
475 andps(SRC2, ONE);
476
477 Compile_DestEnable(instr, SRC2);
478}
479
480void JitShader::Compile_SLT(Instruction instr) {
481 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) {
482 Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
483 Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
484 } else {
485 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
486 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
487 }
488
489 cmpltps(SRC1, SRC2);
490 andps(SRC1, ONE);
491
492 Compile_DestEnable(instr, SRC1);
493}
494
495void JitShader::Compile_FLR(Instruction instr) {
496 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
497
498 if (Common::GetCPUCaps().sse4_1) {
499 roundps(SRC1, SRC1, _MM_FROUND_FLOOR);
500 } else {
501 cvttps2dq(SRC1, SRC1);
502 cvtdq2ps(SRC1, SRC1);
503 }
504
505 Compile_DestEnable(instr, SRC1);
506}
507
508void JitShader::Compile_MAX(Instruction instr) {
509 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
510 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
511 // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
512 maxps(SRC1, SRC2);
513 Compile_DestEnable(instr, SRC1);
514}
515
516void JitShader::Compile_MIN(Instruction instr) {
517 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
518 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
519 // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
520 minps(SRC1, SRC2);
521 Compile_DestEnable(instr, SRC1);
522}
523
524void JitShader::Compile_MOVA(Instruction instr) {
525 SwizzlePattern swiz = {g_state.vs.swizzle_data[instr.common.operand_desc_id]};
526
527 if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) {
528 return; // NoOp
529 }
530
531 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
532
533 // Convert floats to integers using truncation (only care about X and Y components)
534 cvttps2dq(SRC1, SRC1);
535
536 // Get result
537 movq(rax, SRC1);
538
539 // Handle destination enable
540 if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) {
541 // Move and sign-extend low 32 bits
542 movsxd(ADDROFFS_REG_0, eax);
543
544 // Move and sign-extend high 32 bits
545 shr(rax, 32);
546 movsxd(ADDROFFS_REG_1, eax);
547
548 // Multiply by 16 to be used as an offset later
549 shl(ADDROFFS_REG_0, 4);
550 shl(ADDROFFS_REG_1, 4);
551 } else {
552 if (swiz.DestComponentEnabled(0)) {
553 // Move and sign-extend low 32 bits
554 movsxd(ADDROFFS_REG_0, eax);
555
556 // Multiply by 16 to be used as an offset later
557 shl(ADDROFFS_REG_0, 4);
558 } else if (swiz.DestComponentEnabled(1)) {
559 // Move and sign-extend high 32 bits
560 shr(rax, 32);
561 movsxd(ADDROFFS_REG_1, eax);
562
563 // Multiply by 16 to be used as an offset later
564 shl(ADDROFFS_REG_1, 4);
565 }
566 }
567}
568
569void JitShader::Compile_MOV(Instruction instr) {
570 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
571 Compile_DestEnable(instr, SRC1);
572}
573
574void JitShader::Compile_RCP(Instruction instr) {
575 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
576
577 // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
578 // performs this operation more accurately. This should be checked on hardware.
579 rcpss(SRC1, SRC1);
580 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX
581
582 Compile_DestEnable(instr, SRC1);
583}
584
585void JitShader::Compile_RSQ(Instruction instr) {
586 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
587
588 // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
589 // performs this operation more accurately. This should be checked on hardware.
590 rsqrtss(SRC1, SRC1);
591 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX
592
593 Compile_DestEnable(instr, SRC1);
594}
595
596void JitShader::Compile_NOP(Instruction instr) {}
597
598void JitShader::Compile_END(Instruction instr) {
599 ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8);
600 ret();
601}
602
603void JitShader::Compile_CALL(Instruction instr) {
604 // Push offset of the return
605 push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions));
606
607 // Call the subroutine
608 call(instruction_labels[instr.flow_control.dest_offset]);
609
610 // Skip over the return offset that's on the stack
611 add(rsp, 8);
612}
613
614void JitShader::Compile_CALLC(Instruction instr) {
615 Compile_EvaluateCondition(instr);
616 Label b;
617 jz(b);
618 Compile_CALL(instr);
619 L(b);
620}
621
622void JitShader::Compile_CALLU(Instruction instr) {
623 Compile_UniformCondition(instr);
624 Label b;
625 jz(b);
626 Compile_CALL(instr);
627 L(b);
628}
629
630void JitShader::Compile_CMP(Instruction instr) {
631 using Op = Instruction::Common::CompareOpType::Op;
632 Op op_x = instr.common.compare_op.x;
633 Op op_y = instr.common.compare_op.y;
634
635 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
636 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
637
638 // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
639 // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
640 // because they don't match when used with NaNs.
641 static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE};
642
643 bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
644 Xmm lhs_x = invert_op_x ? SRC2 : SRC1;
645 Xmm rhs_x = invert_op_x ? SRC1 : SRC2;
646
647 if (op_x == op_y) {
648 // Compare X-component and Y-component together
649 cmpps(lhs_x, rhs_x, cmp[op_x]);
650 movq(COND0, lhs_x);
651
652 mov(COND1, COND0);
653 } else {
654 bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
655 Xmm lhs_y = invert_op_y ? SRC2 : SRC1;
656 Xmm rhs_y = invert_op_y ? SRC1 : SRC2;
657
658 // Compare X-component
659 movaps(SCRATCH, lhs_x);
660 cmpss(SCRATCH, rhs_x, cmp[op_x]);
661
662 // Compare Y-component
663 cmpps(lhs_y, rhs_y, cmp[op_y]);
664
665 movq(COND0, SCRATCH);
666 movq(COND1, lhs_y);
667 }
668
669 shr(COND0.cvt32(), 31); // ignores upper 32 bits in source
670 shr(COND1, 63);
671}
672
673void JitShader::Compile_MAD(Instruction instr) {
674 Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1);
675
676 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
677 Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2);
678 Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3);
679 } else {
680 Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2);
681 Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
682 }
683
684 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
685 addps(SRC1, SRC3);
686
687 Compile_DestEnable(instr, SRC1);
688}
689
690void JitShader::Compile_IF(Instruction instr) {
691 Compile_Assert(instr.flow_control.dest_offset >= program_counter,
692 "Backwards if-statements not supported");
693 Label l_else, l_endif;
694
695 // Evaluate the "IF" condition
696 if (instr.opcode.Value() == OpCode::Id::IFU) {
697 Compile_UniformCondition(instr);
698 } else if (instr.opcode.Value() == OpCode::Id::IFC) {
699 Compile_EvaluateCondition(instr);
700 }
701 jz(l_else, T_NEAR);
702
703 // Compile the code that corresponds to the condition evaluating as true
704 Compile_Block(instr.flow_control.dest_offset);
705
706 // If there isn't an "ELSE" condition, we are done here
707 if (instr.flow_control.num_instructions == 0) {
708 L(l_else);
709 return;
710 }
711
712 jmp(l_endif, T_NEAR);
713
714 L(l_else);
715 // This code corresponds to the "ELSE" condition
716 // Compile the code that corresponds to the condition evaluating as false
717 Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions);
718
719 L(l_endif);
720}
721
722void JitShader::Compile_LOOP(Instruction instr) {
723 Compile_Assert(instr.flow_control.dest_offset >= program_counter,
724 "Backwards loops not supported");
725 Compile_Assert(!looping, "Nested loops not supported");
726
727 looping = true;
728
729 // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id.
730 // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by
731 // 4 bits) to be used as an offset into the 16-byte vector registers later
732 size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id);
733 mov(LOOPCOUNT, dword[SETUP + offset]);
734 mov(LOOPCOUNT_REG, LOOPCOUNT);
735 shr(LOOPCOUNT_REG, 4);
736 and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start
737 mov(LOOPINC, LOOPCOUNT);
738 shr(LOOPINC, 12);
739 and(LOOPINC, 0xFF0); // Z-component is the incrementer
740 movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
741 add(LOOPCOUNT, 1); // Iteration count is X-component + 1
742
743 Label l_loop_start;
744 L(l_loop_start);
745
746 Compile_Block(instr.flow_control.dest_offset + 1);
747
748 add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component
749 sub(LOOPCOUNT, 1); // Increment loop count by 1
750 jnz(l_loop_start); // Loop if not equal
751
752 looping = false;
753}
754
755void JitShader::Compile_JMP(Instruction instr) {
756 if (instr.opcode.Value() == OpCode::Id::JMPC)
757 Compile_EvaluateCondition(instr);
758 else if (instr.opcode.Value() == OpCode::Id::JMPU)
759 Compile_UniformCondition(instr);
760 else
761 UNREACHABLE();
762
763 bool inverted_condition =
764 (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1);
765
766 Label& b = instruction_labels[instr.flow_control.dest_offset];
767 if (inverted_condition) {
768 jz(b, T_NEAR);
769 } else {
770 jnz(b, T_NEAR);
771 }
772}
773
774void JitShader::Compile_Block(unsigned end) {
775 while (program_counter < end) {
776 Compile_NextInstr();
777 }
778}
779
780void JitShader::Compile_Return() {
781 // Peek return offset on the stack and check if we're at that offset
782 mov(rax, qword[rsp + 8]);
783 cmp(eax, (program_counter));
784
785 // If so, jump back to before CALL
786 Label b;
787 jnz(b);
788 ret();
789 L(b);
790}
791
792void JitShader::Compile_NextInstr() {
793 if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) {
794 Compile_Return();
795 }
796
797 L(instruction_labels[program_counter]);
798
799 Instruction instr = GetVertexShaderInstruction(program_counter++);
800
801 OpCode::Id opcode = instr.opcode.Value();
802 auto instr_func = instr_table[static_cast<unsigned>(opcode)];
803
804 if (instr_func) {
805 // JIT the instruction!
806 ((*this).*instr_func)(instr);
807 } else {
808 // Unhandled instruction
809 LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)",
810 instr.opcode.Value().EffectiveOpCode(), instr.hex);
811 }
812}
813
814void JitShader::FindReturnOffsets() {
815 return_offsets.clear();
816
817 for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) {
818 Instruction instr = GetVertexShaderInstruction(offset);
819
820 switch (instr.opcode.Value()) {
821 case OpCode::Id::CALL:
822 case OpCode::Id::CALLC:
823 case OpCode::Id::CALLU:
824 return_offsets.push_back(instr.flow_control.dest_offset +
825 instr.flow_control.num_instructions);
826 break;
827 default:
828 break;
829 }
830 }
831
832 // Sort for efficient binary search later
833 std::sort(return_offsets.begin(), return_offsets.end());
834}
835
836void JitShader::Compile() {
837 // Reset flow control state
838 program = (CompiledShader*)getCurr();
839 program_counter = 0;
840 looping = false;
841 instruction_labels.fill(Xbyak::Label());
842
843 // Find all `CALL` instructions and identify return locations
844 FindReturnOffsets();
845
846 // The stack pointer is 8 modulo 16 at the entry of a procedure
847 ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8);
848
849 mov(SETUP, ABI_PARAM1);
850 mov(STATE, ABI_PARAM2);
851
852 // Zero address/loop registers
853 xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32());
854 xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32());
855 xor(LOOPCOUNT_REG, LOOPCOUNT_REG);
856
857 // Used to set a register to one
858 static const __m128 one = {1.f, 1.f, 1.f, 1.f};
859 mov(rax, reinterpret_cast<size_t>(&one));
860 movaps(ONE, xword[rax]);
861
862 // Used to negate registers
863 static const __m128 neg = {-0.f, -0.f, -0.f, -0.f};
864 mov(rax, reinterpret_cast<size_t>(&neg));
865 movaps(NEGBIT, xword[rax]);
866
867 // Jump to start of the shader program
868 jmp(ABI_PARAM3);
869
870 // Compile entire program
871 Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size()));
872
873 // Free memory that's no longer needed
874 return_offsets.clear();
875 return_offsets.shrink_to_fit();
876
877 ready();
878
879 uintptr_t size = reinterpret_cast<uintptr_t>(getCurr()) - reinterpret_cast<uintptr_t>(program);
880 ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
881 LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", size);
882}
883
884JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {}
885
886} // namespace Shader 47} // namespace Shader
887
888} // namespace Pica 48} // namespace Pica
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index f37548306..078b2cba5 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -1,121 +1,30 @@
1// Copyright 2015 Citra Emulator Project 1// Copyright 2016 Citra Emulator Project
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#pragma once 5#pragma once
6 6
7#include <array> 7#include <memory>
8#include <cstddef> 8#include <unordered_map>
9#include <utility>
10#include <vector>
11#include <nihstro/shader_bytecode.h>
12#include <xbyak.h>
13#include "common/bit_set.h"
14#include "common/common_types.h" 9#include "common/common_types.h"
15#include "common/x64/emitter.h"
16#include "video_core/shader/shader.h" 10#include "video_core/shader/shader.h"
17 11
18using nihstro::Instruction;
19using nihstro::OpCode;
20using nihstro::SwizzlePattern;
21
22namespace Pica { 12namespace Pica {
23
24namespace Shader { 13namespace Shader {
25 14
26/// Memory allocated for each compiled shader (64Kb) 15class JitShader;
27constexpr size_t MAX_SHADER_SIZE = 1024 * 64;
28 16
29/** 17class JitX64Engine final : public ShaderEngine {
30 * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
31 * code that can be executed on the host machine directly.
32 */
33class JitShader : public Xbyak::CodeGenerator {
34public: 18public:
35 JitShader(); 19 JitX64Engine();
36 20 ~JitX64Engine() override;
37 void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const {
38 program(&setup, &state, instruction_labels[offset].getAddress());
39 }
40
41 void Compile();
42 21
43 void Compile_ADD(Instruction instr); 22 void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override;
44 void Compile_DP3(Instruction instr); 23 void Run(const ShaderSetup& setup, UnitState& state) const override;
45 void Compile_DP4(Instruction instr);
46 void Compile_DPH(Instruction instr);
47 void Compile_EX2(Instruction instr);
48 void Compile_LG2(Instruction instr);
49 void Compile_MUL(Instruction instr);
50 void Compile_SGE(Instruction instr);
51 void Compile_SLT(Instruction instr);
52 void Compile_FLR(Instruction instr);
53 void Compile_MAX(Instruction instr);
54 void Compile_MIN(Instruction instr);
55 void Compile_RCP(Instruction instr);
56 void Compile_RSQ(Instruction instr);
57 void Compile_MOVA(Instruction instr);
58 void Compile_MOV(Instruction instr);
59 void Compile_NOP(Instruction instr);
60 void Compile_END(Instruction instr);
61 void Compile_CALL(Instruction instr);
62 void Compile_CALLC(Instruction instr);
63 void Compile_CALLU(Instruction instr);
64 void Compile_IF(Instruction instr);
65 void Compile_LOOP(Instruction instr);
66 void Compile_JMP(Instruction instr);
67 void Compile_CMP(Instruction instr);
68 void Compile_MAD(Instruction instr);
69 24
70private: 25private:
71 void Compile_Block(unsigned end); 26 std::unordered_map<u64, std::unique_ptr<JitShader>> cache;
72 void Compile_NextInstr();
73
74 void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
75 Xbyak::Xmm dest);
76 void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest);
77
78 /**
79 * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
80 * zero by inf. Clobbers `src2` and `scratch`.
81 */
82 void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);
83
84 void Compile_EvaluateCondition(Instruction instr);
85 void Compile_UniformCondition(Instruction instr);
86
87 /**
88 * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction.
89 */
90 void Compile_Return();
91
92 BitSet32 PersistentCallerSavedRegs();
93
94 /**
95 * Assertion evaluated at compile-time, but only triggered if executed at runtime.
96 * @param msg Message to be logged if the assertion fails.
97 */
98 void Compile_Assert(bool condition, const char* msg);
99
100 /**
101 * Analyzes the entire shader program for `CALL` instructions before emitting any code,
102 * identifying the locations where a return needs to be inserted.
103 */
104 void FindReturnOffsets();
105
106 /// Mapping of Pica VS instructions to pointers in the emitted code
107 std::array<Xbyak::Label, 1024> instruction_labels;
108
109 /// Offsets in code where a return needs to be inserted
110 std::vector<unsigned> return_offsets;
111
112 unsigned program_counter = 0; ///< Offset of the next instruction to decode
113 bool looping = false; ///< True if compiling a loop, used to check for nested loops
114
115 using CompiledShader = void(const void* setup, void* state, const u8* start_addr);
116 CompiledShader* program = nullptr;
117}; 27};
118 28
119} // Shader 29} // namespace Shader
120 30} // namespace Pica
121} // Pica
diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp
new file mode 100644
index 000000000..49806e8c9
--- /dev/null
+++ b/src/video_core/shader/shader_jit_x64_compiler.cpp
@@ -0,0 +1,884 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <cmath>
7#include <cstdint>
8#include <nihstro/shader_bytecode.h>
9#include <smmintrin.h>
10#include <xmmintrin.h>
11#include "common/assert.h"
12#include "common/logging/log.h"
13#include "common/vector_math.h"
14#include "common/x64/cpu_detect.h"
15#include "common/x64/xbyak_abi.h"
16#include "common/x64/xbyak_util.h"
17#include "video_core/pica_state.h"
18#include "video_core/pica_types.h"
19#include "video_core/shader/shader.h"
20#include "video_core/shader/shader_jit_x64_compiler.h"
21
22using namespace Common::X64;
23using namespace Xbyak::util;
24using Xbyak::Label;
25using Xbyak::Reg32;
26using Xbyak::Reg64;
27using Xbyak::Xmm;
28
29namespace Pica {
30
31namespace Shader {
32
33typedef void (JitShader::*JitFunction)(Instruction instr);
34
35const JitFunction instr_table[64] = {
36 &JitShader::Compile_ADD, // add
37 &JitShader::Compile_DP3, // dp3
38 &JitShader::Compile_DP4, // dp4
39 &JitShader::Compile_DPH, // dph
40 nullptr, // unknown
41 &JitShader::Compile_EX2, // ex2
42 &JitShader::Compile_LG2, // lg2
43 nullptr, // unknown
44 &JitShader::Compile_MUL, // mul
45 &JitShader::Compile_SGE, // sge
46 &JitShader::Compile_SLT, // slt
47 &JitShader::Compile_FLR, // flr
48 &JitShader::Compile_MAX, // max
49 &JitShader::Compile_MIN, // min
50 &JitShader::Compile_RCP, // rcp
51 &JitShader::Compile_RSQ, // rsq
52 nullptr, // unknown
53 nullptr, // unknown
54 &JitShader::Compile_MOVA, // mova
55 &JitShader::Compile_MOV, // mov
56 nullptr, // unknown
57 nullptr, // unknown
58 nullptr, // unknown
59 nullptr, // unknown
60 &JitShader::Compile_DPH, // dphi
61 nullptr, // unknown
62 &JitShader::Compile_SGE, // sgei
63 &JitShader::Compile_SLT, // slti
64 nullptr, // unknown
65 nullptr, // unknown
66 nullptr, // unknown
67 nullptr, // unknown
68 nullptr, // unknown
69 &JitShader::Compile_NOP, // nop
70 &JitShader::Compile_END, // end
71 nullptr, // break
72 &JitShader::Compile_CALL, // call
73 &JitShader::Compile_CALLC, // callc
74 &JitShader::Compile_CALLU, // callu
75 &JitShader::Compile_IF, // ifu
76 &JitShader::Compile_IF, // ifc
77 &JitShader::Compile_LOOP, // loop
78 nullptr, // emit
79 nullptr, // sete
80 &JitShader::Compile_JMP, // jmpc
81 &JitShader::Compile_JMP, // jmpu
82 &JitShader::Compile_CMP, // cmp
83 &JitShader::Compile_CMP, // cmp
84 &JitShader::Compile_MAD, // madi
85 &JitShader::Compile_MAD, // madi
86 &JitShader::Compile_MAD, // madi
87 &JitShader::Compile_MAD, // madi
88 &JitShader::Compile_MAD, // madi
89 &JitShader::Compile_MAD, // madi
90 &JitShader::Compile_MAD, // madi
91 &JitShader::Compile_MAD, // madi
92 &JitShader::Compile_MAD, // mad
93 &JitShader::Compile_MAD, // mad
94 &JitShader::Compile_MAD, // mad
95 &JitShader::Compile_MAD, // mad
96 &JitShader::Compile_MAD, // mad
97 &JitShader::Compile_MAD, // mad
98 &JitShader::Compile_MAD, // mad
99 &JitShader::Compile_MAD, // mad
100};
101
102// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
103// be used as scratch registers within a compiler function. The other registers have designated
104// purposes, as documented below:
105
106/// Pointer to the uniform memory
107static const Reg64 SETUP = r9;
108/// The two 32-bit VS address offset registers set by the MOVA instruction
109static const Reg64 ADDROFFS_REG_0 = r10;
110static const Reg64 ADDROFFS_REG_1 = r11;
111/// VS loop count register (Multiplied by 16)
112static const Reg32 LOOPCOUNT_REG = r12d;
113/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker)
114static const Reg32 LOOPCOUNT = esi;
115/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16)
116static const Reg32 LOOPINC = edi;
117/// Result of the previous CMP instruction for the X-component comparison
118static const Reg64 COND0 = r13;
119/// Result of the previous CMP instruction for the Y-component comparison
120static const Reg64 COND1 = r14;
121/// Pointer to the UnitState instance for the current VS unit
122static const Reg64 STATE = r15;
123/// SIMD scratch register
124static const Xmm SCRATCH = xmm0;
125/// Loaded with the first swizzled source register, otherwise can be used as a scratch register
126static const Xmm SRC1 = xmm1;
127/// Loaded with the second swizzled source register, otherwise can be used as a scratch register
128static const Xmm SRC2 = xmm2;
129/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
130static const Xmm SRC3 = xmm3;
131/// Additional scratch register
132static const Xmm SCRATCH2 = xmm4;
133/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
134static const Xmm ONE = xmm14;
135/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
136static const Xmm NEGBIT = xmm15;
137
138// State registers that must not be modified by external functions calls
139// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed
140static const BitSet32 persistent_regs = BuildRegSet({
141 // Pointers to register blocks
142 SETUP, STATE,
143 // Cached registers
144 ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1,
145 // Constants
146 ONE, NEGBIT,
147});
148
149/// Raw constant for the source register selector that indicates no swizzling is performed
150static const u8 NO_SRC_REG_SWIZZLE = 0x1b;
151/// Raw constant for the destination register enable mask that indicates all components are enabled
152static const u8 NO_DEST_REG_MASK = 0xf;
153
154static void LogCritical(const char* msg) {
155 LOG_CRITICAL(HW_GPU, "%s", msg);
156}
157
158void JitShader::Compile_Assert(bool condition, const char* msg) {
159 if (!condition) {
160 mov(ABI_PARAM1, reinterpret_cast<size_t>(msg));
161 CallFarFunction(*this, LogCritical);
162 }
163}
164
165/**
166 * Loads and swizzles a source register into the specified XMM register.
167 * @param instr VS instruction, used for determining how to load the source register
168 * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3)
169 * @param src_reg SourceRegister object corresponding to the source register to load
170 * @param dest Destination XMM register to store the loaded, swizzled source register
171 */
172void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
173 Xmm dest) {
174 Reg64 src_ptr;
175 size_t src_offset;
176
177 if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
178 src_ptr = SETUP;
179 src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex());
180 } else {
181 src_ptr = STATE;
182 src_offset = UnitState::InputOffset(src_reg);
183 }
184
185 int src_offset_disp = (int)src_offset;
186 ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type");
187
188 unsigned operand_desc_id;
189
190 const bool is_inverted =
191 (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
192
193 unsigned address_register_index;
194 unsigned offset_src;
195
196 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
197 instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
198 operand_desc_id = instr.mad.operand_desc_id;
199 offset_src = is_inverted ? 3 : 2;
200 address_register_index = instr.mad.address_register_index;
201 } else {
202 operand_desc_id = instr.common.operand_desc_id;
203 offset_src = is_inverted ? 2 : 1;
204 address_register_index = instr.common.address_register_index;
205 }
206
207 if (src_num == offset_src && address_register_index != 0) {
208 switch (address_register_index) {
209 case 1: // address offset 1
210 movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]);
211 break;
212 case 2: // address offset 2
213 movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]);
214 break;
215 case 3: // address offset 3
216 movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]);
217 break;
218 default:
219 UNREACHABLE();
220 break;
221 }
222 } else {
223 // Load the source
224 movaps(dest, xword[src_ptr + src_offset_disp]);
225 }
226
227 SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]};
228
229 // Generate instructions for source register swizzling as needed
230 u8 sel = swiz.GetRawSelector(src_num);
231 if (sel != NO_SRC_REG_SWIZZLE) {
232 // Selector component order needs to be reversed for the SHUFPS instruction
233 sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2);
234
235 // Shuffle inputs for swizzle
236 shufps(dest, dest, sel);
237 }
238
239 // If the source register should be negated, flip the negative bit using XOR
240 const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3};
241 if (negate[src_num - 1]) {
242 xorps(dest, NEGBIT);
243 }
244}
245
246void JitShader::Compile_DestEnable(Instruction instr, Xmm src) {
247 DestRegister dest;
248 unsigned operand_desc_id;
249 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
250 instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
251 operand_desc_id = instr.mad.operand_desc_id;
252 dest = instr.mad.dest.Value();
253 } else {
254 operand_desc_id = instr.common.operand_desc_id;
255 dest = instr.common.dest.Value();
256 }
257
258 SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]};
259
260 size_t dest_offset_disp = UnitState::OutputOffset(dest);
261
262 // If all components are enabled, write the result to the destination register
263 if (swiz.dest_mask == NO_DEST_REG_MASK) {
264 // Store dest back to memory
265 movaps(xword[STATE + dest_offset_disp], src);
266
267 } else {
268 // Not all components are enabled, so mask the result when storing to the destination
269 // register...
270 movaps(SCRATCH, xword[STATE + dest_offset_disp]);
271
272 if (Common::GetCPUCaps().sse4_1) {
273 u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) |
274 ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
275 blendps(SCRATCH, src, mask);
276 } else {
277 movaps(SCRATCH2, src);
278 unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination
279 unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination
280
281 // Compute selector to selectively copy source components to destination for SHUFPS
282 // instruction
283 u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) |
284 ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
285 ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
286 ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
287 shufps(SCRATCH, SCRATCH2, sel);
288 }
289
290 // Store dest back to memory
291 movaps(xword[STATE + dest_offset_disp], SCRATCH);
292 }
293}
294
295void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) {
296 movaps(scratch, src1);
297 cmpordps(scratch, src2);
298
299 mulps(src1, src2);
300
301 movaps(src2, src1);
302 cmpunordps(src2, src2);
303
304 xorps(scratch, src2);
305 andps(src1, scratch);
306}
307
308void JitShader::Compile_EvaluateCondition(Instruction instr) {
309 // Note: NXOR is used below to check for equality
310 switch (instr.flow_control.op) {
311 case Instruction::FlowControlType::Or:
312 mov(eax, COND0);
313 mov(ebx, COND1);
314 xor(eax, (instr.flow_control.refx.Value() ^ 1));
315 xor(ebx, (instr.flow_control.refy.Value() ^ 1));
316 or (eax, ebx);
317 break;
318
319 case Instruction::FlowControlType::And:
320 mov(eax, COND0);
321 mov(ebx, COND1);
322 xor(eax, (instr.flow_control.refx.Value() ^ 1));
323 xor(ebx, (instr.flow_control.refy.Value() ^ 1));
324 and(eax, ebx);
325 break;
326
327 case Instruction::FlowControlType::JustX:
328 mov(eax, COND0);
329 xor(eax, (instr.flow_control.refx.Value() ^ 1));
330 break;
331
332 case Instruction::FlowControlType::JustY:
333 mov(eax, COND1);
334 xor(eax, (instr.flow_control.refy.Value() ^ 1));
335 break;
336 }
337}
338
339void JitShader::Compile_UniformCondition(Instruction instr) {
340 size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id);
341 cmp(byte[SETUP + offset], 0);
342}
343
344BitSet32 JitShader::PersistentCallerSavedRegs() {
345 return persistent_regs & ABI_ALL_CALLER_SAVED;
346}
347
348void JitShader::Compile_ADD(Instruction instr) {
349 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
350 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
351 addps(SRC1, SRC2);
352 Compile_DestEnable(instr, SRC1);
353}
354
355void JitShader::Compile_DP3(Instruction instr) {
356 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
357 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
358
359 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
360
361 movaps(SRC2, SRC1);
362 shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1));
363
364 movaps(SRC3, SRC1);
365 shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2));
366
367 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0));
368 addps(SRC1, SRC2);
369 addps(SRC1, SRC3);
370
371 Compile_DestEnable(instr, SRC1);
372}
373
374void JitShader::Compile_DP4(Instruction instr) {
375 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
376 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
377
378 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
379
380 movaps(SRC2, SRC1);
381 shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
382 addps(SRC1, SRC2);
383
384 movaps(SRC2, SRC1);
385 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
386 addps(SRC1, SRC2);
387
388 Compile_DestEnable(instr, SRC1);
389}
390
391void JitShader::Compile_DPH(Instruction instr) {
392 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) {
393 Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
394 Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
395 } else {
396 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
397 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
398 }
399
400 if (Common::GetCPUCaps().sse4_1) {
401 // Set 4th component to 1.0
402 blendps(SRC1, ONE, 0b1000);
403 } else {
404 // Set 4th component to 1.0
405 movaps(SCRATCH, SRC1);
406 unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__
407 unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1
408 }
409
410 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
411
412 movaps(SRC2, SRC1);
413 shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
414 addps(SRC1, SRC2);
415
416 movaps(SRC2, SRC1);
417 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
418 addps(SRC1, SRC2);
419
420 Compile_DestEnable(instr, SRC1);
421}
422
423void JitShader::Compile_EX2(Instruction instr) {
424 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
425 movss(xmm0, SRC1); // ABI_PARAM1
426
427 ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
428 CallFarFunction(*this, exp2f);
429 ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
430
431 shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
432 movaps(SRC1, xmm0);
433 Compile_DestEnable(instr, SRC1);
434}
435
436void JitShader::Compile_LG2(Instruction instr) {
437 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
438 movss(xmm0, SRC1); // ABI_PARAM1
439
440 ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
441 CallFarFunction(*this, log2f);
442 ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
443
444 shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
445 movaps(SRC1, xmm0);
446 Compile_DestEnable(instr, SRC1);
447}
448
449void JitShader::Compile_MUL(Instruction instr) {
450 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
451 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
452 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
453 Compile_DestEnable(instr, SRC1);
454}
455
456void JitShader::Compile_SGE(Instruction instr) {
457 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) {
458 Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
459 Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
460 } else {
461 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
462 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
463 }
464
465 cmpleps(SRC2, SRC1);
466 andps(SRC2, ONE);
467
468 Compile_DestEnable(instr, SRC2);
469}
470
471void JitShader::Compile_SLT(Instruction instr) {
472 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) {
473 Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
474 Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
475 } else {
476 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
477 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
478 }
479
480 cmpltps(SRC1, SRC2);
481 andps(SRC1, ONE);
482
483 Compile_DestEnable(instr, SRC1);
484}
485
486void JitShader::Compile_FLR(Instruction instr) {
487 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
488
489 if (Common::GetCPUCaps().sse4_1) {
490 roundps(SRC1, SRC1, _MM_FROUND_FLOOR);
491 } else {
492 cvttps2dq(SRC1, SRC1);
493 cvtdq2ps(SRC1, SRC1);
494 }
495
496 Compile_DestEnable(instr, SRC1);
497}
498
499void JitShader::Compile_MAX(Instruction instr) {
500 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
501 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
502 // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
503 maxps(SRC1, SRC2);
504 Compile_DestEnable(instr, SRC1);
505}
506
507void JitShader::Compile_MIN(Instruction instr) {
508 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
509 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
510 // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
511 minps(SRC1, SRC2);
512 Compile_DestEnable(instr, SRC1);
513}
514
515void JitShader::Compile_MOVA(Instruction instr) {
516 SwizzlePattern swiz = {(*swizzle_data)[instr.common.operand_desc_id]};
517
518 if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) {
519 return; // NoOp
520 }
521
522 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
523
524 // Convert floats to integers using truncation (only care about X and Y components)
525 cvttps2dq(SRC1, SRC1);
526
527 // Get result
528 movq(rax, SRC1);
529
530 // Handle destination enable
531 if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) {
532 // Move and sign-extend low 32 bits
533 movsxd(ADDROFFS_REG_0, eax);
534
535 // Move and sign-extend high 32 bits
536 shr(rax, 32);
537 movsxd(ADDROFFS_REG_1, eax);
538
539 // Multiply by 16 to be used as an offset later
540 shl(ADDROFFS_REG_0, 4);
541 shl(ADDROFFS_REG_1, 4);
542 } else {
543 if (swiz.DestComponentEnabled(0)) {
544 // Move and sign-extend low 32 bits
545 movsxd(ADDROFFS_REG_0, eax);
546
547 // Multiply by 16 to be used as an offset later
548 shl(ADDROFFS_REG_0, 4);
549 } else if (swiz.DestComponentEnabled(1)) {
550 // Move and sign-extend high 32 bits
551 shr(rax, 32);
552 movsxd(ADDROFFS_REG_1, eax);
553
554 // Multiply by 16 to be used as an offset later
555 shl(ADDROFFS_REG_1, 4);
556 }
557 }
558}
559
560void JitShader::Compile_MOV(Instruction instr) {
561 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
562 Compile_DestEnable(instr, SRC1);
563}
564
565void JitShader::Compile_RCP(Instruction instr) {
566 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
567
568 // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
569 // performs this operation more accurately. This should be checked on hardware.
570 rcpss(SRC1, SRC1);
571 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
572
573 Compile_DestEnable(instr, SRC1);
574}
575
576void JitShader::Compile_RSQ(Instruction instr) {
577 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
578
579 // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
580 // performs this operation more accurately. This should be checked on hardware.
581 rsqrtss(SRC1, SRC1);
582 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
583
584 Compile_DestEnable(instr, SRC1);
585}
586
587void JitShader::Compile_NOP(Instruction instr) {}
588
589void JitShader::Compile_END(Instruction instr) {
590 ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8);
591 ret();
592}
593
594void JitShader::Compile_CALL(Instruction instr) {
595 // Push offset of the return
596 push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions));
597
598 // Call the subroutine
599 call(instruction_labels[instr.flow_control.dest_offset]);
600
601 // Skip over the return offset that's on the stack
602 add(rsp, 8);
603}
604
605void JitShader::Compile_CALLC(Instruction instr) {
606 Compile_EvaluateCondition(instr);
607 Label b;
608 jz(b);
609 Compile_CALL(instr);
610 L(b);
611}
612
613void JitShader::Compile_CALLU(Instruction instr) {
614 Compile_UniformCondition(instr);
615 Label b;
616 jz(b);
617 Compile_CALL(instr);
618 L(b);
619}
620
621void JitShader::Compile_CMP(Instruction instr) {
622 using Op = Instruction::Common::CompareOpType::Op;
623 Op op_x = instr.common.compare_op.x;
624 Op op_y = instr.common.compare_op.y;
625
626 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
627 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
628
629 // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
630 // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
631 // because they don't match when used with NaNs.
632 static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE};
633
634 bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
635 Xmm lhs_x = invert_op_x ? SRC2 : SRC1;
636 Xmm rhs_x = invert_op_x ? SRC1 : SRC2;
637
638 if (op_x == op_y) {
639 // Compare X-component and Y-component together
640 cmpps(lhs_x, rhs_x, cmp[op_x]);
641 movq(COND0, lhs_x);
642
643 mov(COND1, COND0);
644 } else {
645 bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
646 Xmm lhs_y = invert_op_y ? SRC2 : SRC1;
647 Xmm rhs_y = invert_op_y ? SRC1 : SRC2;
648
649 // Compare X-component
650 movaps(SCRATCH, lhs_x);
651 cmpss(SCRATCH, rhs_x, cmp[op_x]);
652
653 // Compare Y-component
654 cmpps(lhs_y, rhs_y, cmp[op_y]);
655
656 movq(COND0, SCRATCH);
657 movq(COND1, lhs_y);
658 }
659
660 shr(COND0.cvt32(), 31); // ignores upper 32 bits in source
661 shr(COND1, 63);
662}
663
664void JitShader::Compile_MAD(Instruction instr) {
665 Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1);
666
667 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
668 Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2);
669 Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3);
670 } else {
671 Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2);
672 Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
673 }
674
675 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
676 addps(SRC1, SRC3);
677
678 Compile_DestEnable(instr, SRC1);
679}
680
681void JitShader::Compile_IF(Instruction instr) {
682 Compile_Assert(instr.flow_control.dest_offset >= program_counter,
683 "Backwards if-statements not supported");
684 Label l_else, l_endif;
685
686 // Evaluate the "IF" condition
687 if (instr.opcode.Value() == OpCode::Id::IFU) {
688 Compile_UniformCondition(instr);
689 } else if (instr.opcode.Value() == OpCode::Id::IFC) {
690 Compile_EvaluateCondition(instr);
691 }
692 jz(l_else, T_NEAR);
693
694 // Compile the code that corresponds to the condition evaluating as true
695 Compile_Block(instr.flow_control.dest_offset);
696
697 // If there isn't an "ELSE" condition, we are done here
698 if (instr.flow_control.num_instructions == 0) {
699 L(l_else);
700 return;
701 }
702
703 jmp(l_endif, T_NEAR);
704
705 L(l_else);
706 // This code corresponds to the "ELSE" condition
707 // Comple the code that corresponds to the condition evaluating as false
708 Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions);
709
710 L(l_endif);
711}
712
713void JitShader::Compile_LOOP(Instruction instr) {
714 Compile_Assert(instr.flow_control.dest_offset >= program_counter,
715 "Backwards loops not supported");
716 Compile_Assert(!looping, "Nested loops not supported");
717
718 looping = true;
719
720 // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id.
721 // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by
722 // 4 bits) to be used as an offset into the 16-byte vector registers later
723 size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id);
724 mov(LOOPCOUNT, dword[SETUP + offset]);
725 mov(LOOPCOUNT_REG, LOOPCOUNT);
726 shr(LOOPCOUNT_REG, 4);
727 and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start
728 mov(LOOPINC, LOOPCOUNT);
729 shr(LOOPINC, 12);
730 and(LOOPINC, 0xFF0); // Z-component is the incrementer
731 movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
732 add(LOOPCOUNT, 1); // Iteration count is X-component + 1
733
734 Label l_loop_start;
735 L(l_loop_start);
736
737 Compile_Block(instr.flow_control.dest_offset + 1);
738
739 add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component
740 sub(LOOPCOUNT, 1); // Increment loop count by 1
741 jnz(l_loop_start); // Loop if not equal
742
743 looping = false;
744}
745
746void JitShader::Compile_JMP(Instruction instr) {
747 if (instr.opcode.Value() == OpCode::Id::JMPC)
748 Compile_EvaluateCondition(instr);
749 else if (instr.opcode.Value() == OpCode::Id::JMPU)
750 Compile_UniformCondition(instr);
751 else
752 UNREACHABLE();
753
754 bool inverted_condition =
755 (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1);
756
757 Label& b = instruction_labels[instr.flow_control.dest_offset];
758 if (inverted_condition) {
759 jz(b, T_NEAR);
760 } else {
761 jnz(b, T_NEAR);
762 }
763}
764
765void JitShader::Compile_Block(unsigned end) {
766 while (program_counter < end) {
767 Compile_NextInstr();
768 }
769}
770
771void JitShader::Compile_Return() {
772 // Peek return offset on the stack and check if we're at that offset
773 mov(rax, qword[rsp + 8]);
774 cmp(eax, (program_counter));
775
776 // If so, jump back to before CALL
777 Label b;
778 jnz(b);
779 ret();
780 L(b);
781}
782
783void JitShader::Compile_NextInstr() {
784 if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) {
785 Compile_Return();
786 }
787
788 L(instruction_labels[program_counter]);
789
790 Instruction instr = {(*program_code)[program_counter++]};
791
792 OpCode::Id opcode = instr.opcode.Value();
793 auto instr_func = instr_table[static_cast<unsigned>(opcode)];
794
795 if (instr_func) {
796 // JIT the instruction!
797 ((*this).*instr_func)(instr);
798 } else {
799 // Unhandled instruction
800 LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)",
801 instr.opcode.Value().EffectiveOpCode(), instr.hex);
802 }
803}
804
805void JitShader::FindReturnOffsets() {
806 return_offsets.clear();
807
808 for (size_t offset = 0; offset < program_code->size(); ++offset) {
809 Instruction instr = {(*program_code)[offset]};
810
811 switch (instr.opcode.Value()) {
812 case OpCode::Id::CALL:
813 case OpCode::Id::CALLC:
814 case OpCode::Id::CALLU:
815 return_offsets.push_back(instr.flow_control.dest_offset +
816 instr.flow_control.num_instructions);
817 break;
818 default:
819 break;
820 }
821 }
822
823 // Sort for efficient binary search later
824 std::sort(return_offsets.begin(), return_offsets.end());
825}
826
827void JitShader::Compile(const std::array<u32, 1024>* program_code_,
828 const std::array<u32, 1024>* swizzle_data_) {
829 program_code = program_code_;
830 swizzle_data = swizzle_data_;
831
832 // Reset flow control state
833 program = (CompiledShader*)getCurr();
834 program_counter = 0;
835 looping = false;
836 instruction_labels.fill(Xbyak::Label());
837
838 // Find all `CALL` instructions and identify return locations
839 FindReturnOffsets();
840
841 // The stack pointer is 8 modulo 16 at the entry of a procedure
842 ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8);
843
844 mov(SETUP, ABI_PARAM1);
845 mov(STATE, ABI_PARAM2);
846
847 // Zero address/loop registers
848 xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32());
849 xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32());
850 xor(LOOPCOUNT_REG, LOOPCOUNT_REG);
851
852 // Used to set a register to one
853 static const __m128 one = {1.f, 1.f, 1.f, 1.f};
854 mov(rax, reinterpret_cast<size_t>(&one));
855 movaps(ONE, xword[rax]);
856
857 // Used to negate registers
858 static const __m128 neg = {-0.f, -0.f, -0.f, -0.f};
859 mov(rax, reinterpret_cast<size_t>(&neg));
860 movaps(NEGBIT, xword[rax]);
861
862 // Jump to start of the shader program
863 jmp(ABI_PARAM3);
864
865 // Compile entire program
866 Compile_Block(static_cast<unsigned>(program_code->size()));
867
868 // Free memory that's no longer needed
869 program_code = nullptr;
870 swizzle_data = nullptr;
871 return_offsets.clear();
872 return_offsets.shrink_to_fit();
873
874 ready();
875
876 ASSERT_MSG(getSize() <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
877 LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", getSize());
878}
879
880JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {}
881
882} // namespace Shader
883
884} // namespace Pica
diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h
new file mode 100644
index 000000000..29e9875ea
--- /dev/null
+++ b/src/video_core/shader/shader_jit_x64_compiler.h
@@ -0,0 +1,125 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <cstddef>
9#include <utility>
10#include <vector>
11#include <nihstro/shader_bytecode.h>
12#include <xbyak.h>
13#include "common/bit_set.h"
14#include "common/common_types.h"
15#include "common/x64/emitter.h"
16#include "video_core/shader/shader.h"
17
18using nihstro::Instruction;
19using nihstro::OpCode;
20using nihstro::SwizzlePattern;
21
22namespace Pica {
23
24namespace Shader {
25
/// Size in bytes of the code buffer allocated for each compiled shader (64 KiB)
constexpr size_t MAX_SHADER_SIZE = 64 * 1024;
28
29/**
30 * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
31 * code that can be executed on the host machine directly.
32 */
33class JitShader : public Xbyak::CodeGenerator {
34public:
35 JitShader();
36
37 void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const {
38 program(&setup, &state, instruction_labels[offset].getAddress());
39 }
40
41 void Compile(const std::array<u32, 1024>* program_code,
42 const std::array<u32, 1024>* swizzle_data);
43
44 void Compile_ADD(Instruction instr);
45 void Compile_DP3(Instruction instr);
46 void Compile_DP4(Instruction instr);
47 void Compile_DPH(Instruction instr);
48 void Compile_EX2(Instruction instr);
49 void Compile_LG2(Instruction instr);
50 void Compile_MUL(Instruction instr);
51 void Compile_SGE(Instruction instr);
52 void Compile_SLT(Instruction instr);
53 void Compile_FLR(Instruction instr);
54 void Compile_MAX(Instruction instr);
55 void Compile_MIN(Instruction instr);
56 void Compile_RCP(Instruction instr);
57 void Compile_RSQ(Instruction instr);
58 void Compile_MOVA(Instruction instr);
59 void Compile_MOV(Instruction instr);
60 void Compile_NOP(Instruction instr);
61 void Compile_END(Instruction instr);
62 void Compile_CALL(Instruction instr);
63 void Compile_CALLC(Instruction instr);
64 void Compile_CALLU(Instruction instr);
65 void Compile_IF(Instruction instr);
66 void Compile_LOOP(Instruction instr);
67 void Compile_JMP(Instruction instr);
68 void Compile_CMP(Instruction instr);
69 void Compile_MAD(Instruction instr);
70
71private:
72 void Compile_Block(unsigned end);
73 void Compile_NextInstr();
74
75 void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
76 Xbyak::Xmm dest);
77 void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest);
78
79 /**
80 * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
81 * zero by inf. Clobbers `src2` and `scratch`.
82 */
83 void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);
84
85 void Compile_EvaluateCondition(Instruction instr);
86 void Compile_UniformCondition(Instruction instr);
87
88 /**
89 * Emits the code to conditionally return from a subroutine envoked by the `CALL` instruction.
90 */
91 void Compile_Return();
92
93 BitSet32 PersistentCallerSavedRegs();
94
95 /**
96 * Assertion evaluated at compile-time, but only triggered if executed at runtime.
97 * @param msg Message to be logged if the assertion fails.
98 */
99 void Compile_Assert(bool condition, const char* msg);
100
101 /**
102 * Analyzes the entire shader program for `CALL` instructions before emitting any code,
103 * identifying the locations where a return needs to be inserted.
104 */
105 void FindReturnOffsets();
106
107 const std::array<u32, 1024>* program_code = nullptr;
108 const std::array<u32, 1024>* swizzle_data = nullptr;
109
110 /// Mapping of Pica VS instructions to pointers in the emitted code
111 std::array<Xbyak::Label, 1024> instruction_labels;
112
113 /// Offsets in code where a return needs to be inserted
114 std::vector<unsigned> return_offsets;
115
116 unsigned program_counter = 0; ///< Offset of the next instruction to decode
117 bool looping = false; ///< True if compiling a loop, used to check for nested loops
118
119 using CompiledShader = void(const void* setup, void* state, const u8* start_addr);
120 CompiledShader* program = nullptr;
121};
122
123} // Shader
124
125} // Pica