diff options
Diffstat (limited to 'src')
47 files changed, 5520 insertions, 338 deletions
diff --git a/src/citra/CMakeLists.txt b/src/citra/CMakeLists.txt index 918687312..1d6aac9a9 100644 --- a/src/citra/CMakeLists.txt +++ b/src/citra/CMakeLists.txt | |||
| @@ -14,7 +14,7 @@ set(HEADERS | |||
| 14 | create_directory_groups(${SRCS} ${HEADERS}) | 14 | create_directory_groups(${SRCS} ${HEADERS}) |
| 15 | 15 | ||
| 16 | add_executable(citra ${SRCS} ${HEADERS}) | 16 | add_executable(citra ${SRCS} ${HEADERS}) |
| 17 | target_link_libraries(citra core common video_core) | 17 | target_link_libraries(citra core video_core common) |
| 18 | target_link_libraries(citra ${GLFW_LIBRARIES} ${OPENGL_gl_LIBRARY} inih) | 18 | target_link_libraries(citra ${GLFW_LIBRARIES} ${OPENGL_gl_LIBRARY} inih) |
| 19 | if (MSVC) | 19 | if (MSVC) |
| 20 | target_link_libraries(citra getopt) | 20 | target_link_libraries(citra getopt) |
diff --git a/src/citra/citra.cpp b/src/citra/citra.cpp index 182646f4c..d6fcb66a5 100644 --- a/src/citra/citra.cpp +++ b/src/citra/citra.cpp | |||
| @@ -71,6 +71,7 @@ int main(int argc, char **argv) { | |||
| 71 | EmuWindow_GLFW* emu_window = new EmuWindow_GLFW; | 71 | EmuWindow_GLFW* emu_window = new EmuWindow_GLFW; |
| 72 | 72 | ||
| 73 | VideoCore::g_hw_renderer_enabled = Settings::values.use_hw_renderer; | 73 | VideoCore::g_hw_renderer_enabled = Settings::values.use_hw_renderer; |
| 74 | VideoCore::g_shader_jit_enabled = Settings::values.use_shader_jit; | ||
| 74 | 75 | ||
| 75 | System::Init(emu_window); | 76 | System::Init(emu_window); |
| 76 | 77 | ||
diff --git a/src/citra/config.cpp b/src/citra/config.cpp index 2c1407a6f..8a98bda87 100644 --- a/src/citra/config.cpp +++ b/src/citra/config.cpp | |||
| @@ -61,6 +61,7 @@ void Config::ReadValues() { | |||
| 61 | 61 | ||
| 62 | // Renderer | 62 | // Renderer |
| 63 | Settings::values.use_hw_renderer = glfw_config->GetBoolean("Renderer", "use_hw_renderer", false); | 63 | Settings::values.use_hw_renderer = glfw_config->GetBoolean("Renderer", "use_hw_renderer", false); |
| 64 | Settings::values.use_shader_jit = glfw_config->GetBoolean("Renderer", "use_shader_jit", true); | ||
| 64 | 65 | ||
| 65 | Settings::values.bg_red = (float)glfw_config->GetReal("Renderer", "bg_red", 1.0); | 66 | Settings::values.bg_red = (float)glfw_config->GetReal("Renderer", "bg_red", 1.0); |
| 66 | Settings::values.bg_green = (float)glfw_config->GetReal("Renderer", "bg_green", 1.0); | 67 | Settings::values.bg_green = (float)glfw_config->GetReal("Renderer", "bg_green", 1.0); |
diff --git a/src/citra/default_ini.h b/src/citra/default_ini.h index 1925bece8..7e5d49729 100644 --- a/src/citra/default_ini.h +++ b/src/citra/default_ini.h | |||
| @@ -42,6 +42,10 @@ frame_skip = | |||
| 42 | # 0 (default): Software, 1: Hardware | 42 | # 0 (default): Software, 1: Hardware |
| 43 | use_hw_renderer = | 43 | use_hw_renderer = |
| 44 | 44 | ||
| 45 | # Whether to use the Just-In-Time (JIT) compiler for shader emulation | ||
| 46 | # 0 : Interpreter (slow), 1 (default): JIT (fast) | ||
| 47 | use_shader_jit = | ||
| 48 | |||
| 45 | # The clear color for the renderer. What shows up on the sides of the bottom screen. | 49 | # The clear color for the renderer. What shows up on the sides of the bottom screen. |
| 46 | # Must be in range of 0.0-1.0. Defaults to 1.0 for all. | 50 | # Must be in range of 0.0-1.0. Defaults to 1.0 for all. |
| 47 | bg_red = | 51 | bg_red = |
diff --git a/src/citra_qt/CMakeLists.txt b/src/citra_qt/CMakeLists.txt index 47aaeca24..0c0515054 100644 --- a/src/citra_qt/CMakeLists.txt +++ b/src/citra_qt/CMakeLists.txt | |||
| @@ -71,7 +71,7 @@ if (APPLE) | |||
| 71 | else() | 71 | else() |
| 72 | add_executable(citra-qt ${SRCS} ${HEADERS} ${UI_HDRS}) | 72 | add_executable(citra-qt ${SRCS} ${HEADERS} ${UI_HDRS}) |
| 73 | endif() | 73 | endif() |
| 74 | target_link_libraries(citra-qt core common video_core qhexedit) | 74 | target_link_libraries(citra-qt core video_core common qhexedit) |
| 75 | target_link_libraries(citra-qt ${OPENGL_gl_LIBRARY} ${CITRA_QT_LIBS}) | 75 | target_link_libraries(citra-qt ${OPENGL_gl_LIBRARY} ${CITRA_QT_LIBS}) |
| 76 | target_link_libraries(citra-qt ${PLATFORM_LIBRARIES}) | 76 | target_link_libraries(citra-qt ${PLATFORM_LIBRARIES}) |
| 77 | 77 | ||
diff --git a/src/citra_qt/config.cpp b/src/citra_qt/config.cpp index 5716634ee..a20351fb8 100644 --- a/src/citra_qt/config.cpp +++ b/src/citra_qt/config.cpp | |||
| @@ -44,6 +44,7 @@ void Config::ReadValues() { | |||
| 44 | 44 | ||
| 45 | qt_config->beginGroup("Renderer"); | 45 | qt_config->beginGroup("Renderer"); |
| 46 | Settings::values.use_hw_renderer = qt_config->value("use_hw_renderer", false).toBool(); | 46 | Settings::values.use_hw_renderer = qt_config->value("use_hw_renderer", false).toBool(); |
| 47 | Settings::values.use_shader_jit = qt_config->value("use_shader_jit", true).toBool(); | ||
| 47 | 48 | ||
| 48 | Settings::values.bg_red = qt_config->value("bg_red", 1.0).toFloat(); | 49 | Settings::values.bg_red = qt_config->value("bg_red", 1.0).toFloat(); |
| 49 | Settings::values.bg_green = qt_config->value("bg_green", 1.0).toFloat(); | 50 | Settings::values.bg_green = qt_config->value("bg_green", 1.0).toFloat(); |
| @@ -77,6 +78,7 @@ void Config::SaveValues() { | |||
| 77 | 78 | ||
| 78 | qt_config->beginGroup("Renderer"); | 79 | qt_config->beginGroup("Renderer"); |
| 79 | qt_config->setValue("use_hw_renderer", Settings::values.use_hw_renderer); | 80 | qt_config->setValue("use_hw_renderer", Settings::values.use_hw_renderer); |
| 81 | qt_config->setValue("use_shader_jit", Settings::values.use_shader_jit); | ||
| 80 | 82 | ||
| 81 | // Cast to double because Qt's written float values are not human-readable | 83 | // Cast to double because Qt's written float values are not human-readable |
| 82 | qt_config->setValue("bg_red", (double)Settings::values.bg_red); | 84 | qt_config->setValue("bg_red", (double)Settings::values.bg_red); |
diff --git a/src/citra_qt/debugger/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics_vertex_shader.cpp index f42a2f4ce..302e22d7a 100644 --- a/src/citra_qt/debugger/graphics_vertex_shader.cpp +++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp | |||
| @@ -8,7 +8,7 @@ | |||
| 8 | #include <QBoxLayout> | 8 | #include <QBoxLayout> |
| 9 | #include <QTreeView> | 9 | #include <QTreeView> |
| 10 | 10 | ||
| 11 | #include "video_core/vertex_shader.h" | 11 | #include "video_core/shader/shader_interpreter.h" |
| 12 | 12 | ||
| 13 | #include "graphics_vertex_shader.h" | 13 | #include "graphics_vertex_shader.h" |
| 14 | 14 | ||
diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp index 6b030c178..4c3edf87a 100644 --- a/src/citra_qt/main.cpp +++ b/src/citra_qt/main.cpp | |||
| @@ -131,6 +131,9 @@ GMainWindow::GMainWindow() : emu_thread(nullptr) | |||
| 131 | ui.action_Use_Hardware_Renderer->setChecked(Settings::values.use_hw_renderer); | 131 | ui.action_Use_Hardware_Renderer->setChecked(Settings::values.use_hw_renderer); |
| 132 | SetHardwareRendererEnabled(ui.action_Use_Hardware_Renderer->isChecked()); | 132 | SetHardwareRendererEnabled(ui.action_Use_Hardware_Renderer->isChecked()); |
| 133 | 133 | ||
| 134 | ui.action_Use_Shader_JIT->setChecked(Settings::values.use_shader_jit); | ||
| 135 | SetShaderJITEnabled(ui.action_Use_Shader_JIT->isChecked()); | ||
| 136 | |||
| 134 | ui.action_Single_Window_Mode->setChecked(settings.value("singleWindowMode", true).toBool()); | 137 | ui.action_Single_Window_Mode->setChecked(settings.value("singleWindowMode", true).toBool()); |
| 135 | ToggleWindowMode(); | 138 | ToggleWindowMode(); |
| 136 | 139 | ||
| @@ -144,6 +147,7 @@ GMainWindow::GMainWindow() : emu_thread(nullptr) | |||
| 144 | connect(ui.action_Pause, SIGNAL(triggered()), this, SLOT(OnPauseGame())); | 147 | connect(ui.action_Pause, SIGNAL(triggered()), this, SLOT(OnPauseGame())); |
| 145 | connect(ui.action_Stop, SIGNAL(triggered()), this, SLOT(OnStopGame())); | 148 | connect(ui.action_Stop, SIGNAL(triggered()), this, SLOT(OnStopGame())); |
| 146 | connect(ui.action_Use_Hardware_Renderer, SIGNAL(triggered(bool)), this, SLOT(SetHardwareRendererEnabled(bool))); | 149 | connect(ui.action_Use_Hardware_Renderer, SIGNAL(triggered(bool)), this, SLOT(SetHardwareRendererEnabled(bool))); |
| 150 | connect(ui.action_Use_Shader_JIT, SIGNAL(triggered(bool)), this, SLOT(SetShaderJITEnabled(bool))); | ||
| 147 | connect(ui.action_Single_Window_Mode, SIGNAL(triggered(bool)), this, SLOT(ToggleWindowMode())); | 151 | connect(ui.action_Single_Window_Mode, SIGNAL(triggered(bool)), this, SLOT(ToggleWindowMode())); |
| 148 | connect(ui.action_Hotkeys, SIGNAL(triggered()), this, SLOT(OnOpenHotkeysDialog())); | 152 | connect(ui.action_Hotkeys, SIGNAL(triggered()), this, SLOT(OnOpenHotkeysDialog())); |
| 149 | 153 | ||
| @@ -331,6 +335,10 @@ void GMainWindow::SetHardwareRendererEnabled(bool enabled) { | |||
| 331 | VideoCore::g_hw_renderer_enabled = enabled; | 335 | VideoCore::g_hw_renderer_enabled = enabled; |
| 332 | } | 336 | } |
| 333 | 337 | ||
| 338 | void GMainWindow::SetShaderJITEnabled(bool enabled) { | ||
| 339 | VideoCore::g_shader_jit_enabled = enabled; | ||
| 340 | } | ||
| 341 | |||
| 334 | void GMainWindow::ToggleWindowMode() { | 342 | void GMainWindow::ToggleWindowMode() { |
| 335 | if (ui.action_Single_Window_Mode->isChecked()) { | 343 | if (ui.action_Single_Window_Mode->isChecked()) { |
| 336 | // Render in the main window... | 344 | // Render in the main window... |
diff --git a/src/citra_qt/main.h b/src/citra_qt/main.h index 9fe9e0c9c..61114a04d 100644 --- a/src/citra_qt/main.h +++ b/src/citra_qt/main.h | |||
| @@ -70,6 +70,7 @@ private slots: | |||
| 70 | void OnConfigure(); | 70 | void OnConfigure(); |
| 71 | void OnDisplayTitleBars(bool); | 71 | void OnDisplayTitleBars(bool); |
| 72 | void SetHardwareRendererEnabled(bool); | 72 | void SetHardwareRendererEnabled(bool); |
| 73 | void SetShaderJITEnabled(bool); | ||
| 73 | void ToggleWindowMode(); | 74 | void ToggleWindowMode(); |
| 74 | 75 | ||
| 75 | private: | 76 | private: |
diff --git a/src/citra_qt/main.ui b/src/citra_qt/main.ui index 9a809ee6c..b2ce8167d 100644 --- a/src/citra_qt/main.ui +++ b/src/citra_qt/main.ui | |||
| @@ -66,6 +66,7 @@ | |||
| 66 | <addaction name="action_Stop"/> | 66 | <addaction name="action_Stop"/> |
| 67 | <addaction name="separator"/> | 67 | <addaction name="separator"/> |
| 68 | <addaction name="action_Use_Hardware_Renderer"/> | 68 | <addaction name="action_Use_Hardware_Renderer"/> |
| 69 | <addaction name="action_Use_Shader_JIT"/> | ||
| 69 | <addaction name="action_Configure"/> | 70 | <addaction name="action_Configure"/> |
| 70 | </widget> | 71 | </widget> |
| 71 | <widget class="QMenu" name="menu_View"> | 72 | <widget class="QMenu" name="menu_View"> |
| @@ -153,6 +154,14 @@ | |||
| 153 | <string>Use Hardware Renderer</string> | 154 | <string>Use Hardware Renderer</string> |
| 154 | </property> | 155 | </property> |
| 155 | </action> | 156 | </action> |
| 157 | <action name="action_Use_Shader_JIT"> | ||
| 158 | <property name="checkable"> | ||
| 159 | <bool>true</bool> | ||
| 160 | </property> | ||
| 161 | <property name="text"> | ||
| 162 | <string>Use Shader JIT</string> | ||
| 163 | </property> | ||
| 164 | </action> | ||
| 156 | <action name="action_Configure"> | 165 | <action name="action_Configure"> |
| 157 | <property name="text"> | 166 | <property name="text"> |
| 158 | <string>Configure ...</string> | 167 | <string>Configure ...</string> |
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 4c086cd2f..e743a026d 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt | |||
| @@ -5,6 +5,7 @@ set(SRCS | |||
| 5 | break_points.cpp | 5 | break_points.cpp |
| 6 | emu_window.cpp | 6 | emu_window.cpp |
| 7 | file_util.cpp | 7 | file_util.cpp |
| 8 | hash.cpp | ||
| 8 | key_map.cpp | 9 | key_map.cpp |
| 9 | logging/filter.cpp | 10 | logging/filter.cpp |
| 10 | logging/text_formatter.cpp | 11 | logging/text_formatter.cpp |
| @@ -24,14 +25,15 @@ set(HEADERS | |||
| 24 | bit_field.h | 25 | bit_field.h |
| 25 | break_points.h | 26 | break_points.h |
| 26 | chunk_file.h | 27 | chunk_file.h |
| 28 | code_block.h | ||
| 27 | color.h | 29 | color.h |
| 28 | common_funcs.h | 30 | common_funcs.h |
| 29 | common_paths.h | 31 | common_paths.h |
| 30 | common_types.h | 32 | common_types.h |
| 31 | cpu_detect.h | ||
| 32 | debug_interface.h | 33 | debug_interface.h |
| 33 | emu_window.h | 34 | emu_window.h |
| 34 | file_util.h | 35 | file_util.h |
| 36 | hash.h | ||
| 35 | key_map.h | 37 | key_map.h |
| 36 | linear_disk_cache.h | 38 | linear_disk_cache.h |
| 37 | logging/text_formatter.h | 39 | logging/text_formatter.h |
| @@ -56,6 +58,18 @@ set(HEADERS | |||
| 56 | vector_math.h | 58 | vector_math.h |
| 57 | ) | 59 | ) |
| 58 | 60 | ||
| 61 | if(ARCHITECTURE_x86_64) | ||
| 62 | set(SRCS ${SRCS} | ||
| 63 | x64/abi.cpp | ||
| 64 | x64/cpu_detect.cpp | ||
| 65 | x64/emitter.cpp) | ||
| 66 | |||
| 67 | set(HEADERS ${HEADERS} | ||
| 68 | x64/abi.h | ||
| 69 | x64/cpu_detect.h | ||
| 70 | x64/emitter.h) | ||
| 71 | endif() | ||
| 72 | |||
| 59 | create_directory_groups(${SRCS} ${HEADERS}) | 73 | create_directory_groups(${SRCS} ${HEADERS}) |
| 60 | 74 | ||
| 61 | add_library(common STATIC ${SRCS} ${HEADERS}) | 75 | add_library(common STATIC ${SRCS} ${HEADERS}) |
diff --git a/src/common/code_block.h b/src/common/code_block.h new file mode 100644 index 000000000..9ef7296d3 --- /dev/null +++ b/src/common/code_block.h | |||
| @@ -0,0 +1,87 @@ | |||
| 1 | // Copyright 2013 Dolphin Emulator Project | ||
| 2 | // Licensed under GPLv2 | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include "common_types.h" | ||
| 8 | #include "memory_util.h" | ||
| 9 | |||
| 10 | // Everything that needs to generate code should inherit from this. | ||
| 11 | // You get memory management for free, plus, you can use all emitter functions without | ||
| 12 | // having to prefix them with gen-> or something similar. | ||
| 13 | // Example implementation: | ||
| 14 | // class JIT : public CodeBlock<ARMXEmitter> {} | ||
| 15 | template<class T> class CodeBlock : public T, NonCopyable | ||
| 16 | { | ||
| 17 | private: | ||
| 18 | // A privately used function to set the executable RAM space to something invalid. | ||
| 19 | // For debugging usefulness it should be used to set the RAM to a host specific breakpoint instruction | ||
| 20 | virtual void PoisonMemory() = 0; | ||
| 21 | |||
| 22 | protected: | ||
| 23 | u8 *region; | ||
| 24 | size_t region_size; | ||
| 25 | |||
| 26 | public: | ||
| 27 | CodeBlock() : region(nullptr), region_size(0) {} | ||
| 28 | virtual ~CodeBlock() { if (region) FreeCodeSpace(); } | ||
| 29 | |||
| 30 | // Call this before you generate any code. | ||
| 31 | void AllocCodeSpace(int size) | ||
| 32 | { | ||
| 33 | region_size = size; | ||
| 34 | region = (u8*)AllocateExecutableMemory(region_size); | ||
| 35 | T::SetCodePtr(region); | ||
| 36 | } | ||
| 37 | |||
| 38 | // Always clear code space with breakpoints, so that if someone accidentally executes | ||
| 39 | // uninitialized, it just breaks into the debugger. | ||
| 40 | void ClearCodeSpace() | ||
| 41 | { | ||
| 42 | PoisonMemory(); | ||
| 43 | ResetCodePtr(); | ||
| 44 | } | ||
| 45 | |||
| 46 | // Call this when shutting down. Don't rely on the destructor, even though it'll do the job. | ||
| 47 | void FreeCodeSpace() | ||
| 48 | { | ||
| 49 | #ifdef __SYMBIAN32__ | ||
| 50 | ResetExecutableMemory(region); | ||
| 51 | #else | ||
| 52 | FreeMemoryPages(region, region_size); | ||
| 53 | #endif | ||
| 54 | region = nullptr; | ||
| 55 | region_size = 0; | ||
| 56 | } | ||
| 57 | |||
| 58 | bool IsInSpace(const u8 *ptr) | ||
| 59 | { | ||
| 60 | return (ptr >= region) && (ptr < (region + region_size)); | ||
| 61 | } | ||
| 62 | |||
| 63 | // Cannot currently be undone. Will write protect the entire code region. | ||
| 64 | // Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()). | ||
| 65 | void WriteProtect() | ||
| 66 | { | ||
| 67 | WriteProtectMemory(region, region_size, true); | ||
| 68 | } | ||
| 69 | |||
| 70 | void ResetCodePtr() | ||
| 71 | { | ||
| 72 | T::SetCodePtr(region); | ||
| 73 | } | ||
| 74 | |||
| 75 | size_t GetSpaceLeft() const | ||
| 76 | { | ||
| 77 | return region_size - (T::GetCodePtr() - region); | ||
| 78 | } | ||
| 79 | |||
| 80 | u8 *GetBasePtr() { | ||
| 81 | return region; | ||
| 82 | } | ||
| 83 | |||
| 84 | size_t GetOffset(const u8 *ptr) const { | ||
| 85 | return ptr - region; | ||
| 86 | } | ||
| 87 | }; | ||
diff --git a/src/common/common_funcs.h b/src/common/common_funcs.h index 83b47f61e..88e452a16 100644 --- a/src/common/common_funcs.h +++ b/src/common/common_funcs.h | |||
| @@ -35,7 +35,7 @@ | |||
| 35 | 35 | ||
| 36 | #ifndef _MSC_VER | 36 | #ifndef _MSC_VER |
| 37 | 37 | ||
| 38 | #if defined(__x86_64__) || defined(_M_X64) | 38 | #ifdef ARCHITECTURE_x86_64 |
| 39 | #define Crash() __asm__ __volatile__("int $3") | 39 | #define Crash() __asm__ __volatile__("int $3") |
| 40 | #elif defined(_M_ARM) | 40 | #elif defined(_M_ARM) |
| 41 | #define Crash() __asm__ __volatile__("trap") | 41 | #define Crash() __asm__ __volatile__("trap") |
diff --git a/src/common/cpu_detect.h b/src/common/cpu_detect.h deleted file mode 100644 index b585f9608..000000000 --- a/src/common/cpu_detect.h +++ /dev/null | |||
| @@ -1,78 +0,0 @@ | |||
| 1 | // Copyright 2013 Dolphin Emulator Project / 2014 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | |||
| 6 | // Detect the cpu, so we'll know which optimizations to use | ||
| 7 | #pragma once | ||
| 8 | |||
| 9 | #include <string> | ||
| 10 | |||
| 11 | enum CPUVendor | ||
| 12 | { | ||
| 13 | VENDOR_INTEL = 0, | ||
| 14 | VENDOR_AMD = 1, | ||
| 15 | VENDOR_ARM = 2, | ||
| 16 | VENDOR_OTHER = 3, | ||
| 17 | }; | ||
| 18 | |||
| 19 | struct CPUInfo | ||
| 20 | { | ||
| 21 | CPUVendor vendor; | ||
| 22 | |||
| 23 | char cpu_string[0x21]; | ||
| 24 | char brand_string[0x41]; | ||
| 25 | bool OS64bit; | ||
| 26 | bool CPU64bit; | ||
| 27 | bool Mode64bit; | ||
| 28 | |||
| 29 | bool HTT; | ||
| 30 | int num_cores; | ||
| 31 | int logical_cpu_count; | ||
| 32 | |||
| 33 | bool bSSE; | ||
| 34 | bool bSSE2; | ||
| 35 | bool bSSE3; | ||
| 36 | bool bSSSE3; | ||
| 37 | bool bPOPCNT; | ||
| 38 | bool bSSE4_1; | ||
| 39 | bool bSSE4_2; | ||
| 40 | bool bLZCNT; | ||
| 41 | bool bSSE4A; | ||
| 42 | bool bAVX; | ||
| 43 | bool bAES; | ||
| 44 | bool bLAHFSAHF64; | ||
| 45 | bool bLongMode; | ||
| 46 | |||
| 47 | // ARM specific CPUInfo | ||
| 48 | bool bSwp; | ||
| 49 | bool bHalf; | ||
| 50 | bool bThumb; | ||
| 51 | bool bFastMult; | ||
| 52 | bool bVFP; | ||
| 53 | bool bEDSP; | ||
| 54 | bool bThumbEE; | ||
| 55 | bool bNEON; | ||
| 56 | bool bVFPv3; | ||
| 57 | bool bTLS; | ||
| 58 | bool bVFPv4; | ||
| 59 | bool bIDIVa; | ||
| 60 | bool bIDIVt; | ||
| 61 | bool bArmV7; // enable MOVT, MOVW etc | ||
| 62 | |||
| 63 | // ARMv8 specific | ||
| 64 | bool bFP; | ||
| 65 | bool bASIMD; | ||
| 66 | |||
| 67 | // Call Detect() | ||
| 68 | explicit CPUInfo(); | ||
| 69 | |||
| 70 | // Turn the cpu info into a string we can show | ||
| 71 | std::string Summarize(); | ||
| 72 | |||
| 73 | private: | ||
| 74 | // Detects the various cpu features | ||
| 75 | void Detect(); | ||
| 76 | }; | ||
| 77 | |||
| 78 | extern CPUInfo cpu_info; | ||
diff --git a/src/common/hash.cpp b/src/common/hash.cpp new file mode 100644 index 000000000..413e9c6f1 --- /dev/null +++ b/src/common/hash.cpp | |||
| @@ -0,0 +1,126 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #if defined(_MSC_VER) | ||
| 6 | #include <stdlib.h> | ||
| 7 | #endif | ||
| 8 | |||
| 9 | #include "common_funcs.h" | ||
| 10 | #include "common_types.h" | ||
| 11 | #include "hash.h" | ||
| 12 | |||
| 13 | namespace Common { | ||
| 14 | |||
| 15 | // MurmurHash3 was written by Austin Appleby, and is placed in the public | ||
| 16 | // domain. The author hereby disclaims copyright to this source code. | ||
| 17 | |||
| 18 | // Block read - if your platform needs to do endian-swapping or can only handle aligned reads, do | ||
| 19 | // the conversion here | ||
| 20 | |||
| 21 | static FORCE_INLINE u32 getblock32(const u32* p, int i) { | ||
| 22 | return p[i]; | ||
| 23 | } | ||
| 24 | |||
| 25 | static FORCE_INLINE u64 getblock64(const u64* p, int i) { | ||
| 26 | return p[i]; | ||
| 27 | } | ||
| 28 | |||
| 29 | // Finalization mix - force all bits of a hash block to avalanche | ||
| 30 | |||
| 31 | static FORCE_INLINE u32 fmix32(u32 h) { | ||
| 32 | h ^= h >> 16; | ||
| 33 | h *= 0x85ebca6b; | ||
| 34 | h ^= h >> 13; | ||
| 35 | h *= 0xc2b2ae35; | ||
| 36 | h ^= h >> 16; | ||
| 37 | |||
| 38 | return h; | ||
| 39 | } | ||
| 40 | |||
| 41 | static FORCE_INLINE u64 fmix64(u64 k) { | ||
| 42 | k ^= k >> 33; | ||
| 43 | k *= 0xff51afd7ed558ccdllu; | ||
| 44 | k ^= k >> 33; | ||
| 45 | k *= 0xc4ceb9fe1a85ec53llu; | ||
| 46 | k ^= k >> 33; | ||
| 47 | |||
| 48 | return k; | ||
| 49 | } | ||
| 50 | |||
| 51 | // This is the 128-bit variant of the MurmurHash3 hash function that is targetted for 64-bit | ||
| 52 | // platforms (MurmurHash3_x64_128). It was taken from: | ||
| 53 | // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp | ||
| 54 | void MurmurHash3_128(const void* key, int len, u32 seed, void* out) { | ||
| 55 | const u8 * data = (const u8*)key; | ||
| 56 | const int nblocks = len / 16; | ||
| 57 | |||
| 58 | u64 h1 = seed; | ||
| 59 | u64 h2 = seed; | ||
| 60 | |||
| 61 | const u64 c1 = 0x87c37b91114253d5llu; | ||
| 62 | const u64 c2 = 0x4cf5ad432745937fllu; | ||
| 63 | |||
| 64 | // Body | ||
| 65 | |||
| 66 | const u64 * blocks = (const u64 *)(data); | ||
| 67 | |||
| 68 | for (int i = 0; i < nblocks; i++) { | ||
| 69 | u64 k1 = getblock64(blocks,i*2+0); | ||
| 70 | u64 k2 = getblock64(blocks,i*2+1); | ||
| 71 | |||
| 72 | k1 *= c1; k1 = _rotl64(k1,31); k1 *= c2; h1 ^= k1; | ||
| 73 | |||
| 74 | h1 = _rotl64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; | ||
| 75 | |||
| 76 | k2 *= c2; k2 = _rotl64(k2,33); k2 *= c1; h2 ^= k2; | ||
| 77 | |||
| 78 | h2 = _rotl64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; | ||
| 79 | } | ||
| 80 | |||
| 81 | // Tail | ||
| 82 | |||
| 83 | const u8 * tail = (const u8*)(data + nblocks*16); | ||
| 84 | |||
| 85 | u64 k1 = 0; | ||
| 86 | u64 k2 = 0; | ||
| 87 | |||
| 88 | switch (len & 15) { | ||
| 89 | case 15: k2 ^= ((u64)tail[14]) << 48; | ||
| 90 | case 14: k2 ^= ((u64)tail[13]) << 40; | ||
| 91 | case 13: k2 ^= ((u64)tail[12]) << 32; | ||
| 92 | case 12: k2 ^= ((u64)tail[11]) << 24; | ||
| 93 | case 11: k2 ^= ((u64)tail[10]) << 16; | ||
| 94 | case 10: k2 ^= ((u64)tail[ 9]) << 8; | ||
| 95 | case 9: k2 ^= ((u64)tail[ 8]) << 0; | ||
| 96 | k2 *= c2; k2 = _rotl64(k2,33); k2 *= c1; h2 ^= k2; | ||
| 97 | |||
| 98 | case 8: k1 ^= ((u64)tail[ 7]) << 56; | ||
| 99 | case 7: k1 ^= ((u64)tail[ 6]) << 48; | ||
| 100 | case 6: k1 ^= ((u64)tail[ 5]) << 40; | ||
| 101 | case 5: k1 ^= ((u64)tail[ 4]) << 32; | ||
| 102 | case 4: k1 ^= ((u64)tail[ 3]) << 24; | ||
| 103 | case 3: k1 ^= ((u64)tail[ 2]) << 16; | ||
| 104 | case 2: k1 ^= ((u64)tail[ 1]) << 8; | ||
| 105 | case 1: k1 ^= ((u64)tail[ 0]) << 0; | ||
| 106 | k1 *= c1; k1 = _rotl64(k1,31); k1 *= c2; h1 ^= k1; | ||
| 107 | }; | ||
| 108 | |||
| 109 | // Finalization | ||
| 110 | |||
| 111 | h1 ^= len; h2 ^= len; | ||
| 112 | |||
| 113 | h1 += h2; | ||
| 114 | h2 += h1; | ||
| 115 | |||
| 116 | h1 = fmix64(h1); | ||
| 117 | h2 = fmix64(h2); | ||
| 118 | |||
| 119 | h1 += h2; | ||
| 120 | h2 += h1; | ||
| 121 | |||
| 122 | ((u64*)out)[0] = h1; | ||
| 123 | ((u64*)out)[1] = h2; | ||
| 124 | } | ||
| 125 | |||
| 126 | } // namespace Common | ||
diff --git a/src/common/hash.h b/src/common/hash.h new file mode 100644 index 000000000..a3850be68 --- /dev/null +++ b/src/common/hash.h | |||
| @@ -0,0 +1,25 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include "common/common_types.h" | ||
| 8 | |||
| 9 | namespace Common { | ||
| 10 | |||
| 11 | void MurmurHash3_128(const void* key, int len, u32 seed, void* out); | ||
| 12 | |||
| 13 | /** | ||
| 14 | * Computes a 64-bit hash over the specified block of data | ||
| 15 | * @param data Block of data to compute hash over | ||
| 16 | * @param len Length of data (in bytes) to compute hash over | ||
| 17 | * @returns 64-bit hash value that was computed over the data block | ||
| 18 | */ | ||
| 19 | static inline u64 ComputeHash64(const void* data, int len) { | ||
| 20 | u64 res[2]; | ||
| 21 | MurmurHash3_128(data, len, 0, res); | ||
| 22 | return res[0]; | ||
| 23 | } | ||
| 24 | |||
| 25 | } // namespace Common | ||
diff --git a/src/common/memory_util.cpp b/src/common/memory_util.cpp index 2b3ace528..5ef784224 100644 --- a/src/common/memory_util.cpp +++ b/src/common/memory_util.cpp | |||
| @@ -16,7 +16,7 @@ | |||
| 16 | #include <sys/mman.h> | 16 | #include <sys/mman.h> |
| 17 | #endif | 17 | #endif |
| 18 | 18 | ||
| 19 | #if !defined(_WIN32) && defined(__x86_64__) && !defined(MAP_32BIT) | 19 | #if !defined(_WIN32) && defined(ARCHITECTURE_X64) && !defined(MAP_32BIT) |
| 20 | #include <unistd.h> | 20 | #include <unistd.h> |
| 21 | #define PAGE_MASK (getpagesize() - 1) | 21 | #define PAGE_MASK (getpagesize() - 1) |
| 22 | #define round_page(x) ((((unsigned long)(x)) + PAGE_MASK) & ~(PAGE_MASK)) | 22 | #define round_page(x) ((((unsigned long)(x)) + PAGE_MASK) & ~(PAGE_MASK)) |
| @@ -31,7 +31,7 @@ void* AllocateExecutableMemory(size_t size, bool low) | |||
| 31 | void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); | 31 | void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); |
| 32 | #else | 32 | #else |
| 33 | static char *map_hint = 0; | 33 | static char *map_hint = 0; |
| 34 | #if defined(__x86_64__) && !defined(MAP_32BIT) | 34 | #if defined(ARCHITECTURE_X64) && !defined(MAP_32BIT) |
| 35 | // This OS has no flag to enforce allocation below the 4 GB boundary, | 35 | // This OS has no flag to enforce allocation below the 4 GB boundary, |
| 36 | // but if we hint that we want a low address it is very likely we will | 36 | // but if we hint that we want a low address it is very likely we will |
| 37 | // get one. | 37 | // get one. |
| @@ -43,7 +43,7 @@ void* AllocateExecutableMemory(size_t size, bool low) | |||
| 43 | #endif | 43 | #endif |
| 44 | void* ptr = mmap(map_hint, size, PROT_READ | PROT_WRITE | PROT_EXEC, | 44 | void* ptr = mmap(map_hint, size, PROT_READ | PROT_WRITE | PROT_EXEC, |
| 45 | MAP_ANON | MAP_PRIVATE | 45 | MAP_ANON | MAP_PRIVATE |
| 46 | #if defined(__x86_64__) && defined(MAP_32BIT) | 46 | #if defined(ARCHITECTURE_X64) && defined(MAP_32BIT) |
| 47 | | (low ? MAP_32BIT : 0) | 47 | | (low ? MAP_32BIT : 0) |
| 48 | #endif | 48 | #endif |
| 49 | , -1, 0); | 49 | , -1, 0); |
| @@ -62,7 +62,7 @@ void* AllocateExecutableMemory(size_t size, bool low) | |||
| 62 | #endif | 62 | #endif |
| 63 | LOG_ERROR(Common_Memory, "Failed to allocate executable memory"); | 63 | LOG_ERROR(Common_Memory, "Failed to allocate executable memory"); |
| 64 | } | 64 | } |
| 65 | #if !defined(_WIN32) && defined(__x86_64__) && !defined(MAP_32BIT) | 65 | #if !defined(_WIN32) && defined(ARCHITECTURE_X64) && !defined(MAP_32BIT) |
| 66 | else | 66 | else |
| 67 | { | 67 | { |
| 68 | if (low) | 68 | if (low) |
diff --git a/src/common/platform.h b/src/common/platform.h index 0a912dda3..9ba4db11b 100644 --- a/src/common/platform.h +++ b/src/common/platform.h | |||
| @@ -27,7 +27,7 @@ | |||
| 27 | //////////////////////////////////////////////////////////////////////////////////////////////////// | 27 | //////////////////////////////////////////////////////////////////////////////////////////////////// |
| 28 | // Platform detection | 28 | // Platform detection |
| 29 | 29 | ||
| 30 | #if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) | 30 | #if defined(ARCHITECTURE_x86_64) || defined(__aarch64__) |
| 31 | #define EMU_ARCH_BITS 64 | 31 | #define EMU_ARCH_BITS 64 |
| 32 | #elif defined(__i386) || defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) | 32 | #elif defined(__i386) || defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) |
| 33 | #define EMU_ARCH_BITS 32 | 33 | #define EMU_ARCH_BITS 32 |
diff --git a/src/common/x64/abi.cpp b/src/common/x64/abi.cpp new file mode 100644 index 000000000..4c07a6ebe --- /dev/null +++ b/src/common/x64/abi.cpp | |||
| @@ -0,0 +1,680 @@ | |||
| 1 | // Copyright (C) 2003 Dolphin Project. | ||
| 2 | |||
| 3 | // This program is free software: you can redistribute it and/or modify | ||
| 4 | // it under the terms of the GNU General Public License as published by | ||
| 5 | // the Free Software Foundation, version 2.0 or later versions. | ||
| 6 | |||
| 7 | // This program is distributed in the hope that it will be useful, | ||
| 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 10 | // GNU General Public License 2.0 for more details. | ||
| 11 | |||
| 12 | // A copy of the GPL 2.0 should have been included with the program. | ||
| 13 | // If not, see http://www.gnu.org/licenses/ | ||
| 14 | |||
| 15 | // Official SVN repository and contact information can be found at | ||
| 16 | // http://code.google.com/p/dolphin-emu/ | ||
| 17 | |||
| 18 | #include "abi.h" | ||
| 19 | #include "emitter.h" | ||
| 20 | |||
| 21 | using namespace Gen; | ||
| 22 | |||
| 23 | // Shared code between Win64 and Unix64 | ||
| 24 | |||
| 25 | // Sets up a __cdecl function. | ||
| 26 | void XEmitter::ABI_EmitPrologue(int maxCallParams) | ||
| 27 | { | ||
| 28 | #ifdef _M_IX86 | ||
| 29 | // Don't really need to do anything | ||
| 30 | #elif defined(ARCHITECTURE_x86_64) | ||
| 31 | #if _WIN32 | ||
| 32 | int stacksize = ((maxCallParams + 1) & ~1) * 8 + 8; | ||
| 33 | // Set up a stack frame so that we can call functions | ||
| 34 | // TODO: use maxCallParams | ||
| 35 | SUB(64, R(RSP), Imm8(stacksize)); | ||
| 36 | #endif | ||
| 37 | #else | ||
| 38 | #error Arch not supported | ||
| 39 | #endif | ||
| 40 | } | ||
| 41 | |||
| 42 | void XEmitter::ABI_EmitEpilogue(int maxCallParams) | ||
| 43 | { | ||
| 44 | #ifdef _M_IX86 | ||
| 45 | RET(); | ||
| 46 | #elif defined(ARCHITECTURE_x86_64) | ||
| 47 | #ifdef _WIN32 | ||
| 48 | int stacksize = ((maxCallParams+1)&~1)*8 + 8; | ||
| 49 | ADD(64, R(RSP), Imm8(stacksize)); | ||
| 50 | #endif | ||
| 51 | RET(); | ||
| 52 | #else | ||
| 53 | #error Arch not supported | ||
| 54 | |||
| 55 | |||
| 56 | #endif | ||
| 57 | } | ||
| 58 | |||
| 59 | #ifdef _M_IX86 // All32 | ||
| 60 | |||
| 61 | // Shared code between Win32 and Unix32 | ||
| 62 | void XEmitter::ABI_CallFunction(const void *func) { | ||
| 63 | ABI_AlignStack(0); | ||
| 64 | CALL(func); | ||
| 65 | ABI_RestoreStack(0); | ||
| 66 | } | ||
| 67 | |||
| 68 | void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) { | ||
| 69 | ABI_AlignStack(1 * 2); | ||
| 70 | PUSH(16, Imm16(param1)); | ||
| 71 | CALL(func); | ||
| 72 | ABI_RestoreStack(1 * 2); | ||
| 73 | } | ||
| 74 | |||
| 75 | void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) { | ||
| 76 | ABI_AlignStack(1 * 2 + 1 * 4); | ||
| 77 | PUSH(16, Imm16(param2)); | ||
| 78 | PUSH(32, Imm32(param1)); | ||
| 79 | CALL(func); | ||
| 80 | ABI_RestoreStack(1 * 2 + 1 * 4); | ||
| 81 | } | ||
| 82 | |||
| 83 | void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) { | ||
| 84 | ABI_AlignStack(1 * 4); | ||
| 85 | PUSH(32, Imm32(param1)); | ||
| 86 | CALL(func); | ||
| 87 | ABI_RestoreStack(1 * 4); | ||
| 88 | } | ||
| 89 | |||
| 90 | void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) { | ||
| 91 | ABI_AlignStack(2 * 4); | ||
| 92 | PUSH(32, Imm32(param2)); | ||
| 93 | PUSH(32, Imm32(param1)); | ||
| 94 | CALL(func); | ||
| 95 | ABI_RestoreStack(2 * 4); | ||
| 96 | } | ||
| 97 | |||
| 98 | void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) { | ||
| 99 | ABI_AlignStack(3 * 4); | ||
| 100 | PUSH(32, Imm32(param3)); | ||
| 101 | PUSH(32, Imm32(param2)); | ||
| 102 | PUSH(32, Imm32(param1)); | ||
| 103 | CALL(func); | ||
| 104 | ABI_RestoreStack(3 * 4); | ||
| 105 | } | ||
| 106 | |||
| 107 | void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) { | ||
| 108 | ABI_AlignStack(3 * 4); | ||
| 109 | PUSH(32, ImmPtr(param3)); | ||
| 110 | PUSH(32, Imm32(param2)); | ||
| 111 | PUSH(32, Imm32(param1)); | ||
| 112 | CALL(func); | ||
| 113 | ABI_RestoreStack(3 * 4); | ||
| 114 | } | ||
| 115 | |||
| 116 | void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2,u32 param3, void *param4) { | ||
| 117 | ABI_AlignStack(4 * 4); | ||
| 118 | PUSH(32, ImmPtr(param4)); | ||
| 119 | PUSH(32, Imm32(param3)); | ||
| 120 | PUSH(32, Imm32(param2)); | ||
| 121 | PUSH(32, Imm32(param1)); | ||
| 122 | CALL(func); | ||
| 123 | ABI_RestoreStack(4 * 4); | ||
| 124 | } | ||
| 125 | |||
| 126 | void XEmitter::ABI_CallFunctionP(const void *func, void *param1) { | ||
| 127 | ABI_AlignStack(1 * 4); | ||
| 128 | PUSH(32, ImmPtr(param1)); | ||
| 129 | CALL(func); | ||
| 130 | ABI_RestoreStack(1 * 4); | ||
| 131 | } | ||
| 132 | |||
| 133 | void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) { | ||
| 134 | ABI_AlignStack(2 * 4); | ||
| 135 | PUSH(32, arg2); | ||
| 136 | PUSH(32, ImmPtr(param1)); | ||
| 137 | CALL(func); | ||
| 138 | ABI_RestoreStack(2 * 4); | ||
| 139 | } | ||
| 140 | |||
| 141 | void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) { | ||
| 142 | ABI_AlignStack(3 * 4); | ||
| 143 | PUSH(32, arg3); | ||
| 144 | PUSH(32, arg2); | ||
| 145 | PUSH(32, ImmPtr(param1)); | ||
| 146 | CALL(func); | ||
| 147 | ABI_RestoreStack(3 * 4); | ||
| 148 | } | ||
| 149 | |||
| 150 | void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) { | ||
| 151 | ABI_AlignStack(3 * 4); | ||
| 152 | PUSH(32, Imm32(param3)); | ||
| 153 | PUSH(32, ImmPtr(param2)); | ||
| 154 | PUSH(32, ImmPtr(param1)); | ||
| 155 | CALL(func); | ||
| 156 | ABI_RestoreStack(3 * 4); | ||
| 157 | } | ||
| 158 | |||
| 159 | // Pass a register as a parameter. | ||
| 160 | void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) { | ||
| 161 | ABI_AlignStack(1 * 4); | ||
| 162 | PUSH(32, R(reg1)); | ||
| 163 | CALL(func); | ||
| 164 | ABI_RestoreStack(1 * 4); | ||
| 165 | } | ||
| 166 | |||
| 167 | // Pass two registers as parameters. | ||
| 168 | void XEmitter::ABI_CallFunctionRR(const void *func, Gen::X64Reg reg1, Gen::X64Reg reg2) | ||
| 169 | { | ||
| 170 | ABI_AlignStack(2 * 4); | ||
| 171 | PUSH(32, R(reg2)); | ||
| 172 | PUSH(32, R(reg1)); | ||
| 173 | CALL(func); | ||
| 174 | ABI_RestoreStack(2 * 4); | ||
| 175 | } | ||
| 176 | |||
| 177 | void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2) | ||
| 178 | { | ||
| 179 | ABI_AlignStack(2 * 4); | ||
| 180 | PUSH(32, Imm32(param2)); | ||
| 181 | PUSH(32, arg1); | ||
| 182 | CALL(func); | ||
| 183 | ABI_RestoreStack(2 * 4); | ||
| 184 | } | ||
| 185 | |||
| 186 | void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3) | ||
| 187 | { | ||
| 188 | ABI_AlignStack(3 * 4); | ||
| 189 | PUSH(32, Imm32(param3)); | ||
| 190 | PUSH(32, Imm32(param2)); | ||
| 191 | PUSH(32, arg1); | ||
| 192 | CALL(func); | ||
| 193 | ABI_RestoreStack(3 * 4); | ||
| 194 | } | ||
| 195 | |||
| 196 | void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1) | ||
| 197 | { | ||
| 198 | ABI_AlignStack(1 * 4); | ||
| 199 | PUSH(32, arg1); | ||
| 200 | CALL(func); | ||
| 201 | ABI_RestoreStack(1 * 4); | ||
| 202 | } | ||
| 203 | |||
| 204 | void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2) | ||
| 205 | { | ||
| 206 | ABI_AlignStack(2 * 4); | ||
| 207 | PUSH(32, arg2); | ||
| 208 | PUSH(32, arg1); | ||
| 209 | CALL(func); | ||
| 210 | ABI_RestoreStack(2 * 4); | ||
| 211 | } | ||
| 212 | |||
| 213 | void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { | ||
| 214 | // Note: 4 * 4 = 16 bytes, so alignment is preserved. | ||
| 215 | PUSH(EBP); | ||
| 216 | PUSH(EBX); | ||
| 217 | PUSH(ESI); | ||
| 218 | PUSH(EDI); | ||
| 219 | } | ||
| 220 | |||
| 221 | void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { | ||
| 222 | POP(EDI); | ||
| 223 | POP(ESI); | ||
| 224 | POP(EBX); | ||
| 225 | POP(EBP); | ||
| 226 | } | ||
| 227 | |||
| 228 | unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { | ||
| 229 | frameSize += 4; // reserve space for return address | ||
| 230 | unsigned int alignedSize = | ||
| 231 | #ifdef __GNUC__ | ||
| 232 | (frameSize + 15) & -16; | ||
| 233 | #else | ||
| 234 | (frameSize + 3) & -4; | ||
| 235 | #endif | ||
| 236 | return alignedSize; | ||
| 237 | } | ||
| 238 | |||
| 239 | |||
| 240 | void XEmitter::ABI_AlignStack(unsigned int frameSize) { | ||
| 241 | // Mac OS X requires the stack to be 16-byte aligned before every call. | ||
| 242 | // Linux requires the stack to be 16-byte aligned before calls that put SSE | ||
| 243 | // vectors on the stack, but since we do not keep track of which calls do that, | ||
| 244 | // it is effectively every call as well. | ||
| 245 | // Windows binaries compiled with MSVC do not have such a restriction*, but I | ||
| 246 | // expect that GCC on Windows acts the same as GCC on Linux in this respect. | ||
| 247 | // It would be nice if someone could verify this. | ||
| 248 | // *However, the MSVC optimizing compiler assumes a 4-byte-aligned stack at times. | ||
| 249 | unsigned int fillSize = | ||
| 250 | ABI_GetAlignedFrameSize(frameSize) - (frameSize + 4); | ||
| 251 | if (fillSize != 0) { | ||
| 252 | SUB(32, R(ESP), Imm8(fillSize)); | ||
| 253 | } | ||
| 254 | } | ||
| 255 | |||
| 256 | void XEmitter::ABI_RestoreStack(unsigned int frameSize) { | ||
| 257 | unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize); | ||
| 258 | alignedSize -= 4; // return address is POPped at end of call | ||
| 259 | if (alignedSize != 0) { | ||
| 260 | ADD(32, R(ESP), Imm8(alignedSize)); | ||
| 261 | } | ||
| 262 | } | ||
| 263 | |||
| 264 | #else //64bit | ||
| 265 | |||
| 266 | // Common functions | ||
| 267 | void XEmitter::ABI_CallFunction(const void *func) { | ||
| 268 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 269 | if (distance >= 0x0000000080000000ULL | ||
| 270 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 271 | // Far call | ||
| 272 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 273 | CALLptr(R(RAX)); | ||
| 274 | } else { | ||
| 275 | CALL(func); | ||
| 276 | } | ||
| 277 | } | ||
| 278 | |||
| 279 | void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) { | ||
| 280 | MOV(32, R(ABI_PARAM1), Imm32((u32)param1)); | ||
| 281 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 282 | if (distance >= 0x0000000080000000ULL | ||
| 283 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 284 | // Far call | ||
| 285 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 286 | CALLptr(R(RAX)); | ||
| 287 | } else { | ||
| 288 | CALL(func); | ||
| 289 | } | ||
| 290 | } | ||
| 291 | |||
| 292 | void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) { | ||
| 293 | MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||
| 294 | MOV(32, R(ABI_PARAM2), Imm32((u32)param2)); | ||
| 295 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 296 | if (distance >= 0x0000000080000000ULL | ||
| 297 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 298 | // Far call | ||
| 299 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 300 | CALLptr(R(RAX)); | ||
| 301 | } else { | ||
| 302 | CALL(func); | ||
| 303 | } | ||
| 304 | } | ||
| 305 | |||
| 306 | void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) { | ||
| 307 | MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||
| 308 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 309 | if (distance >= 0x0000000080000000ULL | ||
| 310 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 311 | // Far call | ||
| 312 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 313 | CALLptr(R(RAX)); | ||
| 314 | } else { | ||
| 315 | CALL(func); | ||
| 316 | } | ||
| 317 | } | ||
| 318 | |||
| 319 | void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) { | ||
| 320 | MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||
| 321 | MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||
| 322 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 323 | if (distance >= 0x0000000080000000ULL | ||
| 324 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 325 | // Far call | ||
| 326 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 327 | CALLptr(R(RAX)); | ||
| 328 | } else { | ||
| 329 | CALL(func); | ||
| 330 | } | ||
| 331 | } | ||
| 332 | |||
| 333 | void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) { | ||
| 334 | MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||
| 335 | MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||
| 336 | MOV(32, R(ABI_PARAM3), Imm32(param3)); | ||
| 337 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 338 | if (distance >= 0x0000000080000000ULL | ||
| 339 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 340 | // Far call | ||
| 341 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 342 | CALLptr(R(RAX)); | ||
| 343 | } else { | ||
| 344 | CALL(func); | ||
| 345 | } | ||
| 346 | } | ||
| 347 | |||
| 348 | void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) { | ||
| 349 | MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||
| 350 | MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||
| 351 | MOV(64, R(ABI_PARAM3), ImmPtr(param3)); | ||
| 352 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 353 | if (distance >= 0x0000000080000000ULL | ||
| 354 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 355 | // Far call | ||
| 356 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 357 | CALLptr(R(RAX)); | ||
| 358 | } else { | ||
| 359 | CALL(func); | ||
| 360 | } | ||
| 361 | } | ||
| 362 | |||
| 363 | void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4) { | ||
| 364 | MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||
| 365 | MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||
| 366 | MOV(32, R(ABI_PARAM3), Imm32(param3)); | ||
| 367 | MOV(64, R(ABI_PARAM4), ImmPtr(param4)); | ||
| 368 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 369 | if (distance >= 0x0000000080000000ULL | ||
| 370 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 371 | // Far call | ||
| 372 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 373 | CALLptr(R(RAX)); | ||
| 374 | } else { | ||
| 375 | CALL(func); | ||
| 376 | } | ||
| 377 | } | ||
| 378 | |||
| 379 | void XEmitter::ABI_CallFunctionP(const void *func, void *param1) { | ||
| 380 | MOV(64, R(ABI_PARAM1), ImmPtr(param1)); | ||
| 381 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 382 | if (distance >= 0x0000000080000000ULL | ||
| 383 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 384 | // Far call | ||
| 385 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 386 | CALLptr(R(RAX)); | ||
| 387 | } else { | ||
| 388 | CALL(func); | ||
| 389 | } | ||
| 390 | } | ||
| 391 | |||
| 392 | void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) { | ||
| 393 | MOV(64, R(ABI_PARAM1), ImmPtr(param1)); | ||
| 394 | if (!arg2.IsSimpleReg(ABI_PARAM2)) | ||
| 395 | MOV(32, R(ABI_PARAM2), arg2); | ||
| 396 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 397 | if (distance >= 0x0000000080000000ULL | ||
| 398 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 399 | // Far call | ||
| 400 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 401 | CALLptr(R(RAX)); | ||
| 402 | } else { | ||
| 403 | CALL(func); | ||
| 404 | } | ||
| 405 | } | ||
| 406 | |||
| 407 | void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) { | ||
| 408 | MOV(64, R(ABI_PARAM1), ImmPtr(param1)); | ||
| 409 | if (!arg2.IsSimpleReg(ABI_PARAM2)) | ||
| 410 | MOV(32, R(ABI_PARAM2), arg2); | ||
| 411 | if (!arg3.IsSimpleReg(ABI_PARAM3)) | ||
| 412 | MOV(32, R(ABI_PARAM3), arg3); | ||
| 413 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 414 | if (distance >= 0x0000000080000000ULL | ||
| 415 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 416 | // Far call | ||
| 417 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 418 | CALLptr(R(RAX)); | ||
| 419 | } else { | ||
| 420 | CALL(func); | ||
| 421 | } | ||
| 422 | } | ||
| 423 | |||
| 424 | void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) { | ||
| 425 | MOV(64, R(ABI_PARAM1), ImmPtr(param1)); | ||
| 426 | MOV(64, R(ABI_PARAM2), ImmPtr(param2)); | ||
| 427 | MOV(32, R(ABI_PARAM3), Imm32(param3)); | ||
| 428 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 429 | if (distance >= 0x0000000080000000ULL | ||
| 430 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 431 | // Far call | ||
| 432 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 433 | CALLptr(R(RAX)); | ||
| 434 | } else { | ||
| 435 | CALL(func); | ||
| 436 | } | ||
| 437 | } | ||
| 438 | |||
| 439 | // Pass a register as a parameter. | ||
| 440 | void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) { | ||
| 441 | if (reg1 != ABI_PARAM1) | ||
| 442 | MOV(32, R(ABI_PARAM1), R(reg1)); | ||
| 443 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 444 | if (distance >= 0x0000000080000000ULL | ||
| 445 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 446 | // Far call | ||
| 447 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 448 | CALLptr(R(RAX)); | ||
| 449 | } else { | ||
| 450 | CALL(func); | ||
| 451 | } | ||
| 452 | } | ||
| 453 | |||
| 454 | // Pass two registers as parameters. | ||
| 455 | void XEmitter::ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2) { | ||
| 456 | if (reg2 != ABI_PARAM1) { | ||
| 457 | if (reg1 != ABI_PARAM1) | ||
| 458 | MOV(64, R(ABI_PARAM1), R(reg1)); | ||
| 459 | if (reg2 != ABI_PARAM2) | ||
| 460 | MOV(64, R(ABI_PARAM2), R(reg2)); | ||
| 461 | } else { | ||
| 462 | if (reg2 != ABI_PARAM2) | ||
| 463 | MOV(64, R(ABI_PARAM2), R(reg2)); | ||
| 464 | if (reg1 != ABI_PARAM1) | ||
| 465 | MOV(64, R(ABI_PARAM1), R(reg1)); | ||
| 466 | } | ||
| 467 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 468 | if (distance >= 0x0000000080000000ULL | ||
| 469 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 470 | // Far call | ||
| 471 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 472 | CALLptr(R(RAX)); | ||
| 473 | } else { | ||
| 474 | CALL(func); | ||
| 475 | } | ||
| 476 | } | ||
| 477 | |||
| 478 | void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2) | ||
| 479 | { | ||
| 480 | if (!arg1.IsSimpleReg(ABI_PARAM1)) | ||
| 481 | MOV(32, R(ABI_PARAM1), arg1); | ||
| 482 | MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||
| 483 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 484 | if (distance >= 0x0000000080000000ULL | ||
| 485 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 486 | // Far call | ||
| 487 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 488 | CALLptr(R(RAX)); | ||
| 489 | } else { | ||
| 490 | CALL(func); | ||
| 491 | } | ||
| 492 | } | ||
| 493 | |||
| 494 | void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3) | ||
| 495 | { | ||
| 496 | if (!arg1.IsSimpleReg(ABI_PARAM1)) | ||
| 497 | MOV(32, R(ABI_PARAM1), arg1); | ||
| 498 | MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||
| 499 | MOV(64, R(ABI_PARAM3), Imm64(param3)); | ||
| 500 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 501 | if (distance >= 0x0000000080000000ULL | ||
| 502 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 503 | // Far call | ||
| 504 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 505 | CALLptr(R(RAX)); | ||
| 506 | } else { | ||
| 507 | CALL(func); | ||
| 508 | } | ||
| 509 | } | ||
| 510 | |||
| 511 | void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1) | ||
| 512 | { | ||
| 513 | if (!arg1.IsSimpleReg(ABI_PARAM1)) | ||
| 514 | MOV(32, R(ABI_PARAM1), arg1); | ||
| 515 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 516 | if (distance >= 0x0000000080000000ULL | ||
| 517 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 518 | // Far call | ||
| 519 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 520 | CALLptr(R(RAX)); | ||
| 521 | } else { | ||
| 522 | CALL(func); | ||
| 523 | } | ||
| 524 | } | ||
| 525 | |||
| 526 | void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2) | ||
| 527 | { | ||
| 528 | if (!arg1.IsSimpleReg(ABI_PARAM1)) | ||
| 529 | MOV(32, R(ABI_PARAM1), arg1); | ||
| 530 | if (!arg2.IsSimpleReg(ABI_PARAM2)) | ||
| 531 | MOV(32, R(ABI_PARAM2), arg2); | ||
| 532 | u64 distance = u64(func) - (u64(code) + 5); | ||
| 533 | if (distance >= 0x0000000080000000ULL | ||
| 534 | && distance < 0xFFFFFFFF80000000ULL) { | ||
| 535 | // Far call | ||
| 536 | MOV(64, R(RAX), ImmPtr(func)); | ||
| 537 | CALLptr(R(RAX)); | ||
| 538 | } else { | ||
| 539 | CALL(func); | ||
| 540 | } | ||
| 541 | } | ||
| 542 | |||
| 543 | unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { | ||
| 544 | return frameSize; | ||
| 545 | } | ||
| 546 | |||
| 547 | #ifdef _WIN32 | ||
| 548 | |||
| 549 | // The Windows x64 ABI requires XMM6 - XMM15 to be callee saved. 10 regs. | ||
| 550 | // But, not saving XMM4 and XMM5 breaks things in VS 2010, even though they are volatile regs. | ||
| 551 | // Let's just save all 16. | ||
| 552 | const int XMM_STACK_SPACE = 16 * 16; | ||
| 553 | |||
| 554 | // Win64 Specific Code | ||
| 555 | void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { | ||
| 556 | //we only want to do this once | ||
| 557 | PUSH(RBX); | ||
| 558 | PUSH(RSI); | ||
| 559 | PUSH(RDI); | ||
| 560 | PUSH(RBP); | ||
| 561 | PUSH(R12); | ||
| 562 | PUSH(R13); | ||
| 563 | PUSH(R14); | ||
| 564 | PUSH(R15); | ||
| 565 | ABI_AlignStack(0); | ||
| 566 | |||
| 567 | // Do this after aligning, because before it's offset by 8. | ||
| 568 | SUB(64, R(RSP), Imm32(XMM_STACK_SPACE)); | ||
| 569 | for (int i = 0; i < 16; ++i) | ||
| 570 | MOVAPS(MDisp(RSP, i * 16), (X64Reg)(XMM0 + i)); | ||
| 571 | } | ||
| 572 | |||
| 573 | void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { | ||
| 574 | for (int i = 0; i < 16; ++i) | ||
| 575 | MOVAPS((X64Reg)(XMM0 + i), MDisp(RSP, i * 16)); | ||
| 576 | ADD(64, R(RSP), Imm32(XMM_STACK_SPACE)); | ||
| 577 | |||
| 578 | ABI_RestoreStack(0); | ||
| 579 | POP(R15); | ||
| 580 | POP(R14); | ||
| 581 | POP(R13); | ||
| 582 | POP(R12); | ||
| 583 | POP(RBP); | ||
| 584 | POP(RDI); | ||
| 585 | POP(RSI); | ||
| 586 | POP(RBX); | ||
| 587 | } | ||
| 588 | |||
| 589 | // Win64 Specific Code | ||
| 590 | void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { | ||
| 591 | PUSH(RCX); | ||
| 592 | PUSH(RDX); | ||
| 593 | PUSH(RSI); | ||
| 594 | PUSH(RDI); | ||
| 595 | PUSH(R8); | ||
| 596 | PUSH(R9); | ||
| 597 | PUSH(R10); | ||
| 598 | PUSH(R11); | ||
| 599 | // TODO: Callers preserve XMM4-5 (XMM0-3 are args.) | ||
| 600 | ABI_AlignStack(0); | ||
| 601 | } | ||
| 602 | |||
| 603 | void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { | ||
| 604 | ABI_RestoreStack(0); | ||
| 605 | POP(R11); | ||
| 606 | POP(R10); | ||
| 607 | POP(R9); | ||
| 608 | POP(R8); | ||
| 609 | POP(RDI); | ||
| 610 | POP(RSI); | ||
| 611 | POP(RDX); | ||
| 612 | POP(RCX); | ||
| 613 | } | ||
| 614 | |||
| 615 | void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { | ||
| 616 | SUB(64, R(RSP), Imm8(0x28)); | ||
| 617 | } | ||
| 618 | |||
| 619 | void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { | ||
| 620 | ADD(64, R(RSP), Imm8(0x28)); | ||
| 621 | } | ||
| 622 | |||
| 623 | #else | ||
| 624 | // Unix64 Specific Code | ||
| 625 | void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { | ||
| 626 | PUSH(RBX); | ||
| 627 | PUSH(RBP); | ||
| 628 | PUSH(R12); | ||
| 629 | PUSH(R13); | ||
| 630 | PUSH(R14); | ||
| 631 | PUSH(R15); | ||
| 632 | PUSH(R15); //just to align stack. duped push/pop doesn't hurt. | ||
| 633 | // TODO: XMM? | ||
| 634 | } | ||
| 635 | |||
| 636 | void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { | ||
| 637 | POP(R15); | ||
| 638 | POP(R15); | ||
| 639 | POP(R14); | ||
| 640 | POP(R13); | ||
| 641 | POP(R12); | ||
| 642 | POP(RBP); | ||
| 643 | POP(RBX); | ||
| 644 | } | ||
| 645 | |||
| 646 | void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { | ||
| 647 | PUSH(RCX); | ||
| 648 | PUSH(RDX); | ||
| 649 | PUSH(RSI); | ||
| 650 | PUSH(RDI); | ||
| 651 | PUSH(R8); | ||
| 652 | PUSH(R9); | ||
| 653 | PUSH(R10); | ||
| 654 | PUSH(R11); | ||
| 655 | PUSH(R11); | ||
| 656 | } | ||
| 657 | |||
| 658 | void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { | ||
| 659 | POP(R11); | ||
| 660 | POP(R11); | ||
| 661 | POP(R10); | ||
| 662 | POP(R9); | ||
| 663 | POP(R8); | ||
| 664 | POP(RDI); | ||
| 665 | POP(RSI); | ||
| 666 | POP(RDX); | ||
| 667 | POP(RCX); | ||
| 668 | } | ||
| 669 | |||
| 670 | void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { | ||
| 671 | SUB(64, R(RSP), Imm8(0x08)); | ||
| 672 | } | ||
| 673 | |||
| 674 | void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { | ||
| 675 | ADD(64, R(RSP), Imm8(0x08)); | ||
| 676 | } | ||
| 677 | |||
| 678 | #endif // WIN32 | ||
| 679 | |||
| 680 | #endif // 32bit | ||
diff --git a/src/common/x64/abi.h b/src/common/x64/abi.h new file mode 100644 index 000000000..7e9c156ae --- /dev/null +++ b/src/common/x64/abi.h | |||
| @@ -0,0 +1,78 @@ | |||
| 1 | // Copyright (C) 2003 Dolphin Project. | ||
| 2 | |||
| 3 | // This program is free software: you can redistribute it and/or modify | ||
| 4 | // it under the terms of the GNU General Public License as published by | ||
| 5 | // the Free Software Foundation, version 2.0 or later versions. | ||
| 6 | |||
| 7 | // This program is distributed in the hope that it will be useful, | ||
| 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 10 | // GNU General Public License 2.0 for more details. | ||
| 11 | |||
| 12 | // A copy of the GPL 2.0 should have been included with the program. | ||
| 13 | // If not, see http://www.gnu.org/licenses/ | ||
| 14 | |||
| 15 | // Official SVN repository and contact information can be found at | ||
| 16 | // http://code.google.com/p/dolphin-emu/ | ||
| 17 | |||
| 18 | #pragma once | ||
| 19 | |||
| 20 | #include "common/common_types.h" | ||
| 21 | |||
| 22 | // x86/x64 ABI:s, and helpers to help follow them when JIT-ing code. | ||
| 23 | // All convensions return values in EAX (+ possibly EDX). | ||
| 24 | |||
| 25 | // Linux 32-bit, Windows 32-bit (cdecl, System V): | ||
| 26 | // * Caller pushes left to right | ||
| 27 | // * Caller fixes stack after call | ||
| 28 | // * function subtract from stack for local storage only. | ||
| 29 | // Scratch: EAX ECX EDX | ||
| 30 | // Callee-save: EBX ESI EDI EBP | ||
| 31 | // Parameters: - | ||
| 32 | |||
| 33 | // Windows 64-bit | ||
| 34 | // * 4-reg "fastcall" variant, very new-skool stack handling | ||
| 35 | // * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself calls_ | ||
| 36 | // * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space. | ||
| 37 | // Scratch: RAX RCX RDX R8 R9 R10 R11 | ||
| 38 | // Callee-save: RBX RSI RDI RBP R12 R13 R14 R15 | ||
| 39 | // Parameters: RCX RDX R8 R9, further MOV-ed | ||
| 40 | |||
| 41 | // Linux 64-bit | ||
| 42 | // * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed) | ||
| 43 | // Scratch: RAX RCX RDX RSI RDI R8 R9 R10 R11 | ||
| 44 | // Callee-save: RBX RBP R12 R13 R14 R15 | ||
| 45 | // Parameters: RDI RSI RDX RCX R8 R9 | ||
| 46 | |||
| 47 | #ifdef _M_IX86 // 32 bit calling convention, shared by all | ||
| 48 | |||
| 49 | // 32-bit don't pass parameters in regs, but these are convenient to have anyway when we have to | ||
| 50 | // choose regs to put stuff in. | ||
| 51 | #define ABI_PARAM1 RCX | ||
| 52 | #define ABI_PARAM2 RDX | ||
| 53 | |||
| 54 | // There are no ABI_PARAM* here, since args are pushed. | ||
| 55 | // 32-bit bog standard cdecl, shared between linux and windows | ||
| 56 | // MacOSX 32-bit is same as System V with a few exceptions that we probably don't care much about. | ||
| 57 | |||
| 58 | #elif ARCHITECTURE_x86_64 // 64 bit calling convention | ||
| 59 | |||
| 60 | #ifdef _WIN32 // 64-bit Windows - the really exotic calling convention | ||
| 61 | |||
| 62 | #define ABI_PARAM1 RCX | ||
| 63 | #define ABI_PARAM2 RDX | ||
| 64 | #define ABI_PARAM3 R8 | ||
| 65 | #define ABI_PARAM4 R9 | ||
| 66 | |||
| 67 | #else //64-bit Unix (hopefully MacOSX too) | ||
| 68 | |||
| 69 | #define ABI_PARAM1 RDI | ||
| 70 | #define ABI_PARAM2 RSI | ||
| 71 | #define ABI_PARAM3 RDX | ||
| 72 | #define ABI_PARAM4 RCX | ||
| 73 | #define ABI_PARAM5 R8 | ||
| 74 | #define ABI_PARAM6 R9 | ||
| 75 | |||
| 76 | #endif // WIN32 | ||
| 77 | |||
| 78 | #endif // X86 | ||
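The ABI_PARAM macros above let JIT code pick argument registers without a per-platform #ifdef at every call site. A minimal usage sketch, assuming the Gen::XEmitter interface added elsewhere in this commit; EmitCallHelper and SomeHelper are hypothetical names used only for illustration:

    // Sketch only: marshal two arguments into the ABI-mandated registers
    // (RCX/RDX on Win64, RDI/RSI on SysV x86-64), then call a C helper.
    void SomeHelper(u32 value, void* ptr);  // hypothetical C helper

    static void EmitCallHelper(Gen::XEmitter& code, u32 arg0, void* arg1) {
        using namespace Gen;
        code.MOV(32, R(ABI_PARAM1), Imm32(arg0));
        code.MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(arg1)));
        code.ABI_AlignStack(0);    // keep RSP 16-byte aligned across the call
        code.CALL(reinterpret_cast<const void*>(&SomeHelper));
        code.ABI_RestoreStack(0);
    }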
diff --git a/src/common/x64/cpu_detect.cpp b/src/common/x64/cpu_detect.cpp new file mode 100644 index 000000000..d9c430c67 --- /dev/null +++ b/src/common/x64/cpu_detect.cpp | |||
| @@ -0,0 +1,187 @@ | |||
| 1 | // Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <cstring> | ||
| 6 | #include <string> | ||
| 7 | #include <thread> | ||
| 8 | |||
| 9 | #include "common/common_types.h" | ||
| 10 | |||
| 11 | #include "cpu_detect.h" | ||
| 12 | |||
| 13 | namespace Common { | ||
| 14 | |||
| 15 | #ifndef _MSC_VER | ||
| 16 | |||
| 17 | #ifdef __FreeBSD__ | ||
| 18 | #include <sys/types.h> | ||
| 19 | #include <machine/cpufunc.h> | ||
| 20 | #endif | ||
| 21 | |||
| 22 | static inline void __cpuidex(int info[4], int function_id, int subfunction_id) { | ||
| 23 | #ifdef __FreeBSD__ | ||
| 24 | // Despite the name, this is just do_cpuid() with ECX as second input. | ||
| 25 | cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info); | ||
| 26 | #else | ||
| 27 | info[0] = function_id; // eax | ||
| 28 | info[2] = subfunction_id; // ecx | ||
| 29 | __asm__( | ||
| 30 | "cpuid" | ||
| 31 | : "=a" (info[0]), | ||
| 32 | "=b" (info[1]), | ||
| 33 | "=c" (info[2]), | ||
| 34 | "=d" (info[3]) | ||
| 35 | : "a" (function_id), | ||
| 36 | "c" (subfunction_id) | ||
| 37 | ); | ||
| 38 | #endif | ||
| 39 | } | ||
| 40 | |||
| 41 | static inline void __cpuid(int info[4], int function_id) { | ||
| 42 | return __cpuidex(info, function_id, 0); | ||
| 43 | } | ||
| 44 | |||
| 45 | #define _XCR_XFEATURE_ENABLED_MASK 0 | ||
| 46 | static u64 _xgetbv(u32 index) { | ||
| 47 | u32 eax, edx; | ||
| 48 | __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); | ||
| 49 | return ((u64)edx << 32) | eax; | ||
| 50 | } | ||
| 51 | |||
| 52 | #endif // ifndef _MSC_VER | ||
| 53 | |||
| 54 | // Detects the various CPU features | ||
| 55 | static CPUCaps Detect() { | ||
| 56 | CPUCaps caps = {}; | ||
| 57 | |||
| 58 | caps.num_cores = std::thread::hardware_concurrency(); | ||
| 59 | |||
| 60 | // Assumes the CPU supports the CPUID instruction. Those that don't would likely not support | ||
| 61 | // Citra at all anyway | ||
| 62 | |||
| 63 | int cpu_id[4]; | ||
| 64 | memset(caps.brand_string, 0, sizeof(caps.brand_string)); | ||
| 65 | |||
| 66 | // Detect CPU's CPUID capabilities and grab CPU string | ||
| 67 | __cpuid(cpu_id, 0x00000000); | ||
| 68 | u32 max_std_fn = cpu_id[0]; // EAX | ||
| 69 | |||
| 70 | std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(int)); | ||
| 71 | std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(int)); | ||
| 72 | std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(int)); | ||
| 73 | |||
| 74 | __cpuid(cpu_id, 0x80000000); | ||
| 75 | |||
| 76 | u32 max_ex_fn = cpu_id[0]; | ||
| 77 | if (!strcmp(caps.brand_string, "GenuineIntel")) | ||
| 78 | caps.vendor = CPUVendor::INTEL; | ||
| 79 | else if (!strcmp(caps.brand_string, "AuthenticAMD")) | ||
| 80 | caps.vendor = CPUVendor::AMD; | ||
| 81 | else | ||
| 82 | caps.vendor = CPUVendor::OTHER; | ||
| 83 | |||
| 84 | // Default cpu_string to the vendor string in case the full model name is not available | ||
| 85 | strcpy(caps.cpu_string, caps.brand_string); | ||
| 86 | |||
| 87 | // Detect family and other miscellaneous features | ||
| 88 | if (max_std_fn >= 1) { | ||
| 89 | __cpuid(cpu_id, 0x00000001); | ||
| 90 | |||
| 91 | if ((cpu_id[3] >> 25) & 1) caps.sse = true; | ||
| 92 | if ((cpu_id[3] >> 26) & 1) caps.sse2 = true; | ||
| 93 | if ((cpu_id[2]) & 1) caps.sse3 = true; | ||
| 94 | if ((cpu_id[2] >> 9) & 1) caps.ssse3 = true; | ||
| 95 | if ((cpu_id[2] >> 19) & 1) caps.sse4_1 = true; | ||
| 96 | if ((cpu_id[2] >> 20) & 1) caps.sse4_2 = true; | ||
| 97 | if ((cpu_id[2] >> 22) & 1) caps.movbe = true; | ||
| 98 | if ((cpu_id[2] >> 25) & 1) caps.aes = true; | ||
| 99 | |||
| 100 | if ((cpu_id[3] >> 24) & 1) { | ||
| 101 | caps.fxsave_fxrstor = true; | ||
| 102 | } | ||
| 103 | |||
| 104 | // AVX support requires 3 separate checks: | ||
| 105 | // - Is the AVX bit set in CPUID? | ||
| 106 | // - Is the XSAVE bit set in CPUID? | ||
| 107 | // - XGETBV result has the XCR bit set. | ||
| 108 | if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1)) { | ||
| 109 | if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) { | ||
| 110 | caps.avx = true; | ||
| 111 | if ((cpu_id[2] >> 12) & 1) | ||
| 112 | caps.fma = true; | ||
| 113 | } | ||
| 114 | } | ||
| 115 | |||
| 116 | if (max_std_fn >= 7) { | ||
| 117 | __cpuidex(cpu_id, 0x00000007, 0x00000000); | ||
| 118 | // Can't enable AVX2 unless the XSAVE/XGETBV checks above passed | ||
| 119 | if ((cpu_id[1] >> 5) & 1) | ||
| 120 | caps.avx2 = caps.avx; | ||
| 121 | if ((cpu_id[1] >> 3) & 1) | ||
| 122 | caps.bmi1 = true; | ||
| 123 | if ((cpu_id[1] >> 8) & 1) | ||
| 124 | caps.bmi2 = true; | ||
| 125 | } | ||
| 126 | } | ||
| 127 | |||
| 128 | caps.flush_to_zero = caps.sse; | ||
| 129 | |||
| 130 | if (max_ex_fn >= 0x80000004) { | ||
| 131 | // Extract CPU model string | ||
| 132 | __cpuid(cpu_id, 0x80000002); | ||
| 133 | std::memcpy(caps.cpu_string, cpu_id, sizeof(cpu_id)); | ||
| 134 | __cpuid(cpu_id, 0x80000003); | ||
| 135 | std::memcpy(caps.cpu_string + 16, cpu_id, sizeof(cpu_id)); | ||
| 136 | __cpuid(cpu_id, 0x80000004); | ||
| 137 | std::memcpy(caps.cpu_string + 32, cpu_id, sizeof(cpu_id)); | ||
| 138 | } | ||
| 139 | |||
| 140 | if (max_ex_fn >= 0x80000001) { | ||
| 141 | // Check for more features | ||
| 142 | __cpuid(cpu_id, 0x80000001); | ||
| 143 | if (cpu_id[2] & 1) caps.lahf_sahf_64 = true; | ||
| 144 | if ((cpu_id[2] >> 5) & 1) caps.lzcnt = true; | ||
| 145 | if ((cpu_id[2] >> 16) & 1) caps.fma4 = true; | ||
| 146 | if ((cpu_id[3] >> 29) & 1) caps.long_mode = true; | ||
| 147 | } | ||
| 148 | |||
| 149 | return caps; | ||
| 150 | } | ||
| 151 | |||
| 152 | const CPUCaps& GetCPUCaps() { | ||
| 153 | static CPUCaps caps = Detect(); | ||
| 154 | return caps; | ||
| 155 | } | ||
| 156 | |||
| 157 | std::string GetCPUCapsString() { | ||
| 158 | auto caps = GetCPUCaps(); | ||
| 159 | |||
| 160 | std::string sum(caps.cpu_string); | ||
| 161 | sum += " ("; | ||
| 162 | sum += caps.brand_string; | ||
| 163 | sum += ")"; | ||
| 164 | |||
| 165 | if (caps.sse) sum += ", SSE"; | ||
| 166 | if (caps.sse2) { | ||
| 167 | sum += ", SSE2"; | ||
| 168 | if (!caps.flush_to_zero) sum += " (without DAZ)"; | ||
| 169 | } | ||
| 170 | |||
| 171 | if (caps.sse3) sum += ", SSE3"; | ||
| 172 | if (caps.ssse3) sum += ", SSSE3"; | ||
| 173 | if (caps.sse4_1) sum += ", SSE4.1"; | ||
| 174 | if (caps.sse4_2) sum += ", SSE4.2"; | ||
| 175 | if (caps.avx) sum += ", AVX"; | ||
| 176 | if (caps.avx2) sum += ", AVX2"; | ||
| 177 | if (caps.bmi1) sum += ", BMI1"; | ||
| 178 | if (caps.bmi2) sum += ", BMI2"; | ||
| 179 | if (caps.fma) sum += ", FMA"; | ||
| 180 | if (caps.aes) sum += ", AES"; | ||
| 181 | if (caps.movbe) sum += ", MOVBE"; | ||
| 182 | if (caps.long_mode) sum += ", 64-bit support"; | ||
| 183 | |||
| 184 | return sum; | ||
| 185 | } | ||
| 186 | |||
| 187 | } // namespace Common | ||
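The AVX branch in Detect() is the one place where a single CPUID bit is not enough: the OS must also have enabled XSAVE and be saving YMM state across context switches. A hedged restatement of that gate in isolation, reusing the __cpuid/_xgetbv wrappers defined above (variable names are illustrative):

    int leaf1[4];
    __cpuid(leaf1, 0x00000001);
    const bool cpu_has_avx   = ((leaf1[2] >> 28) & 1) != 0; // CPUID.1:ECX bit 28 (AVX)
    const bool os_uses_xsave = ((leaf1[2] >> 27) & 1) != 0; // CPUID.1:ECX bit 27 (OSXSAVE)
    // XCR0 bits 1 and 2 both set means the OS saves/restores SSE and AVX state.
    const bool avx_usable = cpu_has_avx && os_uses_xsave &&
                            (_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6;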
diff --git a/src/common/x64/cpu_detect.h b/src/common/x64/cpu_detect.h new file mode 100644 index 000000000..0af3a8adb --- /dev/null +++ b/src/common/x64/cpu_detect.h | |||
| @@ -0,0 +1,66 @@ | |||
| 1 | // Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <string> | ||
| 8 | |||
| 9 | namespace Common { | ||
| 10 | |||
| 11 | /// x86/x64 CPU vendors that may be detected by this module | ||
| 12 | enum class CPUVendor { | ||
| 13 | INTEL, | ||
| 14 | AMD, | ||
| 15 | OTHER, | ||
| 16 | }; | ||
| 17 | |||
| 18 | /// x86/x64 CPU capabilities that may be detected by this module | ||
| 19 | struct CPUCaps { | ||
| 20 | CPUVendor vendor; | ||
| 21 | char cpu_string[0x41]; // 48-byte model string from CPUID leaves 0x80000002-4, NUL-terminated | ||
| 22 | char brand_string[0x21]; // 12-byte vendor ID ("GenuineIntel"/"AuthenticAMD"), NUL-terminated | ||
| 23 | int num_cores; | ||
| 24 | bool sse; | ||
| 25 | bool sse2; | ||
| 26 | bool sse3; | ||
| 27 | bool ssse3; | ||
| 28 | bool sse4_1; | ||
| 29 | bool sse4_2; | ||
| 30 | bool lzcnt; | ||
| 31 | bool avx; | ||
| 32 | bool avx2; | ||
| 33 | bool bmi1; | ||
| 34 | bool bmi2; | ||
| 35 | bool fma; | ||
| 36 | bool fma4; | ||
| 37 | bool aes; | ||
| 38 | |||
| 39 | // Support for the FXSAVE and FXRSTOR instructions | ||
| 40 | bool fxsave_fxrstor; | ||
| 41 | |||
| 42 | bool movbe; | ||
| 43 | |||
| 44 | // This flag indicates that the hardware supports some mode in which denormal inputs and outputs | ||
| 45 | // are automatically set to (signed) zero. | ||
| 46 | bool flush_to_zero; | ||
| 47 | |||
| 48 | // Support for LAHF and SAHF instructions in 64-bit mode | ||
| 49 | bool lahf_sahf_64; | ||
| 50 | |||
| 51 | bool long_mode; | ||
| 52 | }; | ||
| 53 | |||
| 54 | /** | ||
| 55 | * Gets the supported capabilities of the host CPU | ||
| 56 | * @return Reference to a CPUCaps struct with the detected host CPU capabilities | ||
| 57 | */ | ||
| 58 | const CPUCaps& GetCPUCaps(); | ||
| 59 | |||
| 60 | /** | ||
| 61 | * Gets a string summary of the name and supported capabilities of the host CPU | ||
| 62 | * @return String summary | ||
| 63 | */ | ||
| 64 | std::string GetCPUCapsString(); | ||
| 65 | |||
| 66 | } // namespace Common | ||
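Callers are expected to query GetCPUCaps() once (the result is cached in a function-local static) and branch on the flags, logging GetCPUCapsString() at startup. A minimal usage sketch; LOG_INFO/Common are assumed to match Citra's logging macros, and the Emit* functions are hypothetical:

    const auto& caps = Common::GetCPUCaps();
    LOG_INFO(Common, "Host CPU: %s", Common::GetCPUCapsString().c_str());
    if (caps.sse4_1)
        EmitSse41Path();     // hypothetical vectorized path
    else
        EmitFallbackPath();  // hypothetical scalar fallback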
diff --git a/src/common/x64/emitter.cpp b/src/common/x64/emitter.cpp new file mode 100644 index 000000000..4b79acd1f --- /dev/null +++ b/src/common/x64/emitter.cpp | |||
| @@ -0,0 +1,1989 @@ | |||
| 1 | // Copyright (C) 2003 Dolphin Project. | ||
| 2 | |||
| 3 | // This program is free software: you can redistribute it and/or modify | ||
| 4 | // it under the terms of the GNU General Public License as published by | ||
| 5 | // the Free Software Foundation, version 2.0 or later versions. | ||
| 6 | |||
| 7 | // This program is distributed in the hope that it will be useful, | ||
| 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 10 | // GNU General Public License 2.0 for more details. | ||
| 11 | |||
| 12 | // A copy of the GPL 2.0 should have been included with the program. | ||
| 13 | // If not, see http://www.gnu.org/licenses/ | ||
| 14 | |||
| 15 | // Official SVN repository and contact information can be found at | ||
| 16 | // http://code.google.com/p/dolphin-emu/ | ||
| 17 | |||
| 18 | #include <cstring> | ||
| 19 | |||
| 20 | #include "common/assert.h" | ||
| 21 | #include "common/logging/log.h" | ||
| 22 | #include "common/memory_util.h" | ||
| 23 | |||
| 24 | #include "abi.h" | ||
| 25 | #include "cpu_detect.h" | ||
| 26 | #include "emitter.h" | ||
| 27 | |||
| 28 | #define PRIx64 "llx" | ||
| 29 | |||
| 30 | // Minimize the diff against Dolphin | ||
| 31 | #define DYNA_REC JIT | ||
| 32 | |||
| 33 | namespace Gen | ||
| 34 | { | ||
| 35 | |||
| 36 | struct NormalOpDef | ||
| 37 | { | ||
| 38 | u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext; | ||
| 39 | }; | ||
| 40 | |||
| 41 | // 0xCC is code for invalid combination of immediates | ||
| 42 | static const NormalOpDef normalops[11] = | ||
| 43 | { | ||
| 44 | {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0}, //ADD | ||
| 45 | {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2}, //ADC | ||
| 46 | |||
| 47 | {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5}, //SUB | ||
| 48 | {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3}, //SBB | ||
| 49 | |||
| 50 | {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4}, //AND | ||
| 51 | {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1}, //OR | ||
| 52 | |||
| 53 | {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6}, //XOR | ||
| 54 | {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0}, //MOV | ||
| 55 | |||
| 56 | {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0}, //TEST (to == from) | ||
| 57 | {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7}, //CMP | ||
| 58 | |||
| 59 | {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7}, //XCHG | ||
| 60 | }; | ||
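// Illustrative reading of the ADD row above (comment only, nothing is emitted here):
//   toRm8    = 0x00 (ADD r/m8, r8)      toRm32   = 0x01 (ADD r/m32, r32)
//   fromRm8  = 0x02 (ADD r8, r/m8)      fromRm32 = 0x03 (ADD r32, r/m32)
//   imm8     = 0x80 /0                  imm32    = 0x81 /0
//   simm8    = 0x83 /0 (sign-extended)  eaximm8  = 0x04 (ADD AL, imm8)
//   eaximm32 = 0x05 (ADD EAX, imm32)    ext      = 0 (the /0 ModRM reg-field extension)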
| 61 | |||
| 62 | enum NormalSSEOps | ||
| 63 | { | ||
| 64 | sseCMP = 0xC2, | ||
| 65 | sseADD = 0x58, //ADD | ||
| 66 | sseSUB = 0x5C, //SUB | ||
| 67 | sseAND = 0x54, //AND | ||
| 68 | sseANDN = 0x55, //ANDN | ||
| 69 | sseOR = 0x56, | ||
| 70 | sseXOR = 0x57, | ||
| 71 | sseMUL = 0x59, //MUL | ||
| 72 | sseDIV = 0x5E, //DIV | ||
| 73 | sseMIN = 0x5D, //MIN | ||
| 74 | sseMAX = 0x5F, //MAX | ||
| 75 | sseCOMIS = 0x2F, //COMIS | ||
| 76 | sseUCOMIS = 0x2E, //UCOMIS | ||
| 77 | sseSQRT = 0x51, //SQRT | ||
| 78 | sseRSQRT = 0x52, //RSQRT (NO DOUBLE PRECISION!!!) | ||
| 79 | sseRCP = 0x53, //RCP | ||
| 80 | sseMOVAPfromRM = 0x28, //MOVAP from RM | ||
| 81 | sseMOVAPtoRM = 0x29, //MOVAP to RM | ||
| 82 | sseMOVUPfromRM = 0x10, //MOVUP from RM | ||
| 83 | sseMOVUPtoRM = 0x11, //MOVUP to RM | ||
| 84 | sseMOVLPfromRM= 0x12, | ||
| 85 | sseMOVLPtoRM = 0x13, | ||
| 86 | sseMOVHPfromRM= 0x16, | ||
| 87 | sseMOVHPtoRM = 0x17, | ||
| 88 | sseMOVHLPS = 0x12, | ||
| 89 | sseMOVLHPS = 0x16, | ||
| 90 | sseMOVDQfromRM = 0x6F, | ||
| 91 | sseMOVDQtoRM = 0x7F, | ||
| 92 | sseMASKMOVDQU = 0xF7, | ||
| 93 | sseLDDQU = 0xF0, | ||
| 94 | sseSHUF = 0xC6, | ||
| 95 | sseMOVNTDQ = 0xE7, | ||
| 96 | sseMOVNTP = 0x2B, | ||
| 97 | sseHADD = 0x7C, | ||
| 98 | }; | ||
| 99 | |||
| 100 | |||
| 101 | void XEmitter::SetCodePtr(u8 *ptr) | ||
| 102 | { | ||
| 103 | code = ptr; | ||
| 104 | } | ||
| 105 | |||
| 106 | const u8 *XEmitter::GetCodePtr() const | ||
| 107 | { | ||
| 108 | return code; | ||
| 109 | } | ||
| 110 | |||
| 111 | u8 *XEmitter::GetWritableCodePtr() | ||
| 112 | { | ||
| 113 | return code; | ||
| 114 | } | ||
| 115 | |||
| 116 | void XEmitter::ReserveCodeSpace(int bytes) | ||
| 117 | { | ||
| 118 | for (int i = 0; i < bytes; i++) | ||
| 119 | *code++ = 0xCC; | ||
| 120 | } | ||
| 121 | |||
| 122 | const u8 *XEmitter::AlignCode4() | ||
| 123 | { | ||
| 124 | int c = int((u64)code & 3); | ||
| 125 | if (c) | ||
| 126 | ReserveCodeSpace(4-c); | ||
| 127 | return code; | ||
| 128 | } | ||
| 129 | |||
| 130 | const u8 *XEmitter::AlignCode16() | ||
| 131 | { | ||
| 132 | int c = int((u64)code & 15); | ||
| 133 | if (c) | ||
| 134 | ReserveCodeSpace(16-c); | ||
| 135 | return code; | ||
| 136 | } | ||
| 137 | |||
| 138 | const u8 *XEmitter::AlignCodePage() | ||
| 139 | { | ||
| 140 | int c = int((u64)code & 4095); | ||
| 141 | if (c) | ||
| 142 | ReserveCodeSpace(4096-c); | ||
| 143 | return code; | ||
| 144 | } | ||
| 145 | |||
| 146 | // This operation modifies flags; check whether the flags are currently locked. | ||
| 147 | // If the flags are locked, we should immediately and loudly fail before | ||
| 148 | // causing a subtle JIT bug. | ||
| 149 | void XEmitter::CheckFlags() | ||
| 150 | { | ||
| 151 | ASSERT_MSG(!flags_locked, "Attempt to modify flags while flags locked!"); | ||
| 152 | } | ||
| 153 | |||
| 154 | void XEmitter::WriteModRM(int mod, int reg, int rm) | ||
| 155 | { | ||
| 156 | Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7))); | ||
| 157 | } | ||
| 158 | |||
| 159 | void XEmitter::WriteSIB(int scale, int index, int base) | ||
| 160 | { | ||
| 161 | Write8((u8)((scale << 6) | ((index & 7) << 3) | (base & 7))); | ||
| 162 | } | ||
| 163 | |||
| 164 | void OpArg::WriteRex(XEmitter *emit, int opBits, int bits, int customOp) const | ||
| 165 | { | ||
| 166 | if (customOp == -1) customOp = operandReg; | ||
| 167 | #ifdef ARCHITECTURE_x86_64 | ||
| 168 | u8 op = 0x40; | ||
| 169 | // REX.W (whether operation is a 64-bit operation) | ||
| 170 | if (opBits == 64) op |= 8; | ||
| 171 | // REX.R (whether ModR/M reg field refers to R8-R15) | ||
| 172 | if (customOp & 8) op |= 4; | ||
| 173 | // REX.X (whether ModR/M SIB index field refers to R8-R15) | ||
| 174 | if (indexReg & 8) op |= 2; | ||
| 175 | // REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15) | ||
| 176 | if (offsetOrBaseReg & 8) op |= 1; | ||
| 177 | // Write REX if we have REX bits to write, or if the operation accesses | ||
| 178 | // SIL, DIL, BPL, or SPL. | ||
| 179 | if (op != 0x40 || | ||
| 180 | (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) || | ||
| 181 | (opBits == 8 && (customOp & 0x10c) == 4)) | ||
| 182 | { | ||
| 183 | emit->Write8(op); | ||
| 184 | // Check the operation doesn't access AH, BH, CH, or DH. | ||
| 185 | DEBUG_ASSERT((offsetOrBaseReg & 0x100) == 0); | ||
| 186 | DEBUG_ASSERT((customOp & 0x100) == 0); | ||
| 187 | } | ||
| 188 | #else | ||
| 189 | DEBUG_ASSERT(opBits != 64); | ||
| 190 | DEBUG_ASSERT((customOp & 8) == 0 || customOp == -1); | ||
| 191 | DEBUG_ASSERT((indexReg & 8) == 0); | ||
| 192 | DEBUG_ASSERT((offsetOrBaseReg & 8) == 0); | ||
| 193 | DEBUG_ASSERT(opBits != 8 || (customOp & 0x10c) != 4 || customOp == -1); | ||
| 194 | DEBUG_ASSERT(scale == SCALE_ATREG || bits != 8 || (offsetOrBaseReg & 0x10c) != 4); | ||
| 195 | #endif | ||
| 196 | } | ||
| 197 | |||
| 198 | void OpArg::WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W) const | ||
| 199 | { | ||
| 200 | int R = !(regOp1 & 8); | ||
| 201 | int X = !(indexReg & 8); | ||
| 202 | int B = !(offsetOrBaseReg & 8); | ||
| 203 | |||
| 204 | int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf); | ||
| 205 | |||
| 206 | // Use the compact two-byte VEX prefix when none of the fields that require the three-byte form are needed. | ||
| 207 | if (X == 1 && B == 1 && W == 0 && mmmmm == 1) | ||
| 208 | { | ||
| 209 | u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 1) | pp; | ||
| 210 | emit->Write8(0xC5); | ||
| 211 | emit->Write8(RvvvvLpp); | ||
| 212 | } | ||
| 213 | else | ||
| 214 | { | ||
| 215 | u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm; | ||
| 216 | u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 1) | pp; | ||
| 217 | emit->Write8(0xC4); | ||
| 218 | emit->Write8(RXBmmmmm); | ||
| 219 | emit->Write8(WvvvvLpp); | ||
| 220 | } | ||
| 221 | } | ||
| 222 | |||
| 223 | void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg, | ||
| 224 | bool warn_64bit_offset) const | ||
| 225 | { | ||
| 226 | if (_operandReg == INVALID_REG) | ||
| 227 | _operandReg = (X64Reg)this->operandReg; | ||
| 228 | int mod = 0; | ||
| 229 | int ireg = indexReg; | ||
| 230 | bool SIB = false; | ||
| 231 | int _offsetOrBaseReg = this->offsetOrBaseReg; | ||
| 232 | |||
| 233 | if (scale == SCALE_RIP) //Also, on 32-bit, just an immediate address | ||
| 234 | { | ||
| 235 | // Oh, RIP addressing. | ||
| 236 | _offsetOrBaseReg = 5; | ||
| 237 | emit->WriteModRM(0, _operandReg, _offsetOrBaseReg); | ||
| 238 | //TODO : add some checks | ||
| 239 | #ifdef ARCHITECTURE_x86_64 | ||
| 240 | u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes; | ||
| 241 | s64 distance = (s64)offset - (s64)ripAddr; | ||
| 242 | ASSERT_MSG( | ||
| 243 | (distance < 0x80000000LL && | ||
| 244 | distance >= -0x80000000LL) || | ||
| 245 | !warn_64bit_offset, | ||
| 246 | "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")", | ||
| 247 | ripAddr, offset); | ||
| 248 | s32 offs = (s32)distance; | ||
| 249 | emit->Write32((u32)offs); | ||
| 250 | #else | ||
| 251 | emit->Write32((u32)offset); | ||
| 252 | #endif | ||
| 253 | return; | ||
| 254 | } | ||
| 255 | |||
| 256 | if (scale == 0) | ||
| 257 | { | ||
| 258 | // Oh, no memory, Just a reg. | ||
| 259 | mod = 3; //11 | ||
| 260 | } | ||
| 261 | else if (scale >= 1) | ||
| 262 | { | ||
| 263 | //Ah good, no scaling. | ||
| 264 | if (scale == SCALE_ATREG && !((_offsetOrBaseReg & 7) == 4 || (_offsetOrBaseReg & 7) == 5)) | ||
| 265 | { | ||
| 266 | //Okay, we're good. No SIB necessary. | ||
| 267 | int ioff = (int)offset; | ||
| 268 | if (ioff == 0) | ||
| 269 | { | ||
| 270 | mod = 0; | ||
| 271 | } | ||
| 272 | else if (ioff<-128 || ioff>127) | ||
| 273 | { | ||
| 274 | mod = 2; //32-bit displacement | ||
| 275 | } | ||
| 276 | else | ||
| 277 | { | ||
| 278 | mod = 1; //8-bit displacement | ||
| 279 | } | ||
| 280 | } | ||
| 281 | else if (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8) | ||
| 282 | { | ||
| 283 | SIB = true; | ||
| 284 | mod = 0; | ||
| 285 | _offsetOrBaseReg = 5; | ||
| 286 | } | ||
| 287 | else //if (scale != SCALE_ATREG) | ||
| 288 | { | ||
| 289 | if ((_offsetOrBaseReg & 7) == 4) //this would occupy the SIB encoding :( | ||
| 290 | { | ||
| 291 | //So we have to fake it with SIB encoding :( | ||
| 292 | SIB = true; | ||
| 293 | } | ||
| 294 | |||
| 295 | if (scale >= SCALE_1 && scale < SCALE_ATREG) | ||
| 296 | { | ||
| 297 | SIB = true; | ||
| 298 | } | ||
| 299 | |||
| 300 | if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4)) | ||
| 301 | { | ||
| 302 | SIB = true; | ||
| 303 | ireg = _offsetOrBaseReg; | ||
| 304 | } | ||
| 305 | |||
| 306 | //Okay, we're fine. Just disp encoding. | ||
| 307 | //We need displacement. Which size? | ||
| 308 | int ioff = (int)(s64)offset; | ||
| 309 | if (ioff < -128 || ioff > 127) | ||
| 310 | { | ||
| 311 | mod = 2; //32-bit displacement | ||
| 312 | } | ||
| 313 | else | ||
| 314 | { | ||
| 315 | mod = 1; //8-bit displacement | ||
| 316 | } | ||
| 317 | } | ||
| 318 | } | ||
| 319 | |||
| 320 | // Okay. Time to do the actual writing | ||
| 321 | // ModRM byte: | ||
| 322 | int oreg = _offsetOrBaseReg; | ||
| 323 | if (SIB) | ||
| 324 | oreg = 4; | ||
| 325 | |||
| 326 | // TODO(ector): WTF is this if about? I don't remember writing it :-) | ||
| 327 | //if (RIP) | ||
| 328 | // oreg = 5; | ||
| 329 | |||
| 330 | emit->WriteModRM(mod, _operandReg&7, oreg&7); | ||
| 331 | |||
| 332 | if (SIB) | ||
| 333 | { | ||
| 334 | //SIB byte | ||
| 335 | int ss; | ||
| 336 | switch (scale) | ||
| 337 | { | ||
| 338 | case SCALE_NONE: _offsetOrBaseReg = 4; ss = 0; break; //RSP | ||
| 339 | case SCALE_1: ss = 0; break; | ||
| 340 | case SCALE_2: ss = 1; break; | ||
| 341 | case SCALE_4: ss = 2; break; | ||
| 342 | case SCALE_8: ss = 3; break; | ||
| 343 | case SCALE_NOBASE_2: ss = 1; break; | ||
| 344 | case SCALE_NOBASE_4: ss = 2; break; | ||
| 345 | case SCALE_NOBASE_8: ss = 3; break; | ||
| 346 | case SCALE_ATREG: ss = 0; break; | ||
| 347 | default: ASSERT_MSG(0, "Invalid scale for SIB byte"); ss = 0; break; | ||
| 348 | } | ||
| 349 | emit->Write8((u8)((ss << 6) | ((ireg&7)<<3) | (_offsetOrBaseReg&7))); | ||
| 350 | } | ||
| 351 | |||
| 352 | if (mod == 1) //8-bit disp | ||
| 353 | { | ||
| 354 | emit->Write8((u8)(s8)(s32)offset); | ||
| 355 | } | ||
| 356 | else if (mod == 2 || (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)) //32-bit disp | ||
| 357 | { | ||
| 358 | emit->Write32((u32)offset); | ||
| 359 | } | ||
| 360 | } | ||
| 361 | |||
| 362 | // W = operand extended width (1 if 64-bit) | ||
| 363 | // R = register# upper bit | ||
| 364 | // X = SIB index register# upper bit | ||
| 365 | // B = base register# upper bit | ||
| 366 | void XEmitter::Rex(int w, int r, int x, int b) | ||
| 367 | { | ||
| 368 | w = w ? 1 : 0; | ||
| 369 | r = r ? 1 : 0; | ||
| 370 | x = x ? 1 : 0; | ||
| 371 | b = b ? 1 : 0; | ||
| 372 | u8 rx = (u8)(0x40 | (w << 3) | (r << 2) | (x << 1) | (b)); | ||
| 373 | if (rx != 0x40) | ||
| 374 | Write8(rx); | ||
| 375 | } | ||
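// Worked example of the prefix built above (illustration only): ADD R8, RAX needs
// W=1 (64-bit operand) and B=1 (the r/m register R8 is in the upper bank), so
// Rex(1, 0, 0, 1) emits 0x40 | (1 << 3) | 1 = 0x49, and the whole instruction
// encodes as 49 01 C0 (REX.W+B, opcode 0x01 = ADD r/m64, r64, ModRM 0xC0).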
| 376 | |||
| 377 | void XEmitter::JMP(const u8 *addr, bool force5Bytes) | ||
| 378 | { | ||
| 379 | u64 fn = (u64)addr; | ||
| 380 | if (!force5Bytes) | ||
| 381 | { | ||
| 382 | s64 distance = (s64)(fn - ((u64)code + 2)); | ||
| 383 | ASSERT_MSG(distance >= -0x80 && distance < 0x80, | ||
| 384 | "Jump target too far away, needs force5Bytes = true"); | ||
| 385 | //8 bits will do | ||
| 386 | Write8(0xEB); | ||
| 387 | Write8((u8)(s8)distance); | ||
| 388 | } | ||
| 389 | else | ||
| 390 | { | ||
| 391 | s64 distance = (s64)(fn - ((u64)code + 5)); | ||
| 392 | |||
| 393 | ASSERT_MSG( | ||
| 394 | distance >= -0x80000000LL && distance < 0x80000000LL, | ||
| 395 | "Jump target too far away, needs indirect register"); | ||
| 396 | Write8(0xE9); | ||
| 397 | Write32((u32)(s32)distance); | ||
| 398 | } | ||
| 399 | } | ||
| 400 | |||
| 401 | void XEmitter::JMPptr(const OpArg &arg2) | ||
| 402 | { | ||
| 403 | OpArg arg = arg2; | ||
| 404 | if (arg.IsImm()) ASSERT_MSG(0, "JMPptr - Imm argument"); | ||
| 405 | arg.operandReg = 4; | ||
| 406 | arg.WriteRex(this, 0, 0); | ||
| 407 | Write8(0xFF); | ||
| 408 | arg.WriteRest(this); | ||
| 409 | } | ||
| 410 | |||
| 411 | // Can be used to trap other processors before overwriting their code; | ||
| 412 | // not used in Dolphin. | ||
| 413 | void XEmitter::JMPself() | ||
| 414 | { | ||
| 415 | Write8(0xEB); | ||
| 416 | Write8(0xFE); | ||
| 417 | } | ||
| 418 | |||
| 419 | void XEmitter::CALLptr(OpArg arg) | ||
| 420 | { | ||
| 421 | if (arg.IsImm()) ASSERT_MSG(0, "CALLptr - Imm argument"); | ||
| 422 | arg.operandReg = 2; | ||
| 423 | arg.WriteRex(this, 0, 0); | ||
| 424 | Write8(0xFF); | ||
| 425 | arg.WriteRest(this); | ||
| 426 | } | ||
| 427 | |||
| 428 | void XEmitter::CALL(const void *fnptr) | ||
| 429 | { | ||
| 430 | u64 distance = u64(fnptr) - (u64(code) + 5); | ||
| 431 | ASSERT_MSG( | ||
| 432 | distance < 0x0000000080000000ULL || | ||
| 433 | distance >= 0xFFFFFFFF80000000ULL, | ||
| 434 | "CALL out of range (%p calls %p)", code, fnptr); | ||
| 435 | Write8(0xE8); | ||
| 436 | Write32(u32(distance)); | ||
| 437 | } | ||
| 438 | |||
| 439 | FixupBranch XEmitter::J(bool force5bytes) | ||
| 440 | { | ||
| 441 | FixupBranch branch; | ||
| 442 | branch.type = force5bytes ? 1 : 0; | ||
| 443 | branch.ptr = code + (force5bytes ? 5 : 2); | ||
| 444 | if (!force5bytes) | ||
| 445 | { | ||
| 446 | //8 bits will do | ||
| 447 | Write8(0xEB); | ||
| 448 | Write8(0); | ||
| 449 | } | ||
| 450 | else | ||
| 451 | { | ||
| 452 | Write8(0xE9); | ||
| 453 | Write32(0); | ||
| 454 | } | ||
| 455 | return branch; | ||
| 456 | } | ||
| 457 | |||
| 458 | FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes) | ||
| 459 | { | ||
| 460 | FixupBranch branch; | ||
| 461 | branch.type = force5bytes ? 1 : 0; | ||
| 462 | branch.ptr = code + (force5bytes ? 6 : 2); | ||
| 463 | if (!force5bytes) | ||
| 464 | { | ||
| 465 | //8 bits will do | ||
| 466 | Write8(0x70 + conditionCode); | ||
| 467 | Write8(0); | ||
| 468 | } | ||
| 469 | else | ||
| 470 | { | ||
| 471 | Write8(0x0F); | ||
| 472 | Write8(0x80 + conditionCode); | ||
| 473 | Write32(0); | ||
| 474 | } | ||
| 475 | return branch; | ||
| 476 | } | ||
| 477 | |||
| 478 | void XEmitter::J_CC(CCFlags conditionCode, const u8* addr, bool force5bytes) | ||
| 479 | { | ||
| 480 | u64 fn = (u64)addr; | ||
| 481 | s64 distance = (s64)(fn - ((u64)code + 2)); | ||
| 482 | if (distance < -0x80 || distance >= 0x80 || force5bytes) | ||
| 483 | { | ||
| 484 | distance = (s64)(fn - ((u64)code + 6)); | ||
| 485 | ASSERT_MSG( | ||
| 486 | distance >= -0x80000000LL && distance < 0x80000000LL, | ||
| 487 | "Jump target too far away, needs indirect register"); | ||
| 488 | Write8(0x0F); | ||
| 489 | Write8(0x80 + conditionCode); | ||
| 490 | Write32((u32)(s32)distance); | ||
| 491 | } | ||
| 492 | else | ||
| 493 | { | ||
| 494 | Write8(0x70 + conditionCode); | ||
| 495 | Write8((u8)(s8)distance); | ||
| 496 | } | ||
| 497 | } | ||
| 498 | |||
| 499 | void XEmitter::SetJumpTarget(const FixupBranch &branch) | ||
| 500 | { | ||
| 501 | if (branch.type == 0) | ||
| 502 | { | ||
| 503 | s64 distance = (s64)(code - branch.ptr); | ||
| 504 | ASSERT_MSG(distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true"); | ||
| 505 | branch.ptr[-1] = (u8)(s8)distance; | ||
| 506 | } | ||
| 507 | else if (branch.type == 1) | ||
| 508 | { | ||
| 509 | s64 distance = (s64)(code - branch.ptr); | ||
| 510 | ASSERT_MSG(distance >= -0x80000000LL && distance < 0x80000000LL, "Jump target too far away, needs indirect register"); | ||
| 511 | ((s32*)branch.ptr)[-1] = (s32)distance; | ||
| 512 | } | ||
| 513 | } | ||
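// Typical forward-branch pattern with the fixup API above (sketch only; `code` is an
// XEmitter instance and CC_Z names the zero-flag condition in the accompanying emitter.h):
//   FixupBranch skip = code.J_CC(CC_Z);   // emits "jz <placeholder>"
//   // ... emit the body that should run when ZF is clear ...
//   code.SetJumpTarget(skip);             // patches the placeholder to land here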
| 514 | |||
| 515 | // INC/DEC considered harmful on newer CPUs because they only partially update the flags. | ||
| 516 | // Use ADD, SUB instead. | ||
| 517 | |||
| 518 | /* | ||
| 519 | void XEmitter::INC(int bits, OpArg arg) | ||
| 520 | { | ||
| 521 | if (arg.IsImm()) ASSERT_MSG(0, "INC - Imm argument"); | ||
| 522 | arg.operandReg = 0; | ||
| 523 | if (bits == 16) {Write8(0x66);} | ||
| 524 | arg.WriteRex(this, bits, bits); | ||
| 525 | Write8(bits == 8 ? 0xFE : 0xFF); | ||
| 526 | arg.WriteRest(this); | ||
| 527 | } | ||
| 528 | void XEmitter::DEC(int bits, OpArg arg) | ||
| 529 | { | ||
| 530 | if (arg.IsImm()) ASSERT_MSG(0, "DEC - Imm argument"); | ||
| 531 | arg.operandReg = 1; | ||
| 532 | if (bits == 16) {Write8(0x66);} | ||
| 533 | arg.WriteRex(this, bits, bits); | ||
| 534 | Write8(bits == 8 ? 0xFE : 0xFF); | ||
| 535 | arg.WriteRest(this); | ||
| 536 | } | ||
| 537 | */ | ||
| 538 | |||
| 539 | //Single byte opcodes | ||
| 540 | //There is no PUSHAD/POPAD in 64-bit mode. | ||
| 541 | void XEmitter::INT3() {Write8(0xCC);} | ||
| 542 | void XEmitter::RET() {Write8(0xC3);} | ||
| 543 | void XEmitter::RET_FAST() {Write8(0xF3); Write8(0xC3);} //two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to a ret | ||
| 544 | |||
| 545 | // The first sign of decadence: optimized NOPs. | ||
| 546 | void XEmitter::NOP(size_t size) | ||
| 547 | { | ||
| 548 | DEBUG_ASSERT((int)size > 0); | ||
| 549 | while (true) | ||
| 550 | { | ||
| 551 | switch (size) | ||
| 552 | { | ||
| 553 | case 0: | ||
| 554 | return; | ||
| 555 | case 1: | ||
| 556 | Write8(0x90); | ||
| 557 | return; | ||
| 558 | case 2: | ||
| 559 | Write8(0x66); Write8(0x90); | ||
| 560 | return; | ||
| 561 | case 3: | ||
| 562 | Write8(0x0F); Write8(0x1F); Write8(0x00); | ||
| 563 | return; | ||
| 564 | case 4: | ||
| 565 | Write8(0x0F); Write8(0x1F); Write8(0x40); Write8(0x00); | ||
| 566 | return; | ||
| 567 | case 5: | ||
| 568 | Write8(0x0F); Write8(0x1F); Write8(0x44); Write8(0x00); | ||
| 569 | Write8(0x00); | ||
| 570 | return; | ||
| 571 | case 6: | ||
| 572 | Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x44); | ||
| 573 | Write8(0x00); Write8(0x00); | ||
| 574 | return; | ||
| 575 | case 7: | ||
| 576 | Write8(0x0F); Write8(0x1F); Write8(0x80); Write8(0x00); | ||
| 577 | Write8(0x00); Write8(0x00); Write8(0x00); | ||
| 578 | return; | ||
| 579 | case 8: | ||
| 580 | Write8(0x0F); Write8(0x1F); Write8(0x84); Write8(0x00); | ||
| 581 | Write8(0x00); Write8(0x00); Write8(0x00); Write8(0x00); | ||
| 582 | return; | ||
| 583 | case 9: | ||
| 584 | Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x84); | ||
| 585 | Write8(0x00); Write8(0x00); Write8(0x00); Write8(0x00); | ||
| 586 | Write8(0x00); | ||
| 587 | return; | ||
| 588 | case 10: | ||
| 589 | Write8(0x66); Write8(0x66); Write8(0x0F); Write8(0x1F); | ||
| 590 | Write8(0x84); Write8(0x00); Write8(0x00); Write8(0x00); | ||
| 591 | Write8(0x00); Write8(0x00); | ||
| 592 | return; | ||
| 593 | default: | ||
| 594 | // Even though x86 instructions are allowed to be up to 15 bytes long, | ||
| 595 | // AMD advises against using NOPs longer than 11 bytes because they | ||
| 596 | // carry a performance penalty on CPUs older than AMD family 16h. | ||
| 597 | Write8(0x66); Write8(0x66); Write8(0x66); Write8(0x0F); | ||
| 598 | Write8(0x1F); Write8(0x84); Write8(0x00); Write8(0x00); | ||
| 599 | Write8(0x00); Write8(0x00); Write8(0x00); | ||
| 600 | size -= 11; | ||
| 601 | continue; | ||
| 602 | } | ||
| 603 | } | ||
| 604 | } | ||
| 605 | |||
| 606 | void XEmitter::PAUSE() {Write8(0xF3); NOP();} //use in tight spinloops for energy saving on some CPUs | ||
| 607 | void XEmitter::CLC() {CheckFlags(); Write8(0xF8);} //clear carry | ||
| 608 | void XEmitter::CMC() {CheckFlags(); Write8(0xF5);} //flip carry | ||
| 609 | void XEmitter::STC() {CheckFlags(); Write8(0xF9);} //set carry | ||
| 610 | |||
| 611 | //TODO: xchg ah, al ??? | ||
| 612 | void XEmitter::XCHG_AHAL() | ||
| 613 | { | ||
| 614 | Write8(0x86); | ||
| 615 | Write8(0xe0); | ||
| 616 | // alt. 86 c4 | ||
| 617 | } | ||
| 618 | |||
| 619 | //These two cannot be executed on early Intel 64-bit CPUs, only on AMD! | ||
| 620 | void XEmitter::LAHF() {Write8(0x9F);} | ||
| 621 | void XEmitter::SAHF() {CheckFlags(); Write8(0x9E);} | ||
| 622 | |||
| 623 | void XEmitter::PUSHF() {Write8(0x9C);} | ||
| 624 | void XEmitter::POPF() {CheckFlags(); Write8(0x9D);} | ||
| 625 | |||
| 626 | void XEmitter::LFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xE8);} | ||
| 627 | void XEmitter::MFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF0);} | ||
| 628 | void XEmitter::SFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF8);} | ||
| 629 | |||
| 630 | void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg) | ||
| 631 | { | ||
| 632 | if (bits == 16) | ||
| 633 | Write8(0x66); | ||
| 634 | Rex(bits == 64, 0, 0, (int)reg >> 3); | ||
| 635 | Write8(byte + ((int)reg & 7)); | ||
| 636 | } | ||
| 637 | |||
| 638 | void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg) | ||
| 639 | { | ||
| 640 | if (bits == 16) | ||
| 641 | Write8(0x66); | ||
| 642 | Rex(bits==64, 0, 0, (int)reg >> 3); | ||
| 643 | Write8(byte1); | ||
| 644 | Write8(byte2 + ((int)reg & 7)); | ||
| 645 | } | ||
| 646 | |||
| 647 | void XEmitter::CWD(int bits) | ||
| 648 | { | ||
| 649 | if (bits == 16) | ||
| 650 | Write8(0x66); | ||
| 651 | Rex(bits == 64, 0, 0, 0); | ||
| 652 | Write8(0x99); | ||
| 653 | } | ||
| 654 | |||
| 655 | void XEmitter::CBW(int bits) | ||
| 656 | { | ||
| 657 | if (bits == 8) | ||
| 658 | Write8(0x66); | ||
| 659 | Rex(bits == 32, 0, 0, 0); | ||
| 660 | Write8(0x98); | ||
| 661 | } | ||
| 662 | |||
| 663 | //Simple opcodes | ||
| 664 | |||
| 665 | |||
| 666 | //push/pop default to 64-bit operands, so they do not need a REX.W prefix | ||
| 667 | void XEmitter::PUSH(X64Reg reg) {WriteSimple1Byte(32, 0x50, reg);} | ||
| 668 | void XEmitter::POP(X64Reg reg) {WriteSimple1Byte(32, 0x58, reg);} | ||
| 669 | |||
| 670 | void XEmitter::PUSH(int bits, const OpArg ®) | ||
| 671 | { | ||
| 672 | if (reg.IsSimpleReg()) | ||
| 673 | PUSH(reg.GetSimpleReg()); | ||
| 674 | else if (reg.IsImm()) | ||
| 675 | { | ||
| 676 | switch (reg.GetImmBits()) | ||
| 677 | { | ||
| 678 | case 8: | ||
| 679 | Write8(0x6A); | ||
| 680 | Write8((u8)(s8)reg.offset); | ||
| 681 | break; | ||
| 682 | case 16: | ||
| 683 | Write8(0x66); | ||
| 684 | Write8(0x68); | ||
| 685 | Write16((u16)(s16)(s32)reg.offset); | ||
| 686 | break; | ||
| 687 | case 32: | ||
| 688 | Write8(0x68); | ||
| 689 | Write32((u32)reg.offset); | ||
| 690 | break; | ||
| 691 | default: | ||
| 692 | ASSERT_MSG(0, "PUSH - Bad imm bits"); | ||
| 693 | break; | ||
| 694 | } | ||
| 695 | } | ||
| 696 | else | ||
| 697 | { | ||
| 698 | if (bits == 16) | ||
| 699 | Write8(0x66); | ||
| 700 | reg.WriteRex(this, bits, bits); | ||
| 701 | Write8(0xFF); | ||
| 702 | reg.WriteRest(this, 0, (X64Reg)6); | ||
| 703 | } | ||
| 704 | } | ||
| 705 | |||
| 706 | void XEmitter::POP(int /*bits*/, const OpArg ®) | ||
| 707 | { | ||
| 708 | if (reg.IsSimpleReg()) | ||
| 709 | POP(reg.GetSimpleReg()); | ||
| 710 | else | ||
| 711 | ASSERT_MSG(0, "POP - Unsupported encoding"); | ||
| 712 | } | ||
| 713 | |||
| 714 | void XEmitter::BSWAP(int bits, X64Reg reg) | ||
| 715 | { | ||
| 716 | if (bits >= 32) | ||
| 717 | { | ||
| 718 | WriteSimple2Byte(bits, 0x0F, 0xC8, reg); | ||
| 719 | } | ||
| 720 | else if (bits == 16) | ||
| 721 | { | ||
| 722 | ROL(16, R(reg), Imm8(8)); | ||
| 723 | } | ||
| 724 | else if (bits == 8) | ||
| 725 | { | ||
| 726 | // Do nothing - can't bswap a single byte... | ||
| 727 | } | ||
| 728 | else | ||
| 729 | { | ||
| 730 | ASSERT_MSG(0, "BSWAP - Wrong number of bits"); | ||
| 731 | } | ||
| 732 | } | ||
| 733 | |||
| 734 | // Undefined opcode - reserved | ||
| 735 | // If we ever need a way to always cause a non-breakpoint hard exception... | ||
| 736 | void XEmitter::UD2() | ||
| 737 | { | ||
| 738 | Write8(0x0F); | ||
| 739 | Write8(0x0B); | ||
| 740 | } | ||
| 741 | |||
| 742 | void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg) | ||
| 743 | { | ||
| 744 | ASSERT_MSG(!arg.IsImm(), "PREFETCH - Imm argument"); | ||
| 745 | arg.operandReg = (u8)level; | ||
| 746 | arg.WriteRex(this, 0, 0); | ||
| 747 | Write8(0x0F); | ||
| 748 | Write8(0x18); | ||
| 749 | arg.WriteRest(this); | ||
| 750 | } | ||
| 751 | |||
| 752 | void XEmitter::SETcc(CCFlags flag, OpArg dest) | ||
| 753 | { | ||
| 754 | ASSERT_MSG(!dest.IsImm(), "SETcc - Imm argument"); | ||
| 755 | dest.operandReg = 0; | ||
| 756 | dest.WriteRex(this, 0, 8); | ||
| 757 | Write8(0x0F); | ||
| 758 | Write8(0x90 + (u8)flag); | ||
| 759 | dest.WriteRest(this); | ||
| 760 | } | ||
| 761 | |||
| 762 | void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag) | ||
| 763 | { | ||
| 764 | ASSERT_MSG(!src.IsImm(), "CMOVcc - Imm argument"); | ||
| 765 | ASSERT_MSG(bits != 8, "CMOVcc - 8 bits unsupported"); | ||
| 766 | if (bits == 16) | ||
| 767 | Write8(0x66); | ||
| 768 | src.operandReg = dest; | ||
| 769 | src.WriteRex(this, bits, bits); | ||
| 770 | Write8(0x0F); | ||
| 771 | Write8(0x40 + (u8)flag); | ||
| 772 | src.WriteRest(this); | ||
| 773 | } | ||
| 774 | |||
| 775 | void XEmitter::WriteMulDivType(int bits, OpArg src, int ext) | ||
| 776 | { | ||
| 777 | ASSERT_MSG(!src.IsImm(), "WriteMulDivType - Imm argument"); | ||
| 778 | CheckFlags(); | ||
| 779 | src.operandReg = ext; | ||
| 780 | if (bits == 16) | ||
| 781 | Write8(0x66); | ||
| 782 | src.WriteRex(this, bits, bits, 0); | ||
| 783 | if (bits == 8) | ||
| 784 | { | ||
| 785 | Write8(0xF6); | ||
| 786 | } | ||
| 787 | else | ||
| 788 | { | ||
| 789 | Write8(0xF7); | ||
| 790 | } | ||
| 791 | src.WriteRest(this); | ||
| 792 | } | ||
| 793 | |||
| 794 | void XEmitter::MUL(int bits, OpArg src) {WriteMulDivType(bits, src, 4);} | ||
| 795 | void XEmitter::DIV(int bits, OpArg src) {WriteMulDivType(bits, src, 6);} | ||
| 796 | void XEmitter::IMUL(int bits, OpArg src) {WriteMulDivType(bits, src, 5);} | ||
| 797 | void XEmitter::IDIV(int bits, OpArg src) {WriteMulDivType(bits, src, 7);} | ||
| 798 | void XEmitter::NEG(int bits, OpArg src) {WriteMulDivType(bits, src, 3);} | ||
| 799 | void XEmitter::NOT(int bits, OpArg src) {WriteMulDivType(bits, src, 2);} | ||
| 800 | |||
| 801 | void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep) | ||
| 802 | { | ||
| 803 | ASSERT_MSG(!src.IsImm(), "WriteBitSearchType - Imm argument"); | ||
| 804 | CheckFlags(); | ||
| 805 | src.operandReg = (u8)dest; | ||
| 806 | if (bits == 16) | ||
| 807 | Write8(0x66); | ||
| 808 | if (rep) | ||
| 809 | Write8(0xF3); | ||
| 810 | src.WriteRex(this, bits, bits); | ||
| 811 | Write8(0x0F); | ||
| 812 | Write8(byte2); | ||
| 813 | src.WriteRest(this); | ||
| 814 | } | ||
| 815 | |||
| 816 | void XEmitter::MOVNTI(int bits, OpArg dest, X64Reg src) | ||
| 817 | { | ||
| 818 | if (bits <= 16) | ||
| 819 | ASSERT_MSG(0, "MOVNTI - bits<=16"); | ||
| 820 | WriteBitSearchType(bits, src, dest, 0xC3); | ||
| 821 | } | ||
| 822 | |||
| 823 | void XEmitter::BSF(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBC);} //bottom bit to top bit | ||
| 824 | void XEmitter::BSR(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBD);} //top bit to bottom bit | ||
| 825 | |||
| 826 | void XEmitter::TZCNT(int bits, X64Reg dest, OpArg src) | ||
| 827 | { | ||
| 828 | CheckFlags(); | ||
| 829 | if (!Common::GetCPUCaps().bmi1) | ||
| 830 | ASSERT_MSG(0, "Trying to use BMI1 on a system that doesn't support it. Bad programmer."); | ||
| 831 | WriteBitSearchType(bits, dest, src, 0xBC, true); | ||
| 832 | } | ||
| 833 | void XEmitter::LZCNT(int bits, X64Reg dest, OpArg src) | ||
| 834 | { | ||
| 835 | CheckFlags(); | ||
| 836 | if (!Common::GetCPUCaps().lzcnt) | ||
| 837 | ASSERT_MSG(0, "Trying to use LZCNT on a system that doesn't support it. Bad programmer."); | ||
| 838 | WriteBitSearchType(bits, dest, src, 0xBD, true); | ||
| 839 | } | ||
| 840 | |||
| 841 | void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src) | ||
| 842 | { | ||
| 843 | ASSERT_MSG(!src.IsImm(), "MOVSX - Imm argument"); | ||
| 844 | if (dbits == sbits) | ||
| 845 | { | ||
| 846 | MOV(dbits, R(dest), src); | ||
| 847 | return; | ||
| 848 | } | ||
| 849 | src.operandReg = (u8)dest; | ||
| 850 | if (dbits == 16) | ||
| 851 | Write8(0x66); | ||
| 852 | src.WriteRex(this, dbits, sbits); | ||
| 853 | if (sbits == 8) | ||
| 854 | { | ||
| 855 | Write8(0x0F); | ||
| 856 | Write8(0xBE); | ||
| 857 | } | ||
| 858 | else if (sbits == 16) | ||
| 859 | { | ||
| 860 | Write8(0x0F); | ||
| 861 | Write8(0xBF); | ||
| 862 | } | ||
| 863 | else if (sbits == 32 && dbits == 64) | ||
| 864 | { | ||
| 865 | Write8(0x63); | ||
| 866 | } | ||
| 867 | else | ||
| 868 | { | ||
| 869 | Crash(); | ||
| 870 | } | ||
| 871 | src.WriteRest(this); | ||
| 872 | } | ||
| 873 | |||
| 874 | void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src) | ||
| 875 | { | ||
| 876 | ASSERT_MSG(!src.IsImm(), "MOVZX - Imm argument"); | ||
| 877 | if (dbits == sbits) | ||
| 878 | { | ||
| 879 | MOV(dbits, R(dest), src); | ||
| 880 | return; | ||
| 881 | } | ||
| 882 | src.operandReg = (u8)dest; | ||
| 883 | if (dbits == 16) | ||
| 884 | Write8(0x66); | ||
| 885 | //the 32-bit result is automatically zero-extended to 64-bit | ||
| 886 | src.WriteRex(this, dbits == 64 ? 32 : dbits, sbits); | ||
| 887 | if (sbits == 8) | ||
| 888 | { | ||
| 889 | Write8(0x0F); | ||
| 890 | Write8(0xB6); | ||
| 891 | } | ||
| 892 | else if (sbits == 16) | ||
| 893 | { | ||
| 894 | Write8(0x0F); | ||
| 895 | Write8(0xB7); | ||
| 896 | } | ||
| 897 | else if (sbits == 32 && dbits == 64) | ||
| 898 | { | ||
| 899 | Write8(0x8B); | ||
| 900 | } | ||
| 901 | else | ||
| 902 | { | ||
| 903 | ASSERT_MSG(0, "MOVZX - Invalid size"); | ||
| 904 | } | ||
| 905 | src.WriteRest(this); | ||
| 906 | } | ||
| 907 | |||
| 908 | void XEmitter::MOVBE(int bits, const OpArg& dest, const OpArg& src) | ||
| 909 | { | ||
| 910 | ASSERT_MSG(Common::GetCPUCaps().movbe, "Generating MOVBE on a system that does not support it."); | ||
| 911 | if (bits == 8) | ||
| 912 | { | ||
| 913 | MOV(bits, dest, src); | ||
| 914 | return; | ||
| 915 | } | ||
| 916 | |||
| 917 | if (bits == 16) | ||
| 918 | Write8(0x66); | ||
| 919 | |||
| 920 | if (dest.IsSimpleReg()) | ||
| 921 | { | ||
| 922 | ASSERT_MSG(!src.IsSimpleReg() && !src.IsImm(), "MOVBE: Loading from !mem"); | ||
| 923 | src.WriteRex(this, bits, bits, dest.GetSimpleReg()); | ||
| 924 | Write8(0x0F); Write8(0x38); Write8(0xF0); | ||
| 925 | src.WriteRest(this, 0, dest.GetSimpleReg()); | ||
| 926 | } | ||
| 927 | else if (src.IsSimpleReg()) | ||
| 928 | { | ||
| 929 | ASSERT_MSG(!dest.IsSimpleReg() && !dest.IsImm(), "MOVBE: Storing to !mem"); | ||
| 930 | dest.WriteRex(this, bits, bits, src.GetSimpleReg()); | ||
| 931 | Write8(0x0F); Write8(0x38); Write8(0xF1); | ||
| 932 | dest.WriteRest(this, 0, src.GetSimpleReg()); | ||
| 933 | } | ||
| 934 | else | ||
| 935 | { | ||
| 936 | ASSERT_MSG(0, "MOVBE: Not loading or storing to mem"); | ||
| 937 | } | ||
| 938 | } | ||
| 939 | |||
| 940 | |||
| 941 | void XEmitter::LEA(int bits, X64Reg dest, OpArg src) | ||
| 942 | { | ||
| 943 | ASSERT_MSG(!src.IsImm(), "LEA - Imm argument"); | ||
| 944 | src.operandReg = (u8)dest; | ||
| 945 | if (bits == 16) | ||
| 946 | Write8(0x66); //TODO: performance warning | ||
| 947 | src.WriteRex(this, bits, bits); | ||
| 948 | Write8(0x8D); | ||
| 949 | src.WriteRest(this, 0, INVALID_REG, bits == 64); | ||
| 950 | } | ||
| 951 | |||
| 952 | //shift can be either imm8 or cl | ||
| 953 | void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext) | ||
| 954 | { | ||
| 955 | CheckFlags(); | ||
| 956 | bool writeImm = false; | ||
| 957 | if (dest.IsImm()) | ||
| 958 | { | ||
| 959 | ASSERT_MSG(0, "WriteShift - can't shift imms"); | ||
| 960 | } | ||
| 961 | if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8)) | ||
| 962 | { | ||
| 963 | ASSERT_MSG(0, "WriteShift - illegal argument"); | ||
| 964 | } | ||
| 965 | dest.operandReg = ext; | ||
| 966 | if (bits == 16) | ||
| 967 | Write8(0x66); | ||
| 968 | dest.WriteRex(this, bits, bits, 0); | ||
| 969 | if (shift.GetImmBits() == 8) | ||
| 970 | { | ||
| 971 | //ok an imm | ||
| 972 | u8 imm = (u8)shift.offset; | ||
| 973 | if (imm == 1) | ||
| 974 | { | ||
| 975 | Write8(bits == 8 ? 0xD0 : 0xD1); | ||
| 976 | } | ||
| 977 | else | ||
| 978 | { | ||
| 979 | writeImm = true; | ||
| 980 | Write8(bits == 8 ? 0xC0 : 0xC1); | ||
| 981 | } | ||
| 982 | } | ||
| 983 | else | ||
| 984 | { | ||
| 985 | Write8(bits == 8 ? 0xD2 : 0xD3); | ||
| 986 | } | ||
| 987 | dest.WriteRest(this, writeImm ? 1 : 0); | ||
| 988 | if (writeImm) | ||
| 989 | Write8((u8)shift.offset); | ||
| 990 | } | ||
| 991 | |||
| 992 | // Large rotates and shifts are slower on Intel than on AMD; | ||
| 993 | // Intel prefers rotating by 1, and that form of the op is smaller too. | ||
| 994 | void XEmitter::ROL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 0);} | ||
| 995 | void XEmitter::ROR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 1);} | ||
| 996 | void XEmitter::RCL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 2);} | ||
| 997 | void XEmitter::RCR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 3);} | ||
| 998 | void XEmitter::SHL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 4);} | ||
| 999 | void XEmitter::SHR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 5);} | ||
| 1000 | void XEmitter::SAR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 7);} | ||
| 1001 | |||
| 1002 | // index can be either imm8 or register, don't use memory destination because it's slow | ||
| 1003 | void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext) | ||
| 1004 | { | ||
| 1005 | CheckFlags(); | ||
| 1006 | if (dest.IsImm()) | ||
| 1007 | { | ||
| 1008 | ASSERT_MSG(0, "WriteBitTest - can't test imms"); | ||
| 1009 | } | ||
| 1010 | if ((index.IsImm() && index.GetImmBits() != 8)) | ||
| 1011 | { | ||
| 1012 | ASSERT_MSG(0, "WriteBitTest - illegal argument"); | ||
| 1013 | } | ||
| 1014 | if (bits == 16) | ||
| 1015 | Write8(0x66); | ||
| 1016 | if (index.IsImm()) | ||
| 1017 | { | ||
| 1018 | dest.WriteRex(this, bits, bits); | ||
| 1019 | Write8(0x0F); Write8(0xBA); | ||
| 1020 | dest.WriteRest(this, 1, (X64Reg)ext); | ||
| 1021 | Write8((u8)index.offset); | ||
| 1022 | } | ||
| 1023 | else | ||
| 1024 | { | ||
| 1025 | X64Reg operand = index.GetSimpleReg(); | ||
| 1026 | dest.WriteRex(this, bits, bits, operand); | ||
| 1027 | Write8(0x0F); Write8(0x83 + 8*ext); | ||
| 1028 | dest.WriteRest(this, 1, operand); | ||
| 1029 | } | ||
| 1030 | } | ||
| 1031 | |||
| 1032 | void XEmitter::BT(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 4);} | ||
| 1033 | void XEmitter::BTS(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 5);} | ||
| 1034 | void XEmitter::BTR(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 6);} | ||
| 1035 | void XEmitter::BTC(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 7);} | ||
| 1036 | |||
| 1037 | //shift can be either imm8 or cl | ||
| 1038 | void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift) | ||
| 1039 | { | ||
| 1040 | CheckFlags(); | ||
| 1041 | if (dest.IsImm()) | ||
| 1042 | { | ||
| 1043 | ASSERT_MSG(0, "SHRD - can't use imms as destination"); | ||
| 1044 | } | ||
| 1045 | if (!src.IsSimpleReg()) | ||
| 1046 | { | ||
| 1047 | ASSERT_MSG(0, "SHRD - must use simple register as source"); | ||
| 1048 | } | ||
| 1049 | if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8)) | ||
| 1050 | { | ||
| 1051 | ASSERT_MSG(0, "SHRD - illegal shift"); | ||
| 1052 | } | ||
| 1053 | if (bits == 16) | ||
| 1054 | Write8(0x66); | ||
| 1055 | X64Reg operand = src.GetSimpleReg(); | ||
| 1056 | dest.WriteRex(this, bits, bits, operand); | ||
| 1057 | if (shift.GetImmBits() == 8) | ||
| 1058 | { | ||
| 1059 | Write8(0x0F); Write8(0xAC); | ||
| 1060 | dest.WriteRest(this, 1, operand); | ||
| 1061 | Write8((u8)shift.offset); | ||
| 1062 | } | ||
| 1063 | else | ||
| 1064 | { | ||
| 1065 | Write8(0x0F); Write8(0xAD); | ||
| 1066 | dest.WriteRest(this, 0, operand); | ||
| 1067 | } | ||
| 1068 | } | ||
| 1069 | |||
| 1070 | void XEmitter::SHLD(int bits, OpArg dest, OpArg src, OpArg shift) | ||
| 1071 | { | ||
| 1072 | CheckFlags(); | ||
| 1073 | if (dest.IsImm()) | ||
| 1074 | { | ||
| 1075 | ASSERT_MSG(0, "SHLD - can't use imms as destination"); | ||
| 1076 | } | ||
| 1077 | if (!src.IsSimpleReg()) | ||
| 1078 | { | ||
| 1079 | ASSERT_MSG(0, "SHLD - must use simple register as source"); | ||
| 1080 | } | ||
| 1081 | if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8)) | ||
| 1082 | { | ||
| 1083 | ASSERT_MSG(0, "SHLD - illegal shift"); | ||
| 1084 | } | ||
| 1085 | if (bits == 16) | ||
| 1086 | Write8(0x66); | ||
| 1087 | X64Reg operand = src.GetSimpleReg(); | ||
| 1088 | dest.WriteRex(this, bits, bits, operand); | ||
| 1089 | if (shift.GetImmBits() == 8) | ||
| 1090 | { | ||
| 1091 | Write8(0x0F); Write8(0xA4); | ||
| 1092 | dest.WriteRest(this, 1, operand); | ||
| 1093 | Write8((u8)shift.offset); | ||
| 1094 | } | ||
| 1095 | else | ||
| 1096 | { | ||
| 1097 | Write8(0x0F); Write8(0xA5); | ||
| 1098 | dest.WriteRest(this, 0, operand); | ||
| 1099 | } | ||
| 1100 | } | ||
| 1101 | |||
| 1102 | void OpArg::WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg _operandReg, int bits) | ||
| 1103 | { | ||
| 1104 | if (bits == 16) | ||
| 1105 | emit->Write8(0x66); | ||
| 1106 | |||
| 1107 | this->operandReg = (u8)_operandReg; | ||
| 1108 | WriteRex(emit, bits, bits); | ||
| 1109 | emit->Write8(op); | ||
| 1110 | WriteRest(emit); | ||
| 1111 | } | ||
| 1112 | |||
| 1113 | //operand can either be immediate or register | ||
| 1114 | void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const | ||
| 1115 | { | ||
| 1116 | X64Reg _operandReg; | ||
| 1117 | if (IsImm()) | ||
| 1118 | { | ||
| 1119 | ASSERT_MSG(0, "WriteNormalOp - Imm argument, wrong order"); | ||
| 1120 | } | ||
| 1121 | |||
| 1122 | if (bits == 16) | ||
| 1123 | emit->Write8(0x66); | ||
| 1124 | |||
| 1125 | int immToWrite = 0; | ||
| 1126 | |||
| 1127 | if (operand.IsImm()) | ||
| 1128 | { | ||
| 1129 | WriteRex(emit, bits, bits); | ||
| 1130 | |||
| 1131 | if (!toRM) | ||
| 1132 | { | ||
| 1133 | ASSERT_MSG(0, "WriteNormalOp - Writing to Imm (!toRM)"); | ||
| 1134 | } | ||
| 1135 | |||
| 1136 | if (operand.scale == SCALE_IMM8 && bits == 8) | ||
| 1137 | { | ||
| 1138 | // op al, imm8 | ||
| 1139 | if (!scale && offsetOrBaseReg == AL && normalops[op].eaximm8 != 0xCC) | ||
| 1140 | { | ||
| 1141 | emit->Write8(normalops[op].eaximm8); | ||
| 1142 | emit->Write8((u8)operand.offset); | ||
| 1143 | return; | ||
| 1144 | } | ||
| 1145 | // mov reg, imm8 | ||
| 1146 | if (!scale && op == nrmMOV) | ||
| 1147 | { | ||
| 1148 | emit->Write8(0xB0 + (offsetOrBaseReg & 7)); | ||
| 1149 | emit->Write8((u8)operand.offset); | ||
| 1150 | return; | ||
| 1151 | } | ||
| 1152 | // op r/m8, imm8 | ||
| 1153 | emit->Write8(normalops[op].imm8); | ||
| 1154 | immToWrite = 8; | ||
| 1155 | } | ||
| 1156 | else if ((operand.scale == SCALE_IMM16 && bits == 16) || | ||
| 1157 | (operand.scale == SCALE_IMM32 && bits == 32) || | ||
| 1158 | (operand.scale == SCALE_IMM32 && bits == 64)) | ||
| 1159 | { | ||
| 1160 | // Try to save immediate size if we can, but first check to see | ||
| 1161 | // if the instruction supports simm8. | ||
| 1162 | // op r/m, imm8 | ||
| 1163 | if (normalops[op].simm8 != 0xCC && | ||
| 1164 | ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) || | ||
| 1165 | (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset))) | ||
| 1166 | { | ||
| 1167 | emit->Write8(normalops[op].simm8); | ||
| 1168 | immToWrite = 8; | ||
| 1169 | } | ||
| 1170 | else | ||
| 1171 | { | ||
| 1172 | // mov reg, imm | ||
| 1173 | if (!scale && op == nrmMOV && bits != 64) | ||
| 1174 | { | ||
| 1175 | emit->Write8(0xB8 + (offsetOrBaseReg & 7)); | ||
| 1176 | if (bits == 16) | ||
| 1177 | emit->Write16((u16)operand.offset); | ||
| 1178 | else | ||
| 1179 | emit->Write32((u32)operand.offset); | ||
| 1180 | return; | ||
| 1181 | } | ||
| 1182 | // op eax, imm | ||
| 1183 | if (!scale && offsetOrBaseReg == EAX && normalops[op].eaximm32 != 0xCC) | ||
| 1184 | { | ||
| 1185 | emit->Write8(normalops[op].eaximm32); | ||
| 1186 | if (bits == 16) | ||
| 1187 | emit->Write16((u16)operand.offset); | ||
| 1188 | else | ||
| 1189 | emit->Write32((u32)operand.offset); | ||
| 1190 | return; | ||
| 1191 | } | ||
| 1192 | // op r/m, imm | ||
| 1193 | emit->Write8(normalops[op].imm32); | ||
| 1194 | immToWrite = bits == 16 ? 16 : 32; | ||
| 1195 | } | ||
| 1196 | } | ||
| 1197 | else if ((operand.scale == SCALE_IMM8 && bits == 16) || | ||
| 1198 | (operand.scale == SCALE_IMM8 && bits == 32) || | ||
| 1199 | (operand.scale == SCALE_IMM8 && bits == 64)) | ||
| 1200 | { | ||
| 1201 | // op r/m, imm8 | ||
| 1202 | emit->Write8(normalops[op].simm8); | ||
| 1203 | immToWrite = 8; | ||
| 1204 | } | ||
| 1205 | else if (operand.scale == SCALE_IMM64 && bits == 64) | ||
| 1206 | { | ||
| 1207 | if (scale) | ||
| 1208 | { | ||
| 1209 | ASSERT_MSG(0, "WriteNormalOp - MOV with 64-bit imm requires register destination"); | ||
| 1210 | } | ||
| 1211 | // mov reg64, imm64 | ||
| 1212 | else if (op == nrmMOV) | ||
| 1213 | { | ||
| 1214 | emit->Write8(0xB8 + (offsetOrBaseReg & 7)); | ||
| 1215 | emit->Write64((u64)operand.offset); | ||
| 1216 | return; | ||
| 1217 | } | ||
| 1218 | ASSERT_MSG(0, "WriteNormalOp - Only MOV can take 64-bit imm"); | ||
| 1219 | } | ||
| 1220 | else | ||
| 1221 | { | ||
| 1222 | ASSERT_MSG(0, "WriteNormalOp - Unhandled case"); | ||
| 1223 | } | ||
| 1224 | _operandReg = (X64Reg)normalops[op].ext; //pass extension in REG of ModRM | ||
| 1225 | } | ||
| 1226 | else | ||
| 1227 | { | ||
| 1228 | _operandReg = (X64Reg)operand.offsetOrBaseReg; | ||
| 1229 | WriteRex(emit, bits, bits, _operandReg); | ||
| 1230 | // op r/m, reg | ||
| 1231 | if (toRM) | ||
| 1232 | { | ||
| 1233 | emit->Write8(bits == 8 ? normalops[op].toRm8 : normalops[op].toRm32); | ||
| 1234 | } | ||
| 1235 | // op reg, r/m | ||
| 1236 | else | ||
| 1237 | { | ||
| 1238 | emit->Write8(bits == 8 ? normalops[op].fromRm8 : normalops[op].fromRm32); | ||
| 1239 | } | ||
| 1240 | } | ||
| 1241 | WriteRest(emit, immToWrite >> 3, _operandReg); | ||
| 1242 | switch (immToWrite) | ||
| 1243 | { | ||
| 1244 | case 0: | ||
| 1245 | break; | ||
| 1246 | case 8: | ||
| 1247 | emit->Write8((u8)operand.offset); | ||
| 1248 | break; | ||
| 1249 | case 16: | ||
| 1250 | emit->Write16((u16)operand.offset); | ||
| 1251 | break; | ||
| 1252 | case 32: | ||
| 1253 | emit->Write32((u32)operand.offset); | ||
| 1254 | break; | ||
| 1255 | default: | ||
| 1256 | ASSERT_MSG(0, "WriteNormalOp - Unhandled case"); | ||
| 1257 | } | ||
| 1258 | } | ||
| 1259 | |||
| 1260 | void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2) | ||
| 1261 | { | ||
| 1262 | if (a1.IsImm()) | ||
| 1263 | { | ||
| 1264 | //Booh! Can't write to an imm | ||
| 1265 | ASSERT_MSG(0, "WriteNormalOp - a1 cannot be imm"); | ||
| 1266 | return; | ||
| 1267 | } | ||
| 1268 | if (a2.IsImm()) | ||
| 1269 | { | ||
| 1270 | a1.WriteNormalOp(emit, true, op, a2, bits); | ||
| 1271 | } | ||
| 1272 | else | ||
| 1273 | { | ||
| 1274 | if (a1.IsSimpleReg()) | ||
| 1275 | { | ||
| 1276 | a2.WriteNormalOp(emit, false, op, a1, bits); | ||
| 1277 | } | ||
| 1278 | else | ||
| 1279 | { | ||
| 1280 | ASSERT_MSG(a2.IsSimpleReg() || a2.IsImm(), "WriteNormalOp - a1 and a2 cannot both be memory"); | ||
| 1281 | a1.WriteNormalOp(emit, true, op, a2, bits); | ||
| 1282 | } | ||
| 1283 | } | ||
| 1284 | } | ||
| 1285 | |||
| 1286 | void XEmitter::ADD (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADD, a1, a2);} | ||
| 1287 | void XEmitter::ADC (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADC, a1, a2);} | ||
| 1288 | void XEmitter::SUB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSUB, a1, a2);} | ||
| 1289 | void XEmitter::SBB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSBB, a1, a2);} | ||
| 1290 | void XEmitter::AND (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmAND, a1, a2);} | ||
| 1291 | void XEmitter::OR (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmOR , a1, a2);} | ||
| 1292 | void XEmitter::XOR (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmXOR, a1, a2);} | ||
| 1293 | void XEmitter::MOV (int bits, const OpArg &a1, const OpArg &a2) | ||
| 1294 | { | ||
| 1295 | if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg()) | ||
| 1296 | LOG_ERROR(Common, "Redundant MOV @ %p - bug in JIT?", code); | ||
| 1297 | WriteNormalOp(this, bits, nrmMOV, a1, a2); | ||
| 1298 | } | ||
| 1299 | void XEmitter::TEST(int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmTEST, a1, a2);} | ||
| 1300 | void XEmitter::CMP (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmCMP, a1, a2);} | ||
| 1301 | void XEmitter::XCHG(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmXCHG, a1, a2);} | ||
| 1302 | |||
| 1303 | void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2) | ||
| 1304 | { | ||
| 1305 | CheckFlags(); | ||
| 1306 | if (bits == 8) | ||
| 1307 | { | ||
| 1308 | ASSERT_MSG(0, "IMUL - illegal bit size!"); | ||
| 1309 | return; | ||
| 1310 | } | ||
| 1311 | |||
| 1312 | if (a1.IsImm()) | ||
| 1313 | { | ||
| 1314 | ASSERT_MSG(0, "IMUL - second arg cannot be imm!"); | ||
| 1315 | return; | ||
| 1316 | } | ||
| 1317 | |||
| 1318 | if (!a2.IsImm()) | ||
| 1319 | { | ||
| 1320 | ASSERT_MSG(0, "IMUL - third arg must be imm!"); | ||
| 1321 | return; | ||
| 1322 | } | ||
| 1323 | |||
| 1324 | if (bits == 16) | ||
| 1325 | Write8(0x66); | ||
| 1326 | a1.WriteRex(this, bits, bits, regOp); | ||
| 1327 | |||
| 1328 | if (a2.GetImmBits() == 8 || | ||
| 1329 | (a2.GetImmBits() == 16 && (s8)a2.offset == (s16)a2.offset) || | ||
| 1330 | (a2.GetImmBits() == 32 && (s8)a2.offset == (s32)a2.offset)) | ||
| 1331 | { | ||
| 1332 | Write8(0x6B); | ||
| 1333 | a1.WriteRest(this, 1, regOp); | ||
| 1334 | Write8((u8)a2.offset); | ||
| 1335 | } | ||
| 1336 | else | ||
| 1337 | { | ||
| 1338 | Write8(0x69); | ||
| 1339 | if (a2.GetImmBits() == 16 && bits == 16) | ||
| 1340 | { | ||
| 1341 | a1.WriteRest(this, 2, regOp); | ||
| 1342 | Write16((u16)a2.offset); | ||
| 1343 | } | ||
| 1344 | else if (a2.GetImmBits() == 32 && (bits == 32 || bits == 64)) | ||
| 1345 | { | ||
| 1346 | a1.WriteRest(this, 4, regOp); | ||
| 1347 | Write32((u32)a2.offset); | ||
| 1348 | } | ||
| 1349 | else | ||
| 1350 | { | ||
| 1351 | ASSERT_MSG(0, "IMUL - unhandled case!"); | ||
| 1352 | } | ||
| 1353 | } | ||
| 1354 | } | ||
| 1355 | |||
| 1356 | void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a) | ||
| 1357 | { | ||
| 1358 | CheckFlags(); | ||
| 1359 | if (bits == 8) | ||
| 1360 | { | ||
| 1361 | ASSERT_MSG(0, "IMUL - illegal bit size!"); | ||
| 1362 | return; | ||
| 1363 | } | ||
| 1364 | |||
| 1365 | if (a.IsImm()) | ||
| 1366 | { | ||
| 1367 | IMUL(bits, regOp, R(regOp), a); | ||
| 1368 | return; | ||
| 1369 | } | ||
| 1370 | |||
| 1371 | if (bits == 16) | ||
| 1372 | Write8(0x66); | ||
| 1373 | a.WriteRex(this, bits, bits, regOp); | ||
| 1374 | Write8(0x0F); | ||
| 1375 | Write8(0xAF); | ||
| 1376 | a.WriteRest(this, 0, regOp); | ||
| 1377 | } | ||
| 1378 | |||
| 1379 | |||
| 1380 | void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) | ||
| 1381 | { | ||
| 1382 | if (opPrefix) | ||
| 1383 | Write8(opPrefix); | ||
| 1384 | arg.operandReg = regOp; | ||
| 1385 | arg.WriteRex(this, 0, 0); | ||
| 1386 | Write8(0x0F); | ||
| 1387 | if (op > 0xFF) | ||
| 1388 | Write8((op >> 8) & 0xFF); | ||
| 1389 | Write8(op & 0xFF); | ||
| 1390 | arg.WriteRest(this, extrabytes); | ||
| 1391 | } | ||
| 1392 | |||
| 1393 | void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) | ||
| 1394 | { | ||
| 1395 | WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes); | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | static int GetVEXmmmmm(u16 op) | ||
| 1399 | { | ||
| 1400 | // Currently, only 0x38 and 0x3A are used as secondary escape byte. | ||
| 1401 | if ((op >> 8) == 0x3A) | ||
| 1402 | return 3; | ||
| 1403 | else if ((op >> 8) == 0x38) | ||
| 1404 | return 2; | ||
| 1405 | else | ||
| 1406 | return 1; | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | static int GetVEXpp(u8 opPrefix) | ||
| 1410 | { | ||
| 1411 | if (opPrefix == 0x66) | ||
| 1412 | return 1; | ||
| 1413 | else if (opPrefix == 0xF3) | ||
| 1414 | return 2; | ||
| 1415 | else if (opPrefix == 0xF2) | ||
| 1416 | return 3; | ||
| 1417 | else | ||
| 1418 | return 0; | ||
| 1419 | } | ||
| 1420 | |||
| 1421 | void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) | ||
| 1422 | { | ||
| 1423 | if (!Common::GetCPUCaps().avx) | ||
| 1424 | ASSERT_MSG(0, "Trying to use AVX on a system that doesn't support it. Bad programmer."); | ||
| 1425 | int mmmmm = GetVEXmmmmm(op); | ||
| 1426 | int pp = GetVEXpp(opPrefix); | ||
| 1427 | // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here | ||
| 1428 | arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm); | ||
| 1429 | Write8(op & 0xFF); | ||
| 1430 | arg.WriteRest(this, extrabytes, regOp1); | ||
| 1431 | } | ||
| 1432 | |||
| 1433 | // Like the above, but more general; covers GPR-based VEX operations, like BMI1/2 | ||
| 1434 | void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) | ||
| 1435 | { | ||
| 1436 | if (size != 32 && size != 64) | ||
| 1437 | ASSERT_MSG(0, "VEX GPR instructions only support 32-bit and 64-bit modes!"); | ||
| 1438 | int mmmmm = GetVEXmmmmm(op); | ||
| 1439 | int pp = GetVEXpp(opPrefix); | ||
| 1440 | arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm, size == 64); | ||
| 1441 | Write8(op & 0xFF); | ||
| 1442 | arg.WriteRest(this, extrabytes, regOp1); | ||
| 1443 | } | ||
| 1444 | |||
| 1445 | void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) | ||
| 1446 | { | ||
| 1447 | CheckFlags(); | ||
| 1448 | if (!Common::GetCPUCaps().bmi1) | ||
| 1449 | ASSERT_MSG(0, "Trying to use BMI1 on a system that doesn't support it. Bad programmer."); | ||
| 1450 | WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); | ||
| 1451 | } | ||
| 1452 | |||
| 1453 | void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) | ||
| 1454 | { | ||
| 1455 | CheckFlags(); | ||
| 1456 | if (!Common::GetCPUCaps().bmi2) | ||
| 1457 | ASSERT_MSG(0, "Trying to use BMI2 on a system that doesn't support it. Bad programmer."); | ||
| 1458 | WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); | ||
| 1459 | } | ||
| 1460 | |||
| 1461 | void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6E, dest, arg, 0);} | ||
| 1462 | void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(0x66, 0x7E, src, arg, 0);} | ||
| 1463 | |||
| 1464 | void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) | ||
| 1465 | { | ||
| 1466 | #ifdef ARCHITECTURE_x86_64 | ||
| 1467 | // Alternate encoding | ||
| 1468 | // This does not display correctly in MSVC's debugger; it decodes it as a MOVD | ||
| 1469 | arg.operandReg = dest; | ||
| 1470 | Write8(0x66); | ||
| 1471 | arg.WriteRex(this, 64, 0); | ||
| 1472 | Write8(0x0f); | ||
| 1473 | Write8(0x6E); | ||
| 1474 | arg.WriteRest(this, 0); | ||
| 1475 | #else | ||
| 1476 | arg.operandReg = dest; | ||
| 1477 | Write8(0xF3); | ||
| 1478 | Write8(0x0f); | ||
| 1479 | Write8(0x7E); | ||
| 1480 | arg.WriteRest(this, 0); | ||
| 1481 | #endif | ||
| 1482 | } | ||
| 1483 | |||
| 1484 | void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) | ||
| 1485 | { | ||
| 1486 | if (src > 7 || arg.IsSimpleReg()) | ||
| 1487 | { | ||
| 1488 | // Alternate encoding | ||
| 1489 | // This does not display correctly in MSVC's debugger; it decodes it as a MOVD | ||
| 1490 | arg.operandReg = src; | ||
| 1491 | Write8(0x66); | ||
| 1492 | arg.WriteRex(this, 64, 0); | ||
| 1493 | Write8(0x0f); | ||
| 1494 | Write8(0x7E); | ||
| 1495 | arg.WriteRest(this, 0); | ||
| 1496 | } | ||
| 1497 | else | ||
| 1498 | { | ||
| 1499 | arg.operandReg = src; | ||
| 1500 | arg.WriteRex(this, 0, 0); | ||
| 1501 | Write8(0x66); | ||
| 1502 | Write8(0x0f); | ||
| 1503 | Write8(0xD6); | ||
| 1504 | arg.WriteRest(this, 0); | ||
| 1505 | } | ||
| 1506 | } | ||
| 1507 | |||
| 1508 | void XEmitter::WriteMXCSR(OpArg arg, int ext) | ||
| 1509 | { | ||
| 1510 | if (arg.IsImm() || arg.IsSimpleReg()) | ||
| 1511 | ASSERT_MSG(0, "MXCSR - invalid operand"); | ||
| 1512 | |||
| 1513 | arg.operandReg = ext; | ||
| 1514 | arg.WriteRex(this, 0, 0); | ||
| 1515 | Write8(0x0F); | ||
| 1516 | Write8(0xAE); | ||
| 1517 | arg.WriteRest(this); | ||
| 1518 | } | ||
| 1519 | |||
| 1520 | void XEmitter::STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);} | ||
| 1521 | void XEmitter::LDMXCSR(OpArg memloc) {WriteMXCSR(memloc, 2);} | ||
| 1522 | |||
| 1523 | void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);} | ||
| 1524 | void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);} | ||
| 1525 | void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);} | ||
| 1526 | |||
| 1527 | void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseADD, regOp, arg);} | ||
| 1528 | void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseADD, regOp, arg);} | ||
| 1529 | void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSUB, regOp, arg);} | ||
| 1530 | void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSUB, regOp, arg);} | ||
| 1531 | void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);} | ||
| 1532 | void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);} | ||
| 1533 | void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMUL, regOp, arg);} | ||
| 1534 | void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMUL, regOp, arg);} | ||
| 1535 | void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseDIV, regOp, arg);} | ||
| 1536 | void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseDIV, regOp, arg);} | ||
| 1537 | void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMIN, regOp, arg);} | ||
| 1538 | void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMIN, regOp, arg);} | ||
| 1539 | void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMAX, regOp, arg);} | ||
| 1540 | void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMAX, regOp, arg);} | ||
| 1541 | void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSQRT, regOp, arg);} | ||
| 1542 | void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSQRT, regOp, arg);} | ||
| 1543 | void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);} | ||
| 1544 | |||
| 1545 | void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseADD, regOp, arg);} | ||
| 1546 | void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseADD, regOp, arg);} | ||
| 1547 | void XEmitter::SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSUB, regOp, arg);} | ||
| 1548 | void XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSUB, regOp, arg);} | ||
| 1549 | void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);} | ||
| 1550 | void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);} | ||
| 1551 | void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseAND, regOp, arg);} | ||
| 1552 | void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseAND, regOp, arg);} | ||
| 1553 | void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseANDN, regOp, arg);} | ||
| 1554 | void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseANDN, regOp, arg);} | ||
| 1555 | void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseOR, regOp, arg);} | ||
| 1556 | void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseOR, regOp, arg);} | ||
| 1557 | void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseXOR, regOp, arg);} | ||
| 1558 | void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseXOR, regOp, arg);} | ||
| 1559 | void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMUL, regOp, arg);} | ||
| 1560 | void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMUL, regOp, arg);} | ||
| 1561 | void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseDIV, regOp, arg);} | ||
| 1562 | void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseDIV, regOp, arg);} | ||
| 1563 | void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMIN, regOp, arg);} | ||
| 1564 | void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMIN, regOp, arg);} | ||
| 1565 | void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMAX, regOp, arg);} | ||
| 1566 | void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMAX, regOp, arg);} | ||
| 1567 | void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSQRT, regOp, arg);} | ||
| 1568 | void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSQRT, regOp, arg);} | ||
| 1569 | void XEmitter::RCPPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseRCP, regOp, arg); } | ||
| 1570 | void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);} | ||
| 1571 | void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);} | ||
| 1572 | void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);} | ||
| 1573 | |||
| 1574 | void XEmitter::HADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseHADD, regOp, arg);} | ||
| 1575 | |||
| 1576 | void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} // oddly, these scalar compares use the packed-style prefixes (none/0x66) | ||
| 1577 | void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} // ordered compare | ||
| 1578 | void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} // unordered compare | ||
| 1579 | void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);} | ||
| 1580 | |||
| 1581 | void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);} | ||
| 1582 | void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);} | ||
| 1583 | void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);} | ||
| 1584 | void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);} | ||
| 1585 | |||
| 1586 | void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);} | ||
| 1587 | void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);} | ||
| 1588 | void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);} | ||
| 1589 | void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);} | ||
| 1590 | |||
| 1591 | void XEmitter::MOVDQA(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg);} | ||
| 1592 | void XEmitter::MOVDQA(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg);} | ||
| 1593 | void XEmitter::MOVDQU(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg);} | ||
| 1594 | void XEmitter::MOVDQU(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg);} | ||
| 1595 | |||
| 1596 | void XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);} | ||
| 1597 | void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);} | ||
| 1598 | void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);} | ||
| 1599 | void XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);} | ||
| 1600 | |||
| 1601 | void XEmitter::MOVLPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); } | ||
| 1602 | void XEmitter::MOVLPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); } | ||
| 1603 | void XEmitter::MOVLPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); } | ||
| 1604 | void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); } | ||
| 1605 | |||
| 1606 | void XEmitter::MOVHPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); } | ||
| 1607 | void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); } | ||
| 1608 | void XEmitter::MOVHPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); } | ||
| 1609 | void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); } | ||
| 1610 | |||
| 1611 | void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));} | ||
| 1612 | void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));} | ||
| 1613 | |||
| 1614 | void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);} | ||
| 1615 | void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);} | ||
| 1616 | |||
| 1617 | void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);} | ||
| 1618 | void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);} | ||
| 1619 | void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);} | ||
| 1620 | void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);} | ||
| 1621 | void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2A, regOp, arg);} | ||
| 1622 | void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);} | ||
| 1623 | |||
| 1624 | void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);} | ||
| 1625 | void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);} | ||
| 1626 | void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);} | ||
| 1627 | void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);} | ||
| 1628 | |||
| 1629 | void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);} | ||
| 1630 | void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2C, regOp, arg);} | ||
| 1631 | void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);} | ||
| 1632 | void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);} | ||
| 1633 | |||
| 1634 | void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));} | ||
| 1635 | |||
| 1636 | void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);} | ||
| 1637 | void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);} | ||
| 1638 | |||
| 1639 | void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only | ||
| 1640 | |||
| 1641 | // THESE TWO ARE UNTESTED. | ||
| 1642 | void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);} | ||
| 1643 | void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);} | ||
| 1644 | |||
| 1645 | void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x14, dest, arg);} | ||
| 1646 | void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x15, dest, arg);} | ||
| 1647 | |||
| 1648 | void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg) | ||
| 1649 | { | ||
| 1650 | if (Common::GetCPUCaps().sse3) | ||
| 1651 | { | ||
| 1652 | WriteSSEOp(0xF2, 0x12, regOp, arg); //SSE3 movddup | ||
| 1653 | } | ||
| 1654 | else | ||
| 1655 | { | ||
| 1656 | // Simulate this instruction with SSE2 instructions | ||
| 1657 | if (!arg.IsSimpleReg(regOp)) | ||
| 1658 | MOVSD(regOp, arg); | ||
| 1659 | UNPCKLPD(regOp, R(regOp)); | ||
| 1660 | } | ||
| 1661 | } | ||
| 1662 | |||
| 1663 | //There are a few more left | ||
| 1664 | |||
| 1665 | // Also some integer instructions are missing | ||
| 1666 | void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x6B, dest, arg);} | ||
| 1667 | void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x63, dest, arg);} | ||
| 1668 | void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, arg);} | ||
| 1669 | |||
| 1670 | void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);} | ||
| 1671 | void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);} | ||
| 1672 | void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x62, dest, arg);} | ||
| 1673 | void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6C, dest, arg);} | ||
| 1674 | |||
| 1675 | void XEmitter::PSRLW(X64Reg reg, int shift) | ||
| 1676 | { | ||
| 1677 | WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg)); | ||
| 1678 | Write8(shift); | ||
| 1679 | } | ||
| 1680 | |||
| 1681 | void XEmitter::PSRLD(X64Reg reg, int shift) | ||
| 1682 | { | ||
| 1683 | WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg)); | ||
| 1684 | Write8(shift); | ||
| 1685 | } | ||
| 1686 | |||
| 1687 | void XEmitter::PSRLQ(X64Reg reg, int shift) | ||
| 1688 | { | ||
| 1689 | WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg)); | ||
| 1690 | Write8(shift); | ||
| 1691 | } | ||
| 1692 | |||
| 1693 | void XEmitter::PSRLQ(X64Reg reg, OpArg arg) | ||
| 1694 | { | ||
| 1695 | WriteSSEOp(0x66, 0xd3, reg, arg); | ||
| 1696 | } | ||
| 1697 | |||
| 1698 | void XEmitter::PSRLDQ(X64Reg reg, int shift) { | ||
| 1699 | WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg)); | ||
| 1700 | Write8(shift); | ||
| 1701 | } | ||
| 1702 | |||
| 1703 | void XEmitter::PSLLW(X64Reg reg, int shift) | ||
| 1704 | { | ||
| 1705 | WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg)); | ||
| 1706 | Write8(shift); | ||
| 1707 | } | ||
| 1708 | |||
| 1709 | void XEmitter::PSLLD(X64Reg reg, int shift) | ||
| 1710 | { | ||
| 1711 | WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg)); | ||
| 1712 | Write8(shift); | ||
| 1713 | } | ||
| 1714 | |||
| 1715 | void XEmitter::PSLLQ(X64Reg reg, int shift) | ||
| 1716 | { | ||
| 1717 | WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg)); | ||
| 1718 | Write8(shift); | ||
| 1719 | } | ||
| 1720 | |||
| 1721 | void XEmitter::PSLLDQ(X64Reg reg, int shift) { | ||
| 1722 | WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg)); | ||
| 1723 | Write8(shift); | ||
| 1724 | } | ||
| 1725 | |||
| 1726 | void XEmitter::PSRAW(X64Reg reg, int shift) | ||
| 1727 | { | ||
| 1728 | WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg)); | ||
| 1729 | Write8(shift); | ||
| 1730 | } | ||
| 1731 | |||
| 1732 | void XEmitter::PSRAD(X64Reg reg, int shift) | ||
| 1733 | { | ||
| 1734 | WriteSSEOp(0x66, 0x72, (X64Reg)4, R(reg)); | ||
| 1735 | Write8(shift); | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) | ||
| 1739 | { | ||
| 1740 | if (!Common::GetCPUCaps().ssse3) | ||
| 1741 | ASSERT_MSG(0, "Trying to use SSSE3 on a system that doesn't support it. Bad programmer."); | ||
| 1742 | WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); | ||
| 1743 | } | ||
| 1744 | |||
| 1745 | void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) | ||
| 1746 | { | ||
| 1747 | if (!Common::GetCPUCaps().sse4_1) | ||
| 1748 | ASSERT_MSG(0, "Trying to use SSE4.1 on a system that doesn't support it. Bad programmer."); | ||
| 1749 | WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); | ||
| 1750 | } | ||
| 1751 | |||
| 1752 | void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);} | ||
| 1753 | void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);} | ||
| 1754 | void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);} | ||
| 1755 | void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);} | ||
| 1756 | |||
| 1757 | void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);} | ||
| 1758 | void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);} | ||
| 1759 | void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);} | ||
| 1760 | void XEmitter::PMINUD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383b, dest, arg);} | ||
| 1761 | void XEmitter::PMAXSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383c, dest, arg);} | ||
| 1762 | void XEmitter::PMAXSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383d, dest, arg);} | ||
| 1763 | void XEmitter::PMAXUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383e, dest, arg);} | ||
| 1764 | void XEmitter::PMAXUD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383f, dest, arg);} | ||
| 1765 | |||
| 1766 | void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);} | ||
| 1767 | void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);} | ||
| 1768 | void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);} | ||
| 1769 | void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);} | ||
| 1770 | void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);} | ||
| 1771 | void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);} | ||
| 1772 | void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);} | ||
| 1773 | void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);} | ||
| 1774 | void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);} | ||
| 1775 | void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);} | ||
| 1776 | void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);} | ||
| 1777 | void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);} | ||
| 1778 | |||
| 1779 | void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);} | ||
| 1780 | void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);} | ||
| 1781 | void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);} | ||
| 1782 | void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); Write8(blend); } | ||
| 1783 | void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); Write8(blend); } | ||
| 1784 | |||
| 1785 | void XEmitter::ROUNDSS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0A, dest, arg, 1); Write8(mode);} | ||
| 1786 | void XEmitter::ROUNDSD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0B, dest, arg, 1); Write8(mode);} | ||
| 1787 | void XEmitter::ROUNDPS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A08, dest, arg, 1); Write8(mode);} | ||
| 1788 | void XEmitter::ROUNDPD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A09, dest, arg, 1); Write8(mode);} | ||
| 1789 | |||
| 1790 | void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDB, dest, arg);} | ||
| 1791 | void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDF, dest, arg);} | ||
| 1792 | void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEF, dest, arg);} | ||
| 1793 | void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEB, dest, arg);} | ||
| 1794 | |||
| 1795 | void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFC, dest, arg);} | ||
| 1796 | void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFD, dest, arg);} | ||
| 1797 | void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFE, dest, arg);} | ||
| 1798 | void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD4, dest, arg);} | ||
| 1799 | |||
| 1800 | void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEC, dest, arg);} | ||
| 1801 | void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xED, dest, arg);} | ||
| 1802 | void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDC, dest, arg);} | ||
| 1803 | void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDD, dest, arg);} | ||
| 1804 | |||
| 1805 | void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF8, dest, arg);} | ||
| 1806 | void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF9, dest, arg);} | ||
| 1807 | void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFA, dest, arg);} | ||
| 1808 | void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFB, dest, arg);} | ||
| 1809 | |||
| 1810 | void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE8, dest, arg);} | ||
| 1811 | void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE9, dest, arg);} | ||
| 1812 | void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD8, dest, arg);} | ||
| 1813 | void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD9, dest, arg);} | ||
| 1814 | |||
| 1815 | void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE0, dest, arg);} | ||
| 1816 | void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE3, dest, arg);} | ||
| 1817 | |||
| 1818 | void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x74, dest, arg);} | ||
| 1819 | void XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x75, dest, arg);} | ||
| 1820 | void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x76, dest, arg);} | ||
| 1821 | |||
| 1822 | void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x64, dest, arg);} | ||
| 1823 | void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x65, dest, arg);} | ||
| 1824 | void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x66, dest, arg);} | ||
| 1825 | |||
| 1826 | void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC5, dest, arg, 1); Write8(subreg);} | ||
| 1827 | void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC4, dest, arg, 1); Write8(subreg);} | ||
| 1828 | |||
| 1829 | void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF5, dest, arg); } | ||
| 1830 | void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF6, dest, arg);} | ||
| 1831 | |||
| 1832 | void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEE, dest, arg); } | ||
| 1833 | void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDE, dest, arg); } | ||
| 1834 | void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEA, dest, arg); } | ||
| 1835 | void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDA, dest, arg); } | ||
| 1836 | |||
| 1837 | void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD7, dest, arg); } | ||
| 1838 | void XEmitter::PSHUFD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, 0x70, regOp, arg, 1); Write8(shuffle);} | ||
| 1839 | void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);} | ||
| 1840 | void XEmitter::PSHUFHW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF3, 0x70, regOp, arg, 1); Write8(shuffle);} | ||
| 1841 | |||
| 1842 | // VEX | ||
| 1843 | void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);} | ||
| 1844 | void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);} | ||
| 1845 | void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);} | ||
| 1846 | void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);} | ||
| 1847 | void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);} | ||
| 1848 | void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);} | ||
| 1849 | void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);} | ||
| 1850 | void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);} | ||
| 1851 | void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);} | ||
| 1852 | void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);} | ||
| 1853 | void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);} | ||
| 1854 | void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);} | ||
| 1855 | |||
| 1856 | void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); } | ||
| 1857 | void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); } | ||
| 1858 | void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); } | ||
| 1859 | void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); } | ||
| 1860 | void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); } | ||
| 1861 | void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); } | ||
| 1862 | void XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); } | ||
| 1863 | void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); } | ||
| 1864 | |||
| 1865 | void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); } | ||
| 1866 | void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); } | ||
| 1867 | void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); } | ||
| 1868 | void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); } | ||
| 1869 | |||
| 1870 | void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg); } | ||
| 1871 | void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg); } | ||
| 1872 | void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg); } | ||
| 1873 | void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg, 1); } | ||
| 1874 | void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg, 1); } | ||
| 1875 | void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg, 1); } | ||
| 1876 | void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg); } | ||
| 1877 | void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg); } | ||
| 1878 | void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg); } | ||
| 1879 | void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg, 1); } | ||
| 1880 | void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg, 1); } | ||
| 1881 | void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg, 1); } | ||
| 1882 | void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg); } | ||
| 1883 | void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg); } | ||
| 1884 | void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg); } | ||
| 1885 | void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg, 1); } | ||
| 1886 | void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg, 1); } | ||
| 1887 | void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg, 1); } | ||
| 1888 | void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg); } | ||
| 1889 | void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg); } | ||
| 1890 | void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg); } | ||
| 1891 | void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg, 1); } | ||
| 1892 | void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg, 1); } | ||
| 1893 | void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg, 1); } | ||
| 1894 | void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg); } | ||
| 1895 | void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg); } | ||
| 1896 | void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg); } | ||
| 1897 | void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg, 1); } | ||
| 1898 | void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg, 1); } | ||
| 1899 | void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg, 1); } | ||
| 1900 | void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg); } | ||
| 1901 | void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg); } | ||
| 1902 | void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg); } | ||
| 1903 | void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg, 1); } | ||
| 1904 | void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg, 1); } | ||
| 1905 | void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg, 1); } | ||
| 1906 | void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg); } | ||
| 1907 | void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg); } | ||
| 1908 | void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg); } | ||
| 1909 | void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg, 1); } | ||
| 1910 | void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg, 1); } | ||
| 1911 | void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg, 1); } | ||
| 1912 | void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg); } | ||
| 1913 | void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg); } | ||
| 1914 | void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg); } | ||
| 1915 | void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg, 1); } | ||
| 1916 | void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg, 1); } | ||
| 1917 | void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg, 1); } | ||
| 1918 | void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg); } | ||
| 1919 | void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg); } | ||
| 1920 | void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg); } | ||
| 1921 | void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg, 1); } | ||
| 1922 | void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg, 1); } | ||
| 1923 | void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg, 1); } | ||
| 1924 | void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg); } | ||
| 1925 | void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg); } | ||
| 1926 | void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg); } | ||
| 1927 | void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg, 1); } | ||
| 1928 | void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg, 1); } | ||
| 1929 | void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg, 1); } | ||
| 1930 | |||
| 1931 | void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);} | ||
| 1932 | void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);} | ||
| 1933 | void XEmitter::SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);} | ||
| 1934 | void XEmitter::RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate) {WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); Write8(rotate);} | ||
| 1935 | void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);} | ||
| 1936 | void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg);} | ||
| 1937 | void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);} | ||
| 1938 | void XEmitter::BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg);} | ||
| 1939 | void XEmitter::BLSR(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);} | ||
| 1940 | void XEmitter::BLSMSK(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);} | ||
| 1941 | void XEmitter::BLSI(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);} | ||
| 1942 | void XEmitter::BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2){WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);} | ||
| 1943 | void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);} | ||
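The BMI wrappers above assert CPU support at emit time, so call sites are expected to query Common::GetCPUCaps() themselves and emit a plain-GPR fallback when the extension is absent. A minimal sketch of that pattern, using only emitters defined in this file (EmitClearLowestSetBit is an illustrative name, not part of the patch):

    // Sketch: clear the lowest set bit of ECX into EAX.
    void EmitClearLowestSetBit(Gen::XEmitter& code)
    {
        using namespace Gen;
        if (Common::GetCPUCaps().bmi1) {
            code.BLSR(32, EAX, R(ECX));       // eax = ecx & (ecx - 1), single BMI1 op
        } else {
            code.MOV(32, R(EAX), R(ECX));     // eax = ecx
            code.SUB(32, R(EAX), Imm8(1));    // eax = ecx - 1
            code.AND(32, R(EAX), R(ECX));     // eax = ecx & (ecx - 1)
        }
    }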
| 1944 | |||
| 1945 | // Prefixes | ||
| 1946 | |||
| 1947 | void XEmitter::LOCK() { Write8(0xF0); } | ||
| 1948 | void XEmitter::REP() { Write8(0xF3); } | ||
| 1949 | void XEmitter::REPNE() { Write8(0xF2); } | ||
| 1950 | void XEmitter::FSOverride() { Write8(0x64); } | ||
| 1951 | void XEmitter::GSOverride() { Write8(0x65); } | ||
| 1952 | |||
| 1953 | void XEmitter::FWAIT() | ||
| 1954 | { | ||
| 1955 | Write8(0x9B); | ||
| 1956 | } | ||
| 1957 | |||
| 1958 | // TODO: make this more generic | ||
| 1959 | void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg) | ||
| 1960 | { | ||
| 1961 | int mf = 0; | ||
| 1962 | ASSERT_MSG(!(bits == 80 && op_80b == floatINVALID), "WriteFloatLoadStore: 80 bits not supported for this instruction"); | ||
| 1963 | switch (bits) | ||
| 1964 | { | ||
| 1965 | case 32: mf = 0; break; | ||
| 1966 | case 64: mf = 4; break; | ||
| 1967 | case 80: mf = 2; break; | ||
| 1968 | default: ASSERT_MSG(0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)"); | ||
| 1969 | } | ||
| 1970 | Write8(0xd9 | mf); | ||
| 1971 | // x87 instructions use the reg field of the ModR/M byte as opcode: | ||
| 1972 | if (bits == 80) | ||
| 1973 | op = op_80b; | ||
| 1974 | arg.WriteRest(this, 0, (X64Reg) op); | ||
| 1975 | } | ||
| 1976 | |||
| 1977 | void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, floatLD80, src);} | ||
| 1978 | void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, floatINVALID, dest);} | ||
| 1979 | void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, floatSTP80, dest);} | ||
| 1980 | void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); } | ||
| 1981 | |||
| 1982 | void XEmitter::RDTSC() { Write8(0x0F); Write8(0x31); } | ||
| 1983 | |||
| 1984 | void XCodeBlock::PoisonMemory() { | ||
| 1985 | // x86/64: 0xCC = breakpoint | ||
| 1986 | memset(region, 0xCC, region_size); | ||
| 1987 | } | ||
| 1988 | |||
| 1989 | } | ||
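For orientation, here is a hedged sketch of how the integer and SSE emitters defined above are driven from JIT code. It assumes an XEmitter already pointed at writable, executable memory; the buffer setup (XCodeBlock and friends) lives outside this hunk, and EmitExample is just an illustrative name.

    // Illustrative only: emit a short integer + SSE sequence.
    void EmitExample(Gen::XEmitter& code)
    {
        using namespace Gen;
        // Two-operand integer forms funnel through WriteNormalOp.
        code.MOV(32, R(EAX), Imm32(0x12345678));   // mov eax, 0x12345678
        code.ADD(32, R(EAX), R(ECX));              // add eax, ecx
        code.IMUL(32, EAX, R(EAX), Imm8(3));       // imul eax, eax, 3 (short 0x6B imm8 form)

        // SSE forms go through WriteSSEOp, which picks the 0x00/0x66/0xF2/0xF3 prefix.
        code.MOVAPS(XMM0, MDisp(RDX, 0x10));       // movaps xmm0, [rdx + 0x10]
        code.ADDPS(XMM0, R(XMM1));                 // addps xmm0, xmm1
    }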
diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h new file mode 100644 index 000000000..e9c924126 --- /dev/null +++ b/src/common/x64/emitter.h | |||
| @@ -0,0 +1,1067 @@ | |||
| 1 | // Copyright (C) 2003 Dolphin Project. | ||
| 2 | |||
| 3 | // This program is free software: you can redistribute it and/or modify | ||
| 4 | // it under the terms of the GNU General Public License as published by | ||
| 5 | // the Free Software Foundation, version 2.0 or later versions. | ||
| 6 | |||
| 7 | // This program is distributed in the hope that it will be useful, | ||
| 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 10 | // GNU General Public License 2.0 for more details. | ||
| 11 | |||
| 12 | // A copy of the GPL 2.0 should have been included with the program. | ||
| 13 | // If not, see http://www.gnu.org/licenses/ | ||
| 14 | |||
| 15 | // Official SVN repository and contact information can be found at | ||
| 16 | // http://code.google.com/p/dolphin-emu/ | ||
| 17 | |||
| 18 | #pragma once | ||
| 19 | |||
| 20 | #include "common/assert.h" | ||
| 21 | #include "common/common_types.h" | ||
| 22 | #include "common/code_block.h" | ||
| 23 | |||
| 24 | #if defined(ARCHITECTURE_x86_64) && !defined(_ARCH_64) | ||
| 25 | #define _ARCH_64 | ||
| 26 | #endif | ||
| 27 | |||
| 28 | #ifdef _ARCH_64 | ||
| 29 | #define PTRBITS 64 | ||
| 30 | #else | ||
| 31 | #define PTRBITS 32 | ||
| 32 | #endif | ||
| 33 | |||
| 34 | namespace Gen | ||
| 35 | { | ||
| 36 | |||
| 37 | enum X64Reg | ||
| 38 | { | ||
| 39 | EAX = 0, EBX = 3, ECX = 1, EDX = 2, | ||
| 40 | ESI = 6, EDI = 7, EBP = 5, ESP = 4, | ||
| 41 | |||
| 42 | RAX = 0, RBX = 3, RCX = 1, RDX = 2, | ||
| 43 | RSI = 6, RDI = 7, RBP = 5, RSP = 4, | ||
| 44 | R8 = 8, R9 = 9, R10 = 10,R11 = 11, | ||
| 45 | R12 = 12,R13 = 13,R14 = 14,R15 = 15, | ||
| 46 | |||
| 47 | AL = 0, BL = 3, CL = 1, DL = 2, | ||
| 48 | SIL = 6, DIL = 7, BPL = 5, SPL = 4, | ||
| 49 | AH = 0x104, BH = 0x107, CH = 0x105, DH = 0x106, | ||
| 50 | |||
| 51 | AX = 0, BX = 3, CX = 1, DX = 2, | ||
| 52 | SI = 6, DI = 7, BP = 5, SP = 4, | ||
| 53 | |||
| 54 | XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, | ||
| 55 | XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, | ||
| 56 | |||
| 57 | YMM0=0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, | ||
| 58 | YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15, | ||
| 59 | |||
| 60 | INVALID_REG = 0xFFFFFFFF | ||
| 61 | }; | ||
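Note that the enum deliberately reuses one value per architectural register (EAX, RAX, AX and AL are all 0); the operand width comes from the bits parameter of the emitter call, not from which alias is written. A small sketch, assuming the MOV emitter and R() helper declared further down in this header (RegisterAliasExample is an illustrative name):

    void RegisterAliasExample(Gen::XEmitter& code)
    {
        using namespace Gen;
        code.MOV(64, R(RAX), R(RCX));   // mov rax, rcx -- REX.W chosen because bits == 64
        code.MOV(32, R(RAX), R(RCX));   // mov eax, ecx -- same enum values, 32-bit operation
        code.MOV(8,  R(RAX), R(RCX));   // mov al, cl   -- and again as the 8-bit aliases
    }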
| 62 | |||
| 63 | enum CCFlags | ||
| 64 | { | ||
| 65 | CC_O = 0, | ||
| 66 | CC_NO = 1, | ||
| 67 | CC_B = 2, CC_C = 2, CC_NAE = 2, | ||
| 68 | CC_NB = 3, CC_NC = 3, CC_AE = 3, | ||
| 69 | CC_Z = 4, CC_E = 4, | ||
| 70 | CC_NZ = 5, CC_NE = 5, | ||
| 71 | CC_BE = 6, CC_NA = 6, | ||
| 72 | CC_NBE = 7, CC_A = 7, | ||
| 73 | CC_S = 8, | ||
| 74 | CC_NS = 9, | ||
| 75 | CC_P = 0xA, CC_PE = 0xA, | ||
| 76 | CC_NP = 0xB, CC_PO = 0xB, | ||
| 77 | CC_L = 0xC, CC_NGE = 0xC, | ||
| 78 | CC_NL = 0xD, CC_GE = 0xD, | ||
| 79 | CC_LE = 0xE, CC_NG = 0xE, | ||
| 80 | CC_NLE = 0xF, CC_G = 0xF | ||
| 81 | }; | ||
| 82 | |||
| 83 | enum | ||
| 84 | { | ||
| 85 | NUMGPRs = 16, | ||
| 86 | NUMXMMs = 16, | ||
| 87 | }; | ||
| 88 | |||
| 89 | enum | ||
| 90 | { | ||
| 91 | SCALE_NONE = 0, | ||
| 92 | SCALE_1 = 1, | ||
| 93 | SCALE_2 = 2, | ||
| 94 | SCALE_4 = 4, | ||
| 95 | SCALE_8 = 8, | ||
| 96 | SCALE_ATREG = 16, | ||
| 97 | //SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG | ||
| 98 | SCALE_NOBASE_2 = 34, | ||
| 99 | SCALE_NOBASE_4 = 36, | ||
| 100 | SCALE_NOBASE_8 = 40, | ||
| 101 | SCALE_RIP = 0xFF, | ||
| 102 | SCALE_IMM8 = 0xF0, | ||
| 103 | SCALE_IMM16 = 0xF1, | ||
| 104 | SCALE_IMM32 = 0xF2, | ||
| 105 | SCALE_IMM64 = 0xF3, | ||
| 106 | }; | ||
| 107 | |||
| 108 | enum NormalOp { | ||
| 109 | nrmADD, | ||
| 110 | nrmADC, | ||
| 111 | nrmSUB, | ||
| 112 | nrmSBB, | ||
| 113 | nrmAND, | ||
| 114 | nrmOR , | ||
| 115 | nrmXOR, | ||
| 116 | nrmMOV, | ||
| 117 | nrmTEST, | ||
| 118 | nrmCMP, | ||
| 119 | nrmXCHG, | ||
| 120 | }; | ||
| 121 | |||
| 122 | enum { | ||
| 123 | CMP_EQ = 0, | ||
| 124 | CMP_LT = 1, | ||
| 125 | CMP_LE = 2, | ||
| 126 | CMP_UNORD = 3, | ||
| 127 | CMP_NEQ = 4, | ||
| 128 | CMP_NLT = 5, | ||
| 129 | CMP_NLE = 6, | ||
| 130 | CMP_ORD = 7, | ||
| 131 | }; | ||
| 132 | |||
| 133 | enum FloatOp { | ||
| 134 | floatLD = 0, | ||
| 135 | floatST = 2, | ||
| 136 | floatSTP = 3, | ||
| 137 | floatLD80 = 5, | ||
| 138 | floatSTP80 = 7, | ||
| 139 | |||
| 140 | floatINVALID = -1, | ||
| 141 | }; | ||
| 142 | |||
| 143 | enum FloatRound { | ||
| 144 | FROUND_NEAREST = 0, | ||
| 145 | FROUND_FLOOR = 1, | ||
| 146 | FROUND_CEIL = 2, | ||
| 147 | FROUND_ZERO = 3, | ||
| 148 | FROUND_MXCSR = 4, | ||
| 149 | |||
| 150 | FROUND_RAISE_PRECISION = 0, | ||
| 151 | FROUND_IGNORE_PRECISION = 8, | ||
| 152 | }; | ||
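The FROUND_* values combine into the imm8 mode byte consumed by the SSE4.1 ROUND* emitters defined in emitter.cpp above: bits 0-1 pick the rounding mode, bit 2 defers to MXCSR, and bit 3 suppresses the precision exception. A hedged usage sketch, assuming the ROUND* declarations that appear later in this header (EmitRoundingExamples is an illustrative name):

    void EmitRoundingExamples(Gen::XEmitter& code)
    {
        using namespace Gen;
        code.ROUNDSS(XMM0, R(XMM1), FROUND_FLOOR);                            // roundss xmm0, xmm1, 1
        code.ROUNDPD(XMM2, MatR(RAX), FROUND_ZERO | FROUND_IGNORE_PRECISION); // truncate [rax], suppress precision exception
    }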
| 153 | |||
| 154 | class XEmitter; | ||
| 155 | |||
| 156 | // RIP addressing does not benefit from micro op fusion on Core arch | ||
| 157 | struct OpArg | ||
| 158 | { | ||
| 159 | OpArg() {} // dummy op arg, used for storage | ||
| 160 | OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX) | ||
| 161 | { | ||
| 162 | operandReg = 0; | ||
| 163 | scale = (u8)_scale; | ||
| 164 | offsetOrBaseReg = (u16)rmReg; | ||
| 165 | indexReg = (u16)scaledReg; | ||
| 166 | //if scale == 0 (plain register), the offset is not used | ||
| 167 | offset = _offset; | ||
| 168 | } | ||
| 169 | bool operator==(const OpArg &b) const | ||
| 170 | { | ||
| 171 | return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg && | ||
| 172 | indexReg == b.indexReg && offset == b.offset; | ||
| 173 | } | ||
| 174 | void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const; | ||
| 175 | void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const; | ||
| 176 | void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const; | ||
| 177 | void WriteFloatModRM(XEmitter *emit, FloatOp op); | ||
| 178 | void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits); | ||
| 179 | // This one is public - must be written to | ||
| 180 | u64 offset; // use RIP-relative as much as possible - 64-bit immediates are not available. | ||
| 181 | u16 operandReg; | ||
| 182 | |||
| 183 | void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const; | ||
| 184 | bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;} | ||
| 185 | bool IsSimpleReg() const {return scale == SCALE_NONE;} | ||
| 186 | bool IsSimpleReg(X64Reg reg) const | ||
| 187 | { | ||
| 188 | if (!IsSimpleReg()) | ||
| 189 | return false; | ||
| 190 | return GetSimpleReg() == reg; | ||
| 191 | } | ||
| 192 | |||
| 193 | bool CanDoOpWith(const OpArg &other) const | ||
| 194 | { | ||
| 195 | if (IsSimpleReg()) return true; | ||
| 196 | if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false; | ||
| 197 | return true; | ||
| 198 | } | ||
| 199 | |||
| 200 | int GetImmBits() const | ||
| 201 | { | ||
| 202 | switch (scale) | ||
| 203 | { | ||
| 204 | case SCALE_IMM8: return 8; | ||
| 205 | case SCALE_IMM16: return 16; | ||
| 206 | case SCALE_IMM32: return 32; | ||
| 207 | case SCALE_IMM64: return 64; | ||
| 208 | default: return -1; | ||
| 209 | } | ||
| 210 | } | ||
| 211 | |||
| 212 | void SetImmBits(int bits) { | ||
| 213 | switch (bits) | ||
| 214 | { | ||
| 215 | case 8: scale = SCALE_IMM8; break; | ||
| 216 | case 16: scale = SCALE_IMM16; break; | ||
| 217 | case 32: scale = SCALE_IMM32; break; | ||
| 218 | case 64: scale = SCALE_IMM64; break; | ||
| 219 | } | ||
| 220 | } | ||
| 221 | |||
| 222 | X64Reg GetSimpleReg() const | ||
| 223 | { | ||
| 224 | if (scale == SCALE_NONE) | ||
| 225 | return (X64Reg)offsetOrBaseReg; | ||
| 226 | else | ||
| 227 | return INVALID_REG; | ||
| 228 | } | ||
| 229 | |||
| 230 | u32 GetImmValue() const { | ||
| 231 | return (u32)offset; | ||
| 232 | } | ||
| 233 | |||
| 234 | // For loops. | ||
| 235 | void IncreaseOffset(int sz) { | ||
| 236 | offset += sz; | ||
| 237 | } | ||
| 238 | |||
| 239 | private: | ||
| 240 | u8 scale; | ||
| 241 | u16 offsetOrBaseReg; | ||
| 242 | u16 indexReg; | ||
| 243 | }; | ||
| 244 | |||
| 245 | inline OpArg M(const void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);} | ||
| 246 | template <typename T> | ||
| 247 | inline OpArg M(const T *ptr) {return OpArg((u64)(const void *)ptr, (int)SCALE_RIP);} | ||
| 248 | inline OpArg R(X64Reg value) {return OpArg(0, SCALE_NONE, value);} | ||
| 249 | inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);} | ||
| 250 | |||
| 251 | inline OpArg MDisp(X64Reg value, int offset) | ||
| 252 | { | ||
| 253 | return OpArg((u32)offset, SCALE_ATREG, value); | ||
| 254 | } | ||
| 255 | |||
| 256 | inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) | ||
| 257 | { | ||
| 258 | return OpArg(offset, scale, base, scaled); | ||
| 259 | } | ||
| 260 | |||
| 261 | inline OpArg MScaled(X64Reg scaled, int scale, int offset) | ||
| 262 | { | ||
| 263 | if (scale == SCALE_1) | ||
| 264 | return OpArg(offset, SCALE_ATREG, scaled); | ||
| 265 | else | ||
| 266 | return OpArg(offset, scale | 0x20, RAX, scaled); | ||
| 267 | } | ||
| 268 | |||
| 269 | inline OpArg MRegSum(X64Reg base, X64Reg offset) | ||
| 270 | { | ||
| 271 | return MComplex(base, offset, 1, 0); | ||
| 272 | } | ||
| 273 | |||
| 274 | inline OpArg Imm8 (u8 imm) {return OpArg(imm, SCALE_IMM8);} | ||
| 275 | inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used | ||
| 276 | inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);} | ||
| 277 | inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);} | ||
| 278 | inline OpArg UImmAuto(u32 imm) { | ||
| 279 | return OpArg(imm, imm >= 128 ? SCALE_IMM32 : SCALE_IMM8); | ||
| 280 | } | ||
| 281 | inline OpArg SImmAuto(s32 imm) { | ||
| 282 | return OpArg(imm, (imm >= 128 || imm < -128) ? SCALE_IMM32 : SCALE_IMM8); | ||
| 283 | } | ||
| 284 | |||
| 285 | #ifdef _ARCH_64 | ||
| 286 | inline OpArg ImmPtr(const void* imm) {return Imm64((u64)imm);} | ||
| 287 | #else | ||
| 288 | inline OpArg ImmPtr(const void* imm) {return Imm32((u32)imm);} | ||
| 289 | #endif | ||
| 290 | |||
| 291 | inline u32 PtrOffset(const void* ptr, const void* base) | ||
| 292 | { | ||
| 293 | #ifdef _ARCH_64 | ||
| 294 | s64 distance = (s64)ptr-(s64)base; | ||
| 295 | if (distance >= 0x80000000LL || | ||
| 296 | distance < -0x80000000LL) | ||
| 297 | { | ||
| 298 | ASSERT_MSG(0, "pointer offset out of range"); | ||
| 299 | return 0; | ||
| 300 | } | ||
| 301 | |||
| 302 | return (u32)distance; | ||
| 303 | #else | ||
| 304 | return (u32)ptr-(u32)base; | ||
| 305 | #endif | ||
| 306 | } | ||
| 307 | |||
| 308 | //usage: int a[]; ARRAY_OFFSET(a,10) | ||
| 309 | #define ARRAY_OFFSET(array,index) ((u32)((u64)&(array)[index]-(u64)&(array)[0])) | ||
| 310 | //usage: struct {int e;} s; STRUCT_OFFSET(s,e) | ||
| 311 | #define STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str))) | ||
| 312 | |||
| 313 | struct FixupBranch | ||
| 314 | { | ||
| 315 | u8 *ptr; | ||
| 316 | int type; // 0 = 8-bit, 1 = 32-bit | ||
| 317 | }; | ||
| 318 | |||
| 319 | enum SSECompare | ||
| 320 | { | ||
| 321 | EQ = 0, | ||
| 322 | LT, | ||
| 323 | LE, | ||
| 324 | UNORD, | ||
| 325 | NEQ, | ||
| 326 | NLT, | ||
| 327 | NLE, | ||
| 328 | ORD, | ||
| 329 | }; | ||
| 330 | |||
| 331 | typedef const u8* JumpTarget; | ||
| 332 | |||
| 333 | class XEmitter | ||
| 334 | { | ||
| 335 | friend struct OpArg; // for Write8 etc | ||
| 336 | private: | ||
| 337 | u8 *code; | ||
| 338 | bool flags_locked; | ||
| 339 | |||
| 340 | void CheckFlags(); | ||
| 341 | |||
| 342 | void Rex(int w, int r, int x, int b); | ||
| 343 | void WriteSimple1Byte(int bits, u8 byte, X64Reg reg); | ||
| 344 | void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg); | ||
| 345 | void WriteMulDivType(int bits, OpArg src, int ext); | ||
| 346 | void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false); | ||
| 347 | void WriteShift(int bits, OpArg dest, OpArg &shift, int ext); | ||
| 348 | void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext); | ||
| 349 | void WriteMXCSR(OpArg arg, int ext); | ||
| 350 | void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); | ||
| 351 | void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); | ||
| 352 | void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); | ||
| 353 | void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); | ||
| 354 | void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); | ||
| 355 | void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); | ||
| 356 | void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); | ||
| 357 | void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); | ||
| 358 | void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg); | ||
| 359 | void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2); | ||
| 360 | |||
| 361 | void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); | ||
| 362 | |||
| 363 | protected: | ||
| 364 | inline void Write8(u8 value) {*code++ = value;} | ||
| 365 | inline void Write16(u16 value) {*(u16*)code = (value); code += 2;} | ||
| 366 | inline void Write32(u32 value) {*(u32*)code = (value); code += 4;} | ||
| 367 | inline void Write64(u64 value) {*(u64*)code = (value); code += 8;} | ||
| 368 | |||
| 369 | public: | ||
| 370 | XEmitter() { code = nullptr; flags_locked = false; } | ||
| 371 | XEmitter(u8 *code_ptr) { code = code_ptr; flags_locked = false; } | ||
| 372 | virtual ~XEmitter() {} | ||
| 373 | |||
| 374 | void WriteModRM(int mod, int rm, int reg); | ||
| 375 | void WriteSIB(int scale, int index, int base); | ||
| 376 | |||
| 377 | void SetCodePtr(u8 *ptr); | ||
| 378 | void ReserveCodeSpace(int bytes); | ||
| 379 | const u8 *AlignCode4(); | ||
| 380 | const u8 *AlignCode16(); | ||
| 381 | const u8 *AlignCodePage(); | ||
| 382 | const u8 *GetCodePtr() const; | ||
| 383 | u8 *GetWritableCodePtr(); | ||
| 384 | |||
| 385 | void LockFlags() { flags_locked = true; } | ||
| 386 | void UnlockFlags() { flags_locked = false; } | ||
| 387 | |||
| 388 | // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPUs: | ||
| 389 | // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD and other string instructions. | ||
| 390 | // INC and DEC are slow on Intel Core, but not on AMD. They create a | ||
| 391 | // false flag dependency because they only update a subset of the flags. | ||
| 392 | // XCHG is SLOW and should be avoided. | ||
| 393 | |||
| 394 | // Debug breakpoint | ||
| 395 | void INT3(); | ||
| 396 | |||
| 397 | // Do nothing | ||
| 398 | void NOP(size_t count = 1); | ||
| 399 | |||
| 400 | // Save energy in wait-loops on P4 only. Probably not too useful. | ||
| 401 | void PAUSE(); | ||
| 402 | |||
| 403 | // Flag control | ||
| 404 | void STC(); | ||
| 405 | void CLC(); | ||
| 406 | void CMC(); | ||
| 407 | |||
| 408 | // These two cannot be executed in 64-bit mode on early Intel 64-bit CPUs, only on Core 2 and AMD! | ||
| 409 | void LAHF(); // 3 cycle vector path | ||
| 410 | void SAHF(); // direct path fast | ||
| 411 | |||
| 412 | |||
| 413 | // Stack control | ||
| 414 | void PUSH(X64Reg reg); | ||
| 415 | void POP(X64Reg reg); | ||
| 416 | void PUSH(int bits, const OpArg ®); | ||
| 417 | void POP(int bits, const OpArg ®); | ||
| 418 | void PUSHF(); | ||
| 419 | void POPF(); | ||
| 420 | |||
| 421 | // Flow control | ||
| 422 | void RET(); | ||
| 423 | void RET_FAST(); | ||
| 424 | void UD2(); | ||
| 425 | FixupBranch J(bool force5bytes = false); | ||
| 426 | |||
| 427 | void JMP(const u8 * addr, bool force5Bytes = false); | ||
| 428 | void JMP(OpArg arg); | ||
| 429 | void JMPptr(const OpArg &arg); | ||
| 430 | void JMPself(); //infinite loop! | ||
| 431 | #ifdef CALL | ||
| 432 | #undef CALL | ||
| 433 | #endif | ||
| 434 | void CALL(const void *fnptr); | ||
| 435 | void CALLptr(OpArg arg); | ||
| 436 | |||
| 437 | FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false); | ||
| 438 | //void J_CC(CCFlags conditionCode, JumpTarget target); | ||
| 439 | void J_CC(CCFlags conditionCode, const u8 * addr, bool force5Bytes = false); | ||
| 440 | |||
| 441 | void SetJumpTarget(const FixupBranch &branch); | ||
| 442 | |||
| 443 | void SETcc(CCFlags flag, OpArg dest); | ||
| 444 | // Note: CMOV brings little if any benefit on current CPUs. | ||
| 445 | void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag); | ||
| 446 | |||
| 447 | // Fences | ||
| 448 | void LFENCE(); | ||
| 449 | void MFENCE(); | ||
| 450 | void SFENCE(); | ||
| 451 | |||
| 452 | // Bit scan | ||
| 453 | void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit | ||
| 454 | void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit | ||
| 455 | |||
| 456 | // Cache control | ||
| 457 | enum PrefetchLevel | ||
| 458 | { | ||
| 459 | PF_NTA, //Non-temporal (data used once and only once) | ||
| 460 | PF_T0, //All cache levels | ||
| 461 | PF_T1, //Levels 2+ (aliased to T0 on AMD) | ||
| 462 | PF_T2, //Levels 3+ (aliased to T0 on AMD) | ||
| 463 | }; | ||
| 464 | void PREFETCH(PrefetchLevel level, OpArg arg); | ||
| 465 | void MOVNTI(int bits, OpArg dest, X64Reg src); | ||
| 466 | void MOVNTDQ(OpArg arg, X64Reg regOp); | ||
| 467 | void MOVNTPS(OpArg arg, X64Reg regOp); | ||
| 468 | void MOVNTPD(OpArg arg, X64Reg regOp); | ||
| 469 | |||
| 470 | // Multiplication / division | ||
| 471 | void MUL(int bits, OpArg src); //UNSIGNED | ||
| 472 | void IMUL(int bits, OpArg src); //SIGNED | ||
| 473 | void IMUL(int bits, X64Reg regOp, OpArg src); | ||
| 474 | void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm); | ||
| 475 | void DIV(int bits, OpArg src); | ||
| 476 | void IDIV(int bits, OpArg src); | ||
| 477 | |||
| 478 | // Shift | ||
| 479 | void ROL(int bits, OpArg dest, OpArg shift); | ||
| 480 | void ROR(int bits, OpArg dest, OpArg shift); | ||
| 481 | void RCL(int bits, OpArg dest, OpArg shift); | ||
| 482 | void RCR(int bits, OpArg dest, OpArg shift); | ||
| 483 | void SHL(int bits, OpArg dest, OpArg shift); | ||
| 484 | void SHR(int bits, OpArg dest, OpArg shift); | ||
| 485 | void SAR(int bits, OpArg dest, OpArg shift); | ||
| 486 | |||
| 487 | // Bit Test | ||
| 488 | void BT(int bits, OpArg dest, OpArg index); | ||
| 489 | void BTS(int bits, OpArg dest, OpArg index); | ||
| 490 | void BTR(int bits, OpArg dest, OpArg index); | ||
| 491 | void BTC(int bits, OpArg dest, OpArg index); | ||
| 492 | |||
| 493 | // Double-Precision Shift | ||
| 494 | void SHRD(int bits, OpArg dest, OpArg src, OpArg shift); | ||
| 495 | void SHLD(int bits, OpArg dest, OpArg src, OpArg shift); | ||
| 496 | |||
| 497 | // Extend EAX into EDX in various ways | ||
| 498 | void CWD(int bits = 16); | ||
| 499 | inline void CDQ() {CWD(32);} | ||
| 500 | inline void CQO() {CWD(64);} | ||
| 501 | void CBW(int bits = 8); | ||
| 502 | inline void CWDE() {CBW(16);} | ||
| 503 | inline void CDQE() {CBW(32);} | ||
| 504 | |||
| 505 | // Load effective address | ||
| 506 | void LEA(int bits, X64Reg dest, OpArg src); | ||
| 507 | |||
| 508 | // Integer arithmetic | ||
| 509 | void NEG (int bits, OpArg src); | ||
| 510 | void ADD (int bits, const OpArg &a1, const OpArg &a2); | ||
| 511 | void ADC (int bits, const OpArg &a1, const OpArg &a2); | ||
| 512 | void SUB (int bits, const OpArg &a1, const OpArg &a2); | ||
| 513 | void SBB (int bits, const OpArg &a1, const OpArg &a2); | ||
| 514 | void AND (int bits, const OpArg &a1, const OpArg &a2); | ||
| 515 | void CMP (int bits, const OpArg &a1, const OpArg &a2); | ||
| 516 | |||
| 517 | // Bit operations | ||
| 518 | void NOT (int bits, OpArg src); | ||
| 519 | void OR (int bits, const OpArg &a1, const OpArg &a2); | ||
| 520 | void XOR (int bits, const OpArg &a1, const OpArg &a2); | ||
| 521 | void MOV (int bits, const OpArg &a1, const OpArg &a2); | ||
| 522 | void TEST(int bits, const OpArg &a1, const OpArg &a2); | ||
| 523 | |||
| 524 | // Are these useful at all? Consider removing. | ||
| 525 | void XCHG(int bits, const OpArg &a1, const OpArg &a2); | ||
| 526 | void XCHG_AHAL(); | ||
| 527 | |||
| 528 | // Byte swapping (32 and 64-bit only). | ||
| 529 | void BSWAP(int bits, X64Reg reg); | ||
| 530 | |||
| 531 | // Sign/zero extension | ||
| 532 | void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary | ||
| 533 | void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src); | ||
| 534 | |||
| 535 | // Available only on Atom or >= Haswell so far. Test with GetCPUCaps().movbe. | ||
| 536 | void MOVBE(int dbits, const OpArg& dest, const OpArg& src); | ||
| 537 | |||
| 538 | // Available only on AMD >= Phenom or Intel >= Haswell | ||
| 539 | void LZCNT(int bits, X64Reg dest, OpArg src); | ||
| 540 | // Note: this one is actually part of BMI1 | ||
| 541 | void TZCNT(int bits, X64Reg dest, OpArg src); | ||
| 542 | |||
| 543 | // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64) | ||
| 544 | void STMXCSR(OpArg memloc); | ||
| 545 | void LDMXCSR(OpArg memloc); | ||
| 546 | |||
| 547 | // Prefixes | ||
| 548 | void LOCK(); | ||
| 549 | void REP(); | ||
| 550 | void REPNE(); | ||
| 551 | void FSOverride(); | ||
| 552 | void GSOverride(); | ||
| 553 | |||
| 554 | // x87 | ||
| 555 | enum x87StatusWordBits { | ||
| 556 | x87_InvalidOperation = 0x1, | ||
| 557 | x87_DenormalizedOperand = 0x2, | ||
| 558 | x87_DivisionByZero = 0x4, | ||
| 559 | x87_Overflow = 0x8, | ||
| 560 | x87_Underflow = 0x10, | ||
| 561 | x87_Precision = 0x20, | ||
| 562 | x87_StackFault = 0x40, | ||
| 563 | x87_ErrorSummary = 0x80, | ||
| 564 | x87_C0 = 0x100, | ||
| 565 | x87_C1 = 0x200, | ||
| 566 | x87_C2 = 0x400, | ||
| 567 | x87_TopOfStack = 0x2000 | 0x1000 | 0x800, | ||
| 568 | x87_C3 = 0x4000, | ||
| 569 | x87_FPUBusy = 0x8000, | ||
| 570 | }; | ||
| 571 | |||
| 572 | void FLD(int bits, OpArg src); | ||
| 573 | void FST(int bits, OpArg dest); | ||
| 574 | void FSTP(int bits, OpArg dest); | ||
| 575 | void FNSTSW_AX(); | ||
| 576 | void FWAIT(); | ||
| 577 | |||
| 578 | // SSE/SSE2: Floating point arithmetic | ||
| 579 | void ADDSS(X64Reg regOp, OpArg arg); | ||
| 580 | void ADDSD(X64Reg regOp, OpArg arg); | ||
| 581 | void SUBSS(X64Reg regOp, OpArg arg); | ||
| 582 | void SUBSD(X64Reg regOp, OpArg arg); | ||
| 583 | void MULSS(X64Reg regOp, OpArg arg); | ||
| 584 | void MULSD(X64Reg regOp, OpArg arg); | ||
| 585 | void DIVSS(X64Reg regOp, OpArg arg); | ||
| 586 | void DIVSD(X64Reg regOp, OpArg arg); | ||
| 587 | void MINSS(X64Reg regOp, OpArg arg); | ||
| 588 | void MINSD(X64Reg regOp, OpArg arg); | ||
| 589 | void MAXSS(X64Reg regOp, OpArg arg); | ||
| 590 | void MAXSD(X64Reg regOp, OpArg arg); | ||
| 591 | void SQRTSS(X64Reg regOp, OpArg arg); | ||
| 592 | void SQRTSD(X64Reg regOp, OpArg arg); | ||
| 593 | void RSQRTSS(X64Reg regOp, OpArg arg); | ||
| 594 | |||
| 595 | // SSE/SSE2: Floating point compares (the result is an all-ones/all-zeros bit mask) | ||
| 596 | void CMPSS(X64Reg regOp, OpArg arg, u8 compare); | ||
| 597 | void CMPSD(X64Reg regOp, OpArg arg, u8 compare); | ||
| 598 | |||
| 599 | inline void CMPEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_EQ); } | ||
| 600 | inline void CMPLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LT); } | ||
| 601 | inline void CMPLESS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LE); } | ||
| 602 | inline void CMPUNORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_UNORD); } | ||
| 603 | inline void CMPNEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NEQ); } | ||
| 604 | inline void CMPNLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NLT); } | ||
| 605 | inline void CMPORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_ORD); } | ||
| 606 | |||
| 607 | // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double) | ||
| 608 | void ADDPS(X64Reg regOp, OpArg arg); | ||
| 609 | void ADDPD(X64Reg regOp, OpArg arg); | ||
| 610 | void SUBPS(X64Reg regOp, OpArg arg); | ||
| 611 | void SUBPD(X64Reg regOp, OpArg arg); | ||
| 612 | void CMPPS(X64Reg regOp, OpArg arg, u8 compare); | ||
| 613 | void CMPPD(X64Reg regOp, OpArg arg, u8 compare); | ||
| 614 | void MULPS(X64Reg regOp, OpArg arg); | ||
| 615 | void MULPD(X64Reg regOp, OpArg arg); | ||
| 616 | void DIVPS(X64Reg regOp, OpArg arg); | ||
| 617 | void DIVPD(X64Reg regOp, OpArg arg); | ||
| 618 | void MINPS(X64Reg regOp, OpArg arg); | ||
| 619 | void MINPD(X64Reg regOp, OpArg arg); | ||
| 620 | void MAXPS(X64Reg regOp, OpArg arg); | ||
| 621 | void MAXPD(X64Reg regOp, OpArg arg); | ||
| 622 | void SQRTPS(X64Reg regOp, OpArg arg); | ||
| 623 | void SQRTPD(X64Reg regOp, OpArg arg); | ||
| 624 | void RCPPS(X64Reg regOp, OpArg arg); | ||
| 625 | void RSQRTPS(X64Reg regOp, OpArg arg); | ||
| 626 | |||
| 627 | // SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double) | ||
| 628 | void ANDPS(X64Reg regOp, OpArg arg); | ||
| 629 | void ANDPD(X64Reg regOp, OpArg arg); | ||
| 630 | void ANDNPS(X64Reg regOp, OpArg arg); | ||
| 631 | void ANDNPD(X64Reg regOp, OpArg arg); | ||
| 632 | void ORPS(X64Reg regOp, OpArg arg); | ||
| 633 | void ORPD(X64Reg regOp, OpArg arg); | ||
| 634 | void XORPS(X64Reg regOp, OpArg arg); | ||
| 635 | void XORPD(X64Reg regOp, OpArg arg); | ||
| 636 | |||
| 637 | // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation. | ||
| 638 | void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle); | ||
| 639 | void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle); | ||
| 640 | |||
| 641 | // SSE/SSE2: Useful alternative to shuffle in some cases. | ||
| 642 | void MOVDDUP(X64Reg regOp, OpArg arg); | ||
| 643 | |||
| 644 | // TODO: Actually implement | ||
| 645 | #if 0 | ||
| 646 | // SSE3: Horizontal operations in SIMD registers. Could be useful for various VFPU things like dot products... | ||
| 647 | void ADDSUBPS(X64Reg dest, OpArg src); | ||
| 648 | void ADDSUBPD(X64Reg dest, OpArg src); | ||
| 649 | void HADDPD(X64Reg dest, OpArg src); | ||
| 650 | void HSUBPS(X64Reg dest, OpArg src); | ||
| 651 | void HSUBPD(X64Reg dest, OpArg src); | ||
| 652 | |||
| 653 | // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask". | ||
| 654 | void DPPD(X64Reg dest, OpArg src, u8 arg); | ||
| 655 | |||
| 656 | // These are probably useful for VFPU emulation. | ||
| 657 | void INSERTPS(X64Reg dest, OpArg src, u8 arg); | ||
| 658 | void EXTRACTPS(OpArg dest, X64Reg src, u8 arg); | ||
| 659 | #endif | ||
| 660 | |||
| 661 | // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy Bridge. | ||
| 662 | void HADDPS(X64Reg dest, OpArg src); | ||
| 663 | |||
| 664 | // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask". | ||
| 665 | void DPPS(X64Reg dest, OpArg src, u8 arg); | ||
| 666 | |||
| 667 | void UNPCKLPS(X64Reg dest, OpArg src); | ||
| 668 | void UNPCKHPS(X64Reg dest, OpArg src); | ||
| 669 | void UNPCKLPD(X64Reg dest, OpArg src); | ||
| 670 | void UNPCKHPD(X64Reg dest, OpArg src); | ||
| 671 | |||
| 672 | // SSE/SSE2: Compares. | ||
| 673 | void COMISS(X64Reg regOp, OpArg arg); | ||
| 674 | void COMISD(X64Reg regOp, OpArg arg); | ||
| 675 | void UCOMISS(X64Reg regOp, OpArg arg); | ||
| 676 | void UCOMISD(X64Reg regOp, OpArg arg); | ||
| 677 | |||
| 678 | // SSE/SSE2: Moves. Use the right data type for your data, in most cases. | ||
| 679 | void MOVAPS(X64Reg regOp, OpArg arg); | ||
| 680 | void MOVAPD(X64Reg regOp, OpArg arg); | ||
| 681 | void MOVAPS(OpArg arg, X64Reg regOp); | ||
| 682 | void MOVAPD(OpArg arg, X64Reg regOp); | ||
| 683 | |||
| 684 | void MOVUPS(X64Reg regOp, OpArg arg); | ||
| 685 | void MOVUPD(X64Reg regOp, OpArg arg); | ||
| 686 | void MOVUPS(OpArg arg, X64Reg regOp); | ||
| 687 | void MOVUPD(OpArg arg, X64Reg regOp); | ||
| 688 | |||
| 689 | void MOVDQA(X64Reg regOp, OpArg arg); | ||
| 690 | void MOVDQA(OpArg arg, X64Reg regOp); | ||
| 691 | void MOVDQU(X64Reg regOp, OpArg arg); | ||
| 692 | void MOVDQU(OpArg arg, X64Reg regOp); | ||
| 693 | |||
| 694 | void MOVSS(X64Reg regOp, OpArg arg); | ||
| 695 | void MOVSD(X64Reg regOp, OpArg arg); | ||
| 696 | void MOVSS(OpArg arg, X64Reg regOp); | ||
| 697 | void MOVSD(OpArg arg, X64Reg regOp); | ||
| 698 | |||
| 699 | void MOVLPS(X64Reg regOp, OpArg arg); | ||
| 700 | void MOVLPD(X64Reg regOp, OpArg arg); | ||
| 701 | void MOVLPS(OpArg arg, X64Reg regOp); | ||
| 702 | void MOVLPD(OpArg arg, X64Reg regOp); | ||
| 703 | |||
| 704 | void MOVHPS(X64Reg regOp, OpArg arg); | ||
| 705 | void MOVHPD(X64Reg regOp, OpArg arg); | ||
| 706 | void MOVHPS(OpArg arg, X64Reg regOp); | ||
| 707 | void MOVHPD(OpArg arg, X64Reg regOp); | ||
| 708 | |||
| 709 | void MOVHLPS(X64Reg regOp1, X64Reg regOp2); | ||
| 710 | void MOVLHPS(X64Reg regOp1, X64Reg regOp2); | ||
| 711 | |||
| 712 | void MOVD_xmm(X64Reg dest, const OpArg &arg); | ||
| 713 | void MOVQ_xmm(X64Reg dest, OpArg arg); | ||
| 714 | void MOVD_xmm(const OpArg &arg, X64Reg src); | ||
| 715 | void MOVQ_xmm(OpArg arg, X64Reg src); | ||
| 716 | |||
| 717 | // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question. | ||
| 718 | void MOVMSKPS(X64Reg dest, OpArg arg); | ||
| 719 | void MOVMSKPD(X64Reg dest, OpArg arg); | ||
| 720 | |||
| 721 | // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one. | ||
| 722 | void MASKMOVDQU(X64Reg dest, X64Reg src); | ||
| 723 | void LDDQU(X64Reg dest, OpArg src); | ||
| 724 | |||
| 725 | // SSE/SSE2: Data type conversions. | ||
| 726 | void CVTPS2PD(X64Reg dest, OpArg src); | ||
| 727 | void CVTPD2PS(X64Reg dest, OpArg src); | ||
| 728 | void CVTSS2SD(X64Reg dest, OpArg src); | ||
| 729 | void CVTSI2SS(X64Reg dest, OpArg src); | ||
| 730 | void CVTSD2SS(X64Reg dest, OpArg src); | ||
| 731 | void CVTSI2SD(X64Reg dest, OpArg src); | ||
| 732 | void CVTDQ2PD(X64Reg regOp, OpArg arg); | ||
| 733 | void CVTPD2DQ(X64Reg regOp, OpArg arg); | ||
| 734 | void CVTDQ2PS(X64Reg regOp, OpArg arg); | ||
| 735 | void CVTPS2DQ(X64Reg regOp, OpArg arg); | ||
| 736 | |||
| 737 | void CVTTPS2DQ(X64Reg regOp, OpArg arg); | ||
| 738 | void CVTTPD2DQ(X64Reg regOp, OpArg arg); | ||
| 739 | |||
| 740 | // Destinations are X64 regs (rax, rbx, ...) for these instructions. | ||
| 741 | void CVTSS2SI(X64Reg xregdest, OpArg src); | ||
| 742 | void CVTSD2SI(X64Reg xregdest, OpArg src); | ||
| 743 | void CVTTSS2SI(X64Reg xregdest, OpArg arg); | ||
| 744 | void CVTTSD2SI(X64Reg xregdest, OpArg arg); | ||
| 745 | |||
| 746 | // SSE2: Packed integer instructions | ||
| 747 | void PACKSSDW(X64Reg dest, OpArg arg); | ||
| 748 | void PACKSSWB(X64Reg dest, OpArg arg); | ||
| 749 | void PACKUSDW(X64Reg dest, OpArg arg); | ||
| 750 | void PACKUSWB(X64Reg dest, OpArg arg); | ||
| 751 | |||
| 752 | void PUNPCKLBW(X64Reg dest, const OpArg &arg); | ||
| 753 | void PUNPCKLWD(X64Reg dest, const OpArg &arg); | ||
| 754 | void PUNPCKLDQ(X64Reg dest, const OpArg &arg); | ||
| 755 | void PUNPCKLQDQ(X64Reg dest, const OpArg &arg); | ||
| 756 | |||
| 757 | void PTEST(X64Reg dest, OpArg arg); | ||
| 758 | void PAND(X64Reg dest, OpArg arg); | ||
| 759 | void PANDN(X64Reg dest, OpArg arg); | ||
| 760 | void PXOR(X64Reg dest, OpArg arg); | ||
| 761 | void POR(X64Reg dest, OpArg arg); | ||
| 762 | |||
| 763 | void PADDB(X64Reg dest, OpArg arg); | ||
| 764 | void PADDW(X64Reg dest, OpArg arg); | ||
| 765 | void PADDD(X64Reg dest, OpArg arg); | ||
| 766 | void PADDQ(X64Reg dest, OpArg arg); | ||
| 767 | |||
| 768 | void PADDSB(X64Reg dest, OpArg arg); | ||
| 769 | void PADDSW(X64Reg dest, OpArg arg); | ||
| 770 | void PADDUSB(X64Reg dest, OpArg arg); | ||
| 771 | void PADDUSW(X64Reg dest, OpArg arg); | ||
| 772 | |||
| 773 | void PSUBB(X64Reg dest, OpArg arg); | ||
| 774 | void PSUBW(X64Reg dest, OpArg arg); | ||
| 775 | void PSUBD(X64Reg dest, OpArg arg); | ||
| 776 | void PSUBQ(X64Reg dest, OpArg arg); | ||
| 777 | |||
| 778 | void PSUBSB(X64Reg dest, OpArg arg); | ||
| 779 | void PSUBSW(X64Reg dest, OpArg arg); | ||
| 780 | void PSUBUSB(X64Reg dest, OpArg arg); | ||
| 781 | void PSUBUSW(X64Reg dest, OpArg arg); | ||
| 782 | |||
| 783 | void PAVGB(X64Reg dest, OpArg arg); | ||
| 784 | void PAVGW(X64Reg dest, OpArg arg); | ||
| 785 | |||
| 786 | void PCMPEQB(X64Reg dest, OpArg arg); | ||
| 787 | void PCMPEQW(X64Reg dest, OpArg arg); | ||
| 788 | void PCMPEQD(X64Reg dest, OpArg arg); | ||
| 789 | |||
| 790 | void PCMPGTB(X64Reg dest, OpArg arg); | ||
| 791 | void PCMPGTW(X64Reg dest, OpArg arg); | ||
| 792 | void PCMPGTD(X64Reg dest, OpArg arg); | ||
| 793 | |||
| 794 | void PEXTRW(X64Reg dest, OpArg arg, u8 subreg); | ||
| 795 | void PINSRW(X64Reg dest, OpArg arg, u8 subreg); | ||
| 796 | |||
| 797 | void PMADDWD(X64Reg dest, OpArg arg); | ||
| 798 | void PSADBW(X64Reg dest, OpArg arg); | ||
| 799 | |||
| 800 | void PMAXSW(X64Reg dest, OpArg arg); | ||
| 801 | void PMAXUB(X64Reg dest, OpArg arg); | ||
| 802 | void PMINSW(X64Reg dest, OpArg arg); | ||
| 803 | void PMINUB(X64Reg dest, OpArg arg); | ||
| 804 | // SSE4: More MAX/MIN instructions. | ||
| 805 | void PMINSB(X64Reg dest, OpArg arg); | ||
| 806 | void PMINSD(X64Reg dest, OpArg arg); | ||
| 807 | void PMINUW(X64Reg dest, OpArg arg); | ||
| 808 | void PMINUD(X64Reg dest, OpArg arg); | ||
| 809 | void PMAXSB(X64Reg dest, OpArg arg); | ||
| 810 | void PMAXSD(X64Reg dest, OpArg arg); | ||
| 811 | void PMAXUW(X64Reg dest, OpArg arg); | ||
| 812 | void PMAXUD(X64Reg dest, OpArg arg); | ||
| 813 | |||
| 814 | void PMOVMSKB(X64Reg dest, OpArg arg); | ||
| 815 | void PSHUFD(X64Reg dest, OpArg arg, u8 shuffle); | ||
| 816 | void PSHUFB(X64Reg dest, OpArg arg); | ||
| 817 | |||
| 818 | void PSHUFLW(X64Reg dest, OpArg arg, u8 shuffle); | ||
| 819 | void PSHUFHW(X64Reg dest, OpArg arg, u8 shuffle); | ||
| 820 | |||
| 821 | void PSRLW(X64Reg reg, int shift); | ||
| 822 | void PSRLD(X64Reg reg, int shift); | ||
| 823 | void PSRLQ(X64Reg reg, int shift); | ||
| 824 | void PSRLQ(X64Reg reg, OpArg arg); | ||
| 825 | void PSRLDQ(X64Reg reg, int shift); | ||
| 826 | |||
| 827 | void PSLLW(X64Reg reg, int shift); | ||
| 828 | void PSLLD(X64Reg reg, int shift); | ||
| 829 | void PSLLQ(X64Reg reg, int shift); | ||
| 830 | void PSLLDQ(X64Reg reg, int shift); | ||
| 831 | |||
| 832 | void PSRAW(X64Reg reg, int shift); | ||
| 833 | void PSRAD(X64Reg reg, int shift); | ||
| 834 | |||
| 835 | // SSE4: data type conversions | ||
| 836 | void PMOVSXBW(X64Reg dest, OpArg arg); | ||
| 837 | void PMOVSXBD(X64Reg dest, OpArg arg); | ||
| 838 | void PMOVSXBQ(X64Reg dest, OpArg arg); | ||
| 839 | void PMOVSXWD(X64Reg dest, OpArg arg); | ||
| 840 | void PMOVSXWQ(X64Reg dest, OpArg arg); | ||
| 841 | void PMOVSXDQ(X64Reg dest, OpArg arg); | ||
| 842 | void PMOVZXBW(X64Reg dest, OpArg arg); | ||
| 843 | void PMOVZXBD(X64Reg dest, OpArg arg); | ||
| 844 | void PMOVZXBQ(X64Reg dest, OpArg arg); | ||
| 845 | void PMOVZXWD(X64Reg dest, OpArg arg); | ||
| 846 | void PMOVZXWQ(X64Reg dest, OpArg arg); | ||
| 847 | void PMOVZXDQ(X64Reg dest, OpArg arg); | ||
| 848 | |||
| 849 | // SSE4: variable blend instructions (xmm0 implicit argument) | ||
| 850 | void PBLENDVB(X64Reg dest, OpArg arg); | ||
| 851 | void BLENDVPS(X64Reg dest, OpArg arg); | ||
| 852 | void BLENDVPD(X64Reg dest, OpArg arg); | ||
| 853 | void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend); | ||
| 854 | void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend); | ||
| 855 | |||
| 856 | // SSE4: rounding (see FloatRound for the mode, or use the ROUNDNEARSS etc. helpers.) | ||
| 857 | void ROUNDSS(X64Reg dest, OpArg arg, u8 mode); | ||
| 858 | void ROUNDSD(X64Reg dest, OpArg arg, u8 mode); | ||
| 859 | void ROUNDPS(X64Reg dest, OpArg arg, u8 mode); | ||
| 860 | void ROUNDPD(X64Reg dest, OpArg arg, u8 mode); | ||
| 861 | |||
| 862 | inline void ROUNDNEARSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_NEAREST); } | ||
| 863 | inline void ROUNDFLOORSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_FLOOR); } | ||
| 864 | inline void ROUNDCEILSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_CEIL); } | ||
| 865 | inline void ROUNDZEROSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_ZERO); } | ||
| 866 | |||
| 867 | inline void ROUNDNEARSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_NEAREST); } | ||
| 868 | inline void ROUNDFLOORSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_FLOOR); } | ||
| 869 | inline void ROUNDCEILSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_CEIL); } | ||
| 870 | inline void ROUNDZEROSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_ZERO); } | ||
| 871 | |||
| 872 | inline void ROUNDNEARPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_NEAREST); } | ||
| 873 | inline void ROUNDFLOORPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_FLOOR); } | ||
| 874 | inline void ROUNDCEILPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_CEIL); } | ||
| 875 | inline void ROUNDZEROPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_ZERO); } | ||
| 876 | |||
| 877 | inline void ROUNDNEARPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_NEAREST); } | ||
| 878 | inline void ROUNDFLOORPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_FLOOR); } | ||
| 879 | inline void ROUNDCEILPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_CEIL); } | ||
| 880 | inline void ROUNDZEROPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_ZERO); } | ||
| 881 | |||
| 882 | // AVX | ||
| 883 | void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 884 | void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 885 | void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 886 | void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 887 | void VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 888 | void VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 889 | void VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 890 | void VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 891 | void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 892 | void VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle); | ||
| 893 | void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 894 | void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 895 | |||
| 896 | void VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 897 | void VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 898 | void VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 899 | void VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 900 | void VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 901 | void VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 902 | void VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 903 | void VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 904 | |||
| 905 | void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 906 | void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 907 | void VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 908 | void VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 909 | |||
| 910 | // FMA3 | ||
| 911 | void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 912 | void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 913 | void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 914 | void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 915 | void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 916 | void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 917 | void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 918 | void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 919 | void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 920 | void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 921 | void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 922 | void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 923 | void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 924 | void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 925 | void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 926 | void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 927 | void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 928 | void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 929 | void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 930 | void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 931 | void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 932 | void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 933 | void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 934 | void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 935 | void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 936 | void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 937 | void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 938 | void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 939 | void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 940 | void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 941 | void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 942 | void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 943 | void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 944 | void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 945 | void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 946 | void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 947 | void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 948 | void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 949 | void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 950 | void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 951 | void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 952 | void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 953 | void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 954 | void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 955 | void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 956 | void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 957 | void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 958 | void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 959 | void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 960 | void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 961 | void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 962 | void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 963 | void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 964 | void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 965 | void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 966 | void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 967 | void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 968 | void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 969 | void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 970 | void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 971 | |||
| 972 | // VEX GPR instructions | ||
| 973 | void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); | ||
| 974 | void SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); | ||
| 975 | void SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); | ||
| 976 | void RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate); | ||
| 977 | void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 978 | void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 979 | void MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 980 | void BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); | ||
| 981 | void BLSR(int bits, X64Reg regOp, OpArg arg); | ||
| 982 | void BLSMSK(int bits, X64Reg regOp, OpArg arg); | ||
| 983 | void BLSI(int bits, X64Reg regOp, OpArg arg); | ||
| 984 | void BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); | ||
| 985 | void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); | ||
| 986 | |||
| 987 | void RDTSC(); | ||
| 988 | |||
| 989 | // Utility functions | ||
| 990 | // The difference between this and CALL is that this aligns the stack | ||
| 991 | // where appropriate. | ||
| 992 | void ABI_CallFunction(const void *func); | ||
| 993 | template <typename T> | ||
| 994 | void ABI_CallFunction(T (*func)()) { | ||
| 995 | ABI_CallFunction((const void *)func); | ||
| 996 | } | ||
| 997 | |||
| 998 | void ABI_CallFunction(const u8 *func) { | ||
| 999 | ABI_CallFunction((const void *)func); | ||
| 1000 | } | ||
| 1001 | void ABI_CallFunctionC16(const void *func, u16 param1); | ||
| 1002 | void ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2); | ||
| 1003 | |||
| 1004 | |||
| 1005 | // These only support u32 parameters, but that's enough for a lot of uses. | ||
| 1006 | // These will destroy the first one or two "parameter regs". | ||
| 1007 | void ABI_CallFunctionC(const void *func, u32 param1); | ||
| 1008 | void ABI_CallFunctionCC(const void *func, u32 param1, u32 param2); | ||
| 1009 | void ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3); | ||
| 1010 | void ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3); | ||
| 1011 | void ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4); | ||
| 1012 | void ABI_CallFunctionP(const void *func, void *param1); | ||
| 1013 | void ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2); | ||
| 1014 | void ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3); | ||
| 1015 | void ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3); | ||
| 1016 | void ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2); | ||
| 1017 | void ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3); | ||
| 1018 | void ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1); | ||
| 1019 | void ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2); | ||
| 1020 | |||
| 1021 | // Pass a register as a parameter. | ||
| 1022 | void ABI_CallFunctionR(const void *func, X64Reg reg1); | ||
| 1023 | void ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2); | ||
| 1024 | |||
| 1025 | template <typename Tr, typename T1> | ||
| 1026 | void ABI_CallFunctionC(Tr (*func)(T1), u32 param1) { | ||
| 1027 | ABI_CallFunctionC((const void *)func, param1); | ||
| 1028 | } | ||
| 1029 | |||
| 1030 | // A function that doesn't have any control over what it will do to regs, | ||
| 1031 | // such as the dispatcher, should be surrounded by these. | ||
| 1032 | void ABI_PushAllCalleeSavedRegsAndAdjustStack(); | ||
| 1033 | void ABI_PopAllCalleeSavedRegsAndAdjustStack(); | ||
| 1034 | |||
| 1035 | // A function that doesn't know anything about its surroundings should | ||
| 1036 | // be surrounded by these to establish a safe environment where it can roam free. | ||
| 1037 | // An example is a backpatch-injected function. | ||
| 1038 | void ABI_PushAllCallerSavedRegsAndAdjustStack(); | ||
| 1039 | void ABI_PopAllCallerSavedRegsAndAdjustStack(); | ||
| 1040 | |||
| 1041 | unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize); | ||
| 1042 | void ABI_AlignStack(unsigned int frameSize); | ||
| 1043 | void ABI_RestoreStack(unsigned int frameSize); | ||
| 1044 | |||
| 1045 | // Sets up a __cdecl function. | ||
| 1046 | // Only x64 really needs the parameter count. | ||
| 1047 | void ABI_EmitPrologue(int maxCallParams); | ||
| 1048 | void ABI_EmitEpilogue(int maxCallParams); | ||
| 1049 | |||
| 1050 | #ifdef _M_IX86 | ||
| 1051 | inline int ABI_GetNumXMMRegs() { return 8; } | ||
| 1052 | #else | ||
| 1053 | inline int ABI_GetNumXMMRegs() { return 16; } | ||
| 1054 | #endif | ||
| 1055 | }; // class XEmitter | ||
| 1056 | |||
| 1057 | |||
| 1058 | // Everything that needs to generate x86 code should inherit from this. | ||
| 1059 | // You get memory management for free, and you can use all the MOV etc. functions without | ||
| 1060 | // having to prefix them with gen-> or something similar. | ||
| 1061 | |||
| 1062 | class XCodeBlock : public CodeBlock<XEmitter> { | ||
| 1063 | public: | ||
| 1064 | void PoisonMemory() override; | ||
| 1065 | }; | ||
| 1066 | |||
| 1067 | } // namespace | ||
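For orientation, here is a minimal sketch of how the emitter interface above is typically driven. It is not part of the commit: the register names and the CC_GE condition code come from the X64Reg/CCFlags enums earlier in this header (outside the hunk), u8 is the project's byte typedef, and a real caller would obtain a writable+executable buffer through CodeBlock/XCodeBlock rather than a plain array.

    // Hedged sketch: emits roughly "int clamp_to_zero(int x) { return x < 0 ? 0 : x; }"
    // for the System V x86-64 ABI (first integer argument in RDI; Windows x64 uses RCX).
    using namespace Gen;

    u8 buffer[4096];                               // assumed to be writable *and* executable
    XEmitter emit(buffer);

    const u8* entry = emit.GetCodePtr();
    emit.MOV(32, R(RAX), R(RDI));                  // copy the argument into the return register
    emit.CMP(32, R(RAX), Imm8(0));
    FixupBranch non_negative = emit.J_CC(CC_GE);   // forward branch, target not yet known
    emit.XOR(32, R(RAX), R(RAX));                  // negative input -> return 0
    emit.SetJumpTarget(non_negative);              // patch the branch displacement
    emit.RET();

    auto clamp_to_zero = reinterpret_cast<int (*)(int)>(const_cast<u8*>(entry));
    // clamp_to_zero(-5) == 0, clamp_to_zero(7) == 7 -- provided the buffer is executable.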
diff --git a/src/core/settings.h b/src/core/settings.h index 2775ee257..6ca0e1afc 100644 --- a/src/core/settings.h +++ b/src/core/settings.h | |||
| @@ -53,6 +53,7 @@ struct Values { | |||
| 53 | 53 | ||
| 54 | // Renderer | 54 | // Renderer |
| 55 | bool use_hw_renderer; | 55 | bool use_hw_renderer; |
| 56 | bool use_shader_jit; | ||
| 56 | 57 | ||
| 57 | float bg_red; | 58 | float bg_red; |
| 58 | float bg_green; | 59 | float bg_green; |
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 162108301..183709d8b 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt | |||
| @@ -11,8 +11,9 @@ set(SRCS | |||
| 11 | pica.cpp | 11 | pica.cpp |
| 12 | primitive_assembly.cpp | 12 | primitive_assembly.cpp |
| 13 | rasterizer.cpp | 13 | rasterizer.cpp |
| 14 | shader/shader.cpp | ||
| 15 | shader/shader_interpreter.cpp | ||
| 14 | utils.cpp | 16 | utils.cpp |
| 15 | vertex_shader.cpp | ||
| 16 | video_core.cpp | 17 | video_core.cpp |
| 17 | ) | 18 | ) |
| 18 | 19 | ||
| @@ -35,11 +36,20 @@ set(HEADERS | |||
| 35 | primitive_assembly.h | 36 | primitive_assembly.h |
| 36 | rasterizer.h | 37 | rasterizer.h |
| 37 | renderer_base.h | 38 | renderer_base.h |
| 39 | shader/shader.h | ||
| 40 | shader/shader_interpreter.h | ||
| 38 | utils.h | 41 | utils.h |
| 39 | vertex_shader.h | ||
| 40 | video_core.h | 42 | video_core.h |
| 41 | ) | 43 | ) |
| 42 | 44 | ||
| 45 | if(ARCHITECTURE_x86_64) | ||
| 46 | set(SRCS ${SRCS} | ||
| 47 | shader/shader_jit_x64.cpp) | ||
| 48 | |||
| 49 | set(HEADERS ${HEADERS} | ||
| 50 | shader/shader_jit_x64.h) | ||
| 51 | endif() | ||
| 52 | |||
| 43 | create_directory_groups(${SRCS} ${HEADERS}) | 53 | create_directory_groups(${SRCS} ${HEADERS}) |
| 44 | 54 | ||
| 45 | add_library(video_core STATIC ${SRCS} ${HEADERS}) | 55 | add_library(video_core STATIC ${SRCS} ${HEADERS}) |
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index 558b49d60..bb6048cc0 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp | |||
| @@ -7,7 +7,7 @@ | |||
| 7 | #include "clipper.h" | 7 | #include "clipper.h" |
| 8 | #include "pica.h" | 8 | #include "pica.h" |
| 9 | #include "rasterizer.h" | 9 | #include "rasterizer.h" |
| 10 | #include "vertex_shader.h" | 10 | #include "shader/shader_interpreter.h" |
| 11 | 11 | ||
| 12 | namespace Pica { | 12 | namespace Pica { |
| 13 | 13 | ||
diff --git a/src/video_core/clipper.h b/src/video_core/clipper.h index 19ce8e140..6ed01e877 100644 --- a/src/video_core/clipper.h +++ b/src/video_core/clipper.h | |||
| @@ -6,13 +6,13 @@ | |||
| 6 | 6 | ||
| 7 | namespace Pica { | 7 | namespace Pica { |
| 8 | 8 | ||
| 9 | namespace VertexShader { | 9 | namespace Shader { |
| 10 | struct OutputVertex; | 10 | struct OutputVertex; |
| 11 | } | 11 | } |
| 12 | 12 | ||
| 13 | namespace Clipper { | 13 | namespace Clipper { |
| 14 | 14 | ||
| 15 | using VertexShader::OutputVertex; | 15 | using Shader::OutputVertex; |
| 16 | 16 | ||
| 17 | void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2); | 17 | void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2); |
| 18 | 18 | ||
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 243abe842..374c4748d 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp | |||
| @@ -18,7 +18,7 @@ | |||
| 18 | #include "pica.h" | 18 | #include "pica.h" |
| 19 | #include "primitive_assembly.h" | 19 | #include "primitive_assembly.h" |
| 20 | #include "renderer_base.h" | 20 | #include "renderer_base.h" |
| 21 | #include "vertex_shader.h" | 21 | #include "shader/shader_interpreter.h" |
| 22 | #include "video_core.h" | 22 | #include "video_core.h" |
| 23 | 23 | ||
| 24 | namespace Pica { | 24 | namespace Pica { |
| @@ -165,7 +165,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 165 | DebugUtils::GeometryDumper geometry_dumper; | 165 | DebugUtils::GeometryDumper geometry_dumper; |
| 166 | PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value()); | 166 | PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value()); |
| 167 | #endif | 167 | #endif |
| 168 | PrimitiveAssembler<VertexShader::OutputVertex> primitive_assembler(regs.triangle_topology.Value()); | 168 | PrimitiveAssembler<Shader::OutputVertex> primitive_assembler(regs.triangle_topology.Value()); |
| 169 | 169 | ||
| 170 | if (g_debug_context) { | 170 | if (g_debug_context) { |
| 171 | for (int i = 0; i < 3; ++i) { | 171 | for (int i = 0; i < 3; ++i) { |
| @@ -210,11 +210,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 210 | // The size has been tuned for optimal balance between hit-rate and the cost of lookup | 210 | // The size has been tuned for optimal balance between hit-rate and the cost of lookup |
| 211 | const size_t VERTEX_CACHE_SIZE = 32; | 211 | const size_t VERTEX_CACHE_SIZE = 32; |
| 212 | std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids; | 212 | std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids; |
| 213 | std::array<VertexShader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache; | 213 | std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache; |
| 214 | 214 | ||
| 215 | unsigned int vertex_cache_pos = 0; | 215 | unsigned int vertex_cache_pos = 0; |
| 216 | vertex_cache_ids.fill(-1); | 216 | vertex_cache_ids.fill(-1); |
| 217 | 217 | ||
| 218 | Shader::UnitState shader_unit; | ||
| 219 | Shader::Setup(shader_unit); | ||
| 220 | |||
| 218 | for (unsigned int index = 0; index < regs.num_vertices; ++index) | 221 | for (unsigned int index = 0; index < regs.num_vertices; ++index) |
| 219 | { | 222 | { |
| 220 | unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index; | 223 | unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index; |
| @@ -224,7 +227,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 224 | ASSERT(vertex != -1); | 227 | ASSERT(vertex != -1); |
| 225 | 228 | ||
| 226 | bool vertex_cache_hit = false; | 229 | bool vertex_cache_hit = false; |
| 227 | VertexShader::OutputVertex output; | 230 | Shader::OutputVertex output; |
| 228 | 231 | ||
| 229 | if (is_indexed) { | 232 | if (is_indexed) { |
| 230 | if (g_debug_context && Pica::g_debug_context->recorder) { | 233 | if (g_debug_context && Pica::g_debug_context->recorder) { |
| @@ -243,7 +246,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 243 | 246 | ||
| 244 | if (!vertex_cache_hit) { | 247 | if (!vertex_cache_hit) { |
| 245 | // Initialize data for the current vertex | 248 | // Initialize data for the current vertex |
| 246 | VertexShader::InputVertex input; | 249 | Shader::InputVertex input; |
| 247 | 250 | ||
| 248 | for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { | 251 | for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { |
| 249 | if (vertex_attribute_elements[i] != 0) { | 252 | if (vertex_attribute_elements[i] != 0) { |
| @@ -306,9 +309,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 306 | std::bind(&DebugUtils::GeometryDumper::AddTriangle, | 309 | std::bind(&DebugUtils::GeometryDumper::AddTriangle, |
| 307 | &geometry_dumper, _1, _2, _3)); | 310 | &geometry_dumper, _1, _2, _3)); |
| 308 | #endif | 311 | #endif |
| 309 | |||
| 310 | // Send to vertex shader | 312 | // Send to vertex shader |
| 311 | output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs); | 313 | output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes()); |
| 312 | 314 | ||
| 313 | if (is_indexed) { | 315 | if (is_indexed) { |
| 314 | vertex_cache[vertex_cache_pos] = output; | 316 | vertex_cache[vertex_cache_pos] = output; |
| @@ -319,9 +321,9 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 319 | 321 | ||
| 320 | if (Settings::values.use_hw_renderer) { | 322 | if (Settings::values.use_hw_renderer) { |
| 321 | // Send to hardware renderer | 323 | // Send to hardware renderer |
| 322 | static auto AddHWTriangle = [](const Pica::VertexShader::OutputVertex& v0, | 324 | static auto AddHWTriangle = [](const Pica::Shader::OutputVertex& v0, |
| 323 | const Pica::VertexShader::OutputVertex& v1, | 325 | const Pica::Shader::OutputVertex& v1, |
| 324 | const Pica::VertexShader::OutputVertex& v2) { | 326 | const Pica::Shader::OutputVertex& v2) { |
| 325 | VideoCore::g_renderer->hw_rasterizer->AddTriangle(v0, v1, v2); | 327 | VideoCore::g_renderer->hw_rasterizer->AddTriangle(v0, v1, v2); |
| 326 | }; | 328 | }; |
| 327 | 329 | ||
diff --git a/src/video_core/hwrasterizer_base.h b/src/video_core/hwrasterizer_base.h index c8746c608..54b8892fb 100644 --- a/src/video_core/hwrasterizer_base.h +++ b/src/video_core/hwrasterizer_base.h | |||
| @@ -7,7 +7,7 @@ | |||
| 7 | #include "common/common_types.h" | 7 | #include "common/common_types.h" |
| 8 | 8 | ||
| 9 | namespace Pica { | 9 | namespace Pica { |
| 10 | namespace VertexShader { | 10 | namespace Shader { |
| 11 | struct OutputVertex; | 11 | struct OutputVertex; |
| 12 | } | 12 | } |
| 13 | } | 13 | } |
| @@ -24,9 +24,9 @@ public: | |||
| 24 | virtual void Reset() = 0; | 24 | virtual void Reset() = 0; |
| 25 | 25 | ||
| 26 | /// Queues the primitive formed by the given vertices for rendering | 26 | /// Queues the primitive formed by the given vertices for rendering |
| 27 | virtual void AddTriangle(const Pica::VertexShader::OutputVertex& v0, | 27 | virtual void AddTriangle(const Pica::Shader::OutputVertex& v0, |
| 28 | const Pica::VertexShader::OutputVertex& v1, | 28 | const Pica::Shader::OutputVertex& v1, |
| 29 | const Pica::VertexShader::OutputVertex& v2) = 0; | 29 | const Pica::Shader::OutputVertex& v2) = 0; |
| 30 | 30 | ||
| 31 | /// Draw the current batch of triangles | 31 | /// Draw the current batch of triangles |
| 32 | virtual void DrawTriangles() = 0; | 32 | virtual void DrawTriangles() = 0; |
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp index 17cb66780..c73a8178e 100644 --- a/src/video_core/pica.cpp +++ b/src/video_core/pica.cpp | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <unordered_map> | 6 | #include <unordered_map> |
| 7 | 7 | ||
| 8 | #include "pica.h" | 8 | #include "pica.h" |
| 9 | #include "shader/shader.h" | ||
| 9 | 10 | ||
| 10 | namespace Pica { | 11 | namespace Pica { |
| 11 | 12 | ||
| @@ -84,6 +85,8 @@ void Init() { | |||
| 84 | } | 85 | } |
| 85 | 86 | ||
| 86 | void Shutdown() { | 87 | void Shutdown() { |
| 88 | Shader::Shutdown(); | ||
| 89 | |||
| 87 | memset(&g_state, 0, sizeof(State)); | 90 | memset(&g_state, 0, sizeof(State)); |
| 88 | } | 91 | } |
| 89 | 92 | ||
diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 34b02b2f8..6ce90f95a 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h | |||
| @@ -1083,6 +1083,7 @@ private: | |||
| 1083 | // TODO: Perform proper arithmetic on this! | 1083 | // TODO: Perform proper arithmetic on this! |
| 1084 | float value; | 1084 | float value; |
| 1085 | }; | 1085 | }; |
| 1086 | static_assert(sizeof(float24) == sizeof(float), "Shader JIT assumes float24 is implemented as a 32-bit float"); | ||
| 1086 | 1087 | ||
| 1087 | /// Struct used to describe current Pica state | 1088 | /// Struct used to describe current Pica state |
| 1088 | struct State { | 1089 | struct State { |
| @@ -1092,7 +1093,10 @@ struct State { | |||
| 1092 | /// Vertex shader memory | 1093 | /// Vertex shader memory |
| 1093 | struct ShaderSetup { | 1094 | struct ShaderSetup { |
| 1094 | struct { | 1095 | struct { |
| 1095 | Math::Vec4<float24> f[96]; | 1096 | // The float uniforms are accessed by the shader JIT using SSE instructions, and are |
| 1097 | // therefore required to be 16-byte aligned. | ||
| 1098 | Math::Vec4<float24> MEMORY_ALIGNED16(f[96]); | ||
| 1099 | |||
| 1096 | std::array<bool, 16> b; | 1100 | std::array<bool, 16> b; |
| 1097 | std::array<Math::Vec4<u8>, 4> i; | 1101 | std::array<Math::Vec4<u8>, 4> i; |
| 1098 | } uniforms; | 1102 | } uniforms; |
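The two pica.h additions encode assumptions the JIT makes about uniform storage: uniforms.f must be 16-byte aligned because the generated code reads each Vec4 with aligned 128-bit SSE loads, and float24 must really be backed by a 32-bit float. A small standalone illustration, using plain alignas instead of the project's MEMORY_ALIGNED16 macro and an ad-hoc Vec4f in place of Math::Vec4<float24>:

    // Standalone illustration only, not project code.
    #include <xmmintrin.h>

    struct alignas(16) Vec4f { float x, y, z, w; };          // stand-in for Math::Vec4<float24>
    static_assert(sizeof(Vec4f) == 4 * sizeof(float), "JIT assumes a packed 32-bit float layout");

    Vec4f uniforms_f[96];                                     // like ShaderSetup::uniforms.f

    __m128 LoadUniform(int i) {
        // The JIT emits aligned 128-bit loads (MOVAPS); _mm_load_ps has the same
        // alignment requirement and faults if the address is not 16-byte aligned,
        // which is exactly why uniforms.f is forced to 16-byte alignment above.
        return _mm_load_ps(&uniforms_f[i].x);
    }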
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp index 2f22bdcce..e2b1df44c 100644 --- a/src/video_core/primitive_assembly.cpp +++ b/src/video_core/primitive_assembly.cpp | |||
| @@ -4,7 +4,7 @@ | |||
| 4 | 4 | ||
| 5 | #include "pica.h" | 5 | #include "pica.h" |
| 6 | #include "primitive_assembly.h" | 6 | #include "primitive_assembly.h" |
| 7 | #include "vertex_shader.h" | 7 | #include "shader/shader_interpreter.h" |
| 8 | 8 | ||
| 9 | #include "common/logging/log.h" | 9 | #include "common/logging/log.h" |
| 10 | #include "video_core/debug_utils/debug_utils.h" | 10 | #include "video_core/debug_utils/debug_utils.h" |
| @@ -56,7 +56,7 @@ void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx, TriangleHandl | |||
| 56 | 56 | ||
| 57 | // explicitly instantiate use cases | 57 | // explicitly instantiate use cases |
| 58 | template | 58 | template |
| 59 | struct PrimitiveAssembler<VertexShader::OutputVertex>; | 59 | struct PrimitiveAssembler<Shader::OutputVertex>; |
| 60 | template | 60 | template |
| 61 | struct PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex>; | 61 | struct PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex>; |
| 62 | 62 | ||
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h index 52ff4cd89..80432d68a 100644 --- a/src/video_core/primitive_assembly.h +++ b/src/video_core/primitive_assembly.h | |||
| @@ -8,7 +8,7 @@ | |||
| 8 | 8 | ||
| 9 | #include "video_core/pica.h" | 9 | #include "video_core/pica.h" |
| 10 | 10 | ||
| 11 | #include "video_core/vertex_shader.h" | 11 | #include "video_core/shader/shader_interpreter.h" |
| 12 | 12 | ||
| 13 | namespace Pica { | 13 | namespace Pica { |
| 14 | 14 | ||
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index 68b7cc05d..b83798b0f 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp | |||
| @@ -16,7 +16,7 @@ | |||
| 16 | #include "math.h" | 16 | #include "math.h" |
| 17 | #include "pica.h" | 17 | #include "pica.h" |
| 18 | #include "rasterizer.h" | 18 | #include "rasterizer.h" |
| 19 | #include "vertex_shader.h" | 19 | #include "shader/shader_interpreter.h" |
| 20 | #include "video_core/utils.h" | 20 | #include "video_core/utils.h" |
| 21 | 21 | ||
| 22 | namespace Pica { | 22 | namespace Pica { |
| @@ -272,9 +272,9 @@ static Common::Profiling::TimingCategory rasterization_category("Rasterization") | |||
| 272 | * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing | 272 | * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing |
| 273 | * culling via recursion. | 273 | * culling via recursion. |
| 274 | */ | 274 | */ |
| 275 | static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, | 275 | static void ProcessTriangleInternal(const Shader::OutputVertex& v0, |
| 276 | const VertexShader::OutputVertex& v1, | 276 | const Shader::OutputVertex& v1, |
| 277 | const VertexShader::OutputVertex& v2, | 277 | const Shader::OutputVertex& v2, |
| 278 | bool reversed = false) | 278 | bool reversed = false) |
| 279 | { | 279 | { |
| 280 | const auto& regs = g_state.regs; | 280 | const auto& regs = g_state.regs; |
| @@ -1107,9 +1107,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, | |||
| 1107 | } | 1107 | } |
| 1108 | } | 1108 | } |
| 1109 | 1109 | ||
| 1110 | void ProcessTriangle(const VertexShader::OutputVertex& v0, | 1110 | void ProcessTriangle(const Shader::OutputVertex& v0, |
| 1111 | const VertexShader::OutputVertex& v1, | 1111 | const Shader::OutputVertex& v1, |
| 1112 | const VertexShader::OutputVertex& v2) { | 1112 | const Shader::OutputVertex& v2) { |
| 1113 | ProcessTriangleInternal(v0, v1, v2); | 1113 | ProcessTriangleInternal(v0, v1, v2); |
| 1114 | } | 1114 | } |
| 1115 | 1115 | ||
diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h index 42148f8b1..a6a9634b4 100644 --- a/src/video_core/rasterizer.h +++ b/src/video_core/rasterizer.h | |||
| @@ -6,15 +6,15 @@ | |||
| 6 | 6 | ||
| 7 | namespace Pica { | 7 | namespace Pica { |
| 8 | 8 | ||
| 9 | namespace VertexShader { | 9 | namespace Shader { |
| 10 | struct OutputVertex; | 10 | struct OutputVertex; |
| 11 | } | 11 | } |
| 12 | 12 | ||
| 13 | namespace Rasterizer { | 13 | namespace Rasterizer { |
| 14 | 14 | ||
| 15 | void ProcessTriangle(const VertexShader::OutputVertex& v0, | 15 | void ProcessTriangle(const Shader::OutputVertex& v0, |
| 16 | const VertexShader::OutputVertex& v1, | 16 | const Shader::OutputVertex& v1, |
| 17 | const VertexShader::OutputVertex& v2); | 17 | const Shader::OutputVertex& v2); |
| 18 | 18 | ||
| 19 | } // namespace Rasterizer | 19 | } // namespace Rasterizer |
| 20 | 20 | ||
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index e7c1cfeb7..9f1552adf 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp | |||
| @@ -202,9 +202,9 @@ void RasterizerOpenGL::Reset() { | |||
| 202 | res_cache.FullFlush(); | 202 | res_cache.FullFlush(); |
| 203 | } | 203 | } |
| 204 | 204 | ||
| 205 | void RasterizerOpenGL::AddTriangle(const Pica::VertexShader::OutputVertex& v0, | 205 | void RasterizerOpenGL::AddTriangle(const Pica::Shader::OutputVertex& v0, |
| 206 | const Pica::VertexShader::OutputVertex& v1, | 206 | const Pica::Shader::OutputVertex& v1, |
| 207 | const Pica::VertexShader::OutputVertex& v2) { | 207 | const Pica::Shader::OutputVertex& v2) { |
| 208 | vertex_batch.push_back(HardwareVertex(v0)); | 208 | vertex_batch.push_back(HardwareVertex(v0)); |
| 209 | vertex_batch.push_back(HardwareVertex(v1)); | 209 | vertex_batch.push_back(HardwareVertex(v1)); |
| 210 | vertex_batch.push_back(HardwareVertex(v2)); | 210 | vertex_batch.push_back(HardwareVertex(v2)); |
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index ae7b26fc6..a02d5c856 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h | |||
| @@ -9,7 +9,7 @@ | |||
| 9 | #include "common/common_types.h" | 9 | #include "common/common_types.h" |
| 10 | 10 | ||
| 11 | #include "video_core/hwrasterizer_base.h" | 11 | #include "video_core/hwrasterizer_base.h" |
| 12 | #include "video_core/vertex_shader.h" | 12 | #include "video_core/shader/shader_interpreter.h" |
| 13 | 13 | ||
| 14 | #include "gl_state.h" | 14 | #include "gl_state.h" |
| 15 | #include "gl_rasterizer_cache.h" | 15 | #include "gl_rasterizer_cache.h" |
| @@ -27,9 +27,9 @@ public: | |||
| 27 | void Reset() override; | 27 | void Reset() override; |
| 28 | 28 | ||
| 29 | /// Queues the primitive formed by the given vertices for rendering | 29 | /// Queues the primitive formed by the given vertices for rendering |
| 30 | void AddTriangle(const Pica::VertexShader::OutputVertex& v0, | 30 | void AddTriangle(const Pica::Shader::OutputVertex& v0, |
| 31 | const Pica::VertexShader::OutputVertex& v1, | 31 | const Pica::Shader::OutputVertex& v1, |
| 32 | const Pica::VertexShader::OutputVertex& v2) override; | 32 | const Pica::Shader::OutputVertex& v2) override; |
| 33 | 33 | ||
| 34 | /// Draw the current batch of triangles | 34 | /// Draw the current batch of triangles |
| 35 | void DrawTriangles() override; | 35 | void DrawTriangles() override; |
| @@ -82,7 +82,7 @@ private: | |||
| 82 | 82 | ||
| 83 | /// Structure that the hardware rendered vertices are composed of | 83 | /// Structure that the hardware rendered vertices are composed of |
| 84 | struct HardwareVertex { | 84 | struct HardwareVertex { |
| 85 | HardwareVertex(const Pica::VertexShader::OutputVertex& v) { | 85 | HardwareVertex(const Pica::Shader::OutputVertex& v) { |
| 86 | position[0] = v.pos.x.ToFloat32(); | 86 | position[0] = v.pos.x.ToFloat32(); |
| 87 | position[1] = v.pos.y.ToFloat32(); | 87 | position[1] = v.pos.y.ToFloat32(); |
| 88 | position[2] = v.pos.z.ToFloat32(); | 88 | position[2] = v.pos.z.ToFloat32(); |
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp new file mode 100644 index 000000000..6a27a8015 --- /dev/null +++ b/src/video_core/shader/shader.cpp | |||
| @@ -0,0 +1,145 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <memory> | ||
| 6 | #include <unordered_map> | ||
| 7 | |||
| 8 | #include "common/hash.h" | ||
| 9 | #include "common/make_unique.h" | ||
| 10 | #include "common/profiler.h" | ||
| 11 | |||
| 12 | #include "video_core/debug_utils/debug_utils.h" | ||
| 13 | #include "video_core/pica.h" | ||
| 14 | #include "video_core/video_core.h" | ||
| 15 | |||
| 16 | #include "shader.h" | ||
| 17 | #include "shader_interpreter.h" | ||
| 18 | |||
| 19 | #ifdef ARCHITECTURE_x86_64 | ||
| 20 | #include "shader_jit_x64.h" | ||
| 21 | #endif // ARCHITECTURE_x86_64 | ||
| 22 | |||
| 23 | namespace Pica { | ||
| 24 | |||
| 25 | namespace Shader { | ||
| 26 | |||
| 27 | #ifdef ARCHITECTURE_x86_64 | ||
| 28 | static std::unordered_map<u64, CompiledShader*> shader_map; | ||
| 29 | static JitCompiler jit; | ||
| 30 | static CompiledShader* jit_shader; | ||
| 31 | #endif // ARCHITECTURE_x86_64 | ||
| 32 | |||
| 33 | void Setup(UnitState& state) { | ||
| 34 | #ifdef ARCHITECTURE_x86_64 | ||
| 35 | if (VideoCore::g_shader_jit_enabled) { | ||
| 36 | u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ | ||
| 37 | Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)) ^ | ||
| 38 | g_state.regs.vs.main_offset); | ||
| 39 | |||
| 40 | auto iter = shader_map.find(cache_key); | ||
| 41 | if (iter != shader_map.end()) { | ||
| 42 | jit_shader = iter->second; | ||
| 43 | } else { | ||
| 44 | jit_shader = jit.Compile(); | ||
| 45 | shader_map.emplace(cache_key, jit_shader); | ||
| 46 | } | ||
| 47 | } | ||
| 48 | #endif // ARCHITECTURE_x86_64 | ||
| 49 | } | ||
| 50 | |||
| 51 | void Shutdown() { | ||
| 52 | shader_map.clear(); | ||
| 53 | } | ||
| 54 | |||
| 55 | static Common::Profiling::TimingCategory shader_category("Vertex Shader"); | ||
| 56 | |||
| 57 | OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes) { | ||
| 58 | auto& config = g_state.regs.vs; | ||
| 59 | auto& setup = g_state.vs; | ||
| 60 | |||
| 61 | Common::Profiling::ScopeTimer timer(shader_category); | ||
| 62 | |||
| 63 | state.program_counter = config.main_offset; | ||
| 64 | state.debug.max_offset = 0; | ||
| 65 | state.debug.max_opdesc_id = 0; | ||
| 66 | |||
| 67 | // Setup input register table | ||
| 68 | const auto& attribute_register_map = config.input_register_map; | ||
| 69 | |||
| 70 | if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0]; | ||
| 71 | if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1]; | ||
| 72 | if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2]; | ||
| 73 | if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = input.attr[3]; | ||
| 74 | if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = input.attr[4]; | ||
| 75 | if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = input.attr[5]; | ||
| 76 | if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = input.attr[6]; | ||
| 77 | if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = input.attr[7]; | ||
| 78 | if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = input.attr[8]; | ||
| 79 | if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = input.attr[9]; | ||
| 80 | if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = input.attr[10]; | ||
| 81 | if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = input.attr[11]; | ||
| 82 | if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = input.attr[12]; | ||
| 83 | if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = input.attr[13]; | ||
| 84 | if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = input.attr[14]; | ||
| 85 | if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = input.attr[15]; | ||
| 86 | |||
| 87 | state.conditional_code[0] = false; | ||
| 88 | state.conditional_code[1] = false; | ||
| 89 | |||
| 90 | #ifdef ARCHITECTURE_x86_64 | ||
| 91 | if (VideoCore::g_shader_jit_enabled) | ||
| 92 | jit_shader(&state.registers); | ||
| 93 | else | ||
| 94 | RunInterpreter(state); | ||
| 95 | #else | ||
| 96 | RunInterpreter(state); | ||
| 97 | #endif // ARCHITECTURE_x86_64 | ||
| 98 | |||
| 99 | #if PICA_DUMP_SHADERS | ||
| 100 | DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(), | ||
| 101 | state.debug.max_opdesc_id, config.main_offset, | ||
| 102 | g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here | ||
| 103 | #endif | ||
| 104 | |||
| 105 | // Setup output data | ||
| 106 | OutputVertex ret; | ||
| 107 | // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to | ||
| 108 | // figure out what those circumstances are and enable the remaining outputs then. | ||
| 109 | for (int i = 0; i < 7; ++i) { | ||
| 110 | const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here | ||
| 111 | |||
| 112 | u32 semantics[4] = { | ||
| 113 | output_register_map.map_x, output_register_map.map_y, | ||
| 114 | output_register_map.map_z, output_register_map.map_w | ||
| 115 | }; | ||
| 116 | |||
| 117 | for (int comp = 0; comp < 4; ++comp) { | ||
| 118 | float24* out = ((float24*)&ret) + semantics[comp]; | ||
| 119 | if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { | ||
| 120 | *out = state.registers.output[i][comp]; | ||
| 121 | } else { | ||
| 122 | // Zero output so that attributes which aren't output won't have denormals in them, | ||
| 123 | // which would slow us down later. | ||
| 124 | memset(out, 0, sizeof(*out)); | ||
| 125 | } | ||
| 126 | } | ||
| 127 | } | ||
| 128 | |||
| 129 | // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation | ||
| 130 | for (int i = 0; i < 4; ++i) { | ||
| 131 | ret.color[i] = float24::FromFloat32( | ||
| 132 | std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); | ||
| 133 | } | ||
| 134 | |||
| 135 | LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", | ||
| 136 | ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), | ||
| 137 | ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), | ||
| 138 | ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); | ||
| 139 | |||
| 140 | return ret; | ||
| 141 | } | ||
| 142 | |||
| 143 | } // namespace Shader | ||
| 144 | |||
| 145 | } // namespace Pica | ||
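Setup() above only pays the cost of compilation when the uploaded program actually changes: it hashes the shader program code and swizzle data, folds in the entry point, and uses the result to index a map of previously compiled binaries. The sketch below restates that keying scheme in isolation; HashBytes (an FNV-1a stand-in for Common::ComputeHash64) and the stripped-down CompiledShader typedef are assumptions made purely for illustration, not the emulator's actual types.

```cpp
// Minimal sketch of the cache-keying idea used by Shader::Setup().
#include <cstddef>
#include <cstdint>
#include <unordered_map>

using CompiledShader = void (*)(void* registers); // simplified stand-in

static std::uint64_t HashBytes(const void* data, std::size_t len) {
    auto* p = static_cast<const unsigned char*>(data);
    std::uint64_t hash = 0xcbf29ce484222325ULL;      // FNV-1a offset basis
    for (std::size_t i = 0; i < len; ++i)
        hash = (hash ^ p[i]) * 0x100000001b3ULL;     // FNV-1a prime
    return hash;
}

struct ShaderCache {
    std::unordered_map<std::uint64_t, CompiledShader> map;

    // Key = hash(program code) ^ hash(swizzle data) ^ entry point, mirroring the
    // cache_key computed in Setup(): identical programs reuse the same binary.
    CompiledShader GetOrCompile(const std::uint32_t* code, std::size_t code_bytes,
                                const std::uint32_t* swizzles, std::size_t swizzle_bytes,
                                std::uint32_t main_offset,
                                CompiledShader (*compile)()) {
        const std::uint64_t key = HashBytes(code, code_bytes) ^
                                  HashBytes(swizzles, swizzle_bytes) ^ main_offset;
        auto it = map.find(key);
        if (it != map.end())
            return it->second;              // cache hit: reuse the compiled binary
        CompiledShader shader = compile();  // cache miss: compile once and remember it
        map.emplace(key, shader);
        return shader;
    }
};
```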
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h new file mode 100644 index 000000000..2007a2844 --- /dev/null +++ b/src/video_core/shader/shader.h | |||
| @@ -0,0 +1,169 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <boost/container/static_vector.hpp> | ||
| 8 | #include <nihstro/shader_binary.h> | ||
| 9 | |||
| 10 | #include "common/common_funcs.h" | ||
| 11 | #include "common/common_types.h" | ||
| 12 | #include "common/vector_math.h" | ||
| 13 | |||
| 14 | #include "video_core/pica.h" | ||
| 15 | |||
| 16 | using nihstro::RegisterType; | ||
| 17 | using nihstro::SourceRegister; | ||
| 18 | using nihstro::DestRegister; | ||
| 19 | |||
| 20 | namespace Pica { | ||
| 21 | |||
| 22 | namespace Shader { | ||
| 23 | |||
| 24 | struct InputVertex { | ||
| 25 | Math::Vec4<float24> attr[16]; | ||
| 26 | }; | ||
| 27 | |||
| 28 | struct OutputVertex { | ||
| 29 | OutputVertex() = default; | ||
| 30 | |||
| 31 | // VS output attributes | ||
| 32 | Math::Vec4<float24> pos; | ||
| 33 | Math::Vec4<float24> dummy; // quaternions (not implemented yet) | ||
| 34 | Math::Vec4<float24> color; | ||
| 35 | Math::Vec2<float24> tc0; | ||
| 36 | Math::Vec2<float24> tc1; | ||
| 37 | float24 pad[6]; | ||
| 38 | Math::Vec2<float24> tc2; | ||
| 39 | |||
| 40 | // Padding for optimal alignment | ||
| 41 | float24 pad2[4]; | ||
| 42 | |||
| 43 | // Attributes used to store intermediate results | ||
| 44 | |||
| 45 | // position after perspective divide | ||
| 46 | Math::Vec3<float24> screenpos; | ||
| 47 | float24 pad3; | ||
| 48 | |||
| 49 | // Linear interpolation | ||
| 50 | // factor: 0=this, 1=vtx | ||
| 51 | void Lerp(float24 factor, const OutputVertex& vtx) { | ||
| 52 | pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); | ||
| 53 | |||
| 54 | // TODO: Should perform perspective correct interpolation here... | ||
| 55 | tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); | ||
| 56 | tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); | ||
| 57 | tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); | ||
| 58 | |||
| 59 | screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); | ||
| 60 | |||
| 61 | color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); | ||
| 62 | } | ||
| 63 | |||
| 64 | // Linear interpolation | ||
| 65 | // factor: 0=v0, 1=v1 | ||
| 66 | static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { | ||
| 67 | OutputVertex ret = v0; | ||
| 68 | ret.Lerp(factor, v1); | ||
| 69 | return ret; | ||
| 70 | } | ||
| 71 | }; | ||
| 72 | static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); | ||
| 73 | static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); | ||
| 74 | |||
| 75 | /** | ||
| 76 | * This structure contains the state information that needs to be unique for a shader unit. The 3DS | ||
| 77 | * has four shader units that process shaders in parallel. At present, Citra only implements a | ||
| 78 | * single shader unit that processes all shaders serially. Putting the state information in a struct | ||
| 79 | * here will make it easier for us to parallelize the shader processing later. | ||
| 80 | */ | ||
| 81 | struct UnitState { | ||
| 82 | struct Registers { | ||
| 83 | // The registers are accessed by the shader JIT using SSE instructions, and are therefore | ||
| 84 | // required to be 16-byte aligned. | ||
| 85 | Math::Vec4<float24> MEMORY_ALIGNED16(input[16]); | ||
| 86 | Math::Vec4<float24> MEMORY_ALIGNED16(output[16]); | ||
| 87 | Math::Vec4<float24> MEMORY_ALIGNED16(temporary[16]); | ||
| 88 | } registers; | ||
| 89 | static_assert(std::is_pod<Registers>::value, "Structure is not POD"); | ||
| 90 | |||
| 91 | u32 program_counter; | ||
| 92 | bool conditional_code[2]; | ||
| 93 | |||
| 94 | // Two Address registers and one loop counter | ||
| 95 | // TODO: How many bits do these actually have? | ||
| 96 | s32 address_registers[3]; | ||
| 97 | |||
| 98 | enum { | ||
| 99 | INVALID_ADDRESS = 0xFFFFFFFF | ||
| 100 | }; | ||
| 101 | |||
| 102 | struct CallStackElement { | ||
| 103 | u32 final_address; // Address upon which we jump to return_address | ||
| 104 | u32 return_address; // Where to jump when leaving scope | ||
| 105 | u8 repeat_counter; // How often to repeat until this call stack element is removed | ||
| 106 | u8 loop_increment; // Which value to add to the loop counter after an iteration | ||
| 107 | // TODO: Should this be a signed value? Does it even matter? | ||
| 108 | u32 loop_address; // The address where we'll return to after each loop iteration | ||
| 109 | }; | ||
| 110 | |||
| 111 | // TODO: Is there a maximal size for this? | ||
| 112 | boost::container::static_vector<CallStackElement, 16> call_stack; | ||
| 113 | |||
| 114 | struct { | ||
| 115 | u32 max_offset; // maximum program counter ever reached | ||
| 116 | u32 max_opdesc_id; // maximum swizzle pattern index ever used | ||
| 117 | } debug; | ||
| 118 | |||
| 119 | static int InputOffset(const SourceRegister& reg) { | ||
| 120 | switch (reg.GetRegisterType()) { | ||
| 121 | case RegisterType::Input: | ||
| 122 | return (int)offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||
| 123 | |||
| 124 | case RegisterType::Temporary: | ||
| 125 | return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||
| 126 | |||
| 127 | default: | ||
| 128 | UNREACHABLE(); | ||
| 129 | return 0; | ||
| 130 | } | ||
| 131 | } | ||
| 132 | |||
| 133 | static int OutputOffset(const DestRegister& reg) { | ||
| 134 | switch (reg.GetRegisterType()) { | ||
| 135 | case RegisterType::Output: | ||
| 136 | return (int)offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||
| 137 | |||
| 138 | case RegisterType::Temporary: | ||
| 139 | return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||
| 140 | |||
| 141 | default: | ||
| 142 | UNREACHABLE(); | ||
| 143 | return 0; | ||
| 144 | } | ||
| 145 | } | ||
| 146 | }; | ||
| 147 | |||
| 148 | /** | ||
| 149 | * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per | ||
| 150 | * vertex, which would happen within the `Run` function). | ||
| 151 | * @param state Shader unit state, must be setup per shader and per shader unit | ||
| 152 | */ | ||
| 153 | void Setup(UnitState& state); | ||
| 154 | |||
| 155 | /// Performs any cleanup when the emulator is shutdown | ||
| 156 | void Shutdown(); | ||
| 157 | |||
| 158 | /** | ||
| 159 | * Runs the currently setup shader | ||
| 160 | * @param state Shader unit state, must be setup per shader and per shader unit | ||
| 161 | * @param input Input vertex into the shader | ||
| 162 | * @param num_attributes The number of vertex shader attributes | ||
| 163 | * @return The output vertex, after having been processed by the vertex shader | ||
| 164 | */ | ||
| 165 | OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes); | ||
| 166 | |||
| 167 | } // namespace Shader | ||
| 168 | |||
| 169 | } // namespace Pica | ||
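UnitState::InputOffset() and OutputOffset() return byte offsets into the Registers block so the JIT can reach any register as a base pointer plus a displacement, and the MEMORY_ALIGNED16 requirement is what makes the aligned MOVAPS accesses legal. Below is a minimal sketch of how such an offset would be consumed, assuming each register occupies 16 bytes (four 32-bit floats); Vec4f and Registers are simplified stand-ins for the real types.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

struct alignas(16) Vec4f { float x, y, z, w; };

struct Registers {
    Vec4f input[16];
    Vec4f output[16];
    Vec4f temporary[16];
};

static_assert(sizeof(Vec4f) == 16, "one shader register spans 16 bytes");

// Rough equivalent of UnitState::InputOffset() for an input register index.
constexpr std::size_t InputOffset(int index) {
    return offsetof(Registers, input) + index * sizeof(Vec4f);
}

int main() {
    Registers regs{};
    regs.input[5] = {1.0f, 2.0f, 3.0f, 4.0f};

    // Base pointer + displacement, which is what MDisp(REGISTERS, offset) encodes.
    auto* base = reinterpret_cast<const std::uint8_t*>(&regs);
    auto* reg = reinterpret_cast<const Vec4f*>(base + InputOffset(5));

    assert(reg->y == 2.0f);
    return 0;
}
```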
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/shader/shader_interpreter.cpp index 5f66f3455..c8489f920 100644 --- a/src/video_core/vertex_shader.cpp +++ b/src/video_core/shader/shader_interpreter.cpp | |||
| @@ -2,18 +2,14 @@ | |||
| 2 | // Licensed under GPLv2 or any later version | 2 | // Licensed under GPLv2 or any later version |
| 3 | // Refer to the license.txt file included. | 3 | // Refer to the license.txt file included. |
| 4 | 4 | ||
| 5 | #include <boost/container/static_vector.hpp> | ||
| 6 | #include <boost/range/algorithm.hpp> | ||
| 7 | |||
| 8 | #include <common/file_util.h> | 5 | #include <common/file_util.h> |
| 9 | 6 | ||
| 10 | #include <nihstro/shader_bytecode.h> | 7 | #include <nihstro/shader_bytecode.h> |
| 11 | 8 | ||
| 12 | #include "common/profiler.h" | 9 | #include "video_core/pica.h" |
| 13 | 10 | ||
| 14 | #include "pica.h" | 11 | #include "shader.h" |
| 15 | #include "vertex_shader.h" | 12 | #include "shader_interpreter.h" |
| 16 | #include "debug_utils/debug_utils.h" | ||
| 17 | 13 | ||
| 18 | using nihstro::OpCode; | 14 | using nihstro::OpCode; |
| 19 | using nihstro::Instruction; | 15 | using nihstro::Instruction; |
| @@ -23,44 +19,9 @@ using nihstro::SwizzlePattern; | |||
| 23 | 19 | ||
| 24 | namespace Pica { | 20 | namespace Pica { |
| 25 | 21 | ||
| 26 | namespace VertexShader { | 22 | namespace Shader { |
| 27 | |||
| 28 | struct VertexShaderState { | ||
| 29 | u32 program_counter; | ||
| 30 | |||
| 31 | const float24* input_register_table[16]; | ||
| 32 | Math::Vec4<float24> output_registers[16]; | ||
| 33 | |||
| 34 | Math::Vec4<float24> temporary_registers[16]; | ||
| 35 | bool conditional_code[2]; | ||
| 36 | |||
| 37 | // Two Address registers and one loop counter | ||
| 38 | // TODO: How many bits do these actually have? | ||
| 39 | s32 address_registers[3]; | ||
| 40 | |||
| 41 | enum { | ||
| 42 | INVALID_ADDRESS = 0xFFFFFFFF | ||
| 43 | }; | ||
| 44 | 23 | ||
| 45 | struct CallStackElement { | 24 | void RunInterpreter(UnitState& state) { |
| 46 | u32 final_address; // Address upon which we jump to return_address | ||
| 47 | u32 return_address; // Where to jump when leaving scope | ||
| 48 | u8 repeat_counter; // How often to repeat until this call stack element is removed | ||
| 49 | u8 loop_increment; // Which value to add to the loop counter after an iteration | ||
| 50 | // TODO: Should this be a signed value? Does it even matter? | ||
| 51 | u32 loop_address; // The address where we'll return to after each loop iteration | ||
| 52 | }; | ||
| 53 | |||
| 54 | // TODO: Is there a maximal size for this? | ||
| 55 | boost::container::static_vector<CallStackElement, 16> call_stack; | ||
| 56 | |||
| 57 | struct { | ||
| 58 | u32 max_offset; // maximum program counter ever reached | ||
| 59 | u32 max_opdesc_id; // maximum swizzle pattern index ever used | ||
| 60 | } debug; | ||
| 61 | }; | ||
| 62 | |||
| 63 | static void ProcessShaderCode(VertexShaderState& state) { | ||
| 64 | const auto& uniforms = g_state.vs.uniforms; | 25 | const auto& uniforms = g_state.vs.uniforms; |
| 65 | const auto& swizzle_data = g_state.vs.swizzle_data; | 26 | const auto& swizzle_data = g_state.vs.swizzle_data; |
| 66 | const auto& program_code = g_state.vs.program_code; | 27 | const auto& program_code = g_state.vs.program_code; |
| @@ -90,7 +51,7 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||
| 90 | const Instruction instr = { program_code[state.program_counter] }; | 51 | const Instruction instr = { program_code[state.program_counter] }; |
| 91 | const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; | 52 | const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; |
| 92 | 53 | ||
| 93 | static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions, | 54 | static auto call = [](UnitState& state, u32 offset, u32 num_instructions, |
| 94 | u32 return_offset, u8 repeat_count, u8 loop_increment) { | 55 | u32 return_offset, u8 repeat_count, u8 loop_increment) { |
| 95 | state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset | 56 | state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset |
| 96 | ASSERT(state.call_stack.size() < state.call_stack.capacity()); | 57 | ASSERT(state.call_stack.size() < state.call_stack.capacity()); |
| @@ -101,10 +62,10 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||
| 101 | auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { | 62 | auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { |
| 102 | switch (source_reg.GetRegisterType()) { | 63 | switch (source_reg.GetRegisterType()) { |
| 103 | case RegisterType::Input: | 64 | case RegisterType::Input: |
| 104 | return state.input_register_table[source_reg.GetIndex()]; | 65 | return &state.registers.input[source_reg.GetIndex()].x; |
| 105 | 66 | ||
| 106 | case RegisterType::Temporary: | 67 | case RegisterType::Temporary: |
| 107 | return &state.temporary_registers[source_reg.GetIndex()].x; | 68 | return &state.registers.temporary[source_reg.GetIndex()].x; |
| 108 | 69 | ||
| 109 | case RegisterType::FloatUniform: | 70 | case RegisterType::FloatUniform: |
| 110 | return &uniforms.f[source_reg.GetIndex()].x; | 71 | return &uniforms.f[source_reg.GetIndex()].x; |
| @@ -153,8 +114,8 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||
| 153 | src2[3] = src2[3] * float24::FromFloat32(-1); | 114 | src2[3] = src2[3] * float24::FromFloat32(-1); |
| 154 | } | 115 | } |
| 155 | 116 | ||
| 156 | float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers[instr.common.dest.Value().GetIndex()][0] | 117 | float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] |
| 157 | : (instr.common.dest.Value() < 0x20) ? &state.temporary_registers[instr.common.dest.Value().GetIndex()][0] | 118 | : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] |
| 158 | : dummy_vec4_float24; | 119 | : dummy_vec4_float24; |
| 159 | 120 | ||
| 160 | state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id); | 121 | state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id); |
| @@ -394,8 +355,8 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||
| 394 | src3[3] = src3[3] * float24::FromFloat32(-1); | 355 | src3[3] = src3[3] * float24::FromFloat32(-1); |
| 395 | } | 356 | } |
| 396 | 357 | ||
| 397 | float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers[instr.mad.dest.Value().GetIndex()][0] | 358 | float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] |
| 398 | : (instr.mad.dest.Value() < 0x20) ? &state.temporary_registers[instr.mad.dest.Value().GetIndex()][0] | 359 | : (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] |
| 399 | : dummy_vec4_float24; | 360 | : dummy_vec4_float24; |
| 400 | 361 | ||
| 401 | for (int i = 0; i < 4; ++i) { | 362 | for (int i = 0; i < 4; ++i) { |
| @@ -413,7 +374,7 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||
| 413 | 374 | ||
| 414 | default: | 375 | default: |
| 415 | { | 376 | { |
| 416 | static auto evaluate_condition = [](const VertexShaderState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { | 377 | static auto evaluate_condition = [](const UnitState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { |
| 417 | bool results[2] = { refx == state.conditional_code[0], | 378 | bool results[2] = { refx == state.conditional_code[0], |
| 418 | refy == state.conditional_code[1] }; | 379 | refy == state.conditional_code[1] }; |
| 419 | 380 | ||
| @@ -542,88 +503,6 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||
| 542 | } | 503 | } |
| 543 | } | 504 | } |
| 544 | 505 | ||
| 545 | static Common::Profiling::TimingCategory shader_category("Vertex Shader"); | ||
| 546 | |||
| 547 | OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) { | ||
| 548 | Common::Profiling::ScopeTimer timer(shader_category); | ||
| 549 | |||
| 550 | VertexShaderState state; | ||
| 551 | |||
| 552 | state.program_counter = config.main_offset; | ||
| 553 | state.debug.max_offset = 0; | ||
| 554 | state.debug.max_opdesc_id = 0; | ||
| 555 | |||
| 556 | // Setup input register table | ||
| 557 | const auto& attribute_register_map = config.input_register_map; | ||
| 558 | float24 dummy_register; | ||
| 559 | boost::fill(state.input_register_table, &dummy_register); | ||
| 560 | |||
| 561 | if (num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x; | ||
| 562 | if (num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x; | ||
| 563 | if (num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x; | ||
| 564 | if (num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x; | ||
| 565 | if (num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x; | ||
| 566 | if (num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x; | ||
| 567 | if (num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x; | ||
| 568 | if (num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x; | ||
| 569 | if (num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x; | ||
| 570 | if (num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x; | ||
| 571 | if (num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x; | ||
| 572 | if (num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x; | ||
| 573 | if (num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x; | ||
| 574 | if (num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x; | ||
| 575 | if (num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x; | ||
| 576 | if (num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x; | ||
| 577 | |||
| 578 | state.conditional_code[0] = false; | ||
| 579 | state.conditional_code[1] = false; | ||
| 580 | |||
| 581 | ProcessShaderCode(state); | ||
| 582 | #if PICA_DUMP_SHADERS | ||
| 583 | DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(), | ||
| 584 | state.debug.max_opdesc_id, config.main_offset, | ||
| 585 | g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here | ||
| 586 | #endif | ||
| 587 | |||
| 588 | // Setup output data | ||
| 589 | OutputVertex ret; | ||
| 590 | // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to | ||
| 591 | // figure out what those circumstances are and enable the remaining outputs then. | ||
| 592 | for (int i = 0; i < 7; ++i) { | ||
| 593 | const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here | ||
| 594 | |||
| 595 | u32 semantics[4] = { | ||
| 596 | output_register_map.map_x, output_register_map.map_y, | ||
| 597 | output_register_map.map_z, output_register_map.map_w | ||
| 598 | }; | ||
| 599 | |||
| 600 | for (int comp = 0; comp < 4; ++comp) { | ||
| 601 | float24* out = ((float24*)&ret) + semantics[comp]; | ||
| 602 | if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { | ||
| 603 | *out = state.output_registers[i][comp]; | ||
| 604 | } else { | ||
| 605 | // Zero output so that attributes which aren't output won't have denormals in them, | ||
| 606 | // which would slow us down later. | ||
| 607 | memset(out, 0, sizeof(*out)); | ||
| 608 | } | ||
| 609 | } | ||
| 610 | } | ||
| 611 | |||
| 612 | // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation | ||
| 613 | for (int i = 0; i < 4; ++i) { | ||
| 614 | ret.color[i] = float24::FromFloat32( | ||
| 615 | std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); | ||
| 616 | } | ||
| 617 | |||
| 618 | LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", | ||
| 619 | ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), | ||
| 620 | ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), | ||
| 621 | ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); | ||
| 622 | |||
| 623 | return ret; | ||
| 624 | } | ||
| 625 | |||
| 626 | |||
| 627 | } // namespace | 506 | } // namespace |
| 628 | 507 | ||
| 629 | } // namespace | 508 | } // namespace |
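In the interpreter hunk above, the destination register is resolved with an inline ternary chain: indices below 0x10 address the output registers, 0x10-0x1f address the temporaries, and everything else lands in a dummy vector so the write is silently discarded. The sketch below restates that mapping with raw indices and simplified types; the real code goes through DestRegister::GetIndex(), so Vec4f, RegisterFile, and the explicit `- 0x10` here are illustrative assumptions.

```cpp
#include <array>

struct Vec4f { float v[4]; };

struct RegisterFile {
    std::array<Vec4f, 16> output;
    std::array<Vec4f, 16> temporary;
    Vec4f dummy; // swallows writes to destination indices that map to nothing
};

// Mirrors the "(dest < 0x10) ? output : (dest < 0x20) ? temporary : dummy" chain.
inline Vec4f& LookupDest(RegisterFile& regs, unsigned dest_index) {
    if (dest_index < 0x10)
        return regs.output[dest_index];           // o0-o15
    if (dest_index < 0x20)
        return regs.temporary[dest_index - 0x10]; // r0-r15
    return regs.dummy;                            // anything else is ignored
}
```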
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h new file mode 100644 index 000000000..ad6e58e39 --- /dev/null +++ b/src/video_core/shader/shader_interpreter.h | |||
| @@ -0,0 +1,19 @@ | |||
| 1 | // Copyright 2014 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include "video_core/pica.h" | ||
| 8 | |||
| 9 | #include "shader.h" | ||
| 10 | |||
| 11 | namespace Pica { | ||
| 12 | |||
| 13 | namespace Shader { | ||
| 14 | |||
| 15 | void RunInterpreter(UnitState& state); | ||
| 16 | |||
| 17 | } // namespace | ||
| 18 | |||
| 19 | } // namespace | ||
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp new file mode 100644 index 000000000..ce47774d5 --- /dev/null +++ b/src/video_core/shader/shader_jit_x64.cpp | |||
| @@ -0,0 +1,675 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <smmintrin.h> | ||
| 6 | |||
| 7 | #include "common/x64/abi.h" | ||
| 8 | #include "common/x64/cpu_detect.h" | ||
| 9 | #include "common/x64/emitter.h" | ||
| 10 | |||
| 11 | #include "shader.h" | ||
| 12 | #include "shader_jit_x64.h" | ||
| 13 | |||
| 14 | namespace Pica { | ||
| 15 | |||
| 16 | namespace Shader { | ||
| 17 | |||
| 18 | using namespace Gen; | ||
| 19 | |||
| 20 | typedef void (JitCompiler::*JitFunction)(Instruction instr); | ||
| 21 | |||
| 22 | const JitFunction instr_table[64] = { | ||
| 23 | &JitCompiler::Compile_ADD, // add | ||
| 24 | &JitCompiler::Compile_DP3, // dp3 | ||
| 25 | &JitCompiler::Compile_DP4, // dp4 | ||
| 26 | nullptr, // dph | ||
| 27 | nullptr, // unknown | ||
| 28 | nullptr, // ex2 | ||
| 29 | nullptr, // lg2 | ||
| 30 | nullptr, // unknown | ||
| 31 | &JitCompiler::Compile_MUL, // mul | ||
| 32 | nullptr, // lge | ||
| 33 | nullptr, // slt | ||
| 34 | &JitCompiler::Compile_FLR, // flr | ||
| 35 | &JitCompiler::Compile_MAX, // max | ||
| 36 | &JitCompiler::Compile_MIN, // min | ||
| 37 | &JitCompiler::Compile_RCP, // rcp | ||
| 38 | &JitCompiler::Compile_RSQ, // rsq | ||
| 39 | nullptr, // unknown | ||
| 40 | nullptr, // unknown | ||
| 41 | &JitCompiler::Compile_MOVA, // mova | ||
| 42 | &JitCompiler::Compile_MOV, // mov | ||
| 43 | nullptr, // unknown | ||
| 44 | nullptr, // unknown | ||
| 45 | nullptr, // unknown | ||
| 46 | nullptr, // unknown | ||
| 47 | nullptr, // dphi | ||
| 48 | nullptr, // unknown | ||
| 49 | nullptr, // sgei | ||
| 50 | &JitCompiler::Compile_SLTI, // slti | ||
| 51 | nullptr, // unknown | ||
| 52 | nullptr, // unknown | ||
| 53 | nullptr, // unknown | ||
| 54 | nullptr, // unknown | ||
| 55 | nullptr, // unknown | ||
| 56 | &JitCompiler::Compile_NOP, // nop | ||
| 57 | &JitCompiler::Compile_END, // end | ||
| 58 | nullptr, // break | ||
| 59 | &JitCompiler::Compile_CALL, // call | ||
| 60 | &JitCompiler::Compile_CALLC, // callc | ||
| 61 | &JitCompiler::Compile_CALLU, // callu | ||
| 62 | &JitCompiler::Compile_IF, // ifu | ||
| 63 | &JitCompiler::Compile_IF, // ifc | ||
| 64 | &JitCompiler::Compile_LOOP, // loop | ||
| 65 | nullptr, // emit | ||
| 66 | nullptr, // sete | ||
| 67 | &JitCompiler::Compile_JMP, // jmpc | ||
| 68 | &JitCompiler::Compile_JMP, // jmpu | ||
| 69 | &JitCompiler::Compile_CMP, // cmp | ||
| 70 | &JitCompiler::Compile_CMP, // cmp | ||
| 71 | &JitCompiler::Compile_MAD, // madi | ||
| 72 | &JitCompiler::Compile_MAD, // madi | ||
| 73 | &JitCompiler::Compile_MAD, // madi | ||
| 74 | &JitCompiler::Compile_MAD, // madi | ||
| 75 | &JitCompiler::Compile_MAD, // madi | ||
| 76 | &JitCompiler::Compile_MAD, // madi | ||
| 77 | &JitCompiler::Compile_MAD, // madi | ||
| 78 | &JitCompiler::Compile_MAD, // madi | ||
| 79 | &JitCompiler::Compile_MAD, // mad | ||
| 80 | &JitCompiler::Compile_MAD, // mad | ||
| 81 | &JitCompiler::Compile_MAD, // mad | ||
| 82 | &JitCompiler::Compile_MAD, // mad | ||
| 83 | &JitCompiler::Compile_MAD, // mad | ||
| 84 | &JitCompiler::Compile_MAD, // mad | ||
| 85 | &JitCompiler::Compile_MAD, // mad | ||
| 86 | &JitCompiler::Compile_MAD, // mad | ||
| 87 | }; | ||
| 88 | |||
| 89 | // The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can | ||
| 90 | // be used as scratch registers within a compiler function. The other registers have designated | ||
| 91 | // purposes, as documented below: | ||
| 92 | |||
| 93 | /// Pointer to the uniform memory | ||
| 94 | static const X64Reg UNIFORMS = R9; | ||
| 95 | /// The two 32-bit VS address offset registers set by the MOVA instruction | ||
| 96 | static const X64Reg ADDROFFS_REG_0 = R10; | ||
| 97 | static const X64Reg ADDROFFS_REG_1 = R11; | ||
| 98 | /// VS loop count register | ||
| 99 | static const X64Reg LOOPCOUNT_REG = R12; | ||
| 100 | /// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this is quicker) | ||
| 101 | static const X64Reg LOOPCOUNT = RSI; | ||
| 102 | /// Number to increment LOOPCOUNT_REG by on each loop iteration | ||
| 103 | static const X64Reg LOOPINC = RDI; | ||
| 104 | /// Result of the previous CMP instruction for the X-component comparison | ||
| 105 | static const X64Reg COND0 = R13; | ||
| 106 | /// Result of the previous CMP instruction for the Y-component comparison | ||
| 107 | static const X64Reg COND1 = R14; | ||
| 108 | /// Pointer to the UnitState::Registers block for the current VS unit | ||
| 109 | static const X64Reg REGISTERS = R15; | ||
| 110 | /// SIMD scratch register | ||
| 111 | static const X64Reg SCRATCH = XMM0; | ||
| 112 | /// Loaded with the first swizzled source register, otherwise can be used as a scratch register | ||
| 113 | static const X64Reg SRC1 = XMM1; | ||
| 114 | /// Loaded with the second swizzled source register, otherwise can be used as a scratch register | ||
| 115 | static const X64Reg SRC2 = XMM2; | ||
| 116 | /// Loaded with the third swizzled source register, otherwise can be used as a scratch register | ||
| 117 | static const X64Reg SRC3 = XMM3; | ||
| 118 | /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one | ||
| 119 | static const X64Reg ONE = XMM14; | ||
| 120 | /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR | ||
| 121 | static const X64Reg NEGBIT = XMM15; | ||
| 122 | |||
| 123 | /// Raw constant for the source register selector that indicates no swizzling is performed | ||
| 124 | static const u8 NO_SRC_REG_SWIZZLE = 0x1b; | ||
| 125 | /// Raw constant for the destination register enable mask that indicates all components are enabled | ||
| 126 | static const u8 NO_DEST_REG_MASK = 0xf; | ||
| 127 | |||
| 128 | /** | ||
| 129 | * Loads and swizzles a source register into the specified XMM register. | ||
| 130 | * @param instr VS instruction, used for determining how to load the source register | ||
| 131 | * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) | ||
| 132 | * @param src_reg SourceRegister object corresponding to the source register to load | ||
| 133 | * @param dest Destination XMM register to store the loaded, swizzled source register | ||
| 134 | */ | ||
| 135 | void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) { | ||
| 136 | X64Reg src_ptr; | ||
| 137 | int src_offset; | ||
| 138 | |||
| 139 | if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { | ||
| 140 | src_ptr = UNIFORMS; | ||
| 141 | src_offset = src_reg.GetIndex() * sizeof(float24) * 4; | ||
| 142 | } else { | ||
| 143 | src_ptr = REGISTERS; | ||
| 144 | src_offset = UnitState::InputOffset(src_reg); | ||
| 145 | } | ||
| 146 | |||
| 147 | unsigned operand_desc_id; | ||
| 148 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | ||
| 149 | instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 150 | // The MAD and MADI instructions do not use the address offset registers, so loading the | ||
| 151 | // source is a bit simpler here | ||
| 152 | |||
| 153 | operand_desc_id = instr.mad.operand_desc_id; | ||
| 154 | |||
| 155 | // Load the source | ||
| 156 | MOVAPS(dest, MDisp(src_ptr, src_offset)); | ||
| 157 | } else { | ||
| 158 | operand_desc_id = instr.common.operand_desc_id; | ||
| 159 | |||
| 160 | const bool is_inverted = (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); | ||
| 161 | unsigned offset_src = is_inverted ? 2 : 1; | ||
| 162 | |||
| 163 | if (src_num == offset_src && instr.common.address_register_index != 0) { | ||
| 164 | switch (instr.common.address_register_index) { | ||
| 165 | case 1: // address offset 1 | ||
| 166 | MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_0, 1, src_offset)); | ||
| 167 | break; | ||
| 168 | case 2: // address offset 2 | ||
| 169 | MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_1, 1, src_offset)); | ||
| 170 | break; | ||
| 171 | case 3: // address offset 3 | ||
| 172 | MOVAPS(dest, MComplex(src_ptr, LOOPCOUNT_REG, 1, src_offset)); | ||
| 173 | break; | ||
| 174 | default: | ||
| 175 | UNREACHABLE(); | ||
| 176 | break; | ||
| 177 | } | ||
| 178 | } else { | ||
| 179 | // Load the source | ||
| 180 | MOVAPS(dest, MDisp(src_ptr, src_offset)); | ||
| 181 | } | ||
| 182 | } | ||
| 183 | |||
| 184 | SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; | ||
| 185 | |||
| 186 | // Generate instructions for source register swizzling as needed | ||
| 187 | u8 sel = swiz.GetRawSelector(src_num); | ||
| 188 | if (sel != NO_SRC_REG_SWIZZLE) { | ||
| 189 | // Selector component order needs to be reversed for the SHUFPS instruction | ||
| 190 | sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); | ||
| 191 | |||
| 192 | // Shuffle inputs for swizzle | ||
| 193 | SHUFPS(dest, R(dest), sel); | ||
| 194 | } | ||
| 195 | |||
| 196 | // If the source register should be negated, flip the negative bit using XOR | ||
| 197 | const bool negate[] = { swiz.negate_src1, swiz.negate_src2, swiz.negate_src3 }; | ||
| 198 | if (negate[src_num - 1]) { | ||
| 199 | XORPS(dest, R(NEGBIT)); | ||
| 200 | } | ||
| 201 | } | ||
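Compile_SwizzleSrc() has to reorder the swizzle selector because, judging from NO_SRC_REG_SWIZZLE above, the PICA selector stores the x-component choice in its top two bits while the SHUFPS immediate expects it in the bottom two, so the four 2-bit fields are mirrored. The helper below restates that bit shuffle and checks the identity case at compile time.

```cpp
#include <cstdint>

// Same field reversal as in Compile_SwizzleSrc(); written out here only to make
// the mapping easy to verify.
constexpr std::uint8_t ToShufpsImm(std::uint8_t sel) {
    return static_cast<std::uint8_t>(((sel & 0xc0) >> 6) | ((sel & 0x03) << 6) |
                                     ((sel & 0x0c) << 2) | ((sel & 0x30) >> 2));
}

// 0x1b = 00 01 10 11 picks x,y,z,w in PICA order (NO_SRC_REG_SWIZZLE); mirroring
// the fields gives 0xe4 = 11 10 01 00, the SHUFPS identity shuffle.
static_assert(ToShufpsImm(0x1b) == 0xe4, "identity swizzle maps to identity shuffle");
```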
| 202 | |||
| 203 | void JitCompiler::Compile_DestEnable(Instruction instr, X64Reg src) { | ||
| 204 | DestRegister dest; | ||
| 205 | unsigned operand_desc_id; | ||
| 206 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | ||
| 207 | instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 208 | operand_desc_id = instr.mad.operand_desc_id; | ||
| 209 | dest = instr.mad.dest.Value(); | ||
| 210 | } else { | ||
| 211 | operand_desc_id = instr.common.operand_desc_id; | ||
| 212 | dest = instr.common.dest.Value(); | ||
| 213 | } | ||
| 214 | |||
| 215 | SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; | ||
| 216 | |||
| 217 | // If all components are enabled, write the result to the destination register | ||
| 218 | if (swiz.dest_mask == NO_DEST_REG_MASK) { | ||
| 219 | // Store dest back to memory | ||
| 220 | MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), src); | ||
| 221 | |||
| 222 | } else { | ||
| 223 | // Not all components are enabled, so mask the result when storing to the destination register... | ||
| 224 | MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState::OutputOffset(dest))); | ||
| 225 | |||
| 226 | if (Common::GetCPUCaps().sse4_1) { | ||
| 227 | u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); | ||
| 228 | BLENDPS(SCRATCH, R(src), mask); | ||
| 229 | } else { | ||
| 230 | MOVAPS(XMM4, R(src)); | ||
| 231 | UNPCKHPS(XMM4, R(SCRATCH)); // Interleave the Z/W components of source and destination | ||
| 232 | UNPCKLPS(SCRATCH, R(src)); // Interleave the X/Y components of source and destination | ||
| 233 | |||
| 234 | // Compute selector to selectively copy source components to destination for SHUFPS instruction | ||
| 235 | u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | | ||
| 236 | ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | | ||
| 237 | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | | ||
| 238 | ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); | ||
| 239 | SHUFPS(SCRATCH, R(XMM4), sel); | ||
| 240 | } | ||
| 241 | |||
| 242 | // Store dest back to memory | ||
| 243 | MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), SCRATCH); | ||
| 244 | } | ||
| 245 | } | ||
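The SSE4.1 path of Compile_DestEnable() converts the destination write mask into a BLENDPS immediate. The dest_mask appears to keep the x enable in the most significant of its four bits (compare SwizzlePattern::DestComponentEnabled), whereas BLENDPS wants x in bit 0, so the four bits are mirrored; the non-SSE4.1 path reaches the same result with the UNPCK/SHUFPS sequence. The helper below restates that mask conversion under that bit-order assumption.

```cpp
#include <cstdint>

// Same bit mirroring as the SSE4.1 branch of Compile_DestEnable().
constexpr std::uint8_t ToBlendpsMask(std::uint8_t dest_mask) {
    return static_cast<std::uint8_t>(((dest_mask & 0x1) << 3) | ((dest_mask & 0x8) >> 3) |
                                     ((dest_mask & 0x2) << 1) | ((dest_mask & 0x4) >> 1));
}

static_assert(ToBlendpsMask(0x8) == 0x1, "an x-only write mask selects SSE lane 0");
static_assert(ToBlendpsMask(0xf) == 0xf, "a full write mask is unchanged");
```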
| 246 | |||
| 247 | void JitCompiler::Compile_EvaluateCondition(Instruction instr) { | ||
| 248 | // Note: NXOR is used below to check for equality | ||
| 249 | switch (instr.flow_control.op) { | ||
| 250 | case Instruction::FlowControlType::Or: | ||
| 251 | MOV(32, R(RAX), R(COND0)); | ||
| 252 | MOV(32, R(RBX), R(COND1)); | ||
| 253 | XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); | ||
| 254 | XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); | ||
| 255 | OR(32, R(RAX), R(RBX)); | ||
| 256 | break; | ||
| 257 | |||
| 258 | case Instruction::FlowControlType::And: | ||
| 259 | MOV(32, R(RAX), R(COND0)); | ||
| 260 | MOV(32, R(RBX), R(COND1)); | ||
| 261 | XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); | ||
| 262 | XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); | ||
| 263 | AND(32, R(RAX), R(RBX)); | ||
| 264 | break; | ||
| 265 | |||
| 266 | case Instruction::FlowControlType::JustX: | ||
| 267 | MOV(32, R(RAX), R(COND0)); | ||
| 268 | XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); | ||
| 269 | break; | ||
| 270 | |||
| 271 | case Instruction::FlowControlType::JustY: | ||
| 272 | MOV(32, R(RAX), R(COND1)); | ||
| 273 | XOR(32, R(RAX), Imm32(instr.flow_control.refy.Value() ^ 1)); | ||
| 274 | break; | ||
| 275 | } | ||
| 276 | } | ||
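The "NXOR" note in Compile_EvaluateCondition() refers to the fact that, for 0/1 values, cond XOR (ref XOR 1) is 1 exactly when cond equals ref; XOR-ing each condition register with the pre-inverted reference bit therefore doubles as an equality test whose results can then be OR'd or AND'd together and checked against zero. A tiny sketch that verifies the identity:

```cpp
#include <cassert>
#include <cstdint>

// 1 when cond == ref, 0 otherwise, assuming both inputs are already 0 or 1.
inline std::uint32_t EqualsRef(std::uint32_t cond, std::uint32_t ref) {
    return cond ^ (ref ^ 1u);
}

int main() {
    for (std::uint32_t cond : {0u, 1u})
        for (std::uint32_t ref : {0u, 1u})
            assert(EqualsRef(cond, ref) == (cond == ref ? 1u : 0u));
    return 0;
}
```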
| 277 | |||
| 278 | void JitCompiler::Compile_UniformCondition(Instruction instr) { | ||
| 279 | int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool)); | ||
| 280 | CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); | ||
| 281 | } | ||
| 282 | |||
| 283 | void JitCompiler::Compile_ADD(Instruction instr) { | ||
| 284 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 285 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 286 | ADDPS(SRC1, R(SRC2)); | ||
| 287 | Compile_DestEnable(instr, SRC1); | ||
| 288 | } | ||
| 289 | |||
| 290 | void JitCompiler::Compile_DP3(Instruction instr) { | ||
| 291 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 292 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 293 | |||
| 294 | if (Common::GetCPUCaps().sse4_1) { | ||
| 295 | DPPS(SRC1, R(SRC2), 0x7f); | ||
| 296 | } else { | ||
| 297 | MULPS(SRC1, R(SRC2)); | ||
| 298 | |||
| 299 | MOVAPS(SRC2, R(SRC1)); | ||
| 300 | SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); | ||
| 301 | |||
| 302 | MOVAPS(SRC3, R(SRC1)); | ||
| 303 | SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); | ||
| 304 | |||
| 305 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); | ||
| 306 | ADDPS(SRC1, R(SRC2)); | ||
| 307 | ADDPS(SRC1, R(SRC3)); | ||
| 308 | } | ||
| 309 | |||
| 310 | Compile_DestEnable(instr, SRC1); | ||
| 311 | } | ||
| 312 | |||
| 313 | void JitCompiler::Compile_DP4(Instruction instr) { | ||
| 314 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 315 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 316 | |||
| 317 | if (Common::GetCPUCaps().sse4_1) { | ||
| 318 | DPPS(SRC1, R(SRC2), 0xff); | ||
| 319 | } else { | ||
| 320 | MULPS(SRC1, R(SRC2)); | ||
| 321 | |||
| 322 | MOVAPS(SRC2, R(SRC1)); | ||
| 323 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY | ||
| 324 | ADDPS(SRC1, R(SRC2)); | ||
| 325 | |||
| 326 | MOVAPS(SRC2, R(SRC1)); | ||
| 327 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX | ||
| 328 | ADDPS(SRC1, R(SRC2)); | ||
| 329 | } | ||
| 330 | |||
| 331 | Compile_DestEnable(instr, SRC1); | ||
| 332 | } | ||
| 333 | |||
| 334 | void JitCompiler::Compile_MUL(Instruction instr) { | ||
| 335 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 336 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 337 | MULPS(SRC1, R(SRC2)); | ||
| 338 | Compile_DestEnable(instr, SRC1); | ||
| 339 | } | ||
| 340 | |||
| 341 | void JitCompiler::Compile_FLR(Instruction instr) { | ||
| 342 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 343 | |||
| 344 | if (Common::GetCPUCaps().sse4_1) { | ||
| 345 | ROUNDFLOORPS(SRC1, R(SRC1)); | ||
| 346 | } else { | ||
| 347 | CVTPS2DQ(SRC1, R(SRC1)); | ||
| 348 | CVTDQ2PS(SRC1, R(SRC1)); | ||
| 349 | } | ||
| 350 | |||
| 351 | Compile_DestEnable(instr, SRC1); | ||
| 352 | } | ||
| 353 | |||
| 354 | void JitCompiler::Compile_MAX(Instruction instr) { | ||
| 355 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 356 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 357 | MAXPS(SRC1, R(SRC2)); | ||
| 358 | Compile_DestEnable(instr, SRC1); | ||
| 359 | } | ||
| 360 | |||
| 361 | void JitCompiler::Compile_MIN(Instruction instr) { | ||
| 362 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 363 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 364 | MINPS(SRC1, R(SRC2)); | ||
| 365 | Compile_DestEnable(instr, SRC1); | ||
| 366 | } | ||
| 367 | |||
| 368 | void JitCompiler::Compile_MOVA(Instruction instr) { | ||
| 369 | SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] }; | ||
| 370 | |||
| 371 | if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { | ||
| 372 | return; // NoOp | ||
| 373 | } | ||
| 374 | |||
| 375 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 376 | |||
| 377 | // Convert floats to integers (only care about X and Y components) | ||
| 378 | CVTPS2DQ(SRC1, R(SRC1)); | ||
| 379 | |||
| 380 | // Get result | ||
| 381 | MOVQ_xmm(R(RAX), SRC1); | ||
| 382 | |||
| 383 | // Handle destination enable | ||
| 384 | if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { | ||
| 385 | // Move and sign-extend low 32 bits | ||
| 386 | MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); | ||
| 387 | |||
| 388 | // Move and sign-extend high 32 bits | ||
| 389 | SHR(64, R(RAX), Imm8(32)); | ||
| 390 | MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); | ||
| 391 | |||
| 392 | // Multiply by 16 to be used as an offset later | ||
| 393 | SHL(64, R(ADDROFFS_REG_0), Imm8(4)); | ||
| 394 | SHL(64, R(ADDROFFS_REG_1), Imm8(4)); | ||
| 395 | } else { | ||
| 396 | if (swiz.DestComponentEnabled(0)) { | ||
| 397 | // Move and sign-extend low 32 bits | ||
| 398 | MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); | ||
| 399 | |||
| 400 | // Multiply by 16 to be used as an offset later | ||
| 401 | SHL(64, R(ADDROFFS_REG_0), Imm8(4)); | ||
| 402 | } else if (swiz.DestComponentEnabled(1)) { | ||
| 403 | // Move and sign-extend high 32 bits | ||
| 404 | SHR(64, R(RAX), Imm8(32)); | ||
| 405 | MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); | ||
| 406 | |||
| 407 | // Multiply by 16 to be used as an offset later | ||
| 408 | SHL(64, R(ADDROFFS_REG_1), Imm8(4)); | ||
| 409 | } | ||
| 410 | } | ||
| 411 | } | ||
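After CVTPS2DQ in Compile_MOVA(), the x and y address offsets sit in the low and high 32-bit halves of a single 64-bit value; each half is sign-extended and then scaled by 16, since a register slot is 16 bytes wide, so the result can later serve directly as a byte displacement for MComplex() in Compile_SwizzleSrc(). The sketch below mirrors that unpacking (using a multiply instead of the JIT's SHL by 4 so negative offsets stay well-defined in portable C++); the packed layout is an assumption read off the MOVQ/SHR sequence above.

```cpp
#include <cassert>
#include <cstdint>

struct AddressOffsets { std::int64_t a0_bytes, a1_bytes; };

inline AddressOffsets UnpackMova(std::uint64_t packed_xy) {
    auto x = static_cast<std::int32_t>(packed_xy & 0xffffffffu); // low lane: a0
    auto y = static_cast<std::int32_t>(packed_xy >> 32);         // high lane: a1
    return { static_cast<std::int64_t>(x) * 16,                  // index -> byte offset
             static_cast<std::int64_t>(y) * 16 };
}

int main() {
    // x = 3, y = -2 packed as two 32-bit lanes.
    const std::uint64_t packed =
        (static_cast<std::uint64_t>(static_cast<std::uint32_t>(-2)) << 32) | 3u;
    const AddressOffsets offs = UnpackMova(packed);
    assert(offs.a0_bytes == 3 * 16);
    assert(offs.a1_bytes == -2 * 16);
    return 0;
}
```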
| 412 | |||
| 413 | void JitCompiler::Compile_MOV(Instruction instr) { | ||
| 414 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 415 | Compile_DestEnable(instr, SRC1); | ||
| 416 | } | ||
| 417 | |||
| 418 | void JitCompiler::Compile_SLTI(Instruction instr) { | ||
| 419 | Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||
| 420 | Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||
| 421 | |||
| 422 | CMPPS(SRC1, R(SRC2), CMP_LT); | ||
| 423 | ANDPS(SRC1, R(ONE)); | ||
| 424 | |||
| 425 | Compile_DestEnable(instr, SRC1); | ||
| 426 | } | ||
| 427 | |||
| 428 | void JitCompiler::Compile_RCP(Instruction instr) { | ||
| 429 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 430 | |||
| 431 | // TODO(bunnei): RCPPS is a pretty rough approximation, this might cause problems if Pica | ||
| 432 | // performs this operation more accurately. This should be checked on hardware. | ||
| 433 | RCPPS(SRC1, R(SRC1)); | ||
| 434 | |||
| 435 | Compile_DestEnable(instr, SRC1); | ||
| 436 | } | ||
| 437 | |||
| 438 | void JitCompiler::Compile_RSQ(Instruction instr) { | ||
| 439 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 440 | |||
| 441 | // TODO(bunnei): RSQRTPS is a pretty rough approximation, this might cause problems if Pica | ||
| 442 | // performs this operation more accurately. This should be checked on hardware. | ||
| 443 | RSQRTPS(SRC1, R(SRC1)); | ||
| 444 | |||
| 445 | Compile_DestEnable(instr, SRC1); | ||
| 446 | } | ||
| 447 | |||
| 448 | void JitCompiler::Compile_NOP(Instruction instr) { | ||
| 449 | } | ||
| 450 | |||
| 451 | void JitCompiler::Compile_END(Instruction instr) { | ||
| 452 | ABI_PopAllCalleeSavedRegsAndAdjustStack(); | ||
| 453 | RET(); | ||
| 454 | } | ||
| 455 | |||
| 456 | void JitCompiler::Compile_CALL(Instruction instr) { | ||
| 457 | unsigned offset = instr.flow_control.dest_offset; | ||
| 458 | while (offset < (instr.flow_control.dest_offset + instr.flow_control.num_instructions)) { | ||
| 459 | Compile_NextInstr(&offset); | ||
| 460 | } | ||
| 461 | } | ||
| 462 | |||
| 463 | void JitCompiler::Compile_CALLC(Instruction instr) { | ||
| 464 | Compile_EvaluateCondition(instr); | ||
| 465 | FixupBranch b = J_CC(CC_Z, true); | ||
| 466 | Compile_CALL(instr); | ||
| 467 | SetJumpTarget(b); | ||
| 468 | } | ||
| 469 | |||
| 470 | void JitCompiler::Compile_CALLU(Instruction instr) { | ||
| 471 | Compile_UniformCondition(instr); | ||
| 472 | FixupBranch b = J_CC(CC_Z, true); | ||
| 473 | Compile_CALL(instr); | ||
| 474 | SetJumpTarget(b); | ||
| 475 | } | ||
| 476 | |||
| 477 | void JitCompiler::Compile_CMP(Instruction instr) { | ||
| 478 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 479 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 480 | |||
| 481 | static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; | ||
| 482 | |||
| 483 | if (instr.common.compare_op.x == instr.common.compare_op.y) { | ||
| 484 | // Compare X-component and Y-component together | ||
| 485 | CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); | ||
| 486 | |||
| 487 | MOVQ_xmm(R(COND0), SRC1); | ||
| 488 | MOV(64, R(COND1), R(COND0)); | ||
| 489 | } else { | ||
| 490 | // Compare X-component | ||
| 491 | MOVAPS(SCRATCH, R(SRC1)); | ||
| 492 | CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); | ||
| 493 | |||
| 494 | // Compare Y-component | ||
| 495 | CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); | ||
| 496 | |||
| 497 | MOVQ_xmm(R(COND0), SCRATCH); | ||
| 498 | MOVQ_xmm(R(COND1), SRC1); | ||
| 499 | } | ||
| 500 | |||
| 501 | SHR(32, R(COND0), Imm8(31)); | ||
| 502 | SHR(64, R(COND1), Imm8(63)); | ||
| 503 | } | ||
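Each lane produced by CMPPS/CMPSS is either all ones or all zeros, so Compile_CMP() only needs the sign bit: after moving the low two lanes into a general-purpose register, a 32-bit shift by 31 isolates the x result and a 64-bit shift by 63 isolates the y result, leaving plain 0/1 values in COND0/COND1. A small sketch of that extraction step:

```cpp
#include <cassert>
#include <cstdint>

// low_two_lanes holds the x result in bits 0-31 and the y result in bits 32-63,
// each lane being all ones (true) or all zeros (false), as MOVQ_xmm sees it.
inline void ExtractConditions(std::uint64_t low_two_lanes, bool& cond_x, bool& cond_y) {
    const auto lane_x = static_cast<std::uint32_t>(low_two_lanes);
    cond_x = (lane_x >> 31) != 0;        // SHR(32, COND0, Imm8(31))
    cond_y = (low_two_lanes >> 63) != 0; // SHR(64, COND1, Imm8(63))
}

int main() {
    bool x = false, y = false;
    ExtractConditions(0x00000000ffffffffULL, x, y); // x true, y false
    assert(x && !y);
    ExtractConditions(0xffffffff00000000ULL, x, y); // x false, y true
    assert(!x && y);
    return 0;
}
```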
| 504 | |||
| 505 | void JitCompiler::Compile_MAD(Instruction instr) { | ||
| 506 | Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); | ||
| 507 | |||
| 508 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 509 | Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); | ||
| 510 | Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); | ||
| 511 | } else { | ||
| 512 | Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); | ||
| 513 | Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); | ||
| 514 | } | ||
| 515 | |||
| 516 | if (Common::GetCPUCaps().fma) { | ||
| 517 | VFMADD213PS(SRC1, SRC2, R(SRC3)); | ||
| 518 | } else { | ||
| 519 | MULPS(SRC1, R(SRC2)); | ||
| 520 | ADDPS(SRC1, R(SRC3)); | ||
| 521 | } | ||
| 522 | |||
| 523 | Compile_DestEnable(instr, SRC1); | ||
| 524 | } | ||
| 525 | |||
| 526 | void JitCompiler::Compile_IF(Instruction instr) { | ||
| 527 | ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements not supported"); | ||
| 528 | |||
| 529 | // Evaluate the "IF" condition | ||
| 530 | if (instr.opcode.Value() == OpCode::Id::IFU) { | ||
| 531 | Compile_UniformCondition(instr); | ||
| 532 | } else if (instr.opcode.Value() == OpCode::Id::IFC) { | ||
| 533 | Compile_EvaluateCondition(instr); | ||
| 534 | } | ||
| 535 | FixupBranch b = J_CC(CC_Z, true); | ||
| 536 | |||
| 537 | // Compile the code that corresponds to the condition evaluating as true | ||
| 538 | Compile_Block(instr.flow_control.dest_offset - 1); | ||
| 539 | |||
| 540 | // If there isn't an "ELSE" condition, we are done here | ||
| 541 | if (instr.flow_control.num_instructions == 0) { | ||
| 542 | SetJumpTarget(b); | ||
| 543 | return; | ||
| 544 | } | ||
| 545 | |||
| 546 | FixupBranch b2 = J(true); | ||
| 547 | |||
| 548 | SetJumpTarget(b); | ||
| 549 | |||
| 550 | // This code corresponds to the "ELSE" block | ||
| 551 | // Compile the code that corresponds to the condition evaluating as false | ||
| 552 | Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions - 1); | ||
| 553 | |||
| 554 | SetJumpTarget(b2); | ||
| 555 | } | ||
| 556 | |||
| 557 | void JitCompiler::Compile_LOOP(Instruction instr) { | ||
| 558 | ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops not supported"); | ||
| 559 | ASSERT_MSG(!looping, "Nested loops not supported"); | ||
| 560 | |||
| 561 | looping = true; | ||
| 562 | |||
| 563 | int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>)); | ||
| 564 | MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset)); | ||
| 565 | MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT)); | ||
| 566 | SHR(32, R(LOOPCOUNT_REG), Imm8(8)); | ||
| 567 | AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start | ||
| 568 | MOV(32, R(LOOPINC), R(LOOPCOUNT)); | ||
| 569 | SHR(32, R(LOOPINC), Imm8(16)); | ||
| 570 | MOVZX(32, 8, LOOPINC, R(LOOPINC)); // Z-component is the incrementer | ||
| 571 | MOVZX(32, 8, LOOPCOUNT, R(LOOPCOUNT)); // X-component is iteration count | ||
| 572 | ADD(32, R(LOOPCOUNT), Imm8(1)); // Iteration count is X-component + 1 | ||
| 573 | |||
| 574 | auto loop_start = GetCodePtr(); | ||
| 575 | |||
| 576 | Compile_Block(instr.flow_control.dest_offset); | ||
| 577 | |||
| 578 | ADD(32, R(LOOPCOUNT_REG), R(LOOPINC)); // Increment LOOPCOUNT_REG by Z-component | ||
| 579 | SUB(32, R(LOOPCOUNT), Imm8(1)); // Decrement the iteration count | ||
| 580 | J_CC(CC_NZ, loop_start); // Loop while the count is not zero | ||
| 581 | |||
| 582 | looping = false; | ||
| 583 | } | ||
| 584 | |||
| 585 | void JitCompiler::Compile_JMP(Instruction instr) { | ||
| 586 | ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps not supported"); | ||
| 587 | |||
| 588 | if (instr.opcode.Value() == OpCode::Id::JMPC) | ||
| 589 | Compile_EvaluateCondition(instr); | ||
| 590 | else if (instr.opcode.Value() == OpCode::Id::JMPU) | ||
| 591 | Compile_UniformCondition(instr); | ||
| 592 | else | ||
| 593 | UNREACHABLE(); | ||
| 594 | |||
| 595 | FixupBranch b = J_CC(CC_NZ, true); | ||
| 596 | |||
| 597 | Compile_Block(instr.flow_control.dest_offset); | ||
| 598 | |||
| 599 | SetJumpTarget(b); | ||
| 600 | } | ||
| 601 | |||
| 602 | void JitCompiler::Compile_Block(unsigned stop) { | ||
| 603 | // Save current offset pointer | ||
| 604 | unsigned* prev_offset_ptr = offset_ptr; | ||
| 605 | unsigned offset = *prev_offset_ptr; | ||
| 606 | |||
| 607 | while (offset <= stop) | ||
| 608 | Compile_NextInstr(&offset); | ||
| 609 | |||
| 610 | // Restore current offset pointer | ||
| 611 | offset_ptr = prev_offset_ptr; | ||
| 612 | *offset_ptr = offset; | ||
| 613 | } | ||
| 614 | |||
| 615 | void JitCompiler::Compile_NextInstr(unsigned* offset) { | ||
| 616 | offset_ptr = offset; | ||
| 617 | |||
| 618 | Instruction instr = *(Instruction*)&g_state.vs.program_code[(*offset_ptr)++]; | ||
| 619 | OpCode::Id opcode = instr.opcode.Value(); | ||
| 620 | auto instr_func = instr_table[static_cast<unsigned>(opcode)]; | ||
| 621 | |||
| 622 | if (instr_func) { | ||
| 623 | // JIT the instruction! | ||
| 624 | ((*this).*instr_func)(instr); | ||
| 625 | } else { | ||
| 626 | // Unhandled instruction | ||
| 627 | LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", instr.opcode.Value(), instr.hex); | ||
| 628 | } | ||
| 629 | } | ||
| 630 | |||
| 631 | CompiledShader* JitCompiler::Compile() { | ||
| 632 | const u8* start = GetCodePtr(); | ||
| 633 | const auto& code = g_state.vs.program_code; | ||
| 634 | unsigned offset = g_state.regs.vs.main_offset; | ||
| 635 | |||
| 636 | ABI_PushAllCalleeSavedRegsAndAdjustStack(); | ||
| 637 | |||
| 638 | MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1)); | ||
| 639 | MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms)); | ||
| 640 | |||
| 641 | // Zero address/loop registers | ||
| 642 | XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0)); | ||
| 643 | XOR(64, R(ADDROFFS_REG_1), R(ADDROFFS_REG_1)); | ||
| 644 | XOR(64, R(LOOPCOUNT_REG), R(LOOPCOUNT_REG)); | ||
| 645 | |||
| 646 | // Used to set a register to one | ||
| 647 | static const __m128 one = { 1.f, 1.f, 1.f, 1.f }; | ||
| 648 | MOV(PTRBITS, R(RAX), ImmPtr(&one)); | ||
| 649 | MOVAPS(ONE, MDisp(RAX, 0)); | ||
| 650 | |||
| 651 | // Used to negate registers | ||
| 652 | static const __m128 neg = { -0.f, -0.f, -0.f, -0.f }; | ||
| 653 | MOV(PTRBITS, R(RAX), ImmPtr(&neg)); | ||
| 654 | MOVAPS(NEGBIT, MDisp(RAX, 0)); | ||
| 655 | |||
| 656 | looping = false; | ||
| 657 | |||
| 658 | while (offset < code.size()) { | ||
| 659 | Compile_NextInstr(&offset); | ||
| 660 | } | ||
| 661 | |||
| 662 | return (CompiledShader*)start; | ||
| 663 | } | ||
| 664 | |||
| 665 | JitCompiler::JitCompiler() { | ||
| 666 | AllocCodeSpace(1024 * 1024 * 4); // Reserve 4 MiB for emitted code | ||
| 667 | } | ||
| 668 | |||
| 669 | void JitCompiler::Clear() { | ||
| 670 | ClearCodeSpace(); | ||
| 671 | } | ||
| 672 | |||
| 673 | } // namespace Shader | ||
| 674 | |||
| 675 | } // namespace Pica | ||
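
As a plain-C++ reading of the register setup emitted by Compile_LOOP above: the integer uniform selected by int_uniform_id packs the iteration count, the start value, and the per-iteration increment into its X, Y and Z bytes. A minimal sketch of that decoding, with an assumed IntUniform struct standing in for Math::Vec4<u8> (illustrative only, not code from this commit):

    // Mirrors the shifts/masks emitted in Compile_LOOP.
    struct IntUniform { unsigned char x, y, z, w; };  // stands in for Math::Vec4<u8>

    void DecodeLoopUniform(const IntUniform& u, unsigned& count, unsigned& start, unsigned& increment) {
        start     = u.y;        // Y-component: initial value of the loop register (LOOPCOUNT_REG)
        increment = u.z;        // Z-component: added to the loop register after each iteration (LOOPINC)
        count     = u.x + 1u;   // X-component + 1: number of iterations (LOOPCOUNT)
    }
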
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h new file mode 100644 index 000000000..b88f2a0d2 --- /dev/null +++ b/src/video_core/shader/shader_jit_x64.h | |||
| @@ -0,0 +1,79 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <nihstro/shader_bytecode.h> | ||
| 8 | |||
| 9 | #include "common/x64/emitter.h" | ||
| 10 | |||
| 11 | #include "video_core/pica.h" | ||
| 12 | |||
| 13 | #include "shader.h" | ||
| 14 | |||
| 15 | using nihstro::Instruction; | ||
| 16 | using nihstro::OpCode; | ||
| 17 | using nihstro::SwizzlePattern; | ||
| 18 | |||
| 19 | namespace Pica { | ||
| 20 | |||
| 21 | namespace Shader { | ||
| 22 | |||
| 23 | using CompiledShader = void(void* registers); | ||
| 24 | |||
| 25 | /** | ||
| 26 | * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 | ||
| 27 | * code that can be executed on the host machine directly. | ||
| 28 | */ | ||
| 29 | class JitCompiler : public Gen::XCodeBlock { | ||
| 30 | public: | ||
| 31 | JitCompiler(); | ||
| 32 | |||
| 33 | CompiledShader* Compile(); | ||
| 34 | |||
| 35 | void Clear(); | ||
| 36 | |||
| 37 | void Compile_ADD(Instruction instr); | ||
| 38 | void Compile_DP3(Instruction instr); | ||
| 39 | void Compile_DP4(Instruction instr); | ||
| 40 | void Compile_MUL(Instruction instr); | ||
| 41 | void Compile_FLR(Instruction instr); | ||
| 42 | void Compile_MAX(Instruction instr); | ||
| 43 | void Compile_MIN(Instruction instr); | ||
| 44 | void Compile_RCP(Instruction instr); | ||
| 45 | void Compile_RSQ(Instruction instr); | ||
| 46 | void Compile_MOVA(Instruction instr); | ||
| 47 | void Compile_MOV(Instruction instr); | ||
| 48 | void Compile_SLTI(Instruction instr); | ||
| 49 | void Compile_NOP(Instruction instr); | ||
| 50 | void Compile_END(Instruction instr); | ||
| 51 | void Compile_CALL(Instruction instr); | ||
| 52 | void Compile_CALLC(Instruction instr); | ||
| 53 | void Compile_CALLU(Instruction instr); | ||
| 54 | void Compile_IF(Instruction instr); | ||
| 55 | void Compile_LOOP(Instruction instr); | ||
| 56 | void Compile_JMP(Instruction instr); | ||
| 57 | void Compile_CMP(Instruction instr); | ||
| 58 | void Compile_MAD(Instruction instr); | ||
| 59 | |||
| 60 | private: | ||
| 61 | void Compile_Block(unsigned stop); | ||
| 62 | void Compile_NextInstr(unsigned* offset); | ||
| 63 | |||
| 64 | void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); | ||
| 65 | void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); | ||
| 66 | |||
| 67 | void Compile_EvaluateCondition(Instruction instr); | ||
| 68 | void Compile_UniformCondition(Instruction instr); | ||
| 69 | |||
| 70 | /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks. | ||
| 71 | unsigned* offset_ptr = nullptr; | ||
| 72 | |||
| 73 | /// Set to true if currently in a loop, used to check for the existence of nested loops | ||
| 74 | bool looping = false; | ||
| 75 | }; | ||
| 76 | |||
| 77 | } // namespace Shader | ||
| 78 | |||
| 79 | } // namespace Pica | ||
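
Given the interface above, the JIT is driven by compiling the current vertex shader program once and then calling the returned entry point per invocation; CompiledShader takes the register block as its only argument (it arrives in ABI_PARAM1, which Compile() moves into REGISTERS). A hedged sketch of such a caller, where the function and variable names are assumptions rather than part of this change:

    #include "video_core/shader/shader_jit_x64.h"

    // Hypothetical call site, for illustration only.
    static Pica::Shader::JitCompiler g_jit;
    static Pica::Shader::CompiledShader* g_entry = nullptr;

    void RunJittedShader(void* register_block) {
        if (g_entry == nullptr)
            g_entry = g_jit.Compile();   // emit x86_64 code for the current program and keep the entry point
        g_entry(register_block);         // the generated code reads/writes the register block passed here
    }
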
diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h deleted file mode 100644 index 97f9250dd..000000000 --- a/src/video_core/vertex_shader.h +++ /dev/null | |||
| @@ -1,73 +0,0 @@ | |||
| 1 | // Copyright 2014 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <type_traits> | ||
| 8 | |||
| 9 | #include "common/vector_math.h" | ||
| 10 | |||
| 11 | #include "pica.h" | ||
| 12 | |||
| 13 | namespace Pica { | ||
| 14 | |||
| 15 | namespace VertexShader { | ||
| 16 | |||
| 17 | struct InputVertex { | ||
| 18 | Math::Vec4<float24> attr[16]; | ||
| 19 | }; | ||
| 20 | |||
| 21 | struct OutputVertex { | ||
| 22 | OutputVertex() = default; | ||
| 23 | |||
| 24 | // VS output attributes | ||
| 25 | Math::Vec4<float24> pos; | ||
| 26 | Math::Vec4<float24> dummy; // quaternions (not implemented, yet) | ||
| 27 | Math::Vec4<float24> color; | ||
| 28 | Math::Vec2<float24> tc0; | ||
| 29 | Math::Vec2<float24> tc1; | ||
| 30 | float24 pad[6]; | ||
| 31 | Math::Vec2<float24> tc2; | ||
| 32 | |||
| 33 | // Padding for optimal alignment | ||
| 34 | float24 pad2[4]; | ||
| 35 | |||
| 36 | // Attributes used to store intermediate results | ||
| 37 | |||
| 38 | // position after perspective divide | ||
| 39 | Math::Vec3<float24> screenpos; | ||
| 40 | float24 pad3; | ||
| 41 | |||
| 42 | // Linear interpolation | ||
| 43 | // factor: 0=this, 1=vtx | ||
| 44 | void Lerp(float24 factor, const OutputVertex& vtx) { | ||
| 45 | pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); | ||
| 46 | |||
| 47 | // TODO: Should perform perspective correct interpolation here... | ||
| 48 | tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); | ||
| 49 | tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); | ||
| 50 | tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); | ||
| 51 | |||
| 52 | screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); | ||
| 53 | |||
| 54 | color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); | ||
| 55 | } | ||
| 56 | |||
| 57 | // Linear interpolation | ||
| 58 | // factor: 0=v0, 1=v1 | ||
| 59 | static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { | ||
| 60 | OutputVertex ret = v0; | ||
| 61 | ret.Lerp(factor, v1); | ||
| 62 | return ret; | ||
| 63 | } | ||
| 64 | }; | ||
| 65 | static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); | ||
| 66 | static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); | ||
| 67 | |||
| 68 | OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup); | ||
| 69 | |||
| 70 | } // namespace | ||
| 71 | |||
| 72 | } // namespace | ||
| 73 | |||
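
The removed OutputVertex::Lerp is a standard linear blend: per component it computes a * factor + b * (1 - factor), so factor == 1 keeps the first operand and factor == 0 keeps the second. A scalar sketch using plain float in place of float24 (illustrative, not code from the tree):

    // Scalar equivalent of the per-component blend in OutputVertex::Lerp.
    float Lerp(float factor, float a, float b) {
        return a * factor + b * (1.0f - factor);
    }
    // Example: Lerp(0.25f, 10.0f, 20.0f) == 10*0.25 + 20*0.75 == 17.5
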
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index 3becc4261..943fde5ee 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp | |||
| @@ -23,6 +23,7 @@ EmuWindow* g_emu_window = nullptr; ///< Frontend emulator window | |||
| 23 | RendererBase* g_renderer = nullptr; ///< Renderer plugin | 23 | RendererBase* g_renderer = nullptr; ///< Renderer plugin |
| 24 | 24 | ||
| 25 | std::atomic<bool> g_hw_renderer_enabled; | 25 | std::atomic<bool> g_hw_renderer_enabled; |
| 26 | std::atomic<bool> g_shader_jit_enabled; | ||
| 26 | 27 | ||
| 27 | /// Initialize the video core | 28 | /// Initialize the video core |
| 28 | void Init(EmuWindow* emu_window) { | 29 | void Init(EmuWindow* emu_window) { |
diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h index 14b33c9dd..2867bf03e 100644 --- a/src/video_core/video_core.h +++ b/src/video_core/video_core.h | |||
| @@ -32,8 +32,9 @@ static const int kScreenBottomHeight = 240; ///< 3DS bottom screen height | |||
| 32 | extern RendererBase* g_renderer; ///< Renderer plugin | 32 | extern RendererBase* g_renderer; ///< Renderer plugin |
| 33 | extern EmuWindow* g_emu_window; ///< Emu window | 33 | extern EmuWindow* g_emu_window; ///< Emu window |
| 34 | 34 | ||
| 35 | // TODO: Wrap this in a user settings struct along with any other graphics settings (often set from qt ui) | 35 | // TODO: Wrap these in a user settings struct along with any other graphics settings (often set from qt ui) |
| 36 | extern std::atomic<bool> g_hw_renderer_enabled; | 36 | extern std::atomic<bool> g_hw_renderer_enabled; |
| 37 | extern std::atomic<bool> g_shader_jit_enabled; | ||
| 37 | 38 | ||
| 38 | /// Start the video core | 39 | /// Start the video core |
| 39 | void Start(); | 40 | void Start(); |
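
g_shader_jit_enabled is declared as a std::atomic<bool> alongside g_hw_renderer_enabled, so code in the shader pipeline is expected to branch on it to choose between the JIT and the interpreter. A sketch of such a dispatch, where RunJit and RunInterpreter are placeholder names and not functions introduced by this diff:

    #include "video_core/video_core.h"

    // Placeholder backends; these names are hypothetical.
    void RunJit(void* registers);
    void RunInterpreter(void* registers);

    void RunVertexShader(void* registers) {
        if (VideoCore::g_shader_jit_enabled) {   // atomic<bool> performs an implicit load() here
            RunJit(registers);                   // x86_64 shader JIT path
        } else {
            RunInterpreter(registers);           // interpreter fallback
        }
    }
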