summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt12
m---------externals/nihstro0
-rw-r--r--src/citra/CMakeLists.txt2
-rw-r--r--src/citra/citra.cpp1
-rw-r--r--src/citra/config.cpp1
-rw-r--r--src/citra/default_ini.h4
-rw-r--r--src/citra_qt/CMakeLists.txt2
-rw-r--r--src/citra_qt/config.cpp2
-rw-r--r--src/citra_qt/debugger/graphics_vertex_shader.cpp2
-rw-r--r--src/citra_qt/main.cpp8
-rw-r--r--src/citra_qt/main.h1
-rw-r--r--src/citra_qt/main.ui9
-rw-r--r--src/common/CMakeLists.txt16
-rw-r--r--src/common/code_block.h87
-rw-r--r--src/common/common_funcs.h2
-rw-r--r--src/common/cpu_detect.h78
-rw-r--r--src/common/hash.cpp126
-rw-r--r--src/common/hash.h25
-rw-r--r--src/common/memory_util.cpp8
-rw-r--r--src/common/platform.h2
-rw-r--r--src/common/x64/abi.cpp680
-rw-r--r--src/common/x64/abi.h78
-rw-r--r--src/common/x64/cpu_detect.cpp187
-rw-r--r--src/common/x64/cpu_detect.h66
-rw-r--r--src/common/x64/emitter.cpp1989
-rw-r--r--src/common/x64/emitter.h1067
-rw-r--r--src/core/settings.h1
-rw-r--r--src/video_core/CMakeLists.txt14
-rw-r--r--src/video_core/clipper.cpp2
-rw-r--r--src/video_core/clipper.h4
-rw-r--r--src/video_core/command_processor.cpp22
-rw-r--r--src/video_core/hwrasterizer_base.h8
-rw-r--r--src/video_core/pica.cpp3
-rw-r--r--src/video_core/pica.h6
-rw-r--r--src/video_core/primitive_assembly.cpp4
-rw-r--r--src/video_core/primitive_assembly.h2
-rw-r--r--src/video_core/rasterizer.cpp14
-rw-r--r--src/video_core/rasterizer.h8
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp6
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h10
-rw-r--r--src/video_core/shader/shader.cpp145
-rw-r--r--src/video_core/shader/shader.h169
-rw-r--r--src/video_core/shader/shader_interpreter.cpp (renamed from src/video_core/vertex_shader.cpp)147
-rw-r--r--src/video_core/shader/shader_interpreter.h19
-rw-r--r--src/video_core/shader/shader_jit_x64.cpp675
-rw-r--r--src/video_core/shader/shader_jit_x64.h79
-rw-r--r--src/video_core/vertex_shader.h73
-rw-r--r--src/video_core/video_core.cpp1
-rw-r--r--src/video_core/video_core.h3
49 files changed, 5532 insertions, 338 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a02b85da3..00d71dbdc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,9 +10,21 @@ if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.git/hooks/pre-commit)
10 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/.git/hooks) 10 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/.git/hooks)
11endif() 11endif()
12 12
13# Platform-agnostic definition to check if we are on x86_64
14if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "[xX]86_64" OR
15 ${CMAKE_SYSTEM_PROCESSOR} MATCHES "[aA][mM][dD]64")
16 set(ARCHITECTURE_x86_64 1)
17 add_definitions(-DARCHITECTURE_x86_64=1)
18endif()
19
13if (NOT MSVC) 20if (NOT MSVC)
14 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-attributes -pthread") 21 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-attributes -pthread")
15 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread") 22 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
23
24 if (ARCHITECTURE_x86_64)
25 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
26 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1")
27 endif()
16else() 28else()
17 # Silence "deprecation" warnings 29 # Silence "deprecation" warnings
18 add_definitions(/D_CRT_SECURE_NO_WARNINGS /D_CRT_NONSTDC_NO_DEPRECATE) 30 add_definitions(/D_CRT_SECURE_NO_WARNINGS /D_CRT_NONSTDC_NO_DEPRECATE)
diff --git a/externals/nihstro b/externals/nihstro
Subproject 676254f71e0a7ef0aca8acce078d3c3dc80ccf7 Subproject 445cba0b2ff8d348368e32698e4760a670260bf
diff --git a/src/citra/CMakeLists.txt b/src/citra/CMakeLists.txt
index 918687312..1d6aac9a9 100644
--- a/src/citra/CMakeLists.txt
+++ b/src/citra/CMakeLists.txt
@@ -14,7 +14,7 @@ set(HEADERS
14create_directory_groups(${SRCS} ${HEADERS}) 14create_directory_groups(${SRCS} ${HEADERS})
15 15
16add_executable(citra ${SRCS} ${HEADERS}) 16add_executable(citra ${SRCS} ${HEADERS})
17target_link_libraries(citra core common video_core) 17target_link_libraries(citra core video_core common)
18target_link_libraries(citra ${GLFW_LIBRARIES} ${OPENGL_gl_LIBRARY} inih) 18target_link_libraries(citra ${GLFW_LIBRARIES} ${OPENGL_gl_LIBRARY} inih)
19if (MSVC) 19if (MSVC)
20 target_link_libraries(citra getopt) 20 target_link_libraries(citra getopt)
diff --git a/src/citra/citra.cpp b/src/citra/citra.cpp
index 182646f4c..d6fcb66a5 100644
--- a/src/citra/citra.cpp
+++ b/src/citra/citra.cpp
@@ -71,6 +71,7 @@ int main(int argc, char **argv) {
71 EmuWindow_GLFW* emu_window = new EmuWindow_GLFW; 71 EmuWindow_GLFW* emu_window = new EmuWindow_GLFW;
72 72
73 VideoCore::g_hw_renderer_enabled = Settings::values.use_hw_renderer; 73 VideoCore::g_hw_renderer_enabled = Settings::values.use_hw_renderer;
74 VideoCore::g_shader_jit_enabled = Settings::values.use_shader_jit;
74 75
75 System::Init(emu_window); 76 System::Init(emu_window);
76 77
diff --git a/src/citra/config.cpp b/src/citra/config.cpp
index 2c1407a6f..8a98bda87 100644
--- a/src/citra/config.cpp
+++ b/src/citra/config.cpp
@@ -61,6 +61,7 @@ void Config::ReadValues() {
61 61
62 // Renderer 62 // Renderer
63 Settings::values.use_hw_renderer = glfw_config->GetBoolean("Renderer", "use_hw_renderer", false); 63 Settings::values.use_hw_renderer = glfw_config->GetBoolean("Renderer", "use_hw_renderer", false);
64 Settings::values.use_shader_jit = glfw_config->GetBoolean("Renderer", "use_shader_jit", true);
64 65
65 Settings::values.bg_red = (float)glfw_config->GetReal("Renderer", "bg_red", 1.0); 66 Settings::values.bg_red = (float)glfw_config->GetReal("Renderer", "bg_red", 1.0);
66 Settings::values.bg_green = (float)glfw_config->GetReal("Renderer", "bg_green", 1.0); 67 Settings::values.bg_green = (float)glfw_config->GetReal("Renderer", "bg_green", 1.0);
diff --git a/src/citra/default_ini.h b/src/citra/default_ini.h
index 1925bece8..7e5d49729 100644
--- a/src/citra/default_ini.h
+++ b/src/citra/default_ini.h
@@ -42,6 +42,10 @@ frame_skip =
42# 0 (default): Software, 1: Hardware 42# 0 (default): Software, 1: Hardware
43use_hw_renderer = 43use_hw_renderer =
44 44
45# Whether to use the Just-In-Time (JIT) compiler for shader emulation
46# 0 : Interpreter (slow), 1 (default): JIT (fast)
47use_shader_jit =
48
45# The clear color for the renderer. What shows up on the sides of the bottom screen. 49# The clear color for the renderer. What shows up on the sides of the bottom screen.
46# Must be in range of 0.0-1.0. Defaults to 1.0 for all. 50# Must be in range of 0.0-1.0. Defaults to 1.0 for all.
47bg_red = 51bg_red =
diff --git a/src/citra_qt/CMakeLists.txt b/src/citra_qt/CMakeLists.txt
index 47aaeca24..0c0515054 100644
--- a/src/citra_qt/CMakeLists.txt
+++ b/src/citra_qt/CMakeLists.txt
@@ -71,7 +71,7 @@ if (APPLE)
71else() 71else()
72 add_executable(citra-qt ${SRCS} ${HEADERS} ${UI_HDRS}) 72 add_executable(citra-qt ${SRCS} ${HEADERS} ${UI_HDRS})
73endif() 73endif()
74target_link_libraries(citra-qt core common video_core qhexedit) 74target_link_libraries(citra-qt core video_core common qhexedit)
75target_link_libraries(citra-qt ${OPENGL_gl_LIBRARY} ${CITRA_QT_LIBS}) 75target_link_libraries(citra-qt ${OPENGL_gl_LIBRARY} ${CITRA_QT_LIBS})
76target_link_libraries(citra-qt ${PLATFORM_LIBRARIES}) 76target_link_libraries(citra-qt ${PLATFORM_LIBRARIES})
77 77
diff --git a/src/citra_qt/config.cpp b/src/citra_qt/config.cpp
index 5716634ee..a20351fb8 100644
--- a/src/citra_qt/config.cpp
+++ b/src/citra_qt/config.cpp
@@ -44,6 +44,7 @@ void Config::ReadValues() {
44 44
45 qt_config->beginGroup("Renderer"); 45 qt_config->beginGroup("Renderer");
46 Settings::values.use_hw_renderer = qt_config->value("use_hw_renderer", false).toBool(); 46 Settings::values.use_hw_renderer = qt_config->value("use_hw_renderer", false).toBool();
47 Settings::values.use_shader_jit = qt_config->value("use_shader_jit", true).toBool();
47 48
48 Settings::values.bg_red = qt_config->value("bg_red", 1.0).toFloat(); 49 Settings::values.bg_red = qt_config->value("bg_red", 1.0).toFloat();
49 Settings::values.bg_green = qt_config->value("bg_green", 1.0).toFloat(); 50 Settings::values.bg_green = qt_config->value("bg_green", 1.0).toFloat();
@@ -77,6 +78,7 @@ void Config::SaveValues() {
77 78
78 qt_config->beginGroup("Renderer"); 79 qt_config->beginGroup("Renderer");
79 qt_config->setValue("use_hw_renderer", Settings::values.use_hw_renderer); 80 qt_config->setValue("use_hw_renderer", Settings::values.use_hw_renderer);
81 qt_config->setValue("use_shader_jit", Settings::values.use_shader_jit);
80 82
81 // Cast to double because Qt's written float values are not human-readable 83 // Cast to double because Qt's written float values are not human-readable
82 qt_config->setValue("bg_red", (double)Settings::values.bg_red); 84 qt_config->setValue("bg_red", (double)Settings::values.bg_red);
diff --git a/src/citra_qt/debugger/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics_vertex_shader.cpp
index f42a2f4ce..302e22d7a 100644
--- a/src/citra_qt/debugger/graphics_vertex_shader.cpp
+++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp
@@ -8,7 +8,7 @@
8#include <QBoxLayout> 8#include <QBoxLayout>
9#include <QTreeView> 9#include <QTreeView>
10 10
11#include "video_core/vertex_shader.h" 11#include "video_core/shader/shader_interpreter.h"
12 12
13#include "graphics_vertex_shader.h" 13#include "graphics_vertex_shader.h"
14 14
diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp
index 6b030c178..4c3edf87a 100644
--- a/src/citra_qt/main.cpp
+++ b/src/citra_qt/main.cpp
@@ -131,6 +131,9 @@ GMainWindow::GMainWindow() : emu_thread(nullptr)
131 ui.action_Use_Hardware_Renderer->setChecked(Settings::values.use_hw_renderer); 131 ui.action_Use_Hardware_Renderer->setChecked(Settings::values.use_hw_renderer);
132 SetHardwareRendererEnabled(ui.action_Use_Hardware_Renderer->isChecked()); 132 SetHardwareRendererEnabled(ui.action_Use_Hardware_Renderer->isChecked());
133 133
134 ui.action_Use_Shader_JIT->setChecked(Settings::values.use_shader_jit);
135 SetShaderJITEnabled(ui.action_Use_Shader_JIT->isChecked());
136
134 ui.action_Single_Window_Mode->setChecked(settings.value("singleWindowMode", true).toBool()); 137 ui.action_Single_Window_Mode->setChecked(settings.value("singleWindowMode", true).toBool());
135 ToggleWindowMode(); 138 ToggleWindowMode();
136 139
@@ -144,6 +147,7 @@ GMainWindow::GMainWindow() : emu_thread(nullptr)
144 connect(ui.action_Pause, SIGNAL(triggered()), this, SLOT(OnPauseGame())); 147 connect(ui.action_Pause, SIGNAL(triggered()), this, SLOT(OnPauseGame()));
145 connect(ui.action_Stop, SIGNAL(triggered()), this, SLOT(OnStopGame())); 148 connect(ui.action_Stop, SIGNAL(triggered()), this, SLOT(OnStopGame()));
146 connect(ui.action_Use_Hardware_Renderer, SIGNAL(triggered(bool)), this, SLOT(SetHardwareRendererEnabled(bool))); 149 connect(ui.action_Use_Hardware_Renderer, SIGNAL(triggered(bool)), this, SLOT(SetHardwareRendererEnabled(bool)));
150 connect(ui.action_Use_Shader_JIT, SIGNAL(triggered(bool)), this, SLOT(SetShaderJITEnabled(bool)));
147 connect(ui.action_Single_Window_Mode, SIGNAL(triggered(bool)), this, SLOT(ToggleWindowMode())); 151 connect(ui.action_Single_Window_Mode, SIGNAL(triggered(bool)), this, SLOT(ToggleWindowMode()));
148 connect(ui.action_Hotkeys, SIGNAL(triggered()), this, SLOT(OnOpenHotkeysDialog())); 152 connect(ui.action_Hotkeys, SIGNAL(triggered()), this, SLOT(OnOpenHotkeysDialog()));
149 153
@@ -331,6 +335,10 @@ void GMainWindow::SetHardwareRendererEnabled(bool enabled) {
331 VideoCore::g_hw_renderer_enabled = enabled; 335 VideoCore::g_hw_renderer_enabled = enabled;
332} 336}
333 337
338void GMainWindow::SetShaderJITEnabled(bool enabled) {
339 VideoCore::g_shader_jit_enabled = enabled;
340}
341
334void GMainWindow::ToggleWindowMode() { 342void GMainWindow::ToggleWindowMode() {
335 if (ui.action_Single_Window_Mode->isChecked()) { 343 if (ui.action_Single_Window_Mode->isChecked()) {
336 // Render in the main window... 344 // Render in the main window...
diff --git a/src/citra_qt/main.h b/src/citra_qt/main.h
index 9fe9e0c9c..61114a04d 100644
--- a/src/citra_qt/main.h
+++ b/src/citra_qt/main.h
@@ -70,6 +70,7 @@ private slots:
70 void OnConfigure(); 70 void OnConfigure();
71 void OnDisplayTitleBars(bool); 71 void OnDisplayTitleBars(bool);
72 void SetHardwareRendererEnabled(bool); 72 void SetHardwareRendererEnabled(bool);
73 void SetShaderJITEnabled(bool);
73 void ToggleWindowMode(); 74 void ToggleWindowMode();
74 75
75private: 76private:
diff --git a/src/citra_qt/main.ui b/src/citra_qt/main.ui
index 9a809ee6c..b2ce8167d 100644
--- a/src/citra_qt/main.ui
+++ b/src/citra_qt/main.ui
@@ -66,6 +66,7 @@
66 <addaction name="action_Stop"/> 66 <addaction name="action_Stop"/>
67 <addaction name="separator"/> 67 <addaction name="separator"/>
68 <addaction name="action_Use_Hardware_Renderer"/> 68 <addaction name="action_Use_Hardware_Renderer"/>
69 <addaction name="action_Use_Shader_JIT"/>
69 <addaction name="action_Configure"/> 70 <addaction name="action_Configure"/>
70 </widget> 71 </widget>
71 <widget class="QMenu" name="menu_View"> 72 <widget class="QMenu" name="menu_View">
@@ -153,6 +154,14 @@
153 <string>Use Hardware Renderer</string> 154 <string>Use Hardware Renderer</string>
154 </property> 155 </property>
155 </action> 156 </action>
157 <action name="action_Use_Shader_JIT">
158 <property name="checkable">
159 <bool>true</bool>
160 </property>
161 <property name="text">
162 <string>Use Shader JIT</string>
163 </property>
164 </action>
156 <action name="action_Configure"> 165 <action name="action_Configure">
157 <property name="text"> 166 <property name="text">
158 <string>Configure ...</string> 167 <string>Configure ...</string>
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 4c086cd2f..e743a026d 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -5,6 +5,7 @@ set(SRCS
5 break_points.cpp 5 break_points.cpp
6 emu_window.cpp 6 emu_window.cpp
7 file_util.cpp 7 file_util.cpp
8 hash.cpp
8 key_map.cpp 9 key_map.cpp
9 logging/filter.cpp 10 logging/filter.cpp
10 logging/text_formatter.cpp 11 logging/text_formatter.cpp
@@ -24,14 +25,15 @@ set(HEADERS
24 bit_field.h 25 bit_field.h
25 break_points.h 26 break_points.h
26 chunk_file.h 27 chunk_file.h
28 code_block.h
27 color.h 29 color.h
28 common_funcs.h 30 common_funcs.h
29 common_paths.h 31 common_paths.h
30 common_types.h 32 common_types.h
31 cpu_detect.h
32 debug_interface.h 33 debug_interface.h
33 emu_window.h 34 emu_window.h
34 file_util.h 35 file_util.h
36 hash.h
35 key_map.h 37 key_map.h
36 linear_disk_cache.h 38 linear_disk_cache.h
37 logging/text_formatter.h 39 logging/text_formatter.h
@@ -56,6 +58,18 @@ set(HEADERS
56 vector_math.h 58 vector_math.h
57 ) 59 )
58 60
61if(ARCHITECTURE_x86_64)
62 set(SRCS ${SRCS}
63 x64/abi.cpp
64 x64/cpu_detect.cpp
65 x64/emitter.cpp)
66
67 set(HEADERS ${HEADERS}
68 x64/abi.h
69 x64/cpu_detect.h
70 x64/emitter.h)
71endif()
72
59create_directory_groups(${SRCS} ${HEADERS}) 73create_directory_groups(${SRCS} ${HEADERS})
60 74
61add_library(common STATIC ${SRCS} ${HEADERS}) 75add_library(common STATIC ${SRCS} ${HEADERS})
diff --git a/src/common/code_block.h b/src/common/code_block.h
new file mode 100644
index 000000000..9ef7296d3
--- /dev/null
+++ b/src/common/code_block.h
@@ -0,0 +1,87 @@
1// Copyright 2013 Dolphin Emulator Project
2// Licensed under GPLv2
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include "common_types.h"
8#include "memory_util.h"
9
10// Everything that needs to generate code should inherit from this.
11// You get memory management for free, plus, you can use all emitter functions without
12// having to prefix them with gen-> or something similar.
13// Example implementation:
14// class JIT : public CodeBlock<ARMXEmitter> {}
15template<class T> class CodeBlock : public T, NonCopyable
16{
17private:
18 // A privately used function to set the executable RAM space to something invalid.
19 // For debugging usefulness it should be used to set the RAM to a host specific breakpoint instruction
20 virtual void PoisonMemory() = 0;
21
22protected:
23 u8 *region;
24 size_t region_size;
25
26public:
27 CodeBlock() : region(nullptr), region_size(0) {}
28 virtual ~CodeBlock() { if (region) FreeCodeSpace(); }
29
30 // Call this before you generate any code.
31 void AllocCodeSpace(int size)
32 {
33 region_size = size;
34 region = (u8*)AllocateExecutableMemory(region_size);
35 T::SetCodePtr(region);
36 }
37
38 // Always clear code space with breakpoints, so that if someone accidentally executes
39 // uninitialized, it just breaks into the debugger.
40 void ClearCodeSpace()
41 {
42 PoisonMemory();
43 ResetCodePtr();
44 }
45
46 // Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
47 void FreeCodeSpace()
48 {
49#ifdef __SYMBIAN32__
50 ResetExecutableMemory(region);
51#else
52 FreeMemoryPages(region, region_size);
53#endif
54 region = nullptr;
55 region_size = 0;
56 }
57
58 bool IsInSpace(const u8 *ptr)
59 {
60 return (ptr >= region) && (ptr < (region + region_size));
61 }
62
63 // Cannot currently be undone. Will write protect the entire code region.
64 // Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
65 void WriteProtect()
66 {
67 WriteProtectMemory(region, region_size, true);
68 }
69
70 void ResetCodePtr()
71 {
72 T::SetCodePtr(region);
73 }
74
75 size_t GetSpaceLeft() const
76 {
77 return region_size - (T::GetCodePtr() - region);
78 }
79
80 u8 *GetBasePtr() {
81 return region;
82 }
83
84 size_t GetOffset(const u8 *ptr) const {
85 return ptr - region;
86 }
87};
diff --git a/src/common/common_funcs.h b/src/common/common_funcs.h
index 83b47f61e..88e452a16 100644
--- a/src/common/common_funcs.h
+++ b/src/common/common_funcs.h
@@ -35,7 +35,7 @@
35 35
36#ifndef _MSC_VER 36#ifndef _MSC_VER
37 37
38#if defined(__x86_64__) || defined(_M_X64) 38#ifdef ARCHITECTURE_x86_64
39#define Crash() __asm__ __volatile__("int $3") 39#define Crash() __asm__ __volatile__("int $3")
40#elif defined(_M_ARM) 40#elif defined(_M_ARM)
41#define Crash() __asm__ __volatile__("trap") 41#define Crash() __asm__ __volatile__("trap")
diff --git a/src/common/cpu_detect.h b/src/common/cpu_detect.h
deleted file mode 100644
index b585f9608..000000000
--- a/src/common/cpu_detect.h
+++ /dev/null
@@ -1,78 +0,0 @@
1// Copyright 2013 Dolphin Emulator Project / 2014 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5
6// Detect the cpu, so we'll know which optimizations to use
7#pragma once
8
9#include <string>
10
11enum CPUVendor
12{
13 VENDOR_INTEL = 0,
14 VENDOR_AMD = 1,
15 VENDOR_ARM = 2,
16 VENDOR_OTHER = 3,
17};
18
19struct CPUInfo
20{
21 CPUVendor vendor;
22
23 char cpu_string[0x21];
24 char brand_string[0x41];
25 bool OS64bit;
26 bool CPU64bit;
27 bool Mode64bit;
28
29 bool HTT;
30 int num_cores;
31 int logical_cpu_count;
32
33 bool bSSE;
34 bool bSSE2;
35 bool bSSE3;
36 bool bSSSE3;
37 bool bPOPCNT;
38 bool bSSE4_1;
39 bool bSSE4_2;
40 bool bLZCNT;
41 bool bSSE4A;
42 bool bAVX;
43 bool bAES;
44 bool bLAHFSAHF64;
45 bool bLongMode;
46
47 // ARM specific CPUInfo
48 bool bSwp;
49 bool bHalf;
50 bool bThumb;
51 bool bFastMult;
52 bool bVFP;
53 bool bEDSP;
54 bool bThumbEE;
55 bool bNEON;
56 bool bVFPv3;
57 bool bTLS;
58 bool bVFPv4;
59 bool bIDIVa;
60 bool bIDIVt;
61 bool bArmV7; // enable MOVT, MOVW etc
62
63 // ARMv8 specific
64 bool bFP;
65 bool bASIMD;
66
67 // Call Detect()
68 explicit CPUInfo();
69
70 // Turn the cpu info into a string we can show
71 std::string Summarize();
72
73private:
74 // Detects the various cpu features
75 void Detect();
76};
77
78extern CPUInfo cpu_info;
diff --git a/src/common/hash.cpp b/src/common/hash.cpp
new file mode 100644
index 000000000..413e9c6f1
--- /dev/null
+++ b/src/common/hash.cpp
@@ -0,0 +1,126 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#if defined(_MSC_VER)
6#include <stdlib.h>
7#endif
8
9#include "common_funcs.h"
10#include "common_types.h"
11#include "hash.h"
12
13namespace Common {
14
15// MurmurHash3 was written by Austin Appleby, and is placed in the public
16// domain. The author hereby disclaims copyright to this source code.
17
18// Block read - if your platform needs to do endian-swapping or can only handle aligned reads, do
19// the conversion here
20
21static FORCE_INLINE u32 getblock32(const u32* p, int i) {
22 return p[i];
23}
24
25static FORCE_INLINE u64 getblock64(const u64* p, int i) {
26 return p[i];
27}
28
29// Finalization mix - force all bits of a hash block to avalanche
30
31static FORCE_INLINE u32 fmix32(u32 h) {
32 h ^= h >> 16;
33 h *= 0x85ebca6b;
34 h ^= h >> 13;
35 h *= 0xc2b2ae35;
36 h ^= h >> 16;
37
38 return h;
39}
40
41static FORCE_INLINE u64 fmix64(u64 k) {
42 k ^= k >> 33;
43 k *= 0xff51afd7ed558ccdllu;
44 k ^= k >> 33;
45 k *= 0xc4ceb9fe1a85ec53llu;
46 k ^= k >> 33;
47
48 return k;
49}
50
51// This is the 128-bit variant of the MurmurHash3 hash function that is targetted for 64-bit
52// platforms (MurmurHash3_x64_128). It was taken from:
53// https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
54void MurmurHash3_128(const void* key, int len, u32 seed, void* out) {
55 const u8 * data = (const u8*)key;
56 const int nblocks = len / 16;
57
58 u64 h1 = seed;
59 u64 h2 = seed;
60
61 const u64 c1 = 0x87c37b91114253d5llu;
62 const u64 c2 = 0x4cf5ad432745937fllu;
63
64 // Body
65
66 const u64 * blocks = (const u64 *)(data);
67
68 for (int i = 0; i < nblocks; i++) {
69 u64 k1 = getblock64(blocks,i*2+0);
70 u64 k2 = getblock64(blocks,i*2+1);
71
72 k1 *= c1; k1 = _rotl64(k1,31); k1 *= c2; h1 ^= k1;
73
74 h1 = _rotl64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
75
76 k2 *= c2; k2 = _rotl64(k2,33); k2 *= c1; h2 ^= k2;
77
78 h2 = _rotl64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
79 }
80
81 // Tail
82
83 const u8 * tail = (const u8*)(data + nblocks*16);
84
85 u64 k1 = 0;
86 u64 k2 = 0;
87
88 switch (len & 15) {
89 case 15: k2 ^= ((u64)tail[14]) << 48;
90 case 14: k2 ^= ((u64)tail[13]) << 40;
91 case 13: k2 ^= ((u64)tail[12]) << 32;
92 case 12: k2 ^= ((u64)tail[11]) << 24;
93 case 11: k2 ^= ((u64)tail[10]) << 16;
94 case 10: k2 ^= ((u64)tail[ 9]) << 8;
95 case 9: k2 ^= ((u64)tail[ 8]) << 0;
96 k2 *= c2; k2 = _rotl64(k2,33); k2 *= c1; h2 ^= k2;
97
98 case 8: k1 ^= ((u64)tail[ 7]) << 56;
99 case 7: k1 ^= ((u64)tail[ 6]) << 48;
100 case 6: k1 ^= ((u64)tail[ 5]) << 40;
101 case 5: k1 ^= ((u64)tail[ 4]) << 32;
102 case 4: k1 ^= ((u64)tail[ 3]) << 24;
103 case 3: k1 ^= ((u64)tail[ 2]) << 16;
104 case 2: k1 ^= ((u64)tail[ 1]) << 8;
105 case 1: k1 ^= ((u64)tail[ 0]) << 0;
106 k1 *= c1; k1 = _rotl64(k1,31); k1 *= c2; h1 ^= k1;
107 };
108
109 // Finalization
110
111 h1 ^= len; h2 ^= len;
112
113 h1 += h2;
114 h2 += h1;
115
116 h1 = fmix64(h1);
117 h2 = fmix64(h2);
118
119 h1 += h2;
120 h2 += h1;
121
122 ((u64*)out)[0] = h1;
123 ((u64*)out)[1] = h2;
124}
125
126} // namespace Common
diff --git a/src/common/hash.h b/src/common/hash.h
new file mode 100644
index 000000000..a3850be68
--- /dev/null
+++ b/src/common/hash.h
@@ -0,0 +1,25 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include "common/common_types.h"
8
9namespace Common {
10
11void MurmurHash3_128(const void* key, int len, u32 seed, void* out);
12
13/**
14 * Computes a 64-bit hash over the specified block of data
15 * @param data Block of data to compute hash over
16 * @param len Length of data (in bytes) to compute hash over
17 * @returns 64-bit hash value that was computed over the data block
18 */
19static inline u64 ComputeHash64(const void* data, int len) {
20 u64 res[2];
21 MurmurHash3_128(data, len, 0, res);
22 return res[0];
23}
24
25} // namespace Common
diff --git a/src/common/memory_util.cpp b/src/common/memory_util.cpp
index 2b3ace528..5ef784224 100644
--- a/src/common/memory_util.cpp
+++ b/src/common/memory_util.cpp
@@ -16,7 +16,7 @@
16 #include <sys/mman.h> 16 #include <sys/mman.h>
17#endif 17#endif
18 18
19#if !defined(_WIN32) && defined(__x86_64__) && !defined(MAP_32BIT) 19#if !defined(_WIN32) && defined(ARCHITECTURE_X64) && !defined(MAP_32BIT)
20#include <unistd.h> 20#include <unistd.h>
21#define PAGE_MASK (getpagesize() - 1) 21#define PAGE_MASK (getpagesize() - 1)
22#define round_page(x) ((((unsigned long)(x)) + PAGE_MASK) & ~(PAGE_MASK)) 22#define round_page(x) ((((unsigned long)(x)) + PAGE_MASK) & ~(PAGE_MASK))
@@ -31,7 +31,7 @@ void* AllocateExecutableMemory(size_t size, bool low)
31 void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); 31 void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
32#else 32#else
33 static char *map_hint = 0; 33 static char *map_hint = 0;
34#if defined(__x86_64__) && !defined(MAP_32BIT) 34#if defined(ARCHITECTURE_X64) && !defined(MAP_32BIT)
35 // This OS has no flag to enforce allocation below the 4 GB boundary, 35 // This OS has no flag to enforce allocation below the 4 GB boundary,
36 // but if we hint that we want a low address it is very likely we will 36 // but if we hint that we want a low address it is very likely we will
37 // get one. 37 // get one.
@@ -43,7 +43,7 @@ void* AllocateExecutableMemory(size_t size, bool low)
43#endif 43#endif
44 void* ptr = mmap(map_hint, size, PROT_READ | PROT_WRITE | PROT_EXEC, 44 void* ptr = mmap(map_hint, size, PROT_READ | PROT_WRITE | PROT_EXEC,
45 MAP_ANON | MAP_PRIVATE 45 MAP_ANON | MAP_PRIVATE
46#if defined(__x86_64__) && defined(MAP_32BIT) 46#if defined(ARCHITECTURE_X64) && defined(MAP_32BIT)
47 | (low ? MAP_32BIT : 0) 47 | (low ? MAP_32BIT : 0)
48#endif 48#endif
49 , -1, 0); 49 , -1, 0);
@@ -62,7 +62,7 @@ void* AllocateExecutableMemory(size_t size, bool low)
62#endif 62#endif
63 LOG_ERROR(Common_Memory, "Failed to allocate executable memory"); 63 LOG_ERROR(Common_Memory, "Failed to allocate executable memory");
64 } 64 }
65#if !defined(_WIN32) && defined(__x86_64__) && !defined(MAP_32BIT) 65#if !defined(_WIN32) && defined(ARCHITECTURE_X64) && !defined(MAP_32BIT)
66 else 66 else
67 { 67 {
68 if (low) 68 if (low)
diff --git a/src/common/platform.h b/src/common/platform.h
index 0a912dda3..9ba4db11b 100644
--- a/src/common/platform.h
+++ b/src/common/platform.h
@@ -27,7 +27,7 @@
27//////////////////////////////////////////////////////////////////////////////////////////////////// 27////////////////////////////////////////////////////////////////////////////////////////////////////
28// Platform detection 28// Platform detection
29 29
30#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) 30#if defined(ARCHITECTURE_x86_64) || defined(__aarch64__)
31 #define EMU_ARCH_BITS 64 31 #define EMU_ARCH_BITS 64
32#elif defined(__i386) || defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) 32#elif defined(__i386) || defined(_M_IX86) || defined(__arm__) || defined(_M_ARM)
33 #define EMU_ARCH_BITS 32 33 #define EMU_ARCH_BITS 32
diff --git a/src/common/x64/abi.cpp b/src/common/x64/abi.cpp
new file mode 100644
index 000000000..4c07a6ebe
--- /dev/null
+++ b/src/common/x64/abi.cpp
@@ -0,0 +1,680 @@
1// Copyright (C) 2003 Dolphin Project.
2
3// This program is free software: you can redistribute it and/or modify
4// it under the terms of the GNU General Public License as published by
5// the Free Software Foundation, version 2.0 or later versions.
6
7// This program is distributed in the hope that it will be useful,
8// but WITHOUT ANY WARRANTY; without even the implied warranty of
9// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10// GNU General Public License 2.0 for more details.
11
12// A copy of the GPL 2.0 should have been included with the program.
13// If not, see http://www.gnu.org/licenses/
14
15// Official SVN repository and contact information can be found at
16// http://code.google.com/p/dolphin-emu/
17
18#include "abi.h"
19#include "emitter.h"
20
21using namespace Gen;
22
23// Shared code between Win64 and Unix64
24
25// Sets up a __cdecl function.
26void XEmitter::ABI_EmitPrologue(int maxCallParams)
27{
28#ifdef _M_IX86
29 // Don't really need to do anything
30#elif defined(ARCHITECTURE_x86_64)
31#if _WIN32
32 int stacksize = ((maxCallParams + 1) & ~1) * 8 + 8;
33 // Set up a stack frame so that we can call functions
34 // TODO: use maxCallParams
35 SUB(64, R(RSP), Imm8(stacksize));
36#endif
37#else
38#error Arch not supported
39#endif
40}
41
42void XEmitter::ABI_EmitEpilogue(int maxCallParams)
43{
44#ifdef _M_IX86
45 RET();
46#elif defined(ARCHITECTURE_x86_64)
47#ifdef _WIN32
48 int stacksize = ((maxCallParams+1)&~1)*8 + 8;
49 ADD(64, R(RSP), Imm8(stacksize));
50#endif
51 RET();
52#else
53#error Arch not supported
54
55
56#endif
57}
58
59#ifdef _M_IX86 // All32
60
61// Shared code between Win32 and Unix32
62void XEmitter::ABI_CallFunction(const void *func) {
63 ABI_AlignStack(0);
64 CALL(func);
65 ABI_RestoreStack(0);
66}
67
68void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) {
69 ABI_AlignStack(1 * 2);
70 PUSH(16, Imm16(param1));
71 CALL(func);
72 ABI_RestoreStack(1 * 2);
73}
74
75void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) {
76 ABI_AlignStack(1 * 2 + 1 * 4);
77 PUSH(16, Imm16(param2));
78 PUSH(32, Imm32(param1));
79 CALL(func);
80 ABI_RestoreStack(1 * 2 + 1 * 4);
81}
82
83void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) {
84 ABI_AlignStack(1 * 4);
85 PUSH(32, Imm32(param1));
86 CALL(func);
87 ABI_RestoreStack(1 * 4);
88}
89
90void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) {
91 ABI_AlignStack(2 * 4);
92 PUSH(32, Imm32(param2));
93 PUSH(32, Imm32(param1));
94 CALL(func);
95 ABI_RestoreStack(2 * 4);
96}
97
98void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) {
99 ABI_AlignStack(3 * 4);
100 PUSH(32, Imm32(param3));
101 PUSH(32, Imm32(param2));
102 PUSH(32, Imm32(param1));
103 CALL(func);
104 ABI_RestoreStack(3 * 4);
105}
106
107void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) {
108 ABI_AlignStack(3 * 4);
109 PUSH(32, ImmPtr(param3));
110 PUSH(32, Imm32(param2));
111 PUSH(32, Imm32(param1));
112 CALL(func);
113 ABI_RestoreStack(3 * 4);
114}
115
116void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2,u32 param3, void *param4) {
117 ABI_AlignStack(4 * 4);
118 PUSH(32, ImmPtr(param4));
119 PUSH(32, Imm32(param3));
120 PUSH(32, Imm32(param2));
121 PUSH(32, Imm32(param1));
122 CALL(func);
123 ABI_RestoreStack(4 * 4);
124}
125
126void XEmitter::ABI_CallFunctionP(const void *func, void *param1) {
127 ABI_AlignStack(1 * 4);
128 PUSH(32, ImmPtr(param1));
129 CALL(func);
130 ABI_RestoreStack(1 * 4);
131}
132
133void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) {
134 ABI_AlignStack(2 * 4);
135 PUSH(32, arg2);
136 PUSH(32, ImmPtr(param1));
137 CALL(func);
138 ABI_RestoreStack(2 * 4);
139}
140
141void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) {
142 ABI_AlignStack(3 * 4);
143 PUSH(32, arg3);
144 PUSH(32, arg2);
145 PUSH(32, ImmPtr(param1));
146 CALL(func);
147 ABI_RestoreStack(3 * 4);
148}
149
150void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) {
151 ABI_AlignStack(3 * 4);
152 PUSH(32, Imm32(param3));
153 PUSH(32, ImmPtr(param2));
154 PUSH(32, ImmPtr(param1));
155 CALL(func);
156 ABI_RestoreStack(3 * 4);
157}
158
159// Pass a register as a parameter.
160void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) {
161 ABI_AlignStack(1 * 4);
162 PUSH(32, R(reg1));
163 CALL(func);
164 ABI_RestoreStack(1 * 4);
165}
166
167// Pass two registers as parameters.
168void XEmitter::ABI_CallFunctionRR(const void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
169{
170 ABI_AlignStack(2 * 4);
171 PUSH(32, R(reg2));
172 PUSH(32, R(reg1));
173 CALL(func);
174 ABI_RestoreStack(2 * 4);
175}
176
// cdecl: push an arbitrary operand and a u32 constant, right-to-left, and call.
void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2)
{
    ABI_AlignStack(2 * 4);
    PUSH(32, Imm32(param2));
    PUSH(32, arg1);
    CALL(func);
    ABI_RestoreStack(2 * 4);
}
185
// cdecl: push an arbitrary operand and two u32 constants, right-to-left, and call.
void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3)
{
    ABI_AlignStack(3 * 4);
    PUSH(32, Imm32(param3));
    PUSH(32, Imm32(param2));
    PUSH(32, arg1);
    CALL(func);
    ABI_RestoreStack(3 * 4);
}
195
// cdecl: push a single arbitrary operand and call.
void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1)
{
    ABI_AlignStack(1 * 4);
    PUSH(32, arg1);
    CALL(func);
    ABI_RestoreStack(1 * 4);
}
203
// cdecl: push two arbitrary operands, right-to-left, and call.
void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2)
{
    ABI_AlignStack(2 * 4);
    PUSH(32, arg2);
    PUSH(32, arg1);
    CALL(func);
    ABI_RestoreStack(2 * 4);
}
212
// Saves the x86 cdecl callee-saved registers (EBP, EBX, ESI, EDI) on entry
// to JIT-generated code. Must be mirrored by ABI_PopAllCalleeSavedRegsAndAdjustStack.
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
    // Note: 4 * 4 = 16 bytes, so alignment is preserved.
    PUSH(EBP);
    PUSH(EBX);
    PUSH(ESI);
    PUSH(EDI);
}
220
// Restores the registers saved by ABI_PushAllCalleeSavedRegsAndAdjustStack,
// in reverse push order.
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
    POP(EDI);
    POP(ESI);
    POP(EBX);
    POP(EBP);
}
227
// Rounds an argument-area size up to the platform's call-time stack alignment,
// counting the 4 bytes the CALL instruction will push for the return address.
unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
    frameSize += 4; // reserve space for return address
    unsigned int alignedSize =
#ifdef __GNUC__
        // GCC targets (Linux/OS X) require 16-byte stack alignment at calls.
        (frameSize + 15) & -16;
#else
        // MSVC only assumes 4-byte alignment.
        (frameSize + 3) & -4;
#endif
    return alignedSize;
}
238
239
// Subtracts padding from ESP so that after the subsequent argument PUSHes and
// the CALL, the stack meets the platform's alignment requirement.
void XEmitter::ABI_AlignStack(unsigned int frameSize) {
// Mac OS X requires the stack to be 16-byte aligned before every call.
// Linux requires the stack to be 16-byte aligned before calls that put SSE
// vectors on the stack, but since we do not keep track of which calls do that,
// it is effectively every call as well.
// Windows binaries compiled with MSVC do not have such a restriction*, but I
// expect that GCC on Windows acts the same as GCC on Linux in this respect.
// It would be nice if someone could verify this.
// *However, the MSVC optimizing compiler assumes a 4-byte-aligned stack at times.
    unsigned int fillSize =
        ABI_GetAlignedFrameSize(frameSize) - (frameSize + 4);
    if (fillSize != 0) {
        SUB(32, R(ESP), Imm8(fillSize));
    }
}
255
// Pops the arguments plus the alignment padding added by ABI_AlignStack.
void XEmitter::ABI_RestoreStack(unsigned int frameSize) {
    unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize);
    alignedSize -= 4; // return address is POPped at end of call
    if (alignedSize != 0) {
        ADD(32, R(ESP), Imm8(alignedSize));
    }
}
263
264#else //64bit
265
266// Common functions
// Emits a call to func. `distance` is the unsigned wrap-around difference
// between the target and the end of a 5-byte rel32 CALL; the range test
// rejects anything outside +/-2GB, in which case the address is materialized
// in RAX and called indirectly.
void XEmitter::ABI_CallFunction(const void *func) {
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
278
// Loads a u16 (zero-extended to 32 bits) into the first argument register and calls func.
void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) {
    MOV(32, R(ABI_PARAM1), Imm32((u32)param1));
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
291
// Loads a u32 and a u16 (zero-extended) into the first two argument registers and calls func.
void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) {
    MOV(32, R(ABI_PARAM1), Imm32(param1));
    MOV(32, R(ABI_PARAM2), Imm32((u32)param2));
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
305
// Loads a u32 constant into the first argument register and calls func.
void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) {
    MOV(32, R(ABI_PARAM1), Imm32(param1));
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
318
// Loads two u32 constants into the first two argument registers and calls func.
void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) {
    MOV(32, R(ABI_PARAM1), Imm32(param1));
    MOV(32, R(ABI_PARAM2), Imm32(param2));
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
332
// Loads three u32 constants into the first three argument registers and calls func.
void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) {
    MOV(32, R(ABI_PARAM1), Imm32(param1));
    MOV(32, R(ABI_PARAM2), Imm32(param2));
    MOV(32, R(ABI_PARAM3), Imm32(param3));
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
347
// Loads two u32 constants and a pointer into the first three argument registers and calls func.
void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) {
    MOV(32, R(ABI_PARAM1), Imm32(param1));
    MOV(32, R(ABI_PARAM2), Imm32(param2));
    MOV(64, R(ABI_PARAM3), ImmPtr(param3));
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
362
// Loads three u32 constants and a pointer into the first four argument registers and calls func.
void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4) {
    MOV(32, R(ABI_PARAM1), Imm32(param1));
    MOV(32, R(ABI_PARAM2), Imm32(param2));
    MOV(32, R(ABI_PARAM3), Imm32(param3));
    MOV(64, R(ABI_PARAM4), ImmPtr(param4));
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
378
// Loads a pointer into the first argument register and calls func.
void XEmitter::ABI_CallFunctionP(const void *func, void *param1) {
    MOV(64, R(ABI_PARAM1), ImmPtr(param1));
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
391
// Loads a pointer and an arbitrary operand into the first two argument registers.
// The MOV for arg2 is skipped if it already lives in ABI_PARAM2.
void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) {
    MOV(64, R(ABI_PARAM1), ImmPtr(param1));
    if (!arg2.IsSimpleReg(ABI_PARAM2))
        MOV(32, R(ABI_PARAM2), arg2);
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
406
// Loads a pointer and two arbitrary operands into the first three argument registers.
// MOVs are skipped for operands already in their destination register.
void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) {
    MOV(64, R(ABI_PARAM1), ImmPtr(param1));
    if (!arg2.IsSimpleReg(ABI_PARAM2))
        MOV(32, R(ABI_PARAM2), arg2);
    if (!arg3.IsSimpleReg(ABI_PARAM3))
        MOV(32, R(ABI_PARAM3), arg3);
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
423
// Loads two pointers and a u32 into the first three argument registers and calls func.
void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) {
    MOV(64, R(ABI_PARAM1), ImmPtr(param1));
    MOV(64, R(ABI_PARAM2), ImmPtr(param2));
    MOV(32, R(ABI_PARAM3), Imm32(param3));
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
438
// Pass a register as a parameter.
// Moves reg1 into the first argument register (skipped when it is already there).
void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) {
    if (reg1 != ABI_PARAM1)
        MOV(32, R(ABI_PARAM1), R(reg1));
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
453
454// Pass two registers as parameters.
455void XEmitter::ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2) {
456 if (reg2 != ABI_PARAM1) {
457 if (reg1 != ABI_PARAM1)
458 MOV(64, R(ABI_PARAM1), R(reg1));
459 if (reg2 != ABI_PARAM2)
460 MOV(64, R(ABI_PARAM2), R(reg2));
461 } else {
462 if (reg2 != ABI_PARAM2)
463 MOV(64, R(ABI_PARAM2), R(reg2));
464 if (reg1 != ABI_PARAM1)
465 MOV(64, R(ABI_PARAM1), R(reg1));
466 }
467 u64 distance = u64(func) - (u64(code) + 5);
468 if (distance >= 0x0000000080000000ULL
469 && distance < 0xFFFFFFFF80000000ULL) {
470 // Far call
471 MOV(64, R(RAX), ImmPtr(func));
472 CALLptr(R(RAX));
473 } else {
474 CALL(func);
475 }
476}
477
// Loads an arbitrary operand and a u32 constant into the first two argument registers.
void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2)
{
    if (!arg1.IsSimpleReg(ABI_PARAM1))
        MOV(32, R(ABI_PARAM1), arg1);
    MOV(32, R(ABI_PARAM2), Imm32(param2));
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
493
494void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3)
495{
496 if (!arg1.IsSimpleReg(ABI_PARAM1))
497 MOV(32, R(ABI_PARAM1), arg1);
498 MOV(32, R(ABI_PARAM2), Imm32(param2));
499 MOV(64, R(ABI_PARAM3), Imm64(param3));
500 u64 distance = u64(func) - (u64(code) + 5);
501 if (distance >= 0x0000000080000000ULL
502 && distance < 0xFFFFFFFF80000000ULL) {
503 // Far call
504 MOV(64, R(RAX), ImmPtr(func));
505 CALLptr(R(RAX));
506 } else {
507 CALL(func);
508 }
509}
510
// Loads a single arbitrary operand into the first argument register and calls func.
void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1)
{
    if (!arg1.IsSimpleReg(ABI_PARAM1))
        MOV(32, R(ABI_PARAM1), arg1);
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
525
// Loads two arbitrary operands into the first two argument registers and calls func.
// NOTE(review): if arg1 currently lives in ABI_PARAM2 it is clobbered by the
// second MOV -- callers must not pass operands that alias the other slot's
// destination register; confirm at call sites.
void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2)
{
    if (!arg1.IsSimpleReg(ABI_PARAM1))
        MOV(32, R(ABI_PARAM1), arg1);
    if (!arg2.IsSimpleReg(ABI_PARAM2))
        MOV(32, R(ABI_PARAM2), arg2);
    u64 distance = u64(func) - (u64(code) + 5);
    if (distance >= 0x0000000080000000ULL
        && distance < 0xFFFFFFFF80000000ULL) {
        // Far call (target out of rel32 range)
        MOV(64, R(RAX), ImmPtr(func));
        CALLptr(R(RAX));
    } else {
        CALL(func);
    }
}
542
// On x64 arguments are passed in registers, so no extra frame rounding is needed here.
unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
    return frameSize;
}
546
547#ifdef _WIN32
548
549// The Windows x64 ABI requires XMM6 - XMM15 to be callee saved. 10 regs.
550// But, not saving XMM4 and XMM5 breaks things in VS 2010, even though they are volatile regs.
551// Let's just save all 16.
552const int XMM_STACK_SPACE = 16 * 16;
553
554// Win64 Specific Code
// Win64 prologue: saves the integer callee-saved registers, aligns the
// stack, then spills all 16 XMM registers to a dedicated stack area.
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
    //we only want to do this once
    PUSH(RBX);
    PUSH(RSI);
    PUSH(RDI);
    PUSH(RBP);
    PUSH(R12);
    PUSH(R13);
    PUSH(R14);
    PUSH(R15);
    ABI_AlignStack(0);

    // Do this after aligning, because before it's offset by 8.
    SUB(64, R(RSP), Imm32(XMM_STACK_SPACE));
    for (int i = 0; i < 16; ++i)
        MOVAPS(MDisp(RSP, i * 16), (X64Reg)(XMM0 + i));
}
572
// Win64 epilogue: reloads the 16 XMM registers, undoes the alignment
// adjustment, then restores the integer callee-saved registers in reverse order.
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
    for (int i = 0; i < 16; ++i)
        MOVAPS((X64Reg)(XMM0 + i), MDisp(RSP, i * 16));
    ADD(64, R(RSP), Imm32(XMM_STACK_SPACE));

    ABI_RestoreStack(0);
    POP(R15);
    POP(R14);
    POP(R13);
    POP(R12);
    POP(RBP);
    POP(RDI);
    POP(RSI);
    POP(RBX);
}
588
589// Win64 Specific Code
590void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
591 PUSH(RCX);
592 PUSH(RDX);
593 PUSH(RSI);
594 PUSH(RDI);
595 PUSH(R8);
596 PUSH(R9);
597 PUSH(R10);
598 PUSH(R11);
599 // TODO: Callers preserve XMM4-5 (XMM0-3 are args.)
600 ABI_AlignStack(0);
601}
602
// Restores the registers saved by ABI_PushAllCallerSavedRegsAndAdjustStack,
// in reverse push order.
void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
    ABI_RestoreStack(0);
    POP(R11);
    POP(R10);
    POP(R9);
    POP(R8);
    POP(RDI);
    POP(RSI);
    POP(RDX);
    POP(RCX);
}
614
// 0x28 = 32 bytes of Win64 shadow space for the callee + 8 bytes to restore
// 16-byte alignment after the CALL pushed a return address.
void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) {
    SUB(64, R(RSP), Imm8(0x28));
}
618
// Undoes the 0x28-byte adjustment made by ABI_AlignStack.
void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) {
    ADD(64, R(RSP), Imm8(0x28));
}
622
623#else
624// Unix64 Specific Code
// System V x86-64 prologue: saves the six callee-saved integer registers,
// plus one duplicate push to keep RSP 16-byte aligned (7 pushes total after
// the call's return address).
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
    PUSH(RBX);
    PUSH(RBP);
    PUSH(R12);
    PUSH(R13);
    PUSH(R14);
    PUSH(R15);
    PUSH(R15); //just to align stack. duped push/pop doesn't hurt.
    // TODO: XMM?
}
635
// Restores the registers saved by ABI_PushAllCalleeSavedRegsAndAdjustStack
// (including the duplicated alignment push) in reverse order.
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
    POP(R15);
    POP(R15);
    POP(R14);
    POP(R13);
    POP(R12);
    POP(RBP);
    POP(RBX);
}
645
// System V x86-64: saves the volatile integer registers around an embedded
// call; R11 is pushed twice to preserve 16-byte stack alignment (9 pushes).
void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
    PUSH(RCX);
    PUSH(RDX);
    PUSH(RSI);
    PUSH(RDI);
    PUSH(R8);
    PUSH(R9);
    PUSH(R10);
    PUSH(R11);
    PUSH(R11);
}
657
// Restores the registers saved by ABI_PushAllCallerSavedRegsAndAdjustStack
// (including the duplicated R11 alignment push) in reverse order.
void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
    POP(R11);
    POP(R11);
    POP(R10);
    POP(R9);
    POP(R8);
    POP(RDI);
    POP(RSI);
    POP(RDX);
    POP(RCX);
}
669
// 8 bytes restores 16-byte alignment after the CALL pushed a return address.
void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) {
    SUB(64, R(RSP), Imm8(0x08));
}
673
// Undoes the 8-byte adjustment made by ABI_AlignStack.
void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) {
    ADD(64, R(RSP), Imm8(0x08));
}
677
678#endif // WIN32
679
680#endif // 32bit
diff --git a/src/common/x64/abi.h b/src/common/x64/abi.h
new file mode 100644
index 000000000..7e9c156ae
--- /dev/null
+++ b/src/common/x64/abi.h
@@ -0,0 +1,78 @@
1// Copyright (C) 2003 Dolphin Project.
2
3// This program is free software: you can redistribute it and/or modify
4// it under the terms of the GNU General Public License as published by
5// the Free Software Foundation, version 2.0 or later versions.
6
7// This program is distributed in the hope that it will be useful,
8// but WITHOUT ANY WARRANTY; without even the implied warranty of
9// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10// GNU General Public License 2.0 for more details.
11
12// A copy of the GPL 2.0 should have been included with the program.
13// If not, see http://www.gnu.org/licenses/
14
15// Official SVN repository and contact information can be found at
16// http://code.google.com/p/dolphin-emu/
17
18#pragma once
19
20#include "common/common_types.h"
21
22// x86/x64 ABI:s, and helpers to help follow them when JIT-ing code.
// All conventions return values in EAX (+ possibly EDX).
24
25// Linux 32-bit, Windows 32-bit (cdecl, System V):
26// * Caller pushes left to right
27// * Caller fixes stack after call
28// * function subtract from stack for local storage only.
29// Scratch: EAX ECX EDX
30// Callee-save: EBX ESI EDI EBP
31// Parameters: -
32
33// Windows 64-bit
34// * 4-reg "fastcall" variant, very new-skool stack handling
35// * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself calls_
36// * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space.
37// Scratch: RAX RCX RDX R8 R9 R10 R11
38// Callee-save: RBX RSI RDI RBP R12 R13 R14 R15
39// Parameters: RCX RDX R8 R9, further MOV-ed
40
41// Linux 64-bit
42// * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed)
43// Scratch: RAX RCX RDX RSI RDI R8 R9 R10 R11
44// Callee-save: RBX RBP R12 R13 R14 R15
45// Parameters: RDI RSI RDX RCX R8 R9
46
47#ifdef _M_IX86 // 32 bit calling convention, shared by all
48
49// 32-bit don't pass parameters in regs, but these are convenient to have anyway when we have to
50// choose regs to put stuff in.
51#define ABI_PARAM1 RCX
52#define ABI_PARAM2 RDX
53
// Note: in 32-bit mode arguments are really passed on the stack; the ABI_PARAM1/2
// defines above are only scratch-register suggestions, not true parameter registers.
55// 32-bit bog standard cdecl, shared between linux and windows
56// MacOSX 32-bit is same as System V with a few exceptions that we probably don't care much about.
57
58#elif ARCHITECTURE_x86_64 // 64 bit calling convention
59
60#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention
61
62#define ABI_PARAM1 RCX
63#define ABI_PARAM2 RDX
64#define ABI_PARAM3 R8
65#define ABI_PARAM4 R9
66
67#else //64-bit Unix (hopefully MacOSX too)
68
69#define ABI_PARAM1 RDI
70#define ABI_PARAM2 RSI
71#define ABI_PARAM3 RDX
72#define ABI_PARAM4 RCX
73#define ABI_PARAM5 R8
74#define ABI_PARAM6 R9
75
76#endif // WIN32
77
78#endif // X86
diff --git a/src/common/x64/cpu_detect.cpp b/src/common/x64/cpu_detect.cpp
new file mode 100644
index 000000000..d9c430c67
--- /dev/null
+++ b/src/common/x64/cpu_detect.cpp
@@ -0,0 +1,187 @@
1// Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <cstring>
6#include <string>
7#include <thread>
8
9#include "common/common_types.h"
10
11#include "cpu_detect.h"
12
13namespace Common {
14
15#ifndef _MSC_VER
16
17#ifdef __FreeBSD__
18#include <sys/types.h>
19#include <machine/cpufunc.h>
20#endif
21
// Portable stand-in for MSVC's __cpuidex intrinsic: executes CPUID with
// EAX = function_id and ECX = subfunction_id, storing EAX/EBX/ECX/EDX into
// info[0..3].
static inline void __cpuidex(int info[4], int function_id, int subfunction_id) {
#ifdef __FreeBSD__
    // Despite the name, this is just do_cpuid() with ECX as second input.
    cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info);
#else
    info[0] = function_id;    // eax
    info[2] = subfunction_id; // ecx
    __asm__(
        "cpuid"
        : "=a" (info[0]),
          "=b" (info[1]),
          "=c" (info[2]),
          "=d" (info[3])
        : "a" (function_id),
          "c" (subfunction_id)
        );
#endif
}
40
// Portable stand-in for MSVC's __cpuid intrinsic (subfunction/ECX fixed to 0).
static inline void __cpuid(int info[4], int function_id) {
    return __cpuidex(info, function_id, 0);
}
44
#define _XCR_XFEATURE_ENABLED_MASK 0
// Reads the extended control register `index` via XGETBV (EDX:EAX -> u64);
// used to check whether the OS has enabled AVX state saving.
static u64 _xgetbv(u32 index) {
    u32 eax, edx;
    __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
    return ((u64)edx << 32) | eax;
}
51
52#endif // ifndef _MSC_VER
53
54// Detects the various CPU features
// Detects the various CPU features
static CPUCaps Detect() {
    CPUCaps caps = {};

    caps.num_cores = std::thread::hardware_concurrency();

    // Assumes the CPU supports the CPUID instruction. Those that don't would likely not support
    // Citra at all anyway

    int cpu_id[4];
    memset(caps.brand_string, 0, sizeof(caps.brand_string));

    // Detect CPU's CPUID capabilities and grab CPU string
    __cpuid(cpu_id, 0x00000000);
    u32 max_std_fn = cpu_id[0]; // EAX

    // The 12-byte vendor ID string is returned in EBX, EDX, ECX (in that order).
    std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(int));
    std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(int));
    std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(int));

    __cpuid(cpu_id, 0x80000000);

    u32 max_ex_fn = cpu_id[0];
    if (!strcmp(caps.brand_string, "GenuineIntel"))
        caps.vendor = CPUVendor::INTEL;
    else if (!strcmp(caps.brand_string, "AuthenticAMD"))
        caps.vendor = CPUVendor::AMD;
    else
        caps.vendor = CPUVendor::OTHER;

    // Set reasonable default brand string even if brand string not available
    strcpy(caps.cpu_string, caps.brand_string);

    // Detect family and other miscellaneous features
    if (max_std_fn >= 1) {
        __cpuid(cpu_id, 0x00000001);

        // Feature bits from CPUID.01H: EDX is cpu_id[3], ECX is cpu_id[2].
        if ((cpu_id[3] >> 25) & 1) caps.sse = true;
        if ((cpu_id[3] >> 26) & 1) caps.sse2 = true;
        if ((cpu_id[2]) & 1) caps.sse3 = true;
        if ((cpu_id[2] >> 9) & 1) caps.ssse3 = true;
        if ((cpu_id[2] >> 19) & 1) caps.sse4_1 = true;
        if ((cpu_id[2] >> 20) & 1) caps.sse4_2 = true;
        if ((cpu_id[2] >> 22) & 1) caps.movbe = true;
        if ((cpu_id[2] >> 25) & 1) caps.aes = true;

        if ((cpu_id[3] >> 24) & 1) {
            caps.fxsave_fxrstor = true;
        }

        // AVX support requires 3 separate checks:
        //  - Is the AVX bit set in CPUID?
        //  - Is the XSAVE bit set in CPUID?
        //  - XGETBV result has the XCR bit set.
        if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1)) {
            if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) {
                caps.avx = true;
                if ((cpu_id[2] >> 12) & 1)
                    caps.fma = true;
            }
        }

        if (max_std_fn >= 7) {
            __cpuidex(cpu_id, 0x00000007, 0x00000000);
            // Can't enable AVX2 unless the XSAVE/XGETBV checks above passed
            if ((cpu_id[1] >> 5) & 1)
                caps.avx2 = caps.avx;
            if ((cpu_id[1] >> 3) & 1)
                caps.bmi1 = true;
            if ((cpu_id[1] >> 8) & 1)
                caps.bmi2 = true;
        }
    }

    caps.flush_to_zero = caps.sse;

    if (max_ex_fn >= 0x80000004) {
        // Extract CPU model string
        // NOTE(review): these three copies write 48 bytes at offsets 0/16/32
        // of cpu_string -- confirm the array declared in cpu_detect.h is at
        // least 48 bytes, otherwise the last copy overflows.
        __cpuid(cpu_id, 0x80000002);
        std::memcpy(caps.cpu_string, cpu_id, sizeof(cpu_id));
        __cpuid(cpu_id, 0x80000003);
        std::memcpy(caps.cpu_string + 16, cpu_id, sizeof(cpu_id));
        __cpuid(cpu_id, 0x80000004);
        std::memcpy(caps.cpu_string + 32, cpu_id, sizeof(cpu_id));
    }

    if (max_ex_fn >= 0x80000001) {
        // Check for more features
        __cpuid(cpu_id, 0x80000001);
        if (cpu_id[2] & 1) caps.lahf_sahf_64 = true;
        if ((cpu_id[2] >> 5) & 1) caps.lzcnt = true;
        if ((cpu_id[2] >> 16) & 1) caps.fma4 = true;
        if ((cpu_id[3] >> 29) & 1) caps.long_mode = true;
    }

    return caps;
}
151
// Runs detection once (thread-safe function-local static) and returns the
// cached capability set on subsequent calls.
const CPUCaps& GetCPUCaps() {
    static CPUCaps caps = Detect();
    return caps;
}
156
157std::string GetCPUCapsString() {
158 auto caps = GetCPUCaps();
159
160 std::string sum(caps.cpu_string);
161 sum += " (";
162 sum += caps.brand_string;
163 sum += ")";
164
165 if (caps.sse) sum += ", SSE";
166 if (caps.sse2) {
167 sum += ", SSE2";
168 if (!caps.flush_to_zero) sum += " (without DAZ)";
169 }
170
171 if (caps.sse3) sum += ", SSE3";
172 if (caps.ssse3) sum += ", SSSE3";
173 if (caps.sse4_1) sum += ", SSE4.1";
174 if (caps.sse4_2) sum += ", SSE4.2";
175 if (caps.avx) sum += ", AVX";
176 if (caps.avx2) sum += ", AVX2";
177 if (caps.bmi1) sum += ", BMI1";
178 if (caps.bmi2) sum += ", BMI2";
179 if (caps.fma) sum += ", FMA";
180 if (caps.aes) sum += ", AES";
181 if (caps.movbe) sum += ", MOVBE";
182 if (caps.long_mode) sum += ", 64-bit support";
183
184 return sum;
185}
186
187} // namespace Common
diff --git a/src/common/x64/cpu_detect.h b/src/common/x64/cpu_detect.h
new file mode 100644
index 000000000..0af3a8adb
--- /dev/null
+++ b/src/common/x64/cpu_detect.h
@@ -0,0 +1,66 @@
1// Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <string>
8
9namespace Common {
10
11/// x86/x64 CPU vendors that may be detected by this module
enum class CPUVendor {
    INTEL, // CPUID vendor ID string "GenuineIntel"
    AMD,   // CPUID vendor ID string "AuthenticAMD"
    OTHER, // any other vendor ID string
};
17
18/// x86/x64 CPU capabilities that may be detected by this module
19struct CPUCaps {
20 CPUVendor vendor;
21 char cpu_string[0x21];
22 char brand_string[0x41];
23 int num_cores;
24 bool sse;
25 bool sse2;
26 bool sse3;
27 bool ssse3;
28 bool sse4_1;
29 bool sse4_2;
30 bool lzcnt;
31 bool avx;
32 bool avx2;
33 bool bmi1;
34 bool bmi2;
35 bool fma;
36 bool fma4;
37 bool aes;
38
39 // Support for the FXSAVE and FXRSTOR instructions
40 bool fxsave_fxrstor;
41
42 bool movbe;
43
44 // This flag indicates that the hardware supports some mode in which denormal inputs and outputs
45 // are automatically set to (signed) zero.
46 bool flush_to_zero;
47
48 // Support for LAHF and SAHF instructions in 64-bit mode
49 bool lahf_sahf_64;
50
51 bool long_mode;
52};
53
54/**
55 * Gets the supported capabilities of the host CPU
56 * @return Reference to a CPUCaps struct with the detected host CPU capabilities
57 */
58const CPUCaps& GetCPUCaps();
59
60/**
61 * Gets a string summary of the name and supported capabilities of the host CPU
62 * @return String summary
63 */
64std::string GetCPUCapsString();
65
66} // namespace Common
diff --git a/src/common/x64/emitter.cpp b/src/common/x64/emitter.cpp
new file mode 100644
index 000000000..4b79acd1f
--- /dev/null
+++ b/src/common/x64/emitter.cpp
@@ -0,0 +1,1989 @@
1// Copyright (C) 2003 Dolphin Project.
2
3// This program is free software: you can redistribute it and/or modify
4// it under the terms of the GNU General Public License as published by
5// the Free Software Foundation, version 2.0 or later versions.
6
7// This program is distributed in the hope that it will be useful,
8// but WITHOUT ANY WARRANTY; without even the implied warranty of
9// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10// GNU General Public License 2.0 for more details.
11
12// A copy of the GPL 2.0 should have been included with the program.
13// If not, see http://www.gnu.org/licenses/
14
15// Official SVN repository and contact information can be found at
16// http://code.google.com/p/dolphin-emu/
17
18#include <cstring>
19
20#include "common/assert.h"
21#include "common/logging/log.h"
22#include "common/memory_util.h"
23
24#include "abi.h"
25#include "cpu_detect.h"
26#include "emitter.h"
27
28#define PRIx64 "llx"
29
30// Minimize the diff against Dolphin
31#define DYNA_REC JIT
32
33namespace Gen
34{
35
// Opcode table entry for the "normal" two-operand integer ops (ADD, SUB, ...):
// each field is the opcode byte for one operand-direction/immediate form, and
// `ext` is the /digit ModRM reg-field extension used by the immediate forms.
struct NormalOpDef
{
    u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext;
};
40
41// 0xCC is code for invalid combination of immediates
42static const NormalOpDef normalops[11] =
43{
44 {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0}, //ADD
45 {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2}, //ADC
46
47 {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5}, //SUB
48 {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3}, //SBB
49
50 {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4}, //AND
51 {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1}, //OR
52
53 {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6}, //XOR
54 {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0}, //MOV
55
56 {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0}, //TEST (to == from)
57 {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7}, //CMP
58
59 {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7}, //XCHG
60};
61
62enum NormalSSEOps
63{
64 sseCMP = 0xC2,
65 sseADD = 0x58, //ADD
66 sseSUB = 0x5C, //SUB
67 sseAND = 0x54, //AND
68 sseANDN = 0x55, //ANDN
69 sseOR = 0x56,
70 sseXOR = 0x57,
71 sseMUL = 0x59, //MUL
72 sseDIV = 0x5E, //DIV
73 sseMIN = 0x5D, //MIN
74 sseMAX = 0x5F, //MAX
75 sseCOMIS = 0x2F, //COMIS
76 sseUCOMIS = 0x2E, //UCOMIS
77 sseSQRT = 0x51, //SQRT
78 sseRSQRT = 0x52, //RSQRT (NO DOUBLE PRECISION!!!)
79 sseRCP = 0x53, //RCP
80 sseMOVAPfromRM = 0x28, //MOVAP from RM
81 sseMOVAPtoRM = 0x29, //MOVAP to RM
82 sseMOVUPfromRM = 0x10, //MOVUP from RM
83 sseMOVUPtoRM = 0x11, //MOVUP to RM
84 sseMOVLPfromRM= 0x12,
85 sseMOVLPtoRM = 0x13,
86 sseMOVHPfromRM= 0x16,
87 sseMOVHPtoRM = 0x17,
88 sseMOVHLPS = 0x12,
89 sseMOVLHPS = 0x16,
90 sseMOVDQfromRM = 0x6F,
91 sseMOVDQtoRM = 0x7F,
92 sseMASKMOVDQU = 0xF7,
93 sseLDDQU = 0xF0,
94 sseSHUF = 0xC6,
95 sseMOVNTDQ = 0xE7,
96 sseMOVNTP = 0x2B,
97 sseHADD = 0x7C,
98};
99
100
// Repositions the emitter so subsequent instructions are written at `ptr`.
void XEmitter::SetCodePtr(u8 *ptr)
{
    code = ptr;
}
105
// Returns the current (read-only) write position of the emitter.
const u8 *XEmitter::GetCodePtr() const
{
    return code;
}
110
// Returns the current write position as a mutable pointer (for patching).
u8 *XEmitter::GetWritableCodePtr()
{
    return code;
}
115
// Advances the write position by `bytes`, filling the gap with 0xCC (INT3)
// so accidentally executed padding traps immediately.
void XEmitter::ReserveCodeSpace(int bytes)
{
    for (int i = 0; i < bytes; i++)
        *code++ = 0xCC;
}
121
// Pads with INT3 up to the next 4-byte boundary and returns the aligned position.
const u8 *XEmitter::AlignCode4()
{
    int c = int((u64)code & 3);
    if (c)
        ReserveCodeSpace(4-c);
    return code;
}
129
// Pads with INT3 up to the next 16-byte boundary and returns the aligned position.
const u8 *XEmitter::AlignCode16()
{
    int c = int((u64)code & 15);
    if (c)
        ReserveCodeSpace(16-c);
    return code;
}
137
// Pads with INT3 up to the next 4096-byte page boundary and returns the aligned position.
const u8 *XEmitter::AlignCodePage()
{
    int c = int((u64)code & 4095);
    if (c)
        ReserveCodeSpace(4096-c);
    return code;
}
145
146// This operation modifies flags; check to see the flags are locked.
147// If the flags are locked, we should immediately and loudly fail before
148// causing a subtle JIT bug.
void XEmitter::CheckFlags()
{
    // Fail fast: emitting a flag-modifying instruction inside a locked-flags
    // region would be a subtle JIT bug.
    ASSERT_MSG(!flags_locked, "Attempt to modify flags while flags locked!");
}
153
// Emits a ModRM byte: mod in bits 7-6, reg in bits 5-3, rm in bits 2-0.
void XEmitter::WriteModRM(int mod, int reg, int rm)
{
    Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7)));
}
158
// Emits a SIB byte: scale in bits 7-6, index in bits 5-3, base in bits 2-0.
void XEmitter::WriteSIB(int scale, int index, int base)
{
    Write8((u8)((scale << 6) | ((index & 7) << 3) | (base & 7)));
}
163
// Emits a REX prefix for this operand when one is required, and asserts the
// encoding is actually representable (e.g. AH-DH cannot be used with REX).
void OpArg::WriteRex(XEmitter *emit, int opBits, int bits, int customOp) const
{
    if (customOp == -1) customOp = operandReg;
#ifdef ARCHITECTURE_x86_64
    u8 op = 0x40;
    // REX.W (whether operation is a 64-bit operation)
    if (opBits == 64) op |= 8;
    // REX.R (whether ModR/M reg field refers to R8-R15)
    if (customOp & 8) op |= 4;
    // REX.X (whether ModR/M SIB index field refers to R8-R15)
    if (indexReg & 8) op |= 2;
    // REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15)
    if (offsetOrBaseReg & 8) op |= 1;
    // Write REX if we have REX bits to write, or if the operation accesses
    // SIL, DIL, BPL, or SPL (8-bit ops on those registers need an empty REX).
    if (op != 0x40 ||
        (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) ||
        (opBits == 8 && (customOp & 0x10c) == 4))
    {
        emit->Write8(op);
        // Check the operation doesn't access AH, BH, CH, or DH.
        DEBUG_ASSERT((offsetOrBaseReg & 0x100) == 0);
        DEBUG_ASSERT((customOp & 0x100) == 0);
    }
#else
    // 32-bit build: no REX prefix exists; assert nothing requiring one is used.
    DEBUG_ASSERT(opBits != 64);
    DEBUG_ASSERT((customOp & 8) == 0 || customOp == -1);
    DEBUG_ASSERT((indexReg & 8) == 0);
    DEBUG_ASSERT((offsetOrBaseReg & 8) == 0);
    DEBUG_ASSERT(opBits != 8 || (customOp & 0x10c) != 4 || customOp == -1);
    DEBUG_ASSERT(scale == SCALE_ATREG || bits != 8 || (offsetOrBaseReg & 0x10c) != 4);
#endif
}
197
// Emits a VEX prefix. Uses the compact 2-byte form (0xC5) when the fields
// that only exist in the 3-byte form (X, B, W, mmmmm != 1) are all at their
// implied defaults; otherwise emits the full 3-byte form (0xC4).
void OpArg::WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W) const
{
    // R/X/B are stored inverted in the VEX prefix.
    int R = !(regOp1 & 8);
    int X = !(indexReg & 8);
    int B = !(offsetOrBaseReg & 8);

    // vvvv is the (inverted) second source register; 0xf means "unused".
    int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf);

    // do we need any VEX fields that only appear in the three-byte form?
    if (X == 1 && B == 1 && W == 0 && mmmmm == 1)
    {
        u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 1) | pp;
        emit->Write8(0xC5);
        emit->Write8(RvvvvLpp);
    }
    else
    {
        u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm;
        u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 1) | pp;
        emit->Write8(0xC4);
        emit->Write8(RXBmmmmm);
        emit->Write8(WvvvvLpp);
    }
}
222
// Emits everything that follows the opcode byte(s) for this operand: the
// ModRM byte, an optional SIB byte, and an optional 8/32-bit displacement
// (or a RIP-relative 32-bit displacement). `extraBytes` is the size of any
// immediate that will follow, needed to compute RIP-relative distances.
void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
                       bool warn_64bit_offset) const
{
    if (_operandReg == INVALID_REG)
        _operandReg = (X64Reg)this->operandReg;
    int mod = 0;
    int ireg = indexReg;
    bool SIB = false;
    int _offsetOrBaseReg = this->offsetOrBaseReg;

    if (scale == SCALE_RIP) //Also, on 32-bit, just an immediate address
    {
        // Oh, RIP addressing.
        _offsetOrBaseReg = 5;
        emit->WriteModRM(0, _operandReg, _offsetOrBaseReg);
        //TODO : add some checks
#ifdef ARCHITECTURE_x86_64
        // RIP displacements are relative to the end of the instruction, i.e.
        // past the 4 displacement bytes and any trailing immediate.
        u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes;
        s64 distance = (s64)offset - (s64)ripAddr;
        ASSERT_MSG(
            (distance < 0x80000000LL &&
             distance >= -0x80000000LL) ||
            !warn_64bit_offset,
            "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")",
            ripAddr, offset);
        s32 offs = (s32)distance;
        emit->Write32((u32)offs);
#else
        emit->Write32((u32)offset);
#endif
        return;
    }

    if (scale == 0)
    {
        // Oh, no memory, Just a reg.
        mod = 3; //11
    }
    else if (scale >= 1)
    {
        //Ah good, no scaling.
        if (scale == SCALE_ATREG && !((_offsetOrBaseReg & 7) == 4 || (_offsetOrBaseReg & 7) == 5))
        {
            //Okay, we're good. No SIB necessary.
            int ioff = (int)offset;
            if (ioff == 0)
            {
                mod = 0;
            }
            else if (ioff<-128 || ioff>127)
            {
                mod = 2; //32-bit displacement
            }
            else
            {
                mod = 1; //8-bit displacement
            }
        }
        else if (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)
        {
            // Index*scale with no base register: mod=0 with base field 5
            // selects "disp32 only" and forces a 32-bit displacement.
            SIB = true;
            mod = 0;
            _offsetOrBaseReg = 5;
        }
        else //if (scale != SCALE_ATREG)
        {
            if ((_offsetOrBaseReg & 7) == 4) //this would occupy the SIB encoding :(
            {
                //So we have to fake it with SIB encoding :(
                SIB = true;
            }

            if (scale >= SCALE_1 && scale < SCALE_ATREG)
            {
                SIB = true;
            }

            if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4))
            {
                SIB = true;
                ireg = _offsetOrBaseReg;
            }

            //Okay, we're fine. Just disp encoding.
            //We need displacement. Which size?
            int ioff = (int)(s64)offset;
            if (ioff < -128 || ioff > 127)
            {
                mod = 2; //32-bit displacement
            }
            else
            {
                mod = 1; //8-bit displacement
            }
        }
    }

    // Okay. Time to do the actual writing
    // ModRM byte:
    int oreg = _offsetOrBaseReg;
    if (SIB)
        oreg = 4;

    // TODO(ector): WTF is this if about? I don't remember writing it :-)
    //if (RIP)
    //    oreg = 5;

    emit->WriteModRM(mod, _operandReg&7, oreg&7);

    if (SIB)
    {
        //SIB byte
        int ss;
        switch (scale)
        {
        case SCALE_NONE: _offsetOrBaseReg = 4; ss = 0; break; //RSP
        case SCALE_1: ss = 0; break;
        case SCALE_2: ss = 1; break;
        case SCALE_4: ss = 2; break;
        case SCALE_8: ss = 3; break;
        case SCALE_NOBASE_2: ss = 1; break;
        case SCALE_NOBASE_4: ss = 2; break;
        case SCALE_NOBASE_8: ss = 3; break;
        case SCALE_ATREG: ss = 0; break;
        default: ASSERT_MSG(0, "Invalid scale for SIB byte"); ss = 0; break;
        }
        emit->Write8((u8)((ss << 6) | ((ireg&7)<<3) | (_offsetOrBaseReg&7)));
    }

    if (mod == 1) //8-bit disp
    {
        emit->Write8((u8)(s8)(s32)offset);
    }
    else if (mod == 2 || (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)) //32-bit disp
    {
        emit->Write32((u32)offset);
    }
}
361
362// W = operand extended width (1 if 64-bit)
363// R = register# upper bit
364// X = scale amnt upper bit
365// B = base register# upper bit
366void XEmitter::Rex(int w, int r, int x, int b)
367{
368 w = w ? 1 : 0;
369 r = r ? 1 : 0;
370 x = x ? 1 : 0;
371 b = b ? 1 : 0;
372 u8 rx = (u8)(0x40 | (w << 3) | (r << 2) | (x << 1) | (b));
373 if (rx != 0x40)
374 Write8(rx);
375}
376
// Unconditional relative jump to an absolute target address.
// Uses the 2-byte EB rel8 form unless force5Bytes requests E9 rel32.
// Displacements are measured from the end of the instruction, hence
// the +2 / +5 added to the current code pointer.
void XEmitter::JMP(const u8 *addr, bool force5Bytes)
{
    u64 fn = (u64)addr;
    if (!force5Bytes)
    {
        s64 distance = (s64)(fn - ((u64)code + 2));
        ASSERT_MSG(distance >= -0x80 && distance < 0x80,
                   "Jump target too far away, needs force5Bytes = true");
        //8 bits will do
        Write8(0xEB);
        Write8((u8)(s8)distance);
    }
    else
    {
        s64 distance = (s64)(fn - ((u64)code + 5));

        ASSERT_MSG(
            distance >= -0x80000000LL && distance < 0x80000000LL,
            "Jump target too far away, needs indirect register");
        Write8(0xE9);
        Write32((u32)(s32)distance);
    }
}
400
401void XEmitter::JMPptr(const OpArg &arg2)
402{
403 OpArg arg = arg2;
404 if (arg.IsImm()) ASSERT_MSG(0, "JMPptr - Imm argument");
405 arg.operandReg = 4;
406 arg.WriteRex(this, 0, 0);
407 Write8(0xFF);
408 arg.WriteRest(this);
409}
410
//Can be used to trap other processors, before overwriting their code
// not used in dolphin
// Emits "jmp $" (EB FE): a two-byte jump whose target is its own first
// byte, i.e. an infinite busy loop.
void XEmitter::JMPself()
{
    Write8(0xEB);
    Write8(0xFE);
}
418
419void XEmitter::CALLptr(OpArg arg)
420{
421 if (arg.IsImm()) ASSERT_MSG(0, "CALLptr - Imm argument");
422 arg.operandReg = 2;
423 arg.WriteRex(this, 0, 0);
424 Write8(0xFF);
425 arg.WriteRest(this);
426}
427
// Direct relative call: E8 rel32. The unsigned comparison below is a
// wrap-around trick: a 64-bit difference representable as a signed
// 32-bit displacement lands either below 2^31 or above 2^64 - 2^31.
void XEmitter::CALL(const void *fnptr)
{
    u64 distance = u64(fnptr) - (u64(code) + 5);
    ASSERT_MSG(
        distance < 0x0000000080000000ULL ||
        distance >= 0xFFFFFFFF80000000ULL,
        "CALL out of range (%p calls %p)", code, fnptr);
    Write8(0xE8);
    Write32(u32(distance));
}
438
439FixupBranch XEmitter::J(bool force5bytes)
440{
441 FixupBranch branch;
442 branch.type = force5bytes ? 1 : 0;
443 branch.ptr = code + (force5bytes ? 5 : 2);
444 if (!force5bytes)
445 {
446 //8 bits will do
447 Write8(0xEB);
448 Write8(0);
449 }
450 else
451 {
452 Write8(0xE9);
453 Write32(0);
454 }
455 return branch;
456}
457
458FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes)
459{
460 FixupBranch branch;
461 branch.type = force5bytes ? 1 : 0;
462 branch.ptr = code + (force5bytes ? 6 : 2);
463 if (!force5bytes)
464 {
465 //8 bits will do
466 Write8(0x70 + conditionCode);
467 Write8(0);
468 }
469 else
470 {
471 Write8(0x0F);
472 Write8(0x80 + conditionCode);
473 Write32(0);
474 }
475 return branch;
476}
477
// Conditional jump to a known absolute target. Prefers the short
// 7x rel8 form; falls back to the 6-byte 0F 8x rel32 form when the
// target is out of rel8 range or force5bytes is set. The displacement
// is recomputed for the long form because the instruction length
// (and thus the end-of-instruction reference point) differs.
void XEmitter::J_CC(CCFlags conditionCode, const u8* addr, bool force5bytes)
{
    u64 fn = (u64)addr;
    s64 distance = (s64)(fn - ((u64)code + 2));
    if (distance < -0x80 || distance >= 0x80 || force5bytes)
    {
        distance = (s64)(fn - ((u64)code + 6));
        ASSERT_MSG(
            distance >= -0x80000000LL && distance < 0x80000000LL,
            "Jump target too far away, needs indirect register");
        Write8(0x0F);
        Write8(0x80 + conditionCode);
        Write32((u32)(s32)distance);
    }
    else
    {
        Write8(0x70 + conditionCode);
        Write8((u8)(s8)distance);
    }
}
498
// Patch a branch previously emitted by J()/J_CC() so that it jumps to
// the CURRENT code position. type 0 = rel8, type 1 = rel32. branch.ptr
// points just past the displacement field, so the displacement both is
// measured from it and is written at negative offsets from it.
void XEmitter::SetJumpTarget(const FixupBranch &branch)
{
    if (branch.type == 0)
    {
        s64 distance = (s64)(code - branch.ptr);
        ASSERT_MSG(distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true");
        branch.ptr[-1] = (u8)(s8)distance;
    }
    else if (branch.type == 1)
    {
        s64 distance = (s64)(code - branch.ptr);
        ASSERT_MSG(distance >= -0x80000000LL && distance < 0x80000000LL, "Jump target too far away, needs indirect register");
        // Overwrite the 4 displacement bytes in place.
        ((s32*)branch.ptr)[-1] = (s32)distance;
    }
}
514
515// INC/DEC considered harmful on newer CPUs due to partial flag set.
516// Use ADD, SUB instead.
517
518/*
519void XEmitter::INC(int bits, OpArg arg)
520{
521 if (arg.IsImm()) ASSERT_MSG(0, "INC - Imm argument");
522 arg.operandReg = 0;
523 if (bits == 16) {Write8(0x66);}
524 arg.WriteRex(this, bits, bits);
525 Write8(bits == 8 ? 0xFE : 0xFF);
526 arg.WriteRest(this);
527}
528void XEmitter::DEC(int bits, OpArg arg)
529{
530 if (arg.IsImm()) ASSERT_MSG(0, "DEC - Imm argument");
531 arg.operandReg = 1;
532 if (bits == 16) {Write8(0x66);}
533 arg.WriteRex(this, bits, bits);
534 Write8(bits == 8 ? 0xFE : 0xFF);
535 arg.WriteRest(this);
536}
537*/
538
//Single byte opcodes
//There is no PUSHAD/POPAD in 64-bit mode.
void XEmitter::INT3() {Write8(0xCC);} // software breakpoint
void XEmitter::RET() {Write8(0xC3);}  // near return
void XEmitter::RET_FAST() {Write8(0xF3); Write8(0xC3);} //two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to a ret
544
545// The first sign of decadence: optimized NOPs.
546void XEmitter::NOP(size_t size)
547{
548 DEBUG_ASSERT((int)size > 0);
549 while (true)
550 {
551 switch (size)
552 {
553 case 0:
554 return;
555 case 1:
556 Write8(0x90);
557 return;
558 case 2:
559 Write8(0x66); Write8(0x90);
560 return;
561 case 3:
562 Write8(0x0F); Write8(0x1F); Write8(0x00);
563 return;
564 case 4:
565 Write8(0x0F); Write8(0x1F); Write8(0x40); Write8(0x00);
566 return;
567 case 5:
568 Write8(0x0F); Write8(0x1F); Write8(0x44); Write8(0x00);
569 Write8(0x00);
570 return;
571 case 6:
572 Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x44);
573 Write8(0x00); Write8(0x00);
574 return;
575 case 7:
576 Write8(0x0F); Write8(0x1F); Write8(0x80); Write8(0x00);
577 Write8(0x00); Write8(0x00); Write8(0x00);
578 return;
579 case 8:
580 Write8(0x0F); Write8(0x1F); Write8(0x84); Write8(0x00);
581 Write8(0x00); Write8(0x00); Write8(0x00); Write8(0x00);
582 return;
583 case 9:
584 Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x84);
585 Write8(0x00); Write8(0x00); Write8(0x00); Write8(0x00);
586 Write8(0x00);
587 return;
588 case 10:
589 Write8(0x66); Write8(0x66); Write8(0x0F); Write8(0x1F);
590 Write8(0x84); Write8(0x00); Write8(0x00); Write8(0x00);
591 Write8(0x00); Write8(0x00);
592 return;
593 default:
594 // Even though x86 instructions are allowed to be up to 15 bytes long,
595 // AMD advises against using NOPs longer than 11 bytes because they
596 // carry a performance penalty on CPUs older than AMD family 16h.
597 Write8(0x66); Write8(0x66); Write8(0x66); Write8(0x0F);
598 Write8(0x1F); Write8(0x84); Write8(0x00); Write8(0x00);
599 Write8(0x00); Write8(0x00); Write8(0x00);
600 size -= 11;
601 continue;
602 }
603 }
604}
605
// Spin-wait hint and direct carry-flag manipulation.
void XEmitter::PAUSE() {Write8(0xF3); NOP();} //use in tight spinloops for energy saving on some cpu
void XEmitter::CLC() {CheckFlags(); Write8(0xF8);} //clear carry
void XEmitter::CMC() {CheckFlags(); Write8(0xF5);} //flip carry
void XEmitter::STC() {CheckFlags(); Write8(0xF9);} //set carry
610
//TODO: xchg ah, al ???
// NOTE(review): 86 E0 is XCHG r/m8, r8 with mod=11, reg=AH, rm=AL —
// i.e. it swaps AL and AH (the 86 C4 alternative encodes the same swap
// with the two fields exchanged).
void XEmitter::XCHG_AHAL()
{
    Write8(0x86);
    Write8(0xe0);
    // alt. 86 c4
}
618
//These two can not be executed on early Intel 64-bit CPU:s, only on AMD!
void XEmitter::LAHF() {Write8(0x9F);} // load flags into AH
void XEmitter::SAHF() {CheckFlags(); Write8(0x9E);} // store AH into flags

void XEmitter::PUSHF() {Write8(0x9C);} // push FLAGS register
void XEmitter::POPF() {CheckFlags(); Write8(0x9D);} // pop FLAGS register

// Memory ordering fences: 0F AE with opcode extensions /5, /6, /7.
void XEmitter::LFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xE8);}
void XEmitter::MFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF0);}
void XEmitter::SFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF8);}
629
// Emit a one-byte opcode that encodes its register in the low three bits
// (e.g. 50+r PUSH). Adds the 0x66 prefix for 16-bit operands and REX for
// 64-bit size / registers 8-15.
void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg)
{
    if (bits == 16)
        Write8(0x66);
    Rex(bits == 64, 0, 0, (int)reg >> 3);
    Write8(byte + ((int)reg & 7));
}
637
// Emit a two-byte opcode whose second byte encodes the register in its
// low three bits (e.g. 0F C8+r BSWAP). Prefix handling as above.
void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg)
{
    if (bits == 16)
        Write8(0x66);
    Rex(bits==64, 0, 0, (int)reg >> 3);
    Write8(byte1);
    Write8(byte2 + ((int)reg & 7));
}
646
// CWD/CDQ/CQO (opcode 99): sign-extend AX/EAX/RAX into DX:AX, EDX:EAX
// or RDX:RAX, selected by the operand size in `bits` (16/32/64).
void XEmitter::CWD(int bits)
{
    if (bits == 16)
        Write8(0x66);
    Rex(bits == 64, 0, 0, 0);
    Write8(0x99);
}
654
// CBW/CWDE/CDQE (opcode 98): sign-extend AL/AX/EAX into AX/EAX/RAX.
// Here `bits` is the SOURCE width (8/16/32), which is why the prefix
// conditions differ from CWD above.
void XEmitter::CBW(int bits)
{
    if (bits == 8)
        Write8(0x66);
    Rex(bits == 32, 0, 0, 0);
    Write8(0x98);
}
662
//Simple opcodes


//push/pop do not need wide to be 64-bit
void XEmitter::PUSH(X64Reg reg) {WriteSimple1Byte(32, 0x50, reg);} // 50+r
void XEmitter::POP(X64Reg reg) {WriteSimple1Byte(32, 0x58, reg);}  // 58+r
669
670void XEmitter::PUSH(int bits, const OpArg &reg)
671{
672 if (reg.IsSimpleReg())
673 PUSH(reg.GetSimpleReg());
674 else if (reg.IsImm())
675 {
676 switch (reg.GetImmBits())
677 {
678 case 8:
679 Write8(0x6A);
680 Write8((u8)(s8)reg.offset);
681 break;
682 case 16:
683 Write8(0x66);
684 Write8(0x68);
685 Write16((u16)(s16)(s32)reg.offset);
686 break;
687 case 32:
688 Write8(0x68);
689 Write32((u32)reg.offset);
690 break;
691 default:
692 ASSERT_MSG(0, "PUSH - Bad imm bits");
693 break;
694 }
695 }
696 else
697 {
698 if (bits == 16)
699 Write8(0x66);
700 reg.WriteRex(this, bits, bits);
701 Write8(0xFF);
702 reg.WriteRest(this, 0, (X64Reg)6);
703 }
704}
705
706void XEmitter::POP(int /*bits*/, const OpArg &reg)
707{
708 if (reg.IsSimpleReg())
709 POP(reg.GetSimpleReg());
710 else
711 ASSERT_MSG(0, "POP - Unsupported encoding");
712}
713
714void XEmitter::BSWAP(int bits, X64Reg reg)
715{
716 if (bits >= 32)
717 {
718 WriteSimple2Byte(bits, 0x0F, 0xC8, reg);
719 }
720 else if (bits == 16)
721 {
722 ROL(16, R(reg), Imm8(8));
723 }
724 else if (bits == 8)
725 {
726 // Do nothing - can't bswap a single byte...
727 }
728 else
729 {
730 ASSERT_MSG(0, "BSWAP - Wrong number of bits");
731 }
732}
733
// Undefined opcode - reserved
// If we ever need a way to always cause a non-breakpoint hard exception...
// Emits the architecturally guaranteed invalid-opcode instruction 0F 0B.
void XEmitter::UD2()
{
    Write8(0x0F);
    Write8(0x0B);
}
741
// Software prefetch (0F 18 /hint): the hint level is passed through the
// ModRM reg field as an opcode extension.
void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg)
{
    ASSERT_MSG(!arg.IsImm(), "PREFETCH - Imm argument");
    arg.operandReg = (u8)level;
    arg.WriteRex(this, 0, 0);
    Write8(0x0F);
    Write8(0x18);
    arg.WriteRest(this);
}
751
// SETcc r/m8 (0F 90+cc): set a byte to 0/1 depending on the condition.
// The 8 passed as the operand width to WriteRex presumably forces REX
// emission for SPL/BPL/SIL/DIL on x64 — confirm against WriteRex.
void XEmitter::SETcc(CCFlags flag, OpArg dest)
{
    ASSERT_MSG(!dest.IsImm(), "SETcc - Imm argument");
    dest.operandReg = 0;
    dest.WriteRex(this, 0, 8);
    Write8(0x0F);
    Write8(0x90 + (u8)flag);
    dest.WriteRest(this);
}
761
// CMOVcc r, r/m (0F 40+cc): conditionally move src into dest. No 8-bit
// form exists in the ISA.
void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag)
{
    ASSERT_MSG(!src.IsImm(), "CMOVcc - Imm argument");
    ASSERT_MSG(bits != 8, "CMOVcc - 8 bits unsupported");
    if (bits == 16)
        Write8(0x66);
    src.operandReg = dest;
    src.WriteRex(this, bits, bits);
    Write8(0x0F);
    Write8(0x40 + (u8)flag);
    src.WriteRest(this);
}
774
775void XEmitter::WriteMulDivType(int bits, OpArg src, int ext)
776{
777 ASSERT_MSG(!src.IsImm(), "WriteMulDivType - Imm argument");
778 CheckFlags();
779 src.operandReg = ext;
780 if (bits == 16)
781 Write8(0x66);
782 src.WriteRex(this, bits, bits, 0);
783 if (bits == 8)
784 {
785 Write8(0xF6);
786 }
787 else
788 {
789 Write8(0xF7);
790 }
791 src.WriteRest(this);
792}
793
// Group-3 ops; the last argument is the /ext opcode extension.
void XEmitter::MUL(int bits, OpArg src) {WriteMulDivType(bits, src, 4);}
void XEmitter::DIV(int bits, OpArg src) {WriteMulDivType(bits, src, 6);}
void XEmitter::IMUL(int bits, OpArg src) {WriteMulDivType(bits, src, 5);}
void XEmitter::IDIV(int bits, OpArg src) {WriteMulDivType(bits, src, 7);}
void XEmitter::NEG(int bits, OpArg src) {WriteMulDivType(bits, src, 3);}
void XEmitter::NOT(int bits, OpArg src) {WriteMulDivType(bits, src, 2);}
800
// Shared encoder for BSF/BSR (0F BC/BD) and, with rep=true, their
// F3-prefixed successors TZCNT/LZCNT. Also reused by MOVNTI (0F C3).
// The F3 prefix must come after the 0x66 operand-size prefix.
void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep)
{
    ASSERT_MSG(!src.IsImm(), "WriteBitSearchType - Imm argument");
    CheckFlags();
    src.operandReg = (u8)dest;
    if (bits == 16)
        Write8(0x66);
    if (rep)
        Write8(0xF3);
    src.WriteRex(this, bits, bits);
    Write8(0x0F);
    Write8(byte2);
    src.WriteRest(this);
}
815
// Non-temporal store MOVNTI m32/m64, r32/r64 (0F C3). Note the argument
// swap in the call below: MOVNTI writes reg -> mem, so `src` takes the
// role WriteBitSearchType names "dest" (the ModRM reg field).
void XEmitter::MOVNTI(int bits, OpArg dest, X64Reg src)
{
    if (bits <= 16)
        ASSERT_MSG(0, "MOVNTI - bits<=16");
    WriteBitSearchType(bits, src, dest, 0xC3);
}
822
void XEmitter::BSF(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBC);} //bottom bit to top bit
void XEmitter::BSR(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBD);} //top bit to bottom bit
825
// TZCNT/LZCNT: BSF/BSR opcodes with an F3 prefix (rep=true). Requires
// BMI1 / LZCNT CPU support; on older CPUs the prefix would be ignored
// and plain BSF/BSR semantics would result.
// (CheckFlags is called again inside WriteBitSearchType; harmless.)
void XEmitter::TZCNT(int bits, X64Reg dest, OpArg src)
{
    CheckFlags();
    if (!Common::GetCPUCaps().bmi1)
        ASSERT_MSG(0, "Trying to use BMI1 on a system that doesn't support it. Bad programmer.");
    WriteBitSearchType(bits, dest, src, 0xBC, true);
}
void XEmitter::LZCNT(int bits, X64Reg dest, OpArg src)
{
    CheckFlags();
    if (!Common::GetCPUCaps().lzcnt)
        ASSERT_MSG(0, "Trying to use LZCNT on a system that doesn't support it. Bad programmer.");
    WriteBitSearchType(bits, dest, src, 0xBD, true);
}
840
841void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src)
842{
843 ASSERT_MSG(!src.IsImm(), "MOVSX - Imm argument");
844 if (dbits == sbits)
845 {
846 MOV(dbits, R(dest), src);
847 return;
848 }
849 src.operandReg = (u8)dest;
850 if (dbits == 16)
851 Write8(0x66);
852 src.WriteRex(this, dbits, sbits);
853 if (sbits == 8)
854 {
855 Write8(0x0F);
856 Write8(0xBE);
857 }
858 else if (sbits == 16)
859 {
860 Write8(0x0F);
861 Write8(0xBF);
862 }
863 else if (sbits == 32 && dbits == 64)
864 {
865 Write8(0x63);
866 }
867 else
868 {
869 Crash();
870 }
871 src.WriteRest(this);
872}
873
874void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
875{
876 ASSERT_MSG(!src.IsImm(), "MOVZX - Imm argument");
877 if (dbits == sbits)
878 {
879 MOV(dbits, R(dest), src);
880 return;
881 }
882 src.operandReg = (u8)dest;
883 if (dbits == 16)
884 Write8(0x66);
885 //the 32bit result is automatically zero extended to 64bit
886 src.WriteRex(this, dbits == 64 ? 32 : dbits, sbits);
887 if (sbits == 8)
888 {
889 Write8(0x0F);
890 Write8(0xB6);
891 }
892 else if (sbits == 16)
893 {
894 Write8(0x0F);
895 Write8(0xB7);
896 }
897 else if (sbits == 32 && dbits == 64)
898 {
899 Write8(0x8B);
900 }
901 else
902 {
903 ASSERT_MSG(0, "MOVZX - Invalid size");
904 }
905 src.WriteRest(this);
906}
907
// MOVBE: move with byte swap. Load form is 0F 38 F0 (reg <- mem),
// store form is 0F 38 F1 (mem <- reg); exactly one operand must be
// memory. 8-bit requests degrade to a plain MOV (swapping one byte is
// a no-op).
void XEmitter::MOVBE(int bits, const OpArg& dest, const OpArg& src)
{
    ASSERT_MSG(Common::GetCPUCaps().movbe, "Generating MOVBE on a system that does not support it.");
    if (bits == 8)
    {
        MOV(bits, dest, src);
        return;
    }

    if (bits == 16)
        Write8(0x66);

    if (dest.IsSimpleReg())
    {
        ASSERT_MSG(!src.IsSimpleReg() && !src.IsImm(), "MOVBE: Loading from !mem");
        src.WriteRex(this, bits, bits, dest.GetSimpleReg());
        Write8(0x0F); Write8(0x38); Write8(0xF0);
        src.WriteRest(this, 0, dest.GetSimpleReg());
    }
    else if (src.IsSimpleReg())
    {
        ASSERT_MSG(!dest.IsSimpleReg() && !dest.IsImm(), "MOVBE: Storing to !mem");
        dest.WriteRex(this, bits, bits, src.GetSimpleReg());
        Write8(0x0F); Write8(0x38); Write8(0xF1);
        dest.WriteRest(this, 0, src.GetSimpleReg());
    }
    else
    {
        ASSERT_MSG(0, "MOVBE: Not loading or storing to mem");
    }
}
939
940
// LEA (8D /r): compute the effective address of `src` into `dest`.
// For 64-bit operands the final flag makes WriteRest warn if a
// RIP-relative target falls outside 32-bit displacement range.
void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
{
    ASSERT_MSG(!src.IsImm(), "LEA - Imm argument");
    src.operandReg = (u8)dest;
    if (bits == 16)
        Write8(0x66); //TODO: performance warning
    src.WriteRex(this, bits, bits);
    Write8(0x8D);
    src.WriteRest(this, 0, INVALID_REG, bits == 64);
}
951
//shift can be either imm8 or cl
// Shared encoder for the shift/rotate group. Picks among four opcode
// families: D0/D1 (shift by 1), C0/C1 (shift by imm8), D2/D3 (shift by
// CL); the low bit of each pair selects 8-bit vs wider operands, and
// `ext` selects the operation via the ModRM reg field.
void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext)
{
    CheckFlags();
    bool writeImm = false;
    if (dest.IsImm())
    {
        ASSERT_MSG(0, "WriteShift - can't shift imms");
    }
    if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8))
    {
        ASSERT_MSG(0, "WriteShift - illegal argument");
    }
    dest.operandReg = ext;
    if (bits == 16)
        Write8(0x66);
    dest.WriteRex(this, bits, bits, 0);
    if (shift.GetImmBits() == 8)
    {
        //ok an imm
        u8 imm = (u8)shift.offset;
        if (imm == 1)
        {
            // Shift-by-one has its own shorter encoding.
            Write8(bits == 8 ? 0xD0 : 0xD1);
        }
        else
        {
            writeImm = true;
            Write8(bits == 8 ? 0xC0 : 0xC1);
        }
    }
    else
    {
        // Count in CL.
        Write8(bits == 8 ? 0xD2 : 0xD3);
    }
    // Tell WriteRest an extra immediate byte follows, so RIP-relative
    // displacements stay correct.
    dest.WriteRest(this, writeImm ? 1 : 0);
    if (writeImm)
        Write8((u8)shift.offset);
}
991
// large rotates and shift are slower on intel than amd
// intel likes to rotate by 1, and the op is smaller too
// The last argument is the /ext opcode extension of the shift group.
void XEmitter::ROL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 0);}
void XEmitter::ROR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 1);}
void XEmitter::RCL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 2);}
void XEmitter::RCR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 3);}
void XEmitter::SHL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 4);}
void XEmitter::SHR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 5);}
void XEmitter::SAR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 7);}
1001
// index can be either imm8 or register, don't use memory destination because it's slow
// Shared encoder for the bit-test group. Immediate form is 0F BA /ext
// followed by the bit index; register form is 0F A3/AB/B3/BB, derived
// as 0x83 + 8*ext.
void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext)
{
    CheckFlags();
    if (dest.IsImm())
    {
        ASSERT_MSG(0, "WriteBitTest - can't test imms");
    }
    if ((index.IsImm() && index.GetImmBits() != 8))
    {
        ASSERT_MSG(0, "WriteBitTest - illegal argument");
    }
    if (bits == 16)
        Write8(0x66);
    if (index.IsImm())
    {
        dest.WriteRex(this, bits, bits);
        Write8(0x0F); Write8(0xBA);
        // One trailing immediate byte follows the ModRM tail.
        dest.WriteRest(this, 1, (X64Reg)ext);
        Write8((u8)index.offset);
    }
    else
    {
        X64Reg operand = index.GetSimpleReg();
        dest.WriteRex(this, bits, bits, operand);
        Write8(0x0F); Write8(0x83 + 8*ext);
        dest.WriteRest(this, 1, operand);
    }
}
1031
// Bit test family; the last argument is the group's /ext selector.
void XEmitter::BT(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 4);}
void XEmitter::BTS(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 5);}
void XEmitter::BTR(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 6);}
void XEmitter::BTC(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 7);}
1036
//shift can be either imm8 or cl
// Double-precision right shift: 0F AC (imm8 count) / 0F AD (count in CL).
// `src` supplies the bits shifted in and must be a plain register.
void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift)
{
    CheckFlags();
    if (dest.IsImm())
    {
        ASSERT_MSG(0, "SHRD - can't use imms as destination");
    }
    if (!src.IsSimpleReg())
    {
        ASSERT_MSG(0, "SHRD - must use simple register as source");
    }
    if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8))
    {
        ASSERT_MSG(0, "SHRD - illegal shift");
    }
    if (bits == 16)
        Write8(0x66);
    X64Reg operand = src.GetSimpleReg();
    dest.WriteRex(this, bits, bits, operand);
    if (shift.GetImmBits() == 8)
    {
        Write8(0x0F); Write8(0xAC);
        // Extra byte: the shift count immediate follows the ModRM tail.
        dest.WriteRest(this, 1, operand);
        Write8((u8)shift.offset);
    }
    else
    {
        Write8(0x0F); Write8(0xAD);
        dest.WriteRest(this, 0, operand);
    }
}
1069
// Double-precision left shift: 0F A4 (imm8 count) / 0F A5 (count in CL).
// Mirrors SHRD above.
void XEmitter::SHLD(int bits, OpArg dest, OpArg src, OpArg shift)
{
    CheckFlags();
    if (dest.IsImm())
    {
        ASSERT_MSG(0, "SHLD - can't use imms as destination");
    }
    if (!src.IsSimpleReg())
    {
        ASSERT_MSG(0, "SHLD - must use simple register as source");
    }
    if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8))
    {
        ASSERT_MSG(0, "SHLD - illegal shift");
    }
    if (bits == 16)
        Write8(0x66);
    X64Reg operand = src.GetSimpleReg();
    dest.WriteRex(this, bits, bits, operand);
    if (shift.GetImmBits() == 8)
    {
        Write8(0x0F); Write8(0xA4);
        // Extra byte: the shift count immediate follows the ModRM tail.
        dest.WriteRest(this, 1, operand);
        Write8((u8)shift.offset);
    }
    else
    {
        Write8(0x0F); Write8(0xA5);
        dest.WriteRest(this, 0, operand);
    }
}
1101
// Emit prefix(es), one opcode byte, then this operand's ModRM tail,
// with _operandReg in the ModRM reg field.
void OpArg::WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg _operandReg, int bits)
{
    if (bits == 16)
        emit->Write8(0x66);

    this->operandReg = (u8)_operandReg;
    WriteRex(emit, bits, bits);
    emit->Write8(op);
    WriteRest(emit);
}
1112
//operand can either be immediate or register
// Core encoder for the classic two-operand ALU ops (ADD/SUB/MOV/...).
// `this` is always the r/m operand; `operand` is either the other
// register (direction chosen by toRM) or an immediate. Opcode bytes are
// looked up in the normalops table; 0xCC entries mark unsupported forms.
// Picks the shortest encoding available: AL/EAX short forms, B0/B8+r
// MOV-immediate forms, or sign-extended imm8 where the op supports it.
void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const
{
    X64Reg _operandReg;
    if (IsImm())
    {
        ASSERT_MSG(0, "WriteNormalOp - Imm argument, wrong order");
    }

    if (bits == 16)
        emit->Write8(0x66);

    // Width (in bits) of the trailing immediate still to be written; 0
    // if none or if an early-return short form handled it already.
    int immToWrite = 0;

    if (operand.IsImm())
    {
        WriteRex(emit, bits, bits);

        if (!toRM)
        {
            ASSERT_MSG(0, "WriteNormalOp - Writing to Imm (!toRM)");
        }

        if (operand.scale == SCALE_IMM8 && bits == 8)
        {
            // op al, imm8
            if (!scale && offsetOrBaseReg == AL && normalops[op].eaximm8 != 0xCC)
            {
                emit->Write8(normalops[op].eaximm8);
                emit->Write8((u8)operand.offset);
                return;
            }
            // mov reg, imm8
            if (!scale && op == nrmMOV)
            {
                emit->Write8(0xB0 + (offsetOrBaseReg & 7));
                emit->Write8((u8)operand.offset);
                return;
            }
            // op r/m8, imm8
            emit->Write8(normalops[op].imm8);
            immToWrite = 8;
        }
        else if ((operand.scale == SCALE_IMM16 && bits == 16) ||
                 (operand.scale == SCALE_IMM32 && bits == 32) ||
                 (operand.scale == SCALE_IMM32 && bits == 64))
        {
            // Try to save immediate size if we can, but first check to see
            // if the instruction supports simm8.
            // op r/m, imm8
            if (normalops[op].simm8 != 0xCC &&
                ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) ||
                 (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset)))
            {
                emit->Write8(normalops[op].simm8);
                immToWrite = 8;
            }
            else
            {
                // mov reg, imm
                if (!scale && op == nrmMOV && bits != 64)
                {
                    emit->Write8(0xB8 + (offsetOrBaseReg & 7));
                    if (bits == 16)
                        emit->Write16((u16)operand.offset);
                    else
                        emit->Write32((u32)operand.offset);
                    return;
                }
                // op eax, imm
                if (!scale && offsetOrBaseReg == EAX && normalops[op].eaximm32 != 0xCC)
                {
                    emit->Write8(normalops[op].eaximm32);
                    if (bits == 16)
                        emit->Write16((u16)operand.offset);
                    else
                        emit->Write32((u32)operand.offset);
                    return;
                }
                // op r/m, imm
                emit->Write8(normalops[op].imm32);
                immToWrite = bits == 16 ? 16 : 32;
            }
        }
        else if ((operand.scale == SCALE_IMM8 && bits == 16) ||
                 (operand.scale == SCALE_IMM8 && bits == 32) ||
                 (operand.scale == SCALE_IMM8 && bits == 64))
        {
            // op r/m, imm8
            emit->Write8(normalops[op].simm8);
            immToWrite = 8;
        }
        else if (operand.scale == SCALE_IMM64 && bits == 64)
        {
            if (scale)
            {
                ASSERT_MSG(0, "WriteNormalOp - MOV with 64-bit imm requres register destination");
            }
            // mov reg64, imm64
            else if (op == nrmMOV)
            {
                emit->Write8(0xB8 + (offsetOrBaseReg & 7));
                emit->Write64((u64)operand.offset);
                return;
            }
            ASSERT_MSG(0, "WriteNormalOp - Only MOV can take 64-bit imm");
        }
        else
        {
            ASSERT_MSG(0, "WriteNormalOp - Unhandled case");
        }
        _operandReg = (X64Reg)normalops[op].ext; //pass extension in REG of ModRM
    }
    else
    {
        _operandReg = (X64Reg)operand.offsetOrBaseReg;
        WriteRex(emit, bits, bits, _operandReg);
        // op r/m, reg
        if (toRM)
        {
            emit->Write8(bits == 8 ? normalops[op].toRm8 : normalops[op].toRm32);
        }
        // op reg, r/m
        else
        {
            emit->Write8(bits == 8 ? normalops[op].fromRm8 : normalops[op].fromRm32);
        }
    }
    // WriteRest needs the immediate byte count so RIP-relative
    // displacements account for the bytes that follow.
    WriteRest(emit, immToWrite >> 3, _operandReg);
    switch (immToWrite)
    {
    case 0:
        break;
    case 8:
        emit->Write8((u8)operand.offset);
        break;
    case 16:
        emit->Write16((u16)operand.offset);
        break;
    case 32:
        emit->Write32((u32)operand.offset);
        break;
    default:
        ASSERT_MSG(0, "WriteNormalOp - Unhandled case");
    }
}
1259
1260void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2)
1261{
1262 if (a1.IsImm())
1263 {
1264 //Booh! Can't write to an imm
1265 ASSERT_MSG(0, "WriteNormalOp - a1 cannot be imm");
1266 return;
1267 }
1268 if (a2.IsImm())
1269 {
1270 a1.WriteNormalOp(emit, true, op, a2, bits);
1271 }
1272 else
1273 {
1274 if (a1.IsSimpleReg())
1275 {
1276 a2.WriteNormalOp(emit, false, op, a1, bits);
1277 }
1278 else
1279 {
1280 ASSERT_MSG(a2.IsSimpleReg() || a2.IsImm(), "WriteNormalOp - a1 and a2 cannot both be memory");
1281 a1.WriteNormalOp(emit, true, op, a2, bits);
1282 }
1283 }
1284}
1285
// Public two-operand integer ops, all funneled through WriteNormalOp.
void XEmitter::ADD (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADD, a1, a2);}
void XEmitter::ADC (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADC, a1, a2);}
void XEmitter::SUB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSUB, a1, a2);}
void XEmitter::SBB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSBB, a1, a2);}
void XEmitter::AND (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmAND, a1, a2);}
void XEmitter::OR  (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmOR , a1, a2);}
void XEmitter::XOR (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmXOR, a1, a2);}
void XEmitter::MOV (int bits, const OpArg &a1, const OpArg &a2)
{
    // A reg-to-same-reg MOV almost certainly indicates a JIT bug; warn.
    if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg())
        LOG_ERROR(Common, "Redundant MOV @ %p - bug in JIT?", code);
    WriteNormalOp(this, bits, nrmMOV, a1, a2);
}
void XEmitter::TEST(int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmTEST, a1, a2);}
void XEmitter::CMP (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmCMP, a1, a2);}
void XEmitter::XCHG(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmXCHG, a1, a2);}
1302
// Three-operand IMUL: regOp = a1 * imm. Uses the short 6B /r ib form
// when the immediate fits a sign-extended imm8, otherwise 69 /r with a
// full-width (16- or 32-bit) immediate. No 8-bit form exists.
void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2)
{
    CheckFlags();
    if (bits == 8)
    {
        ASSERT_MSG(0, "IMUL - illegal bit size!");
        return;
    }

    if (a1.IsImm())
    {
        ASSERT_MSG(0, "IMUL - second arg cannot be imm!");
        return;
    }

    if (!a2.IsImm())
    {
        ASSERT_MSG(0, "IMUL - third arg must be imm!");
        return;
    }

    if (bits == 16)
        Write8(0x66);
    a1.WriteRex(this, bits, bits, regOp);

    // Does the immediate survive truncation to a signed byte?
    if (a2.GetImmBits() == 8 ||
        (a2.GetImmBits() == 16 && (s8)a2.offset == (s16)a2.offset) ||
        (a2.GetImmBits() == 32 && (s8)a2.offset == (s32)a2.offset))
    {
        Write8(0x6B);
        a1.WriteRest(this, 1, regOp);
        Write8((u8)a2.offset);
    }
    else
    {
        Write8(0x69);
        if (a2.GetImmBits() == 16 && bits == 16)
        {
            a1.WriteRest(this, 2, regOp);
            Write16((u16)a2.offset);
        }
        else if (a2.GetImmBits() == 32 && (bits == 32 || bits == 64))
        {
            a1.WriteRest(this, 4, regOp);
            Write32((u32)a2.offset);
        }
        else
        {
            ASSERT_MSG(0, "IMUL - unhandled case!");
        }
    }
}
1355
1356void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a)
1357{
1358 CheckFlags();
1359 if (bits == 8)
1360 {
1361 ASSERT_MSG(0, "IMUL - illegal bit size!");
1362 return;
1363 }
1364
1365 if (a.IsImm())
1366 {
1367 IMUL(bits, regOp, R(regOp), a) ;
1368 return;
1369 }
1370
1371 if (bits == 16)
1372 Write8(0x66);
1373 a.WriteRex(this, bits, bits, regOp);
1374 Write8(0x0F);
1375 Write8(0xAF);
1376 a.WriteRest(this, 0, regOp);
1377}
1378
1379
// Legacy-SSE encoder: optional mandatory prefix (66/F2/F3), then REX,
// the 0F escape, a one- or two-byte opcode (high byte first, used for
// the 0F 38 / 0F 3A maps), the ModRM/SIB tail, and `extrabytes` of
// trailing immediate. Note the prefix is correctly written BEFORE REX.
void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
{
    if (opPrefix)
        Write8(opPrefix);
    arg.operandReg = regOp;
    arg.WriteRex(this, 0, 0);
    Write8(0x0F);
    if (op > 0xFF)
        Write8((op >> 8) & 0xFF);
    Write8(op & 0xFF);
    arg.WriteRest(this, extrabytes);
}
1392
// Two-operand AVX form: no second source register (VEX.vvvv = 1111b).
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
{
    WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes);
}
1397
1398static int GetVEXmmmmm(u16 op)
1399{
1400 // Currently, only 0x38 and 0x3A are used as secondary escape byte.
1401 if ((op >> 8) == 0x3A)
1402 return 3;
1403 else if ((op >> 8) == 0x38)
1404 return 2;
1405 else
1406 return 1;
1407}
1408
1409static int GetVEXpp(u8 opPrefix)
1410{
1411 if (opPrefix == 0x66)
1412 return 1;
1413 else if (opPrefix == 0xF3)
1414 return 2;
1415 else if (opPrefix == 0xF2)
1416 return 3;
1417 else
1418 return 0;
1419}
1420
// Three-operand AVX encoder: VEX prefix (carrying regOp2 in vvvv), the
// low opcode byte, then the ModRM tail with regOp1 in the reg field.
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
{
    if (!Common::GetCPUCaps().avx)
        ASSERT_MSG(0, "Trying to use AVX on a system that doesn't support it. Bad programmer.");
    int mmmmm = GetVEXmmmmm(op);
    int pp = GetVEXpp(opPrefix);
    // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here
    arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm);
    Write8(op & 0xFF);
    arg.WriteRest(this, extrabytes, regOp1);
}
1432
// Like the above, but more general; covers GPR-based VEX operations, like BMI1/2
// The final WriteVex argument sets VEX.W for 64-bit operand size.
void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
{
    if (size != 32 && size != 64)
        ASSERT_MSG(0, "VEX GPR instructions only support 32-bit and 64-bit modes!");
    int mmmmm = GetVEXmmmmm(op);
    int pp = GetVEXpp(opPrefix);
    arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm, size == 64);
    Write8(op & 0xFF);
    arg.WriteRest(this, extrabytes, regOp1);
}
1444
// BMI1 wrapper: records flag usage (CheckFlags -- BMI1 ops write EFLAGS),
// asserts CPU support, then defers to the generic VEX GPR writer.
void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
{
    CheckFlags();
    if (!Common::GetCPUCaps().bmi1)
        ASSERT_MSG(0, "Trying to use BMI1 on a system that doesn't support it. Bad programmer.");
    WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
}
1452
// BMI2 wrapper: same shape as WriteBMI1Op, but gated on the BMI2 capability bit.
void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
{
    CheckFlags();
    if (!Common::GetCPUCaps().bmi2)
        ASSERT_MSG(0, "Trying to use BMI2 on a system that doesn't support it. Bad programmer.");
    WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
}
1460
// MOVD: move 32 bits between an XMM register and a GPR/memory operand
// (66 0F 6E = load into xmm, 66 0F 7E = store from xmm).
void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6E, dest, arg, 0);}
void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(0x66, 0x7E, src, arg, 0);}
1463
// MOVQ (load form): move 64 bits into an XMM register.
void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg)
{
#ifdef ARCHITECTURE_x86_64
    // Alternate encoding
    // This does not display correctly in MSVC's debugger, it thinks it's a MOVD
    // (66 REX.W 0F 6E -- the 64-bit MOVD form, which can also read a GPR).
    arg.operandReg = dest;
    Write8(0x66);
    arg.WriteRex(this, 64, 0);
    Write8(0x0f);
    Write8(0x6E);
    arg.WriteRest(this, 0);
#else
    // 32-bit build: F3 0F 7E (MOVQ xmm, xmm/m64) -- no REX available.
    arg.operandReg = dest;
    Write8(0xF3);
    Write8(0x0f);
    Write8(0x7E);
    arg.WriteRest(this, 0);
#endif
}
1483
1484void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src)
1485{
1486 if (src > 7 || arg.IsSimpleReg())
1487 {
1488 // Alternate encoding
1489 // This does not display correctly in MSVC's debugger, it thinks it's a MOVD
1490 arg.operandReg = src;
1491 Write8(0x66);
1492 arg.WriteRex(this, 64, 0);
1493 Write8(0x0f);
1494 Write8(0x7E);
1495 arg.WriteRest(this, 0);
1496 }
1497 else
1498 {
1499 arg.operandReg = src;
1500 arg.WriteRex(this, 0, 0);
1501 Write8(0x66);
1502 Write8(0x0f);
1503 Write8(0xD6);
1504 arg.WriteRest(this, 0);
1505 }
1506}
1507
// Shared encoder for LDMXCSR/STMXCSR (0F AE with a /ext opcode extension in
// the ModRM reg field). These instructions only accept a memory operand, so
// immediates and plain registers are rejected up front.
void XEmitter::WriteMXCSR(OpArg arg, int ext)
{
    if (arg.IsImm() || arg.IsSimpleReg())
        ASSERT_MSG(0, "MXCSR - invalid operand");

    arg.operandReg = ext;
    arg.WriteRex(this, 0, 0);
    Write8(0x0F);
    Write8(0xAE);
    arg.WriteRest(this);
}

void XEmitter::STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);} // 0F AE /3: store MXCSR to memory
void XEmitter::LDMXCSR(OpArg memloc) {WriteMXCSR(memloc, 2);} // 0F AE /2: load MXCSR from memory
1522
// Non-temporal (cache-bypassing) stores.
void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);}
void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);}
void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);}

// Scalar float arithmetic. The mandatory prefix selects the data type:
// F3 = scalar single (SS), F2 = scalar double (SD).
void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseADD, regOp, arg);}
void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseADD, regOp, arg);}
void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSUB, regOp, arg);}
void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSUB, regOp, arg);}
// CMPSS/CMPSD take a trailing comparison-predicate immediate (hence extrabytes=1).
void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);}
void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);}
void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMUL, regOp, arg);}
void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMUL, regOp, arg);}
void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseDIV, regOp, arg);}
void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseDIV, regOp, arg);}
void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMIN, regOp, arg);}
void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMIN, regOp, arg);}
void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMAX, regOp, arg);}
void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMAX, regOp, arg);}
void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSQRT, regOp, arg);}
void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSQRT, regOp, arg);}
void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);}
1544
// Packed float arithmetic and logic. Prefix selects the data type:
// none = packed single (PS), 0x66 = packed double (PD).
void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseADD, regOp, arg);}
void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseADD, regOp, arg);}
void XEmitter::SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSUB, regOp, arg);}
void XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSUB, regOp, arg);}
// CMPPS/CMPPD take a trailing comparison-predicate immediate.
void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);}
void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);}
void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseAND, regOp, arg);}
void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseAND, regOp, arg);}
void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseANDN, regOp, arg);}
void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseANDN, regOp, arg);}
void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseOR, regOp, arg);}
void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseOR, regOp, arg);}
void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseXOR, regOp, arg);}
void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseXOR, regOp, arg);}
void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMUL, regOp, arg);}
void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMUL, regOp, arg);}
void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseDIV, regOp, arg);}
void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseDIV, regOp, arg);}
void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMIN, regOp, arg);}
void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMIN, regOp, arg);}
void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMAX, regOp, arg);}
void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMAX, regOp, arg);}
void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSQRT, regOp, arg);}
void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSQRT, regOp, arg);}
void XEmitter::RCPPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseRCP, regOp, arg); }
void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);}
// SHUFPS/SHUFPD take a trailing shuffle-control immediate.
void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);}
void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);}
1573
// NOTE(review): HADDPS is an SSE3 instruction but, unlike MOVDDUP below, it
// is emitted without a capability check -- presumably callers guard on SSE3
// themselves; confirm before using on baseline-SSE2 paths.
void XEmitter::HADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseHADD, regOp, arg);}

// Scalar compares that set EFLAGS directly.
void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} //weird that these should be packed
void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} //ordered
void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} //unordered
void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);}
1580
// Vector moves. Each comes in a load (xmm <- r/m) and a store (r/m <- xmm)
// overload, distinguished by the *fromRM / *toRM opcode constants.

// Aligned 128-bit moves.
void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);}
void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);}
void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);}
void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);}

// Unaligned 128-bit moves.
void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);}
void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);}
void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);}
void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);}

// Integer 128-bit moves (aligned/unaligned).
void XEmitter::MOVDQA(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg);}
void XEmitter::MOVDQA(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg);}
void XEmitter::MOVDQU(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg);}
void XEmitter::MOVDQU(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg);}

// Scalar moves (same opcode bytes as MOVUP*, selected by the F3/F2 prefix).
void XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);}
void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);}
void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);}
void XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);}

// Low/high 64-bit half moves.
void XEmitter::MOVLPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); }
void XEmitter::MOVLPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); }
void XEmitter::MOVLPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); }
void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); }

void XEmitter::MOVHPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); }
void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); }
void XEmitter::MOVHPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); }
void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); }

// Register-to-register half swaps (high<->low lanes).
void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));}
void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));}
1613
// Conversions. The CVTT* variants truncate toward zero; the plain CVT*
// forms round according to the current MXCSR rounding mode.
void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);}
void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);}

void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);}
void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);}
void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);}
void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);}
void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2A, regOp, arg);}
void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);}

void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);}
void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);}
void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);}
void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);}

// Truncating conversions (round toward zero regardless of MXCSR).
void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);}
void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2C, regOp, arg);}
void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);}
void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);}
1633
// Masked byte store (implicit destination address in RDI).
void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));}

// Extract per-element sign bits into a GPR.
void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);}
void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);}

// NOTE(review): LDDQU is SSE3 but has no capability check here -- confirm
// callers guard on SSE3.
void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only

// THESE TWO ARE UNTESTED.
void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}

void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x14, dest, arg);}
void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x15, dest, arg);}
1647
1648void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
1649{
1650 if (Common::GetCPUCaps().sse3)
1651 {
1652 WriteSSEOp(0xF2, 0x12, regOp, arg); //SSE3 movddup
1653 }
1654 else
1655 {
1656 // Simulate this instruction with SSE2 instructions
1657 if (!arg.IsSimpleReg(regOp))
1658 MOVSD(regOp, arg);
1659 UNPCKLPD(regOp, R(regOp));
1660 }
1661}
1662
//There are a few more left

// Also some integer instructions are missing

// Saturating pack (narrow) operations.
void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x6B, dest, arg);}
void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x63, dest, arg);}
void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, arg);}

// Low-half interleave (widen) operations.
void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);}
void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);}
void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x62, dest, arg);}
void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6C, dest, arg);}
1674
// Packed shift-by-immediate group. These opcodes (0F 71/72/73) encode the
// operation as a ModRM reg-field opcode extension, which is why an integer
// cast to X64Reg is passed where a register normally goes; the real target
// register travels in the r/m field via R(reg). The shift amount follows as
// an immediate byte. /2 = logical right, /3 = byte-wise right (PSRLDQ),
// /4 = arithmetic right, /6 = logical left, /7 = byte-wise left (PSLLDQ).
void XEmitter::PSRLW(X64Reg reg, int shift)
{
    WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
    Write8(shift);
}

void XEmitter::PSRLD(X64Reg reg, int shift)
{
    WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
    Write8(shift);
}

void XEmitter::PSRLQ(X64Reg reg, int shift)
{
    WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
    Write8(shift);
}

// Shift-by-register variant (0F D3): here "reg" really is the destination.
void XEmitter::PSRLQ(X64Reg reg, OpArg arg)
{
    WriteSSEOp(0x66, 0xd3, reg, arg);
}

void XEmitter::PSRLDQ(X64Reg reg, int shift) {
    WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
    Write8(shift);
}

void XEmitter::PSLLW(X64Reg reg, int shift)
{
    WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
    Write8(shift);
}

void XEmitter::PSLLD(X64Reg reg, int shift)
{
    WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
    Write8(shift);
}

void XEmitter::PSLLQ(X64Reg reg, int shift)
{
    WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
    Write8(shift);
}

void XEmitter::PSLLDQ(X64Reg reg, int shift) {
    WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg));
    Write8(shift);
}

void XEmitter::PSRAW(X64Reg reg, int shift)
{
    WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg));
    Write8(shift);
}

void XEmitter::PSRAD(X64Reg reg, int shift)
{
    WriteSSEOp(0x66, 0x72, (X64Reg)4, R(reg));
    Write8(shift);
}
1737
// Emits an SSSE3 instruction via the legacy-SSE encoder, asserting the host
// CPU actually reports SSSE3 support first.
void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
{
    if (!Common::GetCPUCaps().ssse3)
        ASSERT_MSG(0, "Trying to use SSSE3 on a system that doesn't support it. Bad programmer.");
    WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
}
1744
// Emits an SSE4.1 instruction via the legacy-SSE encoder, asserting the host
// CPU actually reports SSE4.1 support first.
void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
{
    if (!Common::GetCPUCaps().sse4_1)
        ASSERT_MSG(0, "Trying to use SSE4.1 on a system that doesn't support it. Bad programmer.");
    WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
}
1751
// SSSE3/SSE4.1 instructions. The high byte of the opcode constant (0x38xx /
// 0x3Axx) is the secondary escape byte that WriteSSEOp emits before the
// opcode; 0x3A-map instructions carry a trailing immediate.
void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);}
void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);}
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}

// SSE4.1 packed integer min/max (fills the gaps left by SSE2's PMIN/PMAX).
void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}
void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);}
void XEmitter::PMINUD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383b, dest, arg);}
void XEmitter::PMAXSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383c, dest, arg);}
void XEmitter::PMAXSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383d, dest, arg);}
void XEmitter::PMAXUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383e, dest, arg);}
void XEmitter::PMAXUD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383f, dest, arg);}

// Sign/zero-extending packed moves.
void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);}
void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);}
void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);}
void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);}
void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);}
void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);}
void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);}
void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);}
void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);}
void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);}
void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);}
void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);}

// Variable blends use XMM0 as the implicit mask; BLENDPS/PD take an
// immediate mask instead.
void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);}
void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);}
void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);}
void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); Write8(blend); }
void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); Write8(blend); }

// Rounding with an explicit rounding-control immediate.
void XEmitter::ROUNDSS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0A, dest, arg, 1); Write8(mode);}
void XEmitter::ROUNDSD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0B, dest, arg, 1); Write8(mode);}
void XEmitter::ROUNDPS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A08, dest, arg, 1); Write8(mode);}
void XEmitter::ROUNDPD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A09, dest, arg, 1); Write8(mode);}
1789
// SSE2 packed-integer operations (66-prefixed forms of the MMX opcodes).

// Bitwise logic.
void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDB, dest, arg);}
void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDF, dest, arg);}
void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEF, dest, arg);}
void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEB, dest, arg);}

// Wrapping add per element width.
void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFC, dest, arg);}
void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFD, dest, arg);}
void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFE, dest, arg);}
void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD4, dest, arg);}

// Saturating add (signed / unsigned).
void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEC, dest, arg);}
void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xED, dest, arg);}
void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDC, dest, arg);}
void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDD, dest, arg);}

// Wrapping subtract per element width.
void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF8, dest, arg);}
void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF9, dest, arg);}
void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFA, dest, arg);}
void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFB, dest, arg);}

// Saturating subtract (signed / unsigned).
void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE8, dest, arg);}
void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE9, dest, arg);}
void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD8, dest, arg);}
void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD9, dest, arg);}

// Rounded unsigned average.
void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE0, dest, arg);}
void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE3, dest, arg);}

// Element-wise compares producing all-ones / all-zeros masks.
void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x74, dest, arg);}
void XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x75, dest, arg);}
void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x76, dest, arg);}

void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x64, dest, arg);}
void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x65, dest, arg);}
void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x66, dest, arg);}

// Word extract/insert; the immediate selects the word lane.
void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC5, dest, arg, 1); Write8(subreg);}
void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC4, dest, arg, 1); Write8(subreg);}

void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF5, dest, arg); }
void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF6, dest, arg);}

// SSE2 only provides signed-word and unsigned-byte min/max; the other
// widths live in the SSE4.1 section above.
void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEE, dest, arg); }
void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDE, dest, arg); }
void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEA, dest, arg); }
void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDA, dest, arg); }

void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD7, dest, arg); }
// Shuffles with an immediate control byte (whole dwords / low words / high words).
void XEmitter::PSHUFD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, 0x70, regOp, arg, 1); Write8(shuffle);}
void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);}
void XEmitter::PSHUFHW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF3, 0x70, regOp, arg, 1); Write8(shuffle);}
1841
// VEX
// Three-operand AVX forms of the SSE ops above: regOp1 is the destination
// (ModRM reg field), regOp2 the first source (VEX.vvvv), arg the second source.
void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);}
void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);}
void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);}
void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);}
void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);}
void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);}
void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);}
void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);}
void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);}
void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);}
void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);}
void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);}

void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); }
void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); }
void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); }
void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); }
void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); }
void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); }
void XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); }
void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); }

void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); }
void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); }
void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); }
void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); }
1869
// FMA3 fused multiply-add family (VEX-encoded, 0F38 opcode map).
// NOTE(review): two suspicious points here to confirm against the ISA
// reference and fix together with WriteAVXOp:
//  1. These go through WriteAVXOp, which only asserts AVX -- there is no
//     separate FMA capability check.
//  2. The trailing "1" on every *PD/*SD variant lands in WriteAVXOp's
//     "extrabytes" parameter (WriteAVXOp passes no W to WriteVex), so it
//     does NOT set VEX.W=1 as the double-precision forms require, and it
//     tells the displacement fixup to expect an immediate byte that is
//     never written.
void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg); }
void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg); }
void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg); }
void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg, 1); }
void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg, 1); }
void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg, 1); }
void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg); }
void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg); }
void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg); }
void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg, 1); }
void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg, 1); }
void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg, 1); }
void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg); }
void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg); }
void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg); }
void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg, 1); }
void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg, 1); }
void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg, 1); }
void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg); }
void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg); }
void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg); }
void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg, 1); }
void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg, 1); }
void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg); }
void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg); }
void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg); }
void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg); }
void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg); }
void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg); }
void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg); }
void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg); }
void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg); }
void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg); }
void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg); }
void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg); }
void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg, 1); }
void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg, 1); }
void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg); }
void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg); }
void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg); }
void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg, 1); }
void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg, 1); }
void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg, 1); }
void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg); }
void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg); }
void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg); }
void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg, 1); }
void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg, 1); }
void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg, 1); }
1930
// BMI1/BMI2 (VEX-encoded) scalar integer instructions.
// SARX/SHLX/SHRX share opcode 0F 38 F7 and are distinguished only by the
// SIMD-prefix byte (F3/66/F2); the shift count travels in regOp2.
void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);}
void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);}
void XEmitter::SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);}
// RORX takes its rotate amount as a trailing immediate; the extra-bytes
// argument (1) reserves room for the Write8 that follows.
void XEmitter::RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate) {WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); Write8(rotate);}
void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);}
void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg);}
// Note the swapped register order relative to the other emitters here.
void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);}
void XEmitter::BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg);}
// BLSR/BLSMSK/BLSI share opcode 0F 38 F3 and encode the operation as an
// opcode extension passed in the register-operand slot ((X64Reg)0x1/0x2/0x3).
void XEmitter::BLSR(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);}
void XEmitter::BLSMSK(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);}
void XEmitter::BLSI(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);}
void XEmitter::BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2){WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);}
void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);}
1944
// Prefixes

// Each of these emits a single legacy prefix byte that applies to whatever
// instruction is emitted immediately afterwards.
void XEmitter::LOCK() { Write8(0xF0); }        // bus-lock prefix for atomic RMW
void XEmitter::REP() { Write8(0xF3); }         // repeat prefix for string ops
void XEmitter::REPNE() { Write8(0xF2); }       // repeat-while-not-equal prefix
void XEmitter::FSOverride() { Write8(0x64); }  // FS segment override
void XEmitter::GSOverride() { Write8(0x65); }  // GS segment override

// 0x9B: wait for pending unmasked x87 exceptions before continuing.
void XEmitter::FWAIT()
{
    Write8(0x9B);
}
1957
1958// TODO: make this more generic
1959void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg)
1960{
1961 int mf = 0;
1962 ASSERT_MSG(!(bits == 80 && op_80b == floatINVALID), "WriteFloatLoadStore: 80 bits not supported for this instruction");
1963 switch (bits)
1964 {
1965 case 32: mf = 0; break;
1966 case 64: mf = 4; break;
1967 case 80: mf = 2; break;
1968 default: ASSERT_MSG(0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)");
1969 }
1970 Write8(0xd9 | mf);
1971 // x87 instructions use the reg field of the ModR/M byte as opcode:
1972 if (bits == 80)
1973 op = op_80b;
1974 arg.WriteRest(this, 0, (X64Reg) op);
1975}
1976
// x87 load/store wrappers; bits must be 32, 64 or 80 (see WriteFloatLoadStore).
void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, floatLD80, src);}
// FST has no 80-bit form, so FST(80, ...) triggers the assert in the helper.
void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, floatINVALID, dest);}
void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, floatSTP80, dest);}
// Store the x87 status word into AX (DF E0), without waiting for exceptions.
void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); }

// Read the time-stamp counter into EDX:EAX (0F 31).
void XEmitter::RDTSC() { Write8(0x0F); Write8(0x31); }
1983
// Fill the entire code region with INT3 breakpoints so that any stray jump
// into stale JIT code traps immediately instead of executing garbage bytes.
void XCodeBlock::PoisonMemory() {
    // x86/64: 0xCC = breakpoint
    memset(region, 0xCC, region_size);
}
1988
1989}
diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h
new file mode 100644
index 000000000..e9c924126
--- /dev/null
+++ b/src/common/x64/emitter.h
@@ -0,0 +1,1067 @@
1// Copyright (C) 2003 Dolphin Project.
2
3// This program is free software: you can redistribute it and/or modify
4// it under the terms of the GNU General Public License as published by
5// the Free Software Foundation, version 2.0 or later versions.
6
7// This program is distributed in the hope that it will be useful,
8// but WITHOUT ANY WARRANTY; without even the implied warranty of
9// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10// GNU General Public License 2.0 for more details.
11
12// A copy of the GPL 2.0 should have been included with the program.
13// If not, see http://www.gnu.org/licenses/
14
15// Official SVN repository and contact information can be found at
16// http://code.google.com/p/dolphin-emu/
17
18#pragma once
19
20#include "common/assert.h"
21#include "common/common_types.h"
22#include "common/code_block.h"
23
24#if defined(ARCHITECTURE_x86_64) && !defined(_ARCH_64)
25#define _ARCH_64
26#endif
27
28#ifdef _ARCH_64
29#define PTRBITS 64
30#else
31#define PTRBITS 32
32#endif
33
34namespace Gen
35{
36
// Register identifiers as used in instruction encodings. Every width alias of
// the same physical register shares one numeric value (EAX/RAX/AL/AX == 0),
// so the `bits` parameter of an emitter decides the actual operand size.
enum X64Reg
{
    EAX = 0, EBX = 3, ECX = 1, EDX = 2,
    ESI = 6, EDI = 7, EBP = 5, ESP = 4,

    RAX = 0, RBX = 3, RCX = 1, RDX = 2,
    RSI = 6, RDI = 7, RBP = 5, RSP = 4,
    R8 = 8, R9 = 9, R10 = 10,R11 = 11,
    R12 = 12,R13 = 13,R14 = 14,R15 = 15,

    AL = 0, BL = 3, CL = 1, DL = 2,
    SIL = 6, DIL = 7, BPL = 5, SPL = 4,
    // Legacy high-byte registers carry an extra 0x100 flag to distinguish
    // them from SPL/BPL/SIL/DIL, which reuse the raw encodings 4-7.
    AH = 0x104, BH = 0x107, CH = 0x105, DH = 0x106,

    AX = 0, BX = 3, CX = 1, DX = 2,
    SI = 6, DI = 7, BP = 5, SP = 4,

    // SSE/AVX registers; YMM aliases deliberately share the XMM encodings.
    XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
    XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,

    YMM0=0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
    YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15,

    INVALID_REG = 0xFFFFFFFF
};
62
// Condition codes for Jcc/SETcc/CMOVcc. The values match the low nibble of
// the corresponding opcodes; equal-valued entries are mnemonic aliases
// (e.g. below == carry == not-above-or-equal).
enum CCFlags
{
    CC_O   = 0,
    CC_NO  = 1,
    CC_B   = 2, CC_C   = 2, CC_NAE = 2,
    CC_NB  = 3, CC_NC  = 3, CC_AE  = 3,
    CC_Z   = 4, CC_E   = 4,
    CC_NZ  = 5, CC_NE  = 5,
    CC_BE  = 6, CC_NA  = 6,
    CC_NBE = 7, CC_A   = 7,
    CC_S   = 8,
    CC_NS  = 9,
    CC_P   = 0xA, CC_PE  = 0xA,
    CC_NP  = 0xB, CC_PO  = 0xB,
    CC_L   = 0xC, CC_NGE = 0xC,
    CC_NL  = 0xD, CC_GE  = 0xD,
    CC_LE  = 0xE, CC_NG  = 0xE,
    CC_NLE = 0xF, CC_G   = 0xF
};
82
// Register-file sizes available in 64-bit mode.
enum
{
    NUMGPRs = 16,
    NUMXMMs = 16,
};
88
// Addressing-mode selector stored in OpArg::scale. Low values (1/2/4/8)
// describe SIB scales with a base register; SCALE_NOBASE_* are the same
// scales without a base (produced via `scale | 0x20`, see MScaled);
// SCALE_IMM* mark immediates, SCALE_RIP marks RIP-relative memory, and
// SCALE_NONE marks a plain register operand.
enum
{
    SCALE_NONE = 0,
    SCALE_1 = 1,
    SCALE_2 = 2,
    SCALE_4 = 4,
    SCALE_8 = 8,
    SCALE_ATREG = 16,
    //SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG
    SCALE_NOBASE_2 = 34,
    SCALE_NOBASE_4 = 36,
    SCALE_NOBASE_8 = 40,
    SCALE_RIP = 0xFF,
    SCALE_IMM8  = 0xF0,
    SCALE_IMM16 = 0xF1,
    SCALE_IMM32 = 0xF2,
    SCALE_IMM64 = 0xF3,
};
107
// Selects which two-operand integer ALU operation OpArg::WriteNormalOp emits.
enum NormalOp {
    nrmADD,
    nrmADC,
    nrmSUB,
    nrmSBB,
    nrmAND,
    nrmOR ,
    nrmXOR,
    nrmMOV,
    nrmTEST,
    nrmCMP,
    nrmXCHG,
};
121
// Immediate comparison predicates for CMPSS/CMPSD/CMPPS/CMPPD
// (passed as the `compare` byte; see the CMPxxSS helper wrappers).
enum {
    CMP_EQ = 0,
    CMP_LT = 1,
    CMP_LE = 2,
    CMP_UNORD = 3,
    CMP_NEQ = 4,
    CMP_NLT = 5,
    CMP_NLE = 6,
    CMP_ORD = 7,
};
132
// x87 opcode extensions placed in the reg field of the ModR/M byte by
// WriteFloatLoadStore. The 80-bit load/store forms use distinct extensions,
// hence the separate *80 values; floatINVALID flags "no 80-bit form exists".
enum FloatOp {
    floatLD = 0,
    floatST = 2,
    floatSTP = 3,
    floatLD80 = 5,
    floatSTP80 = 7,

    floatINVALID = -1,
};
142
// Rounding-mode immediates for the SSE4.1 ROUNDSS/ROUNDSD/ROUNDPS/ROUNDPD
// emitters. OR a mode from the first group with FROUND_IGNORE_PRECISION to
// suppress the precision exception.
enum FloatRound {
    FROUND_NEAREST = 0,
    FROUND_FLOOR = 1,
    FROUND_CEIL = 2,
    FROUND_ZERO = 3,
    FROUND_MXCSR = 4,

    FROUND_RAISE_PRECISION = 0,
    FROUND_IGNORE_PRECISION = 8,
};
153
154class XEmitter;
155
// RIP addressing does not benefit from micro op fusion on Core arch
//
// Describes a single instruction operand: a plain register, a memory
// reference (base/index/scale/displacement or RIP-relative), or an
// immediate. Which interpretation applies is decided by `scale`
// (see the SCALE_* constants): SCALE_NONE = register, SCALE_IMM* =
// immediate, everything else = memory.
struct OpArg
{
    OpArg() {} // dummy op arg, used for storage
    OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
    {
        operandReg = 0;
        scale = (u8)_scale;
        offsetOrBaseReg = (u16)rmReg;
        indexReg = (u16)scaledReg;
        //if scale == 0 never mind offsetting
        offset = _offset;
    }
    bool operator==(const OpArg &b) const
    {
        return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg &&
               indexReg == b.indexReg && offset == b.offset;
    }
    void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
    void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const;
    void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const;
    void WriteFloatModRM(XEmitter *emit, FloatOp op);
    void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
    // This one is public - must be written to
    u64 offset; // use RIP-relative as much as possible - 64-bit immediates are not available.
    u16 operandReg;

    void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const;
    bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
    bool IsSimpleReg() const {return scale == SCALE_NONE;}
    bool IsSimpleReg(X64Reg reg) const
    {
        if (!IsSimpleReg())
            return false;
        return GetSimpleReg() == reg;
    }

    // True unless both this and `other` are non-register, non-immediate
    // operands (an instruction cannot take two memory operands).
    bool CanDoOpWith(const OpArg &other) const
    {
        if (IsSimpleReg()) return true;
        if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false;
        return true;
    }

    // Width of the immediate in bits, or -1 if this is not an immediate.
    int GetImmBits() const
    {
        switch (scale)
        {
        case SCALE_IMM8: return 8;
        case SCALE_IMM16: return 16;
        case SCALE_IMM32: return 32;
        case SCALE_IMM64: return 64;
        default: return -1;
        }
    }

    // Re-tag this immediate with a different width; silently ignores
    // widths other than 8/16/32/64.
    void SetImmBits(int bits) {
        switch (bits)
        {
        case 8: scale = SCALE_IMM8; break;
        case 16: scale = SCALE_IMM16; break;
        case 32: scale = SCALE_IMM32; break;
        case 64: scale = SCALE_IMM64; break;
        }
    }

    // The register, if this is a plain register operand; INVALID_REG otherwise.
    X64Reg GetSimpleReg() const
    {
        if (scale == SCALE_NONE)
            return (X64Reg)offsetOrBaseReg;
        else
            return INVALID_REG;
    }

    // NOTE: truncates to the low 32 bits, even for SCALE_IMM64 immediates.
    u32 GetImmValue() const {
        return (u32)offset;
    }

    // For loops.
    void IncreaseOffset(int sz) {
        offset += sz;
    }

private:
    u8 scale;           // SCALE_* addressing-mode tag (see enum above)
    u16 offsetOrBaseReg; // base register, plain register, or unused
    u16 indexReg;        // scaled index register for SIB forms
};
244
// Memory operand referencing `ptr` via RIP-relative addressing.
inline OpArg M(const void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);}
template <typename T>
inline OpArg M(const T *ptr) {return OpArg((u64)(const void *)ptr, (int)SCALE_RIP);}
// Plain register operand.
inline OpArg R(X64Reg value) {return OpArg(0, SCALE_NONE, value);}
// Memory operand at the address held in `value`, no displacement.
inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);}
250
// Memory operand [value + offset].
// NOTE(review): the u32 cast stops a negative displacement from being
// sign-extended into the u64 offset field — presumably the encoder
// re-reads it as a signed 32-bit value; confirm against OpArg::WriteRest.
inline OpArg MDisp(X64Reg value, int offset)
{
    return OpArg((u32)offset, SCALE_ATREG, value);
}
255
// Memory operand [base + scaled*scale + offset].
inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
{
    return OpArg(offset, scale, base, scaled);
}

// Memory operand [scaled*scale + offset], with no base register.
// `scale | 0x20` maps SCALE_2/4/8 onto the SCALE_NOBASE_* encodings;
// scale 1 degenerates to a plain [reg + offset] access.
inline OpArg MScaled(X64Reg scaled, int scale, int offset)
{
    if (scale == SCALE_1)
        return OpArg(offset, SCALE_ATREG, scaled);
    else
        return OpArg(offset, scale | 0x20, RAX, scaled);
}

// Memory operand [base + offset] where both are registers.
inline OpArg MRegSum(X64Reg base, X64Reg offset)
{
    return MComplex(base, offset, 1, 0);
}
273
// Fixed-width immediate operands.
inline OpArg Imm8 (u8 imm) {return OpArg(imm, SCALE_IMM8);}
inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used
inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);}
inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);}
// Smallest immediate that survives encoding: x86 sign-extends imm8, so any
// value outside the signed 8-bit range must be emitted as a 32-bit immediate.
inline OpArg UImmAuto(u32 imm) {
    return OpArg(imm, imm >= 128 ? SCALE_IMM32 : SCALE_IMM8);
}
inline OpArg SImmAuto(s32 imm) {
    return OpArg(imm, (imm >= 128 || imm < -128) ? SCALE_IMM32 : SCALE_IMM8);
}
284
// Pointer-sized immediate: 64-bit on x86-64 builds, 32-bit otherwise.
#ifdef _ARCH_64
inline OpArg ImmPtr(const void* imm) {return Imm64((u64)imm);}
#else
inline OpArg ImmPtr(const void* imm) {return Imm32((u32)imm);}
#endif
290
291inline u32 PtrOffset(const void* ptr, const void* base)
292{
293#ifdef _ARCH_64
294 s64 distance = (s64)ptr-(s64)base;
295 if (distance >= 0x80000000LL ||
296 distance < -0x80000000LL)
297 {
298 ASSERT_MSG(0, "pointer offset out of range");
299 return 0;
300 }
301
302 return (u32)distance;
303#else
304 return (u32)ptr-(u32)base;
305#endif
306}
307
// Byte offset of element `index` within `array`, as a u32.
//usage: int a[]; ARRAY_OFFSET(a,10)
#define ARRAY_OFFSET(array,index) ((u32)((u64)&(array)[index]-(u64)&(array)[0]))
// Byte offset of member `elem` within struct instance `str`, as a u32.
//usage: struct {int e;} s; STRUCT_OFFSET(s,e)
#define STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str)))
312
// A forward branch whose displacement is filled in later by
// XEmitter::SetJumpTarget.
struct FixupBranch
{
    u8 *ptr;  // patch location handed to SetJumpTarget
    int type; //0 = 8bit 1 = 32bit
};
318
// SSE comparison predicates; numerically identical to the CMP_* constants.
enum SSECompare
{
    EQ = 0,
    LT,
    LE,
    UNORD,
    NEQ,
    NLT,
    NLE,
    ORD,
};

// Raw pointer into emitted code used as a branch destination.
typedef const u8* JumpTarget;
332
333class XEmitter
334{
335 friend struct OpArg; // for Write8 etc
336private:
337 u8 *code;
338 bool flags_locked;
339
340 void CheckFlags();
341
342 void Rex(int w, int r, int x, int b);
343 void WriteSimple1Byte(int bits, u8 byte, X64Reg reg);
344 void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg);
345 void WriteMulDivType(int bits, OpArg src, int ext);
346 void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false);
347 void WriteShift(int bits, OpArg dest, OpArg &shift, int ext);
348 void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext);
349 void WriteMXCSR(OpArg arg, int ext);
350 void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
351 void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
352 void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
353 void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
354 void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
355 void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
356 void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
357 void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
358 void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg);
359 void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
360
361 void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
362
363protected:
364 inline void Write8(u8 value) {*code++ = value;}
365 inline void Write16(u16 value) {*(u16*)code = (value); code += 2;}
366 inline void Write32(u32 value) {*(u32*)code = (value); code += 4;}
367 inline void Write64(u64 value) {*(u64*)code = (value); code += 8;}
368
369public:
370 XEmitter() { code = nullptr; flags_locked = false; }
371 XEmitter(u8 *code_ptr) { code = code_ptr; flags_locked = false; }
372 virtual ~XEmitter() {}
373
374 void WriteModRM(int mod, int rm, int reg);
375 void WriteSIB(int scale, int index, int base);
376
377 void SetCodePtr(u8 *ptr);
378 void ReserveCodeSpace(int bytes);
379 const u8 *AlignCode4();
380 const u8 *AlignCode16();
381 const u8 *AlignCodePage();
382 const u8 *GetCodePtr() const;
383 u8 *GetWritableCodePtr();
384
385 void LockFlags() { flags_locked = true; }
386 void UnlockFlags() { flags_locked = false; }
387
388 // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
389 // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
390 // INC and DEC are slow on Intel Core, but not on AMD. They create a
391 // false flag dependency because they only update a subset of the flags.
392 // XCHG is SLOW and should be avoided.
393
394 // Debug breakpoint
395 void INT3();
396
397 // Do nothing
398 void NOP(size_t count = 1);
399
400 // Save energy in wait-loops on P4 only. Probably not too useful.
401 void PAUSE();
402
403 // Flag control
404 void STC();
405 void CLC();
406 void CMC();
407
408 // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and AMD!
409 void LAHF(); // 3 cycle vector path
410 void SAHF(); // direct path fast
411
412
413 // Stack control
414 void PUSH(X64Reg reg);
415 void POP(X64Reg reg);
416 void PUSH(int bits, const OpArg &reg);
417 void POP(int bits, const OpArg &reg);
418 void PUSHF();
419 void POPF();
420
421 // Flow control
422 void RET();
423 void RET_FAST();
424 void UD2();
425 FixupBranch J(bool force5bytes = false);
426
427 void JMP(const u8 * addr, bool force5Bytes = false);
428 void JMP(OpArg arg);
429 void JMPptr(const OpArg &arg);
430 void JMPself(); //infinite loop!
431#ifdef CALL
432#undef CALL
433#endif
434 void CALL(const void *fnptr);
435 void CALLptr(OpArg arg);
436
437 FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
438 //void J_CC(CCFlags conditionCode, JumpTarget target);
439 void J_CC(CCFlags conditionCode, const u8 * addr, bool force5Bytes = false);
440
441 void SetJumpTarget(const FixupBranch &branch);
442
443 void SETcc(CCFlags flag, OpArg dest);
444 // Note: CMOV brings small if any benefit on current cpus.
445 void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag);
446
447 // Fences
448 void LFENCE();
449 void MFENCE();
450 void SFENCE();
451
452 // Bit scan
453 void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit
454 void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit
455
456 // Cache control
457 enum PrefetchLevel
458 {
459 PF_NTA, //Non-temporal (data used once and only once)
460 PF_T0, //All cache levels
461 PF_T1, //Levels 2+ (aliased to T0 on AMD)
462 PF_T2, //Levels 3+ (aliased to T0 on AMD)
463 };
464 void PREFETCH(PrefetchLevel level, OpArg arg);
465 void MOVNTI(int bits, OpArg dest, X64Reg src);
466 void MOVNTDQ(OpArg arg, X64Reg regOp);
467 void MOVNTPS(OpArg arg, X64Reg regOp);
468 void MOVNTPD(OpArg arg, X64Reg regOp);
469
470 // Multiplication / division
471 void MUL(int bits, OpArg src); //UNSIGNED
472 void IMUL(int bits, OpArg src); //SIGNED
473 void IMUL(int bits, X64Reg regOp, OpArg src);
474 void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm);
475 void DIV(int bits, OpArg src);
476 void IDIV(int bits, OpArg src);
477
478 // Shift
479 void ROL(int bits, OpArg dest, OpArg shift);
480 void ROR(int bits, OpArg dest, OpArg shift);
481 void RCL(int bits, OpArg dest, OpArg shift);
482 void RCR(int bits, OpArg dest, OpArg shift);
483 void SHL(int bits, OpArg dest, OpArg shift);
484 void SHR(int bits, OpArg dest, OpArg shift);
485 void SAR(int bits, OpArg dest, OpArg shift);
486
487 // Bit Test
488 void BT(int bits, OpArg dest, OpArg index);
489 void BTS(int bits, OpArg dest, OpArg index);
490 void BTR(int bits, OpArg dest, OpArg index);
491 void BTC(int bits, OpArg dest, OpArg index);
492
493 // Double-Precision Shift
494 void SHRD(int bits, OpArg dest, OpArg src, OpArg shift);
495 void SHLD(int bits, OpArg dest, OpArg src, OpArg shift);
496
497 // Extend EAX into EDX in various ways
498 void CWD(int bits = 16);
499 inline void CDQ() {CWD(32);}
500 inline void CQO() {CWD(64);}
501 void CBW(int bits = 8);
502 inline void CWDE() {CBW(16);}
503 inline void CDQE() {CBW(32);}
504
505 // Load effective address
506 void LEA(int bits, X64Reg dest, OpArg src);
507
508 // Integer arithmetic
509 void NEG (int bits, OpArg src);
510 void ADD (int bits, const OpArg &a1, const OpArg &a2);
511 void ADC (int bits, const OpArg &a1, const OpArg &a2);
512 void SUB (int bits, const OpArg &a1, const OpArg &a2);
513 void SBB (int bits, const OpArg &a1, const OpArg &a2);
514 void AND (int bits, const OpArg &a1, const OpArg &a2);
515 void CMP (int bits, const OpArg &a1, const OpArg &a2);
516
517 // Bit operations
518 void NOT (int bits, OpArg src);
519 void OR (int bits, const OpArg &a1, const OpArg &a2);
520 void XOR (int bits, const OpArg &a1, const OpArg &a2);
521 void MOV (int bits, const OpArg &a1, const OpArg &a2);
522 void TEST(int bits, const OpArg &a1, const OpArg &a2);
523
524 // Are these useful at all? Consider removing.
525 void XCHG(int bits, const OpArg &a1, const OpArg &a2);
526 void XCHG_AHAL();
527
528 // Byte swapping (32 and 64-bit only).
529 void BSWAP(int bits, X64Reg reg);
530
531 // Sign/zero extension
532 void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
533 void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
534
535 // Available only on Atom or >= Haswell so far. Test with GetCPUCaps().movbe.
536 void MOVBE(int dbits, const OpArg& dest, const OpArg& src);
537
538 // Available only on AMD >= Phenom or Intel >= Haswell
539 void LZCNT(int bits, X64Reg dest, OpArg src);
540 // Note: this one is actually part of BMI1
541 void TZCNT(int bits, X64Reg dest, OpArg src);
542
543 // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
544 void STMXCSR(OpArg memloc);
545 void LDMXCSR(OpArg memloc);
546
547 // Prefixes
548 void LOCK();
549 void REP();
550 void REPNE();
551 void FSOverride();
552 void GSOverride();
553
554 // x87
555 enum x87StatusWordBits {
556 x87_InvalidOperation = 0x1,
557 x87_DenormalizedOperand = 0x2,
558 x87_DivisionByZero = 0x4,
559 x87_Overflow = 0x8,
560 x87_Underflow = 0x10,
561 x87_Precision = 0x20,
562 x87_StackFault = 0x40,
563 x87_ErrorSummary = 0x80,
564 x87_C0 = 0x100,
565 x87_C1 = 0x200,
566 x87_C2 = 0x400,
567 x87_TopOfStack = 0x2000 | 0x1000 | 0x800,
568 x87_C3 = 0x4000,
569 x87_FPUBusy = 0x8000,
570 };
571
572 void FLD(int bits, OpArg src);
573 void FST(int bits, OpArg dest);
574 void FSTP(int bits, OpArg dest);
575 void FNSTSW_AX();
576 void FWAIT();
577
578 // SSE/SSE2: Floating point arithmetic
579 void ADDSS(X64Reg regOp, OpArg arg);
580 void ADDSD(X64Reg regOp, OpArg arg);
581 void SUBSS(X64Reg regOp, OpArg arg);
582 void SUBSD(X64Reg regOp, OpArg arg);
583 void MULSS(X64Reg regOp, OpArg arg);
584 void MULSD(X64Reg regOp, OpArg arg);
585 void DIVSS(X64Reg regOp, OpArg arg);
586 void DIVSD(X64Reg regOp, OpArg arg);
587 void MINSS(X64Reg regOp, OpArg arg);
588 void MINSD(X64Reg regOp, OpArg arg);
589 void MAXSS(X64Reg regOp, OpArg arg);
590 void MAXSD(X64Reg regOp, OpArg arg);
591 void SQRTSS(X64Reg regOp, OpArg arg);
592 void SQRTSD(X64Reg regOp, OpArg arg);
593 void RSQRTSS(X64Reg regOp, OpArg arg);
594
595 // SSE/SSE2: Floating point bitwise (yes)
596 void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
597 void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
598
599 inline void CMPEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_EQ); }
600 inline void CMPLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LT); }
601 inline void CMPLESS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LE); }
602 inline void CMPUNORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_UNORD); }
603 inline void CMPNEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NEQ); }
604 inline void CMPNLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NLT); }
605 inline void CMPORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_ORD); }
606
607 // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
608 void ADDPS(X64Reg regOp, OpArg arg);
609 void ADDPD(X64Reg regOp, OpArg arg);
610 void SUBPS(X64Reg regOp, OpArg arg);
611 void SUBPD(X64Reg regOp, OpArg arg);
612 void CMPPS(X64Reg regOp, OpArg arg, u8 compare);
613 void CMPPD(X64Reg regOp, OpArg arg, u8 compare);
614 void MULPS(X64Reg regOp, OpArg arg);
615 void MULPD(X64Reg regOp, OpArg arg);
616 void DIVPS(X64Reg regOp, OpArg arg);
617 void DIVPD(X64Reg regOp, OpArg arg);
618 void MINPS(X64Reg regOp, OpArg arg);
619 void MINPD(X64Reg regOp, OpArg arg);
620 void MAXPS(X64Reg regOp, OpArg arg);
621 void MAXPD(X64Reg regOp, OpArg arg);
622 void SQRTPS(X64Reg regOp, OpArg arg);
623 void SQRTPD(X64Reg regOp, OpArg arg);
624 void RCPPS(X64Reg regOp, OpArg arg);
625 void RSQRTPS(X64Reg regOp, OpArg arg);
626
627 // SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double)
628 void ANDPS(X64Reg regOp, OpArg arg);
629 void ANDPD(X64Reg regOp, OpArg arg);
630 void ANDNPS(X64Reg regOp, OpArg arg);
631 void ANDNPD(X64Reg regOp, OpArg arg);
632 void ORPS(X64Reg regOp, OpArg arg);
633 void ORPD(X64Reg regOp, OpArg arg);
634 void XORPS(X64Reg regOp, OpArg arg);
635 void XORPD(X64Reg regOp, OpArg arg);
636
637 // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation.
638 void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle);
639 void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle);
640
641 // SSE/SSE2: Useful alternative to shuffle in some cases.
642 void MOVDDUP(X64Reg regOp, OpArg arg);
643
644 // TODO: Actually implement
645#if 0
646 // SSE3: Horizontal operations in SIMD registers. Could be useful for various VFPU things like dot products...
647 void ADDSUBPS(X64Reg dest, OpArg src);
648 void ADDSUBPD(X64Reg dest, OpArg src);
649 void HADDPD(X64Reg dest, OpArg src);
650 void HSUBPS(X64Reg dest, OpArg src);
651 void HSUBPD(X64Reg dest, OpArg src);
652
653 // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
654 void DPPD(X64Reg dest, OpArg src, u8 arg);
655
656 // These are probably useful for VFPU emulation.
657 void INSERTPS(X64Reg dest, OpArg src, u8 arg);
658 void EXTRACTPS(OpArg dest, X64Reg src, u8 arg);
659#endif
660
661 // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
662 void HADDPS(X64Reg dest, OpArg src);
663
664 // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
665 void DPPS(X64Reg dest, OpArg src, u8 arg);
666
667 void UNPCKLPS(X64Reg dest, OpArg src);
668 void UNPCKHPS(X64Reg dest, OpArg src);
669 void UNPCKLPD(X64Reg dest, OpArg src);
670 void UNPCKHPD(X64Reg dest, OpArg src);
671
672 // SSE/SSE2: Compares.
673 void COMISS(X64Reg regOp, OpArg arg);
674 void COMISD(X64Reg regOp, OpArg arg);
675 void UCOMISS(X64Reg regOp, OpArg arg);
676 void UCOMISD(X64Reg regOp, OpArg arg);
677
678 // SSE/SSE2: Moves. Use the right data type for your data, in most cases.
679 void MOVAPS(X64Reg regOp, OpArg arg);
680 void MOVAPD(X64Reg regOp, OpArg arg);
681 void MOVAPS(OpArg arg, X64Reg regOp);
682 void MOVAPD(OpArg arg, X64Reg regOp);
683
684 void MOVUPS(X64Reg regOp, OpArg arg);
685 void MOVUPD(X64Reg regOp, OpArg arg);
686 void MOVUPS(OpArg arg, X64Reg regOp);
687 void MOVUPD(OpArg arg, X64Reg regOp);
688
689 void MOVDQA(X64Reg regOp, OpArg arg);
690 void MOVDQA(OpArg arg, X64Reg regOp);
691 void MOVDQU(X64Reg regOp, OpArg arg);
692 void MOVDQU(OpArg arg, X64Reg regOp);
693
694 void MOVSS(X64Reg regOp, OpArg arg);
695 void MOVSD(X64Reg regOp, OpArg arg);
696 void MOVSS(OpArg arg, X64Reg regOp);
697 void MOVSD(OpArg arg, X64Reg regOp);
698
699 void MOVLPS(X64Reg regOp, OpArg arg);
700 void MOVLPD(X64Reg regOp, OpArg arg);
701 void MOVLPS(OpArg arg, X64Reg regOp);
702 void MOVLPD(OpArg arg, X64Reg regOp);
703
704 void MOVHPS(X64Reg regOp, OpArg arg);
705 void MOVHPD(X64Reg regOp, OpArg arg);
706 void MOVHPS(OpArg arg, X64Reg regOp);
707 void MOVHPD(OpArg arg, X64Reg regOp);
708
709 void MOVHLPS(X64Reg regOp1, X64Reg regOp2);
710 void MOVLHPS(X64Reg regOp1, X64Reg regOp2);
711
712 void MOVD_xmm(X64Reg dest, const OpArg &arg);
713 void MOVQ_xmm(X64Reg dest, OpArg arg);
714 void MOVD_xmm(const OpArg &arg, X64Reg src);
715 void MOVQ_xmm(OpArg arg, X64Reg src);
716
717 // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
718 void MOVMSKPS(X64Reg dest, OpArg arg);
719 void MOVMSKPD(X64Reg dest, OpArg arg);
720
721 // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one.
722 void MASKMOVDQU(X64Reg dest, X64Reg src);
723 void LDDQU(X64Reg dest, OpArg src);
724
725 // SSE/SSE2: Data type conversions.
726 void CVTPS2PD(X64Reg dest, OpArg src);
727 void CVTPD2PS(X64Reg dest, OpArg src);
728 void CVTSS2SD(X64Reg dest, OpArg src);
729 void CVTSI2SS(X64Reg dest, OpArg src);
730 void CVTSD2SS(X64Reg dest, OpArg src);
731 void CVTSI2SD(X64Reg dest, OpArg src);
732 void CVTDQ2PD(X64Reg regOp, OpArg arg);
733 void CVTPD2DQ(X64Reg regOp, OpArg arg);
734 void CVTDQ2PS(X64Reg regOp, OpArg arg);
735 void CVTPS2DQ(X64Reg regOp, OpArg arg);
736
737 void CVTTPS2DQ(X64Reg regOp, OpArg arg);
738 void CVTTPD2DQ(X64Reg regOp, OpArg arg);
739
740 // Destinations are X64 regs (rax, rbx, ...) for these instructions.
741 void CVTSS2SI(X64Reg xregdest, OpArg src);
742 void CVTSD2SI(X64Reg xregdest, OpArg src);
743 void CVTTSS2SI(X64Reg xregdest, OpArg arg);
744 void CVTTSD2SI(X64Reg xregdest, OpArg arg);
745
746 // SSE2: Packed integer instructions
747 void PACKSSDW(X64Reg dest, OpArg arg);
748 void PACKSSWB(X64Reg dest, OpArg arg);
749 void PACKUSDW(X64Reg dest, OpArg arg);
750 void PACKUSWB(X64Reg dest, OpArg arg);
751
752 void PUNPCKLBW(X64Reg dest, const OpArg &arg);
753 void PUNPCKLWD(X64Reg dest, const OpArg &arg);
754 void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
755 void PUNPCKLQDQ(X64Reg dest, const OpArg &arg);
756
757 void PTEST(X64Reg dest, OpArg arg);
758 void PAND(X64Reg dest, OpArg arg);
759 void PANDN(X64Reg dest, OpArg arg);
760 void PXOR(X64Reg dest, OpArg arg);
761 void POR(X64Reg dest, OpArg arg);
762
763 void PADDB(X64Reg dest, OpArg arg);
764 void PADDW(X64Reg dest, OpArg arg);
765 void PADDD(X64Reg dest, OpArg arg);
766 void PADDQ(X64Reg dest, OpArg arg);
767
768 void PADDSB(X64Reg dest, OpArg arg);
769 void PADDSW(X64Reg dest, OpArg arg);
770 void PADDUSB(X64Reg dest, OpArg arg);
771 void PADDUSW(X64Reg dest, OpArg arg);
772
773 void PSUBB(X64Reg dest, OpArg arg);
774 void PSUBW(X64Reg dest, OpArg arg);
775 void PSUBD(X64Reg dest, OpArg arg);
776 void PSUBQ(X64Reg dest, OpArg arg);
777
778 void PSUBSB(X64Reg dest, OpArg arg);
779 void PSUBSW(X64Reg dest, OpArg arg);
780 void PSUBUSB(X64Reg dest, OpArg arg);
781 void PSUBUSW(X64Reg dest, OpArg arg);
782
783 void PAVGB(X64Reg dest, OpArg arg);
784 void PAVGW(X64Reg dest, OpArg arg);
785
786 void PCMPEQB(X64Reg dest, OpArg arg);
787 void PCMPEQW(X64Reg dest, OpArg arg);
788 void PCMPEQD(X64Reg dest, OpArg arg);
789
790 void PCMPGTB(X64Reg dest, OpArg arg);
791 void PCMPGTW(X64Reg dest, OpArg arg);
792 void PCMPGTD(X64Reg dest, OpArg arg);
793
794 void PEXTRW(X64Reg dest, OpArg arg, u8 subreg);
795 void PINSRW(X64Reg dest, OpArg arg, u8 subreg);
796
797 void PMADDWD(X64Reg dest, OpArg arg);
798 void PSADBW(X64Reg dest, OpArg arg);
799
800 void PMAXSW(X64Reg dest, OpArg arg);
801 void PMAXUB(X64Reg dest, OpArg arg);
802 void PMINSW(X64Reg dest, OpArg arg);
803 void PMINUB(X64Reg dest, OpArg arg);
804 // SSE4: More MAX/MIN instructions.
805 void PMINSB(X64Reg dest, OpArg arg);
806 void PMINSD(X64Reg dest, OpArg arg);
807 void PMINUW(X64Reg dest, OpArg arg);
808 void PMINUD(X64Reg dest, OpArg arg);
809 void PMAXSB(X64Reg dest, OpArg arg);
810 void PMAXSD(X64Reg dest, OpArg arg);
811 void PMAXUW(X64Reg dest, OpArg arg);
812 void PMAXUD(X64Reg dest, OpArg arg);
813
814 void PMOVMSKB(X64Reg dest, OpArg arg);
815 void PSHUFD(X64Reg dest, OpArg arg, u8 shuffle);
816 void PSHUFB(X64Reg dest, OpArg arg);
817
818 void PSHUFLW(X64Reg dest, OpArg arg, u8 shuffle);
819 void PSHUFHW(X64Reg dest, OpArg arg, u8 shuffle);
820
821 void PSRLW(X64Reg reg, int shift);
822 void PSRLD(X64Reg reg, int shift);
823 void PSRLQ(X64Reg reg, int shift);
824 void PSRLQ(X64Reg reg, OpArg arg);
825 void PSRLDQ(X64Reg reg, int shift);
826
827 void PSLLW(X64Reg reg, int shift);
828 void PSLLD(X64Reg reg, int shift);
829 void PSLLQ(X64Reg reg, int shift);
830 void PSLLDQ(X64Reg reg, int shift);
831
832 void PSRAW(X64Reg reg, int shift);
833 void PSRAD(X64Reg reg, int shift);
834
835 // SSE4: data type conversions
836 void PMOVSXBW(X64Reg dest, OpArg arg);
837 void PMOVSXBD(X64Reg dest, OpArg arg);
838 void PMOVSXBQ(X64Reg dest, OpArg arg);
839 void PMOVSXWD(X64Reg dest, OpArg arg);
840 void PMOVSXWQ(X64Reg dest, OpArg arg);
841 void PMOVSXDQ(X64Reg dest, OpArg arg);
842 void PMOVZXBW(X64Reg dest, OpArg arg);
843 void PMOVZXBD(X64Reg dest, OpArg arg);
844 void PMOVZXBQ(X64Reg dest, OpArg arg);
845 void PMOVZXWD(X64Reg dest, OpArg arg);
846 void PMOVZXWQ(X64Reg dest, OpArg arg);
847 void PMOVZXDQ(X64Reg dest, OpArg arg);
848
849 // SSE4: variable blend instructions (xmm0 implicit argument)
850 void PBLENDVB(X64Reg dest, OpArg arg);
851 void BLENDVPS(X64Reg dest, OpArg arg);
852 void BLENDVPD(X64Reg dest, OpArg arg);
853 void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend);
854 void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend);
855
856 // SSE4: rounding (see FloatRound for mode or use ROUNDNEARSS, etc. helpers.)
857 void ROUNDSS(X64Reg dest, OpArg arg, u8 mode);
858 void ROUNDSD(X64Reg dest, OpArg arg, u8 mode);
859 void ROUNDPS(X64Reg dest, OpArg arg, u8 mode);
860 void ROUNDPD(X64Reg dest, OpArg arg, u8 mode);
861
// Fixed-mode helpers for the SSE4.1 ROUND* instructions above: each bakes one
// FloatRound immediate (nearest / floor / ceil / truncate-toward-zero) into
// the corresponding rounding instruction.
// Scalar single-precision (ROUNDSS) variants:
862 inline void ROUNDNEARSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_NEAREST); }
863 inline void ROUNDFLOORSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_FLOOR); }
864 inline void ROUNDCEILSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_CEIL); }
865 inline void ROUNDZEROSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_ZERO); }
866
// Scalar double-precision (ROUNDSD) variants:
867 inline void ROUNDNEARSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_NEAREST); }
868 inline void ROUNDFLOORSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_FLOOR); }
869 inline void ROUNDCEILSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_CEIL); }
870 inline void ROUNDZEROSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_ZERO); }
871
// Packed single-precision (ROUNDPS) variants:
872 inline void ROUNDNEARPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_NEAREST); }
873 inline void ROUNDFLOORPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_FLOOR); }
874 inline void ROUNDCEILPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_CEIL); }
875 inline void ROUNDZEROPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_ZERO); }
876
// Packed double-precision (ROUNDPD) variants:
877 inline void ROUNDNEARPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_NEAREST); }
878 inline void ROUNDFLOORPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_FLOOR); }
879 inline void ROUNDCEILPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_CEIL); }
880 inline void ROUNDZEROPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_ZERO); }
881
882 // AVX
883 void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
884 void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
885 void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
886 void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
887 void VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
888 void VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
889 void VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
890 void VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
891 void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
892 void VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle);
893 void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
894 void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
895
896 void VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
897 void VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
898 void VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
899 void VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
900 void VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
901 void VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
902 void VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
903 void VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
904
905 void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg);
906 void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg);
907 void VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
908 void VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
909
910 // FMA3
911 void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
912 void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
913 void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
914 void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
915 void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
916 void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
917 void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
918 void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
919 void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
920 void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
921 void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
922 void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
923 void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
924 void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
925 void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
926 void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
927 void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
928 void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
929 void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
930 void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
931 void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
932 void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
933 void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
934 void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
935 void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
936 void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
937 void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
938 void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
939 void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
940 void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
941 void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
942 void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
943 void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
944 void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
945 void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
946 void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
947 void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
948 void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
949 void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
950 void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
951 void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
952 void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
953 void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
954 void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
955 void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
956 void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
957 void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
958 void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
959 void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
960 void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
961 void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
962 void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
963 void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
964 void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
965 void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
966 void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
967 void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
968 void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
969 void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
970 void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
971
972 // VEX GPR instructions
973 void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
974 void SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
975 void SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
976 void RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate);
977 void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
978 void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
979 void MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
980 void BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
981 void BLSR(int bits, X64Reg regOp, OpArg arg);
982 void BLSMSK(int bits, X64Reg regOp, OpArg arg);
983 void BLSI(int bits, X64Reg regOp, OpArg arg);
984 void BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
985 void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
986
987 void RDTSC();
988
989 // Utility functions
990 // The difference between this and CALL is that this aligns the stack
991 // where appropriate.
992 void ABI_CallFunction(const void *func);
993 template <typename T>
994 void ABI_CallFunction(T (*func)()) {
995 ABI_CallFunction((const void *)func);
996 }
997
998 void ABI_CallFunction(const u8 *func) {
999 ABI_CallFunction((const void *)func);
1000 }
1001 void ABI_CallFunctionC16(const void *func, u16 param1);
1002 void ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2);
1003
1004
1005 // These only support u32 parameters, but that's enough for a lot of uses.
1006 // These will destroy the first 1 or 2 "parameter regs".
1007 void ABI_CallFunctionC(const void *func, u32 param1);
1008 void ABI_CallFunctionCC(const void *func, u32 param1, u32 param2);
1009 void ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3);
1010 void ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3);
1011 void ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4);
1012 void ABI_CallFunctionP(const void *func, void *param1);
1013 void ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2);
1014 void ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3);
1015 void ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3);
1016 void ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2);
1017 void ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3);
1018 void ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1);
1019 void ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2);
1020
1021 // Pass a register as a parameter.
1022 void ABI_CallFunctionR(const void *func, X64Reg reg1);
1023 void ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2);
1024
1025 template <typename Tr, typename T1>
1026 void ABI_CallFunctionC(Tr (*func)(T1), u32 param1) {
1027 ABI_CallFunctionC((const void *)func, param1);
1028 }
1029
1030 // A function that doesn't have any control over what it will do to regs,
1031 // such as the dispatcher, should be surrounded by these.
1032 void ABI_PushAllCalleeSavedRegsAndAdjustStack();
1033 void ABI_PopAllCalleeSavedRegsAndAdjustStack();
1034
1035 // A function that doesn't know anything about its surroundings should
1036 // be surrounded by these to establish a safe environment, where it can roam free.
1037 // An example is a backpatch injected function.
1038 void ABI_PushAllCallerSavedRegsAndAdjustStack();
1039 void ABI_PopAllCallerSavedRegsAndAdjustStack();
1040
1041 unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize);
1042 void ABI_AlignStack(unsigned int frameSize);
1043 void ABI_RestoreStack(unsigned int frameSize);
1044
1045 // Sets up a __cdecl function.
1046 // Only x64 really needs the parameter count.
1047 void ABI_EmitPrologue(int maxCallParams);
1048 void ABI_EmitEpilogue(int maxCallParams);
1049
// Number of XMM registers the ABI exposes: 8 on 32-bit x86 (_M_IX86),
// 16 on x86-64. Same selection as before, expressed as one function with
// the preprocessor branch inside instead of two alternate definitions.
inline int ABI_GetNumXMMRegs() {
#ifdef _M_IX86
    return 8;
#else
    return 16;
#endif
}
1055}; // class XEmitter
1056
1057
1058// Everything that needs to generate X86 code should inherit from this.
1059// You get memory management for free, plus, you can use all the MOV etc functions without
1060// having to prefix them with gen-> or something similar.
1061
1062class XCodeBlock : public CodeBlock<XEmitter> {
1063public:
    // NOTE(review): presumably overwrites the owned code region with a
    // filler/poison pattern so stale code is detectable; the implementation
    // is not visible here — confirm in the corresponding .cpp.
1064    void PoisonMemory() override;
1065};
1066
1067} // namespace
diff --git a/src/core/settings.h b/src/core/settings.h
index 2775ee257..6ca0e1afc 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -53,6 +53,7 @@ struct Values {
53 53
54 // Renderer 54 // Renderer
55 bool use_hw_renderer; 55 bool use_hw_renderer;
56 bool use_shader_jit;
56 57
57 float bg_red; 58 float bg_red;
58 float bg_green; 59 float bg_green;
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 162108301..183709d8b 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -11,8 +11,9 @@ set(SRCS
11 pica.cpp 11 pica.cpp
12 primitive_assembly.cpp 12 primitive_assembly.cpp
13 rasterizer.cpp 13 rasterizer.cpp
14 shader/shader.cpp
15 shader/shader_interpreter.cpp
14 utils.cpp 16 utils.cpp
15 vertex_shader.cpp
16 video_core.cpp 17 video_core.cpp
17 ) 18 )
18 19
@@ -35,11 +36,20 @@ set(HEADERS
35 primitive_assembly.h 36 primitive_assembly.h
36 rasterizer.h 37 rasterizer.h
37 renderer_base.h 38 renderer_base.h
39 shader/shader.h
40 shader/shader_interpreter.h
38 utils.h 41 utils.h
39 vertex_shader.h
40 video_core.h 42 video_core.h
41 ) 43 )
42 44
45if(ARCHITECTURE_x86_64)
46 set(SRCS ${SRCS}
47 shader/shader_jit_x64.cpp)
48
49 set(HEADERS ${HEADERS}
50 shader/shader_jit_x64.h)
51endif()
52
43create_directory_groups(${SRCS} ${HEADERS}) 53create_directory_groups(${SRCS} ${HEADERS})
44 54
45add_library(video_core STATIC ${SRCS} ${HEADERS}) 55add_library(video_core STATIC ${SRCS} ${HEADERS})
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 558b49d60..bb6048cc0 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -7,7 +7,7 @@
7#include "clipper.h" 7#include "clipper.h"
8#include "pica.h" 8#include "pica.h"
9#include "rasterizer.h" 9#include "rasterizer.h"
10#include "vertex_shader.h" 10#include "shader/shader_interpreter.h"
11 11
12namespace Pica { 12namespace Pica {
13 13
diff --git a/src/video_core/clipper.h b/src/video_core/clipper.h
index 19ce8e140..6ed01e877 100644
--- a/src/video_core/clipper.h
+++ b/src/video_core/clipper.h
@@ -6,13 +6,13 @@
6 6
7namespace Pica { 7namespace Pica {
8 8
9namespace VertexShader { 9namespace Shader {
10 struct OutputVertex; 10 struct OutputVertex;
11} 11}
12 12
13namespace Clipper { 13namespace Clipper {
14 14
15using VertexShader::OutputVertex; 15using Shader::OutputVertex;
16 16
17void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2); 17void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2);
18 18
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 243abe842..374c4748d 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -18,7 +18,7 @@
18#include "pica.h" 18#include "pica.h"
19#include "primitive_assembly.h" 19#include "primitive_assembly.h"
20#include "renderer_base.h" 20#include "renderer_base.h"
21#include "vertex_shader.h" 21#include "shader/shader_interpreter.h"
22#include "video_core.h" 22#include "video_core.h"
23 23
24namespace Pica { 24namespace Pica {
@@ -165,7 +165,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
165 DebugUtils::GeometryDumper geometry_dumper; 165 DebugUtils::GeometryDumper geometry_dumper;
166 PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value()); 166 PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value());
167#endif 167#endif
168 PrimitiveAssembler<VertexShader::OutputVertex> primitive_assembler(regs.triangle_topology.Value()); 168 PrimitiveAssembler<Shader::OutputVertex> primitive_assembler(regs.triangle_topology.Value());
169 169
170 if (g_debug_context) { 170 if (g_debug_context) {
171 for (int i = 0; i < 3; ++i) { 171 for (int i = 0; i < 3; ++i) {
@@ -210,11 +210,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
210 // The size has been tuned for optimal balance between hit-rate and the cost of lookup 210 // The size has been tuned for optimal balance between hit-rate and the cost of lookup
211 const size_t VERTEX_CACHE_SIZE = 32; 211 const size_t VERTEX_CACHE_SIZE = 32;
212 std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids; 212 std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
213 std::array<VertexShader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache; 213 std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache;
214 214
215 unsigned int vertex_cache_pos = 0; 215 unsigned int vertex_cache_pos = 0;
216 vertex_cache_ids.fill(-1); 216 vertex_cache_ids.fill(-1);
217 217
218 Shader::UnitState shader_unit;
219 Shader::Setup(shader_unit);
220
218 for (unsigned int index = 0; index < regs.num_vertices; ++index) 221 for (unsigned int index = 0; index < regs.num_vertices; ++index)
219 { 222 {
220 unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index; 223 unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index;
@@ -224,7 +227,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
224 ASSERT(vertex != -1); 227 ASSERT(vertex != -1);
225 228
226 bool vertex_cache_hit = false; 229 bool vertex_cache_hit = false;
227 VertexShader::OutputVertex output; 230 Shader::OutputVertex output;
228 231
229 if (is_indexed) { 232 if (is_indexed) {
230 if (g_debug_context && Pica::g_debug_context->recorder) { 233 if (g_debug_context && Pica::g_debug_context->recorder) {
@@ -243,7 +246,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
243 246
244 if (!vertex_cache_hit) { 247 if (!vertex_cache_hit) {
245 // Initialize data for the current vertex 248 // Initialize data for the current vertex
246 VertexShader::InputVertex input; 249 Shader::InputVertex input;
247 250
248 for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { 251 for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
249 if (vertex_attribute_elements[i] != 0) { 252 if (vertex_attribute_elements[i] != 0) {
@@ -306,9 +309,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
306 std::bind(&DebugUtils::GeometryDumper::AddTriangle, 309 std::bind(&DebugUtils::GeometryDumper::AddTriangle,
307 &geometry_dumper, _1, _2, _3)); 310 &geometry_dumper, _1, _2, _3));
308#endif 311#endif
309
310 // Send to vertex shader 312 // Send to vertex shader
311 output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs); 313 output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes());
312 314
313 if (is_indexed) { 315 if (is_indexed) {
314 vertex_cache[vertex_cache_pos] = output; 316 vertex_cache[vertex_cache_pos] = output;
@@ -319,9 +321,9 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
319 321
320 if (Settings::values.use_hw_renderer) { 322 if (Settings::values.use_hw_renderer) {
321 // Send to hardware renderer 323 // Send to hardware renderer
322 static auto AddHWTriangle = [](const Pica::VertexShader::OutputVertex& v0, 324 static auto AddHWTriangle = [](const Pica::Shader::OutputVertex& v0,
323 const Pica::VertexShader::OutputVertex& v1, 325 const Pica::Shader::OutputVertex& v1,
324 const Pica::VertexShader::OutputVertex& v2) { 326 const Pica::Shader::OutputVertex& v2) {
325 VideoCore::g_renderer->hw_rasterizer->AddTriangle(v0, v1, v2); 327 VideoCore::g_renderer->hw_rasterizer->AddTriangle(v0, v1, v2);
326 }; 328 };
327 329
diff --git a/src/video_core/hwrasterizer_base.h b/src/video_core/hwrasterizer_base.h
index c8746c608..54b8892fb 100644
--- a/src/video_core/hwrasterizer_base.h
+++ b/src/video_core/hwrasterizer_base.h
@@ -7,7 +7,7 @@
7#include "common/common_types.h" 7#include "common/common_types.h"
8 8
9namespace Pica { 9namespace Pica {
10namespace VertexShader { 10namespace Shader {
11struct OutputVertex; 11struct OutputVertex;
12} 12}
13} 13}
@@ -24,9 +24,9 @@ public:
24 virtual void Reset() = 0; 24 virtual void Reset() = 0;
25 25
26 /// Queues the primitive formed by the given vertices for rendering 26 /// Queues the primitive formed by the given vertices for rendering
27 virtual void AddTriangle(const Pica::VertexShader::OutputVertex& v0, 27 virtual void AddTriangle(const Pica::Shader::OutputVertex& v0,
28 const Pica::VertexShader::OutputVertex& v1, 28 const Pica::Shader::OutputVertex& v1,
29 const Pica::VertexShader::OutputVertex& v2) = 0; 29 const Pica::Shader::OutputVertex& v2) = 0;
30 30
31 /// Draw the current batch of triangles 31 /// Draw the current batch of triangles
32 virtual void DrawTriangles() = 0; 32 virtual void DrawTriangles() = 0;
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index 17cb66780..c73a8178e 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -6,6 +6,7 @@
6#include <unordered_map> 6#include <unordered_map>
7 7
8#include "pica.h" 8#include "pica.h"
9#include "shader/shader.h"
9 10
10namespace Pica { 11namespace Pica {
11 12
@@ -84,6 +85,8 @@ void Init() {
84} 85}
85 86
86void Shutdown() { 87void Shutdown() {
88 Shader::Shutdown();
89
87 memset(&g_state, 0, sizeof(State)); 90 memset(&g_state, 0, sizeof(State));
88} 91}
89 92
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 34b02b2f8..6ce90f95a 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -1083,6 +1083,7 @@ private:
1083 // TODO: Perform proper arithmetic on this! 1083 // TODO: Perform proper arithmetic on this!
1084 float value; 1084 float value;
1085}; 1085};
1086static_assert(sizeof(float24) == sizeof(float), "Shader JIT assumes float24 is implemented as a 32-bit float");
1086 1087
1087/// Struct used to describe current Pica state 1088/// Struct used to describe current Pica state
1088struct State { 1089struct State {
@@ -1092,7 +1093,10 @@ struct State {
1092 /// Vertex shader memory 1093 /// Vertex shader memory
1093 struct ShaderSetup { 1094 struct ShaderSetup {
1094 struct { 1095 struct {
1095 Math::Vec4<float24> f[96]; 1096 // The float uniforms are accessed by the shader JIT using SSE instructions, and are
1097 // therefore required to be 16-byte aligned.
1098 Math::Vec4<float24> MEMORY_ALIGNED16(f[96]);
1099
1096 std::array<bool, 16> b; 1100 std::array<bool, 16> b;
1097 std::array<Math::Vec4<u8>, 4> i; 1101 std::array<Math::Vec4<u8>, 4> i;
1098 } uniforms; 1102 } uniforms;
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index 2f22bdcce..e2b1df44c 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -4,7 +4,7 @@
4 4
5#include "pica.h" 5#include "pica.h"
6#include "primitive_assembly.h" 6#include "primitive_assembly.h"
7#include "vertex_shader.h" 7#include "shader/shader_interpreter.h"
8 8
9#include "common/logging/log.h" 9#include "common/logging/log.h"
10#include "video_core/debug_utils/debug_utils.h" 10#include "video_core/debug_utils/debug_utils.h"
@@ -56,7 +56,7 @@ void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx, TriangleHandl
56 56
57// explicitly instantiate use cases 57// explicitly instantiate use cases
58template 58template
59struct PrimitiveAssembler<VertexShader::OutputVertex>; 59struct PrimitiveAssembler<Shader::OutputVertex>;
60template 60template
61struct PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex>; 61struct PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex>;
62 62
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
index 52ff4cd89..80432d68a 100644
--- a/src/video_core/primitive_assembly.h
+++ b/src/video_core/primitive_assembly.h
@@ -8,7 +8,7 @@
8 8
9#include "video_core/pica.h" 9#include "video_core/pica.h"
10 10
11#include "video_core/vertex_shader.h" 11#include "video_core/shader/shader_interpreter.h"
12 12
13namespace Pica { 13namespace Pica {
14 14
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 68b7cc05d..b83798b0f 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -16,7 +16,7 @@
16#include "math.h" 16#include "math.h"
17#include "pica.h" 17#include "pica.h"
18#include "rasterizer.h" 18#include "rasterizer.h"
19#include "vertex_shader.h" 19#include "shader/shader_interpreter.h"
20#include "video_core/utils.h" 20#include "video_core/utils.h"
21 21
22namespace Pica { 22namespace Pica {
@@ -272,9 +272,9 @@ static Common::Profiling::TimingCategory rasterization_category("Rasterization")
272 * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing 272 * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing
273 * culling via recursion. 273 * culling via recursion.
274 */ 274 */
275static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, 275static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
276 const VertexShader::OutputVertex& v1, 276 const Shader::OutputVertex& v1,
277 const VertexShader::OutputVertex& v2, 277 const Shader::OutputVertex& v2,
278 bool reversed = false) 278 bool reversed = false)
279{ 279{
280 const auto& regs = g_state.regs; 280 const auto& regs = g_state.regs;
@@ -1107,9 +1107,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
1107 } 1107 }
1108} 1108}
1109 1109
1110void ProcessTriangle(const VertexShader::OutputVertex& v0, 1110void ProcessTriangle(const Shader::OutputVertex& v0,
1111 const VertexShader::OutputVertex& v1, 1111 const Shader::OutputVertex& v1,
1112 const VertexShader::OutputVertex& v2) { 1112 const Shader::OutputVertex& v2) {
1113 ProcessTriangleInternal(v0, v1, v2); 1113 ProcessTriangleInternal(v0, v1, v2);
1114} 1114}
1115 1115
diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h
index 42148f8b1..a6a9634b4 100644
--- a/src/video_core/rasterizer.h
+++ b/src/video_core/rasterizer.h
@@ -6,15 +6,15 @@
6 6
7namespace Pica { 7namespace Pica {
8 8
9namespace VertexShader { 9namespace Shader {
10 struct OutputVertex; 10 struct OutputVertex;
11} 11}
12 12
13namespace Rasterizer { 13namespace Rasterizer {
14 14
15void ProcessTriangle(const VertexShader::OutputVertex& v0, 15void ProcessTriangle(const Shader::OutputVertex& v0,
16 const VertexShader::OutputVertex& v1, 16 const Shader::OutputVertex& v1,
17 const VertexShader::OutputVertex& v2); 17 const Shader::OutputVertex& v2);
18 18
19} // namespace Rasterizer 19} // namespace Rasterizer
20 20
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index e7c1cfeb7..9f1552adf 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -202,9 +202,9 @@ void RasterizerOpenGL::Reset() {
202 res_cache.FullFlush(); 202 res_cache.FullFlush();
203} 203}
204 204
205void RasterizerOpenGL::AddTriangle(const Pica::VertexShader::OutputVertex& v0, 205void RasterizerOpenGL::AddTriangle(const Pica::Shader::OutputVertex& v0,
206 const Pica::VertexShader::OutputVertex& v1, 206 const Pica::Shader::OutputVertex& v1,
207 const Pica::VertexShader::OutputVertex& v2) { 207 const Pica::Shader::OutputVertex& v2) {
208 vertex_batch.push_back(HardwareVertex(v0)); 208 vertex_batch.push_back(HardwareVertex(v0));
209 vertex_batch.push_back(HardwareVertex(v1)); 209 vertex_batch.push_back(HardwareVertex(v1));
210 vertex_batch.push_back(HardwareVertex(v2)); 210 vertex_batch.push_back(HardwareVertex(v2));
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index ae7b26fc6..a02d5c856 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -9,7 +9,7 @@
9#include "common/common_types.h" 9#include "common/common_types.h"
10 10
11#include "video_core/hwrasterizer_base.h" 11#include "video_core/hwrasterizer_base.h"
12#include "video_core/vertex_shader.h" 12#include "video_core/shader/shader_interpreter.h"
13 13
14#include "gl_state.h" 14#include "gl_state.h"
15#include "gl_rasterizer_cache.h" 15#include "gl_rasterizer_cache.h"
@@ -27,9 +27,9 @@ public:
27 void Reset() override; 27 void Reset() override;
28 28
29 /// Queues the primitive formed by the given vertices for rendering 29 /// Queues the primitive formed by the given vertices for rendering
30 void AddTriangle(const Pica::VertexShader::OutputVertex& v0, 30 void AddTriangle(const Pica::Shader::OutputVertex& v0,
31 const Pica::VertexShader::OutputVertex& v1, 31 const Pica::Shader::OutputVertex& v1,
32 const Pica::VertexShader::OutputVertex& v2) override; 32 const Pica::Shader::OutputVertex& v2) override;
33 33
34 /// Draw the current batch of triangles 34 /// Draw the current batch of triangles
35 void DrawTriangles() override; 35 void DrawTriangles() override;
@@ -82,7 +82,7 @@ private:
82 82
83 /// Structure that the hardware rendered vertices are composed of 83 /// Structure that the hardware rendered vertices are composed of
84 struct HardwareVertex { 84 struct HardwareVertex {
85 HardwareVertex(const Pica::VertexShader::OutputVertex& v) { 85 HardwareVertex(const Pica::Shader::OutputVertex& v) {
86 position[0] = v.pos.x.ToFloat32(); 86 position[0] = v.pos.x.ToFloat32();
87 position[1] = v.pos.y.ToFloat32(); 87 position[1] = v.pos.y.ToFloat32();
88 position[2] = v.pos.z.ToFloat32(); 88 position[2] = v.pos.z.ToFloat32();
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
new file mode 100644
index 000000000..6a27a8015
--- /dev/null
+++ b/src/video_core/shader/shader.cpp
@@ -0,0 +1,145 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <memory>
6#include <unordered_map>
7
8#include "common/hash.h"
9#include "common/make_unique.h"
10#include "common/profiler.h"
11
12#include "video_core/debug_utils/debug_utils.h"
13#include "video_core/pica.h"
14#include "video_core/video_core.h"
15
16#include "shader.h"
17#include "shader_interpreter.h"
18
19#ifdef ARCHITECTURE_x86_64
20#include "shader_jit_x64.h"
21#endif // ARCHITECTURE_x86_64
22
23namespace Pica {
24
25namespace Shader {
26
#ifdef ARCHITECTURE_x86_64
// Cache of compiled shader programs, keyed on a hash of the program code, the swizzle data and
// the entry point (see Setup() below for the exact key computation).
static std::unordered_map<u64, CompiledShader*> shader_map;
// The JIT compiler instance used to build (and presumably own the code buffers of) all
// compiled shaders in shader_map -- verify ownership against JitCompiler.
static JitCompiler jit;
// Entry point of the most recently set-up compiled shader; invoked by Run() when the JIT is enabled.
static CompiledShader* jit_shader;
#endif // ARCHITECTURE_x86_64
32
33void Setup(UnitState& state) {
34#ifdef ARCHITECTURE_x86_64
35 if (VideoCore::g_shader_jit_enabled) {
36 u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
37 Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)) ^
38 g_state.regs.vs.main_offset);
39
40 auto iter = shader_map.find(cache_key);
41 if (iter != shader_map.end()) {
42 jit_shader = iter->second;
43 } else {
44 jit_shader = jit.Compile();
45 shader_map.emplace(cache_key, jit_shader);
46 }
47 }
48#endif // ARCHITECTURE_x86_64
49}
50
/// Performs any cleanup when the emulator is shut down.
void Shutdown() {
#ifdef ARCHITECTURE_x86_64
    // Bug fix: shader_map is only declared under ARCHITECTURE_x86_64, so referencing it
    // unconditionally breaks compilation on every other architecture.
    // Note: the CompiledShader pointers are presumably owned by the JitCompiler's code
    // buffer, so clearing the map alone does not leak -- verify against JitCompiler.
    shader_map.clear();
#endif // ARCHITECTURE_x86_64
}
54
static Common::Profiling::TimingCategory shader_category("Vertex Shader");

/**
 * Runs the currently set-up vertex shader on a single input vertex.
 * Loads the input attributes into the unit's input registers, executes the shader (JIT when
 * enabled and available, interpreter otherwise), then gathers the output registers into an
 * OutputVertex according to the VS output attribute semantics.
 * @param state Shader unit state; its registers are written before and read after execution
 * @param input Input vertex attributes (up to 16 vec4s)
 * @param num_attributes Number of valid entries in input.attr
 * @return The processed output vertex
 */
OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes) {
    auto& config = g_state.regs.vs;
    auto& setup = g_state.vs; // only referenced by the PICA_DUMP_SHADERS path below

    Common::Profiling::ScopeTimer timer(shader_category);

    // Execution starts at the program's main entry point
    state.program_counter = config.main_offset;
    state.debug.max_offset = 0;
    state.debug.max_opdesc_id = 0;

    // Setup input register table
    const auto& attribute_register_map = config.input_register_map;

    // Copy each valid input attribute into whichever input register it is mapped to
    if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0];
    if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1];
    if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2];
    if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = input.attr[3];
    if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = input.attr[4];
    if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = input.attr[5];
    if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = input.attr[6];
    if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = input.attr[7];
    if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = input.attr[8];
    if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = input.attr[9];
    if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = input.attr[10];
    if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = input.attr[11];
    if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = input.attr[12];
    if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = input.attr[13];
    if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = input.attr[14];
    if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = input.attr[15];

    state.conditional_code[0] = false;
    state.conditional_code[1] = false;

#ifdef ARCHITECTURE_x86_64
    // The JIT-compiled shader operates directly on the register block set up by Setup()
    if (VideoCore::g_shader_jit_enabled)
        jit_shader(&state.registers);
    else
        RunInterpreter(state);
#else
    RunInterpreter(state);
#endif // ARCHITECTURE_x86_64

#if PICA_DUMP_SHADERS
    DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(),
                           state.debug.max_opdesc_id, config.main_offset,
                           g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here
#endif

    // Setup output data
    OutputVertex ret;
    // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
    // figure out what those circumstances are and enable the remaining outputs then.
    for (int i = 0; i < 7; ++i) {
        const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here

        // Each component of output register i is routed to an OutputVertex field by these semantics
        u32 semantics[4] = {
            output_register_map.map_x, output_register_map.map_y,
            output_register_map.map_z, output_register_map.map_w
        };

        for (int comp = 0; comp < 4; ++comp) {
            // OutputVertex is addressed here as a flat float24 array indexed by the semantic value
            float24* out = ((float24*)&ret) + semantics[comp];
            if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
                *out = state.registers.output[i][comp];
            } else {
                // Zero output so that attributes which aren't output won't have denormals in them,
                // which would slow us down later.
                memset(out, 0, sizeof(*out));
            }
        }
    }

    // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation
    for (int i = 0; i < 4; ++i) {
        ret.color[i] = float24::FromFloat32(
            std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f));
    }

    LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
        ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
        ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
        ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32());

    return ret;
}
142
143} // namespace Shader
144
145} // namespace Pica
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
new file mode 100644
index 000000000..2007a2844
--- /dev/null
+++ b/src/video_core/shader/shader.h
@@ -0,0 +1,169 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <boost/container/static_vector.hpp>
8#include <nihstro/shader_binary.h>
9
10#include "common/common_funcs.h"
11#include "common/common_types.h"
12#include "common/vector_math.h"
13
14#include "video_core/pica.h"
15
16using nihstro::RegisterType;
17using nihstro::SourceRegister;
18using nihstro::DestRegister;
19
20namespace Pica {
21
22namespace Shader {
23
/// Raw input vertex as fed to the shader pipeline: up to 16 four-component float24 attributes.
struct InputVertex {
    Math::Vec4<float24> attr[16];
};
27
28struct OutputVertex {
29 OutputVertex() = default;
30
31 // VS output attributes
32 Math::Vec4<float24> pos;
33 Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
34 Math::Vec4<float24> color;
35 Math::Vec2<float24> tc0;
36 Math::Vec2<float24> tc1;
37 float24 pad[6];
38 Math::Vec2<float24> tc2;
39
40 // Padding for optimal alignment
41 float24 pad2[4];
42
43 // Attributes used to store intermediate results
44
45 // position after perspective divide
46 Math::Vec3<float24> screenpos;
47 float24 pad3;
48
49 // Linear interpolation
50 // factor: 0=this, 1=vtx
51 void Lerp(float24 factor, const OutputVertex& vtx) {
52 pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
53
54 // TODO: Should perform perspective correct interpolation here...
55 tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
56 tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor);
57 tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
58
59 screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
60
61 color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
62 }
63
64 // Linear interpolation
65 // factor: 0=v0, 1=v1
66 static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) {
67 OutputVertex ret = v0;
68 ret.Lerp(factor, v1);
69 return ret;
70 }
71};
72static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
73static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
74
/**
 * This structure contains the state information that needs to be unique for a shader unit. The 3DS
 * has four shader units that process shaders in parallel. At the present, Citra only implements a
 * single shader unit that processes all shaders serially. Putting the state information in a struct
 * here will make it easier for us to parallelize the shader processing later.
 *
 * NOTE: The layout of `Registers` is ABI for the x64 shader JIT -- the JIT addresses individual
 * registers via the byte offsets returned by InputOffset()/OutputOffset() below, so members must
 * not be reordered or have their alignment changed.
 */
struct UnitState {
    struct Registers {
        // The registers are accessed by the shader JIT using SSE instructions, and are therefore
        // required to be 16-byte aligned.
        Math::Vec4<float24> MEMORY_ALIGNED16(input[16]);
        Math::Vec4<float24> MEMORY_ALIGNED16(output[16]);
        Math::Vec4<float24> MEMORY_ALIGNED16(temporary[16]);
    } registers;
    static_assert(std::is_pod<Registers>::value, "Structure is not POD");

    // Offset (in words) of the instruction currently being executed
    u32 program_counter;
    // The two condition flags written by CMP-style instructions and tested by conditional flow control
    bool conditional_code[2];

    // Two Address registers and one loop counter
    // TODO: How many bits do these actually have?
    s32 address_registers[3];

    enum {
        INVALID_ADDRESS = 0xFFFFFFFF
    };

    // One stack frame per active CALL/LOOP/IF construct in the interpreter
    struct CallStackElement {
        u32 final_address;  // Address upon which we jump to return_address
        u32 return_address; // Where to jump when leaving scope
        u8 repeat_counter;  // How often to repeat until this call stack element is removed
        u8 loop_increment;  // Which value to add to the loop counter after an iteration
                            // TODO: Should this be a signed value? Does it even matter?
        u32 loop_address;   // The address where we'll return to after each loop iteration
    };

    // TODO: Is there a maximal size for this?
    boost::container::static_vector<CallStackElement, 16> call_stack;

    // Bookkeeping used only for shader dumping/debugging
    struct {
        u32 max_offset;    // maximum program counter ever reached
        u32 max_opdesc_id; // maximum swizzle pattern index ever used
    } debug;

    // Returns the byte offset of a readable (input/temporary) register within Registers.
    // Used by the JIT to address registers relative to a base pointer.
    static int InputOffset(const SourceRegister& reg) {
        switch (reg.GetRegisterType()) {
        case RegisterType::Input:
            return (int)offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>);

        case RegisterType::Temporary:
            return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);

        default:
            UNREACHABLE();
            return 0;
        }
    }

    // Returns the byte offset of a writable (output/temporary) register within Registers.
    // Used by the JIT to address registers relative to a base pointer.
    static int OutputOffset(const DestRegister& reg) {
        switch (reg.GetRegisterType()) {
        case RegisterType::Output:
            return (int)offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>);

        case RegisterType::Temporary:
            return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);

        default:
            UNREACHABLE();
            return 0;
        }
    }
};
147
148/**
149 * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
150 * vertex, which would happen within the `Run` function).
151 * @param state Shader unit state, must be setup per shader and per shader unit
152 */
153void Setup(UnitState& state);
154
155/// Performs any cleanup when the emulator is shutdown
156void Shutdown();
157
158/**
159 * Runs the currently setup shader
160 * @param state Shader unit state, must be setup per shader and per shader unit
161 * @param input Input vertex into the shader
162 * @param num_attributes The number of vertex shader attributes
163 * @return The output vertex, after having been processed by the vertex shader
164 */
165OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes);
166
167} // namespace Shader
168
169} // namespace Pica
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/shader/shader_interpreter.cpp
index 5f66f3455..c8489f920 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -2,18 +2,14 @@
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <boost/container/static_vector.hpp>
6#include <boost/range/algorithm.hpp>
7
8#include <common/file_util.h> 5#include <common/file_util.h>
9 6
10#include <nihstro/shader_bytecode.h> 7#include <nihstro/shader_bytecode.h>
11 8
12#include "common/profiler.h" 9#include "video_core/pica.h"
13 10
14#include "pica.h" 11#include "shader.h"
15#include "vertex_shader.h" 12#include "shader_interpreter.h"
16#include "debug_utils/debug_utils.h"
17 13
18using nihstro::OpCode; 14using nihstro::OpCode;
19using nihstro::Instruction; 15using nihstro::Instruction;
@@ -23,44 +19,9 @@ using nihstro::SwizzlePattern;
23 19
24namespace Pica { 20namespace Pica {
25 21
26namespace VertexShader { 22namespace Shader {
27
28struct VertexShaderState {
29 u32 program_counter;
30
31 const float24* input_register_table[16];
32 Math::Vec4<float24> output_registers[16];
33
34 Math::Vec4<float24> temporary_registers[16];
35 bool conditional_code[2];
36
37 // Two Address registers and one loop counter
38 // TODO: How many bits do these actually have?
39 s32 address_registers[3];
40
41 enum {
42 INVALID_ADDRESS = 0xFFFFFFFF
43 };
44 23
45 struct CallStackElement { 24void RunInterpreter(UnitState& state) {
46 u32 final_address; // Address upon which we jump to return_address
47 u32 return_address; // Where to jump when leaving scope
48 u8 repeat_counter; // How often to repeat until this call stack element is removed
49 u8 loop_increment; // Which value to add to the loop counter after an iteration
50 // TODO: Should this be a signed value? Does it even matter?
51 u32 loop_address; // The address where we'll return to after each loop iteration
52 };
53
54 // TODO: Is there a maximal size for this?
55 boost::container::static_vector<CallStackElement, 16> call_stack;
56
57 struct {
58 u32 max_offset; // maximum program counter ever reached
59 u32 max_opdesc_id; // maximum swizzle pattern index ever used
60 } debug;
61};
62
63static void ProcessShaderCode(VertexShaderState& state) {
64 const auto& uniforms = g_state.vs.uniforms; 25 const auto& uniforms = g_state.vs.uniforms;
65 const auto& swizzle_data = g_state.vs.swizzle_data; 26 const auto& swizzle_data = g_state.vs.swizzle_data;
66 const auto& program_code = g_state.vs.program_code; 27 const auto& program_code = g_state.vs.program_code;
@@ -90,7 +51,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
90 const Instruction instr = { program_code[state.program_counter] }; 51 const Instruction instr = { program_code[state.program_counter] };
91 const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; 52 const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] };
92 53
93 static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions, 54 static auto call = [](UnitState& state, u32 offset, u32 num_instructions,
94 u32 return_offset, u8 repeat_count, u8 loop_increment) { 55 u32 return_offset, u8 repeat_count, u8 loop_increment) {
95 state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset 56 state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
96 ASSERT(state.call_stack.size() < state.call_stack.capacity()); 57 ASSERT(state.call_stack.size() < state.call_stack.capacity());
@@ -101,10 +62,10 @@ static void ProcessShaderCode(VertexShaderState& state) {
101 auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { 62 auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
102 switch (source_reg.GetRegisterType()) { 63 switch (source_reg.GetRegisterType()) {
103 case RegisterType::Input: 64 case RegisterType::Input:
104 return state.input_register_table[source_reg.GetIndex()]; 65 return &state.registers.input[source_reg.GetIndex()].x;
105 66
106 case RegisterType::Temporary: 67 case RegisterType::Temporary:
107 return &state.temporary_registers[source_reg.GetIndex()].x; 68 return &state.registers.temporary[source_reg.GetIndex()].x;
108 69
109 case RegisterType::FloatUniform: 70 case RegisterType::FloatUniform:
110 return &uniforms.f[source_reg.GetIndex()].x; 71 return &uniforms.f[source_reg.GetIndex()].x;
@@ -153,8 +114,8 @@ static void ProcessShaderCode(VertexShaderState& state) {
153 src2[3] = src2[3] * float24::FromFloat32(-1); 114 src2[3] = src2[3] * float24::FromFloat32(-1);
154 } 115 }
155 116
156 float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers[instr.common.dest.Value().GetIndex()][0] 117 float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
157 : (instr.common.dest.Value() < 0x20) ? &state.temporary_registers[instr.common.dest.Value().GetIndex()][0] 118 : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
158 : dummy_vec4_float24; 119 : dummy_vec4_float24;
159 120
160 state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id); 121 state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
@@ -394,8 +355,8 @@ static void ProcessShaderCode(VertexShaderState& state) {
394 src3[3] = src3[3] * float24::FromFloat32(-1); 355 src3[3] = src3[3] * float24::FromFloat32(-1);
395 } 356 }
396 357
397 float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers[instr.mad.dest.Value().GetIndex()][0] 358 float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
398 : (instr.mad.dest.Value() < 0x20) ? &state.temporary_registers[instr.mad.dest.Value().GetIndex()][0] 359 : (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
399 : dummy_vec4_float24; 360 : dummy_vec4_float24;
400 361
401 for (int i = 0; i < 4; ++i) { 362 for (int i = 0; i < 4; ++i) {
@@ -413,7 +374,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
413 374
414 default: 375 default:
415 { 376 {
416 static auto evaluate_condition = [](const VertexShaderState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { 377 static auto evaluate_condition = [](const UnitState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) {
417 bool results[2] = { refx == state.conditional_code[0], 378 bool results[2] = { refx == state.conditional_code[0],
418 refy == state.conditional_code[1] }; 379 refy == state.conditional_code[1] };
419 380
@@ -542,88 +503,6 @@ static void ProcessShaderCode(VertexShaderState& state) {
542 } 503 }
543} 504}
544 505
545static Common::Profiling::TimingCategory shader_category("Vertex Shader");
546
547OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) {
548 Common::Profiling::ScopeTimer timer(shader_category);
549
550 VertexShaderState state;
551
552 state.program_counter = config.main_offset;
553 state.debug.max_offset = 0;
554 state.debug.max_opdesc_id = 0;
555
556 // Setup input register table
557 const auto& attribute_register_map = config.input_register_map;
558 float24 dummy_register;
559 boost::fill(state.input_register_table, &dummy_register);
560
561 if (num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x;
562 if (num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x;
563 if (num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x;
564 if (num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x;
565 if (num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x;
566 if (num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x;
567 if (num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x;
568 if (num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x;
569 if (num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x;
570 if (num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x;
571 if (num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x;
572 if (num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x;
573 if (num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x;
574 if (num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x;
575 if (num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;
576 if (num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;
577
578 state.conditional_code[0] = false;
579 state.conditional_code[1] = false;
580
581 ProcessShaderCode(state);
582#if PICA_DUMP_SHADERS
583 DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(),
584 state.debug.max_opdesc_id, config.main_offset,
585 g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here
586#endif
587
588 // Setup output data
589 OutputVertex ret;
590 // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
591 // figure out what those circumstances are and enable the remaining outputs then.
592 for (int i = 0; i < 7; ++i) {
593 const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here
594
595 u32 semantics[4] = {
596 output_register_map.map_x, output_register_map.map_y,
597 output_register_map.map_z, output_register_map.map_w
598 };
599
600 for (int comp = 0; comp < 4; ++comp) {
601 float24* out = ((float24*)&ret) + semantics[comp];
602 if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
603 *out = state.output_registers[i][comp];
604 } else {
605 // Zero output so that attributes which aren't output won't have denormals in them,
606 // which would slow us down later.
607 memset(out, 0, sizeof(*out));
608 }
609 }
610 }
611
612 // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation
613 for (int i = 0; i < 4; ++i) {
614 ret.color[i] = float24::FromFloat32(
615 std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f));
616 }
617
618 LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
619 ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
620 ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
621 ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32());
622
623 return ret;
624}
625
626
627} // namespace 506} // namespace
628 507
629} // namespace 508} // namespace
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
new file mode 100644
index 000000000..ad6e58e39
--- /dev/null
+++ b/src/video_core/shader/shader_interpreter.h
@@ -0,0 +1,19 @@
1// Copyright 2014 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include "video_core/pica.h"
8
9#include "shader.h"
10
11namespace Pica {
12
13namespace Shader {
14
15void RunInterpreter(UnitState& state);
16
17} // namespace
18
19} // namespace
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
new file mode 100644
index 000000000..ce47774d5
--- /dev/null
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -0,0 +1,675 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <smmintrin.h>
6
7#include "common/x64/abi.h"
8#include "common/x64/cpu_detect.h"
9#include "common/x64/emitter.h"
10
11#include "shader.h"
12#include "shader_jit_x64.h"
13
14namespace Pica {
15
16namespace Shader {
17
18using namespace Gen;
19
typedef void (JitCompiler::*JitFunction)(Instruction instr);

/// Dispatch table with one entry per shader opcode value (64 possible opcodes); nullptr entries
/// are opcodes without a JIT implementation yet.
const JitFunction instr_table[64] = {
    &JitCompiler::Compile_ADD,   // add
    &JitCompiler::Compile_DP3,   // dp3
    &JitCompiler::Compile_DP4,   // dp4
    nullptr,                     // dph
    nullptr,                     // unknown
    nullptr,                     // ex2
    nullptr,                     // lg2
    nullptr,                     // unknown
    &JitCompiler::Compile_MUL,   // mul
    nullptr,                     // lge
    nullptr,                     // slt
    &JitCompiler::Compile_FLR,   // flr
    &JitCompiler::Compile_MAX,   // max
    &JitCompiler::Compile_MIN,   // min
    &JitCompiler::Compile_RCP,   // rcp
    &JitCompiler::Compile_RSQ,   // rsq
    nullptr,                     // unknown
    nullptr,                     // unknown
    &JitCompiler::Compile_MOVA,  // mova
    &JitCompiler::Compile_MOV,   // mov
    nullptr,                     // unknown
    nullptr,                     // unknown
    nullptr,                     // unknown
    nullptr,                     // unknown
    nullptr,                     // dphi
    nullptr,                     // unknown
    nullptr,                     // sgei
    &JitCompiler::Compile_SLTI,  // slti
    nullptr,                     // unknown
    nullptr,                     // unknown
    nullptr,                     // unknown
    nullptr,                     // unknown
    nullptr,                     // unknown
    &JitCompiler::Compile_NOP,   // nop
    &JitCompiler::Compile_END,   // end
    nullptr,                     // break
    &JitCompiler::Compile_CALL,  // call
    &JitCompiler::Compile_CALLC, // callc
    &JitCompiler::Compile_CALLU, // callu
    &JitCompiler::Compile_IF,    // ifu
    &JitCompiler::Compile_IF,    // ifc
    &JitCompiler::Compile_LOOP,  // loop
    nullptr,                     // emit
    nullptr,                     // sete
    &JitCompiler::Compile_JMP,   // jmpc
    &JitCompiler::Compile_JMP,   // jmpu
    &JitCompiler::Compile_CMP,   // cmp
    &JitCompiler::Compile_CMP,   // cmp
    &JitCompiler::Compile_MAD,   // madi
    &JitCompiler::Compile_MAD,   // madi
    &JitCompiler::Compile_MAD,   // madi
    &JitCompiler::Compile_MAD,   // madi
    &JitCompiler::Compile_MAD,   // madi
    &JitCompiler::Compile_MAD,   // madi
    &JitCompiler::Compile_MAD,   // madi
    &JitCompiler::Compile_MAD,   // madi
    &JitCompiler::Compile_MAD,   // mad
    &JitCompiler::Compile_MAD,   // mad
    &JitCompiler::Compile_MAD,   // mad
    &JitCompiler::Compile_MAD,   // mad
    &JitCompiler::Compile_MAD,   // mad
    &JitCompiler::Compile_MAD,   // mad
    &JitCompiler::Compile_MAD,   // mad
    &JitCompiler::Compile_MAD,   // mad
};
88
// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
// be used as scratch registers within a compiler function. The other registers have designated
// purposes, as documented below:

/// Pointer to the uniform memory
static const X64Reg UNIFORMS = R9;
/// The two 32-bit VS address offset registers set by the MOVA instruction
static const X64Reg ADDROFFS_REG_0 = R10;
static const X64Reg ADDROFFS_REG_1 = R11;
/// VS loop count register
static const X64Reg LOOPCOUNT_REG = R12;
/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this is quicker)
static const X64Reg LOOPCOUNT = RSI;
/// Number to increment LOOPCOUNT_REG by on each loop iteration
static const X64Reg LOOPINC = RDI;
/// Result of the previous CMP instruction for the X-component comparison
static const X64Reg COND0 = R13;
/// Result of the previous CMP instruction for the Y-component comparison
static const X64Reg COND1 = R14;
/// Pointer to the UnitState::Registers block for the current shader unit; all register loads and
/// stores are addressed relative to it via UnitState::InputOffset()/OutputOffset()
static const X64Reg REGISTERS = R15;
/// SIMD scratch register
static const X64Reg SCRATCH = XMM0;
/// Loaded with the first swizzled source register, otherwise can be used as a scratch register
static const X64Reg SRC1 = XMM1;
/// Loaded with the second swizzled source register, otherwise can be used as a scratch register
static const X64Reg SRC2 = XMM2;
/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
static const X64Reg SRC3 = XMM3;
/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
static const X64Reg ONE = XMM14;
/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
static const X64Reg NEGBIT = XMM15;

/// Raw constant for the source register selector that indicates no swizzling is performed
static const u8 NO_SRC_REG_SWIZZLE = 0x1b;
/// Raw constant for the destination register enable mask that indicates all components are enabled
static const u8 NO_DEST_REG_MASK = 0xf;
127
/**
 * Loads and swizzles a source register into the specified XMM register.
 * @param instr VS instruction, used for determining how to load the source register
 * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3)
 * @param src_reg SourceRegister object corresponding to the source register to load
 * @param dest Destination XMM register to store the loaded, swizzled source register
 */
void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) {
    X64Reg src_ptr;
    int src_offset;

    // Uniforms live in a separate buffer (UNIFORMS); input/temporary registers are addressed
    // relative to the shader unit's register block (REGISTERS).
    if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
        src_ptr = UNIFORMS;
        src_offset = src_reg.GetIndex() * sizeof(float24) * 4;
    } else {
        src_ptr = REGISTERS;
        src_offset = UnitState::InputOffset(src_reg);
    }

    unsigned operand_desc_id;
    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
        instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
        // The MAD and MADI instructions do not use the address offset registers, so loading the
        // source is a bit simpler here

        operand_desc_id = instr.mad.operand_desc_id;

        // Load the source
        MOVAPS(dest, MDisp(src_ptr, src_offset));
    } else {
        operand_desc_id = instr.common.operand_desc_id;

        // For inverted encodings the dynamic address offset applies to src2 instead of src1
        const bool is_inverted = (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
        unsigned offset_src = is_inverted ? 2 : 1;

        if (src_num == offset_src && instr.common.address_register_index != 0) {
            switch (instr.common.address_register_index) {
            case 1: // address offset 1
                MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_0, 1, src_offset));
                break;
            case 2: // address offset 2
                MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_1, 1, src_offset));
                break;
            case 3: // address offset 3
                MOVAPS(dest, MComplex(src_ptr, LOOPCOUNT_REG, 1, src_offset));
                break;
            default:
                UNREACHABLE();
                break;
            }
        } else {
            // Load the source
            MOVAPS(dest, MDisp(src_ptr, src_offset));
        }
    }

    SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] };

    // Generate instructions for source register swizzling as needed
    u8 sel = swiz.GetRawSelector(src_num);
    if (sel != NO_SRC_REG_SWIZZLE) {
        // Selector component order needs to be reversed for the SHUFPS instruction
        sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2);

        // Shuffle inputs for swizzle
        SHUFPS(dest, R(dest), sel);
    }

    // If the source register should be negated, flip the negative bit using XOR
    const bool negate[] = { swiz.negate_src1, swiz.negate_src2, swiz.negate_src3 };
    if (negate[src_num - 1]) {
        XORPS(dest, R(NEGBIT));
    }
}
202
/**
 * Stores a four-component result to the instruction's destination register, honoring the
 * destination component enable mask from the operand descriptor.
 * @param instr VS instruction, used to look up the destination register and operand descriptor
 * @param src XMM register holding the result to be written back
 */
void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
    DestRegister dest;
    unsigned operand_desc_id;
    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
        instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
        operand_desc_id = instr.mad.operand_desc_id;
        dest = instr.mad.dest.Value();
    } else {
        operand_desc_id = instr.common.operand_desc_id;
        dest = instr.common.dest.Value();
    }

    SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] };

    // If all components are enabled, write the result to the destination register
    if (swiz.dest_mask == NO_DEST_REG_MASK) {
        // Store dest back to memory
        MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), src);

    } else {
        // Not all components are enabled, so mask the result when storing to the destination register...
        MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState::OutputOffset(dest)));

        if (Common::GetCPUCaps().sse4_1) {
            // SSE4.1 path: a single BLENDPS selects per component between the old destination
            // value and the new result (mask bits reordered to BLENDPS's component order)
            u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
            BLENDPS(SCRATCH, R(src), mask);
        } else {
            // Pre-SSE4.1 fallback: interleave source and destination components, then use
            // SHUFPS to pick enabled components from the source and disabled ones from the
            // destination.
            MOVAPS(XMM4, R(src));
            UNPCKHPS(XMM4, R(SCRATCH)); // Unpack Z/W components of source and destination
            UNPCKLPS(SCRATCH, R(src)); // Unpack X/Y components of source and destination

            // Compute selector to selectively copy source components to destination for SHUFPS instruction
            u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) |
                     ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
                     ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
                     ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
            SHUFPS(SCRATCH, R(XMM4), sel);
        }

        // Store dest back to memory
        MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), SCRATCH);
    }
}
246
/**
 * Emits code that evaluates the flow-control condition of `instr` into RAX
 * (and the host flags): RAX is nonzero iff the condition holds.
 * COND0/COND1 hold the x/y condition codes as 0 or 1, so XOR-ing a code with
 * (ref ^ 1) yields 1 exactly when the code equals its reference value.
 */
void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
    // Note: NXOR is used below to check for equality
    switch (instr.flow_control.op) {
    case Instruction::FlowControlType::Or:
        // (x == refx) || (y == refy)
        MOV(32, R(RAX), R(COND0));
        MOV(32, R(RBX), R(COND1));
        XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1));
        XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1));
        OR(32, R(RAX), R(RBX));
        break;

    case Instruction::FlowControlType::And:
        // (x == refx) && (y == refy)
        MOV(32, R(RAX), R(COND0));
        MOV(32, R(RBX), R(COND1));
        XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1));
        XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1));
        AND(32, R(RAX), R(RBX));
        break;

    case Instruction::FlowControlType::JustX:
        // (x == refx)
        MOV(32, R(RAX), R(COND0));
        XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1));
        break;

    case Instruction::FlowControlType::JustY:
        // (y == refy)
        MOV(32, R(RAX), R(COND1));
        XOR(32, R(RAX), Imm32(instr.flow_control.refy.Value() ^ 1));
        break;
    }
}
277
278void JitCompiler::Compile_UniformCondition(Instruction instr) {
279 int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool));
280 CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
281}
282
/// ADD: dest = src1 + src2 (component-wise)
void JitCompiler::Compile_ADD(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    ADDPS(SRC1, R(SRC2));
    Compile_DestEnable(instr, SRC1);
}
289
/// DP3: three-component dot product of src1 and src2, broadcast to all components
void JitCompiler::Compile_DP3(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);

    if (Common::GetCPUCaps().sse4_1) {
        // Immediate 0x7f: multiply only the X/Y/Z components (high nibble 0111)
        // and broadcast the sum to every output lane (low nibble 1111)
        DPPS(SRC1, R(SRC2), 0x7f);
    } else {
        // SSE2 fallback: multiply component-wise, then broadcast each of the
        // X, Y and Z products and add them, so every lane holds the dot product
        MULPS(SRC1, R(SRC2));

        MOVAPS(SRC2, R(SRC1));
        SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));

        MOVAPS(SRC3, R(SRC1));
        SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));

        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
        ADDPS(SRC1, R(SRC2));
        ADDPS(SRC1, R(SRC3));
    }

    Compile_DestEnable(instr, SRC1);
}
312
/// DP4: four-component dot product of src1 and src2, broadcast to all components
void JitCompiler::Compile_DP4(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);

    if (Common::GetCPUCaps().sse4_1) {
        // Immediate 0xff: multiply all four components and broadcast the sum
        DPPS(SRC1, R(SRC2), 0xff);
    } else {
        // SSE2 fallback: two shuffle+add rounds perform a horizontal add so
        // that every lane ends up holding x+y+z+w of the products
        MULPS(SRC1, R(SRC2));

        MOVAPS(SRC2, R(SRC1));
        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
        ADDPS(SRC1, R(SRC2));

        MOVAPS(SRC2, R(SRC1));
        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
        ADDPS(SRC1, R(SRC2));
    }

    Compile_DestEnable(instr, SRC1);
}
333
/// MUL: dest = src1 * src2 (component-wise)
void JitCompiler::Compile_MUL(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    MULPS(SRC1, R(SRC2));
    Compile_DestEnable(instr, SRC1);
}
340
/// FLR: dest = floor(src1) (component-wise)
void JitCompiler::Compile_FLR(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);

    if (Common::GetCPUCaps().sse4_1) {
        ROUNDFLOORPS(SRC1, R(SRC1));
    } else {
        // Round-trip through integers to drop the fractional part.
        // NOTE(review): CVTPS2DQ uses the current MXCSR rounding mode
        // (round-to-nearest-even by default), so this fallback rounds rather
        // than floors — results differ from ROUNDFLOORPS for negative values
        // and halfway cases; verify whether this approximation is acceptable.
        CVTPS2DQ(SRC1, R(SRC1));
        CVTDQ2PS(SRC1, R(SRC1));
    }

    Compile_DestEnable(instr, SRC1);
}
353
/// MAX: dest = max(src1, src2) (component-wise)
void JitCompiler::Compile_MAX(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    MAXPS(SRC1, R(SRC2));
    Compile_DestEnable(instr, SRC1);
}
360
/// MIN: dest = min(src1, src2) (component-wise)
void JitCompiler::Compile_MIN(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    MINPS(SRC1, R(SRC2));
    Compile_DestEnable(instr, SRC1);
}
367
/**
 * MOVA: converts the X/Y components of the source to integers and moves them
 * into the two address-offset registers used for relative addressing.
 * Each value is sign-extended and shifted left by 4 (multiplied by 16, the
 * byte size of one four-float register) so it can later be used directly as a
 * byte offset when addressing registers.
 */
void JitCompiler::Compile_MOVA(Instruction instr) {
    SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] };

    if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) {
        return; // NoOp
    }

    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);

    // Convert floats to integers (only care about X and Y components)
    CVTPS2DQ(SRC1, R(SRC1));

    // Get result: RAX = (Y << 32) | X as packed 32-bit integers
    MOVQ_xmm(R(RAX), SRC1);

    // Handle destination enable
    if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) {
        // Move and sign-extend low 32 bits
        MOVSX(64, 32, ADDROFFS_REG_0, R(RAX));

        // Move and sign-extend high 32 bits
        SHR(64, R(RAX), Imm8(32));
        MOVSX(64, 32, ADDROFFS_REG_1, R(RAX));

        // Multiply by 16 to be used as an offset later
        SHL(64, R(ADDROFFS_REG_0), Imm8(4));
        SHL(64, R(ADDROFFS_REG_1), Imm8(4));
    } else {
        if (swiz.DestComponentEnabled(0)) {
            // Move and sign-extend low 32 bits
            MOVSX(64, 32, ADDROFFS_REG_0, R(RAX));

            // Multiply by 16 to be used as an offset later
            SHL(64, R(ADDROFFS_REG_0), Imm8(4));
        } else if (swiz.DestComponentEnabled(1)) {
            // Move and sign-extend high 32 bits
            SHR(64, R(RAX), Imm8(32));
            MOVSX(64, 32, ADDROFFS_REG_1, R(RAX));

            // Multiply by 16 to be used as an offset later
            SHL(64, R(ADDROFFS_REG_1), Imm8(4));
        }
    }
}
412
/// MOV: dest = src1 (with swizzle and destination mask applied)
void JitCompiler::Compile_MOV(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_DestEnable(instr, SRC1);
}
417
418void JitCompiler::Compile_SLTI(Instruction instr) {
419 Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
420 Compile_SwizzleSrc(instr, 1, instr.common.src2i, SRC2);
421
422 CMPSS(SRC1, R(SRC2), CMP_LT);
423 ANDPS(SRC1, R(ONE));
424
425 Compile_DestEnable(instr, SRC1);
426}
427
/// RCP: dest = 1 / src1, using the hardware's fast reciprocal approximation
void JitCompiler::Compile_RCP(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);

    // TODO(bunnei): RCPPS is a pretty rough approximation, this might cause problems if Pica
    // performs this operation more accurately. This should be checked on hardware.
    RCPPS(SRC1, R(SRC1));

    Compile_DestEnable(instr, SRC1);
}
437
/// RSQ: dest = 1 / sqrt(src1), using the hardware's fast inverse-sqrt approximation
void JitCompiler::Compile_RSQ(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);

    // TODO(bunnei): RSQRTPS is a pretty rough approximation, this might cause problems if Pica
    // performs this operation more accurately. This should be checked on hardware.
    RSQRTPS(SRC1, R(SRC1));

    Compile_DestEnable(instr, SRC1);
}
447
/// NOP: no operation — emits no host code
void JitCompiler::Compile_NOP(Instruction instr) {
}
450
/// END: restores the callee-saved registers pushed in Compile() and returns
/// from the generated shader function
void JitCompiler::Compile_END(Instruction instr) {
    ABI_PopAllCalleeSavedRegsAndAdjustStack();
    RET();
}
455
456void JitCompiler::Compile_CALL(Instruction instr) {
457 unsigned offset = instr.flow_control.dest_offset;
458 while (offset < (instr.flow_control.dest_offset + instr.flow_control.num_instructions)) {
459 Compile_NextInstr(&offset);
460 }
461}
462
/// CALLC: like CALL, but the inlined call body only runs if the flow-control
/// condition evaluates to true
void JitCompiler::Compile_CALLC(Instruction instr) {
    Compile_EvaluateCondition(instr);
    // Skip the inlined body when the condition is false (ZF set by the
    // ALU ops emitted above)
    FixupBranch b = J_CC(CC_Z, true);
    Compile_CALL(instr);
    SetJumpTarget(b);
}
469
/// CALLU: like CALL, but the inlined call body only runs if the selected
/// boolean uniform is nonzero
void JitCompiler::Compile_CALLU(Instruction instr) {
    Compile_UniformCondition(instr);
    // Skip the inlined body when the uniform is zero (ZF set by the CMP above)
    FixupBranch b = J_CC(CC_Z, true);
    Compile_CALL(instr);
    SetJumpTarget(b);
}
476
/**
 * CMP: compares the X and Y components of src1 and src2 using (possibly
 * different) compare ops and stores the boolean results (0 or 1) in the
 * COND0/COND1 registers, which later condition evaluation reads.
 */
void JitCompiler::Compile_CMP(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);

    // Map the Pica compare ops (EQ, NE, LT, LE, GT, GE) to SSE compare
    // predicates; GT/GE use the inverted NLE/NLT predicates
    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT };

    if (instr.common.compare_op.x == instr.common.compare_op.y) {
        // Compare X-component and Y-component together
        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]);

        // Low 64 bits hold the X (bits 0-31) and Y (bits 32-63) result masks
        MOVQ_xmm(R(COND0), SRC1);
        MOV(64, R(COND1), R(COND0));
    } else {
        // Compare X-component
        MOVAPS(SCRATCH, R(SRC1));
        CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]);

        // Compare Y-component
        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]);

        MOVQ_xmm(R(COND0), SCRATCH);
        MOVQ_xmm(R(COND1), SRC1);
    }

    // Reduce the all-ones/all-zeros compare masks to 0/1 by shifting down the
    // relevant sign bit: X result is bit 31 of COND0, Y result is bit 63 of COND1
    SHR(32, R(COND0), Imm8(31));
    SHR(64, R(COND1), Imm8(63));
}
504
/// MAD/MADI: dest = src1 * src2 + src3 (component-wise)
void JitCompiler::Compile_MAD(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1);

    // MADI uses the alternate (inverted) source encoding for operands 2 and 3
    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
        Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2);
        Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3);
    } else {
        Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2);
        Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
    }

    if (Common::GetCPUCaps().fma) {
        // SRC1 = SRC2 * SRC1 + SRC3 (fused, single rounding)
        VFMADD213PS(SRC1, SRC2, R(SRC3));
    } else {
        // NOTE(review): this two-instruction fallback rounds twice, so results
        // can differ from the FMA path in the least significant bit
        MULPS(SRC1, R(SRC2));
        ADDPS(SRC1, R(SRC3));
    }

    Compile_DestEnable(instr, SRC1);
}
525
/**
 * IF/IFU/IFC: conditional block with an optional else-block. The "true" body
 * spans up to dest_offset - 1; when num_instructions is nonzero, the range
 * [dest_offset, dest_offset + num_instructions) forms the "else" body.
 */
void JitCompiler::Compile_IF(Instruction instr) {
    ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements not supported");

    // Evaluate the "IF" condition
    if (instr.opcode.Value() == OpCode::Id::IFU) {
        Compile_UniformCondition(instr);
    } else if (instr.opcode.Value() == OpCode::Id::IFC) {
        Compile_EvaluateCondition(instr);
    }
    // Branch past the "true" body when the condition is false
    FixupBranch b = J_CC(CC_Z, true);

    // Compile the code that corresponds to the condition evaluating as true
    Compile_Block(instr.flow_control.dest_offset - 1);

    // If there isn't an "ELSE" condition, we are done here
    if (instr.flow_control.num_instructions == 0) {
        SetJumpTarget(b);
        return;
    }

    // The "true" body must jump over the "else" body
    FixupBranch b2 = J(true);

    SetJumpTarget(b);

    // This code corresponds to the "ELSE" condition
    // Compile the code that corresponds to the condition evaluating as false
    Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions - 1);

    SetJumpTarget(b2);
}
556
/**
 * LOOP: compiles the loop body once and emits a host-side loop around it.
 * The selected integer uniform provides, per its components:
 * X = iteration count - 1, Y = initial aL value, Z = aL increment.
 * The body spans up to and including dest_offset.
 */
void JitCompiler::Compile_LOOP(Instruction instr) {
    ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops not supported");
    ASSERT_MSG(!looping, "Nested loops not supported");

    looping = true;

    // Unpack the three components of the integer uniform into the loop registers.
    // NOTE(review): ADDROFFS_REG_0/1 are pre-scaled by 16 before use as byte
    // offsets (see Compile_MOVA), but LOOPCOUNT_REG holds the raw aL value
    // here — verify that aL-relative addressing accounts for this scale
    // difference.
    int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>));
    MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset));
    MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT));
    SHR(32, R(LOOPCOUNT_REG), Imm8(8));
    AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start
    MOV(32, R(LOOPINC), R(LOOPCOUNT));
    SHR(32, R(LOOPINC), Imm8(16));
    MOVZX(32, 8, LOOPINC, R(LOOPINC)); // Z-component is the incrementer
    MOVZX(32, 8, LOOPCOUNT, R(LOOPCOUNT)); // X-component is iteration count
    ADD(32, R(LOOPCOUNT), Imm8(1)); // Iteration count is X-component + 1

    auto loop_start = GetCodePtr();

    Compile_Block(instr.flow_control.dest_offset);

    ADD(32, R(LOOPCOUNT_REG), R(LOOPINC)); // Increment LOOPCOUNT_REG by Z-component
    SUB(32, R(LOOPCOUNT), Imm8(1)); // Decrement the remaining iteration count
    J_CC(CC_NZ, loop_start); // Loop while iterations remain

    looping = false;
}
584
585void JitCompiler::Compile_JMP(Instruction instr) {
586 ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps not supported");
587
588 if (instr.opcode.Value() == OpCode::Id::JMPC)
589 Compile_EvaluateCondition(instr);
590 else if (instr.opcode.Value() == OpCode::Id::JMPU)
591 Compile_UniformCondition(instr);
592 else
593 UNREACHABLE();
594
595 FixupBranch b = J_CC(CC_NZ, true);
596
597 Compile_Block(instr.flow_control.dest_offset);
598
599 SetJumpTarget(b);
600}
601
602void JitCompiler::Compile_Block(unsigned stop) {
603 // Save current offset pointer
604 unsigned* prev_offset_ptr = offset_ptr;
605 unsigned offset = *prev_offset_ptr;
606
607 while (offset <= stop)
608 Compile_NextInstr(&offset);
609
610 // Restore current offset pointer
611 offset_ptr = prev_offset_ptr;
612 *offset_ptr = offset;
613}
614
615void JitCompiler::Compile_NextInstr(unsigned* offset) {
616 offset_ptr = offset;
617
618 Instruction instr = *(Instruction*)&g_state.vs.program_code[(*offset_ptr)++];
619 OpCode::Id opcode = instr.opcode.Value();
620 auto instr_func = instr_table[static_cast<unsigned>(opcode)];
621
622 if (instr_func) {
623 // JIT the instruction!
624 ((*this).*instr_func)(instr);
625 } else {
626 // Unhandled instruction
627 LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", instr.opcode.Value(), instr.hex);
628 }
629}
630
631CompiledShader* JitCompiler::Compile() {
632 const u8* start = GetCodePtr();
633 const auto& code = g_state.vs.program_code;
634 unsigned offset = g_state.regs.vs.main_offset;
635
636 ABI_PushAllCalleeSavedRegsAndAdjustStack();
637
638 MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1));
639 MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms));
640
641 // Zero address/loop registers
642 XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0));
643 XOR(64, R(ADDROFFS_REG_1), R(ADDROFFS_REG_1));
644 XOR(64, R(LOOPCOUNT_REG), R(LOOPCOUNT_REG));
645
646 // Used to set a register to one
647 static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
648 MOV(PTRBITS, R(RAX), ImmPtr(&one));
649 MOVAPS(ONE, MDisp(RAX, 0));
650
651 // Used to negate registers
652 static const __m128 neg = { -0.f, -0.f, -0.f, -0.f };
653 MOV(PTRBITS, R(RAX), ImmPtr(&neg));
654 MOVAPS(NEGBIT, MDisp(RAX, 0));
655
656 looping = false;
657
658 while (offset < g_state.vs.program_code.size()) {
659 Compile_NextInstr(&offset);
660 }
661
662 return (CompiledShader*)start;
663}
664
665JitCompiler::JitCompiler() {
666 AllocCodeSpace(1024 * 1024 * 4);
667}
668
/// Discards all previously generated code so the code space can be reused
void JitCompiler::Clear() {
    ClearCodeSpace();
}
672
673} // namespace Shader
674
675} // namespace Pica
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
new file mode 100644
index 000000000..b88f2a0d2
--- /dev/null
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -0,0 +1,79 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <nihstro/shader_bytecode.h>
8
9#include "common/x64/emitter.h"
10
11#include "video_core/pica.h"
12
13#include "shader.h"
14
15using nihstro::Instruction;
16using nihstro::OpCode;
17using nihstro::SwizzlePattern;
18
19namespace Pica {
20
21namespace Shader {
22
23using CompiledShader = void(void* registers);
24
25/**
26 * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
27 * code that can be executed on the host machine directly.
28 */
29class JitCompiler : public Gen::XCodeBlock {
30public:
31 JitCompiler();
32
33 CompiledShader* Compile();
34
35 void Clear();
36
37 void Compile_ADD(Instruction instr);
38 void Compile_DP3(Instruction instr);
39 void Compile_DP4(Instruction instr);
40 void Compile_MUL(Instruction instr);
41 void Compile_FLR(Instruction instr);
42 void Compile_MAX(Instruction instr);
43 void Compile_MIN(Instruction instr);
44 void Compile_RCP(Instruction instr);
45 void Compile_RSQ(Instruction instr);
46 void Compile_MOVA(Instruction instr);
47 void Compile_MOV(Instruction instr);
48 void Compile_SLTI(Instruction instr);
49 void Compile_NOP(Instruction instr);
50 void Compile_END(Instruction instr);
51 void Compile_CALL(Instruction instr);
52 void Compile_CALLC(Instruction instr);
53 void Compile_CALLU(Instruction instr);
54 void Compile_IF(Instruction instr);
55 void Compile_LOOP(Instruction instr);
56 void Compile_JMP(Instruction instr);
57 void Compile_CMP(Instruction instr);
58 void Compile_MAD(Instruction instr);
59
60private:
61 void Compile_Block(unsigned stop);
62 void Compile_NextInstr(unsigned* offset);
63
64 void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
65 void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
66
67 void Compile_EvaluateCondition(Instruction instr);
68 void Compile_UniformCondition(Instruction instr);
69
70 /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.
71 unsigned* offset_ptr = nullptr;
72
73 /// Set to true if currently in a loop, used to check for the existence of nested loops
74 bool looping = false;
75};
76
77} // Shader
78
79} // Pica
diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h
deleted file mode 100644
index 97f9250dd..000000000
--- a/src/video_core/vertex_shader.h
+++ /dev/null
@@ -1,73 +0,0 @@
1// Copyright 2014 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <type_traits>
8
9#include "common/vector_math.h"
10
11#include "pica.h"
12
13namespace Pica {
14
15namespace VertexShader {
16
17struct InputVertex {
18 Math::Vec4<float24> attr[16];
19};
20
21struct OutputVertex {
22 OutputVertex() = default;
23
24 // VS output attributes
25 Math::Vec4<float24> pos;
26 Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
27 Math::Vec4<float24> color;
28 Math::Vec2<float24> tc0;
29 Math::Vec2<float24> tc1;
30 float24 pad[6];
31 Math::Vec2<float24> tc2;
32
33 // Padding for optimal alignment
34 float24 pad2[4];
35
36 // Attributes used to store intermediate results
37
38 // position after perspective divide
39 Math::Vec3<float24> screenpos;
40 float24 pad3;
41
42 // Linear interpolation
43 // factor: 0=this, 1=vtx
44 void Lerp(float24 factor, const OutputVertex& vtx) {
45 pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
46
47 // TODO: Should perform perspective correct interpolation here...
48 tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
49 tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor);
50 tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
51
52 screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
53
54 color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
55 }
56
57 // Linear interpolation
58 // factor: 0=v0, 1=v1
59 static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) {
60 OutputVertex ret = v0;
61 ret.Lerp(factor, v1);
62 return ret;
63 }
64};
65static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
66static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
67
68OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup);
69
70} // namespace
71
72} // namespace
73
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index 3becc4261..943fde5ee 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -23,6 +23,7 @@ EmuWindow* g_emu_window = nullptr; ///< Frontend emulator window
23RendererBase* g_renderer = nullptr; ///< Renderer plugin 23RendererBase* g_renderer = nullptr; ///< Renderer plugin
24 24
25std::atomic<bool> g_hw_renderer_enabled; 25std::atomic<bool> g_hw_renderer_enabled;
26std::atomic<bool> g_shader_jit_enabled;
26 27
27/// Initialize the video core 28/// Initialize the video core
28void Init(EmuWindow* emu_window) { 29void Init(EmuWindow* emu_window) {
diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h
index 14b33c9dd..2867bf03e 100644
--- a/src/video_core/video_core.h
+++ b/src/video_core/video_core.h
@@ -32,8 +32,9 @@ static const int kScreenBottomHeight = 240; ///< 3DS bottom screen height
32extern RendererBase* g_renderer; ///< Renderer plugin 32extern RendererBase* g_renderer; ///< Renderer plugin
33extern EmuWindow* g_emu_window; ///< Emu window 33extern EmuWindow* g_emu_window; ///< Emu window
34 34
35// TODO: Wrap this in a user settings struct along with any other graphics settings (often set from qt ui) 35// TODO: Wrap these in a user settings struct along with any other graphics settings (often set from qt ui)
36extern std::atomic<bool> g_hw_renderer_enabled; 36extern std::atomic<bool> g_hw_renderer_enabled;
37extern std::atomic<bool> g_shader_jit_enabled;
37 38
38/// Start the video core 39/// Start the video core
39void Start(); 40void Start();