diff options
| author | 2015-08-27 16:34:13 -0700 | |
|---|---|---|
| committer | 2015-08-27 16:34:13 -0700 | |
| commit | c5a4025b6581c1c64c2761d09510c5827eaada05 (patch) | |
| tree | c7b7072b2ad53041127c454e7de0dcb0607d02e8 /src | |
| parent | Merge pull request #1068 from bunnei/gl-hash-textures (diff) | |
| parent | fixup! Shaders: Fix multiplications between 0.0 and inf (diff) | |
| download | yuzu-c5a4025b6581c1c64c2761d09510c5827eaada05.tar.gz yuzu-c5a4025b6581c1c64c2761d09510c5827eaada05.tar.xz yuzu-c5a4025b6581c1c64c2761d09510c5827eaada05.zip | |
Merge pull request #1065 from yuriks/shader-fp
Shader FP compliance fixes
Diffstat (limited to 'src')
| -rw-r--r-- | src/video_core/pica.h | 14 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 10 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 127 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.h | 6 |
4 files changed, 100 insertions, 57 deletions
diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 58b924f9e..bb689f2a9 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h | |||
| @@ -1021,12 +1021,20 @@ struct float24 { | |||
| 1021 | return ret; | 1021 | return ret; |
| 1022 | } | 1022 | } |
| 1023 | 1023 | ||
| 1024 | static float24 Zero() { | ||
| 1025 | return FromFloat32(0.f); | ||
| 1026 | } | ||
| 1027 | |||
| 1024 | // Not recommended for anything but logging | 1028 | // Not recommended for anything but logging |
| 1025 | float ToFloat32() const { | 1029 | float ToFloat32() const { |
| 1026 | return value; | 1030 | return value; |
| 1027 | } | 1031 | } |
| 1028 | 1032 | ||
| 1029 | float24 operator * (const float24& flt) const { | 1033 | float24 operator * (const float24& flt) const { |
| 1034 | if ((this->value == 0.f && !std::isnan(flt.value)) || | ||
| 1035 | (flt.value == 0.f && !std::isnan(this->value))) | ||
| 1036 | // PICA gives 0 instead of NaN when multiplying by inf | ||
| 1037 | return Zero(); | ||
| 1030 | return float24::FromFloat32(ToFloat32() * flt.ToFloat32()); | 1038 | return float24::FromFloat32(ToFloat32() * flt.ToFloat32()); |
| 1031 | } | 1039 | } |
| 1032 | 1040 | ||
| @@ -1043,7 +1051,11 @@ struct float24 { | |||
| 1043 | } | 1051 | } |
| 1044 | 1052 | ||
| 1045 | float24& operator *= (const float24& flt) { | 1053 | float24& operator *= (const float24& flt) { |
| 1046 | value *= flt.ToFloat32(); | 1054 | if ((this->value == 0.f && !std::isnan(flt.value)) || |
| 1055 | (flt.value == 0.f && !std::isnan(this->value))) | ||
| 1056 | // PICA gives 0 instead of NaN when multiplying by inf | ||
| 1057 | *this = Zero(); | ||
| 1058 | else value *= flt.ToFloat32(); | ||
| 1047 | return *this; | 1059 | return *this; |
| 1048 | } | 1060 | } |
| 1049 | 1061 | ||
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index ae5a30441..69e4efa68 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp | |||
| @@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) { | |||
| 177 | if (!swizzle.DestComponentEnabled(i)) | 177 | if (!swizzle.DestComponentEnabled(i)) |
| 178 | continue; | 178 | continue; |
| 179 | 179 | ||
| 180 | dest[i] = std::max(src1[i], src2[i]); | 180 | // NOTE: Exact form required to match NaN semantics to hardware: |
| 181 | // max(0, NaN) -> NaN | ||
| 182 | // max(NaN, 0) -> 0 | ||
| 183 | dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i]; | ||
| 181 | } | 184 | } |
| 182 | Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); | 185 | Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); |
| 183 | break; | 186 | break; |
| @@ -190,7 +193,10 @@ void RunInterpreter(UnitState<Debug>& state) { | |||
| 190 | if (!swizzle.DestComponentEnabled(i)) | 193 | if (!swizzle.DestComponentEnabled(i)) |
| 191 | continue; | 194 | continue; |
| 192 | 195 | ||
| 193 | dest[i] = std::min(src1[i], src2[i]); | 196 | // NOTE: Exact form required to match NaN semantics to hardware: |
| 197 | // min(0, NaN) -> NaN | ||
| 198 | // min(NaN, 0) -> 0 | ||
| 199 | dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i]; | ||
| 194 | } | 200 | } |
| 195 | Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); | 201 | Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); |
| 196 | break; | 202 | break; |
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index a8045d4b0..d3cfe109e 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp | |||
| @@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1; | |||
| 115 | static const X64Reg SRC2 = XMM2; | 115 | static const X64Reg SRC2 = XMM2; |
| 116 | /// Loaded with the third swizzled source register, otherwise can be used as a scratch register | 116 | /// Loaded with the third swizzled source register, otherwise can be used as a scratch register |
| 117 | static const X64Reg SRC3 = XMM3; | 117 | static const X64Reg SRC3 = XMM3; |
| 118 | /// Additional scratch register | ||
| 119 | static const X64Reg SCRATCH2 = XMM4; | ||
| 118 | /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one | 120 | /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one |
| 119 | static const X64Reg ONE = XMM14; | 121 | static const X64Reg ONE = XMM14; |
| 120 | /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR | 122 | /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR |
| @@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { | |||
| 227 | u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); | 229 | u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); |
| 228 | BLENDPS(SCRATCH, R(src), mask); | 230 | BLENDPS(SCRATCH, R(src), mask); |
| 229 | } else { | 231 | } else { |
| 230 | MOVAPS(XMM4, R(src)); | 232 | MOVAPS(SCRATCH2, R(src)); |
| 231 | UNPCKHPS(XMM4, R(SCRATCH)); // Unpack Z/W components of source and destination | 233 | UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack Z/W components of source and destination |
| 232 | UNPCKLPS(SCRATCH, R(src)); // Unpack X/Y components of source and destination | 234 | UNPCKLPS(SCRATCH, R(src)); // Unpack X/Y components of source and destination |
| 233 | 235 | ||
| 234 | // Compute selector to selectively copy source components to destination for SHUFPS instruction | 236 | // Compute selector to selectively copy source components to destination for SHUFPS instruction |
| @@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { | |||
| 236 | ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | | 238 | ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | |
| 237 | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | | 239 | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | |
| 238 | ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); | 240 | ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); |
| 239 | SHUFPS(SCRATCH, R(XMM4), sel); | 241 | SHUFPS(SCRATCH, R(SCRATCH2), sel); |
| 240 | } | 242 | } |
| 241 | 243 | ||
| 242 | // Store dest back to memory | 244 | // Store dest back to memory |
| @@ -244,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { | |||
| 244 | } | 246 | } |
| 245 | } | 247 | } |
| 246 | 248 | ||
| 249 | void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) { | ||
| 250 | MOVAPS(scratch, R(src1)); | ||
| 251 | CMPPS(scratch, R(src2), CMP_ORD); | ||
| 252 | |||
| 253 | MULPS(src1, R(src2)); | ||
| 254 | |||
| 255 | MOVAPS(src2, R(src1)); | ||
| 256 | CMPPS(src2, R(src2), CMP_UNORD); | ||
| 257 | |||
| 258 | XORPS(scratch, R(src2)); | ||
| 259 | ANDPS(src1, R(scratch)); | ||
| 260 | } | ||
| 261 | |||
| 247 | void JitCompiler::Compile_EvaluateCondition(Instruction instr) { | 262 | void JitCompiler::Compile_EvaluateCondition(Instruction instr) { |
| 248 | // Note: NXOR is used below to check for equality | 263 | // Note: NXOR is used below to check for equality |
| 249 | switch (instr.flow_control.op) { | 264 | switch (instr.flow_control.op) { |
| @@ -307,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) { | |||
| 307 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 322 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 308 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | 323 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); |
| 309 | 324 | ||
| 310 | if (Common::GetCPUCaps().sse4_1) { | 325 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); |
| 311 | DPPS(SRC1, R(SRC2), 0x7f); | ||
| 312 | } else { | ||
| 313 | MULPS(SRC1, R(SRC2)); | ||
| 314 | 326 | ||
| 315 | MOVAPS(SRC2, R(SRC1)); | 327 | MOVAPS(SRC2, R(SRC1)); |
| 316 | SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); | 328 | SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); |
| 317 | 329 | ||
| 318 | MOVAPS(SRC3, R(SRC1)); | 330 | MOVAPS(SRC3, R(SRC1)); |
| 319 | SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); | 331 | SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); |
| 320 | 332 | ||
| 321 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); | 333 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); |
| 322 | ADDPS(SRC1, R(SRC2)); | 334 | ADDPS(SRC1, R(SRC2)); |
| 323 | ADDPS(SRC1, R(SRC3)); | 335 | ADDPS(SRC1, R(SRC3)); |
| 324 | } | ||
| 325 | 336 | ||
| 326 | Compile_DestEnable(instr, SRC1); | 337 | Compile_DestEnable(instr, SRC1); |
| 327 | } | 338 | } |
| @@ -330,19 +341,15 @@ void JitCompiler::Compile_DP4(Instruction instr) { | |||
| 330 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 341 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 331 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | 342 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); |
| 332 | 343 | ||
| 333 | if (Common::GetCPUCaps().sse4_1) { | 344 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); |
| 334 | DPPS(SRC1, R(SRC2), 0xff); | ||
| 335 | } else { | ||
| 336 | MULPS(SRC1, R(SRC2)); | ||
| 337 | 345 | ||
| 338 | MOVAPS(SRC2, R(SRC1)); | 346 | MOVAPS(SRC2, R(SRC1)); |
| 339 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY | 347 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY |
| 340 | ADDPS(SRC1, R(SRC2)); | 348 | ADDPS(SRC1, R(SRC2)); |
| 341 | 349 | ||
| 342 | MOVAPS(SRC2, R(SRC1)); | 350 | MOVAPS(SRC2, R(SRC1)); |
| 343 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX | 351 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX |
| 344 | ADDPS(SRC1, R(SRC2)); | 352 | ADDPS(SRC1, R(SRC2)); |
| 345 | } | ||
| 346 | 353 | ||
| 347 | Compile_DestEnable(instr, SRC1); | 354 | Compile_DestEnable(instr, SRC1); |
| 348 | } | 355 | } |
| @@ -359,23 +366,22 @@ void JitCompiler::Compile_DPH(Instruction instr) { | |||
| 359 | if (Common::GetCPUCaps().sse4_1) { | 366 | if (Common::GetCPUCaps().sse4_1) { |
| 360 | // Set 4th component to 1.0 | 367 | // Set 4th component to 1.0 |
| 361 | BLENDPS(SRC1, R(ONE), 0x8); // 0b1000 | 368 | BLENDPS(SRC1, R(ONE), 0x8); // 0b1000 |
| 362 | DPPS(SRC1, R(SRC2), 0xff); | ||
| 363 | } else { | 369 | } else { |
| 364 | // Reverse to set the 4th component to 1.0 | 370 | // Set 4th component to 1.0 |
| 365 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); | 371 | MOVAPS(SCRATCH, R(SRC1)); |
| 366 | MOVSS(SRC1, R(ONE)); | 372 | UNPCKHPS(SCRATCH, R(ONE)); // XYZW, 1111 -> Z1__ |
| 367 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); | 373 | UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1 |
| 374 | } | ||
| 368 | 375 | ||
| 369 | MULPS(SRC1, R(SRC2)); | 376 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); |
| 370 | 377 | ||
| 371 | MOVAPS(SRC2, R(SRC1)); | 378 | MOVAPS(SRC2, R(SRC1)); |
| 372 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY | 379 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY |
| 373 | ADDPS(SRC1, R(SRC2)); | 380 | ADDPS(SRC1, R(SRC2)); |
| 374 | 381 | ||
| 375 | MOVAPS(SRC2, R(SRC1)); | 382 | MOVAPS(SRC2, R(SRC1)); |
| 376 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX | 383 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX |
| 377 | ADDPS(SRC1, R(SRC2)); | 384 | ADDPS(SRC1, R(SRC2)); |
| 378 | } | ||
| 379 | 385 | ||
| 380 | Compile_DestEnable(instr, SRC1); | 386 | Compile_DestEnable(instr, SRC1); |
| 381 | } | 387 | } |
| @@ -415,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) { | |||
| 415 | void JitCompiler::Compile_MUL(Instruction instr) { | 421 | void JitCompiler::Compile_MUL(Instruction instr) { |
| 416 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 422 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 417 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | 423 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); |
| 418 | MULPS(SRC1, R(SRC2)); | 424 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); |
| 419 | Compile_DestEnable(instr, SRC1); | 425 | Compile_DestEnable(instr, SRC1); |
| 420 | } | 426 | } |
| 421 | 427 | ||
| @@ -465,6 +471,7 @@ void JitCompiler::Compile_FLR(Instruction instr) { | |||
| 465 | void JitCompiler::Compile_MAX(Instruction instr) { | 471 | void JitCompiler::Compile_MAX(Instruction instr) { |
| 466 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 472 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 467 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | 473 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); |
| 474 | // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. | ||
| 468 | MAXPS(SRC1, R(SRC2)); | 475 | MAXPS(SRC1, R(SRC2)); |
| 469 | Compile_DestEnable(instr, SRC1); | 476 | Compile_DestEnable(instr, SRC1); |
| 470 | } | 477 | } |
| @@ -472,6 +479,7 @@ void JitCompiler::Compile_MAX(Instruction instr) { | |||
| 472 | void JitCompiler::Compile_MIN(Instruction instr) { | 479 | void JitCompiler::Compile_MIN(Instruction instr) { |
| 473 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 480 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 474 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | 481 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); |
| 482 | // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. | ||
| 475 | MINPS(SRC1, R(SRC2)); | 483 | MINPS(SRC1, R(SRC2)); |
| 476 | Compile_DestEnable(instr, SRC1); | 484 | Compile_DestEnable(instr, SRC1); |
| 477 | } | 485 | } |
| @@ -578,27 +586,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) { | |||
| 578 | } | 586 | } |
| 579 | 587 | ||
| 580 | void JitCompiler::Compile_CMP(Instruction instr) { | 588 | void JitCompiler::Compile_CMP(Instruction instr) { |
| 589 | using Op = Instruction::Common::CompareOpType::Op; | ||
| 590 | Op op_x = instr.common.compare_op.x; | ||
| 591 | Op op_y = instr.common.compare_op.y; | ||
| 592 | |||
| 581 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 593 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 582 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | 594 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); |
| 583 | 595 | ||
| 584 | static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; | 596 | // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to |
| 597 | // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here | ||
| 598 | // because they don't match when used with NaNs. | ||
| 599 | static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE }; | ||
| 600 | |||
| 601 | bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); | ||
| 602 | Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1; | ||
| 603 | Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2; | ||
| 585 | 604 | ||
| 586 | if (instr.common.compare_op.x == instr.common.compare_op.y) { | 605 | if (op_x == op_y) { |
| 587 | // Compare X-component and Y-component together | 606 | // Compare X-component and Y-component together |
| 588 | CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); | 607 | CMPPS(lhs_x, R(rhs_x), cmp[op_x]); |
| 608 | MOVQ_xmm(R(COND0), lhs_x); | ||
| 589 | 609 | ||
| 590 | MOVQ_xmm(R(COND0), SRC1); | ||
| 591 | MOV(64, R(COND1), R(COND0)); | 610 | MOV(64, R(COND1), R(COND0)); |
| 592 | } else { | 611 | } else { |
| 612 | bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); | ||
| 613 | Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1; | ||
| 614 | Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2; | ||
| 615 | |||
| 593 | // Compare X-component | 616 | // Compare X-component |
| 594 | MOVAPS(SCRATCH, R(SRC1)); | 617 | MOVAPS(SCRATCH, R(lhs_x)); |
| 595 | CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); | 618 | CMPSS(SCRATCH, R(rhs_x), cmp[op_x]); |
| 596 | 619 | ||
| 597 | // Compare Y-component | 620 | // Compare Y-component |
| 598 | CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); | 621 | CMPPS(lhs_y, R(rhs_y), cmp[op_y]); |
| 599 | 622 | ||
| 600 | MOVQ_xmm(R(COND0), SCRATCH); | 623 | MOVQ_xmm(R(COND0), SCRATCH); |
| 601 | MOVQ_xmm(R(COND1), SRC1); | 624 | MOVQ_xmm(R(COND1), lhs_y); |
| 602 | } | 625 | } |
| 603 | 626 | ||
| 604 | SHR(32, R(COND0), Imm8(31)); | 627 | SHR(32, R(COND0), Imm8(31)); |
| @@ -616,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) { | |||
| 616 | Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); | 639 | Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); |
| 617 | } | 640 | } |
| 618 | 641 | ||
| 619 | if (Common::GetCPUCaps().fma) { | 642 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); |
| 620 | VFMADD213PS(SRC1, SRC2, R(SRC3)); | 643 | ADDPS(SRC1, R(SRC3)); |
| 621 | } else { | ||
| 622 | MULPS(SRC1, R(SRC2)); | ||
| 623 | ADDPS(SRC1, R(SRC3)); | ||
| 624 | } | ||
| 625 | 644 | ||
| 626 | Compile_DestEnable(instr, SRC1); | 645 | Compile_DestEnable(instr, SRC1); |
| 627 | } | 646 | } |
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index fbe19fe93..58828ecc8 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h | |||
| @@ -68,6 +68,12 @@ private: | |||
| 68 | void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); | 68 | void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); |
| 69 | void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); | 69 | void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); |
| 70 | 70 | ||
| 71 | /** | ||
| 72 | * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying | ||
| 73 | * zero by inf. Clobbers `src2` and `scratch`. | ||
| 74 | */ | ||
| 75 | void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch); | ||
| 76 | |||
| 71 | void Compile_EvaluateCondition(Instruction instr); | 77 | void Compile_EvaluateCondition(Instruction instr); |
| 72 | void Compile_UniformCondition(Instruction instr); | 78 | void Compile_UniformCondition(Instruction instr); |
| 73 | 79 | ||