summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Yuri Kunde Schlesner, 2015-08-27 16:34:13 -0700
committer: Yuri Kunde Schlesner, 2015-08-27 16:34:13 -0700
commit: c5a4025b6581c1c64c2761d09510c5827eaada05 (patch)
tree: c7b7072b2ad53041127c454e7de0dcb0607d02e8 /src
parent: Merge pull request #1068 from bunnei/gl-hash-textures (diff)
parent: fixup! Shaders: Fix multiplications between 0.0 and inf (diff)
download: yuzu-c5a4025b6581c1c64c2761d09510c5827eaada05.tar.gz
yuzu-c5a4025b6581c1c64c2761d09510c5827eaada05.tar.xz
yuzu-c5a4025b6581c1c64c2761d09510c5827eaada05.zip
Merge pull request #1065 from yuriks/shader-fp
Shader FP compliance fixes
Diffstat (limited to 'src')
-rw-r--r-- src/video_core/pica.h 14
-rw-r--r-- src/video_core/shader/shader_interpreter.cpp 10
-rw-r--r-- src/video_core/shader/shader_jit_x64.cpp 127
-rw-r--r-- src/video_core/shader/shader_jit_x64.h 6
4 files changed, 100 insertions, 57 deletions
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 58b924f9e..bb689f2a9 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -1021,12 +1021,20 @@ struct float24 {
1021 return ret; 1021 return ret;
1022 } 1022 }
1023 1023
1024 static float24 Zero() {
1025 return FromFloat32(0.f);
1026 }
1027
1024 // Not recommended for anything but logging 1028 // Not recommended for anything but logging
1025 float ToFloat32() const { 1029 float ToFloat32() const {
1026 return value; 1030 return value;
1027 } 1031 }
1028 1032
1029 float24 operator * (const float24& flt) const { 1033 float24 operator * (const float24& flt) const {
1034 if ((this->value == 0.f && !std::isnan(flt.value)) ||
1035 (flt.value == 0.f && !std::isnan(this->value)))
1036 // PICA gives 0 instead of NaN when multiplying by inf
1037 return Zero();
1030 return float24::FromFloat32(ToFloat32() * flt.ToFloat32()); 1038 return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
1031 } 1039 }
1032 1040
@@ -1043,7 +1051,11 @@ struct float24 {
1043 } 1051 }
1044 1052
1045 float24& operator *= (const float24& flt) { 1053 float24& operator *= (const float24& flt) {
1046 value *= flt.ToFloat32(); 1054 if ((this->value == 0.f && !std::isnan(flt.value)) ||
1055 (flt.value == 0.f && !std::isnan(this->value)))
1056 // PICA gives 0 instead of NaN when multiplying by inf
1057 *this = Zero();
1058 else value *= flt.ToFloat32();
1047 return *this; 1059 return *this;
1048 } 1060 }
1049 1061
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index ae5a30441..69e4efa68 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) {
177 if (!swizzle.DestComponentEnabled(i)) 177 if (!swizzle.DestComponentEnabled(i))
178 continue; 178 continue;
179 179
180 dest[i] = std::max(src1[i], src2[i]); 180 // NOTE: Exact form required to match NaN semantics to hardware:
181 // max(0, NaN) -> NaN
182 // max(NaN, 0) -> 0
183 dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i];
181 } 184 }
182 Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); 185 Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
183 break; 186 break;
@@ -190,7 +193,10 @@ void RunInterpreter(UnitState<Debug>& state) {
190 if (!swizzle.DestComponentEnabled(i)) 193 if (!swizzle.DestComponentEnabled(i))
191 continue; 194 continue;
192 195
193 dest[i] = std::min(src1[i], src2[i]); 196 // NOTE: Exact form required to match NaN semantics to hardware:
197 // min(0, NaN) -> NaN
198 // min(NaN, 0) -> 0
199 dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i];
194 } 200 }
195 Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); 201 Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
196 break; 202 break;
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index a8045d4b0..d3cfe109e 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1;
115static const X64Reg SRC2 = XMM2; 115static const X64Reg SRC2 = XMM2;
116/// Loaded with the third swizzled source register, otherwise can be used as a scratch register 116/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
117static const X64Reg SRC3 = XMM3; 117static const X64Reg SRC3 = XMM3;
118/// Additional scratch register
119static const X64Reg SCRATCH2 = XMM4;
118/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one 120/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
119static const X64Reg ONE = XMM14; 121static const X64Reg ONE = XMM14;
120/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR 122/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
@@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
227 u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); 229 u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
228 BLENDPS(SCRATCH, R(src), mask); 230 BLENDPS(SCRATCH, R(src), mask);
229 } else { 231 } else {
230 MOVAPS(XMM4, R(src)); 232 MOVAPS(SCRATCH2, R(src));
231 UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination 233 UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination
232 UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination 234 UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination
233 235
234 // Compute selector to selectively copy source components to destination for SHUFPS instruction 236 // Compute selector to selectively copy source components to destination for SHUFPS instruction
@@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
236 ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | 238 ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
237 ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | 239 ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
238 ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); 240 ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
239 SHUFPS(SCRATCH, R(XMM4), sel); 241 SHUFPS(SCRATCH, R(SCRATCH2), sel);
240 } 242 }
241 243
242 // Store dest back to memory 244 // Store dest back to memory
@@ -244,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
244 } 246 }
245} 247}
246 248
249void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) {
250 MOVAPS(scratch, R(src1));
251 CMPPS(scratch, R(src2), CMP_ORD);
252
253 MULPS(src1, R(src2));
254
255 MOVAPS(src2, R(src1));
256 CMPPS(src2, R(src2), CMP_UNORD);
257
258 XORPS(scratch, R(src2));
259 ANDPS(src1, R(scratch));
260}
261
247void JitCompiler::Compile_EvaluateCondition(Instruction instr) { 262void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
248 // Note: NXOR is used below to check for equality 263 // Note: NXOR is used below to check for equality
249 switch (instr.flow_control.op) { 264 switch (instr.flow_control.op) {
@@ -307,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) {
307 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); 322 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
308 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); 323 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
309 324
310 if (Common::GetCPUCaps().sse4_1) { 325 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
311 DPPS(SRC1, R(SRC2), 0x7f);
312 } else {
313 MULPS(SRC1, R(SRC2));
314 326
315 MOVAPS(SRC2, R(SRC1)); 327 MOVAPS(SRC2, R(SRC1));
316 SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); 328 SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
317 329
318 MOVAPS(SRC3, R(SRC1)); 330 MOVAPS(SRC3, R(SRC1));
319 SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); 331 SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
320 332
321 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); 333 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
322 ADDPS(SRC1, R(SRC2)); 334 ADDPS(SRC1, R(SRC2));
323 ADDPS(SRC1, R(SRC3)); 335 ADDPS(SRC1, R(SRC3));
324 }
325 336
326 Compile_DestEnable(instr, SRC1); 337 Compile_DestEnable(instr, SRC1);
327} 338}
@@ -330,19 +341,15 @@ void JitCompiler::Compile_DP4(Instruction instr) {
330 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); 341 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
331 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); 342 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
332 343
333 if (Common::GetCPUCaps().sse4_1) { 344 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
334 DPPS(SRC1, R(SRC2), 0xff);
335 } else {
336 MULPS(SRC1, R(SRC2));
337 345
338 MOVAPS(SRC2, R(SRC1)); 346 MOVAPS(SRC2, R(SRC1));
339 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY 347 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
340 ADDPS(SRC1, R(SRC2)); 348 ADDPS(SRC1, R(SRC2));
341 349
342 MOVAPS(SRC2, R(SRC1)); 350 MOVAPS(SRC2, R(SRC1));
343 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX 351 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
344 ADDPS(SRC1, R(SRC2)); 352 ADDPS(SRC1, R(SRC2));
345 }
346 353
347 Compile_DestEnable(instr, SRC1); 354 Compile_DestEnable(instr, SRC1);
348} 355}
@@ -359,23 +366,22 @@ void JitCompiler::Compile_DPH(Instruction instr) {
359 if (Common::GetCPUCaps().sse4_1) { 366 if (Common::GetCPUCaps().sse4_1) {
360 // Set 4th component to 1.0 367 // Set 4th component to 1.0
361 BLENDPS(SRC1, R(ONE), 0x8); // 0b1000 368 BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
362 DPPS(SRC1, R(SRC2), 0xff);
363 } else { 369 } else {
364 // Reverse to set the 4th component to 1.0 370 // Set 4th component to 1.0
365 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); 371 MOVAPS(SCRATCH, R(SRC1));
366 MOVSS(SRC1, R(ONE)); 372 UNPCKHPS(SCRATCH, R(ONE)); // XYZW, 1111 -> Z1__
367 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); 373 UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1
374 }
368 375
369 MULPS(SRC1, R(SRC2)); 376 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
370 377
371 MOVAPS(SRC2, R(SRC1)); 378 MOVAPS(SRC2, R(SRC1));
372 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY 379 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
373 ADDPS(SRC1, R(SRC2)); 380 ADDPS(SRC1, R(SRC2));
374 381
375 MOVAPS(SRC2, R(SRC1)); 382 MOVAPS(SRC2, R(SRC1));
376 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX 383 SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
377 ADDPS(SRC1, R(SRC2)); 384 ADDPS(SRC1, R(SRC2));
378 }
379 385
380 Compile_DestEnable(instr, SRC1); 386 Compile_DestEnable(instr, SRC1);
381} 387}
@@ -415,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) {
415void JitCompiler::Compile_MUL(Instruction instr) { 421void JitCompiler::Compile_MUL(Instruction instr) {
416 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); 422 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
417 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); 423 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
418 MULPS(SRC1, R(SRC2)); 424 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
419 Compile_DestEnable(instr, SRC1); 425 Compile_DestEnable(instr, SRC1);
420} 426}
421 427
@@ -465,6 +471,7 @@ void JitCompiler::Compile_FLR(Instruction instr) {
465void JitCompiler::Compile_MAX(Instruction instr) { 471void JitCompiler::Compile_MAX(Instruction instr) {
466 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); 472 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
467 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); 473 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
474 // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
468 MAXPS(SRC1, R(SRC2)); 475 MAXPS(SRC1, R(SRC2));
469 Compile_DestEnable(instr, SRC1); 476 Compile_DestEnable(instr, SRC1);
470} 477}
@@ -472,6 +479,7 @@ void JitCompiler::Compile_MAX(Instruction instr) {
472void JitCompiler::Compile_MIN(Instruction instr) { 479void JitCompiler::Compile_MIN(Instruction instr) {
473 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); 480 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
474 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); 481 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
482 // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
475 MINPS(SRC1, R(SRC2)); 483 MINPS(SRC1, R(SRC2));
476 Compile_DestEnable(instr, SRC1); 484 Compile_DestEnable(instr, SRC1);
477} 485}
@@ -578,27 +586,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) {
578} 586}
579 587
580void JitCompiler::Compile_CMP(Instruction instr) { 588void JitCompiler::Compile_CMP(Instruction instr) {
589 using Op = Instruction::Common::CompareOpType::Op;
590 Op op_x = instr.common.compare_op.x;
591 Op op_y = instr.common.compare_op.y;
592
581 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); 593 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
582 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); 594 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
583 595
584 static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; 596 // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
597 // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
598 // because they don't match when used with NaNs.
599 static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE };
600
601 bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
602 Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1;
603 Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2;
585 604
586 if (instr.common.compare_op.x == instr.common.compare_op.y) { 605 if (op_x == op_y) {
587 // Compare X-component and Y-component together 606 // Compare X-component and Y-component together
588 CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); 607 CMPPS(lhs_x, R(rhs_x), cmp[op_x]);
608 MOVQ_xmm(R(COND0), lhs_x);
589 609
590 MOVQ_xmm(R(COND0), SRC1);
591 MOV(64, R(COND1), R(COND0)); 610 MOV(64, R(COND1), R(COND0));
592 } else { 611 } else {
612 bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
613 Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1;
614 Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2;
615
593 // Compare X-component 616 // Compare X-component
594 MOVAPS(SCRATCH, R(SRC1)); 617 MOVAPS(SCRATCH, R(lhs_x));
595 CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); 618 CMPSS(SCRATCH, R(rhs_x), cmp[op_x]);
596 619
597 // Compare Y-component 620 // Compare Y-component
598 CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); 621 CMPPS(lhs_y, R(rhs_y), cmp[op_y]);
599 622
600 MOVQ_xmm(R(COND0), SCRATCH); 623 MOVQ_xmm(R(COND0), SCRATCH);
601 MOVQ_xmm(R(COND1), SRC1); 624 MOVQ_xmm(R(COND1), lhs_y);
602 } 625 }
603 626
604 SHR(32, R(COND0), Imm8(31)); 627 SHR(32, R(COND0), Imm8(31));
@@ -616,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
616 Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); 639 Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
617 } 640 }
618 641
619 if (Common::GetCPUCaps().fma) { 642 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
620 VFMADD213PS(SRC1, SRC2, R(SRC3)); 643 ADDPS(SRC1, R(SRC3));
621 } else {
622 MULPS(SRC1, R(SRC2));
623 ADDPS(SRC1, R(SRC3));
624 }
625 644
626 Compile_DestEnable(instr, SRC1); 645 Compile_DestEnable(instr, SRC1);
627} 646}
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index fbe19fe93..58828ecc8 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -68,6 +68,12 @@ private:
68 void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); 68 void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
69 void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); 69 void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
70 70
71 /**
72 * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
73 * zero by inf. Clobbers `src2` and `scratch`.
74 */
75 void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch);
76
71 void Compile_EvaluateCondition(Instruction instr); 77 void Compile_EvaluateCondition(Instruction instr);
72 void Compile_UniformCondition(Instruction instr); 78 void Compile_UniformCondition(Instruction instr);
73 79