diff options
| author | 2015-08-23 15:13:36 +0200 | |
|---|---|---|
| committer | 2015-08-23 22:01:17 +0200 | |
| commit | fa552f11ef1f96afab715c9653c705e3fbbe2a74 (patch) | |
| tree | 152cb8e6321e7d02b61893079573ac0b2d07ce68 /src | |
| parent | x64-emitter: add RCPSS SSE instruction (diff) | |
| download | yuzu-fa552f11ef1f96afab715c9653c705e3fbbe2a74.tar.gz yuzu-fa552f11ef1f96afab715c9653c705e3fbbe2a74.tar.xz yuzu-fa552f11ef1f96afab715c9653c705e3fbbe2a74.zip | |
Shader: RCP and RSQ compute only the 1st component
Diffstat (limited to 'src')
| -rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 10 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 10 |
2 files changed, 10 insertions, 10 deletions
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 063cc38f0..51fee6f97 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp | |||
| @@ -221,13 +221,12 @@ void RunInterpreter(UnitState<Debug>& state) { | |||
| 221 | { | 221 | { |
| 222 | Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); | 222 | Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); |
| 223 | Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); | 223 | Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); |
| 224 | float24 rcp_res = float24::FromFloat32(1.0f / src1[0].ToFloat32()); | ||
| 224 | for (int i = 0; i < 4; ++i) { | 225 | for (int i = 0; i < 4; ++i) { |
| 225 | if (!swizzle.DestComponentEnabled(i)) | 226 | if (!swizzle.DestComponentEnabled(i)) |
| 226 | continue; | 227 | continue; |
| 227 | 228 | ||
| 228 | // TODO: Be stable against division by zero! | 229 | dest[i] = rcp_res; |
| 229 | // TODO: I think this might be wrong... we should only use one component here | ||
| 230 | dest[i] = float24::FromFloat32(1.0f / src1[i].ToFloat32()); | ||
| 231 | } | 230 | } |
| 232 | Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); | 231 | Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); |
| 233 | break; | 232 | break; |
| @@ -238,13 +237,12 @@ void RunInterpreter(UnitState<Debug>& state) { | |||
| 238 | { | 237 | { |
| 239 | Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); | 238 | Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); |
| 240 | Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); | 239 | Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); |
| 240 | float24 rsq_res = float24::FromFloat32(1.0f / sqrt(src1[0].ToFloat32())); | ||
| 241 | for (int i = 0; i < 4; ++i) { | 241 | for (int i = 0; i < 4; ++i) { |
| 242 | if (!swizzle.DestComponentEnabled(i)) | 242 | if (!swizzle.DestComponentEnabled(i)) |
| 243 | continue; | 243 | continue; |
| 244 | 244 | ||
| 245 | // TODO: Be stable against division by zero! | 245 | dest[i] = rsq_res; |
| 246 | // TODO: I think this might be wrong... we should only use one component here | ||
| 247 | dest[i] = float24::FromFloat32(1.0f / sqrt(src1[i].ToFloat32())); | ||
| 248 | } | 246 | } |
| 249 | Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); | 247 | Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); |
| 250 | break; | 248 | break; |
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index a1bdd8456..e52fe43fb 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp | |||
| @@ -496,9 +496,10 @@ void JitCompiler::Compile_MOV(Instruction instr) { | |||
| 496 | void JitCompiler::Compile_RCP(Instruction instr) { | 496 | void JitCompiler::Compile_RCP(Instruction instr) { |
| 497 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 497 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 498 | 498 | ||
| 499 | // TODO(bunnei): RCPPS is a pretty rough approximation, this might cause problems if Pica | 499 | // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica |
| 500 | // performs this operation more accurately. This should be checked on hardware. | 500 | // performs this operation more accurately. This should be checked on hardware. |
| 501 | RCPPS(SRC1, R(SRC1)); | 501 | RCPSS(SRC1, R(SRC1)); |
| 502 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX | ||
| 502 | 503 | ||
| 503 | Compile_DestEnable(instr, SRC1); | 504 | Compile_DestEnable(instr, SRC1); |
| 504 | } | 505 | } |
| @@ -506,9 +507,10 @@ void JitCompiler::Compile_RCP(Instruction instr) { | |||
| 506 | void JitCompiler::Compile_RSQ(Instruction instr) { | 507 | void JitCompiler::Compile_RSQ(Instruction instr) { |
| 507 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 508 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 508 | 509 | ||
| 509 | // TODO(bunnei): RSQRTPS is a pretty rough approximation, this might cause problems if Pica | 510 | // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica |
| 510 | // performs this operation more accurately. This should be checked on hardware. | 511 | // performs this operation more accurately. This should be checked on hardware. |
| 511 | RSQRTPS(SRC1, R(SRC1)); | 512 | RSQRTSS(SRC1, R(SRC1)); |
| 513 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX | ||
| 512 | 514 | ||
| 513 | Compile_DestEnable(instr, SRC1); | 515 | Compile_DestEnable(instr, SRC1); |
| 514 | } | 516 | } |