summaryrefslogtreecommitdiff
path: root/src/core/hw/y2r.cpp
diff options
context:
space:
mode:
authorGravatar bunnei2017-10-10 17:32:14 -0400
committerGravatar bunnei2017-10-10 17:32:14 -0400
commit0906de9a14b735d1d409290ca050eb7d2c2d3d84 (patch)
tree79bb57d3a4dc4ca377e7a62744c3941de29e785b /src/core/hw/y2r.cpp
parentMerge remote-tracking branch 'upstream/master' into nx (diff)
downloadyuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar.gz
yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar.xz
yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.zip
hle: Remove a large amount of 3ds-specific service code.
Diffstat (limited to 'src/core/hw/y2r.cpp')
-rw-r--r--src/core/hw/y2r.cpp382
1 files changed, 0 insertions, 382 deletions
diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp
deleted file mode 100644
index e697f84b3..000000000
--- a/src/core/hw/y2r.cpp
+++ /dev/null
@@ -1,382 +0,0 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <memory>
9#include "common/assert.h"
10#include "common/color.h"
11#include "common/common_types.h"
12#include "common/math_util.h"
13#include "common/vector_math.h"
14#include "core/hle/service/y2r_u.h"
15#include "core/hw/y2r.h"
16#include "core/memory.h"
17
18namespace HW {
19namespace Y2R {
20
21using namespace Service::Y2R;
22
23static const size_t MAX_TILES = 1024 / 8;
24static const size_t TILE_SIZE = 8 * 8;
25using ImageTile = std::array<u32, TILE_SIZE>;
26
27/// Converts a image strip from the source YUV format into individual 8x8 RGB32 tiles.
28static void ConvertYUVToRGB(InputFormat input_format, const u8* input_Y, const u8* input_U,
29 const u8* input_V, ImageTile output[], unsigned int width,
30 unsigned int height, const CoefficientSet& coefficients) {
31
32 for (unsigned int y = 0; y < height; ++y) {
33 for (unsigned int x = 0; x < width; ++x) {
34 s32 Y = 0;
35 s32 U = 0;
36 s32 V = 0;
37 switch (input_format) {
38 case InputFormat::YUV422_Indiv8:
39 case InputFormat::YUV422_Indiv16:
40 Y = input_Y[y * width + x];
41 U = input_U[(y * width + x) / 2];
42 V = input_V[(y * width + x) / 2];
43 break;
44 case InputFormat::YUV420_Indiv8:
45 case InputFormat::YUV420_Indiv16:
46 Y = input_Y[y * width + x];
47 U = input_U[((y / 2) * width + x) / 2];
48 V = input_V[((y / 2) * width + x) / 2];
49 break;
50 case InputFormat::YUYV422_Interleaved:
51 Y = input_Y[(y * width + x) * 2];
52 U = input_Y[(y * width + (x / 2) * 2) * 2 + 1];
53 V = input_Y[(y * width + (x / 2) * 2) * 2 + 3];
54 break;
55 }
56
57 // This conversion process is bit-exact with hardware, as far as could be tested.
58 auto& c = coefficients;
59 s32 cY = c[0] * Y;
60
61 s32 r = cY + c[1] * V;
62 s32 g = cY - c[2] * V - c[3] * U;
63 s32 b = cY + c[4] * U;
64
65 const s32 rounding_offset = 0x18;
66 r = (r >> 3) + c[5] + rounding_offset;
67 g = (g >> 3) + c[6] + rounding_offset;
68 b = (b >> 3) + c[7] + rounding_offset;
69
70 unsigned int tile = x / 8;
71 unsigned int tile_x = x % 8;
72 u32* out = &output[tile][y * 8 + tile_x];
73
74 using MathUtil::Clamp;
75 *out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) | ((u32)Clamp(g >> 5, 0, 0xFF) << 16) |
76 ((u32)Clamp(b >> 5, 0, 0xFF) << 8);
77 }
78 }
79}
80
81/// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit
82/// formats to 8-bit.
83template <size_t N>
84static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) {
85 const u8* input = Memory::GetPointer(buf.address);
86
87 size_t output_unit = buf.transfer_unit / N;
88 ASSERT(amount_of_data % output_unit == 0);
89
90 while (amount_of_data > 0) {
91 for (size_t i = 0; i < output_unit; ++i) {
92 output[i] = input[i * N];
93 }
94
95 output += output_unit;
96 input += buf.transfer_unit + buf.gap;
97
98 buf.address += buf.transfer_unit + buf.gap;
99 buf.image_size -= buf.transfer_unit;
100 amount_of_data -= output_unit;
101 }
102}
103
104/// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA
105/// transfer.
106static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data,
107 OutputFormat output_format, u8 alpha) {
108
109 u8* output = Memory::GetPointer(buf.address);
110
111 while (amount_of_data > 0) {
112 u8* unit_end = output + buf.transfer_unit;
113 while (output < unit_end) {
114 u32 color = *input++;
115 Math::Vec4<u8> col_vec{(u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha};
116
117 switch (output_format) {
118 case OutputFormat::RGBA8:
119 Color::EncodeRGBA8(col_vec, output);
120 output += 4;
121 break;
122 case OutputFormat::RGB8:
123 Color::EncodeRGB8(col_vec, output);
124 output += 3;
125 break;
126 case OutputFormat::RGB5A1:
127 Color::EncodeRGB5A1(col_vec, output);
128 output += 2;
129 break;
130 case OutputFormat::RGB565:
131 Color::EncodeRGB565(col_vec, output);
132 output += 2;
133 break;
134 }
135
136 amount_of_data -= 1;
137 }
138
139 output += buf.gap;
140 buf.address += buf.transfer_unit + buf.gap;
141 buf.image_size -= buf.transfer_unit;
142 }
143}
144
145static const u8 linear_lut[TILE_SIZE] = {
146 // clang-format off
147 0, 1, 2, 3, 4, 5, 6, 7,
148 8, 9, 10, 11, 12, 13, 14, 15,
149 16, 17, 18, 19, 20, 21, 22, 23,
150 24, 25, 26, 27, 28, 29, 30, 31,
151 32, 33, 34, 35, 36, 37, 38, 39,
152 40, 41, 42, 43, 44, 45, 46, 47,
153 48, 49, 50, 51, 52, 53, 54, 55,
154 56, 57, 58, 59, 60, 61, 62, 63,
155 // clang-format on
156};
157
158static const u8 morton_lut[TILE_SIZE] = {
159 // clang-format off
160 0, 1, 4, 5, 16, 17, 20, 21,
161 2, 3, 6, 7, 18, 19, 22, 23,
162 8, 9, 12, 13, 24, 25, 28, 29,
163 10, 11, 14, 15, 26, 27, 30, 31,
164 32, 33, 36, 37, 48, 49, 52, 53,
165 34, 35, 38, 39, 50, 51, 54, 55,
166 40, 41, 44, 45, 56, 57, 60, 61,
167 42, 43, 46, 47, 58, 59, 62, 63,
168 // clang-format on
169};
170
171static void RotateTile0(const ImageTile& input, ImageTile& output, int height,
172 const u8 out_map[64]) {
173 for (int i = 0; i < height * 8; ++i) {
174 output[out_map[i]] = input[i];
175 }
176}
177
178static void RotateTile90(const ImageTile& input, ImageTile& output, int height,
179 const u8 out_map[64]) {
180 int out_i = 0;
181 for (int x = 0; x < 8; ++x) {
182 for (int y = height - 1; y >= 0; --y) {
183 output[out_map[out_i++]] = input[y * 8 + x];
184 }
185 }
186}
187
188static void RotateTile180(const ImageTile& input, ImageTile& output, int height,
189 const u8 out_map[64]) {
190 int out_i = 0;
191 for (int i = height * 8 - 1; i >= 0; --i) {
192 output[out_map[out_i++]] = input[i];
193 }
194}
195
196static void RotateTile270(const ImageTile& input, ImageTile& output, int height,
197 const u8 out_map[64]) {
198 int out_i = 0;
199 for (int x = 8 - 1; x >= 0; --x) {
200 for (int y = 0; y < height; ++y) {
201 output[out_map[out_i++]] = input[y * 8 + x];
202 }
203 }
204}
205
206static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) {
207 for (int y = 0; y < height; ++y) {
208 for (int x = 0; x < 8; ++x) {
209 output[y * line_stride + x] = tile[y * 8 + x];
210 }
211 }
212}
213
214/**
215 * Performs a Y2R colorspace conversion.
216 *
217 * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most
218 * commonly used for video playback or to display camera input to the screen.
219 *
220 * The conversion process is quite configurable, and can be divided in distinct steps. From
221 * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data
222 * internally and converts it in one go before writing to the output and loading the next strip.
223 *
224 * The steps taken to convert one strip of image data are:
225 *
226 * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is
227 * presumably stored in one or more internal buffers. This process can be done in several separate
228 * transfers, as long as they don't exceed the size of the internal image buffer. This allows
229 * flexibility in input strides.
230 * - The input data is decoded into a YUV tuple. Several formats are suported, see the `InputFormat`
231 * enum.
232 * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured
233 * using a set of coefficients to support different colorspace standards. See `CoefficientSet`.
234 * - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed
235 * independently, this notably rotates each *strip*, not the entire image. This means that for 90
236 * or 270 degree rotations, the output will be in terms of several 8 x height images, and for any
237 * non-zero rotation the strips will have to be re-arranged so that the parts of the image will
238 * not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90
239 * or 270 degree rotations in images with non-even height don't seem to work properly.
240 * - The data is converted to the output RGB format. See the `OutputFormat` enum.
241 * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by
242 * the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the
243 * image must have a height divisible by 8. The image width must always be divisible by 8.
244 * - The final data is then CDMAed out to main memory and the next image strip is processed. This
245 * offers the same flexibility as the input stage.
246 *
247 * In this implementation, to avoid the combinatorial explosion of parameter combinations, common
248 * intermediate formats are used and where possible tables or parameters are used instead of
249 * diverging code paths to keep the amount of branches in check. Some steps are also merged to
250 * increase efficiency.
251 *
252 * Output for all valid settings combinations matches hardware, however output in some edge-cases
253 * differs:
254 *
255 * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last
256 * strip, especially when combined with rotation.
257 * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation
258 * produces misaligned output on the last strip. This implmentation produces output with the
259 * correct "expected" alignment.
260 *
261 * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases,
262 * so they are believed to be invalid configurations anyway.
263 */
264void PerformConversion(ConversionConfiguration& cvt) {
265 ASSERT(cvt.input_line_width % 8 == 0);
266 ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0);
267 // Tiles per row
268 size_t num_tiles = cvt.input_line_width / 8;
269 ASSERT(num_tiles <= MAX_TILES);
270
271 // Buffer used as a CDMA source/target.
272 std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]);
273 // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32.
274 std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]);
275 ImageTile tmp_tile;
276
277 // LUT used to remap writes to a tile. Used to allow linear or swizzled output without
278 // requiring two different code paths.
279 const u8* tile_remap = nullptr;
280 switch (cvt.block_alignment) {
281 case BlockAlignment::Linear:
282 tile_remap = linear_lut;
283 break;
284 case BlockAlignment::Block8x8:
285 tile_remap = morton_lut;
286 break;
287 }
288
289 for (unsigned int y = 0; y < cvt.input_lines; y += 8) {
290 unsigned int row_height = std::min(cvt.input_lines - y, 8u);
291
292 // Total size in pixels of incoming data required for this strip.
293 const size_t row_data_size = row_height * cvt.input_line_width;
294
295 u8* input_Y = data_buffer.get();
296 u8* input_U = input_Y + 8 * cvt.input_line_width;
297 u8* input_V = input_U + 8 * cvt.input_line_width / 2;
298
299 switch (cvt.input_format) {
300 case InputFormat::YUV422_Indiv8:
301 ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
302 ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2);
303 ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2);
304 break;
305 case InputFormat::YUV420_Indiv8:
306 ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
307 ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4);
308 ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4);
309 break;
310 case InputFormat::YUV422_Indiv16:
311 ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
312 ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2);
313 ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2);
314 break;
315 case InputFormat::YUV420_Indiv16:
316 ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
317 ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4);
318 ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4);
319 break;
320 case InputFormat::YUYV422_Interleaved:
321 input_U = nullptr;
322 input_V = nullptr;
323 ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2);
324 break;
325 }
326
327 // Note(yuriks): If additional optimization is required, input_format can be moved to a
328 // template parameter, so that its dispatch can be moved to outside the inner loop.
329 ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(),
330 cvt.input_line_width, row_height, cvt.coefficients);
331
332 u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get());
333
334 for (size_t i = 0; i < num_tiles; ++i) {
335 int image_strip_width = 0;
336 int output_stride = 0;
337
338 switch (cvt.rotation) {
339 case Rotation::None:
340 RotateTile0(tiles[i], tmp_tile, row_height, tile_remap);
341 image_strip_width = cvt.input_line_width;
342 output_stride = 8;
343 break;
344 case Rotation::Clockwise_90:
345 RotateTile90(tiles[i], tmp_tile, row_height, tile_remap);
346 image_strip_width = 8;
347 output_stride = 8 * row_height;
348 break;
349 case Rotation::Clockwise_180:
350 // For 180 and 270 degree rotations we also invert the order of tiles in the strip,
351 // since the rotates are done individually on each tile.
352 RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
353 image_strip_width = cvt.input_line_width;
354 output_stride = 8;
355 break;
356 case Rotation::Clockwise_270:
357 RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
358 image_strip_width = 8;
359 output_stride = 8 * row_height;
360 break;
361 }
362
363 switch (cvt.block_alignment) {
364 case BlockAlignment::Linear:
365 WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width);
366 output_buffer += output_stride;
367 break;
368 case BlockAlignment::Block8x8:
369 WriteTileToOutput(output_buffer, tmp_tile, 8, 8);
370 output_buffer += TILE_SIZE;
371 break;
372 }
373 }
374
375 // Note(yuriks): If additional optimization is required, output_format can be moved to a
376 // template parameter, so that its dispatch can be moved to outside the inner loop.
377 SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size,
378 cvt.output_format, (u8)cvt.alpha);
379 }
380}
381}
382}