summaryrefslogtreecommitdiff
path: root/src/core/hw/y2r.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/hw/y2r.cpp')
-rw-r--r--src/core/hw/y2r.cpp369
1 files changed, 369 insertions, 0 deletions
diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp
new file mode 100644
index 000000000..5b7fb39e1
--- /dev/null
+++ b/src/core/hw/y2r.cpp
@@ -0,0 +1,369 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
#include <algorithm>
#include <array>
#include <memory>
#include <numeric>

#include "common/assert.h"
#include "common/color.h"
#include "common/common_types.h"
#include "common/math_util.h"
#include "common/vector_math.h"

#include "core/hle/service/y2r_u.h"
#include "core/memory.h"
16
17namespace HW {
18namespace Y2R {
19
20using namespace Y2R_U;
21
22static const size_t MAX_TILES = 1024 / 8;
23static const size_t TILE_SIZE = 8 * 8;
24using ImageTile = std::array<u32, TILE_SIZE>;
25
26/// Converts a image strip from the source YUV format into individual 8x8 RGB32 tiles.
27static void ConvertYUVToRGB(InputFormat input_format,
28 const u8* input_Y, const u8* input_U, const u8* input_V, ImageTile output[],
29 unsigned int width, unsigned int height, const CoefficientSet& coefficients) {
30
31 for (unsigned int y = 0; y < height; ++y) {
32 for (unsigned int x = 0; x < width; ++x) {
33 s32 Y, U, V;
34 switch (input_format) {
35 case InputFormat::YUV422_Indiv8:
36 case InputFormat::YUV422_Indiv16:
37 Y = input_Y[y * width + x];
38 U = input_U[(y * width + x) / 2];
39 V = input_V[(y * width + x) / 2];
40 break;
41 case InputFormat::YUV420_Indiv8:
42 case InputFormat::YUV420_Indiv16:
43 Y = input_Y[y * width + x];
44 U = input_U[((y / 2) * width + x) / 2];
45 V = input_V[((y / 2) * width + x) / 2];
46 break;
47 case InputFormat::YUYV422_Interleaved:
48 Y = input_Y[(y * width + x) * 2];
49 U = input_Y[(y * width + (x / 2) * 2) * 2 + 1];
50 V = input_Y[(y * width + (x / 2) * 2) * 2 + 3];
51 break;
52 }
53
54 // This conversion process is bit-exact with hardware, as far as could be tested.
55 auto& c = coefficients;
56 s32 cY = c[0]*Y;
57
58 s32 r = cY + c[1]*V;
59 s32 g = cY - c[3]*U - c[2]*V;
60 s32 b = cY + c[4]*U;
61
62 const s32 rounding_offset = 0x18;
63 r = (r >> 3) + c[5] + rounding_offset;
64 g = (g >> 3) + c[6] + rounding_offset;
65 b = (b >> 3) + c[7] + rounding_offset;
66
67 unsigned int tile = x / 8;
68 unsigned int tile_x = x % 8;
69 u32* out = &output[tile][y * 8 + tile_x];
70
71 using MathUtil::Clamp;
72 *out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) |
73 ((u32)Clamp(g >> 5, 0, 0xFF) << 16) |
74 ((u32)Clamp(b >> 5, 0, 0xFF) << 8);
75 }
76 }
77}
78
79/// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit formats to 8-bit.
80template <size_t N>
81static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) {
82 const u8* input = Memory::GetPointer(buf.address);
83
84 size_t output_unit = buf.transfer_unit / N;
85 ASSERT(amount_of_data % output_unit == 0);
86
87 while (amount_of_data > 0) {
88 for (size_t i = 0; i < output_unit; ++i) {
89 output[i] = input[i * N];
90 }
91
92 output += output_unit;
93 input += buf.transfer_unit + buf.gap;
94
95 buf.address += buf.transfer_unit + buf.gap;
96 buf.image_size -= buf.transfer_unit;
97 amount_of_data -= output_unit;
98 }
99}
100
101/// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA transfer.
102static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data,
103 OutputFormat output_format, u8 alpha) {
104
105 u8* output = Memory::GetPointer(buf.address);
106
107 while (amount_of_data > 0) {
108 u8* unit_end = output + buf.transfer_unit;
109 while (output < unit_end) {
110 u32 color = *input++;
111 Math::Vec4<u8> col_vec{
112 (color >> 24) & 0xFF, (color >> 16) & 0xFF, (color >> 8) & 0xFF, alpha,
113 };
114
115 switch (output_format) {
116 case OutputFormat::RGBA8:
117 Color::EncodeRGBA8(col_vec, output);
118 output += 4;
119 break;
120 case OutputFormat::RGB8:
121 Color::EncodeRGB8(col_vec, output);
122 output += 3;
123 break;
124 case OutputFormat::RGB5A1:
125 Color::EncodeRGB5A1(col_vec, output);
126 output += 2;
127 break;
128 case OutputFormat::RGB565:
129 Color::EncodeRGB565(col_vec, output);
130 output += 2;
131 break;
132 }
133
134 amount_of_data -= 1;
135 }
136
137 output += buf.gap;
138 buf.address += buf.transfer_unit + buf.gap;
139 buf.image_size -= buf.transfer_unit;
140 }
141}
142
143static const u8 linear_lut[64] = {
144 0, 1, 2, 3, 4, 5, 6, 7,
145 8, 9, 10, 11, 12, 13, 14, 15,
146 16, 17, 18, 19, 20, 21, 22, 23,
147 24, 25, 26, 27, 28, 29, 30, 31,
148 32, 33, 34, 35, 36, 37, 38, 39,
149 40, 41, 42, 43, 44, 45, 46, 47,
150 48, 49, 50, 51, 52, 53, 54, 55,
151 56, 57, 58, 59, 60, 61, 62, 63,
152};
153
154static const u8 morton_lut[64] = {
155 0, 1, 4, 5, 16, 17, 20, 21,
156 2, 3, 6, 7, 18, 19, 22, 23,
157 8, 9, 12, 13, 24, 25, 28, 29,
158 10, 11, 14, 15, 26, 27, 30, 31,
159 32, 33, 36, 37, 48, 49, 52, 53,
160 34, 35, 38, 39, 50, 51, 54, 55,
161 40, 41, 44, 45, 56, 57, 60, 61,
162 42, 43, 46, 47, 58, 59, 62, 63,
163};
164
165static void RotateTile0(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
166 for (int i = 0; i < height * 8; ++i) {
167 output[out_map[i]] = input[i];
168 }
169}
170
171static void RotateTile90(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
172 int out_i = 0;
173 for (int x = 0; x < 8; ++x) {
174 for (int y = height - 1; y >= 0; --y) {
175 output[out_map[out_i++]] = input[y * 8 + x];
176 }
177 }
178}
179
180static void RotateTile180(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
181 int out_i = 0;
182 for (int i = height * 8 - 1; i >= 0; --i) {
183 output[out_map[out_i++]] = input[i];
184 }
185}
186
187static void RotateTile270(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
188 int out_i = 0;
189 for (int x = 8-1; x >= 0; --x) {
190 for (int y = 0; y < height; ++y) {
191 output[out_map[out_i++]] = input[y * 8 + x];
192 }
193 }
194}
195
196static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) {
197 for (int y = 0; y < height; ++y) {
198 for (int x = 0; x < 8; ++x) {
199 output[y * line_stride + x] = tile[y * 8 + x];
200 }
201 }
202}
203
204/**
205 * Performs a Y2R colorspace conversion.
206 *
207 * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most
208 * commonly used for video playback or to display camera input to the screen.
209 *
210 * The conversion process is quite configurable, and can be divided in distinct steps. From
211 * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data
212 * internally and converts it in one go before writing to the output and loading the next strip.
213 *
214 * The steps taken to convert one strip of image data are:
215 *
216 * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is
217 * presumably stored in one or more internal buffers. This process can be done in several separate
218 * transfers, as long as they don't exceed the size of the internal image buffer. This allows
219 * flexibility in input strides.
220 * - The input data is decoded into a YUV tuple. Several formats are suported, see the `InputFormat`
221 * enum.
222 * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured
223 * using a set of coefficients to support different colorspace standards. See `CoefficientSet`.
224 * - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed
225 * independently, this notably rotates each *strip*, not the entire image. This means that for 90
226 * or 270 degree rotations, the output will be in terms of several 8 x height images, and for any
227 * non-zero rotation the strips will have to be re-arranged so that the parts of the image will
228 * not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90
229 * or 270 degree rotations in images with non-even height don't seem to work properly.
230 * - The data is converted to the output RGB format. See the `OutputFormat` enum.
231 * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by
232 * the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the
233 * image must have a height divisible by 8. The image width must always be divisible by 8.
234 * - The final data is then CDMAed out to main memory and the next image strip is processed. This
235 * offers the same flexibility as the input stage.
236 *
237 * In this implementation, to avoid the combinatorial explosion of parameter combinations, common
238 * intermediate formats are used and where possible tables or parameters are used instead of
239 * diverging code paths to keep the amount of branches in check. Some steps are also merged to
240 * increase efficiency.
241 *
242 * Output for all valid settings combinations matches hardware, however output in some edge-cases
243 * differs:
244 *
245 * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last
246 * strip, especially when combined with rotation.
247 * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation
248 * produces misaligned output on the last strip. This implmentation produces output with the
249 * correct "expected" alignment.
250 *
251 * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases,
252 * so they are believed to be invalid configurations anyway.
253 */
254void PerformConversion(ConversionConfiguration& cvt) {
255 ASSERT(cvt.input_line_width % 8 == 0);
256 ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0);
257 // Tiles per row
258 size_t num_tiles = cvt.input_line_width / 8;
259 ASSERT(num_tiles < MAX_TILES);
260
261 // Buffer used as a CDMA source/target.
262 std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]);
263 // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32.
264 std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]);
265 ImageTile tmp_tile;
266
267 // LUT used to remap writes to a tile. Used to allow linear or swizzled output without
268 // requiring two different code paths.
269 const u8* tile_remap;
270 switch (cvt.block_alignment) {
271 case BlockAlignment::Linear:
272 tile_remap = linear_lut; break;
273 case BlockAlignment::Block8x8:
274 tile_remap = morton_lut; break;
275 }
276
277 for (unsigned int y = 0; y < cvt.input_lines; y += 8) {
278 unsigned int row_height = std::min(cvt.input_lines - y, 8u);
279
280 // Total size in pixels of incoming data required for this strip.
281 const size_t row_data_size = row_height * cvt.input_line_width;
282
283 u8* input_Y = data_buffer.get();
284 u8* input_U = input_Y + 8 * cvt.input_line_width;
285 u8* input_V = input_U + 8 * cvt.input_line_width / 2;
286
287 switch (cvt.input_format) {
288 case InputFormat::YUV422_Indiv8:
289 ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
290 ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2);
291 ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2);
292 break;
293 case InputFormat::YUV420_Indiv8:
294 ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
295 ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4);
296 ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4);
297 break;
298 case InputFormat::YUV422_Indiv16:
299 ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
300 ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2);
301 ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2);
302 break;
303 case InputFormat::YUV420_Indiv16:
304 ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
305 ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4);
306 ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4);
307 break;
308 case InputFormat::YUYV422_Interleaved:
309 input_U = nullptr;
310 input_V = nullptr;
311 ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2);
312 break;
313 }
314
315 // Note(yuriks): If additional optimization is required, input_format can be moved to a
316 // template parameter, so that its dispatch can be moved to outside the inner loop.
317 ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(),
318 cvt.input_line_width, row_height, cvt.coefficients);
319
320 u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get());
321
322 for (int i = 0; i < num_tiles; ++i) {
323 int image_strip_width, output_stride;
324
325 switch (cvt.rotation) {
326 case Rotation::None:
327 RotateTile0(tiles[i], tmp_tile, row_height, tile_remap);
328 image_strip_width = cvt.input_line_width;
329 output_stride = 8;
330 break;
331 case Rotation::Clockwise_90:
332 RotateTile90(tiles[i], tmp_tile, row_height, tile_remap);
333 image_strip_width = 8;
334 output_stride = 8 * row_height;
335 break;
336 case Rotation::Clockwise_180:
337 // For 180 and 270 degree rotations we also invert the order of tiles in the strip,
338 // since the rotates are done individually on each tile.
339 RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
340 image_strip_width = cvt.input_line_width;
341 output_stride = 8;
342 break;
343 case Rotation::Clockwise_270:
344 RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
345 image_strip_width = 8;
346 output_stride = 8 * row_height;
347 break;
348 }
349
350 switch (cvt.block_alignment) {
351 case BlockAlignment::Linear:
352 WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width);
353 output_buffer += output_stride;
354 break;
355 case BlockAlignment::Block8x8:
356 WriteTileToOutput(output_buffer, tmp_tile, 8, 8);
357 output_buffer += TILE_SIZE;
358 break;
359 }
360 }
361
362 // Note(yuriks): If additional optimization is required, output_format can be moved to a
363 // template parameter, so that its dispatch can be moved to outside the inner loop.
364 SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size, cvt.output_format, (u8)cvt.alpha);
365 }
366}
367
368}
369}