diff options
| author | 2015-06-07 22:24:03 -0300 | |
|---|---|---|
| committer | 2015-06-21 20:58:55 -0300 | |
| commit | 3e6663da433d98a0bf4db1256ea3ccdefd404a0c (patch) | |
| tree | 88dc92d21b40edc99edf6c818a9c304b7f75cb1b /src/core/hw/y2r.cpp | |
| parent | Y2R: Re-organize how params are stored. Support SetConversionParams (diff) | |
| download | yuzu-3e6663da433d98a0bf4db1256ea3ccdefd404a0c.tar.gz yuzu-3e6663da433d98a0bf4db1256ea3ccdefd404a0c.tar.xz yuzu-3e6663da433d98a0bf4db1256ea3ccdefd404a0c.zip | |
Y2R: Rework conversion process, enabling support for all formats
Diffstat (limited to 'src/core/hw/y2r.cpp')
| -rw-r--r-- | src/core/hw/y2r.cpp | 369 |
1 files changed, 369 insertions, 0 deletions
diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp new file mode 100644 index 000000000..5b7fb39e1 --- /dev/null +++ b/src/core/hw/y2r.cpp | |||
| @@ -0,0 +1,369 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <array> | ||
| 6 | #include <numeric> | ||
| 7 | |||
| 8 | #include "common/assert.h" | ||
| 9 | #include "common/color.h" | ||
| 10 | #include "common/common_types.h" | ||
| 11 | #include "common/math_util.h" | ||
| 12 | #include "common/vector_math.h" | ||
| 13 | |||
| 14 | #include "core/hle/service/y2r_u.h" | ||
| 15 | #include "core/memory.h" | ||
| 16 | |||
| 17 | namespace HW { | ||
| 18 | namespace Y2R { | ||
| 19 | |||
| 20 | using namespace Y2R_U; | ||
| 21 | |||
| 22 | static const size_t MAX_TILES = 1024 / 8; | ||
| 23 | static const size_t TILE_SIZE = 8 * 8; | ||
| 24 | using ImageTile = std::array<u32, TILE_SIZE>; | ||
| 25 | |||
| 26 | /// Converts a image strip from the source YUV format into individual 8x8 RGB32 tiles. | ||
| 27 | static void ConvertYUVToRGB(InputFormat input_format, | ||
| 28 | const u8* input_Y, const u8* input_U, const u8* input_V, ImageTile output[], | ||
| 29 | unsigned int width, unsigned int height, const CoefficientSet& coefficients) { | ||
| 30 | |||
| 31 | for (unsigned int y = 0; y < height; ++y) { | ||
| 32 | for (unsigned int x = 0; x < width; ++x) { | ||
| 33 | s32 Y, U, V; | ||
| 34 | switch (input_format) { | ||
| 35 | case InputFormat::YUV422_Indiv8: | ||
| 36 | case InputFormat::YUV422_Indiv16: | ||
| 37 | Y = input_Y[y * width + x]; | ||
| 38 | U = input_U[(y * width + x) / 2]; | ||
| 39 | V = input_V[(y * width + x) / 2]; | ||
| 40 | break; | ||
| 41 | case InputFormat::YUV420_Indiv8: | ||
| 42 | case InputFormat::YUV420_Indiv16: | ||
| 43 | Y = input_Y[y * width + x]; | ||
| 44 | U = input_U[((y / 2) * width + x) / 2]; | ||
| 45 | V = input_V[((y / 2) * width + x) / 2]; | ||
| 46 | break; | ||
| 47 | case InputFormat::YUYV422_Interleaved: | ||
| 48 | Y = input_Y[(y * width + x) * 2]; | ||
| 49 | U = input_Y[(y * width + (x / 2) * 2) * 2 + 1]; | ||
| 50 | V = input_Y[(y * width + (x / 2) * 2) * 2 + 3]; | ||
| 51 | break; | ||
| 52 | } | ||
| 53 | |||
| 54 | // This conversion process is bit-exact with hardware, as far as could be tested. | ||
| 55 | auto& c = coefficients; | ||
| 56 | s32 cY = c[0]*Y; | ||
| 57 | |||
| 58 | s32 r = cY + c[1]*V; | ||
| 59 | s32 g = cY - c[3]*U - c[2]*V; | ||
| 60 | s32 b = cY + c[4]*U; | ||
| 61 | |||
| 62 | const s32 rounding_offset = 0x18; | ||
| 63 | r = (r >> 3) + c[5] + rounding_offset; | ||
| 64 | g = (g >> 3) + c[6] + rounding_offset; | ||
| 65 | b = (b >> 3) + c[7] + rounding_offset; | ||
| 66 | |||
| 67 | unsigned int tile = x / 8; | ||
| 68 | unsigned int tile_x = x % 8; | ||
| 69 | u32* out = &output[tile][y * 8 + tile_x]; | ||
| 70 | |||
| 71 | using MathUtil::Clamp; | ||
| 72 | *out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) | | ||
| 73 | ((u32)Clamp(g >> 5, 0, 0xFF) << 16) | | ||
| 74 | ((u32)Clamp(b >> 5, 0, 0xFF) << 8); | ||
| 75 | } | ||
| 76 | } | ||
| 77 | } | ||
| 78 | |||
| 79 | /// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit formats to 8-bit. | ||
| 80 | template <size_t N> | ||
| 81 | static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) { | ||
| 82 | const u8* input = Memory::GetPointer(buf.address); | ||
| 83 | |||
| 84 | size_t output_unit = buf.transfer_unit / N; | ||
| 85 | ASSERT(amount_of_data % output_unit == 0); | ||
| 86 | |||
| 87 | while (amount_of_data > 0) { | ||
| 88 | for (size_t i = 0; i < output_unit; ++i) { | ||
| 89 | output[i] = input[i * N]; | ||
| 90 | } | ||
| 91 | |||
| 92 | output += output_unit; | ||
| 93 | input += buf.transfer_unit + buf.gap; | ||
| 94 | |||
| 95 | buf.address += buf.transfer_unit + buf.gap; | ||
| 96 | buf.image_size -= buf.transfer_unit; | ||
| 97 | amount_of_data -= output_unit; | ||
| 98 | } | ||
| 99 | } | ||
| 100 | |||
| 101 | /// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA transfer. | ||
| 102 | static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data, | ||
| 103 | OutputFormat output_format, u8 alpha) { | ||
| 104 | |||
| 105 | u8* output = Memory::GetPointer(buf.address); | ||
| 106 | |||
| 107 | while (amount_of_data > 0) { | ||
| 108 | u8* unit_end = output + buf.transfer_unit; | ||
| 109 | while (output < unit_end) { | ||
| 110 | u32 color = *input++; | ||
| 111 | Math::Vec4<u8> col_vec{ | ||
| 112 | (color >> 24) & 0xFF, (color >> 16) & 0xFF, (color >> 8) & 0xFF, alpha, | ||
| 113 | }; | ||
| 114 | |||
| 115 | switch (output_format) { | ||
| 116 | case OutputFormat::RGBA8: | ||
| 117 | Color::EncodeRGBA8(col_vec, output); | ||
| 118 | output += 4; | ||
| 119 | break; | ||
| 120 | case OutputFormat::RGB8: | ||
| 121 | Color::EncodeRGB8(col_vec, output); | ||
| 122 | output += 3; | ||
| 123 | break; | ||
| 124 | case OutputFormat::RGB5A1: | ||
| 125 | Color::EncodeRGB5A1(col_vec, output); | ||
| 126 | output += 2; | ||
| 127 | break; | ||
| 128 | case OutputFormat::RGB565: | ||
| 129 | Color::EncodeRGB565(col_vec, output); | ||
| 130 | output += 2; | ||
| 131 | break; | ||
| 132 | } | ||
| 133 | |||
| 134 | amount_of_data -= 1; | ||
| 135 | } | ||
| 136 | |||
| 137 | output += buf.gap; | ||
| 138 | buf.address += buf.transfer_unit + buf.gap; | ||
| 139 | buf.image_size -= buf.transfer_unit; | ||
| 140 | } | ||
| 141 | } | ||
| 142 | |||
| 143 | static const u8 linear_lut[64] = { | ||
| 144 | 0, 1, 2, 3, 4, 5, 6, 7, | ||
| 145 | 8, 9, 10, 11, 12, 13, 14, 15, | ||
| 146 | 16, 17, 18, 19, 20, 21, 22, 23, | ||
| 147 | 24, 25, 26, 27, 28, 29, 30, 31, | ||
| 148 | 32, 33, 34, 35, 36, 37, 38, 39, | ||
| 149 | 40, 41, 42, 43, 44, 45, 46, 47, | ||
| 150 | 48, 49, 50, 51, 52, 53, 54, 55, | ||
| 151 | 56, 57, 58, 59, 60, 61, 62, 63, | ||
| 152 | }; | ||
| 153 | |||
| 154 | static const u8 morton_lut[64] = { | ||
| 155 | 0, 1, 4, 5, 16, 17, 20, 21, | ||
| 156 | 2, 3, 6, 7, 18, 19, 22, 23, | ||
| 157 | 8, 9, 12, 13, 24, 25, 28, 29, | ||
| 158 | 10, 11, 14, 15, 26, 27, 30, 31, | ||
| 159 | 32, 33, 36, 37, 48, 49, 52, 53, | ||
| 160 | 34, 35, 38, 39, 50, 51, 54, 55, | ||
| 161 | 40, 41, 44, 45, 56, 57, 60, 61, | ||
| 162 | 42, 43, 46, 47, 58, 59, 62, 63, | ||
| 163 | }; | ||
| 164 | |||
| 165 | static void RotateTile0(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { | ||
| 166 | for (int i = 0; i < height * 8; ++i) { | ||
| 167 | output[out_map[i]] = input[i]; | ||
| 168 | } | ||
| 169 | } | ||
| 170 | |||
| 171 | static void RotateTile90(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { | ||
| 172 | int out_i = 0; | ||
| 173 | for (int x = 0; x < 8; ++x) { | ||
| 174 | for (int y = height - 1; y >= 0; --y) { | ||
| 175 | output[out_map[out_i++]] = input[y * 8 + x]; | ||
| 176 | } | ||
| 177 | } | ||
| 178 | } | ||
| 179 | |||
| 180 | static void RotateTile180(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { | ||
| 181 | int out_i = 0; | ||
| 182 | for (int i = height * 8 - 1; i >= 0; --i) { | ||
| 183 | output[out_map[out_i++]] = input[i]; | ||
| 184 | } | ||
| 185 | } | ||
| 186 | |||
| 187 | static void RotateTile270(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { | ||
| 188 | int out_i = 0; | ||
| 189 | for (int x = 8-1; x >= 0; --x) { | ||
| 190 | for (int y = 0; y < height; ++y) { | ||
| 191 | output[out_map[out_i++]] = input[y * 8 + x]; | ||
| 192 | } | ||
| 193 | } | ||
| 194 | } | ||
| 195 | |||
| 196 | static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) { | ||
| 197 | for (int y = 0; y < height; ++y) { | ||
| 198 | for (int x = 0; x < 8; ++x) { | ||
| 199 | output[y * line_stride + x] = tile[y * 8 + x]; | ||
| 200 | } | ||
| 201 | } | ||
| 202 | } | ||
| 203 | |||
| 204 | /** | ||
| 205 | * Performs a Y2R colorspace conversion. | ||
| 206 | * | ||
| 207 | * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most | ||
| 208 | * commonly used for video playback or to display camera input to the screen. | ||
| 209 | * | ||
| 210 | * The conversion process is quite configurable, and can be divided in distinct steps. From | ||
| 211 | * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data | ||
| 212 | * internally and converts it in one go before writing to the output and loading the next strip. | ||
| 213 | * | ||
| 214 | * The steps taken to convert one strip of image data are: | ||
| 215 | * | ||
| 216 | * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is | ||
| 217 | * presumably stored in one or more internal buffers. This process can be done in several separate | ||
| 218 | * transfers, as long as they don't exceed the size of the internal image buffer. This allows | ||
| 219 | * flexibility in input strides. | ||
| 220 | * - The input data is decoded into a YUV tuple. Several formats are supported, see the `InputFormat` | ||
| 221 | * enum. | ||
| 222 | * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured | ||
| 223 | * using a set of coefficients to support different colorspace standards. See `CoefficientSet`. | ||
| 224 | * - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed | ||
| 225 | * independently, this notably rotates each *strip*, not the entire image. This means that for 90 | ||
| 226 | * or 270 degree rotations, the output will be in terms of several 8 x height images, and for any | ||
| 227 | * non-zero rotation the strips will have to be re-arranged so that the parts of the image will | ||
| 228 | * not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90 | ||
| 229 | * or 270 degree rotations in images with non-even height don't seem to work properly. | ||
| 230 | * - The data is converted to the output RGB format. See the `OutputFormat` enum. | ||
| 231 | * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by | ||
| 232 | * the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the | ||
| 233 | * image must have a height divisible by 8. The image width must always be divisible by 8. | ||
| 234 | * - The final data is then CDMAed out to main memory and the next image strip is processed. This | ||
| 235 | * offers the same flexibility as the input stage. | ||
| 236 | * | ||
| 237 | * In this implementation, to avoid the combinatorial explosion of parameter combinations, common | ||
| 238 | * intermediate formats are used and where possible tables or parameters are used instead of | ||
| 239 | * diverging code paths to keep the amount of branches in check. Some steps are also merged to | ||
| 240 | * increase efficiency. | ||
| 241 | * | ||
| 242 | * Output for all valid settings combinations matches hardware, however output in some edge-cases | ||
| 243 | * differs: | ||
| 244 | * | ||
| 245 | * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last | ||
| 246 | * strip, especially when combined with rotation. | ||
| 247 | * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation | ||
| 248 | * produces misaligned output on the last strip. This implementation produces output with the | ||
| 249 | * correct "expected" alignment. | ||
| 250 | * | ||
| 251 | * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases, | ||
| 252 | * so they are believed to be invalid configurations anyway. | ||
| 253 | */ | ||
| 254 | void PerformConversion(ConversionConfiguration& cvt) { | ||
| 255 | ASSERT(cvt.input_line_width % 8 == 0); | ||
| 256 | ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0); | ||
| 257 | // Tiles per row | ||
| 258 | size_t num_tiles = cvt.input_line_width / 8; | ||
| 259 | ASSERT(num_tiles < MAX_TILES); | ||
| 260 | |||
| 261 | // Buffer used as a CDMA source/target. | ||
| 262 | std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]); | ||
| 263 | // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32. | ||
| 264 | std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]); | ||
| 265 | ImageTile tmp_tile; | ||
| 266 | |||
| 267 | // LUT used to remap writes to a tile. Used to allow linear or swizzled output without | ||
| 268 | // requiring two different code paths. | ||
| 269 | const u8* tile_remap; | ||
| 270 | switch (cvt.block_alignment) { | ||
| 271 | case BlockAlignment::Linear: | ||
| 272 | tile_remap = linear_lut; break; | ||
| 273 | case BlockAlignment::Block8x8: | ||
| 274 | tile_remap = morton_lut; break; | ||
| 275 | } | ||
| 276 | |||
| 277 | for (unsigned int y = 0; y < cvt.input_lines; y += 8) { | ||
| 278 | unsigned int row_height = std::min(cvt.input_lines - y, 8u); | ||
| 279 | |||
| 280 | // Total size in pixels of incoming data required for this strip. | ||
| 281 | const size_t row_data_size = row_height * cvt.input_line_width; | ||
| 282 | |||
| 283 | u8* input_Y = data_buffer.get(); | ||
| 284 | u8* input_U = input_Y + 8 * cvt.input_line_width; | ||
| 285 | u8* input_V = input_U + 8 * cvt.input_line_width / 2; | ||
| 286 | |||
| 287 | switch (cvt.input_format) { | ||
| 288 | case InputFormat::YUV422_Indiv8: | ||
| 289 | ReceiveData<1>(input_Y, cvt.src_Y, row_data_size); | ||
| 290 | ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2); | ||
| 291 | ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2); | ||
| 292 | break; | ||
| 293 | case InputFormat::YUV420_Indiv8: | ||
| 294 | ReceiveData<1>(input_Y, cvt.src_Y, row_data_size); | ||
| 295 | ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4); | ||
| 296 | ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4); | ||
| 297 | break; | ||
| 298 | case InputFormat::YUV422_Indiv16: | ||
| 299 | ReceiveData<2>(input_Y, cvt.src_Y, row_data_size); | ||
| 300 | ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2); | ||
| 301 | ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2); | ||
| 302 | break; | ||
| 303 | case InputFormat::YUV420_Indiv16: | ||
| 304 | ReceiveData<2>(input_Y, cvt.src_Y, row_data_size); | ||
| 305 | ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4); | ||
| 306 | ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4); | ||
| 307 | break; | ||
| 308 | case InputFormat::YUYV422_Interleaved: | ||
| 309 | input_U = nullptr; | ||
| 310 | input_V = nullptr; | ||
| 311 | ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2); | ||
| 312 | break; | ||
| 313 | } | ||
| 314 | |||
| 315 | // Note(yuriks): If additional optimization is required, input_format can be moved to a | ||
| 316 | // template parameter, so that its dispatch can be moved to outside the inner loop. | ||
| 317 | ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(), | ||
| 318 | cvt.input_line_width, row_height, cvt.coefficients); | ||
| 319 | |||
| 320 | u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get()); | ||
| 321 | |||
| 322 | for (int i = 0; i < num_tiles; ++i) { | ||
| 323 | int image_strip_width, output_stride; | ||
| 324 | |||
| 325 | switch (cvt.rotation) { | ||
| 326 | case Rotation::None: | ||
| 327 | RotateTile0(tiles[i], tmp_tile, row_height, tile_remap); | ||
| 328 | image_strip_width = cvt.input_line_width; | ||
| 329 | output_stride = 8; | ||
| 330 | break; | ||
| 331 | case Rotation::Clockwise_90: | ||
| 332 | RotateTile90(tiles[i], tmp_tile, row_height, tile_remap); | ||
| 333 | image_strip_width = 8; | ||
| 334 | output_stride = 8 * row_height; | ||
| 335 | break; | ||
| 336 | case Rotation::Clockwise_180: | ||
| 337 | // For 180 and 270 degree rotations we also invert the order of tiles in the strip, | ||
| 338 | // since the rotates are done individually on each tile. | ||
| 339 | RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap); | ||
| 340 | image_strip_width = cvt.input_line_width; | ||
| 341 | output_stride = 8; | ||
| 342 | break; | ||
| 343 | case Rotation::Clockwise_270: | ||
| 344 | RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap); | ||
| 345 | image_strip_width = 8; | ||
| 346 | output_stride = 8 * row_height; | ||
| 347 | break; | ||
| 348 | } | ||
| 349 | |||
| 350 | switch (cvt.block_alignment) { | ||
| 351 | case BlockAlignment::Linear: | ||
| 352 | WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width); | ||
| 353 | output_buffer += output_stride; | ||
| 354 | break; | ||
| 355 | case BlockAlignment::Block8x8: | ||
| 356 | WriteTileToOutput(output_buffer, tmp_tile, 8, 8); | ||
| 357 | output_buffer += TILE_SIZE; | ||
| 358 | break; | ||
| 359 | } | ||
| 360 | } | ||
| 361 | |||
| 362 | // Note(yuriks): If additional optimization is required, output_format can be moved to a | ||
| 363 | // template parameter, so that its dispatch can be moved to outside the inner loop. | ||
| 364 | SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size, cvt.output_format, (u8)cvt.alpha); | ||
| 365 | } | ||
| 366 | } | ||
| 367 | |||
| 368 | } | ||
| 369 | } | ||