diff options
Diffstat (limited to 'src/core/hw/y2r.cpp')
| -rw-r--r-- | src/core/hw/y2r.cpp | 382 |
1 files changed, 0 insertions, 382 deletions
diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp deleted file mode 100644 index e697f84b3..000000000 --- a/src/core/hw/y2r.cpp +++ /dev/null | |||
| @@ -1,382 +0,0 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <array> | ||
| 7 | #include <cstddef> | ||
| 8 | #include <memory> | ||
| 9 | #include "common/assert.h" | ||
| 10 | #include "common/color.h" | ||
| 11 | #include "common/common_types.h" | ||
| 12 | #include "common/math_util.h" | ||
| 13 | #include "common/vector_math.h" | ||
| 14 | #include "core/hle/service/y2r_u.h" | ||
| 15 | #include "core/hw/y2r.h" | ||
| 16 | #include "core/memory.h" | ||
| 17 | |||
| 18 | namespace HW { | ||
| 19 | namespace Y2R { | ||
| 20 | |||
| 21 | using namespace Service::Y2R; | ||
| 22 | |||
| 23 | static const size_t MAX_TILES = 1024 / 8; | ||
| 24 | static const size_t TILE_SIZE = 8 * 8; | ||
| 25 | using ImageTile = std::array<u32, TILE_SIZE>; | ||
| 26 | |||
| 27 | /// Converts a image strip from the source YUV format into individual 8x8 RGB32 tiles. | ||
| 28 | static void ConvertYUVToRGB(InputFormat input_format, const u8* input_Y, const u8* input_U, | ||
| 29 | const u8* input_V, ImageTile output[], unsigned int width, | ||
| 30 | unsigned int height, const CoefficientSet& coefficients) { | ||
| 31 | |||
| 32 | for (unsigned int y = 0; y < height; ++y) { | ||
| 33 | for (unsigned int x = 0; x < width; ++x) { | ||
| 34 | s32 Y = 0; | ||
| 35 | s32 U = 0; | ||
| 36 | s32 V = 0; | ||
| 37 | switch (input_format) { | ||
| 38 | case InputFormat::YUV422_Indiv8: | ||
| 39 | case InputFormat::YUV422_Indiv16: | ||
| 40 | Y = input_Y[y * width + x]; | ||
| 41 | U = input_U[(y * width + x) / 2]; | ||
| 42 | V = input_V[(y * width + x) / 2]; | ||
| 43 | break; | ||
| 44 | case InputFormat::YUV420_Indiv8: | ||
| 45 | case InputFormat::YUV420_Indiv16: | ||
| 46 | Y = input_Y[y * width + x]; | ||
| 47 | U = input_U[((y / 2) * width + x) / 2]; | ||
| 48 | V = input_V[((y / 2) * width + x) / 2]; | ||
| 49 | break; | ||
| 50 | case InputFormat::YUYV422_Interleaved: | ||
| 51 | Y = input_Y[(y * width + x) * 2]; | ||
| 52 | U = input_Y[(y * width + (x / 2) * 2) * 2 + 1]; | ||
| 53 | V = input_Y[(y * width + (x / 2) * 2) * 2 + 3]; | ||
| 54 | break; | ||
| 55 | } | ||
| 56 | |||
| 57 | // This conversion process is bit-exact with hardware, as far as could be tested. | ||
| 58 | auto& c = coefficients; | ||
| 59 | s32 cY = c[0] * Y; | ||
| 60 | |||
| 61 | s32 r = cY + c[1] * V; | ||
| 62 | s32 g = cY - c[2] * V - c[3] * U; | ||
| 63 | s32 b = cY + c[4] * U; | ||
| 64 | |||
| 65 | const s32 rounding_offset = 0x18; | ||
| 66 | r = (r >> 3) + c[5] + rounding_offset; | ||
| 67 | g = (g >> 3) + c[6] + rounding_offset; | ||
| 68 | b = (b >> 3) + c[7] + rounding_offset; | ||
| 69 | |||
| 70 | unsigned int tile = x / 8; | ||
| 71 | unsigned int tile_x = x % 8; | ||
| 72 | u32* out = &output[tile][y * 8 + tile_x]; | ||
| 73 | |||
| 74 | using MathUtil::Clamp; | ||
| 75 | *out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) | ((u32)Clamp(g >> 5, 0, 0xFF) << 16) | | ||
| 76 | ((u32)Clamp(b >> 5, 0, 0xFF) << 8); | ||
| 77 | } | ||
| 78 | } | ||
| 79 | } | ||
| 80 | |||
| 81 | /// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit | ||
| 82 | /// formats to 8-bit. | ||
| 83 | template <size_t N> | ||
| 84 | static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) { | ||
| 85 | const u8* input = Memory::GetPointer(buf.address); | ||
| 86 | |||
| 87 | size_t output_unit = buf.transfer_unit / N; | ||
| 88 | ASSERT(amount_of_data % output_unit == 0); | ||
| 89 | |||
| 90 | while (amount_of_data > 0) { | ||
| 91 | for (size_t i = 0; i < output_unit; ++i) { | ||
| 92 | output[i] = input[i * N]; | ||
| 93 | } | ||
| 94 | |||
| 95 | output += output_unit; | ||
| 96 | input += buf.transfer_unit + buf.gap; | ||
| 97 | |||
| 98 | buf.address += buf.transfer_unit + buf.gap; | ||
| 99 | buf.image_size -= buf.transfer_unit; | ||
| 100 | amount_of_data -= output_unit; | ||
| 101 | } | ||
| 102 | } | ||
| 103 | |||
| 104 | /// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA | ||
| 105 | /// transfer. | ||
| 106 | static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data, | ||
| 107 | OutputFormat output_format, u8 alpha) { | ||
| 108 | |||
| 109 | u8* output = Memory::GetPointer(buf.address); | ||
| 110 | |||
| 111 | while (amount_of_data > 0) { | ||
| 112 | u8* unit_end = output + buf.transfer_unit; | ||
| 113 | while (output < unit_end) { | ||
| 114 | u32 color = *input++; | ||
| 115 | Math::Vec4<u8> col_vec{(u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha}; | ||
| 116 | |||
| 117 | switch (output_format) { | ||
| 118 | case OutputFormat::RGBA8: | ||
| 119 | Color::EncodeRGBA8(col_vec, output); | ||
| 120 | output += 4; | ||
| 121 | break; | ||
| 122 | case OutputFormat::RGB8: | ||
| 123 | Color::EncodeRGB8(col_vec, output); | ||
| 124 | output += 3; | ||
| 125 | break; | ||
| 126 | case OutputFormat::RGB5A1: | ||
| 127 | Color::EncodeRGB5A1(col_vec, output); | ||
| 128 | output += 2; | ||
| 129 | break; | ||
| 130 | case OutputFormat::RGB565: | ||
| 131 | Color::EncodeRGB565(col_vec, output); | ||
| 132 | output += 2; | ||
| 133 | break; | ||
| 134 | } | ||
| 135 | |||
| 136 | amount_of_data -= 1; | ||
| 137 | } | ||
| 138 | |||
| 139 | output += buf.gap; | ||
| 140 | buf.address += buf.transfer_unit + buf.gap; | ||
| 141 | buf.image_size -= buf.transfer_unit; | ||
| 142 | } | ||
| 143 | } | ||
| 144 | |||
| 145 | static const u8 linear_lut[TILE_SIZE] = { | ||
| 146 | // clang-format off | ||
| 147 | 0, 1, 2, 3, 4, 5, 6, 7, | ||
| 148 | 8, 9, 10, 11, 12, 13, 14, 15, | ||
| 149 | 16, 17, 18, 19, 20, 21, 22, 23, | ||
| 150 | 24, 25, 26, 27, 28, 29, 30, 31, | ||
| 151 | 32, 33, 34, 35, 36, 37, 38, 39, | ||
| 152 | 40, 41, 42, 43, 44, 45, 46, 47, | ||
| 153 | 48, 49, 50, 51, 52, 53, 54, 55, | ||
| 154 | 56, 57, 58, 59, 60, 61, 62, 63, | ||
| 155 | // clang-format on | ||
| 156 | }; | ||
| 157 | |||
| 158 | static const u8 morton_lut[TILE_SIZE] = { | ||
| 159 | // clang-format off | ||
| 160 | 0, 1, 4, 5, 16, 17, 20, 21, | ||
| 161 | 2, 3, 6, 7, 18, 19, 22, 23, | ||
| 162 | 8, 9, 12, 13, 24, 25, 28, 29, | ||
| 163 | 10, 11, 14, 15, 26, 27, 30, 31, | ||
| 164 | 32, 33, 36, 37, 48, 49, 52, 53, | ||
| 165 | 34, 35, 38, 39, 50, 51, 54, 55, | ||
| 166 | 40, 41, 44, 45, 56, 57, 60, 61, | ||
| 167 | 42, 43, 46, 47, 58, 59, 62, 63, | ||
| 168 | // clang-format on | ||
| 169 | }; | ||
| 170 | |||
| 171 | static void RotateTile0(const ImageTile& input, ImageTile& output, int height, | ||
| 172 | const u8 out_map[64]) { | ||
| 173 | for (int i = 0; i < height * 8; ++i) { | ||
| 174 | output[out_map[i]] = input[i]; | ||
| 175 | } | ||
| 176 | } | ||
| 177 | |||
| 178 | static void RotateTile90(const ImageTile& input, ImageTile& output, int height, | ||
| 179 | const u8 out_map[64]) { | ||
| 180 | int out_i = 0; | ||
| 181 | for (int x = 0; x < 8; ++x) { | ||
| 182 | for (int y = height - 1; y >= 0; --y) { | ||
| 183 | output[out_map[out_i++]] = input[y * 8 + x]; | ||
| 184 | } | ||
| 185 | } | ||
| 186 | } | ||
| 187 | |||
| 188 | static void RotateTile180(const ImageTile& input, ImageTile& output, int height, | ||
| 189 | const u8 out_map[64]) { | ||
| 190 | int out_i = 0; | ||
| 191 | for (int i = height * 8 - 1; i >= 0; --i) { | ||
| 192 | output[out_map[out_i++]] = input[i]; | ||
| 193 | } | ||
| 194 | } | ||
| 195 | |||
| 196 | static void RotateTile270(const ImageTile& input, ImageTile& output, int height, | ||
| 197 | const u8 out_map[64]) { | ||
| 198 | int out_i = 0; | ||
| 199 | for (int x = 8 - 1; x >= 0; --x) { | ||
| 200 | for (int y = 0; y < height; ++y) { | ||
| 201 | output[out_map[out_i++]] = input[y * 8 + x]; | ||
| 202 | } | ||
| 203 | } | ||
| 204 | } | ||
| 205 | |||
| 206 | static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) { | ||
| 207 | for (int y = 0; y < height; ++y) { | ||
| 208 | for (int x = 0; x < 8; ++x) { | ||
| 209 | output[y * line_stride + x] = tile[y * 8 + x]; | ||
| 210 | } | ||
| 211 | } | ||
| 212 | } | ||
| 213 | |||
| 214 | /** | ||
| 215 | * Performs a Y2R colorspace conversion. | ||
| 216 | * | ||
| 217 | * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most | ||
| 218 | * commonly used for video playback or to display camera input to the screen. | ||
| 219 | * | ||
| 220 | * The conversion process is quite configurable, and can be divided in distinct steps. From | ||
| 221 | * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data | ||
| 222 | * internally and converts it in one go before writing to the output and loading the next strip. | ||
| 223 | * | ||
| 224 | * The steps taken to convert one strip of image data are: | ||
| 225 | * | ||
| 226 | * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is | ||
| 227 | * presumably stored in one or more internal buffers. This process can be done in several separate | ||
| 228 | * transfers, as long as they don't exceed the size of the internal image buffer. This allows | ||
| 229 | * flexibility in input strides. | ||
| 230 | * - The input data is decoded into a YUV tuple. Several formats are supported, see the `InputFormat` | ||
| 231 | * enum. | ||
| 232 | * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured | ||
| 233 | * using a set of coefficients to support different colorspace standards. See `CoefficientSet`. | ||
| 234 | * - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed | ||
| 235 | * independently, this notably rotates each *strip*, not the entire image. This means that for 90 | ||
| 236 | * or 270 degree rotations, the output will be in terms of several 8 x height images, and for any | ||
| 237 | * non-zero rotation the strips will have to be re-arranged so that the parts of the image will | ||
| 238 | * not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90 | ||
| 239 | * or 270 degree rotations in images with non-even height don't seem to work properly. | ||
| 240 | * - The data is converted to the output RGB format. See the `OutputFormat` enum. | ||
| 241 | * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by | ||
| 242 | * the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the | ||
| 243 | * image must have a height divisible by 8. The image width must always be divisible by 8. | ||
| 244 | * - The final data is then CDMAed out to main memory and the next image strip is processed. This | ||
| 245 | * offers the same flexibility as the input stage. | ||
| 246 | * | ||
| 247 | * In this implementation, to avoid the combinatorial explosion of parameter combinations, common | ||
| 248 | * intermediate formats are used and where possible tables or parameters are used instead of | ||
| 249 | * diverging code paths to keep the amount of branches in check. Some steps are also merged to | ||
| 250 | * increase efficiency. | ||
| 251 | * | ||
| 252 | * Output for all valid settings combinations matches hardware, however output in some edge-cases | ||
| 253 | * differs: | ||
| 254 | * | ||
| 255 | * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last | ||
| 256 | * strip, especially when combined with rotation. | ||
| 257 | * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation | ||
| 258 | * produces misaligned output on the last strip. This implementation produces output with the | ||
| 259 | * correct "expected" alignment. | ||
| 260 | * | ||
| 261 | * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases, | ||
| 262 | * so they are believed to be invalid configurations anyway. | ||
| 263 | */ | ||
| 264 | void PerformConversion(ConversionConfiguration& cvt) { | ||
| 265 | ASSERT(cvt.input_line_width % 8 == 0); | ||
| 266 | ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0); | ||
| 267 | // Tiles per row | ||
| 268 | size_t num_tiles = cvt.input_line_width / 8; | ||
| 269 | ASSERT(num_tiles <= MAX_TILES); | ||
| 270 | |||
| 271 | // Buffer used as a CDMA source/target. | ||
| 272 | std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]); | ||
| 273 | // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32. | ||
| 274 | std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]); | ||
| 275 | ImageTile tmp_tile; | ||
| 276 | |||
| 277 | // LUT used to remap writes to a tile. Used to allow linear or swizzled output without | ||
| 278 | // requiring two different code paths. | ||
| 279 | const u8* tile_remap = nullptr; | ||
| 280 | switch (cvt.block_alignment) { | ||
| 281 | case BlockAlignment::Linear: | ||
| 282 | tile_remap = linear_lut; | ||
| 283 | break; | ||
| 284 | case BlockAlignment::Block8x8: | ||
| 285 | tile_remap = morton_lut; | ||
| 286 | break; | ||
| 287 | } | ||
| 288 | |||
| 289 | for (unsigned int y = 0; y < cvt.input_lines; y += 8) { | ||
| 290 | unsigned int row_height = std::min(cvt.input_lines - y, 8u); | ||
| 291 | |||
| 292 | // Total size in pixels of incoming data required for this strip. | ||
| 293 | const size_t row_data_size = row_height * cvt.input_line_width; | ||
| 294 | |||
| 295 | u8* input_Y = data_buffer.get(); | ||
| 296 | u8* input_U = input_Y + 8 * cvt.input_line_width; | ||
| 297 | u8* input_V = input_U + 8 * cvt.input_line_width / 2; | ||
| 298 | |||
| 299 | switch (cvt.input_format) { | ||
| 300 | case InputFormat::YUV422_Indiv8: | ||
| 301 | ReceiveData<1>(input_Y, cvt.src_Y, row_data_size); | ||
| 302 | ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2); | ||
| 303 | ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2); | ||
| 304 | break; | ||
| 305 | case InputFormat::YUV420_Indiv8: | ||
| 306 | ReceiveData<1>(input_Y, cvt.src_Y, row_data_size); | ||
| 307 | ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4); | ||
| 308 | ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4); | ||
| 309 | break; | ||
| 310 | case InputFormat::YUV422_Indiv16: | ||
| 311 | ReceiveData<2>(input_Y, cvt.src_Y, row_data_size); | ||
| 312 | ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2); | ||
| 313 | ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2); | ||
| 314 | break; | ||
| 315 | case InputFormat::YUV420_Indiv16: | ||
| 316 | ReceiveData<2>(input_Y, cvt.src_Y, row_data_size); | ||
| 317 | ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4); | ||
| 318 | ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4); | ||
| 319 | break; | ||
| 320 | case InputFormat::YUYV422_Interleaved: | ||
| 321 | input_U = nullptr; | ||
| 322 | input_V = nullptr; | ||
| 323 | ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2); | ||
| 324 | break; | ||
| 325 | } | ||
| 326 | |||
| 327 | // Note(yuriks): If additional optimization is required, input_format can be moved to a | ||
| 328 | // template parameter, so that its dispatch can be moved to outside the inner loop. | ||
| 329 | ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(), | ||
| 330 | cvt.input_line_width, row_height, cvt.coefficients); | ||
| 331 | |||
| 332 | u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get()); | ||
| 333 | |||
| 334 | for (size_t i = 0; i < num_tiles; ++i) { | ||
| 335 | int image_strip_width = 0; | ||
| 336 | int output_stride = 0; | ||
| 337 | |||
| 338 | switch (cvt.rotation) { | ||
| 339 | case Rotation::None: | ||
| 340 | RotateTile0(tiles[i], tmp_tile, row_height, tile_remap); | ||
| 341 | image_strip_width = cvt.input_line_width; | ||
| 342 | output_stride = 8; | ||
| 343 | break; | ||
| 344 | case Rotation::Clockwise_90: | ||
| 345 | RotateTile90(tiles[i], tmp_tile, row_height, tile_remap); | ||
| 346 | image_strip_width = 8; | ||
| 347 | output_stride = 8 * row_height; | ||
| 348 | break; | ||
| 349 | case Rotation::Clockwise_180: | ||
| 350 | // For 180 and 270 degree rotations we also invert the order of tiles in the strip, | ||
| 351 | // since the rotates are done individually on each tile. | ||
| 352 | RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap); | ||
| 353 | image_strip_width = cvt.input_line_width; | ||
| 354 | output_stride = 8; | ||
| 355 | break; | ||
| 356 | case Rotation::Clockwise_270: | ||
| 357 | RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap); | ||
| 358 | image_strip_width = 8; | ||
| 359 | output_stride = 8 * row_height; | ||
| 360 | break; | ||
| 361 | } | ||
| 362 | |||
| 363 | switch (cvt.block_alignment) { | ||
| 364 | case BlockAlignment::Linear: | ||
| 365 | WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width); | ||
| 366 | output_buffer += output_stride; | ||
| 367 | break; | ||
| 368 | case BlockAlignment::Block8x8: | ||
| 369 | WriteTileToOutput(output_buffer, tmp_tile, 8, 8); | ||
| 370 | output_buffer += TILE_SIZE; | ||
| 371 | break; | ||
| 372 | } | ||
| 373 | } | ||
| 374 | |||
| 375 | // Note(yuriks): If additional optimization is required, output_format can be moved to a | ||
| 376 | // template parameter, so that its dispatch can be moved to outside the inner loop. | ||
| 377 | SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size, | ||
| 378 | cvt.output_format, (u8)cvt.alpha); | ||
| 379 | } | ||
| 380 | } | ||
| 381 | } | ||
| 382 | } | ||