1 files changed, 369 insertions, 0 deletions
diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp
new file mode 100644
index 000000000..5b7fb39e1
--- /dev/null
+++ b/src/core/hw/y2r.cpp
@@ -0,0 +1,369 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+#include <algorithm>
+#include <array>
+#include <memory>
+#include <numeric>
+#include "common/assert.h"
+#include "common/color.h"
+#include "common/common_types.h"
+#include "common/math_util.h"
+#include "common/vector_math.h"
+#include "core/hle/service/y2r_u.h"
+#include "core/memory.h"
+namespace HW {
+namespace Y2R {
+using namespace Y2R_U;
+/// Upper bound on the number of 8-pixel-wide tiles in one image strip
+/// (presumably 1024 is the widest supported line, in pixels).
+static constexpr size_t MAX_TILES = 1024 / 8;
+/// Number of pixels held by a single 8x8 tile.
+static constexpr size_t TILE_SIZE = 8 * 8;
+/// Storage for one 8x8 tile, one 32-bit value per pixel, indexed as y * 8 + x.
+using ImageTile = std::array<u32, TILE_SIZE>;
+/**
+ * Converts an image strip from the source YUV format into individual 8x8 RGB32 tiles.
+ *
+ * @param input_format Layout of the source data; selects how Y, U and V are fetched.
+ * @param input_Y Luma samples for the planar formats. For `YUYV422_Interleaved` this is the whole
+ *                interleaved stream and `input_U` / `input_V` are not read.
+ * @param input_U U chroma samples (planar formats only).
+ * @param input_V V chroma samples (planar formats only).
+ * @param output Array of tiles to fill; pixel (x, y) is stored at `output[x / 8][y * 8 + x % 8]`.
+ * @param width Width of the strip in pixels.
+ * @param height Height of the strip in pixels.
+ * @param coefficients Fixed-point conversion coefficients: elements 0-4 are multipliers, elements
+ *                     5-7 are the per-channel offsets added after the first shift.
+ *
+ * Each output pixel is packed as `R << 24 | G << 16 | B << 8`; the low byte is left at zero.
+ */
+static void ConvertYUVToRGB(InputFormat input_format,
+        const u8* input_Y, const u8* input_U, const u8* input_V, ImageTile output[],
+        unsigned int width, unsigned int height, const CoefficientSet& coefficients) {
+    for (unsigned int y = 0; y < height; ++y) {
+        for (unsigned int x = 0; x < width; ++x) {
+            // Zero-initialised so that an input_format value matching none of the cases below
+            // cannot make the arithmetic read indeterminate values.
+            s32 Y = 0;
+            s32 U = 0;
+            s32 V = 0;
+            switch (input_format) {
+            case InputFormat::YUV422_Indiv8:
+            case InputFormat::YUV422_Indiv16:
+                // One chroma sample per two horizontally adjacent pixels.
+                Y = input_Y[y * width + x];
+                U = input_U[(y * width + x) / 2];
+                V = input_V[(y * width + x) / 2];
+                break;
+            case InputFormat::YUV420_Indiv8:
+            case InputFormat::YUV420_Indiv16:
+                // Chroma is additionally shared between two consecutive lines (y / 2).
+                Y = input_Y[y * width + x];
+                U = input_U[((y / 2) * width + x) / 2];
+                V = input_V[((y / 2) * width + x) / 2];
+                break;
+            case InputFormat::YUYV422_Interleaved:
+                // Everything lives in input_Y: two bytes per pixel, with U at offset 1 and V at
+                // offset 3 of each 4-byte pixel pair.
+                Y = input_Y[(y * width + x) * 2];
+                U = input_Y[(y * width + (x / 2) * 2) * 2 + 1];
+                V = input_Y[(y * width + (x / 2) * 2) * 2 + 3];
+                break;
+            }
+            // This conversion process is bit-exact with hardware, as far as could be tested.
+            auto& c = coefficients;
+            s32 cY = c[0]*Y;
+            s32 r = cY          + c[1]*V;
+            s32 g = cY - c[3]*U - c[2]*V;
+            s32 b = cY + c[4]*U;
+            const s32 rounding_offset = 0x18;
+            r = (r >> 3) + c[5] + rounding_offset;
+            g = (g >> 3) + c[6] + rounding_offset;
+            b = (b >> 3) + c[7] + rounding_offset;
+            // Locate the destination pixel: which tile of the strip, and which column inside it.
+            unsigned int tile = x / 8;
+            unsigned int tile_x = x % 8;
+            u32* out = &output[tile][y * 8 + tile_x];
+            using MathUtil::Clamp;
+            *out = (static_cast<u32>(Clamp(r >> 5, 0, 0xFF)) << 24) |
+                   (static_cast<u32>(Clamp(g >> 5, 0, 0xFF)) << 16) |
+                   (static_cast<u32>(Clamp(b >> 5, 0, 0xFF)) << 8);
+        }
+    }
+}
+/**
+ * Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit
+ * formats to 8-bit: only every N-th source byte is copied.
+ *
+ * @param output Destination for the gathered bytes; advanced internally by one output unit per
+ *               transfer unit.
+ * @param buf Source descriptor. `address` is advanced by `transfer_unit + gap` and `image_size` is
+ *            reduced by `transfer_unit` for every unit consumed.
+ * @param amount_of_data Number of output bytes to produce; must be a multiple of
+ *                       `transfer_unit / N`.
+ */
+template <size_t N>
+static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) {
+    const u8* input = Memory::GetPointer(buf.address);
+    // Guard against reading through a null pointer should the address not resolve.
+    ASSERT(input != nullptr || amount_of_data == 0);
+    const size_t output_unit = buf.transfer_unit / N;
+    // output_unit is both the divisor below and the loop decrement: a value of zero (transfer
+    // unit smaller than N) would divide by zero and never terminate.
+    ASSERT(output_unit != 0);
+    ASSERT(amount_of_data % output_unit == 0);
+    // Distance in source bytes between the starts of two consecutive transfer units.
+    const auto source_stride = buf.transfer_unit + buf.gap;
+    while (amount_of_data > 0) {
+        for (size_t i = 0; i < output_unit; ++i) {
+            output[i] = input[i * N];
+        }
+        output += output_unit;
+        input += source_stride;
+        buf.address += source_stride;
+        buf.image_size -= buf.transfer_unit;
+        amount_of_data -= output_unit;
+    }
+}
+/**
+ * Converts intermediate RGB32 data to the final output format while simulating an outgoing CDMA
+ * transfer.
+ *
+ * @param input Source pixels, packed as `R << 24 | G << 16 | B << 8`.
+ * @param buf Destination descriptor. `address` is advanced by `transfer_unit + gap` and
+ *            `image_size` is reduced by `transfer_unit` for every unit written.
+ * @param amount_of_data Number of pixels to send.
+ * @param output_format Encoding used for each written pixel.
+ * @param alpha Alpha value attached to every pixel before encoding.
+ */
+static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data,
+        OutputFormat output_format, u8 alpha) {
+    u8* output = Memory::GetPointer(buf.address);
+    // Guard against writing through a null pointer should the address not resolve.
+    ASSERT(output != nullptr || amount_of_data <= 0);
+    // With an empty transfer unit the inner loop never runs, so no pixel would ever be consumed
+    // and the outer loop would spin forever.
+    ASSERT(buf.transfer_unit != 0);
+    while (amount_of_data > 0) {
+        u8* unit_end = output + buf.transfer_unit;
+        while (output < unit_end) {
+            u32 color = *input++;
+            // Explicit casts: the shifted values are u32 and would otherwise be narrowed inside a
+            // braced initializer.
+            Math::Vec4<u8> col_vec{
+                static_cast<u8>(color >> 24), static_cast<u8>(color >> 16),
+                static_cast<u8>(color >> 8), alpha,
+            };
+            switch (output_format) {
+            case OutputFormat::RGBA8:
+                Color::EncodeRGBA8(col_vec, output);
+                output += 4;
+                break;
+            case OutputFormat::RGB8:
+                Color::EncodeRGB8(col_vec, output);
+                output += 3;
+                break;
+            case OutputFormat::RGB5A1:
+                Color::EncodeRGB5A1(col_vec, output);
+                output += 2;
+                break;
+            case OutputFormat::RGB565:
+                Color::EncodeRGB565(col_vec, output);
+                output += 2;
+                break;
+            default:
+                // Nothing was written, so `output` cannot advance towards unit_end; stop here
+                // instead of looping forever.
+                ASSERT(false);
+                return;
+            }
+            amount_of_data -= 1;
+        }
+        output += buf.gap;
+        buf.address += buf.transfer_unit + buf.gap;
+        buf.image_size -= buf.transfer_unit;
+    }
+}
+/// Identity remap table: entry i is i, so pixels written through it keep their row-major
+/// (y * 8 + x) position inside the 8x8 tile.
+static const u8 linear_lut[64] = {
+     0,  1,  2,  3,  4,  5,  6,  7,
+     8,  9, 10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23,
+    24, 25, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 35, 36, 37, 38, 39,
+    40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 50, 51, 52, 53, 54, 55,
+    56, 57, 58, 59, 60, 61, 62, 63,
+};
+/// Swizzle remap table: entry (y * 8 + x) is the position of pixel (x, y) inside an 8x8 tile
+/// stored in Morton (Z-order) layout. The bits of x land in the even bit positions of the result
+/// and the bits of y in the odd ones.
+static const u8 morton_lut[64] = {
+     0,  1,  4,  5, 16, 17, 20, 21,
+     2,  3,  6,  7, 18, 19, 22, 23,
+     8,  9, 12, 13, 24, 25, 28, 29,
+    10, 11, 14, 15, 26, 27, 30, 31,
+    32, 33, 36, 37, 48, 49, 52, 53,
+    34, 35, 38, 39, 50, 51, 54, 55,
+    40, 41, 44, 45, 56, 57, 60, 61,
+    42, 43, 46, 47, 58, 59, 62, 63,
+};
+/// Copies the first `height` rows of `input` into `output` without rotating them. The destination
+/// index of every pixel is looked up in `out_map`.
+static void RotateTile0(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
+    const int pixel_count = height * 8;
+    int src = 0;
+    while (src < pixel_count) {
+        output[out_map[src]] = input[src];
+        ++src;
+    }
+}
+/// Rotates the first `height` rows of `input` by 90 degrees: source columns are visited left to
+/// right, each one from its bottom row up to the top row, and the pixels are emitted in that
+/// order through `out_map`.
+static void RotateTile90(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
+    int dst = 0;
+    for (int column = 0; column != 8; ++column) {
+        for (int row = height; row-- > 0;) {
+            output[out_map[dst]] = input[row * 8 + column];
+            ++dst;
+        }
+    }
+}
+/// Rotates the first `height` rows of `input` by 180 degrees: the source pixels are read from
+/// last to first and emitted in that order through `out_map`.
+static void RotateTile180(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
+    const int pixel_count = height * 8;
+    for (int dst = 0; dst < pixel_count; ++dst) {
+        const int src = pixel_count - 1 - dst;
+        output[out_map[dst]] = input[src];
+    }
+}
+/// Rotates the first `height` rows of `input` by 270 degrees: source columns are visited right
+/// to left, each one from its top row down, and the pixels are emitted in that order through
+/// `out_map`.
+static void RotateTile270(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
+    int dst = 0;
+    for (int column = 7; column >= 0; --column) {
+        for (int row = 0; row < height; ++row, ++dst) {
+            output[out_map[dst]] = input[row * 8 + column];
+        }
+    }
+}
+/// Copies the first `height` rows of an 8-pixel-wide tile into `output`, where consecutive rows
+/// of the destination are `line_stride` elements apart.
+static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) {
+    for (int row = 0; row < height; ++row) {
+        u32* const dst_line = output + row * line_stride;
+        const int src_base = row * 8;
+        for (int col = 0; col < 8; ++col) {
+            dst_line[col] = tile[src_base + col];
+        }
+    }
+}
+/**
+ * Performs a Y2R colorspace conversion.
+ *
+ * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most
+ * commonly used for video playback or to display camera input to the screen.
+ *
+ * The conversion process is quite configurable, and can be divided in distinct steps. From
+ * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data
+ * internally and converts it in one go before writing to the output and loading the next strip.
+ *
+ * The steps taken to convert one strip of image data are:
+ *
+ * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is
+ *   presumably stored in one or more internal buffers. This process can be done in several separate
+ *   transfers, as long as they don't exceed the size of the internal image buffer. This allows
+ *   flexibility in input strides.
+ * - The input data is decoded into a YUV tuple. Several formats are supported, see the
+ *   `InputFormat` enum.
+ * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured
+ *   using a set of coefficients to support different colorspace standards. See `CoefficientSet`.
+ * - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed
+ *   independently, this notably rotates each *strip*, not the entire image. This means that for 90
+ *   or 270 degree rotations, the output will be in terms of several 8 x height images, and for any
+ *   non-zero rotation the strips will have to be re-arranged so that the parts of the image will
+ *   not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90
+ *   or 270 degree rotations in images with non-even height don't seem to work properly.
+ * - The data is converted to the output RGB format. See the `OutputFormat` enum.
+ * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by
+ *   the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the
+ *   image must have a height divisible by 8. The image width must always be divisible by 8.
+ * - The final data is then CDMAed out to main memory and the next image strip is processed. This
+ *   offers the same flexibility as the input stage.
+ *
+ * In this implementation, to avoid the combinatorial explosion of parameter combinations, common
+ * intermediate formats are used and where possible tables or parameters are used instead of
+ * diverging code paths to keep the amount of branches in check. Some steps are also merged to
+ * increase efficiency.
+ *
+ * Output for all valid settings combinations matches hardware, however output in some edge-cases
+ * differs:
+ *
+ * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last
+ *   strip, especially when combined with rotation.
+ * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation
+ *   produces misaligned output on the last strip. This implementation produces output with the
+ *   correct "expected" alignment.
+ *
+ * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases,
+ * so they are believed to be invalid configurations anyway.
+ *
+ * @param cvt Conversion parameters. The source and destination buffer descriptors it contains are
+ *            advanced as data is consumed and produced.
+ */
+void PerformConversion(ConversionConfiguration& cvt) {
+    ASSERT(cvt.input_line_width % 8 == 0);
+    ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0);
+    // Tiles per row. The bound is inclusive: a line made of exactly MAX_TILES tiles is the widest
+    // one accepted.
+    const size_t num_tiles = cvt.input_line_width / 8;
+    ASSERT(num_tiles <= MAX_TILES);
+    // Buffer used as a CDMA source/target. Sized for one full 8-line strip at 4 bytes per pixel.
+    std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]);
+    // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32.
+    std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]);
+    ImageTile tmp_tile;
+    // LUT used to remap writes to a tile. Used to allow linear or swizzled output without
+    // requiring two different code paths.
+    const u8* tile_remap = nullptr;
+    switch (cvt.block_alignment) {
+    case BlockAlignment::Linear:
+        tile_remap = linear_lut;
+        break;
+    case BlockAlignment::Block8x8:
+        tile_remap = morton_lut;
+        break;
+    }
+    // An alignment value matching neither case would otherwise leave the table pointer unset.
+    ASSERT(tile_remap != nullptr);
+    for (unsigned int y = 0; y < cvt.input_lines; y += 8) {
+        // The last strip may be shorter than 8 lines.
+        unsigned int row_height = std::min(cvt.input_lines - y, 8u);
+        // Total size in pixels of incoming data required for this strip.
+        const size_t row_data_size = row_height * cvt.input_line_width;
+        // The three planes are laid out back to back inside data_buffer.
+        u8* input_Y = data_buffer.get();
+        u8* input_U = input_Y + 8 * cvt.input_line_width;
+        u8* input_V = input_U + 8 * cvt.input_line_width / 2;
+        switch (cvt.input_format) {
+        case InputFormat::YUV422_Indiv8:
+            ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
+            ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2);
+            ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2);
+            break;
+        case InputFormat::YUV420_Indiv8:
+            ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
+            ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4);
+            ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4);
+            break;
+        case InputFormat::YUV422_Indiv16:
+            ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
+            ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2);
+            ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2);
+            break;
+        case InputFormat::YUV420_Indiv16:
+            ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
+            ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4);
+            ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4);
+            break;
+        case InputFormat::YUYV422_Interleaved:
+            // All components arrive in a single stream of two bytes per pixel.
+            input_U = nullptr;
+            input_V = nullptr;
+            ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2);
+            break;
+        }
+        // Note(yuriks): If additional optimization is required, input_format can be moved to a
+        // template parameter, so that its dispatch can be moved to outside the inner loop.
+        ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(),
+                cvt.input_line_width, row_height, cvt.coefficients);
+        // The input has been fully decoded into `tiles`, so data_buffer is reused for the output.
+        u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get());
+        for (size_t i = 0; i < num_tiles; ++i) {
+            // Initialised so that a rotation value matching no case cannot leave them unset.
+            int image_strip_width = 0;
+            int output_stride = 0;
+            switch (cvt.rotation) {
+            case Rotation::None:
+                RotateTile0(tiles[i], tmp_tile, row_height, tile_remap);
+                image_strip_width = cvt.input_line_width;
+                output_stride = 8;
+                break;
+            case Rotation::Clockwise_90:
+                RotateTile90(tiles[i], tmp_tile, row_height, tile_remap);
+                image_strip_width = 8;
+                output_stride = 8 * row_height;
+                break;
+            case Rotation::Clockwise_180:
+                // For 180 and 270 degree rotations we also invert the order of tiles in the strip,
+                // since the rotates are done individually on each tile.
+                RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
+                image_strip_width = cvt.input_line_width;
+                output_stride = 8;
+                break;
+            case Rotation::Clockwise_270:
+                RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
+                image_strip_width = 8;
+                output_stride = 8 * row_height;
+                break;
+            }
+            switch (cvt.block_alignment) {
+            case BlockAlignment::Linear:
+                WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width);
+                output_buffer += output_stride;
+                break;
+            case BlockAlignment::Block8x8:
+                // Swizzled tiles are stored whole, one after another.
+                WriteTileToOutput(output_buffer, tmp_tile, 8, 8);
+                output_buffer += TILE_SIZE;
+                break;
+            }
+        }
+        // Note(yuriks): If additional optimization is required, output_format can be moved to a
+        // template parameter, so that its dispatch can be moved to outside the inner loop.
+        SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst,
+                static_cast<int>(row_data_size), cvt.output_format, static_cast<u8>(cvt.alpha));
+    }
+}
+}
+}

diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp new file mode 100644 index 000000000..5b7fb39e1 --- /dev/null +++ b/src/core/hw/y2r.cpp
@@ -0,0 +1,369 @@
	1	// Copyright 2015 Citra Emulator Project
	2	// Licensed under GPLv2 or any later version
	3	// Refer to the license.txt file included.
	4
	5	#include <array>
	6	#include <numeric>
	7
	8	#include "common/assert.h"
	9	#include "common/color.h"
	10	#include "common/common_types.h"
	11	#include "common/math_util.h"
	12	#include "common/vector_math.h"
	13
	14	#include "core/hle/service/y2r_u.h"
	15	#include "core/memory.h"
	16
	17	namespace HW {
	18	namespace Y2R {
	19
	20	using namespace Y2R_U;
	21
	22	static const size_t MAX_TILES = 1024 / 8;
	23	static const size_t TILE_SIZE = 8 * 8;
	24	using ImageTile = std::array<u32, TILE_SIZE>;
	25
	26	/// Converts a image strip from the source YUV format into individual 8x8 RGB32 tiles.
	27	static void ConvertYUVToRGB(InputFormat input_format,
	28	const u8* input_Y, const u8* input_U, const u8* input_V, ImageTile output[],
	29	unsigned int width, unsigned int height, const CoefficientSet& coefficients) {
	30
	31	for (unsigned int y = 0; y < height; ++y) {
	32	for (unsigned int x = 0; x < width; ++x) {
	33	s32 Y, U, V;
	34	switch (input_format) {
	35	case InputFormat::YUV422_Indiv8:
	36	case InputFormat::YUV422_Indiv16:
	37	Y = input_Y[y * width + x];
	38	U = input_U[(y * width + x) / 2];
	39	V = input_V[(y * width + x) / 2];
	40	break;
	41	case InputFormat::YUV420_Indiv8:
	42	case InputFormat::YUV420_Indiv16:
	43	Y = input_Y[y * width + x];
	44	U = input_U[((y / 2) * width + x) / 2];
	45	V = input_V[((y / 2) * width + x) / 2];
	46	break;
	47	case InputFormat::YUYV422_Interleaved:
	48	Y = input_Y[(y * width + x) * 2];
	49	U = input_Y[(y * width + (x / 2) * 2) * 2 + 1];
	50	V = input_Y[(y * width + (x / 2) * 2) * 2 + 3];
	51	break;
	52	}
	53
	54	// This conversion process is bit-exact with hardware, as far as could be tested.
	55	auto& c = coefficients;
	56	s32 cY = c[0]*Y;
	57
	58	s32 r = cY + c[1]*V;
	59	s32 g = cY - c[3]*U - c[2]*V;
	60	s32 b = cY + c[4]*U;
	61
	62	const s32 rounding_offset = 0x18;
	63	r = (r >> 3) + c[5] + rounding_offset;
	64	g = (g >> 3) + c[6] + rounding_offset;
	65	b = (b >> 3) + c[7] + rounding_offset;
	66
	67	unsigned int tile = x / 8;
	68	unsigned int tile_x = x % 8;
	69	u32* out = &output[tile][y * 8 + tile_x];
	70
	71	using MathUtil::Clamp;
	72	*out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) |
	73	((u32)Clamp(g >> 5, 0, 0xFF) << 16) |
	74	((u32)Clamp(b >> 5, 0, 0xFF) << 8);
	75	}
	76	}
	77	}
	78
	79	/// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit formats to 8-bit.
	80	template <size_t N>
	81	static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) {
	82	const u8* input = Memory::GetPointer(buf.address);
	83
	84	size_t output_unit = buf.transfer_unit / N;
	85	ASSERT(amount_of_data % output_unit == 0);
	86
	87	while (amount_of_data > 0) {
	88	for (size_t i = 0; i < output_unit; ++i) {
	89	output[i] = input[i * N];
	90	}
	91
	92	output += output_unit;
	93	input += buf.transfer_unit + buf.gap;
	94
	95	buf.address += buf.transfer_unit + buf.gap;
	96	buf.image_size -= buf.transfer_unit;
	97	amount_of_data -= output_unit;
	98	}
	99	}
	100
	101	/// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA transfer.
	102	static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data,
	103	OutputFormat output_format, u8 alpha) {
	104
	105	u8* output = Memory::GetPointer(buf.address);
	106
	107	while (amount_of_data > 0) {
	108	u8* unit_end = output + buf.transfer_unit;
	109	while (output < unit_end) {
	110	u32 color = *input++;
	111	Math::Vec4<u8> col_vec{
	112	(color >> 24) & 0xFF, (color >> 16) & 0xFF, (color >> 8) & 0xFF, alpha,
	113	};
	114
	115	switch (output_format) {
	116	case OutputFormat::RGBA8:
	117	Color::EncodeRGBA8(col_vec, output);
	118	output += 4;
	119	break;
	120	case OutputFormat::RGB8:
	121	Color::EncodeRGB8(col_vec, output);
	122	output += 3;
	123	break;
	124	case OutputFormat::RGB5A1:
	125	Color::EncodeRGB5A1(col_vec, output);
	126	output += 2;
	127	break;
	128	case OutputFormat::RGB565:
	129	Color::EncodeRGB565(col_vec, output);
	130	output += 2;
	131	break;
	132	}
	133
	134	amount_of_data -= 1;
	135	}
	136
	137	output += buf.gap;
	138	buf.address += buf.transfer_unit + buf.gap;
	139	buf.image_size -= buf.transfer_unit;
	140	}
	141	}
	142
	143	static const u8 linear_lut[64] = {
	144	0, 1, 2, 3, 4, 5, 6, 7,
	145	8, 9, 10, 11, 12, 13, 14, 15,
	146	16, 17, 18, 19, 20, 21, 22, 23,
	147	24, 25, 26, 27, 28, 29, 30, 31,
	148	32, 33, 34, 35, 36, 37, 38, 39,
	149	40, 41, 42, 43, 44, 45, 46, 47,
	150	48, 49, 50, 51, 52, 53, 54, 55,
	151	56, 57, 58, 59, 60, 61, 62, 63,
	152	};
	153
	154	static const u8 morton_lut[64] = {
	155	0, 1, 4, 5, 16, 17, 20, 21,
	156	2, 3, 6, 7, 18, 19, 22, 23,
	157	8, 9, 12, 13, 24, 25, 28, 29,
	158	10, 11, 14, 15, 26, 27, 30, 31,
	159	32, 33, 36, 37, 48, 49, 52, 53,
	160	34, 35, 38, 39, 50, 51, 54, 55,
	161	40, 41, 44, 45, 56, 57, 60, 61,
	162	42, 43, 46, 47, 58, 59, 62, 63,
	163	};
	164
	165	static void RotateTile0(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
	166	for (int i = 0; i < height * 8; ++i) {
	167	output[out_map[i]] = input[i];
	168	}
	169	}
	170
	171	static void RotateTile90(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
	172	int out_i = 0;
	173	for (int x = 0; x < 8; ++x) {
	174	for (int y = height - 1; y >= 0; --y) {
	175	output[out_map[out_i++]] = input[y * 8 + x];
	176	}
	177	}
	178	}
	179
	180	static void RotateTile180(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
	181	int out_i = 0;
	182	for (int i = height * 8 - 1; i >= 0; --i) {
	183	output[out_map[out_i++]] = input[i];
	184	}
	185	}
	186
	187	static void RotateTile270(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
	188	int out_i = 0;
	189	for (int x = 8-1; x >= 0; --x) {
	190	for (int y = 0; y < height; ++y) {
	191	output[out_map[out_i++]] = input[y * 8 + x];
	192	}
	193	}
	194	}
	195
	196	static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) {
	197	for (int y = 0; y < height; ++y) {
	198	for (int x = 0; x < 8; ++x) {
	199	output[y * line_stride + x] = tile[y * 8 + x];
	200	}
	201	}
	202	}
	203
	204	/**
	205	* Performs a Y2R colorspace conversion.
	206	*
	207	* The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most
	208	* commonly used for video playback or to display camera input to the screen.
	209	*
	210	* The conversion process is quite configurable, and can be divided in distinct steps. From
	211	* observation, it appears that the hardware buffers a single 8-pixel tall strip of image data
	212	* internally and converts it in one go before writing to the output and loading the next strip.
	213	*
	214	* The steps taken to convert one strip of image data are:
	215	*
	216	* - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is
	217	* presumably stored in one or more internal buffers. This process can be done in several separate
	218	* transfers, as long as they don't exceed the size of the internal image buffer. This allows
	219	* flexibility in input strides.
	220	* - The input data is decoded into a YUV tuple. Several formats are supported, see the `InputFormat`
	221	* enum.
	222	* - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured
	223	* using a set of coefficients to support different colorspace standards. See `CoefficientSet`.
	224	* - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed
	225	* independently, this notably rotates each *strip*, not the entire image. This means that for 90
	226	* or 270 degree rotations, the output will be in terms of several 8 x height images, and for any
	227	* non-zero rotation the strips will have to be re-arranged so that the parts of the image will
	228	* not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90
	229	* or 270 degree rotations in images with non-even height don't seem to work properly.
	230	* - The data is converted to the output RGB format. See the `OutputFormat` enum.
	231	* - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by
	232	* the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the
	233	* image must have a height divisible by 8. The image width must always be divisible by 8.
	234	* - The final data is then CDMAed out to main memory and the next image strip is processed. This
	235	* offers the same flexibility as the input stage.
	236	*
	237	* In this implementation, to avoid the combinatorial explosion of parameter combinations, common
	238	* intermediate formats are used and where possible tables or parameters are used instead of
	239	* diverging code paths to keep the amount of branches in check. Some steps are also merged to
	240	* increase efficiency.
	241	*
	242	* Output for all valid settings combinations matches hardware, however output in some edge-cases
	243	* differs:
	244	*
	245	* - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last
	246	* strip, especially when combined with rotation.
	247	* - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation
	248	* produces misaligned output on the last strip. This implementation produces output with the
	249	* correct "expected" alignment.
	250	*
	251	* Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases,
	252	* so they are believed to be invalid configurations anyway.
	253	*/
	254	void PerformConversion(ConversionConfiguration& cvt) {
	255	ASSERT(cvt.input_line_width % 8 == 0);
	256	ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0);
	257	// Tiles per row
	258	size_t num_tiles = cvt.input_line_width / 8;
	259	ASSERT(num_tiles < MAX_TILES);
	260
	261	// Buffer used as a CDMA source/target.
	262	std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]);
	263	// Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32.
	264	std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]);
	265	ImageTile tmp_tile;
	266
	267	// LUT used to remap writes to a tile. Used to allow linear or swizzled output without
	268	// requiring two different code paths.
	269	const u8* tile_remap;
	270	switch (cvt.block_alignment) {
	271	case BlockAlignment::Linear:
	272	tile_remap = linear_lut; break;
	273	case BlockAlignment::Block8x8:
	274	tile_remap = morton_lut; break;
	275	}
	276
	277	for (unsigned int y = 0; y < cvt.input_lines; y += 8) {
	278	unsigned int row_height = std::min(cvt.input_lines - y, 8u);
	279
	280	// Total size in pixels of incoming data required for this strip.
	281	const size_t row_data_size = row_height * cvt.input_line_width;
	282
	283	u8* input_Y = data_buffer.get();
	284	u8* input_U = input_Y + 8 * cvt.input_line_width;
	285	u8* input_V = input_U + 8 * cvt.input_line_width / 2;
	286
	287	switch (cvt.input_format) {
	288	case InputFormat::YUV422_Indiv8:
	289	ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
	290	ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2);
	291	ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2);
	292	break;
	293	case InputFormat::YUV420_Indiv8:
	294	ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
	295	ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4);
	296	ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4);
	297	break;
	298	case InputFormat::YUV422_Indiv16:
	299	ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
	300	ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2);
	301	ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2);
	302	break;
	303	case InputFormat::YUV420_Indiv16:
	304	ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
	305	ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4);
	306	ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4);
	307	break;
	308	case InputFormat::YUYV422_Interleaved:
	309	input_U = nullptr;
	310	input_V = nullptr;
	311	ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2);
	312	break;
	313	}
	314
	315	// Note(yuriks): If additional optimization is required, input_format can be moved to a
	316	// template parameter, so that its dispatch can be moved to outside the inner loop.
	317	ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(),
	318	cvt.input_line_width, row_height, cvt.coefficients);
	319
	320	u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get());
	321
	322	for (int i = 0; i < num_tiles; ++i) {
	323	int image_strip_width, output_stride;
	324
	325	switch (cvt.rotation) {
	326	case Rotation::None:
	327	RotateTile0(tiles[i], tmp_tile, row_height, tile_remap);
	328	image_strip_width = cvt.input_line_width;
	329	output_stride = 8;
	330	break;
	331	case Rotation::Clockwise_90:
	332	RotateTile90(tiles[i], tmp_tile, row_height, tile_remap);
	333	image_strip_width = 8;
	334	output_stride = 8 * row_height;
	335	break;
	336	case Rotation::Clockwise_180:
	337	// For 180 and 270 degree rotations we also invert the order of tiles in the strip,
	338	// since the rotates are done individually on each tile.
	339	RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
	340	image_strip_width = cvt.input_line_width;
	341	output_stride = 8;
	342	break;
	343	case Rotation::Clockwise_270:
	344	RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
	345	image_strip_width = 8;
	346	output_stride = 8 * row_height;
	347	break;
	348	}
	349
	350	switch (cvt.block_alignment) {
	351	case BlockAlignment::Linear:
	352	WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width);
	353	output_buffer += output_stride;
	354	break;
	355	case BlockAlignment::Block8x8:
	356	WriteTileToOutput(output_buffer, tmp_tile, 8, 8);
	357	output_buffer += TILE_SIZE;
	358	break;
	359	}
	360	}
	361
	362	// Note(yuriks): If additional optimization is required, output_format can be moved to a
	363	// template parameter, so that its dispatch can be moved to outside the inner loop.
	364	SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size, cvt.output_format, (u8)cvt.alpha);
	365	}
	366	}
	367
	368	}
	369	}