hle: Remove a large amount of 3ds-specific service code.

author: bunnei 2017-10-10 17:32:14 -0400
committer: bunnei 2017-10-10 17:32:14 -0400
commit: 0906de9a14b735d1d409290ca050eb7d2c2d3d84 (patch)
tree: 79bb57d3a4dc4ca377e7a62744c3941de29e785b /src/core/hw/y2r.cpp
parent: Merge remote-tracking branch 'upstream/master' into nx (diff)
download: yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar.gz
yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar.xz
yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.zip
1 files changed, 0 insertions, 382 deletions
diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp
deleted file mode 100644
index e697f84b3..000000000
--- a/src/core/hw/y2r.cpp
+++ /dev/null
@@ -1,382 +0,0 @@
-// Copyright 2015 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-#include <algorithm>
-#include <array>
-#include <cstddef>
-#include <memory>
-#include "common/assert.h"
-#include "common/color.h"
-#include "common/common_types.h"
-#include "common/math_util.h"
-#include "common/vector_math.h"
-#include "core/hle/service/y2r_u.h"
-#include "core/hw/y2r.h"
-#include "core/memory.h"
-namespace HW {
-namespace Y2R {
-using namespace Service::Y2R;
-static const size_t MAX_TILES = 1024 / 8;
-static const size_t TILE_SIZE = 8 * 8;
-using ImageTile = std::array<u32, TILE_SIZE>;
-/// Converts an image strip from the source YUV format into individual 8x8 RGB32 tiles.
-static void ConvertYUVToRGB(InputFormat input_format, const u8* input_Y, const u8* input_U,
-                            const u8* input_V, ImageTile output[], unsigned int width,
-                            unsigned int height, const CoefficientSet& coefficients) {
-    for (unsigned int y = 0; y < height; ++y) {
-        for (unsigned int x = 0; x < width; ++x) {
-            s32 Y = 0;
-            s32 U = 0;
-            s32 V = 0;
-            switch (input_format) {
-            case InputFormat::YUV422_Indiv8:
-            case InputFormat::YUV422_Indiv16:
-                Y = input_Y[y * width + x];
-                U = input_U[(y * width + x) / 2];
-                V = input_V[(y * width + x) / 2];
-                break;
-            case InputFormat::YUV420_Indiv8:
-            case InputFormat::YUV420_Indiv16:
-                Y = input_Y[y * width + x];
-                U = input_U[((y / 2) * width + x) / 2];
-                V = input_V[((y / 2) * width + x) / 2];
-                break;
-            case InputFormat::YUYV422_Interleaved:
-                Y = input_Y[(y * width + x) * 2];
-                U = input_Y[(y * width + (x / 2) * 2) * 2 + 1];
-                V = input_Y[(y * width + (x / 2) * 2) * 2 + 3];
-                break;
-            }
-            // This conversion process is bit-exact with hardware, as far as could be tested.
-            auto& c = coefficients;
-            s32 cY = c[0] * Y;
-            s32 r = cY + c[1] * V;
-            s32 g = cY - c[2] * V - c[3] * U;
-            s32 b = cY + c[4] * U;
-            const s32 rounding_offset = 0x18;
-            r = (r >> 3) + c[5] + rounding_offset;
-            g = (g >> 3) + c[6] + rounding_offset;
-            b = (b >> 3) + c[7] + rounding_offset;
-            unsigned int tile = x / 8;
-            unsigned int tile_x = x % 8;
-            u32* out = &output[tile][y * 8 + tile_x];
-            using MathUtil::Clamp;
-            *out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) | ((u32)Clamp(g >> 5, 0, 0xFF) << 16) |
-                   ((u32)Clamp(b >> 5, 0, 0xFF) << 8);
-        }
-    }
-}
-/// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit
-/// formats to 8-bit.
-template <size_t N>
-static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) {
-    const u8* input = Memory::GetPointer(buf.address);
-    size_t output_unit = buf.transfer_unit / N;
-    ASSERT(amount_of_data % output_unit == 0);
-    while (amount_of_data > 0) {
-        for (size_t i = 0; i < output_unit; ++i) {
-            output[i] = input[i * N];
-        }
-        output += output_unit;
-        input += buf.transfer_unit + buf.gap;
-        buf.address += buf.transfer_unit + buf.gap;
-        buf.image_size -= buf.transfer_unit;
-        amount_of_data -= output_unit;
-    }
-}
-/// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA
-/// transfer.
-static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data,
-                     OutputFormat output_format, u8 alpha) {
-    u8* output = Memory::GetPointer(buf.address);
-    while (amount_of_data > 0) {
-        u8* unit_end = output + buf.transfer_unit;
-        while (output < unit_end) {
-            u32 color = *input++;
-            Math::Vec4<u8> col_vec{(u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha};
-            switch (output_format) {
-            case OutputFormat::RGBA8:
-                Color::EncodeRGBA8(col_vec, output);
-                output += 4;
-                break;
-            case OutputFormat::RGB8:
-                Color::EncodeRGB8(col_vec, output);
-                output += 3;
-                break;
-            case OutputFormat::RGB5A1:
-                Color::EncodeRGB5A1(col_vec, output);
-                output += 2;
-                break;
-            case OutputFormat::RGB565:
-                Color::EncodeRGB565(col_vec, output);
-                output += 2;
-                break;
-            }
-            amount_of_data -= 1;
-        }
-        output += buf.gap;
-        buf.address += buf.transfer_unit + buf.gap;
-        buf.image_size -= buf.transfer_unit;
-    }
-}
-static const u8 linear_lut[TILE_SIZE] = {
-    // clang-format off
-     0,  1,  2,  3,  4,  5,  6,  7,
-     8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23,
-    24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39,
-    40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55,
-    56, 57, 58, 59, 60, 61, 62, 63,
-    // clang-format on
-};
-static const u8 morton_lut[TILE_SIZE] = {
-    // clang-format off
-     0,  1,  4,  5, 16, 17, 20, 21,
-     2,  3,  6,  7, 18, 19, 22, 23,
-     8,  9, 12, 13, 24, 25, 28, 29,
-    10, 11, 14, 15, 26, 27, 30, 31,
-    32, 33, 36, 37, 48, 49, 52, 53,
-    34, 35, 38, 39, 50, 51, 54, 55,
-    40, 41, 44, 45, 56, 57, 60, 61,
-    42, 43, 46, 47, 58, 59, 62, 63,
-    // clang-format on
-};
-static void RotateTile0(const ImageTile& input, ImageTile& output, int height,
-                        const u8 out_map[64]) {
-    for (int i = 0; i < height * 8; ++i) {
-        output[out_map[i]] = input[i];
-    }
-}
-static void RotateTile90(const ImageTile& input, ImageTile& output, int height,
-                         const u8 out_map[64]) {
-    int out_i = 0;
-    for (int x = 0; x < 8; ++x) {
-        for (int y = height - 1; y >= 0; --y) {
-            output[out_map[out_i++]] = input[y * 8 + x];
-        }
-    }
-}
-static void RotateTile180(const ImageTile& input, ImageTile& output, int height,
-                          const u8 out_map[64]) {
-    int out_i = 0;
-    for (int i = height * 8 - 1; i >= 0; --i) {
-        output[out_map[out_i++]] = input[i];
-    }
-}
-static void RotateTile270(const ImageTile& input, ImageTile& output, int height,
-                          const u8 out_map[64]) {
-    int out_i = 0;
-    for (int x = 8 - 1; x >= 0; --x) {
-        for (int y = 0; y < height; ++y) {
-            output[out_map[out_i++]] = input[y * 8 + x];
-        }
-    }
-}
-static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) {
-    for (int y = 0; y < height; ++y) {
-        for (int x = 0; x < 8; ++x) {
-            output[y * line_stride + x] = tile[y * 8 + x];
-        }
-    }
-}
-/**
- * Performs a Y2R colorspace conversion.
- *
- * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most
- * commonly used for video playback or to display camera input to the screen.
- *
- * The conversion process is quite configurable, and can be divided in distinct steps. From
- * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data
- * internally and converts it in one go before writing to the output and loading the next strip.
- *
- * The steps taken to convert one strip of image data are:
- *
- * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is
- *   presumably stored in one or more internal buffers. This process can be done in several separate
- *   transfers, as long as they don't exceed the size of the internal image buffer. This allows
- *   flexibility in input strides.
- * - The input data is decoded into a YUV tuple. Several formats are supported, see the `InputFormat`
- *   enum.
- * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured
- *   using a set of coefficients to support different colorspace standards. See `CoefficientSet`.
- * - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed
- *   independently, this notably rotates each *strip*, not the entire image. This means that for 90
- *   or 270 degree rotations, the output will be in terms of several 8 x height images, and for any
- *   non-zero rotation the strips will have to be re-arranged so that the parts of the image will
- *   not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90
- *   or 270 degree rotations in images with non-even height don't seem to work properly.
- * - The data is converted to the output RGB format. See the `OutputFormat` enum.
- * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by
- *   the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the
- *   image must have a height divisible by 8. The image width must always be divisible by 8.
- * - The final data is then CDMAed out to main memory and the next image strip is processed. This
- *   offers the same flexibility as the input stage.
- *
- * In this implementation, to avoid the combinatorial explosion of parameter combinations, common
- * intermediate formats are used and where possible tables or parameters are used instead of
- * diverging code paths to keep the amount of branches in check. Some steps are also merged to
- * increase efficiency.
- *
- * Output for all valid settings combinations matches hardware, however output in some edge-cases
- * differs:
- *
- * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last
- *   strip, especially when combined with rotation.
- * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation
- *   produces misaligned output on the last strip. This implementation produces output with the
- *   correct "expected" alignment.
- *
- * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases,
- * so they are believed to be invalid configurations anyway.
- */
-void PerformConversion(ConversionConfiguration& cvt) {
-    ASSERT(cvt.input_line_width % 8 == 0);
-    ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0);
-    // Tiles per row
-    size_t num_tiles = cvt.input_line_width / 8;
-    ASSERT(num_tiles <= MAX_TILES);
-    // Buffer used as a CDMA source/target.
-    std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]);
-    // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32.
-    std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]);
-    ImageTile tmp_tile;
-    // LUT used to remap writes to a tile. Used to allow linear or swizzled output without
-    // requiring two different code paths.
-    const u8* tile_remap = nullptr;
-    switch (cvt.block_alignment) {
-    case BlockAlignment::Linear:
-        tile_remap = linear_lut;
-        break;
-    case BlockAlignment::Block8x8:
-        tile_remap = morton_lut;
-        break;
-    }
-    for (unsigned int y = 0; y < cvt.input_lines; y += 8) {
-        unsigned int row_height = std::min(cvt.input_lines - y, 8u);
-        // Total size in pixels of incoming data required for this strip.
-        const size_t row_data_size = row_height * cvt.input_line_width;
-        u8* input_Y = data_buffer.get();
-        u8* input_U = input_Y + 8 * cvt.input_line_width;
-        u8* input_V = input_U + 8 * cvt.input_line_width / 2;
-        switch (cvt.input_format) {
-        case InputFormat::YUV422_Indiv8:
-            ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
-            ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2);
-            ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2);
-            break;
-        case InputFormat::YUV420_Indiv8:
-            ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
-            ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4);
-            ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4);
-            break;
-        case InputFormat::YUV422_Indiv16:
-            ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
-            ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2);
-            ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2);
-            break;
-        case InputFormat::YUV420_Indiv16:
-            ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
-            ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4);
-            ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4);
-            break;
-        case InputFormat::YUYV422_Interleaved:
-            input_U = nullptr;
-            input_V = nullptr;
-            ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2);
-            break;
-        }
-        // Note(yuriks): If additional optimization is required, input_format can be moved to a
-        // template parameter, so that its dispatch can be moved to outside the inner loop.
-        ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(),
-                        cvt.input_line_width, row_height, cvt.coefficients);
-        u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get());
-        for (size_t i = 0; i < num_tiles; ++i) {
-            int image_strip_width = 0;
-            int output_stride = 0;
-            switch (cvt.rotation) {
-            case Rotation::None:
-                RotateTile0(tiles[i], tmp_tile, row_height, tile_remap);
-                image_strip_width = cvt.input_line_width;
-                output_stride = 8;
-                break;
-            case Rotation::Clockwise_90:
-                RotateTile90(tiles[i], tmp_tile, row_height, tile_remap);
-                image_strip_width = 8;
-                output_stride = 8 * row_height;
-                break;
-            case Rotation::Clockwise_180:
-                // For 180 and 270 degree rotations we also invert the order of tiles in the strip,
-                // since the rotates are done individually on each tile.
-                RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
-                image_strip_width = cvt.input_line_width;
-                output_stride = 8;
-                break;
-            case Rotation::Clockwise_270:
-                RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
-                image_strip_width = 8;
-                output_stride = 8 * row_height;
-                break;
-            }
-            switch (cvt.block_alignment) {
-            case BlockAlignment::Linear:
-                WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width);
-                output_buffer += output_stride;
-                break;
-            case BlockAlignment::Block8x8:
-                WriteTileToOutput(output_buffer, tmp_tile, 8, 8);
-                output_buffer += TILE_SIZE;
-                break;
-            }
-        }
-        // Note(yuriks): If additional optimization is required, output_format can be moved to a
-        // template parameter, so that its dispatch can be moved to outside the inner loop.
-        SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size,
-                 cvt.output_format, (u8)cvt.alpha);
-    }
-}
-}
-}
author	bunnei	2017-10-10 17:32:14 -0400
committer	bunnei	2017-10-10 17:32:14 -0400
commit	0906de9a14b735d1d409290ca050eb7d2c2d3d84 (patch)
tree	79bb57d3a4dc4ca377e7a62744c3941de29e785b /src/core/hw/y2r.cpp
parent	Merge remote-tracking branch 'upstream/master' into nx (diff)
download	yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar.gz yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar.xz yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.zip

diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp deleted file mode 100644 index e697f84b3..000000000 --- a/src/core/hw/y2r.cpp +++ /dev/null
@@ -1,382 +0,0 @@
1	// Copyright 2015 Citra Emulator Project
2	// Licensed under GPLv2 or any later version
3	// Refer to the license.txt file included.
4
5	#include <algorithm>
6	#include <array>
7	#include <cstddef>
8	#include <memory>
9	#include "common/assert.h"
10	#include "common/color.h"
11	#include "common/common_types.h"
12	#include "common/math_util.h"
13	#include "common/vector_math.h"
14	#include "core/hle/service/y2r_u.h"
15	#include "core/hw/y2r.h"
16	#include "core/memory.h"
17
18	namespace HW {
19	namespace Y2R {
20
21	using namespace Service::Y2R;
22
23	static const size_t MAX_TILES = 1024 / 8;
24	static const size_t TILE_SIZE = 8 * 8;
25	using ImageTile = std::array<u32, TILE_SIZE>;
26
27	/// Converts an image strip from the source YUV format into individual 8x8 RGB32 tiles.
28	static void ConvertYUVToRGB(InputFormat input_format, const u8* input_Y, const u8* input_U,
29	const u8* input_V, ImageTile output[], unsigned int width,
30	unsigned int height, const CoefficientSet& coefficients) {
31
32	for (unsigned int y = 0; y < height; ++y) {
33	for (unsigned int x = 0; x < width; ++x) {
34	s32 Y = 0;
35	s32 U = 0;
36	s32 V = 0;
37	switch (input_format) {
38	case InputFormat::YUV422_Indiv8:
39	case InputFormat::YUV422_Indiv16:
40	Y = input_Y[y * width + x];
41	U = input_U[(y * width + x) / 2];
42	V = input_V[(y * width + x) / 2];
43	break;
44	case InputFormat::YUV420_Indiv8:
45	case InputFormat::YUV420_Indiv16:
46	Y = input_Y[y * width + x];
47	U = input_U[((y / 2) * width + x) / 2];
48	V = input_V[((y / 2) * width + x) / 2];
49	break;
50	case InputFormat::YUYV422_Interleaved:
51	Y = input_Y[(y * width + x) * 2];
52	U = input_Y[(y * width + (x / 2) * 2) * 2 + 1];
53	V = input_Y[(y * width + (x / 2) * 2) * 2 + 3];
54	break;
55	}
56
57	// This conversion process is bit-exact with hardware, as far as could be tested.
58	auto& c = coefficients;
59	s32 cY = c[0] * Y;
60
61	s32 r = cY + c[1] * V;
62	s32 g = cY - c[2] * V - c[3] * U;
63	s32 b = cY + c[4] * U;
64
65	const s32 rounding_offset = 0x18;
66	r = (r >> 3) + c[5] + rounding_offset;
67	g = (g >> 3) + c[6] + rounding_offset;
68	b = (b >> 3) + c[7] + rounding_offset;
69
70	unsigned int tile = x / 8;
71	unsigned int tile_x = x % 8;
72	u32* out = &output[tile][y * 8 + tile_x];
73
74	using MathUtil::Clamp;
75	*out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) | ((u32)Clamp(g >> 5, 0, 0xFF) << 16) |
76	((u32)Clamp(b >> 5, 0, 0xFF) << 8);
77	}
78	}
79	}
80
81	/// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit
82	/// formats to 8-bit.
83	template <size_t N>
84	static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) {
85	const u8* input = Memory::GetPointer(buf.address);
86
87	size_t output_unit = buf.transfer_unit / N;
88	ASSERT(amount_of_data % output_unit == 0);
89
90	while (amount_of_data > 0) {
91	for (size_t i = 0; i < output_unit; ++i) {
92	output[i] = input[i * N];
93	}
94
95	output += output_unit;
96	input += buf.transfer_unit + buf.gap;
97
98	buf.address += buf.transfer_unit + buf.gap;
99	buf.image_size -= buf.transfer_unit;
100	amount_of_data -= output_unit;
101	}
102	}
103
104	/// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA
105	/// transfer.
106	static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data,
107	OutputFormat output_format, u8 alpha) {
108
109	u8* output = Memory::GetPointer(buf.address);
110
111	while (amount_of_data > 0) {
112	u8* unit_end = output + buf.transfer_unit;
113	while (output < unit_end) {
114	u32 color = *input++;
115	Math::Vec4<u8> col_vec{(u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha};
116
117	switch (output_format) {
118	case OutputFormat::RGBA8:
119	Color::EncodeRGBA8(col_vec, output);
120	output += 4;
121	break;
122	case OutputFormat::RGB8:
123	Color::EncodeRGB8(col_vec, output);
124	output += 3;
125	break;
126	case OutputFormat::RGB5A1:
127	Color::EncodeRGB5A1(col_vec, output);
128	output += 2;
129	break;
130	case OutputFormat::RGB565:
131	Color::EncodeRGB565(col_vec, output);
132	output += 2;
133	break;
134	}
135
136	amount_of_data -= 1;
137	}
138
139	output += buf.gap;
140	buf.address += buf.transfer_unit + buf.gap;
141	buf.image_size -= buf.transfer_unit;
142	}
143	}
144
145	static const u8 linear_lut[TILE_SIZE] = {
146	// clang-format off
147	0, 1, 2, 3, 4, 5, 6, 7,
148	8, 9, 10, 11, 12, 13, 14, 15,
149	16, 17, 18, 19, 20, 21, 22, 23,
150	24, 25, 26, 27, 28, 29, 30, 31,
151	32, 33, 34, 35, 36, 37, 38, 39,
152	40, 41, 42, 43, 44, 45, 46, 47,
153	48, 49, 50, 51, 52, 53, 54, 55,
154	56, 57, 58, 59, 60, 61, 62, 63,
155	// clang-format on
156	};
157
158	static const u8 morton_lut[TILE_SIZE] = {
159	// clang-format off
160	0, 1, 4, 5, 16, 17, 20, 21,
161	2, 3, 6, 7, 18, 19, 22, 23,
162	8, 9, 12, 13, 24, 25, 28, 29,
163	10, 11, 14, 15, 26, 27, 30, 31,
164	32, 33, 36, 37, 48, 49, 52, 53,
165	34, 35, 38, 39, 50, 51, 54, 55,
166	40, 41, 44, 45, 56, 57, 60, 61,
167	42, 43, 46, 47, 58, 59, 62, 63,
168	// clang-format on
169	};
170
171	static void RotateTile0(const ImageTile& input, ImageTile& output, int height,
172	const u8 out_map[64]) {
173	for (int i = 0; i < height * 8; ++i) {
174	output[out_map[i]] = input[i];
175	}
176	}
177
178	static void RotateTile90(const ImageTile& input, ImageTile& output, int height,
179	const u8 out_map[64]) {
180	int out_i = 0;
181	for (int x = 0; x < 8; ++x) {
182	for (int y = height - 1; y >= 0; --y) {
183	output[out_map[out_i++]] = input[y * 8 + x];
184	}
185	}
186	}
187
188	static void RotateTile180(const ImageTile& input, ImageTile& output, int height,
189	const u8 out_map[64]) {
190	int out_i = 0;
191	for (int i = height * 8 - 1; i >= 0; --i) {
192	output[out_map[out_i++]] = input[i];
193	}
194	}
195
196	static void RotateTile270(const ImageTile& input, ImageTile& output, int height,
197	const u8 out_map[64]) {
198	int out_i = 0;
199	for (int x = 8 - 1; x >= 0; --x) {
200	for (int y = 0; y < height; ++y) {
201	output[out_map[out_i++]] = input[y * 8 + x];
202	}
203	}
204	}
205
206	static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) {
207	for (int y = 0; y < height; ++y) {
208	for (int x = 0; x < 8; ++x) {
209	output[y * line_stride + x] = tile[y * 8 + x];
210	}
211	}
212	}
213
214	/**
215	* Performs a Y2R colorspace conversion.
216	*
217	* The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most
218	* commonly used for video playback or to display camera input to the screen.
219	*
220	* The conversion process is quite configurable, and can be divided in distinct steps. From
221	* observation, it appears that the hardware buffers a single 8-pixel tall strip of image data
222	* internally and converts it in one go before writing to the output and loading the next strip.
223	*
224	* The steps taken to convert one strip of image data are:
225	*
226	* - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is
227	* presumably stored in one or more internal buffers. This process can be done in several separate
228	* transfers, as long as they don't exceed the size of the internal image buffer. This allows
229	* flexibility in input strides.
230	* - The input data is decoded into a YUV tuple. Several formats are supported, see the `InputFormat`
231	* enum.
232	* - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured
233	* using a set of coefficients to support different colorspace standards. See `CoefficientSet`.
234	* - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed
235	* independently, this notably rotates each *strip*, not the entire image. This means that for 90
236	* or 270 degree rotations, the output will be in terms of several 8 x height images, and for any
237	* non-zero rotation the strips will have to be re-arranged so that the parts of the image will
238	* not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90
239	* or 270 degree rotations in images with non-even height don't seem to work properly.
240	* - The data is converted to the output RGB format. See the `OutputFormat` enum.
241	* - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by
242	* the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the
243	* image must have a height divisible by 8. The image width must always be divisible by 8.
244	* - The final data is then CDMAed out to main memory and the next image strip is processed. This
245	* offers the same flexibility as the input stage.
246	*
247	* In this implementation, to avoid the combinatorial explosion of parameter combinations, common
248	* intermediate formats are used and where possible tables or parameters are used instead of
249	* diverging code paths to keep the amount of branches in check. Some steps are also merged to
250	* increase efficiency.
251	*
252	* Output for all valid settings combinations matches hardware, however output in some edge-cases
253	* differs:
254	*
255	* - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last
256	* strip, especially when combined with rotation.
257	* - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation
258	* produces misaligned output on the last strip. This implementation produces output with the
259	* correct "expected" alignment.
260	*
261	* Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases,
262	* so they are believed to be invalid configurations anyway.
263	*/
264	void PerformConversion(ConversionConfiguration& cvt) {
265	ASSERT(cvt.input_line_width % 8 == 0);
266	ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0);
267	// Tiles per row
268	size_t num_tiles = cvt.input_line_width / 8;
269	ASSERT(num_tiles <= MAX_TILES);
270
271	// Buffer used as a CDMA source/target.
272	std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]);
273	// Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32.
274	std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]);
275	ImageTile tmp_tile;
276
277	// LUT used to remap writes to a tile. Used to allow linear or swizzled output without
278	// requiring two different code paths.
279	const u8* tile_remap = nullptr;
280	switch (cvt.block_alignment) {
281	case BlockAlignment::Linear:
282	tile_remap = linear_lut;
283	break;
284	case BlockAlignment::Block8x8:
285	tile_remap = morton_lut;
286	break;
287	}
288
289	for (unsigned int y = 0; y < cvt.input_lines; y += 8) {
290	unsigned int row_height = std::min(cvt.input_lines - y, 8u);
291
292	// Total size in pixels of incoming data required for this strip.
293	const size_t row_data_size = row_height * cvt.input_line_width;
294
295	u8* input_Y = data_buffer.get();
296	u8* input_U = input_Y + 8 * cvt.input_line_width;
297	u8* input_V = input_U + 8 * cvt.input_line_width / 2;
298
299	switch (cvt.input_format) {
300	case InputFormat::YUV422_Indiv8:
301	ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
302	ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2);
303	ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2);
304	break;
305	case InputFormat::YUV420_Indiv8:
306	ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
307	ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4);
308	ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4);
309	break;
310	case InputFormat::YUV422_Indiv16:
311	ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
312	ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2);
313	ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2);
314	break;
315	case InputFormat::YUV420_Indiv16:
316	ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
317	ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4);
318	ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4);
319	break;
320	case InputFormat::YUYV422_Interleaved:
321	input_U = nullptr;
322	input_V = nullptr;
323	ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2);
324	break;
325	}
326
327	// Note(yuriks): If additional optimization is required, input_format can be moved to a
328	// template parameter, so that its dispatch can be moved to outside the inner loop.
329	ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(),
330	cvt.input_line_width, row_height, cvt.coefficients);
331
332	u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get());
333
334	for (size_t i = 0; i < num_tiles; ++i) {
335	int image_strip_width = 0;
336	int output_stride = 0;
337
338	switch (cvt.rotation) {
339	case Rotation::None:
340	RotateTile0(tiles[i], tmp_tile, row_height, tile_remap);
341	image_strip_width = cvt.input_line_width;
342	output_stride = 8;
343	break;
344	case Rotation::Clockwise_90:
345	RotateTile90(tiles[i], tmp_tile, row_height, tile_remap);
346	image_strip_width = 8;
347	output_stride = 8 * row_height;
348	break;
349	case Rotation::Clockwise_180:
350	// For 180 and 270 degree rotations we also invert the order of tiles in the strip,
351	// since the rotates are done individually on each tile.
352	RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
353	image_strip_width = cvt.input_line_width;
354	output_stride = 8;
355	break;
356	case Rotation::Clockwise_270:
357	RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
358	image_strip_width = 8;
359	output_stride = 8 * row_height;
360	break;
361	}
362
363	switch (cvt.block_alignment) {
364	case BlockAlignment::Linear:
365	WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width);
366	output_buffer += output_stride;
367	break;
368	case BlockAlignment::Block8x8:
369	WriteTileToOutput(output_buffer, tmp_tile, 8, 8);
370	output_buffer += TILE_SIZE;
371	break;
372	}
373	}
374
375	// Note(yuriks): If additional optimization is required, output_format can be moved to a
376	// template parameter, so that its dispatch can be moved to outside the inner loop.
377	SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size,
378	cvt.output_format, (u8)cvt.alpha);
379	}
380	}
381	}
382	}