Merge commit 'b5d955f' into develop-next

author: Sam Atman 2025-05-15 16:06:35 -0400
committer: Sam Atman 2025-05-15 16:06:35 -0400
commit: 6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3 (patch)
tree: 61f83982aaab67f862524d46b073fcfee01d56cf
parent: wordAtIndex passes conformance (diff)
parent: Merge branch 'work-branch' into HEAD (diff)
download: zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.tar.gz
zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.tar.xz
zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.zip
3 files changed, 299 insertions, 3 deletions
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index c42cb8b..530c921 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -18,3 +18,5 @@ Jonathan Raphaelson <jon@accidental.cc>
 Sam Atman <atmanistan@gmail.com>
 lch361 <author@lch361.net>
+Matteo Romano <mr.matteo.romano.02@gmail.com>
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 1f67fc6..0338c04 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -7,6 +7,7 @@ const unicode = std.unicode;
 const CodePoint = @import("code_point").CodePoint;
 const CodePointIterator = @import("code_point").Iterator;
+const CodePointReverseIterator = @import("code_point").ReverseIterator;
 s1: []u16 = undefined,
 s2: []u16 = undefined,
@@ -70,6 +71,10 @@ pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
    return Iterator.init(string, graphemes);
 }
+pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
+    return ReverseIterator.init(string, graphemes);
+}
 /// Indic syllable type.
 pub const Indic = enum {
    none,
@@ -182,6 +187,221 @@ pub const Iterator = struct {
    }
 };
+pub const ReverseIterator = struct {
+    buf: [2]?CodePoint = .{ null, null },
+    cp_iter: CodePointReverseIterator,
+    data: *const Graphemes,
+    /// Codepoint read from `cp_iter` but not returned by `previous`
+    pending: Pending = .{ .none = {} },
+    const Pending = union(enum) {
+        none: void,
+        /// Count of pending RI codepoints, it is an even number
+        ri_count: usize,
+        /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
+        extend_end: u32,
+    };
+    const Self = @This();
+    pub fn init(str: []const u8, data: *const Graphemes) Self {
+        var self: Self = .{ .cp_iter = .init(str), .data = data };
+        self.advance();
+        self.advance();
+        return self;
+    }
+    fn advance(self: *Self) void {
+        self.buf[1] = self.buf[0];
+        self.buf[0] = self.cp_iter.prev();
+    }
+    pub fn prev(self: *Self) ?Grapheme {
+        if (self.buf[1] == null) return null;
+        const grapheme_end: u32 = end: {
+            const codepoint = self.buf[1].?;
+            switch (self.pending) {
+                // BUF: [?Any, Any]
+                .none => break :end codepoint.offset + codepoint.len,
+                .ri_count => |ri_count| {
+                    std.debug.assert(ri_count > 0);
+                    std.debug.assert(ri_count % 2 == 0);
+                    if (ri_count > 2) {
+                        self.pending.ri_count -= 2;
+                        // Use the fact that all RI have length 4 in utf8 encoding
+                        // since they are in range 0x1f1e6...0x1f1ff
+                        // https://en.wikipedia.org/wiki/UTF-8#Encoding
+                        return Grapheme{
+                            .len = 8,
+                            .offset = @intCast(codepoint.offset + self.pending.ri_count * 4),
+                        };
+                    } else {
+                        self.pending = .{ .none = {} };
+                        break :end codepoint.offset + codepoint.len + 4;
+                    }
+                },
+                // BUF: [?Any, Extend] Extend* ZWJ
+                .extend_end => |extend_end| {
+                    self.pending = .{ .none = {} };
+                    break :end extend_end;
+                },
+            }
+        };
+        while (self.buf[0] != null) {
+            var state: State = .{};
+            state.setXpic();
+            state.unsetRegional();
+            state.setIndic();
+            if (graphemeBreak(
+                self.buf[0].?.code,
+                self.buf[1].?.code,
+                self.data,
+                &state,
+            )) break;
+            self.advance();
+            if (!state.hasIndic()) {
+                // BUF: [?Any, Extend | Linker] Consonant
+                var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
+                indic: while (true) {
+                    if (self.buf[0] == null) {
+                        self.pending = .{ .extend_end = indic_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - indic_offset),
+                            .offset = indic_offset,
+                        };
+                    }
+                    const codepoint = self.buf[0].?;
+                    switch (self.data.indic(codepoint.code)) {
+                        .Extend, .Linker => {
+                            self.advance();
+                            continue :indic;
+                        },
+                        .Consonant => {
+                            // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant
+                            indic_offset = codepoint.offset;
+                            self.advance();
+                            if (self.buf[0]) |cp1| {
+                                state.setIndic();
+                                if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
+                                if (!state.hasIndic()) {
+                                    continue :indic;
+                                } else {
+                                    break :indic;
+                                }
+                            } else {
+                                break :indic;
+                            }
+                        },
+                        .none => {
+                            // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant
+                            self.pending = .{ .extend_end = indic_offset };
+                            return .{
+                                .len = @intCast(grapheme_end - indic_offset),
+                                .offset = indic_offset,
+                            };
+                        },
+                    }
+                }
+            }
+            if (!state.hasXpic()) {
+                // BUF: [?Any, ZWJ] Emoji
+                var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
+                // Look for previous Emoji
+                emoji: while (true) {
+                    if (self.buf[0] == null) {
+                        self.pending = .{ .extend_end = emoji_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - emoji_offset),
+                            .offset = emoji_offset,
+                        };
+                    }
+                    const codepoint = self.buf[0].?;
+                    if (self.data.gbp(codepoint.code) == .Extend) {
+                        self.advance();
+                        continue :emoji;
+                    }
+                    if (self.data.isEmoji(codepoint.code)) {
+                        // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)*
+                        emoji_offset = codepoint.offset;
+                        self.advance();
+                        if (self.buf[0] != null and
+                            // ZWJ = 0x200d
+                            self.buf[0].?.code == 0x200d)
+                        {
+                            // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)*
+                            // Back at the beginning of the loop, "recursively" look for emoji
+                            self.advance();
+                            continue :emoji;
+                        } else {
+                            // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)*
+                            break :emoji;
+                        }
+                    } else {
+                        // BUF: [Any, Extend] (Extend* ZWJ Emoji)*
+                        self.pending = .{ .extend_end = emoji_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - emoji_offset),
+                            .offset = emoji_offset,
+                        };
+                    }
+                }
+            }
+            if (state.hasRegional()) {
+                var ri_count: usize = 0;
+                while (self.buf[0] != null and
+                    self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
+                {
+                    ri_count += 1;
+                    self.advance();
+                }
+                // Use the fact that all RI have length 4 in utf8 encoding
+                // since they are in range 0x1f1e6...0x1f1ff
+                // https://en.wikipedia.org/wiki/UTF-8#Encoding
+                if (ri_count == 0) {
+                    // There are no pending RI codepoints
+                } else if (ri_count % 2 == 0) {
+                    self.pending = .{ .ri_count = ri_count };
+                    return .{ .len = 8, .offset = grapheme_end - 8 };
+                } else {
+                    // Add one to count for the unused RI
+                    self.pending = .{ .ri_count = ri_count + 1 };
+                    return .{ .len = 4, .offset = grapheme_end - 4 };
+                }
+            }
+        }
+        const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0;
+        self.advance();
+        return .{
+            .len = @intCast(grapheme_end - grapheme_start),
+            .offset = grapheme_start,
+        };
+    }
+};
 // Predicates
 fn isBreaker(cp: u21, data: *const Graphemes) bool {
    // Extract relevant properties.
@@ -201,7 +421,7 @@ pub const State = struct {
        self.bits |= 1;
    }
    fn unsetXpic(self: *State) void {
-        self.bits ^= 1;
+        self.bits &= ~@as(u3, 1);
    }
    // Regional Indicatior (flags)
@@ -212,7 +432,7 @@ pub const State = struct {
        self.bits |= 2;
    }
    fn unsetRegional(self: *State) void {
-        self.bits ^= 2;
+        self.bits &= ~@as(u3, 2);
    }
    // Indic Conjunct
@@ -223,7 +443,7 @@ pub const State = struct {
        self.bits |= 4;
    }
    fn unsetIndic(self: *State) void {
-        self.bits ^= 4;
+        self.bits &= ~@as(u3, 4);
    }
 };
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 8b02e98..0204b92 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -175,6 +175,80 @@ test "Segmentation GraphemeIterator" {
    }
 }
+test "Segmentation ReverseGraphemeIterator" {
+    const allocator = std.testing.allocator;
+    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
+    defer file.close();
+    var buf_reader = std.io.bufferedReader(file.reader());
+    var input_stream = buf_reader.reader();
+    const data = try Graphemes.init(allocator);
+    defer data.deinit(allocator);
+    var buf: [4096]u8 = undefined;
+    var line_no: usize = 1;
+    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
+        // Skip comments or empty lines.
+        if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
+        // Clean up.
+        var line = std.mem.trimLeft(u8, raw, "÷ ");
+        if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
+            line = line[0..octo];
+        }
+        // Iterate over fields.
+        var want = std.ArrayList(Grapheme).init(allocator);
+        defer want.deinit();
+        var all_bytes = std.ArrayList(u8).init(allocator);
+        defer all_bytes.deinit();
+        var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
+        var bytes_index: u32 = 0;
+        while (graphemes.next()) |field| {
+            var code_points = std.mem.splitScalar(u8, field, ' ');
+            var cp_buf: [4]u8 = undefined;
+            var cp_index: u32 = 0;
+            var gc_len: u8 = 0;
+            while (code_points.next()) |code_point| {
+                if (std.mem.eql(u8, code_point, "×")) continue;
+                const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
+                const len = try unicode.utf8Encode(cp, &cp_buf);
+                try all_bytes.appendSlice(cp_buf[0..len]);
+                cp_index += len;
+                gc_len += len;
+            }
+            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
+            bytes_index += cp_index;
+        }
+        // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
+        var iter = data.reverseIterator(all_bytes.items);
+        // Check.
+        var i: usize = want.items.len;
+        while (i > 0) {
+            i -= 1;
+            const want_gc = want.items[i];
+            const got_gc = iter.prev() orelse {
+                std.debug.print("line {d} grapheme {d}: expected {any} found null\n", .{ line_no, i, want_gc });
+                return error.TestExpectedEqual;
+            };
+            std.testing.expectEqualStrings(
+                want_gc.bytes(all_bytes.items),
+                got_gc.bytes(all_bytes.items),
+            ) catch |err| {
+                std.debug.print("line {d} grapheme {d}: expected {any} found {any}\n", .{ line_no, i, want_gc, got_gc });
+                return err;
+            };
+        }
+    }
+}
 test "Segmentation Word Iterator" {
    const allocator = std.testing.allocator;
    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});
author	Sam Atman	2025-05-15 16:06:35 -0400
committer	Sam Atman	2025-05-15 16:06:35 -0400
commit	6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3 (patch)
tree	61f83982aaab67f862524d46b073fcfee01d56cf
parent	wordAtIndex passes conformance (diff)
parent	Merge branch 'work-branch' into HEAD (diff)
download	zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.tar.gz zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.tar.xz zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.zip

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index c42cb8b..530c921 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md
@@ -18,3 +18,5 @@ Jonathan Raphaelson <jon@accidental.cc>
18	Sam Atman <atmanistan@gmail.com>	18	Sam Atman <atmanistan@gmail.com>
19		19
20	lch361 <author@lch361.net>	20	lch361 <author@lch361.net>
		21
		22	Matteo Romano <mr.matteo.romano.02@gmail.com>


diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 1f67fc6..0338c04 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig
@@ -7,6 +7,7 @@ const unicode = std.unicode;
7		7
8	const CodePoint = @import("code_point").CodePoint;	8	const CodePoint = @import("code_point").CodePoint;
9	const CodePointIterator = @import("code_point").Iterator;	9	const CodePointIterator = @import("code_point").Iterator;
		10	const CodePointReverseIterator = @import("code_point").ReverseIterator;
10		11
11	s1: []u16 = undefined,	12	s1: []u16 = undefined,
12	s2: []u16 = undefined,	13	s2: []u16 = undefined,
@@ -70,6 +71,10 @@ pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
70	return Iterator.init(string, graphemes);	71	return Iterator.init(string, graphemes);
71	}	72	}
72		73
		74	pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
		75	return ReverseIterator.init(string, graphemes);
		76	}
		77
73	/// Indic syllable type.	78	/// Indic syllable type.
74	pub const Indic = enum {	79	pub const Indic = enum {
75	none,	80	none,
@@ -182,6 +187,221 @@ pub const Iterator = struct {
182	}	187	}
183	};	188	};
184		189
		190	pub const ReverseIterator = struct {
		191	buf: [2]?CodePoint = .{ null, null },
		192	cp_iter: CodePointReverseIterator,
		193	data: *const Graphemes,
		194	/// Codepoint read from `cp_iter` but not returned by `previous`
		195	pending: Pending = .{ .none = {} },
		196
		197	const Pending = union(enum) {
		198	none: void,
		199	/// Count of pending RI codepoints, it is an even number
		200	ri_count: usize,
		201	/// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
		202	extend_end: u32,
		203	};
		204
		205	const Self = @This();
		206
		207	pub fn init(str: []const u8, data: *const Graphemes) Self {
		208	var self: Self = .{ .cp_iter = .init(str), .data = data };
		209	self.advance();
		210	self.advance();
		211	return self;
		212	}
		213
		214	fn advance(self: *Self) void {
		215	self.buf[1] = self.buf[0];
		216	self.buf[0] = self.cp_iter.prev();
		217	}
		218
		219	pub fn prev(self: *Self) ?Grapheme {
		220	if (self.buf[1] == null) return null;
		221
		222	const grapheme_end: u32 = end: {
		223	const codepoint = self.buf[1].?;
		224
		225	switch (self.pending) {
		226	// BUF: [?Any, Any]
		227	.none => break :end codepoint.offset + codepoint.len,
		228	.ri_count => \|ri_count\| {
		229	std.debug.assert(ri_count > 0);
		230	std.debug.assert(ri_count % 2 == 0);
		231
		232	if (ri_count > 2) {
		233	self.pending.ri_count -= 2;
		234
		235	// Use the fact that all RI have length 4 in utf8 encoding
		236	// since they are in range 0x1f1e6...0x1f1ff
		237	// https://en.wikipedia.org/wiki/UTF-8#Encoding
		238	return Grapheme{
		239	.len = 8,
		240	.offset = @intCast(codepoint.offset + self.pending.ri_count * 4),
		241	};
		242	} else {
		243	self.pending = .{ .none = {} };
		244	break :end codepoint.offset + codepoint.len + 4;
		245	}
		246	},
		247	// BUF: [?Any, Extend] Extend* ZWJ
		248	.extend_end => \|extend_end\| {
		249	self.pending = .{ .none = {} };
		250	break :end extend_end;
		251	},
		252	}
		253	};
		254
		255	while (self.buf[0] != null) {
		256	var state: State = .{};
		257	state.setXpic();
		258	state.unsetRegional();
		259	state.setIndic();
		260
		261	if (graphemeBreak(
		262	self.buf[0].?.code,
		263	self.buf[1].?.code,
		264	self.data,
		265	&state,
		266	)) break;
		267
		268	self.advance();
		269
		270	if (!state.hasIndic()) {
		271
		272	// BUF: [?Any, Extend \| Linker] Consonant
		273	var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
		274
		275	indic: while (true) {
		276	if (self.buf[0] == null) {
		277	self.pending = .{ .extend_end = indic_offset };
		278	return .{
		279	.len = @intCast(grapheme_end - indic_offset),
		280	.offset = indic_offset,
		281	};
		282	}
		283
		284	const codepoint = self.buf[0].?;
		285
		286	switch (self.data.indic(codepoint.code)) {
		287	.Extend, .Linker => {
		288	self.advance();
		289	continue :indic;
		290	},
		291	.Consonant => {
		292	// BUF: [Consonant, Extend \| Linker] (Extend \| Linker)* Consonant
		293	indic_offset = codepoint.offset;
		294	self.advance();
		295
		296	if (self.buf[0]) \|cp1\| {
		297	state.setIndic();
		298
		299	if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
		300
		301	if (!state.hasIndic()) {
		302	continue :indic;
		303	} else {
		304	break :indic;
		305	}
		306	} else {
		307	break :indic;
		308	}
		309	},
		310	.none => {
		311	// BUF: [Any, Extend \| Linker] (Extend \| Linker)* Consonant
		312	self.pending = .{ .extend_end = indic_offset };
		313	return .{
		314	.len = @intCast(grapheme_end - indic_offset),
		315	.offset = indic_offset,
		316	};
		317	},
		318	}
		319	}
		320	}
		321
		322	if (!state.hasXpic()) {
		323	// BUF: [?Any, ZWJ] Emoji
		324	var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
		325
		326	// Look for previous Emoji
		327	emoji: while (true) {
		328	if (self.buf[0] == null) {
		329	self.pending = .{ .extend_end = emoji_offset };
		330	return .{
		331	.len = @intCast(grapheme_end - emoji_offset),
		332	.offset = emoji_offset,
		333	};
		334	}
		335
		336	const codepoint = self.buf[0].?;
		337
		338	if (self.data.gbp(codepoint.code) == .Extend) {
		339	self.advance();
		340	continue :emoji;
		341	}
		342
		343	if (self.data.isEmoji(codepoint.code)) {
		344	// BUF: [Emoji, Extend] (Extend* ZWJ Emoji)*
		345	emoji_offset = codepoint.offset;
		346	self.advance();
		347
		348	if (self.buf[0] != null and
		349	// ZWJ = 0x200d
		350	self.buf[0].?.code == 0x200d)
		351	{
		352	// BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)*
		353	// Back at the beginning of the loop, "recursively" look for emoji
		354	self.advance();
		355	continue :emoji;
		356	} else {
		357	// BUF: [?Any, Emoji] (Extend* ZWJ Emoji)*
		358	break :emoji;
		359	}
		360	} else {
		361	// BUF: [Any, Extend] (Extend* ZWJ Emoji)*
		362	self.pending = .{ .extend_end = emoji_offset };
		363	return .{
		364	.len = @intCast(grapheme_end - emoji_offset),
		365	.offset = emoji_offset,
		366	};
		367	}
		368	}
		369	}
		370
		371	if (state.hasRegional()) {
		372	var ri_count: usize = 0;
		373	while (self.buf[0] != null and
		374	self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
		375	{
		376	ri_count += 1;
		377	self.advance();
		378	}
		379
		380	// Use the fact that all RI have length 4 in utf8 encoding
		381	// since they are in range 0x1f1e6...0x1f1ff
		382	// https://en.wikipedia.org/wiki/UTF-8#Encoding
		383	if (ri_count == 0) {
		384	// There are no pending RI codepoints
		385	} else if (ri_count % 2 == 0) {
		386	self.pending = .{ .ri_count = ri_count };
		387	return .{ .len = 8, .offset = grapheme_end - 8 };
		388	} else {
		389	// Add one to count for the unused RI
		390	self.pending = .{ .ri_count = ri_count + 1 };
		391	return .{ .len = 4, .offset = grapheme_end - 4 };
		392	}
		393	}
		394	}
		395
		396	const grapheme_start = if (self.buf[1]) \|codepoint\| codepoint.offset else 0;
		397	self.advance();
		398	return .{
		399	.len = @intCast(grapheme_end - grapheme_start),
		400	.offset = grapheme_start,
		401	};
		402	}
		403	};
		404
185	// Predicates	405	// Predicates
186	fn isBreaker(cp: u21, data: *const Graphemes) bool {	406	fn isBreaker(cp: u21, data: *const Graphemes) bool {
187	// Extract relevant properties.	407	// Extract relevant properties.
@@ -201,7 +421,7 @@ pub const State = struct {
201	self.bits \|= 1;	421	self.bits \|= 1;
202	}	422	}
203	fn unsetXpic(self: *State) void {	423	fn unsetXpic(self: *State) void {
204	self.bits ^= 1;	424	self.bits &= ~@as(u3, 1);
205	}	425	}
206		426
207	// Regional Indicatior (flags)	427	// Regional Indicatior (flags)
@@ -212,7 +432,7 @@ pub const State = struct {
212	self.bits \|= 2;	432	self.bits \|= 2;
213	}	433	}
214	fn unsetRegional(self: *State) void {	434	fn unsetRegional(self: *State) void {
215	self.bits ^= 2;	435	self.bits &= ~@as(u3, 2);
216	}	436	}
217		437
218	// Indic Conjunct	438	// Indic Conjunct
@@ -223,7 +443,7 @@ pub const State = struct {
223	self.bits \|= 4;	443	self.bits \|= 4;
224	}	444	}
225	fn unsetIndic(self: *State) void {	445	fn unsetIndic(self: *State) void {
226	self.bits ^= 4;	446	self.bits &= ~@as(u3, 4);
227	}	447	}
228	};	448	};
229		449


diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 8b02e98..0204b92 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig
@@ -175,6 +175,80 @@ test "Segmentation GraphemeIterator" {
175	}	175	}
176	}	176	}
177		177
		178	test "Segmentation ReverseGraphemeIterator" {
		179	const allocator = std.testing.allocator;
		180	var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
		181	defer file.close();
		182	var buf_reader = std.io.bufferedReader(file.reader());
		183	var input_stream = buf_reader.reader();
		184
		185	const data = try Graphemes.init(allocator);
		186	defer data.deinit(allocator);
		187
		188	var buf: [4096]u8 = undefined;
		189	var line_no: usize = 1;
		190
		191	while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) \|raw\| : (line_no += 1) {
		192	// Skip comments or empty lines.
		193	if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
		194
		195	// Clean up.
		196	var line = std.mem.trimLeft(u8, raw, "÷ ");
		197	if (std.mem.indexOf(u8, line, " ÷\t#")) \|octo\| {
		198	line = line[0..octo];
		199	}
		200	// Iterate over fields.
		201	var want = std.ArrayList(Grapheme).init(allocator);
		202	defer want.deinit();
		203
		204	var all_bytes = std.ArrayList(u8).init(allocator);
		205	defer all_bytes.deinit();
		206
		207	var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
		208	var bytes_index: u32 = 0;
		209
		210	while (graphemes.next()) \|field\| {
		211	var code_points = std.mem.splitScalar(u8, field, ' ');
		212	var cp_buf: [4]u8 = undefined;
		213	var cp_index: u32 = 0;
		214	var gc_len: u8 = 0;
		215
		216	while (code_points.next()) \|code_point\| {
		217	if (std.mem.eql(u8, code_point, "×")) continue;
		218	const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
		219	const len = try unicode.utf8Encode(cp, &cp_buf);
		220	try all_bytes.appendSlice(cp_buf[0..len]);
		221	cp_index += len;
		222	gc_len += len;
		223	}
		224
		225	try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
		226	bytes_index += cp_index;
		227	}
		228
		229	// std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
		230	var iter = data.reverseIterator(all_bytes.items);
		231
		232	// Check.
		233	var i: usize = want.items.len;
		234	while (i > 0) {
		235	i -= 1;
		236	const want_gc = want.items[i];
		237	const got_gc = iter.prev() orelse {
		238	std.debug.print("line {d} grapheme {d}: expected {any} found null\n", .{ line_no, i, want_gc });
		239	return error.TestExpectedEqual;
		240	};
		241	std.testing.expectEqualStrings(
		242	want_gc.bytes(all_bytes.items),
		243	got_gc.bytes(all_bytes.items),
		244	) catch \|err\| {
		245	std.debug.print("line {d} grapheme {d}: expected {any} found {any}\n", .{ line_no, i, want_gc, got_gc });
		246	return err;
		247	};
		248	}
		249	}
		250	}
		251
178	test "Segmentation Word Iterator" {	252	test "Segmentation Word Iterator" {
179	const allocator = std.testing.allocator;	253	const allocator = std.testing.allocator;
180	var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});	254	var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});