From 8e24f0ac6a0c9052381001688cf425f2a4017b30 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 9 May 2025 12:58:23 -0400 Subject: Add reverse CodePoint iterator --- src/code_point.zig | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/src/code_point.zig b/src/code_point.zig index 13e38bf..ed5fede 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -94,6 +94,45 @@ pub const Iterator = struct { } }; +pub const ReverseIterator = struct { + bytes: []const u8, + i: ?u32, + + pub fn init(str: []const u8) ReverseIterator { + var r_iter: ReverseIterator = undefined; + r_iter.bytes = str; + r_iter.i = if (str.len == 0) 0 else @intCast(str.len - 1); + return r_iter; + } + + pub fn prev(iter: *ReverseIterator) ?CodePoint { + if (iter.i == null) return null; + var i_prev = iter.i.?; + + while (i_prev > 0) : (i_prev -= 1) { + if (!followbyte(iter.bytes[i_prev])) break; + if (i_prev == 0) break; + } + + if (i_prev > 0) + iter.i = i_prev - 1 + else + iter.i = null; + + return decode(iter.bytes[i_prev..], i_prev); + } + + pub fn peek(iter: *ReverseIterator) ?CodePoint { + const saved_i = iter.i; + defer iter.i = saved_i; + return iter.prev(); + } +}; + +inline fn followbyte(b: u8) bool { + return 0x80 <= b and b <= 0xbf; +} + test "decode" { const bytes = "🌩️"; const res = decode(bytes, 0); @@ -107,12 +146,42 @@ test "decode" { } } -test "peek" { +test Iterator { var iter = Iterator{ .bytes = "Hi" }; - try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); - try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); - try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); - try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); - try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); + try testing.expectEqual(@as(u21, 'H'), iter.next().?.code); + try testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); + try testing.expectEqual(@as(u21, 'i'), iter.next().?.code); + try testing.expectEqual(@as(?CodePoint, null), iter.peek()); + try testing.expectEqual(@as(?CodePoint, null), iter.next()); + try testing.expectEqual(@as(?CodePoint, null), iter.next()); } + +test ReverseIterator { + { + var r_iter: ReverseIterator = .init("ABC"); + try testing.expectEqual(@as(u21, 'C'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'B'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, 'B'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'A'), r_iter.prev().?.code); + try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + } + { + var r_iter: ReverseIterator = .init("∅δq🦾ă"); + try testing.expectEqual(@as(u21, 'ă'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '🦾'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'q'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'δ'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, 'δ'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, '∅'), r_iter.prev().?.code); + try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + } +} + +const testing = std.testing; -- cgit v1.2.3 From 94b1f37474c7444d8129445ae7984f922cb9c283 Mon Sep 17 00:00:00 2001 From: Matteo Romano Date: Mon, 12 May 2025 12:14:14 +0200 Subject: fix: State.unset* did toggle the bit instead of unsetting it --- src/Graphemes.zig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 7bf328a..5780ed4 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -258,7 +258,7 @@ pub const State = struct { self.bits |= 1; } fn unsetXpic(self: *State) void { - self.bits ^= 1; + self.bits &= ~@as(u3, 1); } // Regional Indicatior (flags) @@ -269,7 +269,7 @@ pub const State = struct { self.bits |= 2; } fn unsetRegional(self: *State) void { - self.bits ^= 2; + self.bits &= ~@as(u3, 2); } // Indic Conjunct @@ -280,7 +280,7 @@ pub const State = struct { self.bits |= 4; } fn unsetIndic(self: *State) void { - self.bits ^= 4; + self.bits &= ~@as(u3, 4); } }; -- cgit v1.2.3 From ad4b046c36953b01ae31d109197ee9db5c16d5ea Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 13:38:28 -0400 Subject: Various small iterator improvements --- src/code_point.zig | 55 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/code_point.zig b/src/code_point.zig index ed5fede..5aa12b7 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -42,7 +42,7 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { .offset = offset, }; - // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte + // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte. if (cp.len > bytes.len) { // Unicode replacement code point. return .{ @@ -76,21 +76,30 @@ pub const Iterator = struct { bytes: []const u8, i: u32 = 0, - pub fn next(self: *Iterator) ?CodePoint { - if (self.i >= self.bytes.len) return null; + pub fn next(iter: *Iterator) ?CodePoint { + if (iter.i >= iter.bytes.len) return null; - const res = decode(self.bytes[self.i..], self.i); + const res = decode(iter.bytes[iter.i..], iter.i); if (res) |cp| { - self.i += cp.len; + iter.i += cp.len; } return res; } - pub fn peek(self: *Iterator) ?CodePoint { - const saved_i = self.i; - defer self.i = saved_i; - return self.next(); + pub fn peek(iter: *Iterator) ?CodePoint { + const saved_i = iter.i; + defer iter.i = saved_i; + return iter.next(); + } + + /// Create a backward iterator at this point. It will repeat + /// the last CodePoint seen. + pub fn reverseIterator(iter: *Iterator) ReverseIterator { + if (iter.i == iter.bytes.len) { + return .init(iter.bytes); + } + return .{ .i = iter.i, .bytes = iter.bytes }; } }; @@ -127,6 +136,17 @@ pub const ReverseIterator = struct { defer iter.i = saved_i; return iter.prev(); } + + /// Create a forward iterator at this point. It will repeat the + /// last CodePoint seen. + pub fn forwardIterator(iter: *ReverseIterator) Iterator { + if (iter.i) |i| { + var fwd: Iterator = .{ .i = i, .bytes = iter.bytes }; + _ = fwd.next(); + return fwd; + } + return .{ .i = 0, .bytes = iter.bytes }; + } }; inline fn followbyte(b: u8) bool { @@ -182,6 +202,23 @@ test ReverseIterator { try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); } + { + var r_iter: ReverseIterator = .init("123"); + try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '1'), r_iter.prev().?.code); + var iter = r_iter.forwardIterator(); + try testing.expectEqual(@as(u21, '1'), iter.next().?.code); + try testing.expectEqual(@as(u21, '2'), iter.next().?.code); + try testing.expectEqual(@as(u21, '3'), iter.next().?.code); + r_iter = iter.reverseIterator(); + try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + iter = r_iter.forwardIterator(); + r_iter = iter.reverseIterator(); + try testing.expectEqual(@as(u21, '2'), iter.next().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + } } const testing = std.testing; -- cgit v1.2.3 From 890370f5479299940f505e1247c408064f789bd5 Mon Sep 17 00:00:00 2001 From: Matteo Romano Date: Mon, 12 May 2025 12:14:30 +0200 Subject: feat: add reverse grapheme iterator Closes #53 --- src/Graphemes.zig | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/unicode_tests.zig | 74 +++++++++++++++++ 2 files changed, 294 insertions(+) (limited to 'src') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 5780ed4..3bff18d 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -7,6 +7,7 @@ const unicode = std.unicode; const CodePoint = @import("code_point").CodePoint; const CodePointIterator = @import("code_point").Iterator; +const CodePointReverseIterator = @import("code_point").ReverseIterator; s1: []u16 = undefined, s2: []u16 = undefined, @@ -70,6 +71,10 @@ pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { return Iterator.init(string, graphemes); } +pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { + return ReverseIterator.init(string, graphemes); +} + /// Indic syllable type. pub const Indic = enum { none, @@ -239,6 +244,221 @@ pub const Iterator = struct { } }; +pub const ReverseIterator = struct { + buf: [2]?CodePoint = .{ null, null }, + cp_iter: CodePointReverseIterator, + data: *const Graphemes, + /// Codepoint read from `cp_iter` but not returned by `previous` + pending: Pending = .{ .none = {} }, + + const Pending = union(enum) { + none: void, + /// Count of pending RI codepoints, it is an even number + ri_count: usize, + /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji + extend_end: u32, + }; + + const Self = @This(); + + pub fn init(str: []const u8, data: *const Graphemes) Self { + var self: Self = .{ .cp_iter = .init(str), .data = data }; + self.advance(); + self.advance(); + return self; + } + + fn advance(self: *Self) void { + self.buf[1] = self.buf[0]; + self.buf[0] = self.cp_iter.prev(); + } + + pub fn prev(self: *Self) ?Grapheme { + if (self.buf[1] == null) return null; + + const grapheme_end: u32 = end: { + const codepoint = self.buf[1].?; + + switch (self.pending) { + // BUF: [?Any, Any] + .none => break :end codepoint.offset + codepoint.len, + .ri_count => |ri_count| { + std.debug.assert(ri_count > 0); + std.debug.assert(ri_count % 2 == 0); + + if (ri_count > 2) { + self.pending.ri_count -= 2; + + // Use the fact that all RI have length 4 in utf8 encoding + // since they are in range 0x1f1e6...0x1f1ff + // https://en.wikipedia.org/wiki/UTF-8#Encoding + return Grapheme{ + .len = 8, + .offset = @intCast(codepoint.offset + self.pending.ri_count * 4), + }; + } else { + self.pending = .{ .none = {} }; + break :end codepoint.offset + codepoint.len + 4; + } + }, + // BUF: [?Any, Extend] Extend* ZWJ + .extend_end => |extend_end| { + self.pending = .{ .none = {} }; + break :end extend_end; + }, + } + }; + + while (self.buf[0] != null) { + var state: State = .{}; + state.setXpic(); + state.unsetRegional(); + state.setIndic(); + + if (graphemeBreak( + self.buf[0].?.code, + self.buf[1].?.code, + self.data, + &state, + )) break; + + self.advance(); + + if (!state.hasIndic()) { + + // BUF: [?Any, Extend | Linker] Consonant + var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + + indic: while (true) { + if (self.buf[0] == null) { + self.pending = .{ .extend_end = indic_offset }; + return .{ + .len = @intCast(grapheme_end - indic_offset), + .offset = indic_offset, + }; + } + + const codepoint = self.buf[0].?; + + switch (self.data.indic(codepoint.code)) { + .Extend, .Linker => { + self.advance(); + continue :indic; + }, + .Consonant => { + // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant + indic_offset = codepoint.offset; + self.advance(); + + if (self.buf[0]) |cp1| { + state.setIndic(); + + if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; + + if (!state.hasIndic()) { + continue :indic; + } else { + break :indic; + } + } else { + break :indic; + } + }, + .none => { + // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant + self.pending = .{ .extend_end = indic_offset }; + return .{ + .len = @intCast(grapheme_end - indic_offset), + .offset = indic_offset, + }; + }, + } + } + } + + if (!state.hasXpic()) { + // BUF: [?Any, ZWJ] Emoji + var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + + // Look for previous Emoji + emoji: while (true) { + if (self.buf[0] == null) { + self.pending = .{ .extend_end = emoji_offset }; + return .{ + .len = @intCast(grapheme_end - emoji_offset), + .offset = emoji_offset, + }; + } + + const codepoint = self.buf[0].?; + + if (self.data.gbp(codepoint.code) == .Extend) { + self.advance(); + continue :emoji; + } + + if (self.data.isEmoji(codepoint.code)) { + // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* + emoji_offset = codepoint.offset; + self.advance(); + + if (self.buf[0] != null and + // ZWJ = 0x200d + self.buf[0].?.code == 0x200d) + { + // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)* + // Back at the beginning of the loop, "recursively" look for emoji + self.advance(); + continue :emoji; + } else { + // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)* + break :emoji; + } + } else { + // BUF: [Any, Extend] (Extend* ZWJ Emoji)* + self.pending = .{ .extend_end = emoji_offset }; + return .{ + .len = @intCast(grapheme_end - emoji_offset), + .offset = emoji_offset, + }; + } + } + } + + if (state.hasRegional()) { + var ri_count: usize = 0; + while (self.buf[0] != null and + self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) + { + ri_count += 1; + self.advance(); + } + + // Use the fact that all RI have length 4 in utf8 encoding + // since they are in range 0x1f1e6...0x1f1ff + // https://en.wikipedia.org/wiki/UTF-8#Encoding + if (ri_count == 0) { + // There are no pending RI codepoints + } else if (ri_count % 2 == 0) { + self.pending = .{ .ri_count = ri_count }; + return .{ .len = 8, .offset = grapheme_end - 8 }; + } else { + // Add one to count for the unused RI + self.pending = .{ .ri_count = ri_count + 1 }; + return .{ .len = 4, .offset = grapheme_end - 4 }; + } + } + } + + const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0; + self.advance(); + return .{ + .len = @intCast(grapheme_end - grapheme_start), + .offset = grapheme_start, + }; + } +}; + // Predicates fn isBreaker(cp: u21, data: *const Graphemes) bool { // Extract relevant properties. diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 2249007..828559a 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -219,3 +219,77 @@ test "Segmentation GraphemeIterator" { } } } + +test "Segmentation ReverseGraphemeIterator" { + const allocator = std.testing.allocator; + var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); + defer file.close(); + var buf_reader = std.io.bufferedReader(file.reader()); + var input_stream = buf_reader.reader(); + + const data = try Graphemes.init(allocator); + defer data.deinit(allocator); + + var buf: [4096]u8 = undefined; + var line_no: usize = 1; + + while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { + // Skip comments or empty lines. + if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; + + // Clean up. + var line = std.mem.trimLeft(u8, raw, "÷ "); + if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { + line = line[0..octo]; + } + // Iterate over fields. + var want = std.ArrayList(Grapheme).init(allocator); + defer want.deinit(); + + var all_bytes = std.ArrayList(u8).init(allocator); + defer all_bytes.deinit(); + + var graphemes = std.mem.splitSequence(u8, line, " ÷ "); + var bytes_index: u32 = 0; + + while (graphemes.next()) |field| { + var code_points = std.mem.splitScalar(u8, field, ' '); + var cp_buf: [4]u8 = undefined; + var cp_index: u32 = 0; + var gc_len: u8 = 0; + + while (code_points.next()) |code_point| { + if (std.mem.eql(u8, code_point, "×")) continue; + const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); + const len = try unicode.utf8Encode(cp, &cp_buf); + try all_bytes.appendSlice(cp_buf[0..len]); + cp_index += len; + gc_len += len; + } + + try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); + bytes_index += cp_index; + } + + // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); + var iter = data.reverseIterator(all_bytes.items); + + // Check. + var i: usize = want.items.len; + while (i > 0) { + i -= 1; + const want_gc = want.items[i]; + const got_gc = iter.prev() orelse { + std.debug.print("line {d} grapheme {d}: expected {any} found null\n", .{ line_no, i, want_gc }); + return error.TestExpectedEqual; + }; + std.testing.expectEqualStrings( + want_gc.bytes(all_bytes.items), + got_gc.bytes(all_bytes.items), + ) catch |err| { + std.debug.print("line {d} grapheme {d}: expected {any} found {any}\n", .{ line_no, i, want_gc, got_gc }); + return err; + }; + } + } +} -- cgit v1.2.3 From 75dedc6aec86c1a4f43d0d7cd120abf68b5baeb1 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 9 May 2025 12:58:23 -0400 Subject: Add reverse CodePoint iterator --- src/code_point.zig | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/code_point.zig b/src/code_point.zig index fe7ad6e..a319d36 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -233,6 +233,45 @@ const class_mask: [12]u8 = .{ 0, }; +pub const ReverseIterator = struct { + bytes: []const u8, + i: ?u32, + + pub fn init(str: []const u8) ReverseIterator { + var r_iter: ReverseIterator = undefined; + r_iter.bytes = str; + r_iter.i = if (str.len == 0) 0 else @intCast(str.len - 1); + return r_iter; + } + + pub fn prev(iter: *ReverseIterator) ?CodePoint { + if (iter.i == null) return null; + var i_prev = iter.i.?; + + while (i_prev > 0) : (i_prev -= 1) { + if (!followbyte(iter.bytes[i_prev])) break; + if (i_prev == 0) break; + } + + if (i_prev > 0) + iter.i = i_prev - 1 + else + iter.i = null; + + return decode(iter.bytes[i_prev..], i_prev); + } + + pub fn peek(iter: *ReverseIterator) ?CodePoint { + const saved_i = iter.i; + defer iter.i = saved_i; + return iter.prev(); + } +}; + +inline fn followbyte(b: u8) bool { + return 0x80 <= b and b <= 0xbf; +} + test "decode" { const bytes = "🌩️"; const res = decode(bytes, 0); @@ -246,7 +285,7 @@ test "decode" { } } -test "peek" { +test Iterator { var iter = Iterator{ .bytes = "Hi" }; try expectEqual(@as(u21, 'H'), iter.next().?.code); @@ -346,6 +385,33 @@ test "truncation" { } } +test ReverseIterator { + { + var r_iter: ReverseIterator = .init("ABC"); + try testing.expectEqual(@as(u21, 'C'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'B'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, 'B'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'A'), r_iter.prev().?.code); + try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + } + { + var r_iter: ReverseIterator = .init("∅δq🦾ă"); + try testing.expectEqual(@as(u21, 'ă'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '🦾'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'q'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'δ'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, 'δ'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, '∅'), r_iter.prev().?.code); + try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + } +} + const std = @import("std"); const testing = std.testing; const expect = testing.expect; -- cgit v1.2.3 From 7e4cc4641e966a1aa32fa021a9476af3b9813485 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 13:38:28 -0400 Subject: Various small iterator improvements --- src/code_point.zig | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/code_point.zig b/src/code_point.zig index a319d36..b3c9aea 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -48,12 +48,22 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { }; // Multibyte +<<<<<<< HEAD // Second: var class: u4 = @intCast(u8dfa[byte]); var st: u32 = state_dfa[class]; if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); // First one is never a truncation +||||||| parent of ad4b046 (Various small iterator improvements) + // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte + if (cp.len > bytes.len) { + // Unicode replacement code point. +======= + // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte. + if (cp.len > bytes.len) { + // Unicode replacement code point. +>>>>>>> ad4b046 (Various small iterator improvements) return .{ .code = 0xfffd, .len = 1, @@ -158,10 +168,19 @@ pub const Iterator = struct { return decodeAtCursor(self.bytes, &self.i); } - pub fn peek(self: *Iterator) ?CodePoint { - const saved_i = self.i; - defer self.i = saved_i; - return self.next(); + pub fn peek(iter: *Iterator) ?CodePoint { + const saved_i = iter.i; + defer iter.i = saved_i; + return iter.next(); + } + + /// Create a backward iterator at this point. It will repeat + /// the last CodePoint seen. + pub fn reverseIterator(iter: *Iterator) ReverseIterator { + if (iter.i == iter.bytes.len) { + return .init(iter.bytes); + } + return .{ .i = iter.i, .bytes = iter.bytes }; } }; @@ -266,6 +285,17 @@ pub const ReverseIterator = struct { defer iter.i = saved_i; return iter.prev(); } + + /// Create a forward iterator at this point. It will repeat the + /// last CodePoint seen. + pub fn forwardIterator(iter: *ReverseIterator) Iterator { + if (iter.i) |i| { + var fwd: Iterator = .{ .i = i, .bytes = iter.bytes }; + _ = fwd.next(); + return fwd; + } + return .{ .i = 0, .bytes = iter.bytes }; + } }; inline fn followbyte(b: u8) bool { @@ -410,6 +440,23 @@ test ReverseIterator { try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); } + { + var r_iter: ReverseIterator = .init("123"); + try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '1'), r_iter.prev().?.code); + var iter = r_iter.forwardIterator(); + try testing.expectEqual(@as(u21, '1'), iter.next().?.code); + try testing.expectEqual(@as(u21, '2'), iter.next().?.code); + try testing.expectEqual(@as(u21, '3'), iter.next().?.code); + r_iter = iter.reverseIterator(); + try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + iter = r_iter.forwardIterator(); + r_iter = iter.reverseIterator(); + try testing.expectEqual(@as(u21, '2'), iter.next().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + } } const std = @import("std"); -- cgit v1.2.3 From e3dbcc70688321e48ac31599105c51edac2736af Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 11 May 2025 16:30:47 -0400 Subject: Add WordBreakPropertyData Passes some simple lookup tests. --- src/WordBreak.zig | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 src/WordBreak.zig (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig new file mode 100644 index 0000000..9044740 --- /dev/null +++ b/src/WordBreak.zig @@ -0,0 +1,102 @@ +//! Word Breaking Algorithm. + +const WordBreakProperty = enum(u5) { + none, + Double_Quote, + Single_Quote, + Hebrew_Letter, + CR, + LF, + Newline, + Extend, + Regional_Indicator, + Format, + Katakana, + ALetter, + MidLetter, + MidNum, + MidNumLet, + Numeric, + ExtendNumLet, + ZWJ, + WSegSpace, +}; + +s1: []u16 = undefined, +s2: []u5 = undefined, + +const WordBreak = @This(); + +pub fn init(allocator: Allocator) Allocator.Error!WordBreak { + var wb: WordBreak = undefined; + try wb.setup(allocator); + return wb; +} + +pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { + wb.setupImpl(allocator) catch |err| { + switch (err) { + error.OutOfMemory => |e| return e, + else => unreachable, + } + }; +} + +inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("wbp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + const stage_1_len: u16 = try reader.readInt(u16, endian); + wb.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(wb.s1); + for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); + + const stage_2_len: u16 = try reader.readInt(u16, endian); + wb.s2 = try allocator.alloc(u5, stage_2_len); + errdefer allocator.free(wb.s2); + for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); + var count_0: usize = 0; + for (wb.s2) |nyb| { + if (nyb == 0) count_0 += 1; + } +} + +pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { + allocator.free(wordbreak.s1); + allocator.free(wordbreak.s2); +} + +/// Returns the word break property type for `cp`. +pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { + return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); +} + +test "Word Break Properties" { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + try testing.expectEqual(.CR, wb.breakProperty('\r')); + try testing.expectEqual(.LF, wb.breakProperty('\n')); + try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); + try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); +} + +fn testAllocations(allocator: Allocator) !void { + const wb = try WordBreak.init(allocator); + wb.deinit(allocator); +} + +test "allocation safety" { + try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); +} + +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const Allocator = mem.Allocator; +const testing = std.testing; -- cgit v1.2.3 From 470e896483300d099c7650f9cd8a13e236c63864 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 11 May 2025 17:26:50 -0400 Subject: Refactor in unicode_tests The comments in WordBreak and SentenceBreak tests get really long, the provided buffer would be inadequate. So this just provides a sub- iterator which will strip comments and comment lines, while keeping an eye on line numbers for any debugging. --- src/CaseFolding.zig | 8 +++--- src/unicode_tests.zig | 77 ++++++++++++++++++++++++++++++++------------------- 2 files changed, 53 insertions(+), 32 deletions(-) (limited to 'src') diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig index f63b860..ff41b3e 100644 --- a/src/CaseFolding.zig +++ b/src/CaseFolding.zig @@ -300,13 +300,13 @@ fn testAllocations(allocator: Allocator) !void { { const normalize = try Normalize.init(allocator); defer normalize.deinit(allocator); - const caser1 = try CaseFolding.initWithNormalize(allocator, normalize); - defer caser1.deinit(allocator); + const caser = try CaseFolding.initWithNormalize(allocator, normalize); + defer caser.deinit(allocator); } // With normalize owned { - const caser2 = try CaseFolding.init(allocator); - defer caser2.deinit(allocator); + const caser = try CaseFolding.init(allocator); + defer caser.deinit(allocator); } } diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 2249007..ee259a3 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -1,17 +1,4 @@ -const std = @import("std"); -const fmt = std.fmt; -const fs = std.fs; -const io = std.io; -const heap = std.heap; -const mem = std.mem; -const testing = std.testing; -const unicode = std.unicode; - -const grapheme = @import("Graphemes"); -const Grapheme = @import("Graphemes").Grapheme; -const Graphemes = @import("Graphemes"); -const GraphemeIterator = @import("Graphemes").Iterator; -const Normalize = @import("Normalize"); +const dbg_print = false; comptime { testing.refAllDecls(grapheme); @@ -50,16 +37,14 @@ test "Unicode normalization tests" { var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); defer file.close(); var buf_reader = io.bufferedReader(file.reader()); - const input_stream = buf_reader.reader(); + var input_stream = buf_reader.reader(); - var line_no: usize = 0; var buf: [4096]u8 = undefined; var cp_buf: [4]u8 = undefined; - while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { - line_no += 1; - // Skip comments or empty lines. - if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; + var line_iter: IterRead = .{ .read = &input_stream }; + + while (try line_iter.next(&buf)) |line| { // Iterate over fields. var fields = mem.splitScalar(u8, line, ';'); var field_index: usize = 0; @@ -80,7 +65,7 @@ test "Unicode normalization tests" { input = try i_buf.toOwnedSlice(); } else if (field_index == 1) { - //debug.print("\n*** {s} ***\n", .{line}); + if (dbg_print) debug.print("\n*** {s} ***\n", .{line}); // NFC, time to test. var w_buf = std.ArrayList(u8).init(allocator); defer w_buf.deinit(); @@ -166,16 +151,15 @@ test "Segmentation GraphemeIterator" { defer data.deinit(allocator); var buf: [4096]u8 = undefined; - var line_no: usize = 1; + var line_iter: IterRead = .{ .read = &input_stream }; - while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { + while (try line_iter.next(&buf)) |raw| { // Skip comments or empty lines. - if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; - + // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; // Clean up. var line = std.mem.trimLeft(u8, raw, "÷ "); - if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { - line = line[0..octo]; + if (std.mem.indexOf(u8, line, " ÷\t")) |final| { + line = line[0..final]; } // Iterate over fields. var want = std.ArrayList(Grapheme).init(allocator); @@ -206,7 +190,6 @@ test "Segmentation GraphemeIterator" { bytes_index += cp_index; } - // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); var iter = data.iterator(all_bytes.items); // Check. @@ -219,3 +202,41 @@ test "Segmentation GraphemeIterator" { } } } + +const IterRead = struct { + read: *Reader, + line: usize = 0, + + pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 { + defer iter.line += 1; + const maybe_line = try iter.read.readUntilDelimiterOrEof(buf, '#'); + if (maybe_line) |this_line| { + try iter.read.skipUntilDelimiterOrEof('\n'); + if (this_line.len == 0 or this_line[0] == '@') { + // comment, next line + return iter.next(buf); + } else { + return this_line; + } + } else { + return null; + } + } +}; + +const std = @import("std"); +const fmt = std.fmt; +const fs = std.fs; +const io = std.io; +const Reader = io.BufferedReader(4096, fs.File.Reader).Reader; +const heap = std.heap; +const mem = std.mem; +const debug = std.debug; +const testing = std.testing; +const unicode = std.unicode; + +const grapheme = @import("Graphemes"); +const Grapheme = @import("Graphemes").Grapheme; +const Graphemes = @import("Graphemes"); +const GraphemeIterator = @import("Graphemes").Iterator; +const Normalize = @import("Normalize"); -- cgit v1.2.3 From 04123c2280088acbe4501bbe4c314ca64ff27dab Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 12:57:04 -0400 Subject: Vastly simplify peek() Idiomatic Zig takes awhile, what can I say (yes I wrote the first one). --- src/Graphemes.zig | 63 +++---------------------------------------------------- 1 file changed, 3 insertions(+), 60 deletions(-) (limited to 'src') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 7bf328a..1ce1ea6 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -99,7 +99,7 @@ pub const Gbp = enum { /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. pub const Grapheme = struct { - len: u8, + len: u32, offset: u32, /// `bytes` returns the slice of bytes that correspond to @@ -173,69 +173,12 @@ pub const Iterator = struct { const saved_cp_iter = self.cp_iter; const s0 = self.buf[0]; const s1 = self.buf[1]; - - self.advance(); - - // If no more - if (self.buf[0] == null) { - self.cp_iter = saved_cp_iter; - self.buf[0] = s0; - self.buf[1] = s1; - return null; - } - // If last one - if (self.buf[1] == null) { - const len = self.buf[0].?.len; - const offset = self.buf[0].?.offset; + defer { self.cp_iter = saved_cp_iter; self.buf[0] = s0; self.buf[1] = s1; - return Grapheme{ .len = len, .offset = offset }; } - // If ASCII - if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) { - const len = self.buf[0].?.len; - const offset = self.buf[0].?.offset; - self.cp_iter = saved_cp_iter; - self.buf[0] = s0; - self.buf[1] = s1; - return Grapheme{ .len = len, .offset = offset }; - } - - const gc_start = self.buf[0].?.offset; - var gc_len: u8 = self.buf[0].?.len; - var state = State{}; - - if (graphemeBreak( - self.buf[0].?.code, - self.buf[1].?.code, - self.data, - &state, - )) { - self.cp_iter = saved_cp_iter; - self.buf[0] = s0; - self.buf[1] = s1; - return Grapheme{ .len = gc_len, .offset = gc_start }; - } - - while (true) { - self.advance(); - if (self.buf[0] == null) break; - - gc_len += self.buf[0].?.len; - - if (graphemeBreak( - self.buf[0].?.code, - if (self.buf[1]) |ncp| ncp.code else 0, - self.data, - &state, - )) break; - } - self.cp_iter = saved_cp_iter; - self.buf[0] = s0; - self.buf[1] = s1; - - return Grapheme{ .len = gc_len, .offset = gc_start }; + return self.next(); } }; -- cgit v1.2.3 From a832b90bc00503a201bf457df1f49dc338314e7b Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 12:57:49 -0400 Subject: Implement Word iterator A by-the-book implmentation of the word break rules from tr29. This is superficially inefficient, but compilers are more than able to handle the common subexpression folding ignored by this approach. Now to port the WordBreakPropertyTests, and clean up the inevitable bugs in the implementation. --- src/WordBreak.zig | 228 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 9044740..84fd1f7 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -71,11 +71,234 @@ pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { allocator.free(wordbreak.s2); } +/// Represents a Unicode word span, as an offset into the source string +/// and the length of the word. +pub const Word = struct { + offset: u32, + len: u32, + + /// Returns a slice of the word given the source string. + pub fn bytes(self: Word, src: []const u8) []const u8 { + return src[self.offset..][0..self.len]; + } +}; + /// Returns the word break property type for `cp`. pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); } +const IterState = packed struct { + mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter + mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric + quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter + regional: bool, // [^RI] (RI RI)* RI × RI + + pub const initial: IterState = .{ + .mid_punct = false, + .mid_num = false, + .quote_heb = false, + .regional = false, + }; +}; + +pub const Iterator = struct { + this: ?CodePoint = null, + that: ?CodePoint = null, + cache: ?WordBreakProperty = null, + cp_iter: CodepointIterator, + wb: *const WordBreak, + + /// Assumes `str` is valid UTF-8. + pub fn init(wb: *const WordBreak, str: []const u8) Iterator { + var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; + wb_iter.advance(); + return wb; + } + + pub fn next(iter: *Iterator) ?Word { + iter.advance(); + + // Done? + if (iter.this == null) return null; + // Last? + if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; + + const word_start = iter.this.?.offset; + var word_len: u32 = 0; + + var state: IterState = .initial; + + scan: while (true) : (iter.advance()) { + const this = iter.this.?; + word_len += this.len; + if (iter.that) |that| { + const that_p = iter.wb.breakProperty(that.code); + const this_p = this_p: { + if (!isIgnorable(that_p) and iter.cache != null) { + defer iter.cache = null; + break :this_p iter.cache.?; + } else { + break :this_p iter.wb.breakProperty(this.code); + } + }; + // WB3 CR × LF + if (this_p == .CR and that_p == .LF) continue :scan; + // WB3a (Newline | CR | LF) ÷ + if (isNewline(this_p)) break :scan; + // WB3b ÷ (Newline | CR | LF) + if (isNewline(that_p)) break :scan; + // WB3c ZWJ × \p{Extended_Pictographic} + // The right way to do this one is a RuneSet, TODO: circle back + // WB3d WSegSpace × WSegSpace + if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; + // WB4 X (Extend | Format | ZWJ)* → X + if (isIgnorable(that_p)) { + if (iter.cache == null) { + iter.cache = this_p; + } + continue :scan; + } + if (isAHLetter(this_p)) { + // WB5 AHLetter × AHLetter + if (isAHLetter(that_p)) continue :scan; + // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter + if (isMidVal(that_p)) { + const next_val = iter.cp_iter.peek(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProperty(next_cp.code); + if (isAHLetter(next_p)) { + state.mid_punct = true; + continue :scan; + } + } else break :scan; + } + } + // AHLetter (MidLetter | MidNumLetQ) × AHLetter + if (state.mid_punct) { + // Should always be true: + assert(isMidVal(this_p)); + assert(isAHLetter(that_p)); + state.mid_punct = false; + continue :scan; + } + if (this_p == .Hebrew_Letter) { + // WB7a Hebrew_Letter × Single_Quote + if (that_p == .Single_Quote) continue :scan; + // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter + if (that_p == .Double_Quote) { + const next_val = iter.cp_iter.peek(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProperty(next_cp.code); + if (next_p == .Hebrew_Letter) { + state.quote_heb = true; + continue :scan; + } + } else break :scan; + } + } + // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter + if (state.quote_heb) { + // Should always be true: + assert(this_p == .Double_Quote); + assert(that_p == .Hebrew_Letter); + state.quote_heb = false; + continue :scan; + } + // WB8 Numeric × Numeric + if (this_p == .Numeric and that_p == .Numeric) continue :scan; + // WB9 AHLetter × Numeric + if (isAHLetter(this_p) and that_p == .Numeric) continue :scan; + // WB10 Numeric × AHLetter + if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; + // WB12 Numeric × (MidNum | MidNumLetQ) Numeric + if (this_p == .Numeric and isMidVal(that_p)) { + const next_val = iter.cp_iter.peek(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProperty(next_cp.code); + if (next_p == .Numeric) { + state.mid_num = true; + continue :scan; + } + } else break :scan; + } + // WB11 Numeric (MidNum | MidNumLetQ) × Numeric + if (state.mid_num) { + assert(isMidVal(this_p)); + assert(that_p == .Numeric); + state.mid_num = false; + continue :scan; + } + // WB13 Katakana × Katakana + if (this_p == .Katakana and that_p == .Katakana) continue :scan; + // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet + if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; + // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) + if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; + // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI + if (that_p == .Regional_Indicator) { + if (this_p == .Regional_Indicator) { + if (state.regional) { + state.regional = false; + continue :scan; + } else { + break :scan; + } + } else { + const next_val = iter.cp_iter.peek(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProperty(next_cp.code); + if (next_p == .Regional_Indicator) { + state.regional = true; + continue :scan; + } + } else break :scan; + } + } + // WB999 Any ÷ Any + break :scan; + } else { // iter.that == null + break :scan; + } + } + + return Word{ .len = word_len, .offset = word_start }; + } + + fn advance(wb_iter: *Iterator) void { + wb_iter.this = wb_iter.that; + wb_iter.that = wb_iter.cp_iter.next(); + } +}; + +//| Predicates + +inline fn isNewline(wbp: WordBreakProperty) bool { + return wbp == .CR or wbp == .LF or wbp == .Newline; +} + +inline fn isIgnorable(wbp: WordBreakProperty) bool { + return switch (wbp) { + .Format, .Extend, .ZWJ => true, + else => false, + }; +} + +inline fn isAHLetter(wbp: WordBreakProperty) bool { + return wbp == .ALetter or wbp == .Hebrew_Letter; +} + +inline fn isMidVal(wbp: WordBreakProperty) bool { + return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; +} + +inline fn isExtensible(wbp: WordBreakProperty) bool { + return switch (wbp) { + .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, + else => false, + }; +} + test "Word Break Properties" { const wb = try WordBreak.init(testing.allocator); defer wb.deinit(testing.allocator); @@ -99,4 +322,9 @@ const builtin = @import("builtin"); const compress = std.compress; const mem = std.mem; const Allocator = mem.Allocator; +const assert = std.debug.assert; const testing = std.testing; + +const code_point = @import("code_point"); +const CodepointIterator = code_point.Iterator; +const CodePoint = code_point.CodePoint; -- cgit v1.2.3 From cf8d8fe5d640511f6c4134fdaa36e930232ca7da Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 15:22:37 -0400 Subject: Begin conformance test I'm not sure the details of this strategy can actually be made to work. But, something can. --- src/Graphemes.zig | 22 ++++++ src/WordBreak.zig | 83 +++++++++++++------- src/code_point.zig | 5 ++ src/micro_runeset.zig | 207 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/unicode_tests.zig | 102 +++++++++++++++++-------- 5 files changed, 361 insertions(+), 58 deletions(-) create mode 100644 src/micro_runeset.zig (limited to 'src') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 1ce1ea6..1f67fc6 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -364,3 +364,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { try std.testing.expectEqual(@as(usize, 2), i); } } + +test "Iterator.peek" { + const peek_seq = "aΔ👨🏻‍🌾→"; + const data = try Graphemes.init(std.testing.allocator); + defer data.deinit(std.testing.allocator); + + var iter = data.iterator(peek_seq); + const peek_a = iter.peek().?; + const next_a = iter.next().?; + try std.testing.expectEqual(peek_a, next_a); + try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq)); + const peek_d1 = iter.peek().?; + const peek_d2 = iter.peek().?; + try std.testing.expectEqual(peek_d1, peek_d2); + const next_d = iter.next().?; + try std.testing.expectEqual(peek_d2, next_d); + try std.testing.expectEqual(iter.peek(), iter.next()); + try std.testing.expectEqual(iter.peek(), iter.next()); + try std.testing.expectEqual(null, iter.peek()); + try std.testing.expectEqual(null, iter.peek()); + try std.testing.expectEqual(iter.peek(), iter.next()); +} diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 84fd1f7..53db76b 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -88,6 +88,11 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); } +/// Returns an iterator over words in `slice` +pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { + return Iterator.init(wordbreak, slice); +} + const IterState = packed struct { mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric @@ -113,7 +118,7 @@ pub const Iterator = struct { pub fn init(wb: *const WordBreak, str: []const u8) Iterator { var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; wb_iter.advance(); - return wb; + return wb_iter; } pub fn next(iter: *Iterator) ?Word { @@ -132,12 +137,18 @@ pub const Iterator = struct { scan: while (true) : (iter.advance()) { const this = iter.this.?; word_len += this.len; + var ignored = false; if (iter.that) |that| { const that_p = iter.wb.breakProperty(that.code); const this_p = this_p: { if (!isIgnorable(that_p) and iter.cache != null) { + // TODO: might not need these what with peekPast + ignored = true; defer iter.cache = null; - break :this_p iter.cache.?; + // Fixup some state, apply pre-4 rules + const restore = iter.cache.?; + if (restore == .WSegSpace) break :this_p .none; + break :this_p restore; } else { break :this_p iter.wb.breakProperty(this.code); } @@ -149,11 +160,22 @@ pub const Iterator = struct { // WB3b ÷ (Newline | CR | LF) if (isNewline(that_p)) break :scan; // WB3c ZWJ × \p{Extended_Pictographic} - // The right way to do this one is a RuneSet, TODO: circle back + if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { + // Invalid after ignoring + if (ignored) break :scan else continue :scan; + } // WB3d WSegSpace × WSegSpace if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; // WB4 X (Extend | Format | ZWJ)* → X if (isIgnorable(that_p)) { + if (that_p == .ZWJ) { + const next_val = iter.peekPast(); + if (next_val) |next_cp| { + if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) { + continue :scan; + } + } + } if (iter.cache == null) { iter.cache = this_p; } @@ -164,14 +186,14 @@ pub const Iterator = struct { if (isAHLetter(that_p)) continue :scan; // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter if (isMidVal(that_p)) { - const next_val = iter.cp_iter.peek(); + const next_val = iter.peekPast(); if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (isAHLetter(next_p)) { state.mid_punct = true; continue :scan; } - } else break :scan; + } } } // AHLetter (MidLetter | MidNumLetQ) × AHLetter @@ -187,7 +209,7 @@ pub const Iterator = struct { if (that_p == .Single_Quote) continue :scan; // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter if (that_p == .Double_Quote) { - const next_val = iter.cp_iter.peek(); + const next_val = iter.peekPast(); if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (next_p == .Hebrew_Letter) { @@ -212,8 +234,8 @@ pub const Iterator = struct { // WB10 Numeric × AHLetter if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; // WB12 Numeric × (MidNum | MidNumLetQ) Numeric - if (this_p == .Numeric and isMidVal(that_p)) { - const next_val = iter.cp_iter.peek(); + if (this_p == .Numeric and isMidNum(that_p)) { + const next_val = iter.peekPast(); if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (next_p == .Numeric) { @@ -224,7 +246,7 @@ pub const Iterator = struct { } // WB11 Numeric (MidNum | MidNumLetQ) × Numeric if (state.mid_num) { - assert(isMidVal(this_p)); + assert(isMidNum(this_p)); assert(that_p == .Numeric); state.mid_num = false; continue :scan; @@ -235,25 +257,18 @@ pub const Iterator = struct { if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; - // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI - if (that_p == .Regional_Indicator) { - if (this_p == .Regional_Indicator) { - if (state.regional) { + // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI + if (this_p == .Regional_Indicator) { + if (that_p == .Regional_Indicator) { + if (state.regional == true or this.offset == 0) { state.regional = false; continue :scan; - } else { - break :scan; } } else { - const next_val = iter.cp_iter.peek(); - if (next_val) |next_cp| { - const next_p = iter.wb.breakProperty(next_cp.code); - if (next_p == .Regional_Indicator) { - state.regional = true; - continue :scan; - } - } else break :scan; + state.regional = true; } + } else if (that_p == .Regional_Indicator) { + state.regional = true; } // WB999 Any ÷ Any break :scan; @@ -265,9 +280,19 @@ pub const Iterator = struct { return Word{ .len = word_len, .offset = word_start }; } - fn advance(wb_iter: *Iterator) void { - wb_iter.this = wb_iter.that; - wb_iter.that = wb_iter.cp_iter.next(); + fn advance(iter: *Iterator) void { + iter.this = iter.that; + iter.that = iter.cp_iter.next(); + } + + fn peekPast(iter: *Iterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked; + _ = iter.cp_iter.next(); + } + return null; } }; @@ -292,6 +317,10 @@ inline fn isMidVal(wbp: WordBreakProperty) bool { return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; } +inline fn isMidNum(wbp: WordBreakProperty) bool { + return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; +} + inline fn isExtensible(wbp: WordBreakProperty) bool { return switch (wbp) { .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, @@ -328,3 +357,5 @@ const testing = std.testing; const code_point = @import("code_point"); const CodepointIterator = code_point.Iterator; const CodePoint = code_point.CodePoint; + +const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; diff --git a/src/code_point.zig b/src/code_point.zig index b3c9aea..79ee5cd 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -10,6 +10,11 @@ pub const CodePoint = struct { code: u21, len: u3, offset: u32, + + /// Return the slice of this codepoint, given the original string. + pub fn bytes(cp: CodePoint, str: []const u8) []const u8 { + return str[cp.offset..][0..cp.len]; + } }; /// This function is deprecated and will be removed in a later release. diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig new file mode 100644 index 0000000..34fbcd3 --- /dev/null +++ b/src/micro_runeset.zig @@ -0,0 +1,207 @@ +//! Minimal RuneSet implementation +//! +//! This is copied from the full RuneSet module, so that `zg` doesn't +//! depend on it. There's one spot in the WordBreak algorithm which +//! needs to identify the emoji Extended_Pictographic property, which +//! is not otherwise used in ZG. The Runeset is 89 words, while the +//! trie lookup used throughout ZG would be much larger. +//! +//! The RuneSet is borrowed from Runicode, which encodes Unicode things +//! in RuneSet form. This will need updating for each version of Unicode. + +pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x8000060, 0x10000000000000, 0x8001000200600000, 0x7800985090, 0x801022055ef2d, 0xedf57effffffdf57, 0xaffd75bd6f7d001f, 0xdbffffbbbff7ff7f, 0x7d7fddd76f56dfb5, 0x3800000000000001, 0x40040000000000, 0x4, 0x30bae0000008000, 0x100, 0x10004000000, 0x20001f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xfffffffffffffff7, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffbfff, 0x800000006000, 0x4001700000000000, 0xffffe00003fe4000, 0x1fffffffff, 0x73fc800004007ffa, 0xfffffffffffd7e00, 0xffffffffffffffff, 0x7fffffffffffffff, 0xffd56ff6bedfafff, 0x77ffffffffff7bff, 0xffffffff5757ffff, 0x3fafff77ff7bfef, 0xbffffdfffffab77f, 0xffffd7efffffffff, 0xff5fefffffffffff, 0xef6fd7ffffffffff, 0x1fffd7ffffefff7b, 0xfdfabf7ff7ffbac0, 0xf7faff77ffaf5dbf, 0x7dfbbf7eb7f6ffed, 0xfff7775fbfefdebf, 0x7fee, 0xbedddfddfbf7f7db, 0x6ebb6edf776b7bdf, 0x7ff0000000000000, 0x7fff77ff7fe00000, 0x7000, 0x7c007f00, 0xffffc00000007f00, 0x7fffffffffffffff, 0xb3fb7f7fbeff7000, 0x7ebef7ffbfff779f, 0x7dff5bebff7dffef, 0x7fffffbfffff7bfb, 0xffffffffffffffff, 0x6b777fffffffffff, 0xdbbf6effffdfbebb, 0x7ebf7f7fb5bf5fdb, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1fffffffffffffff } }; + +// Meaningful names for the T1 slots +const LOW = 0; +const HI = 1; +const LEAD = 2; +const T4_OFF = 3; + +/// Minimum Viable Runeset. Must be statically created, strictly boolean matching. +pub const RuneSet = struct { + body: []const u64, + + // Returns whether the slice is a match. This assumes the validity of the + // string, which can be ensured by, in particular, deriving it from a CodePoint. + pub fn isMatch(runeset: RuneSet, str: []const u8) bool { + const set = runeset.body; + const a = codeunit(str[0]); + switch (a.kind) { + .follow => return false, + .low => { + const mask = toMask(set[LOW]); + if (mask.isIn(a)) + return true + else + return false; + }, + .hi => { + const mask = toMask(set[HI]); + if (mask.isIn(a)) + return true + else + return false; + }, + .lead => { + const nB = a.nMultiBytes().?; + const a_mask = toMask(set[LEAD]); + if (!a_mask.isIn(a)) return false; + const b = codeunit(str[1]); + const b_loc = 4 + a_mask.lowerThan(a).?; + const b_mask = toMask(set[b_loc]); + if (!b_mask.isIn(b)) return false; + if (nB == 2) return true; + const t3_off = 4 + @popCount(set[LEAD]); + const c = codeunit(str[2]); + // Slice is safe because we know the T2 span has at least one word. + const c_off = b_mask.higherThan(b).? + popCountSlice(set[b_loc + 1 .. t3_off]); + const c_loc = t3_off + c_off; + const c_mask = toMask(set[c_loc]); + if (!c_mask.isIn(c)) return false; + if (nB == 3) return true; + const d_off = c_mask.lowerThan(c).? + popCountSlice(set[t3_off..c_loc]); + const d_loc = set[T4_OFF] + d_off; + const d = codeunit(str[3]); + const d_mask = toMask(set[d_loc]); + if (d_mask.isIn(d)) return true else return false; + }, + } + } +}; + +/// Kinds of most significant bits in UTF-8 +const RuneKind = enum(u2) { + low, + hi, + follow, + lead, +}; + +/// Packed `u8` struct representing one codeunit of UTF-8. +const CodeUnit = packed struct(u8) { + body: u6, + kind: RuneKind, + + /// Mask to check presence + pub inline fn inMask(self: *const CodeUnit) u64 { + return @as(u64, 1) << self.body; + } + + // TODO consider an nMultiBytesFast, for the cases where we + // know that invalid lead bytes are never present (such as in set) + // operations, where we may assume that (and will assert that) the + // LEAD mask contains no such bytes. + + /// Number of bytes in known multi-byte rune. + /// + /// Caller guarantees that the CodeUnit is a lead byte + /// of a multi-byte rune: `cu.kind == .lead`. + /// + /// Invalid lead bytes will return null. + pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 { + return switch (self.body) { + // 0 and 1 are invalid for overlong reasons, + // but RuneSet supports overlong encodings + 0...31 => 2, + 32...47 => 3, + 48...55 => 4, + // Wasted space 56...61 is due entirely to Microsoft's + // lack of vision and insistence on a substandard + // and utterly inadequate encoding for Unicode + // "64k should be enough for anyone" + 56...63 => null, + }; + } + + /// Given a valid lead byte, return the number of bytes that should + /// make up the code unit sequence. Will return `null` if the lead + /// byte is invalid. + pub inline fn nBytes(self: *const CodeUnit) ?u8 { + switch (self.kind) { + .low, .hi => return 1, + .lead => return self.nMultiBytes(), + .follow => return null, + } + } + + /// Mask off all bits >= cu.body + pub inline fn hiMask(self: *const CodeUnit) u64 { + return (@as(u64, 1) << self.body) - 1; + } + + /// Mask off all bits <= cu.body + pub inline fn lowMask(self: *const CodeUnit) u64 { + if (self.body == 63) + return 0 + else + return ~((@as(u64, 1) << (self.body + 1)) - 1); + } + + /// Cast the `CodeUnit` to its backing `u8`. + pub inline fn byte(self: *const CodeUnit) u8 { + return @bitCast(self.*); + } +}; + +/// Cast raw byte to CodeUnit +inline fn codeunit(b: u8) CodeUnit { + return @bitCast(b); +} + +inline fn toMask(w: u64) Mask { + return Mask.toMask(w); +} + +/// Bitmask for runesets +/// +/// We define our own bitset, because the operations we need to +/// perform only overlap with IntegerBitSet for trivial one-liners, +/// and furthermore, we need nondestructive versions of the basic +/// operations, which aren't a part of the IntegerBitSet interface. +/// +/// Note that Masks do not track which kind of byte they apply to, +/// since they will be stored as ordinary u64s. User code must +/// ensure that CodeUnits tested against a Mask are of the appropriate +/// type, and otherwise valid for the test performed. +/// +const Mask = struct { + m: u64, + + pub fn toMask(w: u64) Mask { + return Mask{ .m = w }; + } + + /// Test if a CodeUnit's low bytes are present in mask + pub inline fn isIn(self: Mask, cu: CodeUnit) bool { + return self.m | cu.inMask() == self.m; + } + + /// Return number of bytes lower than cu.body in mask, + /// if cu inhabits the mask. Otherwise return null. + pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 { + if (self.isIn(cu)) { + const m = cu.hiMask(); + return @popCount(self.m & m); + } else { + return null; + } + } + + /// Return number of bytes higher than cu.body in mask, + /// if cu inhabits the mask. Otherwise return null. + pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 { + if (self.isIn(cu)) { + const m = cu.lowMask(); + return @popCount(self.m & m); + } else { + return null; + } + } +}; + +/// Sum of @popCount of all words in region. +inline fn popCountSlice(region: []const u64) usize { + var ct: usize = 0; + for (region) |w| ct += @popCount(w); + return ct; +} diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index ee259a3..7ce2b4e 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -1,31 +1,5 @@ const dbg_print = false; -comptime { - testing.refAllDecls(grapheme); -} - -test "Iterator.peek" { - const peek_seq = "aΔ👨🏻‍🌾→"; - const data = try Graphemes.init(std.testing.allocator); - defer data.deinit(std.testing.allocator); - - var iter = data.iterator(peek_seq); - const peek_a = iter.peek().?; - const next_a = iter.next().?; - try std.testing.expectEqual(peek_a, next_a); - try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq)); - const peek_d1 = iter.peek().?; - const peek_d2 = iter.peek().?; - try std.testing.expectEqual(peek_d1, peek_d2); - const next_d = iter.next().?; - try std.testing.expectEqual(peek_d2, next_d); - try std.testing.expectEqual(iter.peek(), iter.next()); - try std.testing.expectEqual(iter.peek(), iter.next()); - try std.testing.expectEqual(null, iter.peek()); - try std.testing.expectEqual(null, iter.peek()); - try std.testing.expectEqual(iter.peek(), iter.next()); -} - test "Unicode normalization tests" { var arena = heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -147,15 +121,13 @@ test "Segmentation GraphemeIterator" { var buf_reader = std.io.bufferedReader(file.reader()); var input_stream = buf_reader.reader(); - const data = try Graphemes.init(allocator); - defer data.deinit(allocator); + const graph = try Graphemes.init(allocator); + defer graph.deinit(allocator); var buf: [4096]u8 = undefined; var line_iter: IterRead = .{ .read = &input_stream }; while (try line_iter.next(&buf)) |raw| { - // Skip comments or empty lines. - // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; // Clean up. var line = std.mem.trimLeft(u8, raw, "÷ "); if (std.mem.indexOf(u8, line, " ÷\t")) |final| { @@ -190,7 +162,7 @@ test "Segmentation GraphemeIterator" { bytes_index += cp_index; } - var iter = data.iterator(all_bytes.items); + var iter = graph.iterator(all_bytes.items); // Check. for (want.items) |want_gc| { @@ -203,6 +175,71 @@ test "Segmentation GraphemeIterator" { } } +test "Segmentation Word Iterator" { + const allocator = std.testing.allocator; + var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); + defer file.close(); + var buf_reader = std.io.bufferedReader(file.reader()); + var input_stream = buf_reader.reader(); + + const wb = try WordBreak.init(allocator); + defer wb.deinit(allocator); + + var buf: [4096]u8 = undefined; + var line_iter: IterRead = .{ .read = &input_stream }; + + while (try line_iter.next(&buf)) |raw| { + // Clean up. + var line = std.mem.trimLeft(u8, raw, "÷ "); + if (std.mem.indexOf(u8, line, " ÷\t")) |final| { + line = line[0..final]; + } + // Iterate over fields. + var want = std.ArrayList(Grapheme).init(allocator); + defer want.deinit(); + + var all_bytes = std.ArrayList(u8).init(allocator); + defer all_bytes.deinit(); + + var words = std.mem.splitSequence(u8, line, " ÷ "); + var bytes_index: u32 = 0; + + while (words.next()) |field| { + var code_points = std.mem.splitScalar(u8, field, ' '); + var cp_buf: [4]u8 = undefined; + var cp_index: u32 = 0; + var gc_len: u8 = 0; + + while (code_points.next()) |code_point| { + if (std.mem.eql(u8, code_point, "×")) continue; + const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); + const len = try unicode.utf8Encode(cp, &cp_buf); + try all_bytes.appendSlice(cp_buf[0..len]); + cp_index += len; + gc_len += len; + } + + try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); + bytes_index += cp_index; + } + + var iter = wb.iterator(all_bytes.items); + + // Check. + for (want.items, 1..) |want_word, i| { + const got_word = (iter.next()).?; + std.testing.expectEqualSlices( + u8, + want_word.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); + return err; + }; + } + } +} + const IterRead = struct { read: *Reader, line: usize = 0, @@ -235,8 +272,9 @@ const debug = std.debug; const testing = std.testing; const unicode = std.unicode; -const grapheme = @import("Graphemes"); const Grapheme = @import("Graphemes").Grapheme; const Graphemes = @import("Graphemes"); const GraphemeIterator = @import("Graphemes").Iterator; const Normalize = @import("Normalize"); + +const WordBreak = @import("WordBreak"); -- cgit v1.2.3 From a7f6990a8d433c6c8d34892a2126e94cdb31541f Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 18:10:02 -0400 Subject: Rewrite, passes WordBreakTest After fixing a bug in Runicode which was fenceposting codepoints off the end of ranges. As one does. --- src/WordBreak.zig | 111 +++++++++++++++++--------------------------------- src/micro_runeset.zig | 4 +- src/unicode_tests.zig | 3 +- 3 files changed, 40 insertions(+), 78 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 53db76b..a2be011 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -132,28 +132,21 @@ pub const Iterator = struct { const word_start = iter.this.?.offset; var word_len: u32 = 0; - var state: IterState = .initial; + // state variables + var last_p: WordBreakProperty = .none; + var last_last_p: WordBreakProperty = .none; + var ri_count: usize = 0; scan: while (true) : (iter.advance()) { const this = iter.this.?; word_len += this.len; - var ignored = false; if (iter.that) |that| { + const this_p = iter.wb.breakProperty(this.code); // WB3 CR × LF const that_p = iter.wb.breakProperty(that.code); - const this_p = this_p: { - if (!isIgnorable(that_p) and iter.cache != null) { - // TODO: might not need these what with peekPast - ignored = true; - defer iter.cache = null; - // Fixup some state, apply pre-4 rules - const restore = iter.cache.?; - if (restore == .WSegSpace) break :this_p .none; - break :this_p restore; - } else { - break :this_p iter.wb.breakProperty(this.code); - } - }; - // WB3 CR × LF + if (!isIgnorable(this_p)) { + last_last_p = last_p; + last_p = this_p; + } if (this_p == .CR and that_p == .LF) continue :scan; // WB3a (Newline | CR | LF) ÷ if (isNewline(this_p)) break :scan; @@ -161,27 +154,15 @@ pub const Iterator = struct { if (isNewline(that_p)) break :scan; // WB3c ZWJ × \p{Extended_Pictographic} if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { - // Invalid after ignoring - if (ignored) break :scan else continue :scan; + continue :scan; } // WB3d WSegSpace × WSegSpace if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; // WB4 X (Extend | Format | ZWJ)* → X if (isIgnorable(that_p)) { - if (that_p == .ZWJ) { - const next_val = iter.peekPast(); - if (next_val) |next_cp| { - if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) { - continue :scan; - } - } - } - if (iter.cache == null) { - iter.cache = this_p; - } continue :scan; - } - if (isAHLetter(this_p)) { + } // Now we use last_p instead of this_p for ignorable's sake + if (isAHLetter(last_p)) { // WB5 AHLetter × AHLetter if (isAHLetter(that_p)) continue :scan; // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter @@ -190,21 +171,16 @@ pub const Iterator = struct { if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (isAHLetter(next_p)) { - state.mid_punct = true; continue :scan; } } } } - // AHLetter (MidLetter | MidNumLetQ) × AHLetter - if (state.mid_punct) { - // Should always be true: - assert(isMidVal(this_p)); - assert(isAHLetter(that_p)); - state.mid_punct = false; + // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter + if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) { continue :scan; } - if (this_p == .Hebrew_Letter) { + if (last_p == .Hebrew_Letter) { // WB7a Hebrew_Letter × Single_Quote if (that_p == .Single_Quote) continue :scan; // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter @@ -213,62 +189,44 @@ pub const Iterator = struct { if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (next_p == .Hebrew_Letter) { - state.quote_heb = true; continue :scan; } - } else break :scan; + } } } // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter - if (state.quote_heb) { - // Should always be true: - assert(this_p == .Double_Quote); - assert(that_p == .Hebrew_Letter); - state.quote_heb = false; + if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter) continue :scan; - } // WB8 Numeric × Numeric - if (this_p == .Numeric and that_p == .Numeric) continue :scan; + if (last_p == .Numeric and that_p == .Numeric) continue :scan; // WB9 AHLetter × Numeric - if (isAHLetter(this_p) and that_p == .Numeric) continue :scan; + if (isAHLetter(last_p) and that_p == .Numeric) continue :scan; // WB10 Numeric × AHLetter - if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; + if (last_p == .Numeric and isAHLetter(that_p)) continue :scan; + // WB11 Numeric (MidNum | MidNumLetQ) × Numeric + if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric) + continue :scan; // WB12 Numeric × (MidNum | MidNumLetQ) Numeric - if (this_p == .Numeric and isMidNum(that_p)) { + if (last_p == .Numeric and isMidNum(that_p)) { const next_val = iter.peekPast(); if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (next_p == .Numeric) { - state.mid_num = true; continue :scan; } - } else break :scan; - } - // WB11 Numeric (MidNum | MidNumLetQ) × Numeric - if (state.mid_num) { - assert(isMidNum(this_p)); - assert(that_p == .Numeric); - state.mid_num = false; - continue :scan; + } } // WB13 Katakana × Katakana - if (this_p == .Katakana and that_p == .Katakana) continue :scan; + if (last_p == .Katakana and that_p == .Katakana) continue :scan; // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet - if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; + if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan; // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) - if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; + if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI - if (this_p == .Regional_Indicator) { - if (that_p == .Regional_Indicator) { - if (state.regional == true or this.offset == 0) { - state.regional = false; - continue :scan; - } - } else { - state.regional = true; - } - } else if (that_p == .Regional_Indicator) { - state.regional = true; + const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator; + if (maybe_flag) { + ri_count += 1; + if (ri_count % 2 == 1) continue :scan; } // WB999 Any ÷ Any break :scan; @@ -337,6 +295,11 @@ test "Word Break Properties" { try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); } +test "ext_pic" { + try testing.expect(ext_pict.isMatch("👇")); + try testing.expect(ext_pict.isMatch("\u{2704}")); +} + fn testAllocations(allocator: Allocator) !void { const wb = try WordBreak.init(allocator); wb.deinit(allocator); diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig index 34fbcd3..80ce4bf 100644 --- a/src/micro_runeset.zig +++ b/src/micro_runeset.zig @@ -9,7 +9,7 @@ //! The RuneSet is borrowed from Runicode, which encodes Unicode things //! in RuneSet form. This will need updating for each version of Unicode. -pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x8000060, 0x10000000000000, 0x8001000200600000, 0x7800985090, 0x801022055ef2d, 0xedf57effffffdf57, 0xaffd75bd6f7d001f, 0xdbffffbbbff7ff7f, 0x7d7fddd76f56dfb5, 0x3800000000000001, 0x40040000000000, 0x4, 0x30bae0000008000, 0x100, 0x10004000000, 0x20001f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xfffffffffffffff7, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffbfff, 0x800000006000, 0x4001700000000000, 0xffffe00003fe4000, 0x1fffffffff, 0x73fc800004007ffa, 0xfffffffffffd7e00, 0xffffffffffffffff, 0x7fffffffffffffff, 0xffd56ff6bedfafff, 0x77ffffffffff7bff, 0xffffffff5757ffff, 0x3fafff77ff7bfef, 0xbffffdfffffab77f, 0xffffd7efffffffff, 0xff5fefffffffffff, 0xef6fd7ffffffffff, 0x1fffd7ffffefff7b, 0xfdfabf7ff7ffbac0, 0xf7faff77ffaf5dbf, 0x7dfbbf7eb7f6ffed, 0xfff7775fbfefdebf, 0x7fee, 0xbedddfddfbf7f7db, 0x6ebb6edf776b7bdf, 0x7ff0000000000000, 0x7fff77ff7fe00000, 0x7000, 0x7c007f00, 0xffffc00000007f00, 0x7fffffffffffffff, 0xb3fb7f7fbeff7000, 0x7ebef7ffbfff779f, 0x7dff5bebff7dffef, 0x7fffffbfffff7bfb, 0xffffffffffffffff, 0x6b777fffffffffff, 0xdbbf6effffdfbebb, 0x7ebf7f7fb5bf5fdb, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1fffffffffffffff } }; +pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x180000e0, 0x30000000000000, 0x8001000200e00000, 0xf800b85090, 0x1801022057ff3f, 0xffffffffffffffff, 0xffffffffffff003f, 0xffffffffffffffff, 0xfffffffffff7ffbf, 0x7800000000000001, 0x400c0000000000, 0x4, 0x70ffe0000008000, 0x100, 0x1000c000000, 0x60003f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x80000000e000, 0xc003f00000000000, 0xffffe00007fe4000, 0x3fffffffff, 0xf7fc80000400fffe, 0xfffffffffffffe00, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7ffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff, 0xffffffffffffffc0, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xfff0000000000000, 0xffffffffffe00000, 0xf000, 0xfc00ff00, 0xffffc0000000ff00, 0xffffffffffffffff, 0xf7fffffffffff000, 0xffffffffffffffbf, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff } }; // Meaningful names for the T1 slots const LOW = 0; @@ -27,7 +27,7 @@ pub const RuneSet = struct { const set = runeset.body; const a = codeunit(str[0]); switch (a.kind) { - .follow => return false, + .follow => unreachable, .low => { const mask = toMask(set[LOW]); if (mask.isIn(a)) diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 7ce2b4e..59f0c6f 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -228,8 +228,7 @@ test "Segmentation Word Iterator" { // Check. for (want.items, 1..) |want_word, i| { const got_word = (iter.next()).?; - std.testing.expectEqualSlices( - u8, + std.testing.expectEqualStrings( want_word.bytes(all_bytes.items), got_word.bytes(all_bytes.items), ) catch |err| { -- cgit v1.2.3 From 49843d6dc6cde85fe670085b809d41db314ec47d Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 12:15:18 -0400 Subject: Add wordAtCursor This is not actually the way to do it, and can break on some crafted strings. The way to actually do it: implement a reverse word search iterator, then do next() to find a word break, prev() to find a _valid_ word start, then next() again to find the valid end of said word. Maybe 2+, 2-, 1+ actually. I can probably write a test to see if the cursor spot is ambiguous, and apply an extra round if so. Need to mull the rules over before making any rash moves. --- src/WordBreak.zig | 148 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 100 insertions(+), 48 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index a2be011..f0da30d 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -42,30 +42,6 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { }; } -inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { - const decompressor = compress.flate.inflate.decompressor; - const in_bytes = @embedFile("wbp"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = decompressor(.raw, in_fbs.reader()); - var reader = in_decomp.reader(); - - const endian = builtin.cpu.arch.endian(); - - const stage_1_len: u16 = try reader.readInt(u16, endian); - wb.s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(wb.s1); - for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); - - const stage_2_len: u16 = try reader.readInt(u16, endian); - wb.s2 = try allocator.alloc(u5, stage_2_len); - errdefer allocator.free(wb.s2); - for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); - var count_0: usize = 0; - for (wb.s2) |nyb| { - if (nyb == 0) count_0 += 1; - } -} - pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { allocator.free(wordbreak.s1); allocator.free(wordbreak.s2); @@ -88,29 +64,48 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); } +fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { + return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); +} + +/// Returns the Word at the given index. Asserts that the index is valid for +/// the provided string. Always returns a word. This algorithm is O(n^2) for +/// the size of the word before the cursor. +pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { + assert(index < string.len); + var i = index; + var iter1 = wordbreak.iterator(string[index..]); + var last_word = iter1.next(); + if (index == 0) return if (last_word) |word| word else Word{ .offset = 0, .len = 0 }; + var this_word: ?Word = undefined; + while (last_word) |l_word| : (i -= 1) { + var iter2 = wordbreak.iterator(string[i..]); + this_word = iter2.next(); + if (this_word) |t_word| { + if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])}); + if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])}); + // Check if we've captured a previous word + const t_end = t_word.offset + t_word.len + i - 1; + const l_start = l_word.offset + i + 1; + if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start }); + if (t_end < l_start) { + return .{ .offset = @intCast(l_start), .len = l_word.len }; + } + last_word = this_word; + } else unreachable; + if (i == 0) break; + } + return this_word.?; +} + /// Returns an iterator over words in `slice` pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { return Iterator.init(wordbreak, slice); } -const IterState = packed struct { - mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter - mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric - quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter - regional: bool, // [^RI] (RI RI)* RI × RI - - pub const initial: IterState = .{ - .mid_punct = false, - .mid_num = false, - .quote_heb = false, - .regional = false, - }; -}; - pub const Iterator = struct { this: ?CodePoint = null, that: ?CodePoint = null, - cache: ?WordBreakProperty = null, cp_iter: CodepointIterator, wb: *const WordBreak, @@ -121,6 +116,16 @@ pub const Iterator = struct { return wb_iter; } + /// Returns the next word segment, without advancing. + pub fn peek(iter: *Iterator) ?Word { + const cache = .{ iter.this, iter.that, iter.cp_iter }; + defer { + iter.this, iter.that, iter.cp_iter = cache; + } + return iter.next(); + } + + /// Returns the next word segment. pub fn next(iter: *Iterator) ?Word { iter.advance(); @@ -132,7 +137,7 @@ pub const Iterator = struct { const word_start = iter.this.?.offset; var word_len: u32 = 0; - // state variables + // State variables. var last_p: WordBreakProperty = .none; var last_last_p: WordBreakProperty = .none; var ri_count: usize = 0; @@ -141,12 +146,13 @@ pub const Iterator = struct { const this = iter.this.?; word_len += this.len; if (iter.that) |that| { - const this_p = iter.wb.breakProperty(this.code); // WB3 CR × LF - const that_p = iter.wb.breakProperty(that.code); + const this_p = iter.wb.breakProp(this); + const that_p = iter.wb.breakProp(that); if (!isIgnorable(this_p)) { last_last_p = last_p; last_p = this_p; } + // WB3 CR × LF if (this_p == .CR and that_p == .LF) continue :scan; // WB3a (Newline | CR | LF) ÷ if (isNewline(this_p)) break :scan; @@ -169,7 +175,7 @@ pub const Iterator = struct { if (isMidVal(that_p)) { const next_val = iter.peekPast(); if (next_val) |next_cp| { - const next_p = iter.wb.breakProperty(next_cp.code); + const next_p = iter.wb.breakProp(next_cp); if (isAHLetter(next_p)) { continue :scan; } @@ -187,7 +193,7 @@ pub const Iterator = struct { if (that_p == .Double_Quote) { const next_val = iter.peekPast(); if (next_val) |next_cp| { - const next_p = iter.wb.breakProperty(next_cp.code); + const next_p = iter.wb.breakProp(next_cp); if (next_p == .Hebrew_Letter) { continue :scan; } @@ -210,7 +216,7 @@ pub const Iterator = struct { if (last_p == .Numeric and isMidNum(that_p)) { const next_val = iter.peekPast(); if (next_val) |next_cp| { - const next_p = iter.wb.breakProperty(next_cp.code); + const next_p = iter.wb.breakProp(next_cp); if (next_p == .Numeric) { continue :scan; } @@ -247,13 +253,37 @@ pub const Iterator = struct { const save_cp = iter.cp_iter; defer iter.cp_iter = save_cp; while (iter.cp_iter.peek()) |peeked| { - if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked; + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; _ = iter.cp_iter.next(); } return null; } }; +inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("wbp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + const stage_1_len: u16 = try reader.readInt(u16, endian); + wb.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(wb.s1); + for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); + + const stage_2_len: u16 = try reader.readInt(u16, endian); + wb.s2 = try allocator.alloc(u5, stage_2_len); + errdefer allocator.free(wb.s2); + for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); + var count_0: usize = 0; + for (wb.s2) |nyb| { + if (nyb == 0) count_0 += 1; + } +} + //| Predicates inline fn isNewline(wbp: WordBreakProperty) bool { @@ -293,11 +323,33 @@ test "Word Break Properties" { try testing.expectEqual(.LF, wb.breakProperty('\n')); try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); + var iter = wb.iterator("xxx"); + _ = iter.peek(); } -test "ext_pic" { +test "ext_pict" { try testing.expect(ext_pict.isMatch("👇")); - try testing.expect(ext_pict.isMatch("\u{2704}")); + try testing.expect(ext_pict.isMatch("\u{2701}")); +} + +test wordAtCursor { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + const t_string = "first second third"; + const second = wb.wordAtCursor(t_string, 8); + try testing.expectEqualStrings("second", second.bytes(t_string)); + const third = wb.wordAtCursor(t_string, 14); + try testing.expectEqualStrings("third", third.bytes(t_string)); + { + const first = wb.wordAtCursor(t_string, 3); + try testing.expectEqualStrings("first", first.bytes(t_string)); + } + { + const first = wb.wordAtCursor(t_string, 0); + try testing.expectEqualStrings("first", first.bytes(t_string)); + } + const last = wb.wordAtCursor(t_string, 14); + try testing.expectEqualStrings("third", last.bytes(t_string)); } fn testAllocations(allocator: Allocator) !void { -- cgit v1.2.3 From 7ff729895e72fc841440ec73a44c142779fcae1e Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 16:33:02 -0400 Subject: Reverse Word Iterator Next up I hook it to the tests. --- src/WordBreak.zig | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/code_point.zig | 2 +- 2 files changed, 157 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index f0da30d..37c0df9 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -260,6 +260,161 @@ pub const Iterator = struct { } }; +pub const ReverseIterator = struct { + after: ?CodePoint = null, + before: ?CodePoint = null, + cp_iter: ReverseCodepointIterator, + wb: *const WordBreak, + + /// Assumes `str` is valid UTF-8. + pub fn init(wb: *const WordBreak, str: []const u8) Iterator { + var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; + wb_iter.advance(); + return wb_iter; + } + + /// Returns the previous word segment, without advancing. + pub fn peek(iter: *ReverseIterator) ?Word { + const cache = .{ iter.before, iter.after, iter.cp_iter }; + defer { + iter.before, iter.after, iter.cp_iter = cache; + } + return iter.prev(); + } + + /// Return the previous word, if any. + pub fn prev(iter: *ReverseIterator) ?Word { + iter.advance(); + + // Done? + if (iter.after == null) return null; + // Last? + if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; + + const word_end = iter.after.?.offset + iter.after.?.len; + var word_len: u32 = 0; + + // State variables. + var last_p: WordBreakProperty = .none; + var last_last_p: WordBreakProperty = .none; + var ri_count: usize = 0; + + scan: while (true) : (iter.advance()) { + const after = iter.after.?; + word_len += after.len; + if (iter.before) |before| { + const after_p = iter.wb.breakProp(after); + const before_p = iter.wb.breakProp(before); + if (!isIgnorable(after_p)) { + last_last_p = last_p; + last_p = after_p; + } + // WB3 CR × LF + if (before_p == .CR and after_p == .LF) continue :scan; + // WB3a (Newline | CR | LF) ÷ + if (isNewline(before_p)) break :scan; + // WB3b ÷ (Newline | CR | LF) + if (isNewline(after_p)) break :scan; + // WB3c ZWJ × \p{Extended_Pictographic} + if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) { + continue :scan; + } + // WB3d WSegSpace × WSegSpace + if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; + // WB4 X (Extend | Format | ZWJ)* → X + if (isIgnorable(after_p)) { + continue :scan; + } // Now we use last_p instead of after_p for ignorable's sake + // WB5 AHLetter × AHLetter + if (isAHLetter(last_p) and isAHLetter(before_p)) { + continue :scan; + } + // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter + if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) { + continue :scan; + } + // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter + if (isMidVal(before_p)) { + const prev_val = iter.peekPast(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (isAHLetter(prev_p)) { + continue :scan; + } + } + } + // WB7a Hebrew_Letter × Single_Quote + if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan; + // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter + if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) { + continue :scan; + } + // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter + if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { + const prev_val = iter.peekPast(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (prev_p == .Hebrew_Letter) { + continue :scan; + } + } + } + // WB8 Numeric × Numeric + if (before_p == .Numeric and last_p == .Numeric) continue :scan; + // WB9 AHLetter × Numeric + if (isAHLetter(before_p) and last_p == .Numeric) continue :scan; + // WB10 Numeric × AHLetter + if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; + // WB11 Numeric (MidNum | MidNumLetQ) × Numeric + if (isMidNum(before_p) and last_p == .Numeric) { + const prev_val = iter.peekPast(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (prev_p == .Numeric) { + continue :scan; + } + } + } + // WB12 Numeric × (MidNum | MidNumLetQ) Numeric + if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) { + continue :scan; + } + // WB13 Katakana × Katakana + if (before_p == .Katakana and last_p == .Katakana) continue :scan; + // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet + if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan; + // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) + if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; + // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI + const maybe_flag = before_p == .Regional_Indicator and last_p == .Regional_Indicator; + if (maybe_flag) { + ri_count += 1; + if (ri_count % 2 == 1) continue :scan; + } + // WB999 Any ÷ Any + break :scan; + } + break :scan; + } + return Word{ .len = word_len, .offset = word_end - word_len }; + } + + fn peekPast(iter: *ReverseIterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + _ = iter.cp_iter.prev(); + } + return null; + } + + fn advance(iter: *ReverseIterator) void { + iter.after = iter.before; + iter.before = iter.cp_iter.prev(); + } +}; + inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("wbp"); @@ -371,6 +526,7 @@ const testing = std.testing; const code_point = @import("code_point"); const CodepointIterator = code_point.Iterator; +const ReverseCodepointIterator = code_point.ReverseIterator; const CodePoint = code_point.CodePoint; const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; diff --git a/src/code_point.zig b/src/code_point.zig index 79ee5cd..a5b10d4 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -12,7 +12,7 @@ pub const CodePoint = struct { offset: u32, /// Return the slice of this codepoint, given the original string. - pub fn bytes(cp: CodePoint, str: []const u8) []const u8 { + pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { return str[cp.offset..][0..cp.len]; } }; -- cgit v1.2.3 From 5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 17:19:56 -0400 Subject: Hooked up break test, some bugs squashed The handling of ignorables is really different, because they 'adhere' to the future of the iteration, not the past. --- src/WordBreak.zig | 39 ++++++++++++++++++++++++++++++--------- src/code_point.zig | 10 ---------- src/unicode_tests.zig | 49 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 64 insertions(+), 34 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 37c0df9..0cab30e 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -98,11 +98,16 @@ pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usiz return this_word.?; } -/// Returns an iterator over words in `slice` +/// Returns an iterator over words in `slice`. pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { return Iterator.init(wordbreak, slice); } +/// Returns a reverse iterator over the words in `slice`. +pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { + return ReverseIterator.init(wordbreak, slice); +} + pub const Iterator = struct { this: ?CodePoint = null, that: ?CodePoint = null, @@ -111,7 +116,7 @@ pub const Iterator = struct { /// Assumes `str` is valid UTF-8. pub fn init(wb: *const WordBreak, str: []const u8) Iterator { - var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; + var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; wb_iter.advance(); return wb_iter; } @@ -267,8 +272,8 @@ pub const ReverseIterator = struct { wb: *const WordBreak, /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const WordBreak, str: []const u8) Iterator { - var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; + pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { + var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; wb_iter.advance(); return wb_iter; } @@ -299,12 +304,19 @@ pub const ReverseIterator = struct { var last_last_p: WordBreakProperty = .none; var ri_count: usize = 0; + // TODO: Ignorables have to be handled completely differently, unfortunately. + // We have to find whatever is before it, match against that, and use that + // decision to handle the break we're currently working on. + // -- + // This is achieveable I think. Just need to use peekPast to get that, and then + // take it from there. Probably as long as an ignorable is an after_p we just keep + // going. scan: while (true) : (iter.advance()) { const after = iter.after.?; word_len += after.len; if (iter.before) |before| { const after_p = iter.wb.breakProp(after); - const before_p = iter.wb.breakProp(before); + var before_p = iter.wb.breakProp(before); if (!isIgnorable(after_p)) { last_last_p = last_p; last_p = after_p; @@ -322,9 +334,18 @@ pub const ReverseIterator = struct { // WB3d WSegSpace × WSegSpace if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; // WB4 X (Extend | Format | ZWJ)* → X - if (isIgnorable(after_p)) { - continue :scan; - } // Now we use last_p instead of after_p for ignorable's sake + if (isIgnorable(before_p)) { + const maybe_before = iter.peekPast(); + if (maybe_before) |valid_before| { + before_p = iter.wb.breakProp(valid_before); + } else if (isIgnorable(after_p)) { + continue :scan; + // We're done + } else { + break :scan; + } + } + if (isIgnorable(after_p)) continue :scan; // WB5 AHLetter × AHLetter if (isAHLetter(last_p) and isAHLetter(before_p)) { continue :scan; @@ -334,7 +355,7 @@ pub const ReverseIterator = struct { continue :scan; } // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter - if (isMidVal(before_p)) { + if (isMidVal(before_p) and isAHLetter(last_p)) { const prev_val = iter.peekPast(); if (prev_val) |prev_cp| { const prev_p = iter.wb.breakProp(prev_cp); diff --git a/src/code_point.zig b/src/code_point.zig index a5b10d4..ba0b434 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -53,22 +53,12 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { }; // Multibyte -<<<<<<< HEAD // Second: var class: u4 = @intCast(u8dfa[byte]); var st: u32 = state_dfa[class]; if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); // First one is never a truncation -||||||| parent of ad4b046 (Various small iterator improvements) - // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte - if (cp.len > bytes.len) { - // Unicode replacement code point. -======= - // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte. - if (cp.len > bytes.len) { - // Unicode replacement code point. ->>>>>>> ad4b046 (Various small iterator improvements) return .{ .code = 0xfffd, .len = 1, diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 59f0c6f..8661bfd 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -195,7 +195,7 @@ test "Segmentation Word Iterator" { line = line[0..final]; } // Iterate over fields. - var want = std.ArrayList(Grapheme).init(allocator); + var want = std.ArrayList(Word).init(allocator); defer want.deinit(); var all_bytes = std.ArrayList(u8).init(allocator); @@ -219,22 +219,40 @@ test "Segmentation Word Iterator" { gc_len += len; } - try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); + try want.append(Word{ .len = gc_len, .offset = bytes_index }); bytes_index += cp_index; } - - var iter = wb.iterator(all_bytes.items); - - // Check. - for (want.items, 1..) |want_word, i| { - const got_word = (iter.next()).?; - std.testing.expectEqualStrings( - want_word.bytes(all_bytes.items), - got_word.bytes(all_bytes.items), - ) catch |err| { - debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); - return err; - }; + { + var iter = wb.iterator(all_bytes.items); + + // Check. + for (want.items, 1..) |want_word, i| { + const got_word = (iter.next()).?; + std.testing.expectEqualStrings( + want_word.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); + return err; + }; + } + } + { + var r_iter = wb.reverseIterator(all_bytes.items); + var idx = want.items.len - 1; + while (true) : (idx -= 1) { + const want_word = want.items[idx]; + const got_word = r_iter.prev().?; + std.testing.expectEqualSlices( + u8, + want_word.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); + return err; + }; + if (idx == 0) break; + } } } } @@ -277,3 +295,4 @@ const GraphemeIterator = @import("Graphemes").Iterator; const Normalize = @import("Normalize"); const WordBreak = @import("WordBreak"); +const Word = WordBreak.Word; -- cgit v1.2.3 From 3461a7750344fe7301cacf04f2f5e154ef70e966 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 14 May 2025 10:05:49 -0400 Subject: ReverseWordIterator passes conformance test Ended up needing a clone of the codepoint iterator, so that WB4 can ignore points in a matter compatible with backward search. So I created a special SneakIterator which can return WBPs directly, so as to skip ignorables. This is also needed for flag emoji, since the odd-number case must be handle immediately. So we count back in a WB4 compatible way, then store the count on the word iterator, and voila. --- src/WordBreak.zig | 83 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 19 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 0cab30e..f1322ff 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -270,6 +270,7 @@ pub const ReverseIterator = struct { before: ?CodePoint = null, cp_iter: ReverseCodepointIterator, wb: *const WordBreak, + flags: usize = 0, /// Assumes `str` is valid UTF-8. pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { @@ -302,19 +303,12 @@ pub const ReverseIterator = struct { // State variables. var last_p: WordBreakProperty = .none; var last_last_p: WordBreakProperty = .none; - var ri_count: usize = 0; - // TODO: Ignorables have to be handled completely differently, unfortunately. - // We have to find whatever is before it, match against that, and use that - // decision to handle the break we're currently working on. - // -- - // This is achieveable I think. Just need to use peekPast to get that, and then - // take it from there. Probably as long as an ignorable is an after_p we just keep - // going. scan: while (true) : (iter.advance()) { const after = iter.after.?; word_len += after.len; if (iter.before) |before| { + var sneak = sneaky(iter); // 'sneaks' past ignorables const after_p = iter.wb.breakProp(after); var before_p = iter.wb.breakProp(before); if (!isIgnorable(after_p)) { @@ -335,13 +329,11 @@ pub const ReverseIterator = struct { if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; // WB4 X (Extend | Format | ZWJ)* → X if (isIgnorable(before_p)) { - const maybe_before = iter.peekPast(); + const maybe_before = sneak.prev(); if (maybe_before) |valid_before| { before_p = iter.wb.breakProp(valid_before); - } else if (isIgnorable(after_p)) { - continue :scan; + } else if (!isIgnorable(after_p)) { // We're done - } else { break :scan; } } @@ -356,7 +348,7 @@ pub const ReverseIterator = struct { } // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter if (isMidVal(before_p) and isAHLetter(last_p)) { - const prev_val = iter.peekPast(); + const prev_val = sneak.peek(); if (prev_val) |prev_cp| { const prev_p = iter.wb.breakProp(prev_cp); if (isAHLetter(prev_p)) { @@ -372,7 +364,7 @@ pub const ReverseIterator = struct { } // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { - const prev_val = iter.peekPast(); + const prev_val = sneak.peek(); if (prev_val) |prev_cp| { const prev_p = iter.wb.breakProp(prev_cp); if (prev_p == .Hebrew_Letter) { @@ -388,7 +380,7 @@ pub const ReverseIterator = struct { if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; // WB11 Numeric (MidNum | MidNumLetQ) × Numeric if (isMidNum(before_p) and last_p == .Numeric) { - const prev_val = iter.peekPast(); + const prev_val = sneak.peek(); if (prev_val) |prev_cp| { const prev_p = iter.wb.breakProp(prev_cp); if (prev_p == .Numeric) { @@ -407,10 +399,23 @@ pub const ReverseIterator = struct { // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI - const maybe_flag = before_p == .Regional_Indicator and last_p == .Regional_Indicator; - if (maybe_flag) { - ri_count += 1; - if (ri_count % 2 == 1) continue :scan; + // NOTE: + // So here we simply have to know whether a run of flags is even or odd. + // The whole run. To avoid quadratic behavior (and long flag runs are + // actually a thing in the wild), we have to count them once, store that + // on the iterator, and decrement each time we see two, possibly breaking + // once extra at the beginning. They break up one per flag, once we hit + // zero, that's all the flags. If we see another flag we do it again. + if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) { + defer { + if (iter.flags > 0) iter.flags -= 1; + } + if (iter.flags == 0) { + iter.flags = sneak.countFlags(); + } + if (iter.flags % 2 == 0) { + continue :scan; + } } // WB999 Any ÷ Any break :scan; @@ -436,6 +441,46 @@ pub const ReverseIterator = struct { } }; +fn sneaky(iter: *const ReverseIterator) SneakIterator { + return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; +} + +const SneakIterator = struct { + cp_iter: ReverseCodepointIterator, + wb: *const WordBreak, + + fn peek(iter: *SneakIterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + _ = iter.cp_iter.prev(); + } + return null; + } + + fn countFlags(iter: *SneakIterator) usize { + var flags: usize = 0; + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.prev()) |cp| { + const prop = iter.wb.breakProp(cp); + if (isIgnorable(prop)) continue; + if (prop == .Regional_Indicator) { + flags += 1; + } else break; + } + return flags; + } + + fn prev(iter: *SneakIterator) ?CodePoint { + while (iter.cp_iter.prev()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + } + return null; + } +}; + inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("wbp"); -- cgit v1.2.3 From b1d67fab5c3dd3ed1d47ee63ab45a600b19f7a3c Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 14 May 2025 10:46:25 -0400 Subject: Peek tests for word iterators --- src/unicode_tests.zig | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'src') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 8661bfd..ef459bf 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -224,6 +224,7 @@ test "Segmentation Word Iterator" { } { var iter = wb.iterator(all_bytes.items); + var peeked: ?Word = iter.peek(); // Check. for (want.items, 1..) |want_word, i| { @@ -235,11 +236,21 @@ test "Segmentation Word Iterator" { debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); return err; }; + std.testing.expectEqualStrings( + peeked.?.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, i }); + return err; + }; + peeked = iter.peek(); } } { var r_iter = wb.reverseIterator(all_bytes.items); + var peeked: ?Word = r_iter.peek(); var idx = want.items.len - 1; + while (true) : (idx -= 1) { const want_word = want.items[idx]; const got_word = r_iter.prev().?; @@ -251,6 +262,14 @@ test "Segmentation Word Iterator" { debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); return err; }; + std.testing.expectEqualStrings( + peeked.?.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 }); + return err; + }; + peeked = r_iter.peek(); if (idx == 0) break; } } -- cgit v1.2.3 From b3847c8d7d0b73fabf66276a3b82c7533e5f930e Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 14 May 2025 12:59:30 -0400 Subject: Add reversal functions for word iterators While of only occasional use in real programs, one thing these are good for is reliably retrieving the word at a given index. Which turns out to be.. tricky is the best word. --- src/WordBreak.zig | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index f1322ff..0925b2f 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -130,6 +130,22 @@ pub const Iterator = struct { return iter.next(); } + /// Return a reverse iterator from the point this iterator is paused + /// at. Usually, calling `prev()` will return the word just seen. + pub fn reverseIterator(iter: *Iterator) ReverseIterator { + var cp_it = iter.cp_iter.reverseIterator(); + if (iter.that) |_| + _ = cp_it.prev(); + if (iter.cp_iter.peek()) |_| + _ = cp_it.prev(); + return .{ + .wb = iter.wb, + .before = cp_it.prev(), + .after = iter.that, + .cp_iter = cp_it, + }; + } + /// Returns the next word segment. pub fn next(iter: *Iterator) ?Word { iter.advance(); @@ -249,6 +265,13 @@ pub const Iterator = struct { return Word{ .len = word_len, .offset = word_start }; } + pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + try writer.print( + "Iterator {{ .this = {any}, .that = {any} }}", + .{ iter.this, iter.that }, + ); + } + fn advance(iter: *Iterator) void { iter.this = iter.that; iter.that = iter.cp_iter.next(); @@ -281,13 +304,27 @@ pub const ReverseIterator = struct { /// Returns the previous word segment, without advancing. pub fn peek(iter: *ReverseIterator) ?Word { - const cache = .{ iter.before, iter.after, iter.cp_iter }; + const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; defer { - iter.before, iter.after, iter.cp_iter = cache; + iter.before, iter.after, iter.cp_iter, iter.flags = cache; } return iter.prev(); } + /// Return a forward iterator from where this iterator paused. Usually, + /// calling `next()` will return the word just seen. + pub fn forwardIterator(iter: *ReverseIterator) Iterator { + var cp_it = iter.cp_iter.forwardIterator(); + if (iter.before) |_| + _ = cp_it.next(); + return .{ + .wb = iter.wb, + .this = cp_it.next(), + .that = iter.after, + .cp_iter = cp_it, + }; + } + /// Return the previous word, if any. pub fn prev(iter: *ReverseIterator) ?Word { iter.advance(); @@ -425,6 +462,13 @@ pub const ReverseIterator = struct { return Word{ .len = word_len, .offset = word_end - word_len }; } + pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + try writer.print( + "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", + .{ iter.before, iter.after, iter.flags }, + ); + } + fn peekPast(iter: *ReverseIterator) ?CodePoint { const save_cp = iter.cp_iter; defer iter.cp_iter = save_cp; @@ -573,6 +617,41 @@ test wordAtCursor { try testing.expectEqualStrings("third", last.bytes(t_string)); } +const testr = "don't a:ka fin!"; + +test "reversal" { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + { + var fwd = wb.iterator(testr); + var this_word: ?Word = fwd.next(); + + while (this_word) |this| : (this_word = fwd.next()) { + var back = fwd.reverseIterator(); + const that_word = back.prev(); + if (that_word) |that| { + try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); + } else { + try testing.expect(false); + } + } + } + { + var back = wb.reverseIterator(testr); + var this_word: ?Word = back.prev(); + + while (this_word) |this| : (this_word = back.prev()) { + var fwd = back.forwardIterator(); + const that_word = fwd.next(); + if (that_word) |that| { + try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); + } else { + try testing.expect(false); + } + } + } +} + fn testAllocations(allocator: Allocator) !void { const wb = try WordBreak.init(allocator); wb.deinit(allocator); -- cgit v1.2.3 From 2b31e66b68216b876ddaf793f4008e652fcd7686 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 14 May 2025 13:15:14 -0400 Subject: Add format for CodePoint --- src/code_point.zig | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/code_point.zig b/src/code_point.zig index ba0b434..43b38d2 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -15,6 +15,14 @@ pub const CodePoint = struct { pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { return str[cp.offset..][0..cp.len]; } + + pub fn format(cp: CodePoint, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + try writer.print("CodePoint '{u}' .{{ ", .{cp.code}); + try writer.print( + ".code = 0x{x}, .offset = {d}, .len = {d} }}", + .{ cp.code, cp.offset, cp.len }, + ); + } }; /// This function is deprecated and will be removed in a later release. @@ -171,7 +179,7 @@ pub const Iterator = struct { /// Create a backward iterator at this point. It will repeat /// the last CodePoint seen. - pub fn reverseIterator(iter: *Iterator) ReverseIterator { + pub fn reverseIterator(iter: *const Iterator) ReverseIterator { if (iter.i == iter.bytes.len) { return .init(iter.bytes); } @@ -283,7 +291,7 @@ pub const ReverseIterator = struct { /// Create a forward iterator at this point. It will repeat the /// last CodePoint seen. - pub fn forwardIterator(iter: *ReverseIterator) Iterator { + pub fn forwardIterator(iter: *const ReverseIterator) Iterator { if (iter.i) |i| { var fwd: Iterator = .{ .i = i, .bytes = iter.bytes }; _ = fwd.next(); -- cgit v1.2.3 From c98b19032536b0e7d7956432f605c1ede80891f0 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 14 May 2025 14:32:06 -0400 Subject: Rewrite wordAtIndex to use iterator flipping This also adds helper functions for initializing iterators at an index within the string. Not that user code should do that necessarily, but `wordAtIndex` definitely should, and there's no reason not to make it available to others. With an appropriate warning at least. --- src/WordBreak.zig | 107 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 83 insertions(+), 24 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 0925b2f..7ac8f14 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -69,33 +69,53 @@ fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { } /// Returns the Word at the given index. Asserts that the index is valid for -/// the provided string. Always returns a word. This algorithm is O(n^2) for -/// the size of the word before the cursor. +/// the provided string, and that the string is not empty. Always returns a word. +/// The index does not have to be the start of a codepoint in the word. pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { - assert(index < string.len); - var i = index; - var iter1 = wordbreak.iterator(string[index..]); - var last_word = iter1.next(); - if (index == 0) return if (last_word) |word| word else Word{ .offset = 0, .len = 0 }; - var this_word: ?Word = undefined; - while (last_word) |l_word| : (i -= 1) { - var iter2 = wordbreak.iterator(string[i..]); - this_word = iter2.next(); - if (this_word) |t_word| { - if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])}); - if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])}); - // Check if we've captured a previous word - const t_end = t_word.offset + t_word.len + i - 1; - const l_start = l_word.offset + i + 1; - if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start }); - if (t_end < l_start) { - return .{ .offset = @intCast(l_start), .len = l_word.len }; + assert(index < string.len and string.len > 0); + var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index); + if (iter_fwd.next()) |_| { + // This is a bit tricky because we may have been in the middle of various + // stateful things. So we go forward again: + if (iter_fwd.next()) |_| { + // Make a back iterator: + var iter_back = iter_fwd.reverseIterator(); + const last_word = iter_back.prev().?; // Always works. + const no_flags = iter_back.flags == 0; + if (no_flags) { + // Next previous is our word. + const the_word = iter_back.prev(); + if (the_word) |word| { + assert(word.offset <= index and index <= word.offset + word.len); + return word; + } else { // Can happen, at least I think so + assert(last_word.offset <= index and index <= last_word.offset + last_word.len); + return last_word; + } + } else { + // Scan past all the flags. + while (iter_back.flags > 0) { + _ = iter_back.prev(); + } + // Now just look for our word + iter_fwd = iter_back.forwardIterator(); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index <= word.offset + word.len) { + return word; + } + } + unreachable; } - last_word = this_word; - } else unreachable; - if (i == 0) break; + } else { // We can just reverse here + var iter_back = iter_fwd.reverseIterator(); + const word = iter_back.prev().?; + assert(word.offset <= index and index <= word.offset + word.len); + return word; + } + } else { // last word then + var iter_back = iter_fwd.reverseIterator(); + return iter_back.prev().?; } - return this_word.?; } /// Returns an iterator over words in `slice`. @@ -130,6 +150,36 @@ pub const Iterator = struct { return iter.next(); } + /// Initialize an Iterator at the provided index. Assumes str is valid + /// UTF-8, asserts that `index` is less than str.len, and that `str` is not + /// empty. Note that for various stateful reasons, this may give spurious + /// results if used naïvely. If you want to reliably iterate from an index, + /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an + /// iterator at `wb.initAtIndex(string, word.offset)`. + pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator { + assert(index < string.len and string.len > 0); + // Just in case... + if (index == 0) return wb.iterator(string); + var idx: u32 = @intCast(index); + // Back up past any any follow bytes: + while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} + var iter: Iterator = undefined; + iter.wb = wb; + // We need to populate the CodePoints, and the codepoint iterator. + // Consider "abc |def" with the cursor on d. + // We need `this` to be ` ` and `that` to be 'd', + // and `cp_iter.next()` to be `e`. + var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; + // Reverse gives us before `d`: + iter.this = cp_back.prev(); // that == ` ` + // This iterator will give us `d`: + iter.cp_iter = .{ .bytes = string, .i = idx }; + iter.that = iter.cp_iter.next(); + // So that the next call will will give us `e`, + // thus the word will be `def`. + return iter; + } + /// Return a reverse iterator from the point this iterator is paused /// at. Usually, calling `prev()` will return the word just seen. pub fn reverseIterator(iter: *Iterator) ReverseIterator { @@ -302,6 +352,15 @@ pub const ReverseIterator = struct { return wb_iter; } + /// Initialize a ReverseIterator at the provided index. Assumes str is valid + /// UTF-8, asserts that `index` is less than str.len, and that `str` is not + /// empty. You should prefer not to use this function, see Iterator.initAtIndex + /// for more details. + pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator { + var fw_iter = Iterator.initAtIndex(wb, string, index); + return fw_iter.reverseIterator(); + } + /// Returns the previous word segment, without advancing. pub fn peek(iter: *ReverseIterator) ?Word { const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; -- cgit v1.2.3 From 736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Thu, 15 May 2025 10:57:33 -0400 Subject: wordAtIndex passes conformance I removed the initAtIndex functions from the public vocabulary, because the last couple of days of sweat and blood prove that it's hard to use correctly. That's probably it for WordBreak, now to fix the overlong bug on v0.14 and get this integrated with the new reverse grapheme iterator. --- src/WordBreak.zig | 161 ++++++++++++++++++++++---------------------------- src/code_point.zig | 1 - src/unicode_tests.zig | 76 ++++++++++++++++++++---- 3 files changed, 135 insertions(+), 103 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 7ac8f14..6ada7e1 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -64,58 +64,57 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); } +/// Convenience function for working with CodePoints fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); } -/// Returns the Word at the given index. Asserts that the index is valid for -/// the provided string, and that the string is not empty. Always returns a word. +/// Returns the Word at the given index. Asserts that the index is less than +/// `string.len`, and that the string is not empty. Always returns a word. /// The index does not have to be the start of a codepoint in the word. -pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { +pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { assert(index < string.len and string.len > 0); - var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index); - if (iter_fwd.next()) |_| { - // This is a bit tricky because we may have been in the middle of various - // stateful things. So we go forward again: - if (iter_fwd.next()) |_| { - // Make a back iterator: - var iter_back = iter_fwd.reverseIterator(); - const last_word = iter_back.prev().?; // Always works. - const no_flags = iter_back.flags == 0; - if (no_flags) { - // Next previous is our word. - const the_word = iter_back.prev(); - if (the_word) |word| { - assert(word.offset <= index and index <= word.offset + word.len); + var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); + const first_back = iter_back.prev(); + if (first_back) |back| { + if (back.offset == 0) { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) return word; - } else { // Can happen, at least I think so - assert(last_word.offset <= index and index <= last_word.offset + last_word.len); - return last_word; - } + } + } + } else { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + } + const second_back = iter_back.prev(); + if (second_back) |back| if (back.offset == 0) { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + }; + // There's sometimes flags: + if (iter_back.flags > 0) { + while (iter_back.flags > 0) { + if (iter_back.prev()) |_| { + continue; } else { - // Scan past all the flags. - while (iter_back.flags > 0) { - _ = iter_back.prev(); - } - // Now just look for our word - iter_fwd = iter_back.forwardIterator(); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index <= word.offset + word.len) { - return word; - } - } - unreachable; + break; } - } else { // We can just reverse here - var iter_back = iter_fwd.reverseIterator(); - const word = iter_back.prev().?; - assert(word.offset <= index and index <= word.offset + word.len); - return word; } - } else { // last word then - var iter_back = iter_fwd.reverseIterator(); - return iter_back.prev().?; } + var iter_fwd = iter_back.forwardIterator(); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + unreachable; } /// Returns an iterator over words in `slice`. @@ -128,6 +127,7 @@ pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIt return ReverseIterator.init(wordbreak, slice); } +/// An iterator, forward, over all words in a provided string. pub const Iterator = struct { this: ?CodePoint = null, that: ?CodePoint = null, @@ -150,37 +150,7 @@ pub const Iterator = struct { return iter.next(); } - /// Initialize an Iterator at the provided index. Assumes str is valid - /// UTF-8, asserts that `index` is less than str.len, and that `str` is not - /// empty. Note that for various stateful reasons, this may give spurious - /// results if used naïvely. If you want to reliably iterate from an index, - /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an - /// iterator at `wb.initAtIndex(string, word.offset)`. - pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator { - assert(index < string.len and string.len > 0); - // Just in case... - if (index == 0) return wb.iterator(string); - var idx: u32 = @intCast(index); - // Back up past any any follow bytes: - while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} - var iter: Iterator = undefined; - iter.wb = wb; - // We need to populate the CodePoints, and the codepoint iterator. - // Consider "abc |def" with the cursor on d. - // We need `this` to be ` ` and `that` to be 'd', - // and `cp_iter.next()` to be `e`. - var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; - // Reverse gives us before `d`: - iter.this = cp_back.prev(); // that == ` ` - // This iterator will give us `d`: - iter.cp_iter = .{ .bytes = string, .i = idx }; - iter.that = iter.cp_iter.next(); - // So that the next call will will give us `e`, - // thus the word will be `def`. - return iter; - } - - /// Return a reverse iterator from the point this iterator is paused + /// Returns a reverse iterator from the point this iterator is paused /// at. Usually, calling `prev()` will return the word just seen. pub fn reverseIterator(iter: *Iterator) ReverseIterator { var cp_it = iter.cp_iter.reverseIterator(); @@ -196,7 +166,7 @@ pub const Iterator = struct { }; } - /// Returns the next word segment. + /// Returns the next word segment, if any. pub fn next(iter: *Iterator) ?Word { iter.advance(); @@ -338,6 +308,7 @@ pub const Iterator = struct { } }; +/// An iterator, backward, over all words in a provided string. pub const ReverseIterator = struct { after: ?CodePoint = null, before: ?CodePoint = null, @@ -352,16 +323,7 @@ pub const ReverseIterator = struct { return wb_iter; } - /// Initialize a ReverseIterator at the provided index. Assumes str is valid - /// UTF-8, asserts that `index` is less than str.len, and that `str` is not - /// empty. You should prefer not to use this function, see Iterator.initAtIndex - /// for more details. - pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator { - var fw_iter = Iterator.initAtIndex(wb, string, index); - return fw_iter.reverseIterator(); - } - - /// Returns the previous word segment, without advancing. + /// Returns the previous word segment, if any, without advancing. pub fn peek(iter: *ReverseIterator) ?Word { const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; defer { @@ -544,6 +506,27 @@ pub const ReverseIterator = struct { } }; +//| Implementation Details + +/// Initialize a ReverseIterator at the provided index. Used in wordAtIndex. +fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { + var idx: u32 = @intCast(index); + while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} + if (idx == string.len) return wb.reverseIterator(string); + var iter: ReverseIterator = undefined; + iter.wb = wb; + iter.flags = 0; + // We need to populate the CodePoints, and the codepoint iterator. + // Consider "abc| def" with the cursor as |. + // We need `before` to be `c` and `after` to be ' ', + // and `cp_iter.prev()` to be `b`. + var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; + iter.after = cp_iter.prev(); + iter.before = cp_iter.prev(); + iter.cp_iter = cp_iter; + return iter; +} + fn sneaky(iter: *const ReverseIterator) SneakIterator { return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; } @@ -656,23 +639,23 @@ test "ext_pict" { try testing.expect(ext_pict.isMatch("\u{2701}")); } -test wordAtCursor { +test wordAtIndex { const wb = try WordBreak.init(testing.allocator); defer wb.deinit(testing.allocator); const t_string = "first second third"; - const second = wb.wordAtCursor(t_string, 8); + const second = wb.wordAtIndex(t_string, 8); try testing.expectEqualStrings("second", second.bytes(t_string)); - const third = wb.wordAtCursor(t_string, 14); + const third = wb.wordAtIndex(t_string, 14); try testing.expectEqualStrings("third", third.bytes(t_string)); { - const first = wb.wordAtCursor(t_string, 3); + const first = wb.wordAtIndex(t_string, 3); try testing.expectEqualStrings("first", first.bytes(t_string)); } { - const first = wb.wordAtCursor(t_string, 0); + const first = wb.wordAtIndex(t_string, 0); try testing.expectEqualStrings("first", first.bytes(t_string)); } - const last = wb.wordAtCursor(t_string, 14); + const last = wb.wordAtIndex(t_string, 14); try testing.expectEqualStrings("third", last.bytes(t_string)); } diff --git a/src/code_point.zig b/src/code_point.zig index 43b38d2..9a84080 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -272,7 +272,6 @@ pub const ReverseIterator = struct { while (i_prev > 0) : (i_prev -= 1) { if (!followbyte(iter.bytes[i_prev])) break; - if (i_prev == 0) break; } if (i_prev > 0) diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index ef459bf..8b02e98 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -222,32 +222,58 @@ test "Segmentation Word Iterator" { try want.append(Word{ .len = gc_len, .offset = bytes_index }); bytes_index += cp_index; } + const this_str = all_bytes.items; + { - var iter = wb.iterator(all_bytes.items); + var iter = wb.iterator(this_str); var peeked: ?Word = iter.peek(); // Check. - for (want.items, 1..) |want_word, i| { + for (want.items, 1..) |want_word, idx| { const got_word = (iter.next()).?; std.testing.expectEqualStrings( - want_word.bytes(all_bytes.items), - got_word.bytes(all_bytes.items), + want_word.bytes(this_str), + got_word.bytes(this_str), ) catch |err| { - debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); + debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx }); return err; }; std.testing.expectEqualStrings( - peeked.?.bytes(all_bytes.items), - got_word.bytes(all_bytes.items), + peeked.?.bytes(this_str), + got_word.bytes(this_str), ) catch |err| { - debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, i }); + debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx }); return err; }; + var r_iter = iter.reverseIterator(); + const if_r_word = r_iter.prev(); + if (if_r_word) |r_word| { + std.testing.expectEqualStrings( + want_word.bytes(this_str), + r_word.bytes(this_str), + ) catch |err| { + debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx }); + return err; + }; + } else { + try testing.expect(false); + } + for (got_word.offset..got_word.offset + got_word.len) |i| { + const this_word = wb.wordAtIndex(this_str, i); + std.testing.expectEqualSlices( + u8, + got_word.bytes(this_str), + this_word.bytes(this_str), + ) catch |err| { + debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); + return err; + }; + } peeked = iter.peek(); } } { - var r_iter = wb.reverseIterator(all_bytes.items); + var r_iter = wb.reverseIterator(this_str); var peeked: ?Word = r_iter.peek(); var idx = want.items.len - 1; @@ -256,19 +282,43 @@ test "Segmentation Word Iterator" { const got_word = r_iter.prev().?; std.testing.expectEqualSlices( u8, - want_word.bytes(all_bytes.items), - got_word.bytes(all_bytes.items), + want_word.bytes(this_str), + got_word.bytes(this_str), ) catch |err| { debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); return err; }; std.testing.expectEqualStrings( - peeked.?.bytes(all_bytes.items), - got_word.bytes(all_bytes.items), + peeked.?.bytes(this_str), + got_word.bytes(this_str), ) catch |err| { debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 }); return err; }; + var f_iter = r_iter.forwardIterator(); + const if_f_word = f_iter.next(); + if (if_f_word) |f_word| { + std.testing.expectEqualStrings( + want_word.bytes(this_str), + f_word.bytes(this_str), + ) catch |err| { + debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx }); + return err; + }; + } else { + try testing.expect(false); + } + for (got_word.offset..got_word.offset + got_word.len) |i| { + const this_word = wb.wordAtIndex(this_str, i); + std.testing.expectEqualSlices( + u8, + got_word.bytes(this_str), + this_word.bytes(this_str), + ) catch |err| { + debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); + return err; + }; + } peeked = r_iter.peek(); if (idx == 0) break; } -- cgit v1.2.3 From 713c01c22c7c4051cfc2bd83811fd969b1ccaddc Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Thu, 15 May 2025 23:20:50 -0400 Subject: Merge Grapheme Segmentation Iterator Tests --- src/unicode_tests.zig | 113 +++++++++++++++----------------------------------- 1 file changed, 34 insertions(+), 79 deletions(-) (limited to 'src') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 0204b92..7139d4c 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -162,89 +162,44 @@ test "Segmentation GraphemeIterator" { bytes_index += cp_index; } - var iter = graph.iterator(all_bytes.items); - - // Check. - for (want.items) |want_gc| { - const got_gc = (iter.next()).?; - try std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), - ); - } - } -} - -test "Segmentation ReverseGraphemeIterator" { - const allocator = std.testing.allocator; - var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); - defer file.close(); - var buf_reader = std.io.bufferedReader(file.reader()); - var input_stream = buf_reader.reader(); - - const data = try Graphemes.init(allocator); - defer data.deinit(allocator); - - var buf: [4096]u8 = undefined; - var line_no: usize = 1; - - while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { - // Skip comments or empty lines. - if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; - - // Clean up. - var line = std.mem.trimLeft(u8, raw, "÷ "); - if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { - line = line[0..octo]; - } - // Iterate over fields. - var want = std.ArrayList(Grapheme).init(allocator); - defer want.deinit(); - - var all_bytes = std.ArrayList(u8).init(allocator); - defer all_bytes.deinit(); - - var graphemes = std.mem.splitSequence(u8, line, " ÷ "); - var bytes_index: u32 = 0; - - while (graphemes.next()) |field| { - var code_points = std.mem.splitScalar(u8, field, ' '); - var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; - var gc_len: u8 = 0; + { + var iter = graph.iterator(all_bytes.items); - while (code_points.next()) |code_point| { - if (std.mem.eql(u8, code_point, "×")) continue; - const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); - const len = try unicode.utf8Encode(cp, &cp_buf); - try all_bytes.appendSlice(cp_buf[0..len]); - cp_index += len; - gc_len += len; + // Check. + for (want.items) |want_gc| { + const got_gc = (iter.next()).?; + try std.testing.expectEqualStrings( + want_gc.bytes(all_bytes.items), + got_gc.bytes(all_bytes.items), + ); } - - try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); - bytes_index += cp_index; } + { + var iter = graph.reverseIterator(all_bytes.items); - // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); - var iter = data.reverseIterator(all_bytes.items); - - // Check. - var i: usize = want.items.len; - while (i > 0) { - i -= 1; - const want_gc = want.items[i]; - const got_gc = iter.prev() orelse { - std.debug.print("line {d} grapheme {d}: expected {any} found null\n", .{ line_no, i, want_gc }); - return error.TestExpectedEqual; - }; - std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), - ) catch |err| { - std.debug.print("line {d} grapheme {d}: expected {any} found {any}\n", .{ line_no, i, want_gc, got_gc }); - return err; - }; + // Check. + var i: usize = want.items.len; + while (i > 0) { + i -= 1; + const want_gc = want.items[i]; + const got_gc = iter.prev() orelse { + std.debug.print( + "line {d} grapheme {d}: expected {any} found null\n", + .{ line_iter.line, i, want_gc }, + ); + return error.TestExpectedEqual; + }; + std.testing.expectEqualStrings( + want_gc.bytes(all_bytes.items), + got_gc.bytes(all_bytes.items), + ) catch |err| { + std.debug.print( + "line {d} grapheme {d}: expected {any} found {any}\n", + .{ line_iter.line, i, want_gc, got_gc }, + ); + return err; + }; + } } } } -- cgit v1.2.3 From 9042273383de60f36a7938f0f0b49102117eef85 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 16 May 2025 12:03:33 -0400 Subject: Proofread --- src/WordBreak.zig | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 6ada7e1..6a532f5 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -151,7 +151,8 @@ pub const Iterator = struct { } /// Returns a reverse iterator from the point this iterator is paused - /// at. Usually, calling `prev()` will return the word just seen. + /// at. Usually, and always when using the API to create iterators, + /// calling `prev()` will return the word just seen. pub fn reverseIterator(iter: *Iterator) ReverseIterator { var cp_it = iter.cp_iter.reverseIterator(); if (iter.that) |_| @@ -333,7 +334,8 @@ pub const ReverseIterator = struct { } /// Return a forward iterator from where this iterator paused. Usually, - /// calling `next()` will return the word just seen. + /// and always when using the API to create iterators, calling `next()` + /// will return the word just seen. pub fn forwardIterator(iter: *ReverseIterator) Iterator { var cp_it = iter.cp_iter.forwardIterator(); if (iter.before) |_| @@ -508,9 +510,10 @@ pub const ReverseIterator = struct { //| Implementation Details -/// Initialize a ReverseIterator at the provided index. Used in wordAtIndex. +/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { var idx: u32 = @intCast(index); + // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} if (idx == string.len) return wb.reverseIterator(string); var iter: ReverseIterator = undefined; @@ -630,8 +633,6 @@ test "Word Break Properties" { try testing.expectEqual(.LF, wb.breakProperty('\n')); try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); - var iter = wb.iterator("xxx"); - _ = iter.peek(); } test "ext_pict" { -- cgit v1.2.3 From 21d55b730e25733c98fa58de42d256adf5f80d88 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 16 May 2025 12:06:32 -0400 Subject: Move WordBreak to Words --- src/WordBreak.zig | 720 ------------------------------------------------------ src/Words.zig | 720 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 720 insertions(+), 720 deletions(-) delete mode 100644 src/WordBreak.zig create mode 100644 src/Words.zig (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig deleted file mode 100644 index 6a532f5..0000000 --- a/src/WordBreak.zig +++ /dev/null @@ -1,720 +0,0 @@ -//! Word Breaking Algorithm. - -const WordBreakProperty = enum(u5) { - none, - Double_Quote, - Single_Quote, - Hebrew_Letter, - CR, - LF, - Newline, - Extend, - Regional_Indicator, - Format, - Katakana, - ALetter, - MidLetter, - MidNum, - MidNumLet, - Numeric, - ExtendNumLet, - ZWJ, - WSegSpace, -}; - -s1: []u16 = undefined, -s2: []u5 = undefined, - -const WordBreak = @This(); - -pub fn init(allocator: Allocator) Allocator.Error!WordBreak { - var wb: WordBreak = undefined; - try wb.setup(allocator); - return wb; -} - -pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { - wb.setupImpl(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; -} - -pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { - allocator.free(wordbreak.s1); - allocator.free(wordbreak.s2); -} - -/// Represents a Unicode word span, as an offset into the source string -/// and the length of the word. -pub const Word = struct { - offset: u32, - len: u32, - - /// Returns a slice of the word given the source string. - pub fn bytes(self: Word, src: []const u8) []const u8 { - return src[self.offset..][0..self.len]; - } -}; - -/// Returns the word break property type for `cp`. -pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { - return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); -} - -/// Convenience function for working with CodePoints -fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { - return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); -} - -/// Returns the Word at the given index. Asserts that the index is less than -/// `string.len`, and that the string is not empty. Always returns a word. -/// The index does not have to be the start of a codepoint in the word. -pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { - assert(index < string.len and string.len > 0); - var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); - const first_back = iter_back.prev(); - if (first_back) |back| { - if (back.offset == 0) { - var iter_fwd = wordbreak.iterator(string); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index < word.offset + word.len) - return word; - } - } - } else { - var iter_fwd = wordbreak.iterator(string); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index < word.offset + word.len) - return word; - } - } - const second_back = iter_back.prev(); - if (second_back) |back| if (back.offset == 0) { - var iter_fwd = wordbreak.iterator(string); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index < word.offset + word.len) - return word; - } - }; - // There's sometimes flags: - if (iter_back.flags > 0) { - while (iter_back.flags > 0) { - if (iter_back.prev()) |_| { - continue; - } else { - break; - } - } - } - var iter_fwd = iter_back.forwardIterator(); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index < word.offset + word.len) - return word; - } - unreachable; -} - -/// Returns an iterator over words in `slice`. -pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { - return Iterator.init(wordbreak, slice); -} - -/// Returns a reverse iterator over the words in `slice`. -pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { - return ReverseIterator.init(wordbreak, slice); -} - -/// An iterator, forward, over all words in a provided string. -pub const Iterator = struct { - this: ?CodePoint = null, - that: ?CodePoint = null, - cp_iter: CodepointIterator, - wb: *const WordBreak, - - /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const WordBreak, str: []const u8) Iterator { - var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; - wb_iter.advance(); - return wb_iter; - } - - /// Returns the next word segment, without advancing. - pub fn peek(iter: *Iterator) ?Word { - const cache = .{ iter.this, iter.that, iter.cp_iter }; - defer { - iter.this, iter.that, iter.cp_iter = cache; - } - return iter.next(); - } - - /// Returns a reverse iterator from the point this iterator is paused - /// at. Usually, and always when using the API to create iterators, - /// calling `prev()` will return the word just seen. - pub fn reverseIterator(iter: *Iterator) ReverseIterator { - var cp_it = iter.cp_iter.reverseIterator(); - if (iter.that) |_| - _ = cp_it.prev(); - if (iter.cp_iter.peek()) |_| - _ = cp_it.prev(); - return .{ - .wb = iter.wb, - .before = cp_it.prev(), - .after = iter.that, - .cp_iter = cp_it, - }; - } - - /// Returns the next word segment, if any. - pub fn next(iter: *Iterator) ?Word { - iter.advance(); - - // Done? - if (iter.this == null) return null; - // Last? - if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; - - const word_start = iter.this.?.offset; - var word_len: u32 = 0; - - // State variables. - var last_p: WordBreakProperty = .none; - var last_last_p: WordBreakProperty = .none; - var ri_count: usize = 0; - - scan: while (true) : (iter.advance()) { - const this = iter.this.?; - word_len += this.len; - if (iter.that) |that| { - const this_p = iter.wb.breakProp(this); - const that_p = iter.wb.breakProp(that); - if (!isIgnorable(this_p)) { - last_last_p = last_p; - last_p = this_p; - } - // WB3 CR × LF - if (this_p == .CR and that_p == .LF) continue :scan; - // WB3a (Newline | CR | LF) ÷ - if (isNewline(this_p)) break :scan; - // WB3b ÷ (Newline | CR | LF) - if (isNewline(that_p)) break :scan; - // WB3c ZWJ × \p{Extended_Pictographic} - if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { - continue :scan; - } - // WB3d WSegSpace × WSegSpace - if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; - // WB4 X (Extend | Format | ZWJ)* → X - if (isIgnorable(that_p)) { - continue :scan; - } // Now we use last_p instead of this_p for ignorable's sake - if (isAHLetter(last_p)) { - // WB5 AHLetter × AHLetter - if (isAHLetter(that_p)) continue :scan; - // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter - if (isMidVal(that_p)) { - const next_val = iter.peekPast(); - if (next_val) |next_cp| { - const next_p = iter.wb.breakProp(next_cp); - if (isAHLetter(next_p)) { - continue :scan; - } - } - } - } - // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter - if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) { - continue :scan; - } - if (last_p == .Hebrew_Letter) { - // WB7a Hebrew_Letter × Single_Quote - if (that_p == .Single_Quote) continue :scan; - // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter - if (that_p == .Double_Quote) { - const next_val = iter.peekPast(); - if (next_val) |next_cp| { - const next_p = iter.wb.breakProp(next_cp); - if (next_p == .Hebrew_Letter) { - continue :scan; - } - } - } - } - // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter - if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter) - continue :scan; - // WB8 Numeric × Numeric - if (last_p == .Numeric and that_p == .Numeric) continue :scan; - // WB9 AHLetter × Numeric - if (isAHLetter(last_p) and that_p == .Numeric) continue :scan; - // WB10 Numeric × AHLetter - if (last_p == .Numeric and isAHLetter(that_p)) continue :scan; - // WB11 Numeric (MidNum | MidNumLetQ) × Numeric - if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric) - continue :scan; - // WB12 Numeric × (MidNum | MidNumLetQ) Numeric - if (last_p == .Numeric and isMidNum(that_p)) { - const next_val = iter.peekPast(); - if (next_val) |next_cp| { - const next_p = iter.wb.breakProp(next_cp); - if (next_p == .Numeric) { - continue :scan; - } - } - } - // WB13 Katakana × Katakana - if (last_p == .Katakana and that_p == .Katakana) continue :scan; - // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet - if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan; - // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) - if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; - // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI - const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator; - if (maybe_flag) { - ri_count += 1; - if (ri_count % 2 == 1) continue :scan; - } - // WB999 Any ÷ Any - break :scan; - } else { // iter.that == null - break :scan; - } - } - - return Word{ .len = word_len, .offset = word_start }; - } - - pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { - try writer.print( - "Iterator {{ .this = {any}, .that = {any} }}", - .{ iter.this, iter.that }, - ); - } - - fn advance(iter: *Iterator) void { - iter.this = iter.that; - iter.that = iter.cp_iter.next(); - } - - fn peekPast(iter: *Iterator) ?CodePoint { - const save_cp = iter.cp_iter; - defer iter.cp_iter = save_cp; - while (iter.cp_iter.peek()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; - _ = iter.cp_iter.next(); - } - return null; - } -}; - -/// An iterator, backward, over all words in a provided string. -pub const ReverseIterator = struct { - after: ?CodePoint = null, - before: ?CodePoint = null, - cp_iter: ReverseCodepointIterator, - wb: *const WordBreak, - flags: usize = 0, - - /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { - var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; - wb_iter.advance(); - return wb_iter; - } - - /// Returns the previous word segment, if any, without advancing. - pub fn peek(iter: *ReverseIterator) ?Word { - const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; - defer { - iter.before, iter.after, iter.cp_iter, iter.flags = cache; - } - return iter.prev(); - } - - /// Return a forward iterator from where this iterator paused. Usually, - /// and always when using the API to create iterators, calling `next()` - /// will return the word just seen. - pub fn forwardIterator(iter: *ReverseIterator) Iterator { - var cp_it = iter.cp_iter.forwardIterator(); - if (iter.before) |_| - _ = cp_it.next(); - return .{ - .wb = iter.wb, - .this = cp_it.next(), - .that = iter.after, - .cp_iter = cp_it, - }; - } - - /// Return the previous word, if any. - pub fn prev(iter: *ReverseIterator) ?Word { - iter.advance(); - - // Done? - if (iter.after == null) return null; - // Last? - if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; - - const word_end = iter.after.?.offset + iter.after.?.len; - var word_len: u32 = 0; - - // State variables. - var last_p: WordBreakProperty = .none; - var last_last_p: WordBreakProperty = .none; - - scan: while (true) : (iter.advance()) { - const after = iter.after.?; - word_len += after.len; - if (iter.before) |before| { - var sneak = sneaky(iter); // 'sneaks' past ignorables - const after_p = iter.wb.breakProp(after); - var before_p = iter.wb.breakProp(before); - if (!isIgnorable(after_p)) { - last_last_p = last_p; - last_p = after_p; - } - // WB3 CR × LF - if (before_p == .CR and after_p == .LF) continue :scan; - // WB3a (Newline | CR | LF) ÷ - if (isNewline(before_p)) break :scan; - // WB3b ÷ (Newline | CR | LF) - if (isNewline(after_p)) break :scan; - // WB3c ZWJ × \p{Extended_Pictographic} - if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) { - continue :scan; - } - // WB3d WSegSpace × WSegSpace - if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; - // WB4 X (Extend | Format | ZWJ)* → X - if (isIgnorable(before_p)) { - const maybe_before = sneak.prev(); - if (maybe_before) |valid_before| { - before_p = iter.wb.breakProp(valid_before); - } else if (!isIgnorable(after_p)) { - // We're done - break :scan; - } - } - if (isIgnorable(after_p)) continue :scan; - // WB5 AHLetter × AHLetter - if (isAHLetter(last_p) and isAHLetter(before_p)) { - continue :scan; - } - // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter - if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) { - continue :scan; - } - // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter - if (isMidVal(before_p) and isAHLetter(last_p)) { - const prev_val = sneak.peek(); - if (prev_val) |prev_cp| { - const prev_p = iter.wb.breakProp(prev_cp); - if (isAHLetter(prev_p)) { - continue :scan; - } - } - } - // WB7a Hebrew_Letter × Single_Quote - if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan; - // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter - if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) { - continue :scan; - } - // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter - if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { - const prev_val = sneak.peek(); - if (prev_val) |prev_cp| { - const prev_p = iter.wb.breakProp(prev_cp); - if (prev_p == .Hebrew_Letter) { - continue :scan; - } - } - } - // WB8 Numeric × Numeric - if (before_p == .Numeric and last_p == .Numeric) continue :scan; - // WB9 AHLetter × Numeric - if (isAHLetter(before_p) and last_p == .Numeric) continue :scan; - // WB10 Numeric × AHLetter - if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; - // WB11 Numeric (MidNum | MidNumLetQ) × Numeric - if (isMidNum(before_p) and last_p == .Numeric) { - const prev_val = sneak.peek(); - if (prev_val) |prev_cp| { - const prev_p = iter.wb.breakProp(prev_cp); - if (prev_p == .Numeric) { - continue :scan; - } - } - } - // WB12 Numeric × (MidNum | MidNumLetQ) Numeric - if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) { - continue :scan; - } - // WB13 Katakana × Katakana - if (before_p == .Katakana and last_p == .Katakana) continue :scan; - // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet - if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan; - // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) - if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; - // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI - // NOTE: - // So here we simply have to know whether a run of flags is even or odd. - // The whole run. To avoid quadratic behavior (and long flag runs are - // actually a thing in the wild), we have to count them once, store that - // on the iterator, and decrement each time we see two, possibly breaking - // once extra at the beginning. They break up one per flag, once we hit - // zero, that's all the flags. If we see another flag we do it again. - if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) { - defer { - if (iter.flags > 0) iter.flags -= 1; - } - if (iter.flags == 0) { - iter.flags = sneak.countFlags(); - } - if (iter.flags % 2 == 0) { - continue :scan; - } - } - // WB999 Any ÷ Any - break :scan; - } - break :scan; - } - return Word{ .len = word_len, .offset = word_end - word_len }; - } - - pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { - try writer.print( - "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", - .{ iter.before, iter.after, iter.flags }, - ); - } - - fn peekPast(iter: *ReverseIterator) ?CodePoint { - const save_cp = iter.cp_iter; - defer iter.cp_iter = save_cp; - while (iter.cp_iter.peek()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; - _ = iter.cp_iter.prev(); - } - return null; - } - - fn advance(iter: *ReverseIterator) void { - iter.after = iter.before; - iter.before = iter.cp_iter.prev(); - } -}; - -//| Implementation Details - -/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. -fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { - var idx: u32 = @intCast(index); - // Find the next lead byte: - while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} - if (idx == string.len) return wb.reverseIterator(string); - var iter: ReverseIterator = undefined; - iter.wb = wb; - iter.flags = 0; - // We need to populate the CodePoints, and the codepoint iterator. - // Consider "abc| def" with the cursor as |. - // We need `before` to be `c` and `after` to be ' ', - // and `cp_iter.prev()` to be `b`. - var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; - iter.after = cp_iter.prev(); - iter.before = cp_iter.prev(); - iter.cp_iter = cp_iter; - return iter; -} - -fn sneaky(iter: *const ReverseIterator) SneakIterator { - return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; -} - -const SneakIterator = struct { - cp_iter: ReverseCodepointIterator, - wb: *const WordBreak, - - fn peek(iter: *SneakIterator) ?CodePoint { - const save_cp = iter.cp_iter; - defer iter.cp_iter = save_cp; - while (iter.cp_iter.peek()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; - _ = iter.cp_iter.prev(); - } - return null; - } - - fn countFlags(iter: *SneakIterator) usize { - var flags: usize = 0; - const save_cp = iter.cp_iter; - defer iter.cp_iter = save_cp; - while (iter.cp_iter.prev()) |cp| { - const prop = iter.wb.breakProp(cp); - if (isIgnorable(prop)) continue; - if (prop == .Regional_Indicator) { - flags += 1; - } else break; - } - return flags; - } - - fn prev(iter: *SneakIterator) ?CodePoint { - while (iter.cp_iter.prev()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; - } - return null; - } -}; - -inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { - const decompressor = compress.flate.inflate.decompressor; - const in_bytes = @embedFile("wbp"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = decompressor(.raw, in_fbs.reader()); - var reader = in_decomp.reader(); - - const endian = builtin.cpu.arch.endian(); - - const stage_1_len: u16 = try reader.readInt(u16, endian); - wb.s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(wb.s1); - for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); - - const stage_2_len: u16 = try reader.readInt(u16, endian); - wb.s2 = try allocator.alloc(u5, stage_2_len); - errdefer allocator.free(wb.s2); - for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); - var count_0: usize = 0; - for (wb.s2) |nyb| { - if (nyb == 0) count_0 += 1; - } -} - -//| Predicates - -inline fn isNewline(wbp: WordBreakProperty) bool { - return wbp == .CR or wbp == .LF or wbp == .Newline; -} - -inline fn isIgnorable(wbp: WordBreakProperty) bool { - return switch (wbp) { - .Format, .Extend, .ZWJ => true, - else => false, - }; -} - -inline fn isAHLetter(wbp: WordBreakProperty) bool { - return wbp == .ALetter or wbp == .Hebrew_Letter; -} - -inline fn isMidVal(wbp: WordBreakProperty) bool { - return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; -} - -inline fn isMidNum(wbp: WordBreakProperty) bool { - return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; -} - -inline fn isExtensible(wbp: WordBreakProperty) bool { - return switch (wbp) { - .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, - else => false, - }; -} - -test "Word Break Properties" { - const wb = try WordBreak.init(testing.allocator); - defer wb.deinit(testing.allocator); - try testing.expectEqual(.CR, wb.breakProperty('\r')); - try testing.expectEqual(.LF, wb.breakProperty('\n')); - try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); - try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); -} - -test "ext_pict" { - try testing.expect(ext_pict.isMatch("👇")); - try testing.expect(ext_pict.isMatch("\u{2701}")); -} - -test wordAtIndex { - const wb = try WordBreak.init(testing.allocator); - defer wb.deinit(testing.allocator); - const t_string = "first second third"; - const second = wb.wordAtIndex(t_string, 8); - try testing.expectEqualStrings("second", second.bytes(t_string)); - const third = wb.wordAtIndex(t_string, 14); - try testing.expectEqualStrings("third", third.bytes(t_string)); - { - const first = wb.wordAtIndex(t_string, 3); - try testing.expectEqualStrings("first", first.bytes(t_string)); - } - { - const first = wb.wordAtIndex(t_string, 0); - try testing.expectEqualStrings("first", first.bytes(t_string)); - } - const last = wb.wordAtIndex(t_string, 14); - try testing.expectEqualStrings("third", last.bytes(t_string)); -} - -const testr = "don't a:ka fin!"; - -test "reversal" { - const wb = try WordBreak.init(testing.allocator); - defer wb.deinit(testing.allocator); - { - var fwd = wb.iterator(testr); - var this_word: ?Word = fwd.next(); - - while (this_word) |this| : (this_word = fwd.next()) { - var back = fwd.reverseIterator(); - const that_word = back.prev(); - if (that_word) |that| { - try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); - } else { - try testing.expect(false); - } - } - } - { - var back = wb.reverseIterator(testr); - var this_word: ?Word = back.prev(); - - while (this_word) |this| : (this_word = back.prev()) { - var fwd = back.forwardIterator(); - const that_word = fwd.next(); - if (that_word) |that| { - try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); - } else { - try testing.expect(false); - } - } - } -} - -fn testAllocations(allocator: Allocator) !void { - const wb = try WordBreak.init(allocator); - wb.deinit(allocator); -} - -test "allocation safety" { - try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); -} - -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; -const Allocator = mem.Allocator; -const assert = std.debug.assert; -const testing = std.testing; - -const code_point = @import("code_point"); -const CodepointIterator = code_point.Iterator; -const ReverseCodepointIterator = code_point.ReverseIterator; -const CodePoint = code_point.CodePoint; - -const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; diff --git a/src/Words.zig b/src/Words.zig new file mode 100644 index 0000000..6a532f5 --- /dev/null +++ b/src/Words.zig @@ -0,0 +1,720 @@ +//! Word Breaking Algorithm. + +const WordBreakProperty = enum(u5) { + none, + Double_Quote, + Single_Quote, + Hebrew_Letter, + CR, + LF, + Newline, + Extend, + Regional_Indicator, + Format, + Katakana, + ALetter, + MidLetter, + MidNum, + MidNumLet, + Numeric, + ExtendNumLet, + ZWJ, + WSegSpace, +}; + +s1: []u16 = undefined, +s2: []u5 = undefined, + +const WordBreak = @This(); + +pub fn init(allocator: Allocator) Allocator.Error!WordBreak { + var wb: WordBreak = undefined; + try wb.setup(allocator); + return wb; +} + +pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { + wb.setupImpl(allocator) catch |err| { + switch (err) { + error.OutOfMemory => |e| return e, + else => unreachable, + } + }; +} + +pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { + allocator.free(wordbreak.s1); + allocator.free(wordbreak.s2); +} + +/// Represents a Unicode word span, as an offset into the source string +/// and the length of the word. +pub const Word = struct { + offset: u32, + len: u32, + + /// Returns a slice of the word given the source string. + pub fn bytes(self: Word, src: []const u8) []const u8 { + return src[self.offset..][0..self.len]; + } +}; + +/// Returns the word break property type for `cp`. +pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { + return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); +} + +/// Convenience function for working with CodePoints +fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { + return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); +} + +/// Returns the Word at the given index. Asserts that the index is less than +/// `string.len`, and that the string is not empty. Always returns a word. +/// The index does not have to be the start of a codepoint in the word. +pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { + assert(index < string.len and string.len > 0); + var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); + const first_back = iter_back.prev(); + if (first_back) |back| { + if (back.offset == 0) { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + } + } else { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + } + const second_back = iter_back.prev(); + if (second_back) |back| if (back.offset == 0) { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + }; + // There's sometimes flags: + if (iter_back.flags > 0) { + while (iter_back.flags > 0) { + if (iter_back.prev()) |_| { + continue; + } else { + break; + } + } + } + var iter_fwd = iter_back.forwardIterator(); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + unreachable; +} + +/// Returns an iterator over words in `slice`. +pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { + return Iterator.init(wordbreak, slice); +} + +/// Returns a reverse iterator over the words in `slice`. +pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { + return ReverseIterator.init(wordbreak, slice); +} + +/// An iterator, forward, over all words in a provided string. +pub const Iterator = struct { + this: ?CodePoint = null, + that: ?CodePoint = null, + cp_iter: CodepointIterator, + wb: *const WordBreak, + + /// Assumes `str` is valid UTF-8. + pub fn init(wb: *const WordBreak, str: []const u8) Iterator { + var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; + wb_iter.advance(); + return wb_iter; + } + + /// Returns the next word segment, without advancing. + pub fn peek(iter: *Iterator) ?Word { + const cache = .{ iter.this, iter.that, iter.cp_iter }; + defer { + iter.this, iter.that, iter.cp_iter = cache; + } + return iter.next(); + } + + /// Returns a reverse iterator from the point this iterator is paused + /// at. Usually, and always when using the API to create iterators, + /// calling `prev()` will return the word just seen. + pub fn reverseIterator(iter: *Iterator) ReverseIterator { + var cp_it = iter.cp_iter.reverseIterator(); + if (iter.that) |_| + _ = cp_it.prev(); + if (iter.cp_iter.peek()) |_| + _ = cp_it.prev(); + return .{ + .wb = iter.wb, + .before = cp_it.prev(), + .after = iter.that, + .cp_iter = cp_it, + }; + } + + /// Returns the next word segment, if any. + pub fn next(iter: *Iterator) ?Word { + iter.advance(); + + // Done? + if (iter.this == null) return null; + // Last? + if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; + + const word_start = iter.this.?.offset; + var word_len: u32 = 0; + + // State variables. + var last_p: WordBreakProperty = .none; + var last_last_p: WordBreakProperty = .none; + var ri_count: usize = 0; + + scan: while (true) : (iter.advance()) { + const this = iter.this.?; + word_len += this.len; + if (iter.that) |that| { + const this_p = iter.wb.breakProp(this); + const that_p = iter.wb.breakProp(that); + if (!isIgnorable(this_p)) { + last_last_p = last_p; + last_p = this_p; + } + // WB3 CR × LF + if (this_p == .CR and that_p == .LF) continue :scan; + // WB3a (Newline | CR | LF) ÷ + if (isNewline(this_p)) break :scan; + // WB3b ÷ (Newline | CR | LF) + if (isNewline(that_p)) break :scan; + // WB3c ZWJ × \p{Extended_Pictographic} + if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { + continue :scan; + } + // WB3d WSegSpace × WSegSpace + if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; + // WB4 X (Extend | Format | ZWJ)* → X + if (isIgnorable(that_p)) { + continue :scan; + } // Now we use last_p instead of this_p for ignorable's sake + if (isAHLetter(last_p)) { + // WB5 AHLetter × AHLetter + if (isAHLetter(that_p)) continue :scan; + // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter + if (isMidVal(that_p)) { + const next_val = iter.peekPast(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProp(next_cp); + if (isAHLetter(next_p)) { + continue :scan; + } + } + } + } + // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter + if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) { + continue :scan; + } + if (last_p == .Hebrew_Letter) { + // WB7a Hebrew_Letter × Single_Quote + if (that_p == .Single_Quote) continue :scan; + // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter + if (that_p == .Double_Quote) { + const next_val = iter.peekPast(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProp(next_cp); + if (next_p == .Hebrew_Letter) { + continue :scan; + } + } + } + } + // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter + if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter) + continue :scan; + // WB8 Numeric × Numeric + if (last_p == .Numeric and that_p == .Numeric) continue :scan; + // WB9 AHLetter × Numeric + if (isAHLetter(last_p) and that_p == .Numeric) continue :scan; + // WB10 Numeric × AHLetter + if (last_p == .Numeric and isAHLetter(that_p)) continue :scan; + // WB11 Numeric (MidNum | MidNumLetQ) × Numeric + if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric) + continue :scan; + // WB12 Numeric × (MidNum | MidNumLetQ) Numeric + if (last_p == .Numeric and isMidNum(that_p)) { + const next_val = iter.peekPast(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProp(next_cp); + if (next_p == .Numeric) { + continue :scan; + } + } + } + // WB13 Katakana × Katakana + if (last_p == .Katakana and that_p == .Katakana) continue :scan; + // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet + if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan; + // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) + if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; + // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI + const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator; + if (maybe_flag) { + ri_count += 1; + if (ri_count % 2 == 1) continue :scan; + } + // WB999 Any ÷ Any + break :scan; + } else { // iter.that == null + break :scan; + } + } + + return Word{ .len = word_len, .offset = word_start }; + } + + pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + try writer.print( + "Iterator {{ .this = {any}, .that = {any} }}", + .{ iter.this, iter.that }, + ); + } + + fn advance(iter: *Iterator) void { + iter.this = iter.that; + iter.that = iter.cp_iter.next(); + } + + fn peekPast(iter: *Iterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + _ = iter.cp_iter.next(); + } + return null; + } +}; + +/// An iterator, backward, over all words in a provided string. +pub const ReverseIterator = struct { + after: ?CodePoint = null, + before: ?CodePoint = null, + cp_iter: ReverseCodepointIterator, + wb: *const WordBreak, + flags: usize = 0, + + /// Assumes `str` is valid UTF-8. + pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { + var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; + wb_iter.advance(); + return wb_iter; + } + + /// Returns the previous word segment, if any, without advancing. + pub fn peek(iter: *ReverseIterator) ?Word { + const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; + defer { + iter.before, iter.after, iter.cp_iter, iter.flags = cache; + } + return iter.prev(); + } + + /// Return a forward iterator from where this iterator paused. Usually, + /// and always when using the API to create iterators, calling `next()` + /// will return the word just seen. + pub fn forwardIterator(iter: *ReverseIterator) Iterator { + var cp_it = iter.cp_iter.forwardIterator(); + if (iter.before) |_| + _ = cp_it.next(); + return .{ + .wb = iter.wb, + .this = cp_it.next(), + .that = iter.after, + .cp_iter = cp_it, + }; + } + + /// Return the previous word, if any. + pub fn prev(iter: *ReverseIterator) ?Word { + iter.advance(); + + // Done? + if (iter.after == null) return null; + // Last? + if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; + + const word_end = iter.after.?.offset + iter.after.?.len; + var word_len: u32 = 0; + + // State variables. + var last_p: WordBreakProperty = .none; + var last_last_p: WordBreakProperty = .none; + + scan: while (true) : (iter.advance()) { + const after = iter.after.?; + word_len += after.len; + if (iter.before) |before| { + var sneak = sneaky(iter); // 'sneaks' past ignorables + const after_p = iter.wb.breakProp(after); + var before_p = iter.wb.breakProp(before); + if (!isIgnorable(after_p)) { + last_last_p = last_p; + last_p = after_p; + } + // WB3 CR × LF + if (before_p == .CR and after_p == .LF) continue :scan; + // WB3a (Newline | CR | LF) ÷ + if (isNewline(before_p)) break :scan; + // WB3b ÷ (Newline | CR | LF) + if (isNewline(after_p)) break :scan; + // WB3c ZWJ × \p{Extended_Pictographic} + if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) { + continue :scan; + } + // WB3d WSegSpace × WSegSpace + if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; + // WB4 X (Extend | Format | ZWJ)* → X + if (isIgnorable(before_p)) { + const maybe_before = sneak.prev(); + if (maybe_before) |valid_before| { + before_p = iter.wb.breakProp(valid_before); + } else if (!isIgnorable(after_p)) { + // We're done + break :scan; + } + } + if (isIgnorable(after_p)) continue :scan; + // WB5 AHLetter × AHLetter + if (isAHLetter(last_p) and isAHLetter(before_p)) { + continue :scan; + } + // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter + if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) { + continue :scan; + } + // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter + if (isMidVal(before_p) and isAHLetter(last_p)) { + const prev_val = sneak.peek(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (isAHLetter(prev_p)) { + continue :scan; + } + } + } + // WB7a Hebrew_Letter × Single_Quote + if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan; + // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter + if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) { + continue :scan; + } + // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter + if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { + const prev_val = sneak.peek(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (prev_p == .Hebrew_Letter) { + continue :scan; + } + } + } + // WB8 Numeric × Numeric + if (before_p == .Numeric and last_p == .Numeric) continue :scan; + // WB9 AHLetter × Numeric + if (isAHLetter(before_p) and last_p == .Numeric) continue :scan; + // WB10 Numeric × AHLetter + if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; + // WB11 Numeric (MidNum | MidNumLetQ) × Numeric + if (isMidNum(before_p) and last_p == .Numeric) { + const prev_val = sneak.peek(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (prev_p == .Numeric) { + continue :scan; + } + } + } + // WB12 Numeric × (MidNum | MidNumLetQ) Numeric + if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) { + continue :scan; + } + // WB13 Katakana × Katakana + if (before_p == .Katakana and last_p == .Katakana) continue :scan; + // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet + if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan; + // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) + if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; + // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI + // NOTE: + // So here we simply have to know whether a run of flags is even or odd. + // The whole run. To avoid quadratic behavior (and long flag runs are + // actually a thing in the wild), we have to count them once, store that + // on the iterator, and decrement each time we see two, possibly breaking + // once extra at the beginning. They break up one per flag, once we hit + // zero, that's all the flags. If we see another flag we do it again. + if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) { + defer { + if (iter.flags > 0) iter.flags -= 1; + } + if (iter.flags == 0) { + iter.flags = sneak.countFlags(); + } + if (iter.flags % 2 == 0) { + continue :scan; + } + } + // WB999 Any ÷ Any + break :scan; + } + break :scan; + } + return Word{ .len = word_len, .offset = word_end - word_len }; + } + + pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + try writer.print( + "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", + .{ iter.before, iter.after, iter.flags }, + ); + } + + fn peekPast(iter: *ReverseIterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + _ = iter.cp_iter.prev(); + } + return null; + } + + fn advance(iter: *ReverseIterator) void { + iter.after = iter.before; + iter.before = iter.cp_iter.prev(); + } +}; + +//| Implementation Details + +/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. +fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { + var idx: u32 = @intCast(index); + // Find the next lead byte: + while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} + if (idx == string.len) return wb.reverseIterator(string); + var iter: ReverseIterator = undefined; + iter.wb = wb; + iter.flags = 0; + // We need to populate the CodePoints, and the codepoint iterator. + // Consider "abc| def" with the cursor as |. + // We need `before` to be `c` and `after` to be ' ', + // and `cp_iter.prev()` to be `b`. + var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; + iter.after = cp_iter.prev(); + iter.before = cp_iter.prev(); + iter.cp_iter = cp_iter; + return iter; +} + +fn sneaky(iter: *const ReverseIterator) SneakIterator { + return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; +} + +const SneakIterator = struct { + cp_iter: ReverseCodepointIterator, + wb: *const WordBreak, + + fn peek(iter: *SneakIterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + _ = iter.cp_iter.prev(); + } + return null; + } + + fn countFlags(iter: *SneakIterator) usize { + var flags: usize = 0; + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.prev()) |cp| { + const prop = iter.wb.breakProp(cp); + if (isIgnorable(prop)) continue; + if (prop == .Regional_Indicator) { + flags += 1; + } else break; + } + return flags; + } + + fn prev(iter: *SneakIterator) ?CodePoint { + while (iter.cp_iter.prev()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + } + return null; + } +}; + +inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("wbp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + const stage_1_len: u16 = try reader.readInt(u16, endian); + wb.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(wb.s1); + for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); + + const stage_2_len: u16 = try reader.readInt(u16, endian); + wb.s2 = try allocator.alloc(u5, stage_2_len); + errdefer allocator.free(wb.s2); + for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); + var count_0: usize = 0; + for (wb.s2) |nyb| { + if (nyb == 0) count_0 += 1; + } +} + +//| Predicates + +inline fn isNewline(wbp: WordBreakProperty) bool { + return wbp == .CR or wbp == .LF or wbp == .Newline; +} + +inline fn isIgnorable(wbp: WordBreakProperty) bool { + return switch (wbp) { + .Format, .Extend, .ZWJ => true, + else => false, + }; +} + +inline fn isAHLetter(wbp: WordBreakProperty) bool { + return wbp == .ALetter or wbp == .Hebrew_Letter; +} + +inline fn isMidVal(wbp: WordBreakProperty) bool { + return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; +} + +inline fn isMidNum(wbp: WordBreakProperty) bool { + return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; +} + +inline fn isExtensible(wbp: WordBreakProperty) bool { + return switch (wbp) { + .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, + else => false, + }; +} + +test "Word Break Properties" { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + try testing.expectEqual(.CR, wb.breakProperty('\r')); + try testing.expectEqual(.LF, wb.breakProperty('\n')); + try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); + try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); +} + +test "ext_pict" { + try testing.expect(ext_pict.isMatch("👇")); + try testing.expect(ext_pict.isMatch("\u{2701}")); +} + +test wordAtIndex { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + const t_string = "first second third"; + const second = wb.wordAtIndex(t_string, 8); + try testing.expectEqualStrings("second", second.bytes(t_string)); + const third = wb.wordAtIndex(t_string, 14); + try testing.expectEqualStrings("third", third.bytes(t_string)); + { + const first = wb.wordAtIndex(t_string, 3); + try testing.expectEqualStrings("first", first.bytes(t_string)); + } + { + const first = wb.wordAtIndex(t_string, 0); + try testing.expectEqualStrings("first", first.bytes(t_string)); + } + const last = wb.wordAtIndex(t_string, 14); + try testing.expectEqualStrings("third", last.bytes(t_string)); +} + +const testr = "don't a:ka fin!"; + +test "reversal" { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + { + var fwd = wb.iterator(testr); + var this_word: ?Word = fwd.next(); + + while (this_word) |this| : (this_word = fwd.next()) { + var back = fwd.reverseIterator(); + const that_word = back.prev(); + if (that_word) |that| { + try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); + } else { + try testing.expect(false); + } + } + } + { + var back = wb.reverseIterator(testr); + var this_word: ?Word = back.prev(); + + while (this_word) |this| : (this_word = back.prev()) { + var fwd = back.forwardIterator(); + const that_word = fwd.next(); + if (that_word) |that| { + try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); + } else { + try testing.expect(false); + } + } + } +} + +fn testAllocations(allocator: Allocator) !void { + const wb = try WordBreak.init(allocator); + wb.deinit(allocator); +} + +test "allocation safety" { + try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); +} + +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const Allocator = mem.Allocator; +const assert = std.debug.assert; +const testing = std.testing; + +const code_point = @import("code_point"); +const CodepointIterator = code_point.Iterator; +const ReverseCodepointIterator = code_point.ReverseIterator; +const CodePoint = code_point.CodePoint; + +const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; -- cgit v1.2.3 From aa20bebade8eeb3ca75199dc252feb3edb203fb1 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 16 May 2025 12:06:36 -0400 Subject: Words module In keeping with the new nomenclature, we're calling the module "Words", not "WordBreak". The latter is Unicode jargon, the module provides word iterators. Words are the figure, word breaks are the ground. --- src/Words.zig | 42 +++++++++++++++++++++--------------------- src/unicode_tests.zig | 6 +++--- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'src') diff --git a/src/Words.zig b/src/Words.zig index 6a532f5..565a2fb 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -25,15 +25,15 @@ const WordBreakProperty = enum(u5) { s1: []u16 = undefined, s2: []u5 = undefined, -const WordBreak = @This(); +const Words = @This(); -pub fn init(allocator: Allocator) Allocator.Error!WordBreak { - var wb: WordBreak = undefined; +pub fn init(allocator: Allocator) Allocator.Error!Words { + var wb: Words = undefined; try wb.setup(allocator); return wb; } -pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { +pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { wb.setupImpl(allocator) catch |err| { switch (err) { error.OutOfMemory => |e| return e, @@ -42,7 +42,7 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { }; } -pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { +pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void { allocator.free(wordbreak.s1); allocator.free(wordbreak.s2); } @@ -60,19 +60,19 @@ pub const Word = struct { }; /// Returns the word break property type for `cp`. -pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { +pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty { return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); } /// Convenience function for working with CodePoints -fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { +fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty { return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); } /// Returns the Word at the given index. Asserts that the index is less than /// `string.len`, and that the string is not empty. Always returns a word. /// The index does not have to be the start of a codepoint in the word. -pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { +pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word { assert(index < string.len and string.len > 0); var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); const first_back = iter_back.prev(); @@ -118,12 +118,12 @@ pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize } /// Returns an iterator over words in `slice`. -pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { +pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator { return Iterator.init(wordbreak, slice); } /// Returns a reverse iterator over the words in `slice`. -pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { +pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator { return ReverseIterator.init(wordbreak, slice); } @@ -132,10 +132,10 @@ pub const Iterator = struct { this: ?CodePoint = null, that: ?CodePoint = null, cp_iter: CodepointIterator, - wb: *const WordBreak, + wb: *const Words, /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const WordBreak, str: []const u8) Iterator { + pub fn init(wb: *const Words, str: []const u8) Iterator { var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; wb_iter.advance(); return wb_iter; @@ -314,11 +314,11 @@ pub const ReverseIterator = struct { after: ?CodePoint = null, before: ?CodePoint = null, cp_iter: ReverseCodepointIterator, - wb: *const WordBreak, + wb: *const Words, flags: usize = 0, /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { + pub fn init(wb: *const Words, str: []const u8) ReverseIterator { var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; wb_iter.advance(); return wb_iter; @@ -511,7 +511,7 @@ pub const ReverseIterator = struct { //| Implementation Details /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. -fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { +fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator { var idx: u32 = @intCast(index); // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} @@ -536,7 +536,7 @@ fn sneaky(iter: *const ReverseIterator) SneakIterator { const SneakIterator = struct { cp_iter: ReverseCodepointIterator, - wb: *const WordBreak, + wb: *const Words, fn peek(iter: *SneakIterator) ?CodePoint { const save_cp = iter.cp_iter; @@ -570,7 +570,7 @@ const SneakIterator = struct { } }; -inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { +inline fn setupImpl(wb: *Words, allocator: Allocator) !void { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("wbp"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -627,7 +627,7 @@ inline fn isExtensible(wbp: WordBreakProperty) bool { } test "Word Break Properties" { - const wb = try WordBreak.init(testing.allocator); + const wb = try Words.init(testing.allocator); defer wb.deinit(testing.allocator); try testing.expectEqual(.CR, wb.breakProperty('\r')); try testing.expectEqual(.LF, wb.breakProperty('\n')); @@ -641,7 +641,7 @@ test "ext_pict" { } test wordAtIndex { - const wb = try WordBreak.init(testing.allocator); + const wb = try Words.init(testing.allocator); defer wb.deinit(testing.allocator); const t_string = "first second third"; const second = wb.wordAtIndex(t_string, 8); @@ -663,7 +663,7 @@ test wordAtIndex { const testr = "don't a:ka fin!"; test "reversal" { - const wb = try WordBreak.init(testing.allocator); + const wb = try Words.init(testing.allocator); defer wb.deinit(testing.allocator); { var fwd = wb.iterator(testr); @@ -696,7 +696,7 @@ test "reversal" { } fn testAllocations(allocator: Allocator) !void { - const wb = try WordBreak.init(allocator); + const wb = try Words.init(allocator); wb.deinit(allocator); } diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 7139d4c..18f1814 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -211,7 +211,7 @@ test "Segmentation Word Iterator" { var buf_reader = std.io.bufferedReader(file.reader()); var input_stream = buf_reader.reader(); - const wb = try WordBreak.init(allocator); + const wb = try Words.init(allocator); defer wb.deinit(allocator); var buf: [4096]u8 = undefined; @@ -392,5 +392,5 @@ const Graphemes = @import("Graphemes"); const GraphemeIterator = @import("Graphemes").Iterator; const Normalize = @import("Normalize"); -const WordBreak = @import("WordBreak"); -const Word = WordBreak.Word; +const Words = @import("Words"); +const Word = Words.Word; -- cgit v1.2.3 From ef27c51b8e46f3909a27fd137429b717797f1fd9 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 16:48:55 -0400 Subject: Add iterateBefore and iterateAfter These create reverse or forward iterators before or after a Word. So this way, the user can get the word at an index, then iterate forward or back from that word. Also: Fixes #59 Which was fixed awhile back, but I don't feel like doing repo surgery to tag the fix where it happened. We have blame for that kind of thing. --- src/Words.zig | 98 ++++++++++++++++++++++++++++++++++----------------- src/unicode_tests.zig | 38 ++++++++++++++++++++ 2 files changed, 104 insertions(+), 32 deletions(-) (limited to 'src') diff --git a/src/Words.zig b/src/Words.zig index 565a2fb..1d10b2a 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -1,4 +1,7 @@ //! Word Breaking Algorithm. +//! +//! https://www.unicode.org/reports/tr29/#Word_Boundaries +//! const WordBreakProperty = enum(u5) { none, @@ -42,9 +45,9 @@ pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { }; } -pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void { - allocator.free(wordbreak.s1); - allocator.free(wordbreak.s2); +pub fn deinit(words: *const Words, allocator: mem.Allocator) void { + allocator.free(words.s1); + allocator.free(words.s2); } /// Represents a Unicode word span, as an offset into the source string @@ -54,51 +57,44 @@ pub const Word = struct { len: u32, /// Returns a slice of the word given the source string. - pub fn bytes(self: Word, src: []const u8) []const u8 { - return src[self.offset..][0..self.len]; + pub fn bytes(word: Word, src: []const u8) []const u8 { + return src[word.offset..][0..word.len]; } }; /// Returns the word break property type for `cp`. -pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty { - return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); +pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty { + return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]); } /// Convenience function for working with CodePoints -fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty { - return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); +fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty { + return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]); } /// Returns the Word at the given index. Asserts that the index is less than /// `string.len`, and that the string is not empty. Always returns a word. /// The index does not have to be the start of a codepoint in the word. -pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word { +pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word { assert(index < string.len and string.len > 0); - var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); + var iter_back: ReverseIterator = reverseFromIndex(words, string, index); const first_back = iter_back.prev(); if (first_back) |back| { if (back.offset == 0) { - var iter_fwd = wordbreak.iterator(string); + var iter_fwd = words.iterator(string); while (iter_fwd.next()) |word| { if (word.offset <= index and index < word.offset + word.len) return word; } } } else { - var iter_fwd = wordbreak.iterator(string); + var iter_fwd = words.iterator(string); while (iter_fwd.next()) |word| { if (word.offset <= index and index < word.offset + word.len) return word; } } - const second_back = iter_back.prev(); - if (second_back) |back| if (back.offset == 0) { - var iter_fwd = wordbreak.iterator(string); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index < word.offset + word.len) - return word; - } - }; + _ = iter_back.prev(); // There's sometimes flags: if (iter_back.flags > 0) { while (iter_back.flags > 0) { @@ -118,13 +114,23 @@ pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Wo } /// Returns an iterator over words in `slice`. -pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator { - return Iterator.init(wordbreak, slice); +pub fn iterator(words: *const Words, slice: []const u8) Iterator { + return Iterator.init(words, slice); } /// Returns a reverse iterator over the words in `slice`. -pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator { - return ReverseIterator.init(wordbreak, slice); +pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { + return ReverseIterator.init(words, slice); +} + +/// Returns an iterator after the `word` in `slice`. +pub fn iterateAfter(words: *const Words, slice: []const u8, word: Word) Iterator { + return forwardFromIndex(words, slice, word.offset + word.len); +} + +/// Returns a reverse iterator before the `word` in `slice`. +pub fn iterateBefore(words: *const Words, slice: []const u8, word: Word) ReverseIterator { + return reverseFromIndex(words, slice, word.offset); } /// An iterator, forward, over all words in a provided string. @@ -135,8 +141,8 @@ pub const Iterator = struct { wb: *const Words, /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const Words, str: []const u8) Iterator { - var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; + pub fn init(words: *const Words, str: []const u8) Iterator { + var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words }; wb_iter.advance(); return wb_iter; } @@ -318,8 +324,8 @@ pub const ReverseIterator = struct { flags: usize = 0, /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const Words, str: []const u8) ReverseIterator { - var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; + pub fn init(words: *const Words, str: []const u8) ReverseIterator { + var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words }; wb_iter.advance(); return wb_iter; } @@ -511,13 +517,13 @@ pub const ReverseIterator = struct { //| Implementation Details /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. -fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator { +fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { var idx: u32 = @intCast(index); // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} - if (idx == string.len) return wb.reverseIterator(string); + if (idx == string.len) return words.reverseIterator(string); var iter: ReverseIterator = undefined; - iter.wb = wb; + iter.wb = words; iter.flags = 0; // We need to populate the CodePoints, and the codepoint iterator. // Consider "abc| def" with the cursor as |. @@ -530,6 +536,34 @@ fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterat return iter; } +fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { + var idx: u32 = @intCast(index); + if (idx == string.len) { + return .{ + .cp_iter = .{ .bytes = string, .i = idx }, + .this = null, + .that = null, + .wb = words, + }; + } + while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} + if (idx == 0) return words.iterator(string); + var iter: Iterator = undefined; + iter.wb = words; + // We need to populate the CodePoints, and the codepoint iterator. + // Consider "abc |def" with the cursor as |. + // We need `this` to be ` ` and `that` to be 'd', + // and `cp_iter.next()` to be `d`. + idx -= 1; + while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} + // "abc| def" + var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx }; + iter.this = cp_iter.next(); + iter.that = cp_iter.next(); + iter.cp_iter = cp_iter; + return iter; +} + fn sneaky(iter: *const ReverseIterator) SneakIterator { return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; } diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 18f1814..195fdcb 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -287,6 +287,25 @@ test "Segmentation Word Iterator" { } else { try testing.expect(false); } + var peek_iter = wb.iterateAfter(this_str, got_word); + const peek_1 = peek_iter.next(); + if (peek_1) |p1| { + const peek_2 = iter.peek(); + if (peek_2) |p2| { + std.testing.expectEqualSlices( + u8, + p1.bytes(this_str), + p2.bytes(this_str), + ) catch |err| { + debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx }); + return err; + }; + } else { + try testing.expect(false); + } + } else { + try testing.expectEqual(null, iter.peek()); + } for (got_word.offset..got_word.offset + got_word.len) |i| { const this_word = wb.wordAtIndex(this_str, i); std.testing.expectEqualSlices( @@ -337,6 +356,25 @@ test "Segmentation Word Iterator" { } else { try testing.expect(false); } + var peek_iter = wb.iterateBefore(this_str, got_word); + const peek_1 = peek_iter.prev(); + if (peek_1) |p1| { + const peek_2 = r_iter.peek(); + if (peek_2) |p2| { + std.testing.expectEqualSlices( + u8, + p1.bytes(this_str), + p2.bytes(this_str), + ) catch |err| { + debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx }); + return err; + }; + } else { + try testing.expect(false); + } + } else { + try testing.expectEqual(null, r_iter.peek()); + } for (got_word.offset..got_word.offset + got_word.len) |i| { const this_word = wb.wordAtIndex(this_str, i); std.testing.expectEqualSlices( -- cgit v1.2.3 From c9a1b3392973ee30e6a9a532f1da8605619b5b06 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 18:46:30 -0400 Subject: Make offset size configurable Hopefully I can talk users out of taking advantage of this configuration but I'll have better luck with that if it's available. --- src/Graphemes.zig | 20 +++++++++++--------- src/Words.zig | 14 ++++++++------ src/code_point.zig | 16 +++++++++------- src/unicode_tests.zig | 10 ++++++---- 4 files changed, 34 insertions(+), 26 deletions(-) (limited to 'src') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 0338c04..49fdbf3 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -5,9 +5,11 @@ const Allocator = mem.Allocator; const compress = std.compress; const unicode = std.unicode; -const CodePoint = @import("code_point").CodePoint; -const CodePointIterator = @import("code_point").Iterator; -const CodePointReverseIterator = @import("code_point").ReverseIterator; +const code_point = @import("code_point"); +const CodePoint = code_point.CodePoint; +const CodePointIterator = code_point.Iterator; +const CodePointReverseIterator = code_point.ReverseIterator; +const uoffset = code_point.uoffset; s1: []u16 = undefined, s2: []u16 = undefined, @@ -104,8 +106,8 @@ pub const Gbp = enum { /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. pub const Grapheme = struct { - len: u32, - offset: u32, + len: uoffset, + offset: uoffset, /// `bytes` returns the slice of bytes that correspond to /// this grapheme cluster in `src`. @@ -199,7 +201,7 @@ pub const ReverseIterator = struct { /// Count of pending RI codepoints, it is an even number ri_count: usize, /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji - extend_end: u32, + extend_end: uoffset, }; const Self = @This(); @@ -219,7 +221,7 @@ pub const ReverseIterator = struct { pub fn prev(self: *Self) ?Grapheme { if (self.buf[1] == null) return null; - const grapheme_end: u32 = end: { + const grapheme_end: uoffset = end: { const codepoint = self.buf[1].?; switch (self.pending) { @@ -270,7 +272,7 @@ pub const ReverseIterator = struct { if (!state.hasIndic()) { // BUF: [?Any, Extend | Linker] Consonant - var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; indic: while (true) { if (self.buf[0] == null) { @@ -321,7 +323,7 @@ pub const ReverseIterator = struct { if (!state.hasXpic()) { // BUF: [?Any, ZWJ] Emoji - var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; // Look for previous Emoji emoji: while (true) { diff --git a/src/Words.zig b/src/Words.zig index 1d10b2a..1707881 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -53,8 +53,8 @@ pub fn deinit(words: *const Words, allocator: mem.Allocator) void { /// Represents a Unicode word span, as an offset into the source string /// and the length of the word. pub const Word = struct { - offset: u32, - len: u32, + offset: uoffset, + len: uoffset, /// Returns a slice of the word given the source string. pub fn bytes(word: Word, src: []const u8) []const u8 { @@ -183,7 +183,7 @@ pub const Iterator = struct { if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; const word_start = iter.this.?.offset; - var word_len: u32 = 0; + var word_len: uoffset = 0; // State variables. var last_p: WordBreakProperty = .none; @@ -364,7 +364,7 @@ pub const ReverseIterator = struct { if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; const word_end = iter.after.?.offset + iter.after.?.len; - var word_len: u32 = 0; + var word_len: uoffset = 0; // State variables. var last_p: WordBreakProperty = .none; @@ -518,7 +518,7 @@ pub const ReverseIterator = struct { /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { - var idx: u32 = @intCast(index); + var idx: uoffset = @intCast(index); // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} if (idx == string.len) return words.reverseIterator(string); @@ -537,7 +537,7 @@ fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) Rever } fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { - var idx: u32 = @intCast(index); + var idx: uoffset = @intCast(index); if (idx == string.len) { return .{ .cp_iter = .{ .bytes = string, .i = idx }, @@ -746,6 +746,8 @@ const Allocator = mem.Allocator; const assert = std.debug.assert; const testing = std.testing; +const uoffset = code_point.uoffset; + const code_point = @import("code_point"); const CodepointIterator = code_point.Iterator; const ReverseCodepointIterator = code_point.ReverseIterator; diff --git a/src/code_point.zig b/src/code_point.zig index 9a84080..8bd3d5b 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -4,12 +4,14 @@ //! Represents invalid data according to the Replacement of Maximal //! Subparts algorithm. +pub const uoffset = if (@import("config").fat_offset) u64 else u32; + /// `CodePoint` represents a Unicode code point by its code, /// length, and offset in the source bytes. pub const CodePoint = struct { code: u21, len: u3, - offset: u32, + offset: uoffset, /// Return the slice of this codepoint, given the original string. pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { @@ -27,8 +29,8 @@ pub const CodePoint = struct { /// This function is deprecated and will be removed in a later release. /// Use `decodeAtIndex` or `decodeAtCursor`. -pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { - var off: u32 = 0; +pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { + var off: uoffset = 0; var maybe_code = decodeAtCursor(bytes, &off); if (maybe_code) |*code| { code.offset = offset; @@ -38,14 +40,14 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { } /// Decode the CodePoint, if any, at `bytes[idx]`. -pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint { +pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint { var off = idx; return decodeAtCursor(bytes, &off); } /// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the /// cursor will point at the next potential codepoint index. -pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { +pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint { // EOS if (cursor.* >= bytes.len) return null; @@ -161,7 +163,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { /// `Iterator` iterates a string one `CodePoint` at-a-time. pub const Iterator = struct { bytes: []const u8, - i: u32 = 0, + i: uoffset = 0, pub fn init(bytes: []const u8) Iterator { return .{ .bytes = bytes, .i = 0 }; @@ -257,7 +259,7 @@ const class_mask: [12]u8 = .{ pub const ReverseIterator = struct { bytes: []const u8, - i: ?u32, + i: ?uoffset, pub fn init(str: []const u8) ReverseIterator { var r_iter: ReverseIterator = undefined; diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 195fdcb..c463dcc 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -141,12 +141,12 @@ test "Segmentation GraphemeIterator" { defer all_bytes.deinit(); var graphemes = std.mem.splitSequence(u8, line, " ÷ "); - var bytes_index: u32 = 0; + var bytes_index: uoffset = 0; while (graphemes.next()) |field| { var code_points = std.mem.splitScalar(u8, field, ' '); var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; + var cp_index: uoffset = 0; var gc_len: u8 = 0; while (code_points.next()) |code_point| { @@ -231,12 +231,12 @@ test "Segmentation Word Iterator" { defer all_bytes.deinit(); var words = std.mem.splitSequence(u8, line, " ÷ "); - var bytes_index: u32 = 0; + var bytes_index: uoffset = 0; while (words.next()) |field| { var code_points = std.mem.splitScalar(u8, field, ' '); var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; + var cp_index: uoffset = 0; var gc_len: u8 = 0; while (code_points.next()) |code_point| { @@ -425,6 +425,8 @@ const debug = std.debug; const testing = std.testing; const unicode = std.unicode; +const uoffset = @FieldType(Word, "offset"); + const Grapheme = @import("Graphemes").Grapheme; const Graphemes = @import("Graphemes"); const GraphemeIterator = @import("Graphemes").Iterator; -- cgit v1.2.3 From 8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 1 Jun 2025 14:08:25 -0400 Subject: Add graphemeAtIndex + iterate before and after That completes the set. I do think it's possible to bum a few more cycles from the implementation, but, I'm not going to. It passes the acceptance suite and that's what it needs to do. --- src/Graphemes.zig | 220 +++++++++++++++++++++++++++++++++----------------- src/Words.zig | 4 +- src/code_point.zig | 60 +++++++++++++- src/unicode_tests.zig | 69 +++++++++++++--- 4 files changed, 266 insertions(+), 87 deletions(-) (limited to 'src') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 49fdbf3..f1c56ed 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -1,15 +1,7 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const mem = std.mem; -const Allocator = mem.Allocator; -const compress = std.compress; -const unicode = std.unicode; - -const code_point = @import("code_point"); -const CodePoint = code_point.CodePoint; -const CodePointIterator = code_point.Iterator; -const CodePointReverseIterator = code_point.ReverseIterator; -const uoffset = code_point.uoffset; +//! Graphemes Module +//! +//! Code for handling graphemes: fragments of string which should be +//! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾 s1: []u16 = undefined, s2: []u16 = undefined, @@ -69,10 +61,12 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; } +/// Returns an iterator over the graphemes in `string`. pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { return Iterator.init(string, graphemes); } +/// Returns a reverse iterator over the graphemes in `string`. pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { return ReverseIterator.init(string, graphemes); } @@ -116,6 +110,96 @@ pub const Grapheme = struct { } }; +// NOTE: graphemeAtIndex is, probably, not in an optimal form. It has the advantage +// of being composed of other parts, but the constant factor can _probably_ be improved +// by a bespoke implmentation using graphemes.graphemeBreak directly. There's a limit +// to how much cycle-bumming I'm willing to do at any given moment; that limit has been +// reached. Perhaps you, Dear Reader, might pick up the torch? + +/// Returns the `Grapheme` at `string[index]`, which does not have to be a +/// valid start of a codepoint. Asserts the string is not empty. Index must be +/// less than `string.len`. Always returns a `Grapheme`. +pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme { + assert(string.len != 0); + if (index == 0 or (index > 0 and + string[index] < 0x80 and + string[index - 1] < 0x80) and + (string[index - 1] != '\r' and string[index] != '\n')) + { + // There's always a grapheme break between two ASCII code points (except CRLF) + var iter = graphemes.iterator(string[index..]); + const next = iter.next().?; + return Grapheme{ + .len = next.len, + .offset = @as(u32, @intCast(index)) + next.offset, + }; + } // Otherwise it gets hairy. + const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; + if (idx == string.len) { + var iter = graphemes.reverseIterator(string); + return iter.prev().?; + } + // We're on a valid codepoint boundary, we go back from here + var r_iter = graphemes.reverseIterAtIndex(string, idx); + if (r_iter.prev()) |g| { + if (g.offset == 0) { + var iter = graphemes.iterator(string); + while (iter.next()) |g2| { + if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; + } + } + } + // We need to toss one, because otherwise we might not be pending when + // we in fact need to be. + _ = r_iter.prev(); + while (r_iter.pending != .none) : (_ = r_iter.prev()) {} + var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); + while (iter.next()) |g| { + if (g.offset <= idx and idx < g.offset + g.len) return g; + } + unreachable; +} + +/// Return a (forward) iterator of `string` after `grapheme`. +pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator { + return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); +} + +/// Return a reverse iterator of `string` before `grapheme`. +pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator { + // This bit of weirdness is because reverse iterators are "advance last", + // while forward iterators are "advance first". This leaves some room for + // further optimization, if anyone dares. + var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); + _ = r_iter.prev(); + return r_iter; +} + +fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator { + var r_iter: ReverseIterator = undefined; + r_iter.data = graphemes; + var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; + r_iter.buf[1] = rcp_iter.prev(); + r_iter.buf[0] = rcp_iter.prev(); + r_iter.pending = .none; + r_iter.cp_iter = rcp_iter; + return r_iter; +} + +fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator { + var iter: Iterator = undefined; + iter.data = graphemes; + iter.buf[0] = first: { + if (idx == string.len) break :first null; + var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; + break :first r_cp_iter.prev(); + }; + var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx }; + iter.buf[1] = cp_iter.next(); + iter.cp_iter = cp_iter; + return iter; +} + /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. pub const Iterator = struct { buf: [2]?CodePoint = .{ null, null }, @@ -150,7 +234,7 @@ pub const Iterator = struct { const gc_start = self.buf[0].?.offset; var gc_len: u8 = self.buf[0].?.len; - var state = State{}; + var state = IterState{}; if (graphemeBreak( self.buf[0].?.code, @@ -189,12 +273,13 @@ pub const Iterator = struct { } }; +/// Iterate a string backward by Grapheme. pub const ReverseIterator = struct { buf: [2]?CodePoint = .{ null, null }, cp_iter: CodePointReverseIterator, data: *const Graphemes, /// Codepoint read from `cp_iter` but not returned by `previous` - pending: Pending = .{ .none = {} }, + pending: Pending = .none, const Pending = union(enum) { none: void, @@ -218,6 +303,12 @@ pub const ReverseIterator = struct { self.buf[0] = self.cp_iter.prev(); } + pub fn peek(self: *Self) ?Grapheme { + const cache = .{ self.buf, self.cp_iter, self.pending }; + defer self.buf, self.cp_iter, self.pending = cache; + return self.prev(); + } + pub fn prev(self: *Self) ?Grapheme { if (self.buf[1] == null) return null; @@ -255,10 +346,10 @@ pub const ReverseIterator = struct { }; while (self.buf[0] != null) { - var state: State = .{}; - state.setXpic(); - state.unsetRegional(); - state.setIndic(); + var state: IterState = .{}; + state.xpic = true; + state.regional = false; + state.indic = true; if (graphemeBreak( self.buf[0].?.code, @@ -269,7 +360,7 @@ pub const ReverseIterator = struct { self.advance(); - if (!state.hasIndic()) { + if (!state.indic) { // BUF: [?Any, Extend | Linker] Consonant var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; @@ -296,11 +387,11 @@ pub const ReverseIterator = struct { self.advance(); if (self.buf[0]) |cp1| { - state.setIndic(); + state.indic = true; if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; - if (!state.hasIndic()) { + if (!state.indic) { continue :indic; } else { break :indic; @@ -321,7 +412,7 @@ pub const ReverseIterator = struct { } } - if (!state.hasXpic()) { + if (!state.xpic) { // BUF: [?Any, ZWJ] Emoji var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; @@ -370,7 +461,7 @@ pub const ReverseIterator = struct { } } - if (state.hasRegional()) { + if (state.regional) { var ri_count: usize = 0; while (self.buf[0] != null and self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) @@ -404,6 +495,13 @@ pub const ReverseIterator = struct { } }; +/// Grapheme Iterator state. +pub const IterState = packed struct(u3) { + xpic: bool = false, + regional: bool = false, + indic: bool = false, +}; + // Predicates fn isBreaker(cp: u21, data: *const Graphemes) bool { // Extract relevant properties. @@ -411,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool { return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; } -// Grapheme break state. -pub const State = struct { - bits: u3 = 0, - - // Extended Pictographic (emoji) - fn hasXpic(self: State) bool { - return self.bits & 1 == 1; - } - fn setXpic(self: *State) void { - self.bits |= 1; - } - fn unsetXpic(self: *State) void { - self.bits &= ~@as(u3, 1); - } - - // Regional Indicatior (flags) - fn hasRegional(self: State) bool { - return self.bits & 2 == 2; - } - fn setRegional(self: *State) void { - self.bits |= 2; - } - fn unsetRegional(self: *State) void { - self.bits &= ~@as(u3, 2); - } - - // Indic Conjunct - fn hasIndic(self: State) bool { - return self.bits & 4 == 4; - } - fn setIndic(self: *State) void { - self.bits |= 4; - } - fn unsetIndic(self: *State) void { - self.bits &= ~@as(u3, 4); - } -}; - /// `graphemeBreak` returns true only if a grapheme break point is required /// between `cp1` and `cp2`. `state` should start out as 0. If calling /// iteratively over a sequence of code points, this function must be called @@ -459,7 +519,7 @@ pub fn graphemeBreak( cp1: u21, cp2: u21, data: *const Graphemes, - state: *State, + state: *IterState, ) bool { // Extract relevant properties. const cp1_gbp_prop = data.gbp(cp1); @@ -471,9 +531,9 @@ pub fn graphemeBreak( const cp2_is_emoji = data.isEmoji(cp2); // GB11: Emoji Extend* ZWJ x Emoji - if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); + if (!state.xpic and cp1_is_emoji) state.xpic = true; // GB9c: Indic Conjunct Break - if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); + if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true; // GB3: CR x LF if (cp1 == '\r' and cp2 == '\n') return false; @@ -482,11 +542,11 @@ pub fn graphemeBreak( if (isBreaker(cp1, data)) return true; // GB11: Emoji Extend* ZWJ x Emoji - if (state.hasXpic() and + if (state.xpic and cp1_gbp_prop == .ZWJ and cp2_is_emoji) { - state.unsetXpic(); + state.xpic = false; return false; } @@ -501,11 +561,11 @@ pub fn graphemeBreak( // GB12, GB13: RI x RI if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { - if (state.hasRegional()) { - state.unsetRegional(); + if (state.regional) { + state.regional = false; return true; } else { - state.setRegional(); + state.regional = true; return false; } } @@ -530,25 +590,25 @@ pub fn graphemeBreak( } // GB9c: Indic Conjunct Break - if (state.hasIndic() and + if (state.indic and cp1_indic_prop == .Consonant and (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) { return false; } - if (state.hasIndic() and + if (state.indic and cp1_indic_prop == .Extend and cp2_indic_prop == .Linker) { return false; } - if (state.hasIndic() and + if (state.indic and (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and cp2_indic_prop == .Consonant) { - state.unsetIndic(); + state.indic = false; return false; } @@ -608,3 +668,17 @@ test "Iterator.peek" { try std.testing.expectEqual(null, iter.peek()); try std.testing.expectEqual(iter.peek(), iter.next()); } + +const std = @import("std"); +const builtin = @import("builtin"); +const assert = std.debug.assert; +const mem = std.mem; +const Allocator = mem.Allocator; +const compress = std.compress; +const unicode = std.unicode; + +const code_point = @import("code_point"); +const CodePoint = code_point.CodePoint; +const CodePointIterator = code_point.Iterator; +const CodePointReverseIterator = code_point.ReverseIterator; +const uoffset = code_point.uoffset; diff --git a/src/Words.zig b/src/Words.zig index 1707881..af82562 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -124,12 +124,12 @@ pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { } /// Returns an iterator after the `word` in `slice`. -pub fn iterateAfter(words: *const Words, slice: []const u8, word: Word) Iterator { +pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator { return forwardFromIndex(words, slice, word.offset + word.len); } /// Returns a reverse iterator before the `word` in `slice`. -pub fn iterateBefore(words: *const Words, slice: []const u8, word: Word) ReverseIterator { +pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator { return reverseFromIndex(words, slice, word.offset); } diff --git a/src/code_point.zig b/src/code_point.zig index 8bd3d5b..16648af 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -39,9 +39,17 @@ pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { return null; } +/// Return the codepoint at `index`, even if `index` is in the middle +/// of that codepoint. +pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { + var idx = index; + while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {} + return decodeAtIndex(bytes, idx); +} + /// Decode the CodePoint, if any, at `bytes[idx]`. -pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint { - var off = idx; +pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { + var off = index; return decodeAtCursor(bytes, &off); } @@ -329,6 +337,54 @@ test Iterator { try expectEqual(@as(?CodePoint, null), iter.next()); } +const code_point = @This(); + +// Keep this in sync with the README +test "Code point iterator" { + const str = "Hi 😊"; + var iter: code_point.Iterator = .init(str); + var i: usize = 0; + + while (iter.next()) |cp| : (i += 1) { + // The `code` field is the actual code point scalar as a `u21`. + if (i == 0) try expect(cp.code == 'H'); + if (i == 1) try expect(cp.code == 'i'); + if (i == 2) try expect(cp.code == ' '); + + if (i == 3) { + try expect(cp.code == '😊'); + // The `offset` field is the byte offset in the + // source string. + try expect(cp.offset == 3); + try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?); + // The `len` field is the length in bytes of the + // code point in the source string. + try expect(cp.len == 4); + // There is also a 'cursor' decode, like so: + { + var cursor = cp.offset; + try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?); + // Which advances the cursor variable to the next possible + // offset, in this case, `str.len`. Don't forget to account + // for this possibility! + try expectEqual(cp.offset + cp.len, cursor); + } + // There's also this, for when you aren't sure if you have the + // correct start for a code point: + try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?); + } + // Reverse iteration is also an option: + var r_iter: code_point.ReverseIterator = .init(str); + // Both iterators can be peeked: + try expectEqual('😊', r_iter.peek().?.code); + try expectEqual('😊', r_iter.prev().?.code); + // Both kinds of iterators can be reversed: + var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator(); + // This will always return the last codepoint from + // the prior iterator, _if_ it yielded one: + try expectEqual('😊', fwd_iter.next().?.code); + } +} test "overlongs" { // None of these should equal `/`, all should be byte-for-byte // handled as replacement characters. diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index c463dcc..ae177a9 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -162,20 +162,51 @@ test "Segmentation GraphemeIterator" { bytes_index += cp_index; } + const this_str = all_bytes.items; + { - var iter = graph.iterator(all_bytes.items); + var iter = graph.iterator(this_str); // Check. - for (want.items) |want_gc| { + for (want.items, 1..) |want_gc, idx| { const got_gc = (iter.next()).?; try std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), + want_gc.bytes(this_str), + got_gc.bytes(this_str), ); + for (got_gc.offset..got_gc.offset + got_gc.len) |i| { + const this_gc = graph.graphemeAtIndex(this_str, i); + std.testing.expectEqualSlices( + u8, + got_gc.bytes(this_str), + this_gc.bytes(this_str), + ) catch |err| { + debug.print("Wrong grapheme on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i }); + return err; + }; + } + var after_iter = graph.iterateAfterGrapheme(this_str, got_gc); + if (after_iter.next()) |next_gc| { + if (iter.peek()) |next_peek| { + std.testing.expectEqualSlices( + u8, + next_gc.bytes(this_str), + next_peek.bytes(this_str), + ) catch |err| { + debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, idx }); + return err; + }; + } else { + debug.print("Mismatch: peek missing, next found, line {d} #{d}\n", .{ line_iter.line, idx }); + try testing.expect(false); + } + } else { + try testing.expectEqual(null, iter.peek()); + } } } { - var iter = graph.reverseIterator(all_bytes.items); + var iter = graph.reverseIterator(this_str); // Check. var i: usize = want.items.len; @@ -190,8 +221,8 @@ test "Segmentation GraphemeIterator" { return error.TestExpectedEqual; }; std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), + want_gc.bytes(this_str), + got_gc.bytes(this_str), ) catch |err| { std.debug.print( "line {d} grapheme {d}: expected {any} found {any}\n", @@ -199,6 +230,24 @@ test "Segmentation GraphemeIterator" { ); return err; }; + var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc); + if (before_iter.prev()) |prev_gc| { + if (iter.peek()) |prev_peek| { + std.testing.expectEqualSlices( + u8, + prev_gc.bytes(this_str), + prev_peek.bytes(this_str), + ) catch |err| { + debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, i }); + return err; + }; + } else { + debug.print("Mismatch: peek missing, prev found, line {d} #{d}\n", .{ line_iter.line, i }); + try testing.expect(false); + } + } else { + try testing.expectEqual(null, iter.peek()); + } } } } @@ -287,7 +336,7 @@ test "Segmentation Word Iterator" { } else { try testing.expect(false); } - var peek_iter = wb.iterateAfter(this_str, got_word); + var peek_iter = wb.iterateAfterWord(this_str, got_word); const peek_1 = peek_iter.next(); if (peek_1) |p1| { const peek_2 = iter.peek(); @@ -313,7 +362,7 @@ test "Segmentation Word Iterator" { got_word.bytes(this_str), this_word.bytes(this_str), ) catch |err| { - debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); + debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i }); return err; }; } @@ -356,7 +405,7 @@ test "Segmentation Word Iterator" { } else { try testing.expect(false); } - var peek_iter = wb.iterateBefore(this_str, got_word); + var peek_iter = wb.iterateBeforeWord(this_str, got_word); const peek_1 = peek_iter.prev(); if (peek_1) |p1| { const peek_2 = r_iter.peek(); -- cgit v1.2.3 From e3082e64b3ab8a8aa0777d63be69eb8b6d50a654 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 8 Jul 2025 12:12:20 -0400 Subject: Add Words.zig example to README --- src/Words.zig | 17 +++++++++++++++++ src/code_point.zig | 3 +++ 2 files changed, 20 insertions(+) (limited to 'src') diff --git a/src/Words.zig b/src/Words.zig index af82562..617c34d 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -674,6 +674,23 @@ test "ext_pict" { try testing.expect(ext_pict.isMatch("\u{2701}")); } +test "Words" { + const wb = try Words.init(testing.allocator); + defer wb.deinit(testing.allocator); + const word_str = "Metonym Μετωνύμιο メトニム"; + var w_iter = wb.iterator(word_str); + try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); + // Spaces are "words" too! + try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); + const in_greek = w_iter.next().?; + for (in_greek.offset..in_greek.offset + in_greek.len) |i| { + const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); + try testing.expectEqualStrings("Μετωνύμιο", at_index); + } + _ = w_iter.next(); + try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str)); +} + test wordAtIndex { const wb = try Words.init(testing.allocator); defer wb.deinit(testing.allocator); diff --git a/src/code_point.zig b/src/code_point.zig index 16648af..7a638af 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -121,6 +121,9 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint { } if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); + // This, and the branch below, detect truncation, the + // only invalid state handled differently by the Maximal + // Subparts algorithm. if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { cursor.* -= 2; // +1 return .{ -- cgit v1.2.3