From 1a9168ab7d1d5337ec954f7897c2e6b51a0bd95e Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 4 Feb 2026 15:02:12 -0500 Subject: Convert Graphemes to static allocation And DisplayWidth, although untested at present. The plan is to just work through the codegen / module pairings, and move tests over until everything is covered. --- build.zig | 7 ++- codegen/dwp.zig | 24 ++++++-- codegen/gbp.zig | 36 ++++++++---- src/Graphemes.zig | 155 ++++++++++++++++++++------------------------------ src/unicode_tests.zig | 12 ++-- 5 files changed, 117 insertions(+), 117 deletions(-) diff --git a/build.zig b/build.zig index 5678cd1..aab8516 100644 --- a/build.zig +++ b/build.zig @@ -52,7 +52,7 @@ pub fn build(b: *std.Build) void { gbp_gen_exe.root_module.addAnonymousImport("GraphemeBreakProperty.txt", .{ .root_source_file = b.path("data/unicode/auxiliary/GraphemeBreakProperty.txt") }); gbp_gen_exe.root_module.addAnonymousImport("emoji-data.txt", .{ .root_source_file = b.path("data/unicode/emoji/emoji-data.txt") }); const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe); - const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.bin.z"); + const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.zig"); const wbp_gen_exe = b.addExecutable(.{ .name = "wbp", @@ -78,7 +78,7 @@ pub fn build(b: *std.Build) void { dwp_gen_exe.root_module.addAnonymousImport("DerivedGeneralCategory.txt", .{ .root_source_file = b.path("data/unicode/extracted/DerivedGeneralCategory.txt") }); dwp_gen_exe.root_module.addOptions("options", dwp_options); const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe); - const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z"); + const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.zig"); // Normalization properties const canon_gen_exe = b.addExecutable(.{ @@ -514,6 +514,9 @@ pub fn build(b: *std.Build) void { const run_unicode_tests = b.addRunArtifact(unicode_tests); + const test_unicode_step = b.step("unicode", "Rune unicode tests"); + test_unicode_step.dependOn(&run_unicode_tests.step); + const test_step = b.step("test", "Run all module tests"); test_step.dependOn(&run_unicode_tests.step); test_step.dependOn(&code_point_tr.step); diff --git a/codegen/dwp.zig b/codegen/dwp.zig index 75ac68e..b4d1ed0 100644 --- a/codegen/dwp.zig +++ b/codegen/dwp.zig @@ -235,12 +235,24 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - for (stage2.items) |i| try writer.interface.writeInt(i8, i, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. + \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]i4 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/codegen/gbp.zig b/codegen/gbp.zig index 1d06e9a..117847f 100644 --- a/codegen/gbp.zig +++ b/codegen/gbp.zig @@ -240,16 +240,32 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - for (stage2.items) |i| try writer.interface.writeInt(u16, i, endian); - - const props_bytes = stage3.keys(); - try writer.interface.writeInt(u16, @intCast(props_bytes.len), endian); - try writer.interface.writeAll(props_bytes); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. + \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u7 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + const keys = stage3.keys(); + + try writer.interface.print( + \\}}; + \\ + \\pub const s3: [{}]u8 = .{{ + , .{keys.len}); + for (keys) |entry| try writer.interface.print("{}, ", .{entry}); + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 81d874c..d14b6ab 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -3,70 +3,46 @@ //! Code for handling graphemes: fragments of string which should be //! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾 -s1: []u16 = undefined, -s2: []u16 = undefined, -s3: []u8 = undefined, - const Graphemes = @This(); -pub fn init(allocator: Allocator) Allocator.Error!Graphemes { - var graphemes = Graphemes{}; - try graphemes.setup(allocator); - return graphemes; -} - -pub fn setup(graphemes: *Graphemes, allocator: Allocator) Allocator.Error!void { - const in_bytes = @embedFile("gbp"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var reader = in_fbs.reader(); - - const endian = builtin.cpu.arch.endian(); - - const s1_len: u16 = reader.readInt(u16, endian) catch unreachable; - graphemes.s1 = try allocator.alloc(u16, s1_len); - errdefer allocator.free(graphemes.s1); - for (0..s1_len) |i| graphemes.s1[i] = reader.readInt(u16, endian) catch unreachable; - - const s2_len: u16 = reader.readInt(u16, endian) catch unreachable; - graphemes.s2 = try allocator.alloc(u16, s2_len); - errdefer allocator.free(graphemes.s2); - for (0..s2_len) |i| graphemes.s2[i] = reader.readInt(u16, endian) catch unreachable; - - const s3_len: u16 = reader.readInt(u16, endian) catch unreachable; - graphemes.s3 = try allocator.alloc(u8, s3_len); - errdefer allocator.free(graphemes.s3); - _ = reader.readAll(graphemes.s3) catch unreachable; -} +const Data = struct { + s1: []const u16 = undefined, + s2: []const u7 = undefined, + s3: []const u8 = undefined, +}; -pub fn deinit(graphemes: *const Graphemes, allocator: Allocator) void { - allocator.free(graphemes.s1); - allocator.free(graphemes.s2); - allocator.free(graphemes.s3); -} +const graphemes = graphemes: { + const data = @import("gbp"); + break :graphemes Data{ + .s1 = &data.s1, + .s2 = &data.s2, + .s3 = &data.s3, + }; +}; /// Lookup the grapheme break property for a code point. -pub fn gbp(graphemes: Graphemes, cp: u21) Gbp { +pub fn gbp(cp: u21) Gbp { return @enumFromInt(graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 4); } /// Lookup the indic syllable type for a code point. -pub fn indic(graphemes: Graphemes, cp: u21) Indic { +pub fn indic(cp: u21) Indic { return @enumFromInt((graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); } /// Lookup the emoji property for a code point. -pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { +pub fn isEmoji(cp: u21) bool { return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; } /// Returns an iterator over the graphemes in `string`. -pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { - return Iterator.init(string, graphemes); +pub fn iterator(string: []const u8) Iterator { + return Iterator.init(string); } /// Returns a reverse iterator over the graphemes in `string`. -pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { - return ReverseIterator.init(string, graphemes); +pub fn reverseIterator(string: []const u8) ReverseIterator { + return ReverseIterator.init(string); } /// Indic syllable type. @@ -81,6 +57,7 @@ pub const Indic = enum { /// Grapheme break property. pub const Gbp = enum { none, + Control, CR, Extend, @@ -117,7 +94,7 @@ pub const Grapheme = struct { /// Returns the `Grapheme` at `string[index]`, which does not have to be a /// valid start of a codepoint. Asserts the string is not empty. Index must be /// less than `string.len`. Always returns a `Grapheme`. -pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme { +pub fn graphemeAtIndex(string: []const u8, index: usize) Grapheme { assert(string.len != 0); if (index == 0 or (index > 0 and string[index] < 0x80 and @@ -125,7 +102,7 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u (string[index - 1] != '\r' and string[index] != '\n')) { // There's always a grapheme break between two ASCII code points (except CRLF) - var iter = graphemes.iterator(string[index..]); + var iter = Graphemes.iterator(string[index..]); const next = iter.next().?; return Grapheme{ .len = next.len, @@ -134,14 +111,14 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u } // Otherwise it gets hairy. const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; if (idx == string.len) { - var iter = graphemes.reverseIterator(string); + var iter = Graphemes.reverseIterator(string); return iter.prev().?; } // We're on a valid codepoint boundary, we go back from here - var r_iter = graphemes.reverseIterAtIndex(string, idx); + var r_iter = Graphemes.reverseIterAtIndex(string, idx); if (r_iter.prev()) |g| { if (g.offset == 0) { - var iter = graphemes.iterator(string); + var iter = Graphemes.iterator(string); while (iter.next()) |g2| { if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; } @@ -151,7 +128,7 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u // we in fact need to be. _ = r_iter.prev(); while (r_iter.pending != .none) : (_ = r_iter.prev()) {} - var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); + var iter = Graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); while (iter.next()) |g| { if (g.offset <= idx and idx < g.offset + g.len) return g; } @@ -159,23 +136,22 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u } /// Return a (forward) iterator of `string` after `grapheme`. -pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator { - return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); +pub fn iterateAfterGrapheme(string: []const u8, grapheme: Grapheme) Iterator { + return Graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); } /// Return a reverse iterator of `string` before `grapheme`. -pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator { +pub fn iterateBeforeGrapheme(string: []const u8, grapheme: Grapheme) ReverseIterator { // This bit of weirdness is because reverse iterators are "advance last", // while forward iterators are "advance first". This leaves some room for // further optimization, if anyone dares. - var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); + var r_iter = Graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); _ = r_iter.prev(); return r_iter; } -fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator { +fn reverseIterAtIndex(string: []const u8, idx: uoffset) ReverseIterator { var r_iter: ReverseIterator = undefined; - r_iter.data = graphemes; var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; r_iter.buf[1] = rcp_iter.prev(); r_iter.buf[0] = rcp_iter.prev(); @@ -184,9 +160,8 @@ fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoff return r_iter; } -fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator { +fn iterAtIndex(string: []const u8, idx: uoffset) Iterator { var iter: Iterator = undefined; - iter.data = graphemes; iter.buf[0] = first: { if (idx == string.len) break :first null; var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; @@ -202,13 +177,12 @@ fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) It pub const Iterator = struct { buf: [2]?CodePoint = .{ null, null }, cp_iter: CodePointIterator, - data: *const Graphemes, const Self = @This(); /// Assumes `src` is valid UTF-8. - pub fn init(str: []const u8, data: *const Graphemes) Self { - var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; + pub fn init(str: []const u8) Self { + var self = Self{ .cp_iter = .{ .bytes = str } }; self.advance(); return self; } @@ -237,7 +211,6 @@ pub const Iterator = struct { if (graphemeBreak( self.buf[0].?.code, self.buf[1].?.code, - self.data, &state, )) return Grapheme{ .len = gc_len, .offset = gc_start }; @@ -250,7 +223,6 @@ pub const Iterator = struct { if (graphemeBreak( self.buf[0].?.code, if (self.buf[1]) |ncp| ncp.code else 0, - self.data, &state, )) break; } @@ -275,7 +247,6 @@ pub const Iterator = struct { pub const ReverseIterator = struct { buf: [2]?CodePoint = .{ null, null }, cp_iter: CodePointReverseIterator, - data: *const Graphemes, /// Codepoint read from `cp_iter` but not returned by `previous` pending: Pending = .none, @@ -289,8 +260,8 @@ pub const ReverseIterator = struct { const Self = @This(); - pub fn init(str: []const u8, data: *const Graphemes) Self { - var self: Self = .{ .cp_iter = .init(str), .data = data }; + pub fn init(str: []const u8) Self { + var self: Self = .{ .cp_iter = .init(str) }; self.advance(); self.advance(); return self; @@ -352,7 +323,6 @@ pub const ReverseIterator = struct { if (graphemeBreak( self.buf[0].?.code, self.buf[1].?.code, - self.data, &state, )) break; @@ -374,7 +344,7 @@ pub const ReverseIterator = struct { const codepoint = self.buf[0].?; - switch (self.data.indic(codepoint.code)) { + switch (Graphemes.indic(codepoint.code)) { .Extend, .Linker => { self.advance(); continue :indic; @@ -387,7 +357,7 @@ pub const ReverseIterator = struct { if (self.buf[0]) |cp1| { state.indic = true; - if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; + if (graphemeBreak(cp1.code, self.buf[1].?.code, &state)) break; if (!state.indic) { continue :indic; @@ -426,12 +396,12 @@ pub const ReverseIterator = struct { const codepoint = self.buf[0].?; - if (self.data.gbp(codepoint.code) == .Extend) { + if (Graphemes.gbp(codepoint.code) == .Extend) { self.advance(); continue :emoji; } - if (self.data.isEmoji(codepoint.code)) { + if (Graphemes.isEmoji(codepoint.code)) { // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* emoji_offset = codepoint.offset; self.advance(); @@ -462,7 +432,7 @@ pub const ReverseIterator = struct { if (state.regional) { var ri_count: usize = 0; while (self.buf[0] != null and - self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) + Graphemes.gbp(self.buf[0].?.code) == .Regional_Indicator) { ri_count += 1; self.advance(); @@ -500,10 +470,13 @@ pub const IterState = packed struct(u3) { indic: bool = false, }; +// TODO: isBreaker is also expensive given the data is already available, +// and should be "semantically inlined" wherever it belongs. + // Predicates -fn isBreaker(cp: u21, data: *const Graphemes) bool { +fn isBreaker(cp: u21) bool { // Extract relevant properties. - const cp_gbp_prop = data.gbp(cp); + const cp_gbp_prop = Graphemes.gbp(cp); return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; } @@ -516,17 +489,20 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool { pub fn graphemeBreak( cp1: u21, cp2: u21, - data: *const Graphemes, state: *IterState, ) bool { + // TODO: it's silly to index the same field three times and + // just extra different bits from the data. Optimizable? Maybe + // but it's silly to rely on that. + // // Extract relevant properties. - const cp1_gbp_prop = data.gbp(cp1); - const cp1_indic_prop = data.indic(cp1); - const cp1_is_emoji = data.isEmoji(cp1); + const cp1_gbp_prop = Graphemes.gbp(cp1); + const cp1_indic_prop = Graphemes.indic(cp1); + const cp1_is_emoji = Graphemes.isEmoji(cp1); - const cp2_gbp_prop = data.gbp(cp2); - const cp2_indic_prop = data.indic(cp2); - const cp2_is_emoji = data.isEmoji(cp2); + const cp2_gbp_prop = Graphemes.gbp(cp2); + const cp2_indic_prop = Graphemes.indic(cp2); + const cp2_is_emoji = Graphemes.isEmoji(cp2); // GB11: Emoji Extend* ZWJ x Emoji if (!state.xpic and cp1_is_emoji) state.xpic = true; @@ -537,7 +513,7 @@ pub fn graphemeBreak( if (cp1 == '\r' and cp2 == '\n') return false; // GB4: Control - if (isBreaker(cp1, data)) return true; + if (isBreaker(cp1)) return true; // GB11: Emoji Extend* ZWJ x Emoji if (state.xpic and @@ -555,7 +531,7 @@ pub fn graphemeBreak( if (cp2_gbp_prop == .SpacingMark) return false; // GB9b: Prepend x - if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false; + if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false; // GB12, GB13: RI x RI if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { @@ -620,25 +596,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; const no_joiner = seq_1 ++ seq_2; - const graphemes = try Graphemes.init(std.testing.allocator); - defer graphemes.deinit(std.testing.allocator); - { - var iter = graphemes.iterator(with_zwj); + var iter = Graphemes.iterator(with_zwj); var i: usize = 0; while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 1), i); } { - var iter = graphemes.iterator(with_zwsp); + var iter = Graphemes.iterator(with_zwsp); var i: usize = 0; while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 3), i); } { - var iter = graphemes.iterator(no_joiner); + var iter = Graphemes.iterator(no_joiner); var i: usize = 0; while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 2), i); @@ -647,10 +620,8 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { test "Iterator.peek" { const peek_seq = "aΔ👨🏻‍🌾→"; - const data = try Graphemes.init(std.testing.allocator); - defer data.deinit(std.testing.allocator); - var iter = data.iterator(peek_seq); + var iter = Graphemes.iterator(peek_seq); const peek_a = iter.peek().?; const next_a = iter.next().?; try std.testing.expectEqual(peek_a, next_a); diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index e2a5a96..946c197 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -118,8 +118,6 @@ test "Segmentation GraphemeIterator" { const allocator = std.testing.allocator; var reader = std.io.Reader.fixed(@embedFile("GraphemeBreakTest.txt")); - const graph = try Graphemes.init(allocator); - defer graph.deinit(allocator); var line_iter: IterRead = .{ .read = &reader }; @@ -161,7 +159,7 @@ test "Segmentation GraphemeIterator" { const this_str = all_bytes.items; { - var iter = graph.iterator(this_str); + var iter = Graphemes.iterator(this_str); // Check. for (want.items, 1..) |want_gc, idx| { @@ -171,7 +169,7 @@ test "Segmentation GraphemeIterator" { got_gc.bytes(this_str), ); for (got_gc.offset..got_gc.offset + got_gc.len) |i| { - const this_gc = graph.graphemeAtIndex(this_str, i); + const this_gc = Graphemes.graphemeAtIndex(this_str, i); std.testing.expectEqualSlices( u8, got_gc.bytes(this_str), @@ -181,7 +179,7 @@ test "Segmentation GraphemeIterator" { return err; }; } - var after_iter = graph.iterateAfterGrapheme(this_str, got_gc); + var after_iter = Graphemes.iterateAfterGrapheme(this_str, got_gc); if (after_iter.next()) |next_gc| { if (iter.peek()) |next_peek| { std.testing.expectEqualSlices( @@ -202,7 +200,7 @@ test "Segmentation GraphemeIterator" { } } { - var iter = graph.reverseIterator(this_str); + var iter = Graphemes.reverseIterator(this_str); // Check. var i: usize = want.items.len; @@ -226,7 +224,7 @@ test "Segmentation GraphemeIterator" { ); return err; }; - var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc); + var before_iter = Graphemes.iterateBeforeGrapheme(this_str, got_gc); if (before_iter.prev()) |prev_gc| { if (iter.peek()) |prev_peek| { std.testing.expectEqualSlices( -- cgit v1.2.3