diff options
| author | 2026-02-04 15:02:12 -0500 | |
|---|---|---|
| committer | 2026-02-04 15:02:12 -0500 | |
| commit | 1a9168ab7d1d5337ec954f7897c2e6b51a0bd95e (patch) | |
| tree | 35847e74f41a0a75870b08d8a5fbbb0f1bd0d378 | |
| parent | Merge pull request 'Move part of the `DisplayWidth.strWidth` into its own `Di... (diff) | |
| download | zg-1a9168ab7d1d5337ec954f7897c2e6b51a0bd95e.tar.gz zg-1a9168ab7d1d5337ec954f7897c2e6b51a0bd95e.tar.xz zg-1a9168ab7d1d5337ec954f7897c2e6b51a0bd95e.zip | |
Convert Graphemes to static allocation
And DisplayWidth, although untested at present.
The plan is to just work through the codegen / module pairings, and
move tests over until everything is covered.
| -rw-r--r-- | build.zig | 7 | ||||
| -rw-r--r-- | codegen/dwp.zig | 24 | ||||
| -rw-r--r-- | codegen/gbp.zig | 36 | ||||
| -rw-r--r-- | src/Graphemes.zig | 155 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 12 |
5 files changed, 117 insertions, 117 deletions
| @@ -52,7 +52,7 @@ pub fn build(b: *std.Build) void { | |||
| 52 | gbp_gen_exe.root_module.addAnonymousImport("GraphemeBreakProperty.txt", .{ .root_source_file = b.path("data/unicode/auxiliary/GraphemeBreakProperty.txt") }); | 52 | gbp_gen_exe.root_module.addAnonymousImport("GraphemeBreakProperty.txt", .{ .root_source_file = b.path("data/unicode/auxiliary/GraphemeBreakProperty.txt") }); |
| 53 | gbp_gen_exe.root_module.addAnonymousImport("emoji-data.txt", .{ .root_source_file = b.path("data/unicode/emoji/emoji-data.txt") }); | 53 | gbp_gen_exe.root_module.addAnonymousImport("emoji-data.txt", .{ .root_source_file = b.path("data/unicode/emoji/emoji-data.txt") }); |
| 54 | const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe); | 54 | const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe); |
| 55 | const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.bin.z"); | 55 | const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.zig"); |
| 56 | 56 | ||
| 57 | const wbp_gen_exe = b.addExecutable(.{ | 57 | const wbp_gen_exe = b.addExecutable(.{ |
| 58 | .name = "wbp", | 58 | .name = "wbp", |
| @@ -78,7 +78,7 @@ pub fn build(b: *std.Build) void { | |||
| 78 | dwp_gen_exe.root_module.addAnonymousImport("DerivedGeneralCategory.txt", .{ .root_source_file = b.path("data/unicode/extracted/DerivedGeneralCategory.txt") }); | 78 | dwp_gen_exe.root_module.addAnonymousImport("DerivedGeneralCategory.txt", .{ .root_source_file = b.path("data/unicode/extracted/DerivedGeneralCategory.txt") }); |
| 79 | dwp_gen_exe.root_module.addOptions("options", dwp_options); | 79 | dwp_gen_exe.root_module.addOptions("options", dwp_options); |
| 80 | const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe); | 80 | const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe); |
| 81 | const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z"); | 81 | const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.zig"); |
| 82 | 82 | ||
| 83 | // Normalization properties | 83 | // Normalization properties |
| 84 | const canon_gen_exe = b.addExecutable(.{ | 84 | const canon_gen_exe = b.addExecutable(.{ |
| @@ -514,6 +514,9 @@ pub fn build(b: *std.Build) void { | |||
| 514 | 514 | ||
| 515 | const run_unicode_tests = b.addRunArtifact(unicode_tests); | 515 | const run_unicode_tests = b.addRunArtifact(unicode_tests); |
| 516 | 516 | ||
| 517 | const test_unicode_step = b.step("unicode", "Rune unicode tests"); | ||
| 518 | test_unicode_step.dependOn(&run_unicode_tests.step); | ||
| 519 | |||
| 517 | const test_step = b.step("test", "Run all module tests"); | 520 | const test_step = b.step("test", "Run all module tests"); |
| 518 | test_step.dependOn(&run_unicode_tests.step); | 521 | test_step.dependOn(&run_unicode_tests.step); |
| 519 | test_step.dependOn(&code_point_tr.step); | 522 | test_step.dependOn(&code_point_tr.step); |
diff --git a/codegen/dwp.zig b/codegen/dwp.zig index 75ac68e..b4d1ed0 100644 --- a/codegen/dwp.zig +++ b/codegen/dwp.zig | |||
| @@ -235,12 +235,24 @@ pub fn main() anyerror!void { | |||
| 235 | defer out_file.close(); | 235 | defer out_file.close(); |
| 236 | var writer = out_file.writer(&write_buf); | 236 | var writer = out_file.writer(&write_buf); |
| 237 | 237 | ||
| 238 | const endian = builtin.cpu.arch.endian(); | 238 | try writer.interface.print( |
| 239 | try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); | 239 | \\//! This file is auto-generated. Do not edit. |
| 240 | for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); | 240 | \\ |
| 241 | 241 | \\pub const s1: [{}]u16 = .{{ | |
| 242 | try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); | 242 | , .{stage1.items.len}); |
| 243 | for (stage2.items) |i| try writer.interface.writeInt(i8, i, endian); | 243 | for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); |
| 244 | |||
| 245 | try writer.interface.print( | ||
| 246 | \\ | ||
| 247 | \\}}; | ||
| 248 | \\ | ||
| 249 | \\pub const s2: [{}]i4 = .{{ | ||
| 250 | , .{stage2.items.len}); | ||
| 251 | for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 252 | |||
| 253 | try writer.interface.writeAll( | ||
| 254 | \\}; | ||
| 255 | ); | ||
| 244 | 256 | ||
| 245 | try writer.interface.flush(); | 257 | try writer.interface.flush(); |
| 246 | } | 258 | } |
diff --git a/codegen/gbp.zig b/codegen/gbp.zig index 1d06e9a..117847f 100644 --- a/codegen/gbp.zig +++ b/codegen/gbp.zig | |||
| @@ -240,16 +240,32 @@ pub fn main() anyerror!void { | |||
| 240 | defer out_file.close(); | 240 | defer out_file.close(); |
| 241 | var writer = out_file.writer(&write_buf); | 241 | var writer = out_file.writer(&write_buf); |
| 242 | 242 | ||
| 243 | const endian = builtin.cpu.arch.endian(); | 243 | try writer.interface.print( |
| 244 | try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); | 244 | \\//! This file is auto-generated. Do not edit. |
| 245 | for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); | 245 | \\ |
| 246 | 246 | \\pub const s1: [{}]u16 = .{{ | |
| 247 | try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); | 247 | , .{stage1.items.len}); |
| 248 | for (stage2.items) |i| try writer.interface.writeInt(u16, i, endian); | 248 | for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); |
| 249 | 249 | ||
| 250 | const props_bytes = stage3.keys(); | 250 | try writer.interface.print( |
| 251 | try writer.interface.writeInt(u16, @intCast(props_bytes.len), endian); | 251 | \\ |
| 252 | try writer.interface.writeAll(props_bytes); | 252 | \\}}; |
| 253 | \\ | ||
| 254 | \\pub const s2: [{}]u7 = .{{ | ||
| 255 | , .{stage2.items.len}); | ||
| 256 | for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 257 | |||
| 258 | const keys = stage3.keys(); | ||
| 259 | |||
| 260 | try writer.interface.print( | ||
| 261 | \\}}; | ||
| 262 | \\ | ||
| 263 | \\pub const s3: [{}]u8 = .{{ | ||
| 264 | , .{keys.len}); | ||
| 265 | for (keys) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 266 | try writer.interface.writeAll( | ||
| 267 | \\}; | ||
| 268 | ); | ||
| 253 | 269 | ||
| 254 | try writer.interface.flush(); | 270 | try writer.interface.flush(); |
| 255 | } | 271 | } |
diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 81d874c..d14b6ab 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig | |||
| @@ -3,70 +3,46 @@ | |||
| 3 | //! Code for handling graphemes: fragments of string which should be | 3 | //! Code for handling graphemes: fragments of string which should be |
| 4 | //! treated as one unit. Like Farmer Bob here: 👨🏻🌾 | 4 | //! treated as one unit. Like Farmer Bob here: 👨🏻🌾 |
| 5 | 5 | ||
| 6 | s1: []u16 = undefined, | ||
| 7 | s2: []u16 = undefined, | ||
| 8 | s3: []u8 = undefined, | ||
| 9 | |||
| 10 | const Graphemes = @This(); | 6 | const Graphemes = @This(); |
| 11 | 7 | ||
| 12 | pub fn init(allocator: Allocator) Allocator.Error!Graphemes { | 8 | const Data = struct { |
| 13 | var graphemes = Graphemes{}; | 9 | s1: []const u16 = undefined, |
| 14 | try graphemes.setup(allocator); | 10 | s2: []const u7 = undefined, |
| 15 | return graphemes; | 11 | s3: []const u8 = undefined, |
| 16 | } | 12 | }; |
| 17 | |||
| 18 | pub fn setup(graphemes: *Graphemes, allocator: Allocator) Allocator.Error!void { | ||
| 19 | const in_bytes = @embedFile("gbp"); | ||
| 20 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 21 | var reader = in_fbs.reader(); | ||
| 22 | |||
| 23 | const endian = builtin.cpu.arch.endian(); | ||
| 24 | |||
| 25 | const s1_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 26 | graphemes.s1 = try allocator.alloc(u16, s1_len); | ||
| 27 | errdefer allocator.free(graphemes.s1); | ||
| 28 | for (0..s1_len) |i| graphemes.s1[i] = reader.readInt(u16, endian) catch unreachable; | ||
| 29 | |||
| 30 | const s2_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 31 | graphemes.s2 = try allocator.alloc(u16, s2_len); | ||
| 32 | errdefer allocator.free(graphemes.s2); | ||
| 33 | for (0..s2_len) |i| graphemes.s2[i] = reader.readInt(u16, endian) catch unreachable; | ||
| 34 | |||
| 35 | const s3_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 36 | graphemes.s3 = try allocator.alloc(u8, s3_len); | ||
| 37 | errdefer allocator.free(graphemes.s3); | ||
| 38 | _ = reader.readAll(graphemes.s3) catch unreachable; | ||
| 39 | } | ||
| 40 | 13 | ||
| 41 | pub fn deinit(graphemes: *const Graphemes, allocator: Allocator) void { | 14 | const graphemes = graphemes: { |
| 42 | allocator.free(graphemes.s1); | 15 | const data = @import("gbp"); |
| 43 | allocator.free(graphemes.s2); | 16 | break :graphemes Data{ |
| 44 | allocator.free(graphemes.s3); | 17 | .s1 = &data.s1, |
| 45 | } | 18 | .s2 = &data.s2, |
| 19 | .s3 = &data.s3, | ||
| 20 | }; | ||
| 21 | }; | ||
| 46 | 22 | ||
| 47 | /// Lookup the grapheme break property for a code point. | 23 | /// Lookup the grapheme break property for a code point. |
| 48 | pub fn gbp(graphemes: Graphemes, cp: u21) Gbp { | 24 | pub fn gbp(cp: u21) Gbp { |
| 49 | return @enumFromInt(graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 4); | 25 | return @enumFromInt(graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 4); |
| 50 | } | 26 | } |
| 51 | 27 | ||
| 52 | /// Lookup the indic syllable type for a code point. | 28 | /// Lookup the indic syllable type for a code point. |
| 53 | pub fn indic(graphemes: Graphemes, cp: u21) Indic { | 29 | pub fn indic(cp: u21) Indic { |
| 54 | return @enumFromInt((graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); | 30 | return @enumFromInt((graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); |
| 55 | } | 31 | } |
| 56 | 32 | ||
| 57 | /// Lookup the emoji property for a code point. | 33 | /// Lookup the emoji property for a code point. |
| 58 | pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { | 34 | pub fn isEmoji(cp: u21) bool { |
| 59 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; | 35 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; |
| 60 | } | 36 | } |
| 61 | 37 | ||
| 62 | /// Returns an iterator over the graphemes in `string`. | 38 | /// Returns an iterator over the graphemes in `string`. |
| 63 | pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { | 39 | pub fn iterator(string: []const u8) Iterator { |
| 64 | return Iterator.init(string, graphemes); | 40 | return Iterator.init(string); |
| 65 | } | 41 | } |
| 66 | 42 | ||
| 67 | /// Returns a reverse iterator over the graphemes in `string`. | 43 | /// Returns a reverse iterator over the graphemes in `string`. |
| 68 | pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { | 44 | pub fn reverseIterator(string: []const u8) ReverseIterator { |
| 69 | return ReverseIterator.init(string, graphemes); | 45 | return ReverseIterator.init(string); |
| 70 | } | 46 | } |
| 71 | 47 | ||
| 72 | /// Indic syllable type. | 48 | /// Indic syllable type. |
| @@ -81,6 +57,7 @@ pub const Indic = enum { | |||
| 81 | /// Grapheme break property. | 57 | /// Grapheme break property. |
| 82 | pub const Gbp = enum { | 58 | pub const Gbp = enum { |
| 83 | none, | 59 | none, |
| 60 | |||
| 84 | Control, | 61 | Control, |
| 85 | CR, | 62 | CR, |
| 86 | Extend, | 63 | Extend, |
| @@ -117,7 +94,7 @@ pub const Grapheme = struct { | |||
| 117 | /// Returns the `Grapheme` at `string[index]`, which does not have to be a | 94 | /// Returns the `Grapheme` at `string[index]`, which does not have to be a |
| 118 | /// valid start of a codepoint. Asserts the string is not empty. Index must be | 95 | /// valid start of a codepoint. Asserts the string is not empty. Index must be |
| 119 | /// less than `string.len`. Always returns a `Grapheme`. | 96 | /// less than `string.len`. Always returns a `Grapheme`. |
| 120 | pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme { | 97 | pub fn graphemeAtIndex(string: []const u8, index: usize) Grapheme { |
| 121 | assert(string.len != 0); | 98 | assert(string.len != 0); |
| 122 | if (index == 0 or (index > 0 and | 99 | if (index == 0 or (index > 0 and |
| 123 | string[index] < 0x80 and | 100 | string[index] < 0x80 and |
| @@ -125,7 +102,7 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u | |||
| 125 | (string[index - 1] != '\r' and string[index] != '\n')) | 102 | (string[index - 1] != '\r' and string[index] != '\n')) |
| 126 | { | 103 | { |
| 127 | // There's always a grapheme break between two ASCII code points (except CRLF) | 104 | // There's always a grapheme break between two ASCII code points (except CRLF) |
| 128 | var iter = graphemes.iterator(string[index..]); | 105 | var iter = Graphemes.iterator(string[index..]); |
| 129 | const next = iter.next().?; | 106 | const next = iter.next().?; |
| 130 | return Grapheme{ | 107 | return Grapheme{ |
| 131 | .len = next.len, | 108 | .len = next.len, |
| @@ -134,14 +111,14 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u | |||
| 134 | } // Otherwise it gets hairy. | 111 | } // Otherwise it gets hairy. |
| 135 | const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; | 112 | const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; |
| 136 | if (idx == string.len) { | 113 | if (idx == string.len) { |
| 137 | var iter = graphemes.reverseIterator(string); | 114 | var iter = Graphemes.reverseIterator(string); |
| 138 | return iter.prev().?; | 115 | return iter.prev().?; |
| 139 | } | 116 | } |
| 140 | // We're on a valid codepoint boundary, we go back from here | 117 | // We're on a valid codepoint boundary, we go back from here |
| 141 | var r_iter = graphemes.reverseIterAtIndex(string, idx); | 118 | var r_iter = Graphemes.reverseIterAtIndex(string, idx); |
| 142 | if (r_iter.prev()) |g| { | 119 | if (r_iter.prev()) |g| { |
| 143 | if (g.offset == 0) { | 120 | if (g.offset == 0) { |
| 144 | var iter = graphemes.iterator(string); | 121 | var iter = Graphemes.iterator(string); |
| 145 | while (iter.next()) |g2| { | 122 | while (iter.next()) |g2| { |
| 146 | if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; | 123 | if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; |
| 147 | } | 124 | } |
| @@ -151,7 +128,7 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u | |||
| 151 | // we in fact need to be. | 128 | // we in fact need to be. |
| 152 | _ = r_iter.prev(); | 129 | _ = r_iter.prev(); |
| 153 | while (r_iter.pending != .none) : (_ = r_iter.prev()) {} | 130 | while (r_iter.pending != .none) : (_ = r_iter.prev()) {} |
| 154 | var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); | 131 | var iter = Graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); |
| 155 | while (iter.next()) |g| { | 132 | while (iter.next()) |g| { |
| 156 | if (g.offset <= idx and idx < g.offset + g.len) return g; | 133 | if (g.offset <= idx and idx < g.offset + g.len) return g; |
| 157 | } | 134 | } |
| @@ -159,23 +136,22 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u | |||
| 159 | } | 136 | } |
| 160 | 137 | ||
| 161 | /// Return a (forward) iterator of `string` after `grapheme`. | 138 | /// Return a (forward) iterator of `string` after `grapheme`. |
| 162 | pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator { | 139 | pub fn iterateAfterGrapheme(string: []const u8, grapheme: Grapheme) Iterator { |
| 163 | return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); | 140 | return Graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); |
| 164 | } | 141 | } |
| 165 | 142 | ||
| 166 | /// Return a reverse iterator of `string` before `grapheme`. | 143 | /// Return a reverse iterator of `string` before `grapheme`. |
| 167 | pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator { | 144 | pub fn iterateBeforeGrapheme(string: []const u8, grapheme: Grapheme) ReverseIterator { |
| 168 | // This bit of weirdness is because reverse iterators are "advance last", | 145 | // This bit of weirdness is because reverse iterators are "advance last", |
| 169 | // while forward iterators are "advance first". This leaves some room for | 146 | // while forward iterators are "advance first". This leaves some room for |
| 170 | // further optimization, if anyone dares. | 147 | // further optimization, if anyone dares. |
| 171 | var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); | 148 | var r_iter = Graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); |
| 172 | _ = r_iter.prev(); | 149 | _ = r_iter.prev(); |
| 173 | return r_iter; | 150 | return r_iter; |
| 174 | } | 151 | } |
| 175 | 152 | ||
| 176 | fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator { | 153 | fn reverseIterAtIndex(string: []const u8, idx: uoffset) ReverseIterator { |
| 177 | var r_iter: ReverseIterator = undefined; | 154 | var r_iter: ReverseIterator = undefined; |
| 178 | r_iter.data = graphemes; | ||
| 179 | var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; | 155 | var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; |
| 180 | r_iter.buf[1] = rcp_iter.prev(); | 156 | r_iter.buf[1] = rcp_iter.prev(); |
| 181 | r_iter.buf[0] = rcp_iter.prev(); | 157 | r_iter.buf[0] = rcp_iter.prev(); |
| @@ -184,9 +160,8 @@ fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoff | |||
| 184 | return r_iter; | 160 | return r_iter; |
| 185 | } | 161 | } |
| 186 | 162 | ||
| 187 | fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator { | 163 | fn iterAtIndex(string: []const u8, idx: uoffset) Iterator { |
| 188 | var iter: Iterator = undefined; | 164 | var iter: Iterator = undefined; |
| 189 | iter.data = graphemes; | ||
| 190 | iter.buf[0] = first: { | 165 | iter.buf[0] = first: { |
| 191 | if (idx == string.len) break :first null; | 166 | if (idx == string.len) break :first null; |
| 192 | var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; | 167 | var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; |
| @@ -202,13 +177,12 @@ fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) It | |||
| 202 | pub const Iterator = struct { | 177 | pub const Iterator = struct { |
| 203 | buf: [2]?CodePoint = .{ null, null }, | 178 | buf: [2]?CodePoint = .{ null, null }, |
| 204 | cp_iter: CodePointIterator, | 179 | cp_iter: CodePointIterator, |
| 205 | data: *const Graphemes, | ||
| 206 | 180 | ||
| 207 | const Self = @This(); | 181 | const Self = @This(); |
| 208 | 182 | ||
| 209 | /// Assumes `str` is valid UTF-8. | 183 | /// Assumes `str` is valid UTF-8. |
| 210 | pub fn init(str: []const u8, data: *const Graphemes) Self { | 184 | pub fn init(str: []const u8) Self { |
| 211 | var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; | 185 | var self = Self{ .cp_iter = .{ .bytes = str } }; |
| 212 | self.advance(); | 186 | self.advance(); |
| 213 | return self; | 187 | return self; |
| 214 | } | 188 | } |
| @@ -237,7 +211,6 @@ pub const Iterator = struct { | |||
| 237 | if (graphemeBreak( | 211 | if (graphemeBreak( |
| 238 | self.buf[0].?.code, | 212 | self.buf[0].?.code, |
| 239 | self.buf[1].?.code, | 213 | self.buf[1].?.code, |
| 240 | self.data, | ||
| 241 | &state, | 214 | &state, |
| 242 | )) return Grapheme{ .len = gc_len, .offset = gc_start }; | 215 | )) return Grapheme{ .len = gc_len, .offset = gc_start }; |
| 243 | 216 | ||
| @@ -250,7 +223,6 @@ pub const Iterator = struct { | |||
| 250 | if (graphemeBreak( | 223 | if (graphemeBreak( |
| 251 | self.buf[0].?.code, | 224 | self.buf[0].?.code, |
| 252 | if (self.buf[1]) |ncp| ncp.code else 0, | 225 | if (self.buf[1]) |ncp| ncp.code else 0, |
| 253 | self.data, | ||
| 254 | &state, | 226 | &state, |
| 255 | )) break; | 227 | )) break; |
| 256 | } | 228 | } |
| @@ -275,7 +247,6 @@ pub const Iterator = struct { | |||
| 275 | pub const ReverseIterator = struct { | 247 | pub const ReverseIterator = struct { |
| 276 | buf: [2]?CodePoint = .{ null, null }, | 248 | buf: [2]?CodePoint = .{ null, null }, |
| 277 | cp_iter: CodePointReverseIterator, | 249 | cp_iter: CodePointReverseIterator, |
| 278 | data: *const Graphemes, | ||
| 279 | /// Codepoint read from `cp_iter` but not returned by `prev` | 250 | /// Codepoint read from `cp_iter` but not returned by `prev` |
| 280 | pending: Pending = .none, | 251 | pending: Pending = .none, |
| 281 | 252 | ||
| @@ -289,8 +260,8 @@ pub const ReverseIterator = struct { | |||
| 289 | 260 | ||
| 290 | const Self = @This(); | 261 | const Self = @This(); |
| 291 | 262 | ||
| 292 | pub fn init(str: []const u8, data: *const Graphemes) Self { | 263 | pub fn init(str: []const u8) Self { |
| 293 | var self: Self = .{ .cp_iter = .init(str), .data = data }; | 264 | var self: Self = .{ .cp_iter = .init(str) }; |
| 294 | self.advance(); | 265 | self.advance(); |
| 295 | self.advance(); | 266 | self.advance(); |
| 296 | return self; | 267 | return self; |
| @@ -352,7 +323,6 @@ pub const ReverseIterator = struct { | |||
| 352 | if (graphemeBreak( | 323 | if (graphemeBreak( |
| 353 | self.buf[0].?.code, | 324 | self.buf[0].?.code, |
| 354 | self.buf[1].?.code, | 325 | self.buf[1].?.code, |
| 355 | self.data, | ||
| 356 | &state, | 326 | &state, |
| 357 | )) break; | 327 | )) break; |
| 358 | 328 | ||
| @@ -374,7 +344,7 @@ pub const ReverseIterator = struct { | |||
| 374 | 344 | ||
| 375 | const codepoint = self.buf[0].?; | 345 | const codepoint = self.buf[0].?; |
| 376 | 346 | ||
| 377 | switch (self.data.indic(codepoint.code)) { | 347 | switch (Graphemes.indic(codepoint.code)) { |
| 378 | .Extend, .Linker => { | 348 | .Extend, .Linker => { |
| 379 | self.advance(); | 349 | self.advance(); |
| 380 | continue :indic; | 350 | continue :indic; |
| @@ -387,7 +357,7 @@ pub const ReverseIterator = struct { | |||
| 387 | if (self.buf[0]) |cp1| { | 357 | if (self.buf[0]) |cp1| { |
| 388 | state.indic = true; | 358 | state.indic = true; |
| 389 | 359 | ||
| 390 | if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; | 360 | if (graphemeBreak(cp1.code, self.buf[1].?.code, &state)) break; |
| 391 | 361 | ||
| 392 | if (!state.indic) { | 362 | if (!state.indic) { |
| 393 | continue :indic; | 363 | continue :indic; |
| @@ -426,12 +396,12 @@ pub const ReverseIterator = struct { | |||
| 426 | 396 | ||
| 427 | const codepoint = self.buf[0].?; | 397 | const codepoint = self.buf[0].?; |
| 428 | 398 | ||
| 429 | if (self.data.gbp(codepoint.code) == .Extend) { | 399 | if (Graphemes.gbp(codepoint.code) == .Extend) { |
| 430 | self.advance(); | 400 | self.advance(); |
| 431 | continue :emoji; | 401 | continue :emoji; |
| 432 | } | 402 | } |
| 433 | 403 | ||
| 434 | if (self.data.isEmoji(codepoint.code)) { | 404 | if (Graphemes.isEmoji(codepoint.code)) { |
| 435 | // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* | 405 | // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* |
| 436 | emoji_offset = codepoint.offset; | 406 | emoji_offset = codepoint.offset; |
| 437 | self.advance(); | 407 | self.advance(); |
| @@ -462,7 +432,7 @@ pub const ReverseIterator = struct { | |||
| 462 | if (state.regional) { | 432 | if (state.regional) { |
| 463 | var ri_count: usize = 0; | 433 | var ri_count: usize = 0; |
| 464 | while (self.buf[0] != null and | 434 | while (self.buf[0] != null and |
| 465 | self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) | 435 | Graphemes.gbp(self.buf[0].?.code) == .Regional_Indicator) |
| 466 | { | 436 | { |
| 467 | ri_count += 1; | 437 | ri_count += 1; |
| 468 | self.advance(); | 438 | self.advance(); |
| @@ -500,10 +470,13 @@ pub const IterState = packed struct(u3) { | |||
| 500 | indic: bool = false, | 470 | indic: bool = false, |
| 501 | }; | 471 | }; |
| 502 | 472 | ||
| 473 | // TODO: isBreaker is also expensive given the data is already available, | ||
| 474 | // and should be "semantically inlined" wherever it belongs. | ||
| 475 | |||
| 503 | // Predicates | 476 | // Predicates |
| 504 | fn isBreaker(cp: u21, data: *const Graphemes) bool { | 477 | fn isBreaker(cp: u21) bool { |
| 505 | // Extract relevant properties. | 478 | // Extract relevant properties. |
| 506 | const cp_gbp_prop = data.gbp(cp); | 479 | const cp_gbp_prop = Graphemes.gbp(cp); |
| 507 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | 480 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; |
| 508 | } | 481 | } |
| 509 | 482 | ||
| @@ -516,17 +489,20 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool { | |||
| 516 | pub fn graphemeBreak( | 489 | pub fn graphemeBreak( |
| 517 | cp1: u21, | 490 | cp1: u21, |
| 518 | cp2: u21, | 491 | cp2: u21, |
| 519 | data: *const Graphemes, | ||
| 520 | state: *IterState, | 492 | state: *IterState, |
| 521 | ) bool { | 493 | ) bool { |
| 494 | // TODO: it's silly to index the same field three times and | ||
| 495 | // just extract different bits from the data. Optimizable? Maybe | |||
| 496 | // but it's silly to rely on that. | ||
| 497 | // | ||
| 522 | // Extract relevant properties. | 498 | // Extract relevant properties. |
| 523 | const cp1_gbp_prop = data.gbp(cp1); | 499 | const cp1_gbp_prop = Graphemes.gbp(cp1); |
| 524 | const cp1_indic_prop = data.indic(cp1); | 500 | const cp1_indic_prop = Graphemes.indic(cp1); |
| 525 | const cp1_is_emoji = data.isEmoji(cp1); | 501 | const cp1_is_emoji = Graphemes.isEmoji(cp1); |
| 526 | 502 | ||
| 527 | const cp2_gbp_prop = data.gbp(cp2); | 503 | const cp2_gbp_prop = Graphemes.gbp(cp2); |
| 528 | const cp2_indic_prop = data.indic(cp2); | 504 | const cp2_indic_prop = Graphemes.indic(cp2); |
| 529 | const cp2_is_emoji = data.isEmoji(cp2); | 505 | const cp2_is_emoji = Graphemes.isEmoji(cp2); |
| 530 | 506 | ||
| 531 | // GB11: Emoji Extend* ZWJ x Emoji | 507 | // GB11: Emoji Extend* ZWJ x Emoji |
| 532 | if (!state.xpic and cp1_is_emoji) state.xpic = true; | 508 | if (!state.xpic and cp1_is_emoji) state.xpic = true; |
| @@ -537,7 +513,7 @@ pub fn graphemeBreak( | |||
| 537 | if (cp1 == '\r' and cp2 == '\n') return false; | 513 | if (cp1 == '\r' and cp2 == '\n') return false; |
| 538 | 514 | ||
| 539 | // GB4: Control | 515 | // GB4: Control |
| 540 | if (isBreaker(cp1, data)) return true; | 516 | if (isBreaker(cp1)) return true; |
| 541 | 517 | ||
| 542 | // GB11: Emoji Extend* ZWJ x Emoji | 518 | // GB11: Emoji Extend* ZWJ x Emoji |
| 543 | if (state.xpic and | 519 | if (state.xpic and |
| @@ -555,7 +531,7 @@ pub fn graphemeBreak( | |||
| 555 | if (cp2_gbp_prop == .SpacingMark) return false; | 531 | if (cp2_gbp_prop == .SpacingMark) return false; |
| 556 | 532 | ||
| 557 | // GB9b: Prepend x | 533 | // GB9b: Prepend x |
| 558 | if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false; | 534 | if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false; |
| 559 | 535 | ||
| 560 | // GB12, GB13: RI x RI | 536 | // GB12, GB13: RI x RI |
| 561 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { | 537 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { |
| @@ -620,25 +596,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 620 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | 596 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; |
| 621 | const no_joiner = seq_1 ++ seq_2; | 597 | const no_joiner = seq_1 ++ seq_2; |
| 622 | 598 | ||
| 623 | const graphemes = try Graphemes.init(std.testing.allocator); | ||
| 624 | defer graphemes.deinit(std.testing.allocator); | ||
| 625 | |||
| 626 | { | 599 | { |
| 627 | var iter = graphemes.iterator(with_zwj); | 600 | var iter = Graphemes.iterator(with_zwj); |
| 628 | var i: usize = 0; | 601 | var i: usize = 0; |
| 629 | while (iter.next()) |_| : (i += 1) {} | 602 | while (iter.next()) |_| : (i += 1) {} |
| 630 | try std.testing.expectEqual(@as(usize, 1), i); | 603 | try std.testing.expectEqual(@as(usize, 1), i); |
| 631 | } | 604 | } |
| 632 | 605 | ||
| 633 | { | 606 | { |
| 634 | var iter = graphemes.iterator(with_zwsp); | 607 | var iter = Graphemes.iterator(with_zwsp); |
| 635 | var i: usize = 0; | 608 | var i: usize = 0; |
| 636 | while (iter.next()) |_| : (i += 1) {} | 609 | while (iter.next()) |_| : (i += 1) {} |
| 637 | try std.testing.expectEqual(@as(usize, 3), i); | 610 | try std.testing.expectEqual(@as(usize, 3), i); |
| 638 | } | 611 | } |
| 639 | 612 | ||
| 640 | { | 613 | { |
| 641 | var iter = graphemes.iterator(no_joiner); | 614 | var iter = Graphemes.iterator(no_joiner); |
| 642 | var i: usize = 0; | 615 | var i: usize = 0; |
| 643 | while (iter.next()) |_| : (i += 1) {} | 616 | while (iter.next()) |_| : (i += 1) {} |
| 644 | try std.testing.expectEqual(@as(usize, 2), i); | 617 | try std.testing.expectEqual(@as(usize, 2), i); |
| @@ -647,10 +620,8 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 647 | 620 | ||
| 648 | test "Iterator.peek" { | 621 | test "Iterator.peek" { |
| 649 | const peek_seq = "aΔ👨🏻🌾→"; | 622 | const peek_seq = "aΔ👨🏻🌾→"; |
| 650 | const data = try Graphemes.init(std.testing.allocator); | ||
| 651 | defer data.deinit(std.testing.allocator); | ||
| 652 | 623 | ||
| 653 | var iter = data.iterator(peek_seq); | 624 | var iter = Graphemes.iterator(peek_seq); |
| 654 | const peek_a = iter.peek().?; | 625 | const peek_a = iter.peek().?; |
| 655 | const next_a = iter.next().?; | 626 | const next_a = iter.next().?; |
| 656 | try std.testing.expectEqual(peek_a, next_a); | 627 | try std.testing.expectEqual(peek_a, next_a); |
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index e2a5a96..946c197 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -118,8 +118,6 @@ test "Segmentation GraphemeIterator" { | |||
| 118 | const allocator = std.testing.allocator; | 118 | const allocator = std.testing.allocator; |
| 119 | 119 | ||
| 120 | var reader = std.io.Reader.fixed(@embedFile("GraphemeBreakTest.txt")); | 120 | var reader = std.io.Reader.fixed(@embedFile("GraphemeBreakTest.txt")); |
| 121 | const graph = try Graphemes.init(allocator); | ||
| 122 | defer graph.deinit(allocator); | ||
| 123 | 121 | ||
| 124 | var line_iter: IterRead = .{ .read = &reader }; | 122 | var line_iter: IterRead = .{ .read = &reader }; |
| 125 | 123 | ||
| @@ -161,7 +159,7 @@ test "Segmentation GraphemeIterator" { | |||
| 161 | const this_str = all_bytes.items; | 159 | const this_str = all_bytes.items; |
| 162 | 160 | ||
| 163 | { | 161 | { |
| 164 | var iter = graph.iterator(this_str); | 162 | var iter = Graphemes.iterator(this_str); |
| 165 | 163 | ||
| 166 | // Check. | 164 | // Check. |
| 167 | for (want.items, 1..) |want_gc, idx| { | 165 | for (want.items, 1..) |want_gc, idx| { |
| @@ -171,7 +169,7 @@ test "Segmentation GraphemeIterator" { | |||
| 171 | got_gc.bytes(this_str), | 169 | got_gc.bytes(this_str), |
| 172 | ); | 170 | ); |
| 173 | for (got_gc.offset..got_gc.offset + got_gc.len) |i| { | 171 | for (got_gc.offset..got_gc.offset + got_gc.len) |i| { |
| 174 | const this_gc = graph.graphemeAtIndex(this_str, i); | 172 | const this_gc = Graphemes.graphemeAtIndex(this_str, i); |
| 175 | std.testing.expectEqualSlices( | 173 | std.testing.expectEqualSlices( |
| 176 | u8, | 174 | u8, |
| 177 | got_gc.bytes(this_str), | 175 | got_gc.bytes(this_str), |
| @@ -181,7 +179,7 @@ test "Segmentation GraphemeIterator" { | |||
| 181 | return err; | 179 | return err; |
| 182 | }; | 180 | }; |
| 183 | } | 181 | } |
| 184 | var after_iter = graph.iterateAfterGrapheme(this_str, got_gc); | 182 | var after_iter = Graphemes.iterateAfterGrapheme(this_str, got_gc); |
| 185 | if (after_iter.next()) |next_gc| { | 183 | if (after_iter.next()) |next_gc| { |
| 186 | if (iter.peek()) |next_peek| { | 184 | if (iter.peek()) |next_peek| { |
| 187 | std.testing.expectEqualSlices( | 185 | std.testing.expectEqualSlices( |
| @@ -202,7 +200,7 @@ test "Segmentation GraphemeIterator" { | |||
| 202 | } | 200 | } |
| 203 | } | 201 | } |
| 204 | { | 202 | { |
| 205 | var iter = graph.reverseIterator(this_str); | 203 | var iter = Graphemes.reverseIterator(this_str); |
| 206 | 204 | ||
| 207 | // Check. | 205 | // Check. |
| 208 | var i: usize = want.items.len; | 206 | var i: usize = want.items.len; |
| @@ -226,7 +224,7 @@ test "Segmentation GraphemeIterator" { | |||
| 226 | ); | 224 | ); |
| 227 | return err; | 225 | return err; |
| 228 | }; | 226 | }; |
| 229 | var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc); | 227 | var before_iter = Graphemes.iterateBeforeGrapheme(this_str, got_gc); |
| 230 | if (before_iter.prev()) |prev_gc| { | 228 | if (before_iter.prev()) |prev_gc| { |
| 231 | if (iter.peek()) |prev_peek| { | 229 | if (iter.peek()) |prev_peek| { |
| 232 | std.testing.expectEqualSlices( | 230 | std.testing.expectEqualSlices( |