diff options
| author | 2025-04-30 11:58:19 -0400 | |
|---|---|---|
| committer | 2025-04-30 11:58:19 -0400 | |
| commit | 1be5e46490e061761b4b97dff5c6acb2181d6fe9 (patch) | |
| tree | 77a1edcdedd7afae7428e92feba37d2bb1035b22 /src/grapheme.zig | |
| parent | Add general tests step (diff) | |
| download | zg-1be5e46490e061761b4b97dff5c6acb2181d6fe9.tar.gz zg-1be5e46490e061761b4b97dff5c6acb2181d6fe9.tar.xz zg-1be5e46490e061761b4b97dff5c6acb2181d6fe9.zip | |
Factor out 'Data' for grapheme and DisplayWidth
This is part of an ongoing refactor of the whole library, so that it no longer
exposes anything called "Data" separately from user functionality.
Diffstat (limited to 'src/grapheme.zig')
| -rw-r--r-- | src/grapheme.zig | 109 |
1 files changed, 99 insertions, 10 deletions
diff --git a/src/grapheme.zig b/src/grapheme.zig index 25fd71d..79cd2c6 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig | |||
| @@ -1,10 +1,99 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const builtin = @import("builtin"); | ||
| 2 | const mem = std.mem; | 3 | const mem = std.mem; |
| 4 | const Allocator = mem.Allocator; | ||
| 5 | const compress = std.compress; | ||
| 3 | const unicode = std.unicode; | 6 | const unicode = std.unicode; |
| 4 | 7 | ||
| 5 | const CodePoint = @import("code_point").CodePoint; | 8 | const CodePoint = @import("code_point").CodePoint; |
| 6 | const CodePointIterator = @import("code_point").Iterator; | 9 | const CodePointIterator = @import("code_point").Iterator; |
| 7 | pub const GraphemeData = @import("GraphemeData"); | 10 | |
| 11 | s1: []u16 = undefined, | ||
| 12 | s2: []u16 = undefined, | ||
| 13 | s3: []u8 = undefined, | ||
| 14 | |||
| 15 | const Graphemes = @This(); | ||
| 16 | |||
| 17 | pub inline fn init(allocator: mem.Allocator) mem.Allocator.Error!Graphemes { | ||
| 18 | const decompressor = compress.flate.inflate.decompressor; | ||
| 19 | const in_bytes = @embedFile("gbp"); | ||
| 20 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 21 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 22 | var reader = in_decomp.reader(); | ||
| 23 | |||
| 24 | const endian = builtin.cpu.arch.endian(); | ||
| 25 | |||
| 26 | var self = Graphemes{}; | ||
| 27 | |||
| 28 | const s1_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 29 | self.s1 = try allocator.alloc(u16, s1_len); | ||
| 30 | errdefer allocator.free(self.s1); | ||
| 31 | for (0..s1_len) |i| self.s1[i] = reader.readInt(u16, endian) catch unreachable; | ||
| 32 | |||
| 33 | const s2_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 34 | self.s2 = try allocator.alloc(u16, s2_len); | ||
| 35 | errdefer allocator.free(self.s2); | ||
| 36 | for (0..s2_len) |i| self.s2[i] = reader.readInt(u16, endian) catch unreachable; | ||
| 37 | |||
| 38 | const s3_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 39 | self.s3 = try allocator.alloc(u8, s3_len); | ||
| 40 | errdefer allocator.free(self.s3); | ||
| 41 | _ = reader.readAll(self.s3) catch unreachable; | ||
| 42 | |||
| 43 | return self; | ||
| 44 | } | ||
| 45 | |||
| 46 | pub fn deinit(graphemes: *const Graphemes, allocator: mem.Allocator) void { | ||
| 47 | allocator.free(graphemes.s1); | ||
| 48 | allocator.free(graphemes.s2); | ||
| 49 | allocator.free(graphemes.s3); | ||
| 50 | } | ||
| 51 | |||
| 52 | /// Lookup the grapheme break property for a code point. | ||
| 53 | pub fn gbp(graphemes: Graphemes, cp: u21) Gbp { | ||
| 54 | return @enumFromInt(graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 4); | ||
| 55 | } | ||
| 56 | |||
| 57 | /// Lookup the indic syllable type for a code point. | ||
| 58 | pub fn indic(graphemes: Graphemes, cp: u21) Indic { | ||
| 59 | return @enumFromInt((graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); | ||
| 60 | } | ||
| 61 | |||
| 62 | /// Lookup the emoji property for a code point. | ||
| 63 | pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { | ||
| 64 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; | ||
| 65 | } | ||
| 66 | |||
| 67 | pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { | ||
| 68 | return Iterator.init(string, graphemes); | ||
| 69 | } | ||
| 70 | |||
| 71 | /// Indic syllable type. | ||
| 72 | pub const Indic = enum { | ||
| 73 | none, | ||
| 74 | |||
| 75 | Consonant, | ||
| 76 | Extend, | ||
| 77 | Linker, | ||
| 78 | }; | ||
| 79 | |||
| 80 | /// Grapheme break property. | ||
| 81 | pub const Gbp = enum { | ||
| 82 | none, | ||
| 83 | Control, | ||
| 84 | CR, | ||
| 85 | Extend, | ||
| 86 | L, | ||
| 87 | LF, | ||
| 88 | LV, | ||
| 89 | LVT, | ||
| 90 | Prepend, | ||
| 91 | Regional_Indicator, | ||
| 92 | SpacingMark, | ||
| 93 | T, | ||
| 94 | V, | ||
| 95 | ZWJ, | ||
| 96 | }; | ||
| 8 | 97 | ||
| 9 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | 98 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. |
| 10 | pub const Grapheme = struct { | 99 | pub const Grapheme = struct { |
| @@ -22,12 +111,12 @@ pub const Grapheme = struct { | |||
| 22 | pub const Iterator = struct { | 111 | pub const Iterator = struct { |
| 23 | buf: [2]?CodePoint = .{ null, null }, | 112 | buf: [2]?CodePoint = .{ null, null }, |
| 24 | cp_iter: CodePointIterator, | 113 | cp_iter: CodePointIterator, |
| 25 | data: *const GraphemeData, | 114 | data: *const Graphemes, |
| 26 | 115 | ||
| 27 | const Self = @This(); | 116 | const Self = @This(); |
| 28 | 117 | ||
| 29 | /// Assumes `src` is valid UTF-8. | 118 | /// Assumes `src` is valid UTF-8. |
| 30 | pub fn init(str: []const u8, data: *const GraphemeData) Self { | 119 | pub fn init(str: []const u8, data: *const Graphemes) Self { |
| 31 | var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; | 120 | var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; |
| 32 | self.advance(); | 121 | self.advance(); |
| 33 | return self; | 122 | return self; |
| @@ -149,7 +238,7 @@ pub const Iterator = struct { | |||
| 149 | }; | 238 | }; |
| 150 | 239 | ||
| 151 | // Predicates | 240 | // Predicates |
| 152 | fn isBreaker(cp: u21, data: *const GraphemeData) bool { | 241 | fn isBreaker(cp: u21, data: *const Graphemes) bool { |
| 153 | // Extract relevant properties. | 242 | // Extract relevant properties. |
| 154 | const cp_gbp_prop = data.gbp(cp); | 243 | const cp_gbp_prop = data.gbp(cp); |
| 155 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | 244 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; |
| @@ -202,7 +291,7 @@ pub const State = struct { | |||
| 202 | pub fn graphemeBreak( | 291 | pub fn graphemeBreak( |
| 203 | cp1: u21, | 292 | cp1: u21, |
| 204 | cp2: u21, | 293 | cp2: u21, |
| 205 | data: *const GraphemeData, | 294 | data: *const Graphemes, |
| 206 | state: *State, | 295 | state: *State, |
| 207 | ) bool { | 296 | ) bool { |
| 208 | // Extract relevant properties. | 297 | // Extract relevant properties. |
| @@ -306,25 +395,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 306 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | 395 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; |
| 307 | const no_joiner = seq_1 ++ seq_2; | 396 | const no_joiner = seq_1 ++ seq_2; |
| 308 | 397 | ||
| 309 | const data = try GraphemeData.init(std.testing.allocator); | 398 | const graphemes = try Graphemes.init(std.testing.allocator); |
| 310 | defer data.deinit(std.testing.allocator); | 399 | defer graphemes.deinit(std.testing.allocator); |
| 311 | 400 | ||
| 312 | { | 401 | { |
| 313 | var iter = Iterator.init(with_zwj, &data); | 402 | var iter = graphemes.iterator(with_zwj); |
| 314 | var i: usize = 0; | 403 | var i: usize = 0; |
| 315 | while (iter.next()) |_| : (i += 1) {} | 404 | while (iter.next()) |_| : (i += 1) {} |
| 316 | try std.testing.expectEqual(@as(usize, 1), i); | 405 | try std.testing.expectEqual(@as(usize, 1), i); |
| 317 | } | 406 | } |
| 318 | 407 | ||
| 319 | { | 408 | { |
| 320 | var iter = Iterator.init(with_zwsp, &data); | 409 | var iter = graphemes.iterator(with_zwsp); |
| 321 | var i: usize = 0; | 410 | var i: usize = 0; |
| 322 | while (iter.next()) |_| : (i += 1) {} | 411 | while (iter.next()) |_| : (i += 1) {} |
| 323 | try std.testing.expectEqual(@as(usize, 3), i); | 412 | try std.testing.expectEqual(@as(usize, 3), i); |
| 324 | } | 413 | } |
| 325 | 414 | ||
| 326 | { | 415 | { |
| 327 | var iter = Iterator.init(no_joiner, &data); | 416 | var iter = graphemes.iterator(no_joiner); |
| 328 | var i: usize = 0; | 417 | var i: usize = 0; |
| 329 | while (iter.next()) |_| : (i += 1) {} | 418 | while (iter.next()) |_| : (i += 1) {} |
| 330 | try std.testing.expectEqual(@as(usize, 2), i); | 419 | try std.testing.expectEqual(@as(usize, 2), i); |