diff options
Diffstat (limited to 'src/grapheme.zig')
| -rw-r--r-- | src/grapheme.zig | 109 |
1 files changed, 99 insertions, 10 deletions
diff --git a/src/grapheme.zig b/src/grapheme.zig index 25fd71d..79cd2c6 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig | |||
| @@ -1,10 +1,99 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const builtin = @import("builtin"); | ||
| 2 | const mem = std.mem; | 3 | const mem = std.mem; |
| 4 | const Allocator = mem.Allocator; | ||
| 5 | const compress = std.compress; | ||
| 3 | const unicode = std.unicode; | 6 | const unicode = std.unicode; |
| 4 | 7 | ||
| 5 | const CodePoint = @import("code_point").CodePoint; | 8 | const CodePoint = @import("code_point").CodePoint; |
| 6 | const CodePointIterator = @import("code_point").Iterator; | 9 | const CodePointIterator = @import("code_point").Iterator; |
| 7 | pub const GraphemeData = @import("GraphemeData"); | 10 | |
/// Stage-1 table. Indexed by `cp >> 8`; each entry is a base offset that is
/// added to `cp & 0xff` to form an index into `s2`.
s1: []u16 = undefined,
/// Stage-2 table. Each entry is an index into `s3`.
s2: []u16 = undefined,
/// Stage-3 table of packed property bytes: the high nibble is decoded by
/// `gbp`, bits 1..3 by `indic`, and bit 0 by `isEmoji`.
s3: []u8 = undefined,

/// This file is itself the `Graphemes` struct; the fields above are its state.
const Graphemes = @This();
| 16 | |||
/// Build the three lookup tables by inflating the embedded "gbp" blob.
///
/// The decompressed stream is consumed as three length-prefixed sections:
/// a `u16` count followed by that many `u16` entries for `s1`, the same
/// layout for `s2`, and a `u16` count followed by that many bytes for `s3`.
///
/// Only allocation failure is reported to the caller. Every read from the
/// embedded stream is treated as infallible (`catch unreachable`). The
/// returned tables are allocated with `allocator` and are released by
/// `deinit`.
pub inline fn init(allocator: mem.Allocator) mem.Allocator.Error!Graphemes {
    // NOTE(review): integers are read in the target CPU's native byte order,
    // which presumes the embedded blob was written in that same order.
    // Confirm against the generator when cross-compiling.
    const native_endian = builtin.cpu.arch.endian();

    var blob_stream = std.io.fixedBufferStream(@embedFile("gbp"));
    var inflater = compress.flate.inflate.decompressor(.raw, blob_stream.reader());
    var reader = inflater.reader();

    var self: Graphemes = .{};

    // Section 1: u16 entry count, then the stage-1 entries.
    const s1_len: u16 = reader.readInt(u16, native_endian) catch unreachable;
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (self.s1) |*entry| entry.* = reader.readInt(u16, native_endian) catch unreachable;

    // Section 2: u16 entry count, then the stage-2 entries.
    const s2_len: u16 = reader.readInt(u16, native_endian) catch unreachable;
    self.s2 = try allocator.alloc(u16, s2_len);
    errdefer allocator.free(self.s2);
    for (self.s2) |*entry| entry.* = reader.readInt(u16, native_endian) catch unreachable;

    // Section 3: u16 byte count, then the packed property bytes read in one go.
    const s3_len: u16 = reader.readInt(u16, native_endian) catch unreachable;
    self.s3 = try allocator.alloc(u8, s3_len);
    errdefer allocator.free(self.s3);
    _ = reader.readAll(self.s3) catch unreachable;

    return self;
}
| 45 | |||
/// Release the three lookup tables.
///
/// `allocator` is handed to `free` for each of `s1`, `s2` and `s3`, so it
/// should be the allocator those slices were allocated with (the one given
/// to `init`).
pub fn deinit(graphemes: *const Graphemes, allocator: mem.Allocator) void {
    const tables = graphemes.*;
    allocator.free(tables.s1);
    allocator.free(tables.s2);
    allocator.free(tables.s3);
}
| 51 | |||
/// Look up the grapheme break property of `cp`.
///
/// Walks the three tables (`s1` by `cp >> 8`, `s2` by that base plus the low
/// byte of `cp`, then `s3`) and converts the high nibble of the resulting
/// packed byte to a `Gbp`.
pub fn gbp(graphemes: Graphemes, cp: u21) Gbp {
    const block_base = graphemes.s1[cp >> 8];
    const props_index = graphemes.s2[block_base + (cp & 0xff)];
    const props = graphemes.s3[props_index];
    return @enumFromInt(props >> 4);
}
| 56 | |||
/// Look up the Indic syllable type of `cp`.
///
/// Walks the three tables (`s1` by `cp >> 8`, `s2` by that base plus the low
/// byte of `cp`, then `s3`) and converts bits 1..3 of the resulting packed
/// byte to an `Indic`.
pub fn indic(graphemes: Graphemes, cp: u21) Indic {
    const block_base = graphemes.s1[cp >> 8];
    const props_index = graphemes.s2[block_base + (cp & 0xff)];
    const props = graphemes.s3[props_index];
    return @enumFromInt((props >> 1) & 0x7);
}
| 61 | |||
/// Report whether `cp` has the emoji property.
///
/// Walks the three tables (`s1` by `cp >> 8`, `s2` by that base plus the low
/// byte of `cp`, then `s3`) and tests bit 0 of the resulting packed byte.
pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
    const block_base = graphemes.s1[cp >> 8];
    const props_index = graphemes.s2[block_base + (cp & 0xff)];
    const props = graphemes.s3[props_index];
    return (props & 1) == 1;
}
| 66 | |||
/// Create a grapheme cluster `Iterator` over `string`.
///
/// The iterator keeps the `graphemes` pointer for its property lookups, so
/// `graphemes` must stay valid for as long as the iterator is in use.
/// `Iterator.init` assumes `string` is valid UTF-8.
pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
    const iter = Iterator.init(string, graphemes);
    return iter;
}
| 70 | |||
/// Indic syllable type of a code point.
///
/// `indic` produces these with `@enumFromInt` on bits 1..3 of a packed
/// property byte, so a variant's position in this list is its encoded value.
/// Do not reorder the variants.
pub const Indic = enum {
    none, // 0
    Consonant, // 1
    Extend, // 2
    Linker, // 3
};
| 79 | |||
/// Grapheme break property of a code point.
///
/// `gbp` produces these with `@enumFromInt` on the high nibble of a packed
/// property byte, so a variant's position in this list is its encoded value.
/// Do not reorder the variants.
pub const Gbp = enum {
    none, // 0
    Control, // 1
    CR, // 2
    Extend, // 3
    L, // 4
    LF, // 5
    LV, // 6
    LVT, // 7
    Prepend, // 8
    Regional_Indicator, // 9
    SpacingMark, // 10
    T, // 11
    V, // 12
    ZWJ, // 13
};
| 8 | 97 | ||
| 9 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | 98 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. |
| 10 | pub const Grapheme = struct { | 99 | pub const Grapheme = struct { |
| @@ -22,12 +111,12 @@ pub const Grapheme = struct { | |||
| 22 | pub const Iterator = struct { | 111 | pub const Iterator = struct { |
| 23 | buf: [2]?CodePoint = .{ null, null }, | 112 | buf: [2]?CodePoint = .{ null, null }, |
| 24 | cp_iter: CodePointIterator, | 113 | cp_iter: CodePointIterator, |
| 25 | data: *const GraphemeData, | 114 | data: *const Graphemes, |
| 26 | 115 | ||
| 27 | const Self = @This(); | 116 | const Self = @This(); |
| 28 | 117 | ||
| 29 | /// Assumes `src` is valid UTF-8. | 118 | /// Assumes `src` is valid UTF-8. |
| 30 | pub fn init(str: []const u8, data: *const GraphemeData) Self { | 119 | pub fn init(str: []const u8, data: *const Graphemes) Self { |
| 31 | var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; | 120 | var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; |
| 32 | self.advance(); | 121 | self.advance(); |
| 33 | return self; | 122 | return self; |
| @@ -149,7 +238,7 @@ pub const Iterator = struct { | |||
| 149 | }; | 238 | }; |
| 150 | 239 | ||
| 151 | // Predicates | 240 | // Predicates |
| 152 | fn isBreaker(cp: u21, data: *const GraphemeData) bool { | 241 | fn isBreaker(cp: u21, data: *const Graphemes) bool { |
| 153 | // Extract relevant properties. | 242 | // Extract relevant properties. |
| 154 | const cp_gbp_prop = data.gbp(cp); | 243 | const cp_gbp_prop = data.gbp(cp); |
| 155 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | 244 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; |
| @@ -202,7 +291,7 @@ pub const State = struct { | |||
| 202 | pub fn graphemeBreak( | 291 | pub fn graphemeBreak( |
| 203 | cp1: u21, | 292 | cp1: u21, |
| 204 | cp2: u21, | 293 | cp2: u21, |
| 205 | data: *const GraphemeData, | 294 | data: *const Graphemes, |
| 206 | state: *State, | 295 | state: *State, |
| 207 | ) bool { | 296 | ) bool { |
| 208 | // Extract relevant properties. | 297 | // Extract relevant properties. |
| @@ -306,25 +395,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 306 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | 395 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; |
| 307 | const no_joiner = seq_1 ++ seq_2; | 396 | const no_joiner = seq_1 ++ seq_2; |
| 308 | 397 | ||
| 309 | const data = try GraphemeData.init(std.testing.allocator); | 398 | const graphemes = try Graphemes.init(std.testing.allocator); |
| 310 | defer data.deinit(std.testing.allocator); | 399 | defer graphemes.deinit(std.testing.allocator); |
| 311 | 400 | ||
| 312 | { | 401 | { |
| 313 | var iter = Iterator.init(with_zwj, &data); | 402 | var iter = graphemes.iterator(with_zwj); |
| 314 | var i: usize = 0; | 403 | var i: usize = 0; |
| 315 | while (iter.next()) |_| : (i += 1) {} | 404 | while (iter.next()) |_| : (i += 1) {} |
| 316 | try std.testing.expectEqual(@as(usize, 1), i); | 405 | try std.testing.expectEqual(@as(usize, 1), i); |
| 317 | } | 406 | } |
| 318 | 407 | ||
| 319 | { | 408 | { |
| 320 | var iter = Iterator.init(with_zwsp, &data); | 409 | var iter = graphemes.iterator(with_zwsp); |
| 321 | var i: usize = 0; | 410 | var i: usize = 0; |
| 322 | while (iter.next()) |_| : (i += 1) {} | 411 | while (iter.next()) |_| : (i += 1) {} |
| 323 | try std.testing.expectEqual(@as(usize, 3), i); | 412 | try std.testing.expectEqual(@as(usize, 3), i); |
| 324 | } | 413 | } |
| 325 | 414 | ||
| 326 | { | 415 | { |
| 327 | var iter = Iterator.init(no_joiner, &data); | 416 | var iter = graphemes.iterator(no_joiner); |
| 328 | var i: usize = 0; | 417 | var i: usize = 0; |
| 329 | while (iter.next()) |_| : (i += 1) {} | 418 | while (iter.next()) |_| : (i += 1) {} |
| 330 | try std.testing.expectEqual(@as(usize, 2), i); | 419 | try std.testing.expectEqual(@as(usize, 2), i); |