summaryrefslogtreecommitdiff
path: root/src/grapheme.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/grapheme.zig')
-rw-r--r--src/grapheme.zig109
1 files changed, 99 insertions, 10 deletions
diff --git a/src/grapheme.zig b/src/grapheme.zig
index 25fd71d..79cd2c6 100644
--- a/src/grapheme.zig
+++ b/src/grapheme.zig
@@ -1,10 +1,99 @@
1const std = @import("std"); 1const std = @import("std");
2const builtin = @import("builtin");
2const mem = std.mem; 3const mem = std.mem;
4const Allocator = mem.Allocator;
5const compress = std.compress;
3const unicode = std.unicode; 6const unicode = std.unicode;
4 7
5const CodePoint = @import("code_point").CodePoint; 8const CodePoint = @import("code_point").CodePoint;
6const CodePointIterator = @import("code_point").Iterator; 9const CodePointIterator = @import("code_point").Iterator;
7pub const GraphemeData = @import("GraphemeData"); 10
11s1: []u16 = undefined,
12s2: []u16 = undefined,
13s3: []u8 = undefined,
14
15const Graphemes = @This();
16
/// Builds the grapheme property tables by inflating the embedded `gbp` blob.
///
/// The decompressed stream is consumed as three length-prefixed tables:
/// a `u16` count followed by that many `u16` entries for `s1`, the same
/// layout for `s2`, then a `u16` count followed by that many raw bytes
/// for `s3`.
///
/// Returns `error.OutOfMemory` if a table cannot be allocated; tables that
/// were already allocated are released before the error is returned.
/// On success the caller owns the tables and releases them with `deinit`.
pub inline fn init(allocator: mem.Allocator) mem.Allocator.Error!Graphemes {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("gbp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    // Multi-byte integers are decoded in the target's native byte order.
    // NOTE(review): presumes the blob was written with the same
    // endianness; confirm against whatever generates `gbp`.
    const endian = builtin.cpu.arch.endian();

    var self = Graphemes{};

    // Read errors are asserted away with `unreachable` throughout: the input
    // is embedded at compile time rather than supplied at runtime.
    const s1_len: u16 = reader.readInt(u16, endian) catch unreachable;
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (self.s1) |*entry| entry.* = reader.readInt(u16, endian) catch unreachable;

    const s2_len: u16 = reader.readInt(u16, endian) catch unreachable;
    self.s2 = try allocator.alloc(u16, s2_len);
    errdefer allocator.free(self.s2);
    for (self.s2) |*entry| entry.* = reader.readInt(u16, endian) catch unreachable;

    const s3_len: u16 = reader.readInt(u16, endian) catch unreachable;
    self.s3 = try allocator.alloc(u8, s3_len);
    errdefer allocator.free(self.s3);
    // `readNoEof` rather than `readAll`: `readAll` reports a short read only
    // through its returned byte count, which was being discarded, so a
    // truncated stream would leave the tail of `s3` uninitialised. With
    // `readNoEof` a short read trips the same `unreachable` as the `readInt`
    // calls above.
    reader.readNoEof(self.s3) catch unreachable;

    return self;
}
45
/// Releases the three lookup tables (`s1`, `s2`, `s3`) through `allocator`.
///
/// `allocator` is expected to be the one that was handed to `init`.
pub fn deinit(graphemes: *const Graphemes, allocator: mem.Allocator) void {
    // The tables have different element types, so the loop is unrolled at
    // compile time over the field names; the free order is s1, s2, s3.
    inline for (.{ "s1", "s2", "s3" }) |table_name| {
        allocator.free(@field(graphemes, table_name));
    }
}
51
/// Returns the grapheme break property recorded for `cp`.
///
/// The lookup is staged: `s1` is indexed by `cp >> 8`; that entry plus the
/// low byte of `cp` indexes `s2`; the `s2` entry indexes `s3`. The property
/// is stored in the upper four bits of the resulting `s3` byte.
pub fn gbp(graphemes: Graphemes, cp: u21) Gbp {
    const block_start = graphemes.s1[cp >> 8];
    const stage2_index = block_start + (cp & 0xff);
    const props = graphemes.s3[graphemes.s2[stage2_index]];
    return @enumFromInt(props >> 4);
}
56
/// Returns the indic syllable type recorded for `cp`.
///
/// Uses the staged lookup (`s1` by `cp >> 8`, then `s2` by that entry plus
/// the low byte of `cp`, then `s3`). The type occupies bits 1 through 3 of
/// the resulting `s3` byte.
pub fn indic(graphemes: Graphemes, cp: u21) Indic {
    const block_start = graphemes.s1[cp >> 8];
    const stage2_index = block_start + (cp & 0xff);
    const props = graphemes.s3[graphemes.s2[stage2_index]];
    const indic_bits = (props >> 1) & 0x7;
    return @enumFromInt(indic_bits);
}
61
/// Reports whether the emoji property is set for `cp`.
///
/// Uses the staged lookup (`s1` by `cp >> 8`, then `s2` by that entry plus
/// the low byte of `cp`, then `s3`). The flag is the lowest bit of the
/// resulting `s3` byte.
pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
    const block_start = graphemes.s1[cp >> 8];
    const stage2_index = block_start + (cp & 0xff);
    const props = graphemes.s3[graphemes.s2[stage2_index]];
    return (props & 1) == 1;
}
66
/// Creates a grapheme `Iterator` over `string` backed by these tables.
///
/// The iterator keeps the `graphemes` pointer, so the tables need to stay
/// alive for as long as the iterator is in use.
pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
    const iter = Iterator.init(string, graphemes);
    return iter;
}
70
/// Indic syllable type.
///
/// Values of this enum are produced with `@enumFromInt` from bits stored in
/// the packed property data, so the numeric value of each tag is part of the
/// data format. They are spelled out here to make that coupling visible.
pub const Indic = enum(u2) {
    none = 0,

    Consonant = 1,
    Extend = 2,
    Linker = 3,
};
79
/// Grapheme break property.
///
/// Values of this enum are produced with `@enumFromInt` from a four-bit
/// field of the packed property data, so the numeric value of each tag is
/// part of the data format. They are spelled out here to make that coupling
/// visible.
pub const Gbp = enum(u4) {
    none = 0,
    Control = 1,
    CR = 2,
    Extend = 3,
    L = 4,
    LF = 5,
    LV = 6,
    LVT = 7,
    Prepend = 8,
    Regional_Indicator = 9,
    SpacingMark = 10,
    T = 11,
    V = 12,
    ZWJ = 13,
};
8 97
9/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. 98/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
10pub const Grapheme = struct { 99pub const Grapheme = struct {
@@ -22,12 +111,12 @@ pub const Grapheme = struct {
22pub const Iterator = struct { 111pub const Iterator = struct {
23 buf: [2]?CodePoint = .{ null, null }, 112 buf: [2]?CodePoint = .{ null, null },
24 cp_iter: CodePointIterator, 113 cp_iter: CodePointIterator,
25 data: *const GraphemeData, 114 data: *const Graphemes,
26 115
27 const Self = @This(); 116 const Self = @This();
28 117
29 /// Assumes `src` is valid UTF-8. 118 /// Assumes `src` is valid UTF-8.
30 pub fn init(str: []const u8, data: *const GraphemeData) Self { 119 pub fn init(str: []const u8, data: *const Graphemes) Self {
31 var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; 120 var self = Self{ .cp_iter = .{ .bytes = str }, .data = data };
32 self.advance(); 121 self.advance();
33 return self; 122 return self;
@@ -149,7 +238,7 @@ pub const Iterator = struct {
149}; 238};
150 239
151// Predicates 240// Predicates
152fn isBreaker(cp: u21, data: *const GraphemeData) bool { 241fn isBreaker(cp: u21, data: *const Graphemes) bool {
153 // Extract relevant properties. 242 // Extract relevant properties.
154 const cp_gbp_prop = data.gbp(cp); 243 const cp_gbp_prop = data.gbp(cp);
155 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; 244 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
@@ -202,7 +291,7 @@ pub const State = struct {
202pub fn graphemeBreak( 291pub fn graphemeBreak(
203 cp1: u21, 292 cp1: u21,
204 cp2: u21, 293 cp2: u21,
205 data: *const GraphemeData, 294 data: *const Graphemes,
206 state: *State, 295 state: *State,
207) bool { 296) bool {
208 // Extract relevant properties. 297 // Extract relevant properties.
@@ -306,25 +395,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
306 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; 395 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
307 const no_joiner = seq_1 ++ seq_2; 396 const no_joiner = seq_1 ++ seq_2;
308 397
309 const data = try GraphemeData.init(std.testing.allocator); 398 const graphemes = try Graphemes.init(std.testing.allocator);
310 defer data.deinit(std.testing.allocator); 399 defer graphemes.deinit(std.testing.allocator);
311 400
312 { 401 {
313 var iter = Iterator.init(with_zwj, &data); 402 var iter = graphemes.iterator(with_zwj);
314 var i: usize = 0; 403 var i: usize = 0;
315 while (iter.next()) |_| : (i += 1) {} 404 while (iter.next()) |_| : (i += 1) {}
316 try std.testing.expectEqual(@as(usize, 1), i); 405 try std.testing.expectEqual(@as(usize, 1), i);
317 } 406 }
318 407
319 { 408 {
320 var iter = Iterator.init(with_zwsp, &data); 409 var iter = graphemes.iterator(with_zwsp);
321 var i: usize = 0; 410 var i: usize = 0;
322 while (iter.next()) |_| : (i += 1) {} 411 while (iter.next()) |_| : (i += 1) {}
323 try std.testing.expectEqual(@as(usize, 3), i); 412 try std.testing.expectEqual(@as(usize, 3), i);
324 } 413 }
325 414
326 { 415 {
327 var iter = Iterator.init(no_joiner, &data); 416 var iter = graphemes.iterator(no_joiner);
328 var i: usize = 0; 417 var i: usize = 0;
329 while (iter.next()) |_| : (i += 1) {} 418 while (iter.next()) |_| : (i += 1) {}
330 try std.testing.expectEqual(@as(usize, 2), i); 419 try std.testing.expectEqual(@as(usize, 2), i);