summaryrefslogtreecommitdiff
path: root/src/grapheme.zig
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-04-30 11:58:19 -0400
committerGravatar Sam Atman2025-04-30 11:58:19 -0400
commit1be5e46490e061761b4b97dff5c6acb2181d6fe9 (patch)
tree77a1edcdedd7afae7428e92feba37d2bb1035b22 /src/grapheme.zig
parentAdd general tests step (diff)
downloadzg-1be5e46490e061761b4b97dff5c6acb2181d6fe9.tar.gz
zg-1be5e46490e061761b4b97dff5c6acb2181d6fe9.tar.xz
zg-1be5e46490e061761b4b97dff5c6acb2181d6fe9.zip
Factor out 'Data' for grapheme and DisplayWidth
This is part of an ongoing refactor of the whole library, so that it doesn't expose anything called "Data" separately from user functionality.
Diffstat (limited to 'src/grapheme.zig')
-rw-r--r--src/grapheme.zig109
1 files changed, 99 insertions, 10 deletions
diff --git a/src/grapheme.zig b/src/grapheme.zig
index 25fd71d..79cd2c6 100644
--- a/src/grapheme.zig
+++ b/src/grapheme.zig
@@ -1,10 +1,99 @@
1const std = @import("std"); 1const std = @import("std");
2const builtin = @import("builtin");
2const mem = std.mem; 3const mem = std.mem;
4const Allocator = mem.Allocator;
5const compress = std.compress;
3const unicode = std.unicode; 6const unicode = std.unicode;
4 7
5const CodePoint = @import("code_point").CodePoint; 8const CodePoint = @import("code_point").CodePoint;
6const CodePointIterator = @import("code_point").Iterator; 9const CodePointIterator = @import("code_point").Iterator;
7pub const GraphemeData = @import("GraphemeData"); 10
/// First-stage table, indexed by `cp >> 8`. Each entry is added to the low
/// byte of the code point to form an index into `s2`.
s1: []u16 = undefined,
/// Second-stage table. Each entry is an index into `s3`.
s2: []u16 = undefined,
/// Packed property bytes: the high four bits are decoded as `Gbp`, bits 1-3
/// as `Indic`, and bit 0 as the emoji flag.
s3: []u8 = undefined,

/// This file is itself the `Graphemes` struct.
const Graphemes = @This();
16
/// Create a `Graphemes` by inflating the embedded `gbp` blob into its three
/// lookup tables.
///
/// The decompressed stream is consumed in order as three sections, each
/// prefixed by a `u16` element count: `s1` (`u16` entries), `s2` (`u16`
/// entries), then `s3` (raw bytes). Integers are read using the endianness of
/// the target CPU architecture.
///
/// Returns an allocator error if a table allocation fails; tables allocated
/// before the failure are freed. On success the caller owns the tables and
/// releases them with `deinit`.
pub inline fn init(allocator: mem.Allocator) mem.Allocator.Error!Graphemes {
    const native_endian = builtin.cpu.arch.endian();

    const packed_tables = @embedFile("gbp");
    var blob_stream = std.io.fixedBufferStream(packed_tables);
    var inflater = compress.flate.inflate.decompressor(.raw, blob_stream.reader());
    var table_reader = inflater.reader();

    var result = Graphemes{};

    // Every read below treats failure as unreachable; only allocation can
    // return an error from this function.
    // NOTE(review): presumably safe because the blob is embedded at build
    // time -- confirm against the generator that produces `gbp`.

    // Stage 1: u16 entries.
    const stage1_count: u16 = table_reader.readInt(u16, native_endian) catch unreachable;
    result.s1 = try allocator.alloc(u16, stage1_count);
    errdefer allocator.free(result.s1);
    for (result.s1) |*slot| slot.* = table_reader.readInt(u16, native_endian) catch unreachable;

    // Stage 2: u16 entries.
    const stage2_count: u16 = table_reader.readInt(u16, native_endian) catch unreachable;
    result.s2 = try allocator.alloc(u16, stage2_count);
    errdefer allocator.free(result.s2);
    for (result.s2) |*slot| slot.* = table_reader.readInt(u16, native_endian) catch unreachable;

    // Stage 3: raw bytes, filled in one read. The returned byte count is
    // discarded.
    const stage3_count: u16 = table_reader.readInt(u16, native_endian) catch unreachable;
    result.s3 = try allocator.alloc(u8, stage3_count);
    errdefer allocator.free(result.s3);
    _ = table_reader.readAll(result.s3) catch unreachable;

    return result;
}
45
/// Free the three lookup tables held by `graphemes` using `allocator`.
pub fn deinit(graphemes: *const Graphemes, allocator: mem.Allocator) void {
    const tables = graphemes.*;
    allocator.free(tables.s1);
    allocator.free(tables.s2);
    allocator.free(tables.s3);
}
51
/// Look up the grapheme break property for a code point.
///
/// Walks the staged tables (`s1` by `cp >> 8`, then `s2`, then `s3`) and
/// decodes the high four bits of the resulting property byte.
pub fn gbp(graphemes: Graphemes, cp: u21) Gbp {
    const block_base = graphemes.s1[cp >> 8];
    const record_index = graphemes.s2[block_base + (cp & 0xff)];
    const props = graphemes.s3[record_index];
    return @enumFromInt(props >> 4);
}
56
/// Look up the indic syllable type for a code point.
///
/// Walks the staged tables (`s1` by `cp >> 8`, then `s2`, then `s3`) and
/// decodes bits 1-3 of the resulting property byte.
pub fn indic(graphemes: Graphemes, cp: u21) Indic {
    const block_base = graphemes.s1[cp >> 8];
    const record_index = graphemes.s2[block_base + (cp & 0xff)];
    const props = graphemes.s3[record_index];
    return @enumFromInt((props >> 1) & 0x7);
}
61
/// Look up the emoji property for a code point.
///
/// Walks the staged tables (`s1` by `cp >> 8`, then `s2`, then `s3`) and
/// reports whether bit 0 of the resulting property byte is set.
pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
    const block_base = graphemes.s1[cp >> 8];
    const record_index = graphemes.s2[block_base + (cp & 0xff)];
    const props = graphemes.s3[record_index];
    return (props & 1) == 1;
}
66
/// Create a grapheme `Iterator` over `string` that uses these tables.
///
/// The iterator stores the `graphemes` pointer, so the tables must not be
/// freed while it is in use. `Iterator.init` documents that it assumes valid
/// UTF-8 input.
pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
    const iter = Iterator.init(string, graphemes);
    return iter;
}
70
/// Indic syllable type.
///
/// Values are produced by `indic` via `@enumFromInt`, so the declaration
/// order fixes the integer tag of each member and must not be changed
/// independently of the packed table data.
pub const Indic = enum {
    /// Tag 0.
    none,
    /// Tag 1.
    Consonant,
    /// Tag 2.
    Extend,
    /// Tag 3.
    Linker,
};
79
/// Grapheme break property.
///
/// Values are produced by `gbp` via `@enumFromInt` from a four-bit field, so
/// the declaration order fixes the integer tag of each member (`none` is 0,
/// `ZWJ` is 13) and must not be changed independently of the packed table
/// data.
pub const Gbp = enum {
    // Tag 0.
    none,

    // Tags 1-5.
    Control,
    CR,
    Extend,
    L,
    LF,

    // Tags 6-10.
    LV,
    LVT,
    Prepend,
    Regional_Indicator,
    SpacingMark,

    // Tags 11-13.
    T,
    V,
    ZWJ,
};
8 97
9/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. 98/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
10pub const Grapheme = struct { 99pub const Grapheme = struct {
@@ -22,12 +111,12 @@ pub const Grapheme = struct {
22pub const Iterator = struct { 111pub const Iterator = struct {
23 buf: [2]?CodePoint = .{ null, null }, 112 buf: [2]?CodePoint = .{ null, null },
24 cp_iter: CodePointIterator, 113 cp_iter: CodePointIterator,
25 data: *const GraphemeData, 114 data: *const Graphemes,
26 115
27 const Self = @This(); 116 const Self = @This();
28 117
29 /// Assumes `src` is valid UTF-8. 118 /// Assumes `src` is valid UTF-8.
30 pub fn init(str: []const u8, data: *const GraphemeData) Self { 119 pub fn init(str: []const u8, data: *const Graphemes) Self {
31 var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; 120 var self = Self{ .cp_iter = .{ .bytes = str }, .data = data };
32 self.advance(); 121 self.advance();
33 return self; 122 return self;
@@ -149,7 +238,7 @@ pub const Iterator = struct {
149}; 238};
150 239
151// Predicates 240// Predicates
152fn isBreaker(cp: u21, data: *const GraphemeData) bool { 241fn isBreaker(cp: u21, data: *const Graphemes) bool {
153 // Extract relevant properties. 242 // Extract relevant properties.
154 const cp_gbp_prop = data.gbp(cp); 243 const cp_gbp_prop = data.gbp(cp);
155 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; 244 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
@@ -202,7 +291,7 @@ pub const State = struct {
202pub fn graphemeBreak( 291pub fn graphemeBreak(
203 cp1: u21, 292 cp1: u21,
204 cp2: u21, 293 cp2: u21,
205 data: *const GraphemeData, 294 data: *const Graphemes,
206 state: *State, 295 state: *State,
207) bool { 296) bool {
208 // Extract relevant properties. 297 // Extract relevant properties.
@@ -306,25 +395,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
306 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; 395 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
307 const no_joiner = seq_1 ++ seq_2; 396 const no_joiner = seq_1 ++ seq_2;
308 397
309 const data = try GraphemeData.init(std.testing.allocator); 398 const graphemes = try Graphemes.init(std.testing.allocator);
310 defer data.deinit(std.testing.allocator); 399 defer graphemes.deinit(std.testing.allocator);
311 400
312 { 401 {
313 var iter = Iterator.init(with_zwj, &data); 402 var iter = graphemes.iterator(with_zwj);
314 var i: usize = 0; 403 var i: usize = 0;
315 while (iter.next()) |_| : (i += 1) {} 404 while (iter.next()) |_| : (i += 1) {}
316 try std.testing.expectEqual(@as(usize, 1), i); 405 try std.testing.expectEqual(@as(usize, 1), i);
317 } 406 }
318 407
319 { 408 {
320 var iter = Iterator.init(with_zwsp, &data); 409 var iter = graphemes.iterator(with_zwsp);
321 var i: usize = 0; 410 var i: usize = 0;
322 while (iter.next()) |_| : (i += 1) {} 411 while (iter.next()) |_| : (i += 1) {}
323 try std.testing.expectEqual(@as(usize, 3), i); 412 try std.testing.expectEqual(@as(usize, 3), i);
324 } 413 }
325 414
326 { 415 {
327 var iter = Iterator.init(no_joiner, &data); 416 var iter = graphemes.iterator(no_joiner);
328 var i: usize = 0; 417 var i: usize = 0;
329 while (iter.next()) |_| : (i += 1) {} 418 while (iter.next()) |_| : (i += 1) {}
330 try std.testing.expectEqual(@as(usize, 2), i); 419 try std.testing.expectEqual(@as(usize, 2), i);