summary refs log tree commit diff
path: root/src/grapheme.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/grapheme.zig')
-rw-r--r-- src/grapheme.zig 73
1 files changed, 32 insertions, 41 deletions
diff --git a/src/grapheme.zig b/src/grapheme.zig
index 3fdf10b..7125b5b 100644
--- a/src/grapheme.zig
+++ b/src/grapheme.zig
@@ -1,9 +1,10 @@
1const std = @import("std"); 1const std = @import("std");
2const mem = std.mem;
2const unicode = std.unicode; 3const unicode = std.unicode;
3 4
4const CodePoint = @import("code_point").CodePoint; 5const CodePoint = @import("code_point").CodePoint;
5const CodePointIterator = @import("code_point").Iterator; 6const CodePointIterator = @import("code_point").Iterator;
6const gbp = @import("gbp"); 7pub const Data = @import("GraphemeData");
7 8
8/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. 9/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
9pub const Grapheme = struct { 10pub const Grapheme = struct {
@@ -21,12 +22,13 @@ pub const Grapheme = struct {
21pub const Iterator = struct { 22pub const Iterator = struct {
22 buf: [2]?CodePoint = .{ null, null }, 23 buf: [2]?CodePoint = .{ null, null },
23 cp_iter: CodePointIterator, 24 cp_iter: CodePointIterator,
25 data: *Data,
24 26
25 const Self = @This(); 27 const Self = @This();
26 28
27 /// Assumes `str` is valid UTF-8. 29 /// Assumes `str` is valid UTF-8.
28 pub fn init(str: []const u8) Self { 30 pub fn init(str: []const u8, data: *Data) Self {
29 var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; 31 var self = Self{ .cp_iter = .{ .bytes = str }, .data = data };
30 self.advance(); 32 self.advance();
31 return self; 33 return self;
32 } 34 }
@@ -55,6 +57,7 @@ pub const Iterator = struct {
55 if (graphemeBreak( 57 if (graphemeBreak(
56 self.buf[0].?.code, 58 self.buf[0].?.code,
57 self.buf[1].?.code, 59 self.buf[1].?.code,
60 self.data,
58 &state, 61 &state,
59 )) return Grapheme{ .len = gc_len, .offset = gc_start }; 62 )) return Grapheme{ .len = gc_len, .offset = gc_start };
60 63
@@ -67,6 +70,7 @@ pub const Iterator = struct {
67 if (graphemeBreak( 70 if (graphemeBreak(
68 self.buf[0].?.code, 71 self.buf[0].?.code,
69 if (self.buf[1]) |ncp| ncp.code else 0, 72 if (self.buf[1]) |ncp| ncp.code else 0,
73 self.data,
70 &state, 74 &state,
71 )) break; 75 )) break;
72 } 76 }
@@ -76,18 +80,12 @@ pub const Iterator = struct {
76}; 80};
77 81
78// Predicates 82// Predicates
79fn isBreaker(cp: u21) bool { 83fn isBreaker(cp: u21, data: *Data) bool {
80 // Extract relevant properties. 84 // Extract relevant properties.
81 const cp_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; 85 const cp_gbp_prop = data.gbp(cp);
82 const cp_gbp_prop: gbp.Gbp = @enumFromInt(cp_props_byte >> 4);
83 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; 86 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
84} 87}
85 88
86fn isIgnorable(cp: u21) bool {
87 const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]];
88 return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}';
89}
90
91// Grapheme break state. 89// Grapheme break state.
92const State = struct { 90const State = struct {
93 bits: u3 = 0, 91 bits: u3 = 0,
@@ -135,18 +133,17 @@ const State = struct {
135pub fn graphemeBreak( 133pub fn graphemeBreak(
136 cp1: u21, 134 cp1: u21,
137 cp2: u21, 135 cp2: u21,
136 data: *Data,
138 state: *State, 137 state: *State,
139) bool { 138) bool {
140 // Extract relevant properties. 139 // Extract relevant properties.
141 const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; 140 const cp1_gbp_prop = data.gbp(cp1);
142 const cp1_gbp_prop: gbp.Gbp = @enumFromInt(cp1_props_byte >> 4); 141 const cp1_indic_prop = data.indic(cp1);
143 const cp1_indic_prop: gbp.Indic = @enumFromInt((cp1_props_byte >> 1) & 0x7); 142 const cp1_is_emoji = data.isEmoji(cp1);
144 const cp1_is_emoji = cp1_props_byte & 1 == 1;
145 143
146 const cp2_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; 144 const cp2_gbp_prop = data.gbp(cp2);
147 const cp2_gbp_prop: gbp.Gbp = @enumFromInt(cp2_props_byte >> 4); 145 const cp2_indic_prop = data.indic(cp2);
148 const cp2_indic_prop: gbp.Indic = @enumFromInt((cp2_props_byte >> 1) & 0x7); 146 const cp2_is_emoji = data.isEmoji(cp2);
149 const cp2_is_emoji = cp2_props_byte & 1 == 1;
150 147
151 // GB11: Emoji Extend* ZWJ x Emoji 148 // GB11: Emoji Extend* ZWJ x Emoji
152 if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); 149 if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
@@ -157,7 +154,7 @@ pub fn graphemeBreak(
157 if (cp1 == '\r' and cp2 == '\n') return false; 154 if (cp1 == '\r' and cp2 == '\n') return false;
158 155
159 // GB4: Control 156 // GB4: Control
160 if (isBreaker(cp1)) return true; 157 if (isBreaker(cp1, data)) return true;
161 158
162 // GB11: Emoji Extend* ZWJ x Emoji 159 // GB11: Emoji Extend* ZWJ x Emoji
163 if (state.hasXpic() and 160 if (state.hasXpic() and
@@ -175,7 +172,7 @@ pub fn graphemeBreak(
175 if (cp2_gbp_prop == .SpacingMark) return false; 172 if (cp2_gbp_prop == .SpacingMark) return false;
176 173
177 // GB9b: Prepend x 174 // GB9b: Prepend x
178 if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false; 175 if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false;
179 176
180 // GB12, GB13: RI x RI 177 // GB12, GB13: RI x RI
181 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { 178 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
@@ -240,6 +237,9 @@ test "Segmentation GraphemeIterator" {
240 var buf_reader = std.io.bufferedReader(file.reader()); 237 var buf_reader = std.io.bufferedReader(file.reader());
241 var input_stream = buf_reader.reader(); 238 var input_stream = buf_reader.reader();
242 239
240 var data = try Data.init(allocator);
241 defer data.deinit();
242
243 var buf: [4096]u8 = undefined; 243 var buf: [4096]u8 = undefined;
244 var line_no: usize = 1; 244 var line_no: usize = 1;
245 245
@@ -282,7 +282,7 @@ test "Segmentation GraphemeIterator" {
282 } 282 }
283 283
284 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); 284 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
285 var iter = Iterator.init(all_bytes.items); 285 var iter = Iterator.init(all_bytes.items, &data);
286 286
287 // Check. 287 // Check.
288 for (want.items) |want_gc| { 288 for (want.items) |want_gc| {
@@ -295,19 +295,6 @@ test "Segmentation GraphemeIterator" {
295 } 295 }
296} 296}
297 297
298test "Segmentation comptime GraphemeIterator" {
299 const want = [_][]const u8{ "H", "é", "l", "l", "o" };
300
301 comptime {
302 const src = "Héllo";
303 var ct_iter = Iterator.init(src);
304 var i = 0;
305 while (ct_iter.next()) |grapheme| : (i += 1) {
306 try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]);
307 }
308 }
309}
310
311test "Segmentation ZWJ and ZWSP emoji sequences" { 298test "Segmentation ZWJ and ZWSP emoji sequences" {
312 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 299 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
313 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 300 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
@@ -315,18 +302,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
315 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; 302 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
316 const no_joiner = seq_1 ++ seq_2; 303 const no_joiner = seq_1 ++ seq_2;
317 304
318 var ct_iter = Iterator.init(with_zwj); 305 var data = try Data.init(std.testing.allocator);
306 defer data.deinit();
307
308 var iter = Iterator.init(with_zwj, &data);
309
319 var i: usize = 0; 310 var i: usize = 0;
320 while (ct_iter.next()) |_| : (i += 1) {} 311 while (iter.next()) |_| : (i += 1) {}
321 try std.testing.expectEqual(@as(usize, 1), i); 312 try std.testing.expectEqual(@as(usize, 1), i);
322 313
323 ct_iter = Iterator.init(with_zwsp); 314 iter = Iterator.init(with_zwsp, &data);
324 i = 0; 315 i = 0;
325 while (ct_iter.next()) |_| : (i += 1) {} 316 while (iter.next()) |_| : (i += 1) {}
326 try std.testing.expectEqual(@as(usize, 3), i); 317 try std.testing.expectEqual(@as(usize, 3), i);
327 318
328 ct_iter = Iterator.init(no_joiner); 319 iter = Iterator.init(no_joiner, &data);
329 i = 0; 320 i = 0;
330 while (ct_iter.next()) |_| : (i += 1) {} 321 while (iter.next()) |_| : (i += 1) {}
331 try std.testing.expectEqual(@as(usize, 2), i); 322 try std.testing.expectEqual(@as(usize, 2), i);
332} 323}