summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-07-08 12:15:32 -0400
committerGravatar Sam Atman2025-07-08 12:15:32 -0400
commit9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch)
tree2607c185fd8053b84d60041fadc35c05a0225d34 /src
parentMerge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff)
parentAdd Words.zig example to README (diff)
downloadzg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.tar.gz
zg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.tar.xz
zg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.zip
Merge branch 'develop-next'HEADv0.14.1master
Diffstat (limited to 'src')
-rw-r--r--src/CaseFolding.zig8
-rw-r--r--src/Graphemes.zig479
-rw-r--r--src/Words.zig773
-rw-r--r--src/code_point.zig200
-rw-r--r--src/micro_runeset.zig207
-rw-r--r--src/unicode_tests.zig398
6 files changed, 1873 insertions, 192 deletions
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig
index f63b860..ff41b3e 100644
--- a/src/CaseFolding.zig
+++ b/src/CaseFolding.zig
@@ -300,13 +300,13 @@ fn testAllocations(allocator: Allocator) !void {
300 { 300 {
301 const normalize = try Normalize.init(allocator); 301 const normalize = try Normalize.init(allocator);
302 defer normalize.deinit(allocator); 302 defer normalize.deinit(allocator);
303 const caser1 = try CaseFolding.initWithNormalize(allocator, normalize); 303 const caser = try CaseFolding.initWithNormalize(allocator, normalize);
304 defer caser1.deinit(allocator); 304 defer caser.deinit(allocator);
305 } 305 }
306 // With normalize owned 306 // With normalize owned
307 { 307 {
308 const caser2 = try CaseFolding.init(allocator); 308 const caser = try CaseFolding.init(allocator);
309 defer caser2.deinit(allocator); 309 defer caser.deinit(allocator);
310 } 310 }
311} 311}
312 312
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 7bf328a..f1c56ed 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -1,12 +1,7 @@
1const std = @import("std"); 1//! Graphemes Module
2const builtin = @import("builtin"); 2//!
3const mem = std.mem; 3//! Code for handling graphemes: fragments of string which should be
4const Allocator = mem.Allocator; 4//! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾
5const compress = std.compress;
6const unicode = std.unicode;
7
8const CodePoint = @import("code_point").CodePoint;
9const CodePointIterator = @import("code_point").Iterator;
10 5
11s1: []u16 = undefined, 6s1: []u16 = undefined,
12s2: []u16 = undefined, 7s2: []u16 = undefined,
@@ -66,10 +61,16 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
66 return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; 61 return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
67} 62}
68 63
64/// Returns an iterator over the graphemes in `string`.
69pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { 65pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
70 return Iterator.init(string, graphemes); 66 return Iterator.init(string, graphemes);
71} 67}
72 68
69/// Returns a reverse iterator over the graphemes in `string`.
70pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
71 return ReverseIterator.init(string, graphemes);
72}
73
73/// Indic syllable type. 74/// Indic syllable type.
74pub const Indic = enum { 75pub const Indic = enum {
75 none, 76 none,
@@ -99,8 +100,8 @@ pub const Gbp = enum {
99 100
100/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. 101/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
101pub const Grapheme = struct { 102pub const Grapheme = struct {
102 len: u8, 103 len: uoffset,
103 offset: u32, 104 offset: uoffset,
104 105
105 /// `bytes` returns the slice of bytes that correspond to 106 /// `bytes` returns the slice of bytes that correspond to
106 /// this grapheme cluster in `src`. 107 /// this grapheme cluster in `src`.
@@ -109,6 +110,96 @@ pub const Grapheme = struct {
109 } 110 }
110}; 111};
111 112
113// NOTE: graphemeAtIndex is, probably, not in an optimal form. It has the advantage
114// of being composed of other parts, but the constant factor can _probably_ be improved
115// by a bespoke implmentation using graphemes.graphemeBreak directly. There's a limit
116// to how much cycle-bumming I'm willing to do at any given moment; that limit has been
117// reached. Perhaps you, Dear Reader, might pick up the torch?
118
119/// Returns the `Grapheme` at `string[index]`, which does not have to be a
120/// valid start of a codepoint. Asserts the string is not empty. Index must be
121/// less than `string.len`. Always returns a `Grapheme`.
122pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme {
123 assert(string.len != 0);
124 if (index == 0 or (index > 0 and
125 string[index] < 0x80 and
126 string[index - 1] < 0x80) and
127 (string[index - 1] != '\r' and string[index] != '\n'))
128 {
129 // There's always a grapheme break between two ASCII code points (except CRLF)
130 var iter = graphemes.iterator(string[index..]);
131 const next = iter.next().?;
132 return Grapheme{
133 .len = next.len,
134 .offset = @as(u32, @intCast(index)) + next.offset,
135 };
136 } // Otherwise it gets hairy.
137 const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset;
138 if (idx == string.len) {
139 var iter = graphemes.reverseIterator(string);
140 return iter.prev().?;
141 }
142 // We're on a valid codepoint boundary, we go back from here
143 var r_iter = graphemes.reverseIterAtIndex(string, idx);
144 if (r_iter.prev()) |g| {
145 if (g.offset == 0) {
146 var iter = graphemes.iterator(string);
147 while (iter.next()) |g2| {
148 if (g2.offset <= idx and idx < g2.offset + g2.len) return g2;
149 }
150 }
151 }
152 // We need to toss one, because otherwise we might not be pending when
153 // we in fact need to be.
154 _ = r_iter.prev();
155 while (r_iter.pending != .none) : (_ = r_iter.prev()) {}
156 var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0);
157 while (iter.next()) |g| {
158 if (g.offset <= idx and idx < g.offset + g.len) return g;
159 }
160 unreachable;
161}
162
163/// Return a (forward) iterator of `string` after `grapheme`.
164pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator {
165 return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len);
166}
167
168/// Return a reverse iterator of `string` before `grapheme`.
169pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator {
170 // This bit of weirdness is because reverse iterators are "advance last",
171 // while forward iterators are "advance first". This leaves some room for
172 // further optimization, if anyone dares.
173 var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1);
174 _ = r_iter.prev();
175 return r_iter;
176}
177
178fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator {
179 var r_iter: ReverseIterator = undefined;
180 r_iter.data = graphemes;
181 var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
182 r_iter.buf[1] = rcp_iter.prev();
183 r_iter.buf[0] = rcp_iter.prev();
184 r_iter.pending = .none;
185 r_iter.cp_iter = rcp_iter;
186 return r_iter;
187}
188
189fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator {
190 var iter: Iterator = undefined;
191 iter.data = graphemes;
192 iter.buf[0] = first: {
193 if (idx == string.len) break :first null;
194 var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
195 break :first r_cp_iter.prev();
196 };
197 var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx };
198 iter.buf[1] = cp_iter.next();
199 iter.cp_iter = cp_iter;
200 return iter;
201}
202
112/// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. 203/// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
113pub const Iterator = struct { 204pub const Iterator = struct {
114 buf: [2]?CodePoint = .{ null, null }, 205 buf: [2]?CodePoint = .{ null, null },
@@ -143,7 +234,7 @@ pub const Iterator = struct {
143 234
144 const gc_start = self.buf[0].?.offset; 235 const gc_start = self.buf[0].?.offset;
145 var gc_len: u8 = self.buf[0].?.len; 236 var gc_len: u8 = self.buf[0].?.len;
146 var state = State{}; 237 var state = IterState{};
147 238
148 if (graphemeBreak( 239 if (graphemeBreak(
149 self.buf[0].?.code, 240 self.buf[0].?.code,
@@ -173,72 +264,244 @@ pub const Iterator = struct {
173 const saved_cp_iter = self.cp_iter; 264 const saved_cp_iter = self.cp_iter;
174 const s0 = self.buf[0]; 265 const s0 = self.buf[0];
175 const s1 = self.buf[1]; 266 const s1 = self.buf[1];
176 267 defer {
177 self.advance();
178
179 // If no more
180 if (self.buf[0] == null) {
181 self.cp_iter = saved_cp_iter;
182 self.buf[0] = s0;
183 self.buf[1] = s1;
184 return null;
185 }
186 // If last one
187 if (self.buf[1] == null) {
188 const len = self.buf[0].?.len;
189 const offset = self.buf[0].?.offset;
190 self.cp_iter = saved_cp_iter;
191 self.buf[0] = s0;
192 self.buf[1] = s1;
193 return Grapheme{ .len = len, .offset = offset };
194 }
195 // If ASCII
196 if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) {
197 const len = self.buf[0].?.len;
198 const offset = self.buf[0].?.offset;
199 self.cp_iter = saved_cp_iter; 268 self.cp_iter = saved_cp_iter;
200 self.buf[0] = s0; 269 self.buf[0] = s0;
201 self.buf[1] = s1; 270 self.buf[1] = s1;
202 return Grapheme{ .len = len, .offset = offset };
203 } 271 }
272 return self.next();
273 }
274};
204 275
205 const gc_start = self.buf[0].?.offset; 276/// Iterate a string backward by Grapheme.
206 var gc_len: u8 = self.buf[0].?.len; 277pub const ReverseIterator = struct {
207 var state = State{}; 278 buf: [2]?CodePoint = .{ null, null },
279 cp_iter: CodePointReverseIterator,
280 data: *const Graphemes,
281 /// Codepoint read from `cp_iter` but not returned by `previous`
282 pending: Pending = .none,
208 283
209 if (graphemeBreak( 284 const Pending = union(enum) {
210 self.buf[0].?.code, 285 none: void,
211 self.buf[1].?.code, 286 /// Count of pending RI codepoints, it is an even number
212 self.data, 287 ri_count: usize,
213 &state, 288 /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
214 )) { 289 extend_end: uoffset,
215 self.cp_iter = saved_cp_iter; 290 };
216 self.buf[0] = s0;
217 self.buf[1] = s1;
218 return Grapheme{ .len = gc_len, .offset = gc_start };
219 }
220 291
221 while (true) { 292 const Self = @This();
222 self.advance();
223 if (self.buf[0] == null) break;
224 293
225 gc_len += self.buf[0].?.len; 294 pub fn init(str: []const u8, data: *const Graphemes) Self {
295 var self: Self = .{ .cp_iter = .init(str), .data = data };
296 self.advance();
297 self.advance();
298 return self;
299 }
300
301 fn advance(self: *Self) void {
302 self.buf[1] = self.buf[0];
303 self.buf[0] = self.cp_iter.prev();
304 }
305
306 pub fn peek(self: *Self) ?Grapheme {
307 const cache = .{ self.buf, self.cp_iter, self.pending };
308 defer self.buf, self.cp_iter, self.pending = cache;
309 return self.prev();
310 }
311
312 pub fn prev(self: *Self) ?Grapheme {
313 if (self.buf[1] == null) return null;
314
315 const grapheme_end: uoffset = end: {
316 const codepoint = self.buf[1].?;
317
318 switch (self.pending) {
319 // BUF: [?Any, Any]
320 .none => break :end codepoint.offset + codepoint.len,
321 .ri_count => |ri_count| {
322 std.debug.assert(ri_count > 0);
323 std.debug.assert(ri_count % 2 == 0);
324
325 if (ri_count > 2) {
326 self.pending.ri_count -= 2;
327
328 // Use the fact that all RI have length 4 in utf8 encoding
329 // since they are in range 0x1f1e6...0x1f1ff
330 // https://en.wikipedia.org/wiki/UTF-8#Encoding
331 return Grapheme{
332 .len = 8,
333 .offset = @intCast(codepoint.offset + self.pending.ri_count * 4),
334 };
335 } else {
336 self.pending = .{ .none = {} };
337 break :end codepoint.offset + codepoint.len + 4;
338 }
339 },
340 // BUF: [?Any, Extend] Extend* ZWJ
341 .extend_end => |extend_end| {
342 self.pending = .{ .none = {} };
343 break :end extend_end;
344 },
345 }
346 };
347
348 while (self.buf[0] != null) {
349 var state: IterState = .{};
350 state.xpic = true;
351 state.regional = false;
352 state.indic = true;
226 353
227 if (graphemeBreak( 354 if (graphemeBreak(
228 self.buf[0].?.code, 355 self.buf[0].?.code,
229 if (self.buf[1]) |ncp| ncp.code else 0, 356 self.buf[1].?.code,
230 self.data, 357 self.data,
231 &state, 358 &state,
232 )) break; 359 )) break;
360
361 self.advance();
362
363 if (!state.indic) {
364
365 // BUF: [?Any, Extend | Linker] Consonant
366 var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
367
368 indic: while (true) {
369 if (self.buf[0] == null) {
370 self.pending = .{ .extend_end = indic_offset };
371 return .{
372 .len = @intCast(grapheme_end - indic_offset),
373 .offset = indic_offset,
374 };
375 }
376
377 const codepoint = self.buf[0].?;
378
379 switch (self.data.indic(codepoint.code)) {
380 .Extend, .Linker => {
381 self.advance();
382 continue :indic;
383 },
384 .Consonant => {
385 // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant
386 indic_offset = codepoint.offset;
387 self.advance();
388
389 if (self.buf[0]) |cp1| {
390 state.indic = true;
391
392 if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
393
394 if (!state.indic) {
395 continue :indic;
396 } else {
397 break :indic;
398 }
399 } else {
400 break :indic;
401 }
402 },
403 .none => {
404 // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant
405 self.pending = .{ .extend_end = indic_offset };
406 return .{
407 .len = @intCast(grapheme_end - indic_offset),
408 .offset = indic_offset,
409 };
410 },
411 }
412 }
413 }
414
415 if (!state.xpic) {
416 // BUF: [?Any, ZWJ] Emoji
417 var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
418
419 // Look for previous Emoji
420 emoji: while (true) {
421 if (self.buf[0] == null) {
422 self.pending = .{ .extend_end = emoji_offset };
423 return .{
424 .len = @intCast(grapheme_end - emoji_offset),
425 .offset = emoji_offset,
426 };
427 }
428
429 const codepoint = self.buf[0].?;
430
431 if (self.data.gbp(codepoint.code) == .Extend) {
432 self.advance();
433 continue :emoji;
434 }
435
436 if (self.data.isEmoji(codepoint.code)) {
437 // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)*
438 emoji_offset = codepoint.offset;
439 self.advance();
440
441 if (self.buf[0] != null and
442 // ZWJ = 0x200d
443 self.buf[0].?.code == 0x200d)
444 {
445 // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)*
446 // Back at the beginning of the loop, "recursively" look for emoji
447 self.advance();
448 continue :emoji;
449 } else {
450 // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)*
451 break :emoji;
452 }
453 } else {
454 // BUF: [Any, Extend] (Extend* ZWJ Emoji)*
455 self.pending = .{ .extend_end = emoji_offset };
456 return .{
457 .len = @intCast(grapheme_end - emoji_offset),
458 .offset = emoji_offset,
459 };
460 }
461 }
462 }
463
464 if (state.regional) {
465 var ri_count: usize = 0;
466 while (self.buf[0] != null and
467 self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
468 {
469 ri_count += 1;
470 self.advance();
471 }
472
473 // Use the fact that all RI have length 4 in utf8 encoding
474 // since they are in range 0x1f1e6...0x1f1ff
475 // https://en.wikipedia.org/wiki/UTF-8#Encoding
476 if (ri_count == 0) {
477 // There are no pending RI codepoints
478 } else if (ri_count % 2 == 0) {
479 self.pending = .{ .ri_count = ri_count };
480 return .{ .len = 8, .offset = grapheme_end - 8 };
481 } else {
482 // Add one to count for the unused RI
483 self.pending = .{ .ri_count = ri_count + 1 };
484 return .{ .len = 4, .offset = grapheme_end - 4 };
485 }
486 }
233 } 487 }
234 self.cp_iter = saved_cp_iter;
235 self.buf[0] = s0;
236 self.buf[1] = s1;
237 488
238 return Grapheme{ .len = gc_len, .offset = gc_start }; 489 const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0;
490 self.advance();
491 return .{
492 .len = @intCast(grapheme_end - grapheme_start),
493 .offset = grapheme_start,
494 };
239 } 495 }
240}; 496};
241 497
498/// Grapheme Iterator state.
499pub const IterState = packed struct(u3) {
500 xpic: bool = false,
501 regional: bool = false,
502 indic: bool = false,
503};
504
242// Predicates 505// Predicates
243fn isBreaker(cp: u21, data: *const Graphemes) bool { 506fn isBreaker(cp: u21, data: *const Graphemes) bool {
244 // Extract relevant properties. 507 // Extract relevant properties.
@@ -246,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool {
246 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; 509 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
247} 510}
248 511
249// Grapheme break state.
250pub const State = struct {
251 bits: u3 = 0,
252
253 // Extended Pictographic (emoji)
254 fn hasXpic(self: State) bool {
255 return self.bits & 1 == 1;
256 }
257 fn setXpic(self: *State) void {
258 self.bits |= 1;
259 }
260 fn unsetXpic(self: *State) void {
261 self.bits ^= 1;
262 }
263
264 // Regional Indicatior (flags)
265 fn hasRegional(self: State) bool {
266 return self.bits & 2 == 2;
267 }
268 fn setRegional(self: *State) void {
269 self.bits |= 2;
270 }
271 fn unsetRegional(self: *State) void {
272 self.bits ^= 2;
273 }
274
275 // Indic Conjunct
276 fn hasIndic(self: State) bool {
277 return self.bits & 4 == 4;
278 }
279 fn setIndic(self: *State) void {
280 self.bits |= 4;
281 }
282 fn unsetIndic(self: *State) void {
283 self.bits ^= 4;
284 }
285};
286
287/// `graphemeBreak` returns true only if a grapheme break point is required 512/// `graphemeBreak` returns true only if a grapheme break point is required
288/// between `cp1` and `cp2`. `state` should start out as 0. If calling 513/// between `cp1` and `cp2`. `state` should start out as 0. If calling
289/// iteratively over a sequence of code points, this function must be called 514/// iteratively over a sequence of code points, this function must be called
@@ -294,7 +519,7 @@ pub fn graphemeBreak(
294 cp1: u21, 519 cp1: u21,
295 cp2: u21, 520 cp2: u21,
296 data: *const Graphemes, 521 data: *const Graphemes,
297 state: *State, 522 state: *IterState,
298) bool { 523) bool {
299 // Extract relevant properties. 524 // Extract relevant properties.
300 const cp1_gbp_prop = data.gbp(cp1); 525 const cp1_gbp_prop = data.gbp(cp1);
@@ -306,9 +531,9 @@ pub fn graphemeBreak(
306 const cp2_is_emoji = data.isEmoji(cp2); 531 const cp2_is_emoji = data.isEmoji(cp2);
307 532
308 // GB11: Emoji Extend* ZWJ x Emoji 533 // GB11: Emoji Extend* ZWJ x Emoji
309 if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); 534 if (!state.xpic and cp1_is_emoji) state.xpic = true;
310 // GB9c: Indic Conjunct Break 535 // GB9c: Indic Conjunct Break
311 if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); 536 if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true;
312 537
313 // GB3: CR x LF 538 // GB3: CR x LF
314 if (cp1 == '\r' and cp2 == '\n') return false; 539 if (cp1 == '\r' and cp2 == '\n') return false;
@@ -317,11 +542,11 @@ pub fn graphemeBreak(
317 if (isBreaker(cp1, data)) return true; 542 if (isBreaker(cp1, data)) return true;
318 543
319 // GB11: Emoji Extend* ZWJ x Emoji 544 // GB11: Emoji Extend* ZWJ x Emoji
320 if (state.hasXpic() and 545 if (state.xpic and
321 cp1_gbp_prop == .ZWJ and 546 cp1_gbp_prop == .ZWJ and
322 cp2_is_emoji) 547 cp2_is_emoji)
323 { 548 {
324 state.unsetXpic(); 549 state.xpic = false;
325 return false; 550 return false;
326 } 551 }
327 552
@@ -336,11 +561,11 @@ pub fn graphemeBreak(
336 561
337 // GB12, GB13: RI x RI 562 // GB12, GB13: RI x RI
338 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { 563 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
339 if (state.hasRegional()) { 564 if (state.regional) {
340 state.unsetRegional(); 565 state.regional = false;
341 return true; 566 return true;
342 } else { 567 } else {
343 state.setRegional(); 568 state.regional = true;
344 return false; 569 return false;
345 } 570 }
346 } 571 }
@@ -365,25 +590,25 @@ pub fn graphemeBreak(
365 } 590 }
366 591
367 // GB9c: Indic Conjunct Break 592 // GB9c: Indic Conjunct Break
368 if (state.hasIndic() and 593 if (state.indic and
369 cp1_indic_prop == .Consonant and 594 cp1_indic_prop == .Consonant and
370 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) 595 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
371 { 596 {
372 return false; 597 return false;
373 } 598 }
374 599
375 if (state.hasIndic() and 600 if (state.indic and
376 cp1_indic_prop == .Extend and 601 cp1_indic_prop == .Extend and
377 cp2_indic_prop == .Linker) 602 cp2_indic_prop == .Linker)
378 { 603 {
379 return false; 604 return false;
380 } 605 }
381 606
382 if (state.hasIndic() and 607 if (state.indic and
383 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and 608 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
384 cp2_indic_prop == .Consonant) 609 cp2_indic_prop == .Consonant)
385 { 610 {
386 state.unsetIndic(); 611 state.indic = false;
387 return false; 612 return false;
388 } 613 }
389 614
@@ -421,3 +646,39 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
421 try std.testing.expectEqual(@as(usize, 2), i); 646 try std.testing.expectEqual(@as(usize, 2), i);
422 } 647 }
423} 648}
649
650test "Iterator.peek" {
651 const peek_seq = "aΔ👨🏻‍🌾→";
652 const data = try Graphemes.init(std.testing.allocator);
653 defer data.deinit(std.testing.allocator);
654
655 var iter = data.iterator(peek_seq);
656 const peek_a = iter.peek().?;
657 const next_a = iter.next().?;
658 try std.testing.expectEqual(peek_a, next_a);
659 try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
660 const peek_d1 = iter.peek().?;
661 const peek_d2 = iter.peek().?;
662 try std.testing.expectEqual(peek_d1, peek_d2);
663 const next_d = iter.next().?;
664 try std.testing.expectEqual(peek_d2, next_d);
665 try std.testing.expectEqual(iter.peek(), iter.next());
666 try std.testing.expectEqual(iter.peek(), iter.next());
667 try std.testing.expectEqual(null, iter.peek());
668 try std.testing.expectEqual(null, iter.peek());
669 try std.testing.expectEqual(iter.peek(), iter.next());
670}
671
672const std = @import("std");
673const builtin = @import("builtin");
674const assert = std.debug.assert;
675const mem = std.mem;
676const Allocator = mem.Allocator;
677const compress = std.compress;
678const unicode = std.unicode;
679
680const code_point = @import("code_point");
681const CodePoint = code_point.CodePoint;
682const CodePointIterator = code_point.Iterator;
683const CodePointReverseIterator = code_point.ReverseIterator;
684const uoffset = code_point.uoffset;
diff --git a/src/Words.zig b/src/Words.zig
new file mode 100644
index 0000000..617c34d
--- /dev/null
+++ b/src/Words.zig
@@ -0,0 +1,773 @@
1//! Word Breaking Algorithm.
2//!
3//! https://www.unicode.org/reports/tr29/#Word_Boundaries
4//!
5
6const WordBreakProperty = enum(u5) {
7 none,
8 Double_Quote,
9 Single_Quote,
10 Hebrew_Letter,
11 CR,
12 LF,
13 Newline,
14 Extend,
15 Regional_Indicator,
16 Format,
17 Katakana,
18 ALetter,
19 MidLetter,
20 MidNum,
21 MidNumLet,
22 Numeric,
23 ExtendNumLet,
24 ZWJ,
25 WSegSpace,
26};
27
28s1: []u16 = undefined,
29s2: []u5 = undefined,
30
31const Words = @This();
32
33pub fn init(allocator: Allocator) Allocator.Error!Words {
34 var wb: Words = undefined;
35 try wb.setup(allocator);
36 return wb;
37}
38
39pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void {
40 wb.setupImpl(allocator) catch |err| {
41 switch (err) {
42 error.OutOfMemory => |e| return e,
43 else => unreachable,
44 }
45 };
46}
47
48pub fn deinit(words: *const Words, allocator: mem.Allocator) void {
49 allocator.free(words.s1);
50 allocator.free(words.s2);
51}
52
53/// Represents a Unicode word span, as an offset into the source string
54/// and the length of the word.
55pub const Word = struct {
56 offset: uoffset,
57 len: uoffset,
58
59 /// Returns a slice of the word given the source string.
60 pub fn bytes(word: Word, src: []const u8) []const u8 {
61 return src[word.offset..][0..word.len];
62 }
63};
64
65/// Returns the word break property type for `cp`.
66pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty {
67 return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]);
68}
69
70/// Convenience function for working with CodePoints
71fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty {
72 return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]);
73}
74
75/// Returns the Word at the given index. Asserts that the index is less than
76/// `string.len`, and that the string is not empty. Always returns a word.
77/// The index does not have to be the start of a codepoint in the word.
78pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word {
79 assert(index < string.len and string.len > 0);
80 var iter_back: ReverseIterator = reverseFromIndex(words, string, index);
81 const first_back = iter_back.prev();
82 if (first_back) |back| {
83 if (back.offset == 0) {
84 var iter_fwd = words.iterator(string);
85 while (iter_fwd.next()) |word| {
86 if (word.offset <= index and index < word.offset + word.len)
87 return word;
88 }
89 }
90 } else {
91 var iter_fwd = words.iterator(string);
92 while (iter_fwd.next()) |word| {
93 if (word.offset <= index and index < word.offset + word.len)
94 return word;
95 }
96 }
97 _ = iter_back.prev();
98 // There's sometimes flags:
99 if (iter_back.flags > 0) {
100 while (iter_back.flags > 0) {
101 if (iter_back.prev()) |_| {
102 continue;
103 } else {
104 break;
105 }
106 }
107 }
108 var iter_fwd = iter_back.forwardIterator();
109 while (iter_fwd.next()) |word| {
110 if (word.offset <= index and index < word.offset + word.len)
111 return word;
112 }
113 unreachable;
114}
115
116/// Returns an iterator over words in `slice`.
117pub fn iterator(words: *const Words, slice: []const u8) Iterator {
118 return Iterator.init(words, slice);
119}
120
121/// Returns a reverse iterator over the words in `slice`.
122pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator {
123 return ReverseIterator.init(words, slice);
124}
125
126/// Returns an iterator after the `word` in `slice`.
127pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator {
128 return forwardFromIndex(words, slice, word.offset + word.len);
129}
130
131/// Returns a reverse iterator before the `word` in `slice`.
132pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator {
133 return reverseFromIndex(words, slice, word.offset);
134}
135
136/// An iterator, forward, over all words in a provided string.
137pub const Iterator = struct {
138 this: ?CodePoint = null,
139 that: ?CodePoint = null,
140 cp_iter: CodepointIterator,
141 wb: *const Words,
142
143 /// Assumes `str` is valid UTF-8.
144 pub fn init(words: *const Words, str: []const u8) Iterator {
145 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words };
146 wb_iter.advance();
147 return wb_iter;
148 }
149
150 /// Returns the next word segment, without advancing.
151 pub fn peek(iter: *Iterator) ?Word {
152 const cache = .{ iter.this, iter.that, iter.cp_iter };
153 defer {
154 iter.this, iter.that, iter.cp_iter = cache;
155 }
156 return iter.next();
157 }
158
159 /// Returns a reverse iterator from the point this iterator is paused
160 /// at. Usually, and always when using the API to create iterators,
161 /// calling `prev()` will return the word just seen.
162 pub fn reverseIterator(iter: *Iterator) ReverseIterator {
163 var cp_it = iter.cp_iter.reverseIterator();
164 if (iter.that) |_|
165 _ = cp_it.prev();
166 if (iter.cp_iter.peek()) |_|
167 _ = cp_it.prev();
168 return .{
169 .wb = iter.wb,
170 .before = cp_it.prev(),
171 .after = iter.that,
172 .cp_iter = cp_it,
173 };
174 }
175
176 /// Returns the next word segment, if any.
177 pub fn next(iter: *Iterator) ?Word {
178 iter.advance();
179
180 // Done?
181 if (iter.this == null) return null;
182 // Last?
183 if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
184
185 const word_start = iter.this.?.offset;
186 var word_len: uoffset = 0;
187
188 // State variables.
189 var last_p: WordBreakProperty = .none;
190 var last_last_p: WordBreakProperty = .none;
191 var ri_count: usize = 0;
192
193 scan: while (true) : (iter.advance()) {
194 const this = iter.this.?;
195 word_len += this.len;
196 if (iter.that) |that| {
197 const this_p = iter.wb.breakProp(this);
198 const that_p = iter.wb.breakProp(that);
199 if (!isIgnorable(this_p)) {
200 last_last_p = last_p;
201 last_p = this_p;
202 }
203 // WB3 CR × LF
204 if (this_p == .CR and that_p == .LF) continue :scan;
205 // WB3a (Newline | CR | LF) ÷
206 if (isNewline(this_p)) break :scan;
207 // WB3b ÷ (Newline | CR | LF)
208 if (isNewline(that_p)) break :scan;
209 // WB3c ZWJ × \p{Extended_Pictographic}
210 if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
211 continue :scan;
212 }
213 // WB3d WSegSpace × WSegSpace
214 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
215 // WB4 X (Extend | Format | ZWJ)* → X
216 if (isIgnorable(that_p)) {
217 continue :scan;
218 } // Now we use last_p instead of this_p for ignorable's sake
219 if (isAHLetter(last_p)) {
220 // WB5 AHLetter × AHLetter
221 if (isAHLetter(that_p)) continue :scan;
222 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
223 if (isMidVal(that_p)) {
224 const next_val = iter.peekPast();
225 if (next_val) |next_cp| {
226 const next_p = iter.wb.breakProp(next_cp);
227 if (isAHLetter(next_p)) {
228 continue :scan;
229 }
230 }
231 }
232 }
233 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
234 if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) {
235 continue :scan;
236 }
237 if (last_p == .Hebrew_Letter) {
238 // WB7a Hebrew_Letter × Single_Quote
239 if (that_p == .Single_Quote) continue :scan;
240 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
241 if (that_p == .Double_Quote) {
242 const next_val = iter.peekPast();
243 if (next_val) |next_cp| {
244 const next_p = iter.wb.breakProp(next_cp);
245 if (next_p == .Hebrew_Letter) {
246 continue :scan;
247 }
248 }
249 }
250 }
251 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
252 if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter)
253 continue :scan;
254 // WB8 Numeric × Numeric
255 if (last_p == .Numeric and that_p == .Numeric) continue :scan;
256 // WB9 AHLetter × Numeric
257 if (isAHLetter(last_p) and that_p == .Numeric) continue :scan;
258 // WB10 Numeric × AHLetter
259 if (last_p == .Numeric and isAHLetter(that_p)) continue :scan;
260 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
261 if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric)
262 continue :scan;
263 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
264 if (last_p == .Numeric and isMidNum(that_p)) {
265 const next_val = iter.peekPast();
266 if (next_val) |next_cp| {
267 const next_p = iter.wb.breakProp(next_cp);
268 if (next_p == .Numeric) {
269 continue :scan;
270 }
271 }
272 }
273 // WB13 Katakana × Katakana
274 if (last_p == .Katakana and that_p == .Katakana) continue :scan;
275 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
276 if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan;
277 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
278 if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
279 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
280 const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator;
281 if (maybe_flag) {
282 ri_count += 1;
283 if (ri_count % 2 == 1) continue :scan;
284 }
285 // WB999 Any ÷ Any
286 break :scan;
287 } else { // iter.that == null
288 break :scan;
289 }
290 }
291
292 return Word{ .len = word_len, .offset = word_start };
293 }
294
295 pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
296 try writer.print(
297 "Iterator {{ .this = {any}, .that = {any} }}",
298 .{ iter.this, iter.that },
299 );
300 }
301
302 fn advance(iter: *Iterator) void {
303 iter.this = iter.that;
304 iter.that = iter.cp_iter.next();
305 }
306
307 fn peekPast(iter: *Iterator) ?CodePoint {
308 const save_cp = iter.cp_iter;
309 defer iter.cp_iter = save_cp;
310 while (iter.cp_iter.peek()) |peeked| {
311 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
312 _ = iter.cp_iter.next();
313 }
314 return null;
315 }
316};
317
318/// An iterator, backward, over all words in a provided string.
319pub const ReverseIterator = struct {
320 after: ?CodePoint = null,
321 before: ?CodePoint = null,
322 cp_iter: ReverseCodepointIterator,
323 wb: *const Words,
324 flags: usize = 0,
325
326 /// Assumes `str` is valid UTF-8.
327 pub fn init(words: *const Words, str: []const u8) ReverseIterator {
328 var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words };
329 wb_iter.advance();
330 return wb_iter;
331 }
332
333 /// Returns the previous word segment, if any, without advancing.
334 pub fn peek(iter: *ReverseIterator) ?Word {
335 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
336 defer {
337 iter.before, iter.after, iter.cp_iter, iter.flags = cache;
338 }
339 return iter.prev();
340 }
341
342 /// Return a forward iterator from where this iterator paused. Usually,
343 /// and always when using the API to create iterators, calling `next()`
344 /// will return the word just seen.
345 pub fn forwardIterator(iter: *ReverseIterator) Iterator {
346 var cp_it = iter.cp_iter.forwardIterator();
347 if (iter.before) |_|
348 _ = cp_it.next();
349 return .{
350 .wb = iter.wb,
351 .this = cp_it.next(),
352 .that = iter.after,
353 .cp_iter = cp_it,
354 };
355 }
356
357 /// Return the previous word, if any.
358 pub fn prev(iter: *ReverseIterator) ?Word {
359 iter.advance();
360
361 // Done?
362 if (iter.after == null) return null;
363 // Last?
364 if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };
365
366 const word_end = iter.after.?.offset + iter.after.?.len;
367 var word_len: uoffset = 0;
368
369 // State variables.
370 var last_p: WordBreakProperty = .none;
371 var last_last_p: WordBreakProperty = .none;
372
373 scan: while (true) : (iter.advance()) {
374 const after = iter.after.?;
375 word_len += after.len;
376 if (iter.before) |before| {
377 var sneak = sneaky(iter); // 'sneaks' past ignorables
378 const after_p = iter.wb.breakProp(after);
379 var before_p = iter.wb.breakProp(before);
380 if (!isIgnorable(after_p)) {
381 last_last_p = last_p;
382 last_p = after_p;
383 }
384 // WB3 CR × LF
385 if (before_p == .CR and after_p == .LF) continue :scan;
386 // WB3a (Newline | CR | LF) ÷
387 if (isNewline(before_p)) break :scan;
388 // WB3b ÷ (Newline | CR | LF)
389 if (isNewline(after_p)) break :scan;
390 // WB3c ZWJ × \p{Extended_Pictographic}
391 if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) {
392 continue :scan;
393 }
394 // WB3d WSegSpace × WSegSpace
395 if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
396 // WB4 X (Extend | Format | ZWJ)* → X
397 if (isIgnorable(before_p)) {
398 const maybe_before = sneak.prev();
399 if (maybe_before) |valid_before| {
400 before_p = iter.wb.breakProp(valid_before);
401 } else if (!isIgnorable(after_p)) {
402 // We're done
403 break :scan;
404 }
405 }
406 if (isIgnorable(after_p)) continue :scan;
407 // WB5 AHLetter × AHLetter
408 if (isAHLetter(last_p) and isAHLetter(before_p)) {
409 continue :scan;
410 }
411 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
412 if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) {
413 continue :scan;
414 }
415 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
416 if (isMidVal(before_p) and isAHLetter(last_p)) {
417 const prev_val = sneak.peek();
418 if (prev_val) |prev_cp| {
419 const prev_p = iter.wb.breakProp(prev_cp);
420 if (isAHLetter(prev_p)) {
421 continue :scan;
422 }
423 }
424 }
425 // WB7a Hebrew_Letter × Single_Quote
426 if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan;
427 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
428 if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) {
429 continue :scan;
430 }
431 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
432 if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
433 const prev_val = sneak.peek();
434 if (prev_val) |prev_cp| {
435 const prev_p = iter.wb.breakProp(prev_cp);
436 if (prev_p == .Hebrew_Letter) {
437 continue :scan;
438 }
439 }
440 }
441 // WB8 Numeric × Numeric
442 if (before_p == .Numeric and last_p == .Numeric) continue :scan;
443 // WB9 AHLetter × Numeric
444 if (isAHLetter(before_p) and last_p == .Numeric) continue :scan;
445 // WB10 Numeric × AHLetter
446 if (before_p == .Numeric and isAHLetter(last_p)) continue :scan;
447 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
448 if (isMidNum(before_p) and last_p == .Numeric) {
449 const prev_val = sneak.peek();
450 if (prev_val) |prev_cp| {
451 const prev_p = iter.wb.breakProp(prev_cp);
452 if (prev_p == .Numeric) {
453 continue :scan;
454 }
455 }
456 }
457 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
458 if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) {
459 continue :scan;
460 }
461 // WB13 Katakana × Katakana
462 if (before_p == .Katakana and last_p == .Katakana) continue :scan;
463 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
464 if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan;
465 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
466 if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan;
467 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
468 // NOTE:
469 // So here we simply have to know whether a run of flags is even or odd.
470 // The whole run. To avoid quadratic behavior (and long flag runs are
471 // actually a thing in the wild), we have to count them once, store that
472 // on the iterator, and decrement each time we see two, possibly breaking
473 // once extra at the beginning. They break up one per flag, once we hit
474 // zero, that's all the flags. If we see another flag we do it again.
475 if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) {
476 defer {
477 if (iter.flags > 0) iter.flags -= 1;
478 }
479 if (iter.flags == 0) {
480 iter.flags = sneak.countFlags();
481 }
482 if (iter.flags % 2 == 0) {
483 continue :scan;
484 }
485 }
486 // WB999 Any ÷ Any
487 break :scan;
488 }
489 break :scan;
490 }
491 return Word{ .len = word_len, .offset = word_end - word_len };
492 }
493
494 pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
495 try writer.print(
496 "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}",
497 .{ iter.before, iter.after, iter.flags },
498 );
499 }
500
501 fn peekPast(iter: *ReverseIterator) ?CodePoint {
502 const save_cp = iter.cp_iter;
503 defer iter.cp_iter = save_cp;
504 while (iter.cp_iter.peek()) |peeked| {
505 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
506 _ = iter.cp_iter.prev();
507 }
508 return null;
509 }
510
511 fn advance(iter: *ReverseIterator) void {
512 iter.after = iter.before;
513 iter.before = iter.cp_iter.prev();
514 }
515};
516
517//| Implementation Details
518
519/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
520fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator {
521 var idx: uoffset = @intCast(index);
522 // Find the next lead byte:
523 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
524 if (idx == string.len) return words.reverseIterator(string);
525 var iter: ReverseIterator = undefined;
526 iter.wb = words;
527 iter.flags = 0;
528 // We need to populate the CodePoints, and the codepoint iterator.
529 // Consider "abc| def" with the cursor as |.
530 // We need `before` to be `c` and `after` to be ' ',
531 // and `cp_iter.prev()` to be `b`.
532 var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
533 iter.after = cp_iter.prev();
534 iter.before = cp_iter.prev();
535 iter.cp_iter = cp_iter;
536 return iter;
537}
538
539fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator {
540 var idx: uoffset = @intCast(index);
541 if (idx == string.len) {
542 return .{
543 .cp_iter = .{ .bytes = string, .i = idx },
544 .this = null,
545 .that = null,
546 .wb = words,
547 };
548 }
549 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
550 if (idx == 0) return words.iterator(string);
551 var iter: Iterator = undefined;
552 iter.wb = words;
553 // We need to populate the CodePoints, and the codepoint iterator.
554 // Consider "abc |def" with the cursor as |.
555 // We need `this` to be ` ` and `that` to be 'd',
556 // and `cp_iter.next()` to be `d`.
557 idx -= 1;
558 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
559 // "abc| def"
560 var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx };
561 iter.this = cp_iter.next();
562 iter.that = cp_iter.next();
563 iter.cp_iter = cp_iter;
564 return iter;
565}
566
567fn sneaky(iter: *const ReverseIterator) SneakIterator {
568 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
569}
570
571const SneakIterator = struct {
572 cp_iter: ReverseCodepointIterator,
573 wb: *const Words,
574
575 fn peek(iter: *SneakIterator) ?CodePoint {
576 const save_cp = iter.cp_iter;
577 defer iter.cp_iter = save_cp;
578 while (iter.cp_iter.peek()) |peeked| {
579 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
580 _ = iter.cp_iter.prev();
581 }
582 return null;
583 }
584
585 fn countFlags(iter: *SneakIterator) usize {
586 var flags: usize = 0;
587 const save_cp = iter.cp_iter;
588 defer iter.cp_iter = save_cp;
589 while (iter.cp_iter.prev()) |cp| {
590 const prop = iter.wb.breakProp(cp);
591 if (isIgnorable(prop)) continue;
592 if (prop == .Regional_Indicator) {
593 flags += 1;
594 } else break;
595 }
596 return flags;
597 }
598
599 fn prev(iter: *SneakIterator) ?CodePoint {
600 while (iter.cp_iter.prev()) |peeked| {
601 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
602 }
603 return null;
604 }
605};
606
607inline fn setupImpl(wb: *Words, allocator: Allocator) !void {
608 const decompressor = compress.flate.inflate.decompressor;
609 const in_bytes = @embedFile("wbp");
610 var in_fbs = std.io.fixedBufferStream(in_bytes);
611 var in_decomp = decompressor(.raw, in_fbs.reader());
612 var reader = in_decomp.reader();
613
614 const endian = builtin.cpu.arch.endian();
615
616 const stage_1_len: u16 = try reader.readInt(u16, endian);
617 wb.s1 = try allocator.alloc(u16, stage_1_len);
618 errdefer allocator.free(wb.s1);
619 for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
620
621 const stage_2_len: u16 = try reader.readInt(u16, endian);
622 wb.s2 = try allocator.alloc(u5, stage_2_len);
623 errdefer allocator.free(wb.s2);
624 for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
625 var count_0: usize = 0;
626 for (wb.s2) |nyb| {
627 if (nyb == 0) count_0 += 1;
628 }
629}
630
631//| Predicates
632
633inline fn isNewline(wbp: WordBreakProperty) bool {
634 return wbp == .CR or wbp == .LF or wbp == .Newline;
635}
636
637inline fn isIgnorable(wbp: WordBreakProperty) bool {
638 return switch (wbp) {
639 .Format, .Extend, .ZWJ => true,
640 else => false,
641 };
642}
643
644inline fn isAHLetter(wbp: WordBreakProperty) bool {
645 return wbp == .ALetter or wbp == .Hebrew_Letter;
646}
647
648inline fn isMidVal(wbp: WordBreakProperty) bool {
649 return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
650}
651
652inline fn isMidNum(wbp: WordBreakProperty) bool {
653 return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
654}
655
656inline fn isExtensible(wbp: WordBreakProperty) bool {
657 return switch (wbp) {
658 .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
659 else => false,
660 };
661}
662
663test "Word Break Properties" {
664 const wb = try Words.init(testing.allocator);
665 defer wb.deinit(testing.allocator);
666 try testing.expectEqual(.CR, wb.breakProperty('\r'));
667 try testing.expectEqual(.LF, wb.breakProperty('\n'));
668 try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש'));
669 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
670}
671
672test "ext_pict" {
673 try testing.expect(ext_pict.isMatch("👇"));
674 try testing.expect(ext_pict.isMatch("\u{2701}"));
675}
676
677test "Words" {
678 const wb = try Words.init(testing.allocator);
679 defer wb.deinit(testing.allocator);
680 const word_str = "Metonym Μετωνύμιο メトニム";
681 var w_iter = wb.iterator(word_str);
682 try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str));
683 // Spaces are "words" too!
684 try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str));
685 const in_greek = w_iter.next().?;
686 for (in_greek.offset..in_greek.offset + in_greek.len) |i| {
687 const at_index = wb.wordAtIndex(word_str, i).bytes(word_str);
688 try testing.expectEqualStrings("Μετωνύμιο", at_index);
689 }
690 _ = w_iter.next();
691 try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str));
692}
693
694test wordAtIndex {
695 const wb = try Words.init(testing.allocator);
696 defer wb.deinit(testing.allocator);
697 const t_string = "first second third";
698 const second = wb.wordAtIndex(t_string, 8);
699 try testing.expectEqualStrings("second", second.bytes(t_string));
700 const third = wb.wordAtIndex(t_string, 14);
701 try testing.expectEqualStrings("third", third.bytes(t_string));
702 {
703 const first = wb.wordAtIndex(t_string, 3);
704 try testing.expectEqualStrings("first", first.bytes(t_string));
705 }
706 {
707 const first = wb.wordAtIndex(t_string, 0);
708 try testing.expectEqualStrings("first", first.bytes(t_string));
709 }
710 const last = wb.wordAtIndex(t_string, 14);
711 try testing.expectEqualStrings("third", last.bytes(t_string));
712}
713
714const testr = "don't a:ka fin!";
715
716test "reversal" {
717 const wb = try Words.init(testing.allocator);
718 defer wb.deinit(testing.allocator);
719 {
720 var fwd = wb.iterator(testr);
721 var this_word: ?Word = fwd.next();
722
723 while (this_word) |this| : (this_word = fwd.next()) {
724 var back = fwd.reverseIterator();
725 const that_word = back.prev();
726 if (that_word) |that| {
727 try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
728 } else {
729 try testing.expect(false);
730 }
731 }
732 }
733 {
734 var back = wb.reverseIterator(testr);
735 var this_word: ?Word = back.prev();
736
737 while (this_word) |this| : (this_word = back.prev()) {
738 var fwd = back.forwardIterator();
739 const that_word = fwd.next();
740 if (that_word) |that| {
741 try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
742 } else {
743 try testing.expect(false);
744 }
745 }
746 }
747}
748
749fn testAllocations(allocator: Allocator) !void {
750 const wb = try Words.init(allocator);
751 wb.deinit(allocator);
752}
753
754test "allocation safety" {
755 try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{});
756}
757
758const std = @import("std");
759const builtin = @import("builtin");
760const compress = std.compress;
761const mem = std.mem;
762const Allocator = mem.Allocator;
763const assert = std.debug.assert;
764const testing = std.testing;
765
766const uoffset = code_point.uoffset;
767
768const code_point = @import("code_point");
769const CodepointIterator = code_point.Iterator;
770const ReverseCodepointIterator = code_point.ReverseIterator;
771const CodePoint = code_point.CodePoint;
772
773const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;
diff --git a/src/code_point.zig b/src/code_point.zig
index fe7ad6e..7a638af 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -4,18 +4,33 @@
4//! Represents invalid data according to the Replacement of Maximal 4//! Represents invalid data according to the Replacement of Maximal
5//! Subparts algorithm. 5//! Subparts algorithm.
6 6
7pub const uoffset = if (@import("config").fat_offset) u64 else u32;
8
7/// `CodePoint` represents a Unicode code point by its code, 9/// `CodePoint` represents a Unicode code point by its code,
8/// length, and offset in the source bytes. 10/// length, and offset in the source bytes.
9pub const CodePoint = struct { 11pub const CodePoint = struct {
10 code: u21, 12 code: u21,
11 len: u3, 13 len: u3,
12 offset: u32, 14 offset: uoffset,
15
16 /// Return the slice of this codepoint, given the original string.
17 pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 {
18 return str[cp.offset..][0..cp.len];
19 }
20
21 pub fn format(cp: CodePoint, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
22 try writer.print("CodePoint '{u}' .{{ ", .{cp.code});
23 try writer.print(
24 ".code = 0x{x}, .offset = {d}, .len = {d} }}",
25 .{ cp.code, cp.offset, cp.len },
26 );
27 }
13}; 28};
14 29
15/// This function is deprecated and will be removed in a later release. 30/// This function is deprecated and will be removed in a later release.
16/// Use `decodeAtIndex` or `decodeAtCursor`. 31/// Use `decodeAtIndex` or `decodeAtCursor`.
17pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { 32pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
18 var off: u32 = 0; 33 var off: uoffset = 0;
19 var maybe_code = decodeAtCursor(bytes, &off); 34 var maybe_code = decodeAtCursor(bytes, &off);
20 if (maybe_code) |*code| { 35 if (maybe_code) |*code| {
21 code.offset = offset; 36 code.offset = offset;
@@ -24,15 +39,23 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
24 return null; 39 return null;
25} 40}
26 41
42/// Return the codepoint at `index`, even if `index` is in the middle
43/// of that codepoint.
44pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint {
45 var idx = index;
46 while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {}
47 return decodeAtIndex(bytes, idx);
48}
49
27/// Decode the CodePoint, if any, at `bytes[idx]`. 50/// Decode the CodePoint, if any, at `bytes[idx]`.
28pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint { 51pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint {
29 var off = idx; 52 var off = index;
30 return decodeAtCursor(bytes, &off); 53 return decodeAtCursor(bytes, &off);
31} 54}
32 55
33/// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the 56/// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the
34/// cursor will point at the next potential codepoint index. 57/// cursor will point at the next potential codepoint index.
35pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { 58pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint {
36 // EOS 59 // EOS
37 if (cursor.* >= bytes.len) return null; 60 if (cursor.* >= bytes.len) return null;
38 61
@@ -98,6 +121,9 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
98 } 121 }
99 if (st == RUNE_REJECT or cursor.* == bytes.len) { 122 if (st == RUNE_REJECT or cursor.* == bytes.len) {
100 @branchHint(.cold); 123 @branchHint(.cold);
124 // This, and the branch below, detect truncation, the
125 // only invalid state handled differently by the Maximal
126 // Subparts algorithm.
101 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { 127 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
102 cursor.* -= 2; // +1 128 cursor.* -= 2; // +1
103 return .{ 129 return .{
@@ -148,7 +174,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
148/// `Iterator` iterates a string one `CodePoint` at-a-time. 174/// `Iterator` iterates a string one `CodePoint` at-a-time.
149pub const Iterator = struct { 175pub const Iterator = struct {
150 bytes: []const u8, 176 bytes: []const u8,
151 i: u32 = 0, 177 i: uoffset = 0,
152 178
153 pub fn init(bytes: []const u8) Iterator { 179 pub fn init(bytes: []const u8) Iterator {
154 return .{ .bytes = bytes, .i = 0 }; 180 return .{ .bytes = bytes, .i = 0 };
@@ -158,10 +184,19 @@ pub const Iterator = struct {
158 return decodeAtCursor(self.bytes, &self.i); 184 return decodeAtCursor(self.bytes, &self.i);
159 } 185 }
160 186
161 pub fn peek(self: *Iterator) ?CodePoint { 187 pub fn peek(iter: *Iterator) ?CodePoint {
162 const saved_i = self.i; 188 const saved_i = iter.i;
163 defer self.i = saved_i; 189 defer iter.i = saved_i;
164 return self.next(); 190 return iter.next();
191 }
192
193 /// Create a backward iterator at this point. It will repeat
194 /// the last CodePoint seen.
195 pub fn reverseIterator(iter: *const Iterator) ReverseIterator {
196 if (iter.i == iter.bytes.len) {
197 return .init(iter.bytes);
198 }
199 return .{ .i = iter.i, .bytes = iter.bytes };
165 } 200 }
166}; 201};
167 202
@@ -233,6 +268,55 @@ const class_mask: [12]u8 = .{
233 0, 268 0,
234}; 269};
235 270
271pub const ReverseIterator = struct {
272 bytes: []const u8,
273 i: ?uoffset,
274
275 pub fn init(str: []const u8) ReverseIterator {
276 var r_iter: ReverseIterator = undefined;
277 r_iter.bytes = str;
278 r_iter.i = if (str.len == 0) 0 else @intCast(str.len - 1);
279 return r_iter;
280 }
281
282 pub fn prev(iter: *ReverseIterator) ?CodePoint {
283 if (iter.i == null) return null;
284 var i_prev = iter.i.?;
285
286 while (i_prev > 0) : (i_prev -= 1) {
287 if (!followbyte(iter.bytes[i_prev])) break;
288 }
289
290 if (i_prev > 0)
291 iter.i = i_prev - 1
292 else
293 iter.i = null;
294
295 return decode(iter.bytes[i_prev..], i_prev);
296 }
297
298 pub fn peek(iter: *ReverseIterator) ?CodePoint {
299 const saved_i = iter.i;
300 defer iter.i = saved_i;
301 return iter.prev();
302 }
303
304 /// Create a forward iterator at this point. It will repeat the
305 /// last CodePoint seen.
306 pub fn forwardIterator(iter: *const ReverseIterator) Iterator {
307 if (iter.i) |i| {
308 var fwd: Iterator = .{ .i = i, .bytes = iter.bytes };
309 _ = fwd.next();
310 return fwd;
311 }
312 return .{ .i = 0, .bytes = iter.bytes };
313 }
314};
315
316inline fn followbyte(b: u8) bool {
317 return 0x80 <= b and b <= 0xbf;
318}
319
236test "decode" { 320test "decode" {
237 const bytes = "🌩️"; 321 const bytes = "🌩️";
238 const res = decode(bytes, 0); 322 const res = decode(bytes, 0);
@@ -246,7 +330,7 @@ test "decode" {
246 } 330 }
247} 331}
248 332
249test "peek" { 333test Iterator {
250 var iter = Iterator{ .bytes = "Hi" }; 334 var iter = Iterator{ .bytes = "Hi" };
251 335
252 try expectEqual(@as(u21, 'H'), iter.next().?.code); 336 try expectEqual(@as(u21, 'H'), iter.next().?.code);
@@ -256,6 +340,54 @@ test "peek" {
256 try expectEqual(@as(?CodePoint, null), iter.next()); 340 try expectEqual(@as(?CodePoint, null), iter.next());
257} 341}
258 342
343const code_point = @This();
344
345// Keep this in sync with the README
346test "Code point iterator" {
347 const str = "Hi 😊";
348 var iter: code_point.Iterator = .init(str);
349 var i: usize = 0;
350
351 while (iter.next()) |cp| : (i += 1) {
352 // The `code` field is the actual code point scalar as a `u21`.
353 if (i == 0) try expect(cp.code == 'H');
354 if (i == 1) try expect(cp.code == 'i');
355 if (i == 2) try expect(cp.code == ' ');
356
357 if (i == 3) {
358 try expect(cp.code == '😊');
359 // The `offset` field is the byte offset in the
360 // source string.
361 try expect(cp.offset == 3);
362 try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?);
363 // The `len` field is the length in bytes of the
364 // code point in the source string.
365 try expect(cp.len == 4);
366 // There is also a 'cursor' decode, like so:
367 {
368 var cursor = cp.offset;
369 try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?);
370 // Which advances the cursor variable to the next possible
371 // offset, in this case, `str.len`. Don't forget to account
372 // for this possibility!
373 try expectEqual(cp.offset + cp.len, cursor);
374 }
375 // There's also this, for when you aren't sure if you have the
376 // correct start for a code point:
377 try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?);
378 }
379 // Reverse iteration is also an option:
380 var r_iter: code_point.ReverseIterator = .init(str);
381 // Both iterators can be peeked:
382 try expectEqual('😊', r_iter.peek().?.code);
383 try expectEqual('😊', r_iter.prev().?.code);
384 // Both kinds of iterators can be reversed:
385 var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator();
386 // This will always return the last codepoint from
387 // the prior iterator, _if_ it yielded one:
388 try expectEqual('😊', fwd_iter.next().?.code);
389 }
390}
259test "overlongs" { 391test "overlongs" {
260 // None of these should equal `/`, all should be byte-for-byte 392 // None of these should equal `/`, all should be byte-for-byte
261 // handled as replacement characters. 393 // handled as replacement characters.
@@ -346,6 +478,50 @@ test "truncation" {
346 } 478 }
347} 479}
348 480
481test ReverseIterator {
482 {
483 var r_iter: ReverseIterator = .init("ABC");
484 try testing.expectEqual(@as(u21, 'C'), r_iter.prev().?.code);
485 try testing.expectEqual(@as(u21, 'B'), r_iter.peek().?.code);
486 try testing.expectEqual(@as(u21, 'B'), r_iter.prev().?.code);
487 try testing.expectEqual(@as(u21, 'A'), r_iter.prev().?.code);
488 try testing.expectEqual(@as(?CodePoint, null), r_iter.peek());
489 try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
490 try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
491 }
492 {
493 var r_iter: ReverseIterator = .init("∅δq🦾ă");
494 try testing.expectEqual(@as(u21, 'ă'), r_iter.prev().?.code);
495 try testing.expectEqual(@as(u21, '🦾'), r_iter.prev().?.code);
496 try testing.expectEqual(@as(u21, 'q'), r_iter.prev().?.code);
497 try testing.expectEqual(@as(u21, 'δ'), r_iter.peek().?.code);
498 try testing.expectEqual(@as(u21, 'δ'), r_iter.prev().?.code);
499 try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code);
500 try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code);
501 try testing.expectEqual(@as(u21, '∅'), r_iter.prev().?.code);
502 try testing.expectEqual(@as(?CodePoint, null), r_iter.peek());
503 try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
504 try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
505 }
506 {
507 var r_iter: ReverseIterator = .init("123");
508 try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code);
509 try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code);
510 try testing.expectEqual(@as(u21, '1'), r_iter.prev().?.code);
511 var iter = r_iter.forwardIterator();
512 try testing.expectEqual(@as(u21, '1'), iter.next().?.code);
513 try testing.expectEqual(@as(u21, '2'), iter.next().?.code);
514 try testing.expectEqual(@as(u21, '3'), iter.next().?.code);
515 r_iter = iter.reverseIterator();
516 try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code);
517 try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code);
518 iter = r_iter.forwardIterator();
519 r_iter = iter.reverseIterator();
520 try testing.expectEqual(@as(u21, '2'), iter.next().?.code);
521 try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code);
522 }
523}
524
349const std = @import("std"); 525const std = @import("std");
350const testing = std.testing; 526const testing = std.testing;
351const expect = testing.expect; 527const expect = testing.expect;
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig
new file mode 100644
index 0000000..80ce4bf
--- /dev/null
+++ b/src/micro_runeset.zig
@@ -0,0 +1,207 @@
1//! Minimal RuneSet implementation
2//!
3//! This is copied from the full RuneSet module, so that `zg` doesn't
4//! depend on it. There's one spot in the WordBreak algorithm which
5//! needs to identify the emoji Extended_Pictographic property, which
6//! is not otherwise used in ZG. The Runeset is 89 words, while the
7//! trie lookup used throughout ZG would be much larger.
8//!
9//! The RuneSet is borrowed from Runicode, which encodes Unicode things
10//! in RuneSet form. This will need updating for each version of Unicode.
11
12pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x180000e0, 0x30000000000000, 0x8001000200e00000, 0xf800b85090, 0x1801022057ff3f, 0xffffffffffffffff, 0xffffffffffff003f, 0xffffffffffffffff, 0xfffffffffff7ffbf, 0x7800000000000001, 0x400c0000000000, 0x4, 0x70ffe0000008000, 0x100, 0x1000c000000, 0x60003f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x80000000e000, 0xc003f00000000000, 0xffffe00007fe4000, 0x3fffffffff, 0xf7fc80000400fffe, 0xfffffffffffffe00, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7ffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff, 0xffffffffffffffc0, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xfff0000000000000, 0xffffffffffe00000, 0xf000, 0xfc00ff00, 0xffffc0000000ff00, 0xffffffffffffffff, 0xf7fffffffffff000, 0xffffffffffffffbf, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff } };
13
14// Meaningful names for the T1 slots
15const LOW = 0;
16const HI = 1;
17const LEAD = 2;
18const T4_OFF = 3;
19
20/// Minimum Viable Runeset. Must be statically created, strictly boolean matching.
21pub const RuneSet = struct {
22 body: []const u64,
23
24 // Returns whether the slice is a match. This assumes the validity of the
25 // string, which can be ensured by, in particular, deriving it from a CodePoint.
26 pub fn isMatch(runeset: RuneSet, str: []const u8) bool {
27 const set = runeset.body;
28 const a = codeunit(str[0]);
29 switch (a.kind) {
30 .follow => unreachable,
31 .low => {
32 const mask = toMask(set[LOW]);
33 if (mask.isIn(a))
34 return true
35 else
36 return false;
37 },
38 .hi => {
39 const mask = toMask(set[HI]);
40 if (mask.isIn(a))
41 return true
42 else
43 return false;
44 },
45 .lead => {
46 const nB = a.nMultiBytes().?;
47 const a_mask = toMask(set[LEAD]);
48 if (!a_mask.isIn(a)) return false;
49 const b = codeunit(str[1]);
50 const b_loc = 4 + a_mask.lowerThan(a).?;
51 const b_mask = toMask(set[b_loc]);
52 if (!b_mask.isIn(b)) return false;
53 if (nB == 2) return true;
54 const t3_off = 4 + @popCount(set[LEAD]);
55 const c = codeunit(str[2]);
56 // Slice is safe because we know the T2 span has at least one word.
57 const c_off = b_mask.higherThan(b).? + popCountSlice(set[b_loc + 1 .. t3_off]);
58 const c_loc = t3_off + c_off;
59 const c_mask = toMask(set[c_loc]);
60 if (!c_mask.isIn(c)) return false;
61 if (nB == 3) return true;
62 const d_off = c_mask.lowerThan(c).? + popCountSlice(set[t3_off..c_loc]);
63 const d_loc = set[T4_OFF] + d_off;
64 const d = codeunit(str[3]);
65 const d_mask = toMask(set[d_loc]);
66 if (d_mask.isIn(d)) return true else return false;
67 },
68 }
69 }
70};
71
72/// Kinds of most significant bits in UTF-8
73const RuneKind = enum(u2) {
74 low,
75 hi,
76 follow,
77 lead,
78};
79
80/// Packed `u8` struct representing one codeunit of UTF-8.
81const CodeUnit = packed struct(u8) {
82 body: u6,
83 kind: RuneKind,
84
85 /// Mask to check presence
86 pub inline fn inMask(self: *const CodeUnit) u64 {
87 return @as(u64, 1) << self.body;
88 }
89
90 // TODO consider an nMultiBytesFast, for the cases where we
91 // know that invalid lead bytes are never present (such as in set)
92 // operations, where we may assume that (and will assert that) the
93 // LEAD mask contains no such bytes.
94
95 /// Number of bytes in known multi-byte rune.
96 ///
97 /// Caller guarantees that the CodeUnit is a lead byte
98 /// of a multi-byte rune: `cu.kind == .lead`.
99 ///
100 /// Invalid lead bytes will return null.
101 pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 {
102 return switch (self.body) {
103 // 0 and 1 are invalid for overlong reasons,
104 // but RuneSet supports overlong encodings
105 0...31 => 2,
106 32...47 => 3,
107 48...55 => 4,
108 // Wasted space 56...61 is due entirely to Microsoft's
109 // lack of vision and insistence on a substandard
110 // and utterly inadequate encoding for Unicode
111 // "64k should be enough for anyone" <spits>
112 56...63 => null,
113 };
114 }
115
116 /// Given a valid lead byte, return the number of bytes that should
117 /// make up the code unit sequence. Will return `null` if the lead
118 /// byte is invalid.
119 pub inline fn nBytes(self: *const CodeUnit) ?u8 {
120 switch (self.kind) {
121 .low, .hi => return 1,
122 .lead => return self.nMultiBytes(),
123 .follow => return null,
124 }
125 }
126
127 /// Mask off all bits >= cu.body
128 pub inline fn hiMask(self: *const CodeUnit) u64 {
129 return (@as(u64, 1) << self.body) - 1;
130 }
131
132 /// Mask off all bits <= cu.body
133 pub inline fn lowMask(self: *const CodeUnit) u64 {
134 if (self.body == 63)
135 return 0
136 else
137 return ~((@as(u64, 1) << (self.body + 1)) - 1);
138 }
139
140 /// Cast the `CodeUnit` to its backing `u8`.
141 pub inline fn byte(self: *const CodeUnit) u8 {
142 return @bitCast(self.*);
143 }
144};
145
146/// Cast raw byte to CodeUnit
147inline fn codeunit(b: u8) CodeUnit {
148 return @bitCast(b);
149}
150
151inline fn toMask(w: u64) Mask {
152 return Mask.toMask(w);
153}
154
155/// Bitmask for runesets
156///
157/// We define our own bitset, because the operations we need to
158/// perform only overlap with IntegerBitSet for trivial one-liners,
159/// and furthermore, we need nondestructive versions of the basic
160/// operations, which aren't a part of the IntegerBitSet interface.
161///
162/// Note that Masks do not track which kind of byte they apply to,
163/// since they will be stored as ordinary u64s. User code must
164/// ensure that CodeUnits tested against a Mask are of the appropriate
165/// type, and otherwise valid for the test performed.
166///
167const Mask = struct {
168 m: u64,
169
170 pub fn toMask(w: u64) Mask {
171 return Mask{ .m = w };
172 }
173
174 /// Test if a CodeUnit's low bytes are present in mask
175 pub inline fn isIn(self: Mask, cu: CodeUnit) bool {
176 return self.m | cu.inMask() == self.m;
177 }
178
179 /// Return number of bytes lower than cu.body in mask,
180 /// if cu inhabits the mask. Otherwise return null.
181 pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 {
182 if (self.isIn(cu)) {
183 const m = cu.hiMask();
184 return @popCount(self.m & m);
185 } else {
186 return null;
187 }
188 }
189
190 /// Return number of bytes higher than cu.body in mask,
191 /// if cu inhabits the mask. Otherwise return null.
192 pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 {
193 if (self.isIn(cu)) {
194 const m = cu.lowMask();
195 return @popCount(self.m & m);
196 } else {
197 return null;
198 }
199 }
200};
201
202/// Sum of @popCount of all words in region.
203inline fn popCountSlice(region: []const u64) usize {
204 var ct: usize = 0;
205 for (region) |w| ct += @popCount(w);
206 return ct;
207}
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 2249007..ae177a9 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -1,43 +1,4 @@
1const std = @import("std"); 1const dbg_print = false;
2const fmt = std.fmt;
3const fs = std.fs;
4const io = std.io;
5const heap = std.heap;
6const mem = std.mem;
7const testing = std.testing;
8const unicode = std.unicode;
9
10const grapheme = @import("Graphemes");
11const Grapheme = @import("Graphemes").Grapheme;
12const Graphemes = @import("Graphemes");
13const GraphemeIterator = @import("Graphemes").Iterator;
14const Normalize = @import("Normalize");
15
16comptime {
17 testing.refAllDecls(grapheme);
18}
19
20test "Iterator.peek" {
21 const peek_seq = "aΔ👨🏻‍🌾→";
22 const data = try Graphemes.init(std.testing.allocator);
23 defer data.deinit(std.testing.allocator);
24
25 var iter = data.iterator(peek_seq);
26 const peek_a = iter.peek().?;
27 const next_a = iter.next().?;
28 try std.testing.expectEqual(peek_a, next_a);
29 try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
30 const peek_d1 = iter.peek().?;
31 const peek_d2 = iter.peek().?;
32 try std.testing.expectEqual(peek_d1, peek_d2);
33 const next_d = iter.next().?;
34 try std.testing.expectEqual(peek_d2, next_d);
35 try std.testing.expectEqual(iter.peek(), iter.next());
36 try std.testing.expectEqual(iter.peek(), iter.next());
37 try std.testing.expectEqual(null, iter.peek());
38 try std.testing.expectEqual(null, iter.peek());
39 try std.testing.expectEqual(iter.peek(), iter.next());
40}
41 2
42test "Unicode normalization tests" { 3test "Unicode normalization tests" {
43 var arena = heap.ArenaAllocator.init(testing.allocator); 4 var arena = heap.ArenaAllocator.init(testing.allocator);
@@ -50,16 +11,14 @@ test "Unicode normalization tests" {
50 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); 11 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
51 defer file.close(); 12 defer file.close();
52 var buf_reader = io.bufferedReader(file.reader()); 13 var buf_reader = io.bufferedReader(file.reader());
53 const input_stream = buf_reader.reader(); 14 var input_stream = buf_reader.reader();
54 15
55 var line_no: usize = 0;
56 var buf: [4096]u8 = undefined; 16 var buf: [4096]u8 = undefined;
57 var cp_buf: [4]u8 = undefined; 17 var cp_buf: [4]u8 = undefined;
58 18
59 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { 19 var line_iter: IterRead = .{ .read = &input_stream };
60 line_no += 1; 20
61 // Skip comments or empty lines. 21 while (try line_iter.next(&buf)) |line| {
62 if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
63 // Iterate over fields. 22 // Iterate over fields.
64 var fields = mem.splitScalar(u8, line, ';'); 23 var fields = mem.splitScalar(u8, line, ';');
65 var field_index: usize = 0; 24 var field_index: usize = 0;
@@ -80,7 +39,7 @@ test "Unicode normalization tests" {
80 39
81 input = try i_buf.toOwnedSlice(); 40 input = try i_buf.toOwnedSlice();
82 } else if (field_index == 1) { 41 } else if (field_index == 1) {
83 //debug.print("\n*** {s} ***\n", .{line}); 42 if (dbg_print) debug.print("\n*** {s} ***\n", .{line});
84 // NFC, time to test. 43 // NFC, time to test.
85 var w_buf = std.ArrayList(u8).init(allocator); 44 var w_buf = std.ArrayList(u8).init(allocator);
86 defer w_buf.deinit(); 45 defer w_buf.deinit();
@@ -162,20 +121,17 @@ test "Segmentation GraphemeIterator" {
162 var buf_reader = std.io.bufferedReader(file.reader()); 121 var buf_reader = std.io.bufferedReader(file.reader());
163 var input_stream = buf_reader.reader(); 122 var input_stream = buf_reader.reader();
164 123
165 const data = try Graphemes.init(allocator); 124 const graph = try Graphemes.init(allocator);
166 defer data.deinit(allocator); 125 defer graph.deinit(allocator);
167 126
168 var buf: [4096]u8 = undefined; 127 var buf: [4096]u8 = undefined;
169 var line_no: usize = 1; 128 var line_iter: IterRead = .{ .read = &input_stream };
170
171 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
172 // Skip comments or empty lines.
173 if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
174 129
130 while (try line_iter.next(&buf)) |raw| {
175 // Clean up. 131 // Clean up.
176 var line = std.mem.trimLeft(u8, raw, "÷ "); 132 var line = std.mem.trimLeft(u8, raw, "÷ ");
177 if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { 133 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
178 line = line[0..octo]; 134 line = line[0..final];
179 } 135 }
180 // Iterate over fields. 136 // Iterate over fields.
181 var want = std.ArrayList(Grapheme).init(allocator); 137 var want = std.ArrayList(Grapheme).init(allocator);
@@ -185,12 +141,12 @@ test "Segmentation GraphemeIterator" {
185 defer all_bytes.deinit(); 141 defer all_bytes.deinit();
186 142
187 var graphemes = std.mem.splitSequence(u8, line, " ÷ "); 143 var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
188 var bytes_index: u32 = 0; 144 var bytes_index: uoffset = 0;
189 145
190 while (graphemes.next()) |field| { 146 while (graphemes.next()) |field| {
191 var code_points = std.mem.splitScalar(u8, field, ' '); 147 var code_points = std.mem.splitScalar(u8, field, ' ');
192 var cp_buf: [4]u8 = undefined; 148 var cp_buf: [4]u8 = undefined;
193 var cp_index: u32 = 0; 149 var cp_index: uoffset = 0;
194 var gc_len: u8 = 0; 150 var gc_len: u8 = 0;
195 151
196 while (code_points.next()) |code_point| { 152 while (code_points.next()) |code_point| {
@@ -206,16 +162,324 @@ test "Segmentation GraphemeIterator" {
206 bytes_index += cp_index; 162 bytes_index += cp_index;
207 } 163 }
208 164
209 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); 165 const this_str = all_bytes.items;
210 var iter = data.iterator(all_bytes.items); 166
167 {
168 var iter = graph.iterator(this_str);
169
170 // Check.
171 for (want.items, 1..) |want_gc, idx| {
172 const got_gc = (iter.next()).?;
173 try std.testing.expectEqualStrings(
174 want_gc.bytes(this_str),
175 got_gc.bytes(this_str),
176 );
177 for (got_gc.offset..got_gc.offset + got_gc.len) |i| {
178 const this_gc = graph.graphemeAtIndex(this_str, i);
179 std.testing.expectEqualSlices(
180 u8,
181 got_gc.bytes(this_str),
182 this_gc.bytes(this_str),
183 ) catch |err| {
184 debug.print("Wrong grapheme on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i });
185 return err;
186 };
187 }
188 var after_iter = graph.iterateAfterGrapheme(this_str, got_gc);
189 if (after_iter.next()) |next_gc| {
190 if (iter.peek()) |next_peek| {
191 std.testing.expectEqualSlices(
192 u8,
193 next_gc.bytes(this_str),
194 next_peek.bytes(this_str),
195 ) catch |err| {
196 debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, idx });
197 return err;
198 };
199 } else {
200 debug.print("Mismatch: peek missing, next found, line {d} #{d}\n", .{ line_iter.line, idx });
201 try testing.expect(false);
202 }
203 } else {
204 try testing.expectEqual(null, iter.peek());
205 }
206 }
207 }
208 {
209 var iter = graph.reverseIterator(this_str);
210
211 // Check.
212 var i: usize = want.items.len;
213 while (i > 0) {
214 i -= 1;
215 const want_gc = want.items[i];
216 const got_gc = iter.prev() orelse {
217 std.debug.print(
218 "line {d} grapheme {d}: expected {any} found null\n",
219 .{ line_iter.line, i, want_gc },
220 );
221 return error.TestExpectedEqual;
222 };
223 std.testing.expectEqualStrings(
224 want_gc.bytes(this_str),
225 got_gc.bytes(this_str),
226 ) catch |err| {
227 std.debug.print(
228 "line {d} grapheme {d}: expected {any} found {any}\n",
229 .{ line_iter.line, i, want_gc, got_gc },
230 );
231 return err;
232 };
233 var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc);
234 if (before_iter.prev()) |prev_gc| {
235 if (iter.peek()) |prev_peek| {
236 std.testing.expectEqualSlices(
237 u8,
238 prev_gc.bytes(this_str),
239 prev_peek.bytes(this_str),
240 ) catch |err| {
241 debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, i });
242 return err;
243 };
244 } else {
245 debug.print("Mismatch: peek missing, prev found, line {d} #{d}\n", .{ line_iter.line, i });
246 try testing.expect(false);
247 }
248 } else {
249 try testing.expectEqual(null, iter.peek());
250 }
251 }
252 }
253 }
254}
255
256test "Segmentation Word Iterator" {
257 const allocator = std.testing.allocator;
258 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});
259 defer file.close();
260 var buf_reader = std.io.bufferedReader(file.reader());
261 var input_stream = buf_reader.reader();
262
263 const wb = try Words.init(allocator);
264 defer wb.deinit(allocator);
265
266 var buf: [4096]u8 = undefined;
267 var line_iter: IterRead = .{ .read = &input_stream };
268
269 while (try line_iter.next(&buf)) |raw| {
270 // Clean up.
271 var line = std.mem.trimLeft(u8, raw, "÷ ");
272 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
273 line = line[0..final];
274 }
275 // Iterate over fields.
276 var want = std.ArrayList(Word).init(allocator);
277 defer want.deinit();
278
279 var all_bytes = std.ArrayList(u8).init(allocator);
280 defer all_bytes.deinit();
281
282 var words = std.mem.splitSequence(u8, line, " ÷ ");
283 var bytes_index: uoffset = 0;
284
285 while (words.next()) |field| {
286 var code_points = std.mem.splitScalar(u8, field, ' ');
287 var cp_buf: [4]u8 = undefined;
288 var cp_index: uoffset = 0;
289 var gc_len: u8 = 0;
211 290
212 // Check. 291 while (code_points.next()) |code_point| {
213 for (want.items) |want_gc| { 292 if (std.mem.eql(u8, code_point, "×")) continue;
214 const got_gc = (iter.next()).?; 293 const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
215 try std.testing.expectEqualStrings( 294 const len = try unicode.utf8Encode(cp, &cp_buf);
216 want_gc.bytes(all_bytes.items), 295 try all_bytes.appendSlice(cp_buf[0..len]);
217 got_gc.bytes(all_bytes.items), 296 cp_index += len;
218 ); 297 gc_len += len;
298 }
299
300 try want.append(Word{ .len = gc_len, .offset = bytes_index });
301 bytes_index += cp_index;
302 }
303 const this_str = all_bytes.items;
304
305 {
306 var iter = wb.iterator(this_str);
307 var peeked: ?Word = iter.peek();
308
309 // Check.
310 for (want.items, 1..) |want_word, idx| {
311 const got_word = (iter.next()).?;
312 std.testing.expectEqualStrings(
313 want_word.bytes(this_str),
314 got_word.bytes(this_str),
315 ) catch |err| {
316 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx });
317 return err;
318 };
319 std.testing.expectEqualStrings(
320 peeked.?.bytes(this_str),
321 got_word.bytes(this_str),
322 ) catch |err| {
323 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx });
324 return err;
325 };
326 var r_iter = iter.reverseIterator();
327 const if_r_word = r_iter.prev();
328 if (if_r_word) |r_word| {
329 std.testing.expectEqualStrings(
330 want_word.bytes(this_str),
331 r_word.bytes(this_str),
332 ) catch |err| {
333 debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx });
334 return err;
335 };
336 } else {
337 try testing.expect(false);
338 }
339 var peek_iter = wb.iterateAfterWord(this_str, got_word);
340 const peek_1 = peek_iter.next();
341 if (peek_1) |p1| {
342 const peek_2 = iter.peek();
343 if (peek_2) |p2| {
344 std.testing.expectEqualSlices(
345 u8,
346 p1.bytes(this_str),
347 p2.bytes(this_str),
348 ) catch |err| {
349 debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx });
350 return err;
351 };
352 } else {
353 try testing.expect(false);
354 }
355 } else {
356 try testing.expectEqual(null, iter.peek());
357 }
358 for (got_word.offset..got_word.offset + got_word.len) |i| {
359 const this_word = wb.wordAtIndex(this_str, i);
360 std.testing.expectEqualSlices(
361 u8,
362 got_word.bytes(this_str),
363 this_word.bytes(this_str),
364 ) catch |err| {
365 debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i });
366 return err;
367 };
368 }
369 peeked = iter.peek();
370 }
371 }
372 {
373 var r_iter = wb.reverseIterator(this_str);
374 var peeked: ?Word = r_iter.peek();
375 var idx = want.items.len - 1;
376
377 while (true) : (idx -= 1) {
378 const want_word = want.items[idx];
379 const got_word = r_iter.prev().?;
380 std.testing.expectEqualSlices(
381 u8,
382 want_word.bytes(this_str),
383 got_word.bytes(this_str),
384 ) catch |err| {
385 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 });
386 return err;
387 };
388 std.testing.expectEqualStrings(
389 peeked.?.bytes(this_str),
390 got_word.bytes(this_str),
391 ) catch |err| {
392 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 });
393 return err;
394 };
395 var f_iter = r_iter.forwardIterator();
396 const if_f_word = f_iter.next();
397 if (if_f_word) |f_word| {
398 std.testing.expectEqualStrings(
399 want_word.bytes(this_str),
400 f_word.bytes(this_str),
401 ) catch |err| {
402 debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx });
403 return err;
404 };
405 } else {
406 try testing.expect(false);
407 }
408 var peek_iter = wb.iterateBeforeWord(this_str, got_word);
409 const peek_1 = peek_iter.prev();
410 if (peek_1) |p1| {
411 const peek_2 = r_iter.peek();
412 if (peek_2) |p2| {
413 std.testing.expectEqualSlices(
414 u8,
415 p1.bytes(this_str),
416 p2.bytes(this_str),
417 ) catch |err| {
418 debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx });
419 return err;
420 };
421 } else {
422 try testing.expect(false);
423 }
424 } else {
425 try testing.expectEqual(null, r_iter.peek());
426 }
427 for (got_word.offset..got_word.offset + got_word.len) |i| {
428 const this_word = wb.wordAtIndex(this_str, i);
429 std.testing.expectEqualSlices(
430 u8,
431 got_word.bytes(this_str),
432 this_word.bytes(this_str),
433 ) catch |err| {
434 debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i });
435 return err;
436 };
437 }
438 peeked = r_iter.peek();
439 if (idx == 0) break;
440 }
219 } 441 }
220 } 442 }
221} 443}
444
445const IterRead = struct {
446 read: *Reader,
447 line: usize = 0,
448
449 pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 {
450 defer iter.line += 1;
451 const maybe_line = try iter.read.readUntilDelimiterOrEof(buf, '#');
452 if (maybe_line) |this_line| {
453 try iter.read.skipUntilDelimiterOrEof('\n');
454 if (this_line.len == 0 or this_line[0] == '@') {
455 // comment, next line
456 return iter.next(buf);
457 } else {
458 return this_line;
459 }
460 } else {
461 return null;
462 }
463 }
464};
465
466const std = @import("std");
467const fmt = std.fmt;
468const fs = std.fs;
469const io = std.io;
470const Reader = io.BufferedReader(4096, fs.File.Reader).Reader;
471const heap = std.heap;
472const mem = std.mem;
473const debug = std.debug;
474const testing = std.testing;
475const unicode = std.unicode;
476
477const uoffset = @FieldType(Word, "offset");
478
479const Grapheme = @import("Graphemes").Grapheme;
480const Graphemes = @import("Graphemes");
481const GraphemeIterator = @import("Graphemes").Iterator;
482const Normalize = @import("Normalize");
483
484const Words = @import("Words");
485const Word = Words.Word;