//! Word Breaking Algorithm.
//!
//! https://www.unicode.org/reports/tr29/#Word_Boundaries
//!

/// Word_Break property values. The declaration order fixes the integer tag of
/// each member, and `breakProperty` converts raw table entries to this enum
/// with `@enumFromInt`, so the order must not be changed.
const WordBreakProperty = enum(u5) {
    none,
    Double_Quote,
    Single_Quote,
    Hebrew_Letter,
    CR,
    LF,
    Newline,
    Extend,
    Regional_Indicator,
    Format,
    Katakana,
    ALetter,
    MidLetter,
    MidNum,
    MidNumLet,
    Numeric,
    ExtendNumLet,
    ZWJ,
    WSegSpace,
};

/// First-stage lookup table, indexed by `cp >> 8`.
s1: []u16 = undefined,
/// Second-stage lookup table, indexed by `s1[cp >> 8] + (cp & 0xff)`.
s2: []u5 = undefined,

const Words = @This();

/// Creates a `Words` value with both lookup tables allocated from `allocator`.
/// Release it with `deinit`, passing the same allocator.
pub fn init(allocator: Allocator) Allocator.Error!Words {
    var result: Words = undefined;
    try result.setup(allocator);
    return result;
}

/// Populates `wb` in place. Only `error.OutOfMemory` is reported to the
/// caller; any other failure from `setupImpl` is treated as unreachable.
pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void {
    wb.setupImpl(allocator) catch |err| switch (err) {
        error.OutOfMemory => |e| return e,
        else => unreachable,
    };
}

/// Frees both lookup tables using `allocator`.
pub fn deinit(words: *const Words, allocator: mem.Allocator) void {
    allocator.free(words.s1);
    allocator.free(words.s2);
}

/// Represents a Unicode word span, as an offset into the source string
/// and the length of the word.
pub const Word = struct {
    offset: uoffset,
    len: uoffset,

    /// Returns a slice of the word given the source string.
    pub fn bytes(word: Word, src: []const u8) []const u8 {
        const tail = src[word.offset..];
        return tail[0..word.len];
    }
};

/// Returns the word break property type for `cp`.
pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty {
    // Two-stage lookup: the high bits select a block start in `s1`, the low
    // eight bits select the entry within that block in `s2`.
    const block_start = words.s1[cp >> 8];
    const within_block = cp & 0xff;
    return @enumFromInt(words.s2[block_start + within_block]);
}

/// Convenience function for working with CodePoints
fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty {
    const block_start = words.s1[point.code >> 8];
    const within_block = point.code & 0xff;
    return @enumFromInt(words.s2[block_start + within_block]);
}

/// Returns the Word at the given index. Asserts that the index is less than
/// `string.len`, and that the string is not empty. Always returns a word.
/// The index does not have to be the start of a codepoint in the word.
pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word {
    assert(index < string.len and string.len > 0);
    var back_iter: ReverseIterator = reverseFromIndex(words, string, index);

    // Step back one word. If there is none, or it already begins at offset 0,
    // segment forward from the very start of the string.
    const scan_from_start = if (back_iter.prev()) |word| word.offset == 0 else true;
    if (scan_from_start) {
        var from_start = words.iterator(string);
        while (from_start.next()) |word| {
            if (word.offset <= index and index < word.offset + word.len) return word;
        }
    }

    // Step back one more word before resuming forward.
    _ = back_iter.prev();
    // There's sometimes flags: keep stepping back while the reverse iterator
    // still has a pending flag count.
    while (back_iter.flags > 0) {
        if (back_iter.prev() == null) break;
    }

    var resumed = back_iter.forwardIterator();
    while (resumed.next()) |word| {
        if (word.offset <= index and index < word.offset + word.len) return word;
    }
    unreachable;
}

/// Returns an iterator over words in `slice`.
pub fn iterator(words: *const Words, slice: []const u8) Iterator {
    return .init(words, slice);
}

/// Returns a reverse iterator over the words in `slice`.
pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator {
    return .init(words, slice);
}

/// Returns an iterator after the `word` in `slice`.
pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator {
    const resume_at = word.offset + word.len;
    return forwardFromIndex(words, slice, resume_at);
}

/// Returns a reverse iterator before the `word` in `slice`.
pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator {
    const resume_at = word.offset;
    return reverseFromIndex(words, slice, resume_at);
}

/// An iterator, forward, over all words in a provided string.
pub const Iterator = struct {
    /// Code point on the left of the boundary currently being examined.
    this: ?CodePoint = null,
    /// Code point on the right of that boundary; `null` at end of input.
    that: ?CodePoint = null,
    cp_iter: CodepointIterator,
    wb: *const Words,

    /// Assumes `str` is valid UTF-8.
    pub fn init(words: *const Words, str: []const u8) Iterator {
        var fresh: Iterator = .{ .cp_iter = .init(str), .wb = words };
        // Prime `that` with the first code point; `next()` shifts it into `this`.
        fresh.advance();
        return fresh;
    }

    /// Returns the next word segment, without advancing.
    pub fn peek(iter: *Iterator) ?Word {
        // Snapshot the iterator, run `next()`, then put the snapshot back.
        const snapshot = iter.*;
        defer iter.* = snapshot;
        return iter.next();
    }

    /// Returns a reverse iterator from the point this iterator is paused
    /// at. Usually, and always when using the API to create iterators,
    /// calling `prev()` will return the word just seen.
    pub fn reverseIterator(iter: *Iterator) ReverseIterator {
        var cp_it = iter.cp_iter.reverseIterator();
        if (iter.that != null) _ = cp_it.prev();
        if (iter.cp_iter.peek() != null) _ = cp_it.prev();
        const before = cp_it.prev();
        return .{
            .wb = iter.wb,
            .before = before,
            .after = iter.that,
            .cp_iter = cp_it,
        };
    }

    /// Returns the next word segment, if any.
    pub fn next(iter: *Iterator) ?Word {
        iter.advance();

        // Done?
        const first = iter.this orelse return null;
        // Last? A final lone code point is returned as its own word.
        if (iter.that == null) return Word{ .len = first.len, .offset = first.offset };

        const word_start = first.offset;
        var word_len: uoffset = 0;

        // State variables: the most recent non-ignorable property seen on the
        // left of the boundary, the one before it, and the count of
        // Regional_Indicator pairs seen so far in this call.
        var left_p: WordBreakProperty = .none;
        var left2_p: WordBreakProperty = .none;
        var ri_count: usize = 0;

        scan: while (true) : (iter.advance()) {
            const this = iter.this.?;
            word_len += this.len;

            // End of input always ends the word.
            const that = iter.that orelse break :scan;

            const here_p = iter.wb.breakProp(this);
            const right_p = iter.wb.breakProp(that);
            if (!isIgnorable(here_p)) {
                left2_p = left_p;
                left_p = here_p;
            }

            // WB3 CR × LF
            if (here_p == .CR and right_p == .LF) continue :scan;
            // WB3a (Newline | CR | LF) ÷
            if (isNewline(here_p)) break :scan;
            // WB3b ÷ (Newline | CR | LF)
            if (isNewline(right_p)) break :scan;
            // WB3c ZWJ × \p{Extended_Pictographic}
            if (here_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) continue :scan;
            // WB3d WSegSpace × WSegSpace
            if (here_p == .WSegSpace and right_p == .WSegSpace) continue :scan;
            // WB4 X (Extend | Format | ZWJ)* → X
            if (isIgnorable(right_p)) continue :scan;

            // From here on `left_p` stands in for `here_p`, for ignorable's sake.

            // WB5 AHLetter × AHLetter
            if (isAHLetter(left_p) and isAHLetter(right_p)) continue :scan;
            // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
            if (isAHLetter(left_p) and isMidVal(right_p)) {
                if (iter.peekPast()) |next_cp| {
                    if (isAHLetter(iter.wb.breakProp(next_cp))) continue :scan;
                }
            }
            // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
            if (isAHLetter(left2_p) and isMidVal(left_p) and isAHLetter(right_p)) continue :scan;
            // WB7a Hebrew_Letter × Single_Quote
            if (left_p == .Hebrew_Letter and right_p == .Single_Quote) continue :scan;
            // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
            if (left_p == .Hebrew_Letter and right_p == .Double_Quote) {
                if (iter.peekPast()) |next_cp| {
                    if (iter.wb.breakProp(next_cp) == .Hebrew_Letter) continue :scan;
                }
            }
            // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
            if (left2_p == .Hebrew_Letter and left_p == .Double_Quote and right_p == .Hebrew_Letter) {
                continue :scan;
            }
            // WB8 Numeric × Numeric
            if (left_p == .Numeric and right_p == .Numeric) continue :scan;
            // WB9 AHLetter × Numeric
            if (isAHLetter(left_p) and right_p == .Numeric) continue :scan;
            // WB10 Numeric × AHLetter
            if (left_p == .Numeric and isAHLetter(right_p)) continue :scan;
            // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
            if (left2_p == .Numeric and isMidNum(left_p) and right_p == .Numeric) continue :scan;
            // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
            if (left_p == .Numeric and isMidNum(right_p)) {
                if (iter.peekPast()) |next_cp| {
                    if (iter.wb.breakProp(next_cp) == .Numeric) continue :scan;
                }
            }
            // WB13 Katakana × Katakana
            if (left_p == .Katakana and right_p == .Katakana) continue :scan;
            // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
            if (isExtensible(left_p) and right_p == .ExtendNumLet) continue :scan;
            // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
            if (left_p == .ExtendNumLet and isExtensible(right_p)) continue :scan;
            // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
            if (right_p == .Regional_Indicator and left_p == .Regional_Indicator) {
                ri_count += 1;
                // Odd-numbered RI|RI boundaries stay joined; even ones break.
                if (ri_count % 2 == 1) continue :scan;
            }
            // WB999 Any ÷ Any
            break :scan;
        }

        return Word{ .len = word_len, .offset = word_start };
    }

    pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
        try writer.print(
            "Iterator {{ .this = {any}, .that = {any} }}",
            .{ iter.this, iter.that },
        );
    }

    /// Shifts the window one code point forward.
    fn advance(iter: *Iterator) void {
        iter.this = iter.that;
        iter.that = iter.cp_iter.next();
    }

    /// Returns the first non-ignorable code point still unread in `cp_iter`
    /// (that is, past `that`), leaving the iterator itself untouched.
    fn peekPast(iter: *Iterator) ?CodePoint {
        var probe = iter.cp_iter;
        while (probe.peek()) |peeked| {
            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
            _ = probe.next();
        }
        return null;
    }
};

/// An iterator, backward, over all words in a provided string.
pub const ReverseIterator = struct {
    /// Code point on the right of the boundary currently being examined.
    after: ?CodePoint = null,
    /// Code point on the left of that boundary; `null` at start of input.
    before: ?CodePoint = null,
    cp_iter: ReverseCodepointIterator,
    wb: *const Words,
    /// Remaining Regional_Indicator count for the run currently being walked;
    /// see the WB15/WB16 note in `prev`.
    flags: usize = 0,

    /// Assumes `str` is valid UTF-8.
    pub fn init(words: *const Words, str: []const u8) ReverseIterator {
        var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words };
        // Prime `before` with the last code point; `prev()` shifts it into `after`.
        wb_iter.advance();
        return wb_iter;
    }

    /// Returns the previous word segment, if any, without advancing.
    pub fn peek(iter: *ReverseIterator) ?Word {
        // Save every field `prev()` can modify and restore them afterwards.
        const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
        defer {
            iter.before, iter.after, iter.cp_iter, iter.flags = cache;
        }
        return iter.prev();
    }

    /// Return a forward iterator from where this iterator paused. Usually,
    /// and always when using the API to create iterators, calling `next()`
    /// will return the word just seen.
    pub fn forwardIterator(iter: *ReverseIterator) Iterator {
        var cp_it = iter.cp_iter.forwardIterator();
        if (iter.before) |_|
            _ = cp_it.next();
        return .{
            .wb = iter.wb,
            .this = cp_it.next(),
            .that = iter.after,
            .cp_iter = cp_it,
        };
    }

    /// Return the previous word, if any.
    pub fn prev(iter: *ReverseIterator) ?Word {
        iter.advance();

        // Done?
        if (iter.after == null) return null;
        // Last?
        if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };

        const word_end = iter.after.?.offset + iter.after.?.len;
        var word_len: uoffset = 0;

        // State variables: the most recent non-ignorable property seen on the
        // right of the boundary, and the one seen before that.
        var last_p: WordBreakProperty = .none;
        var last_last_p: WordBreakProperty = .none;

        scan: while (true) : (iter.advance()) {
            const after = iter.after.?;
            word_len += after.len;
            if (iter.before) |before| {
                var sneak = sneaky(iter); // 'sneaks' past ignorables
                const after_p = iter.wb.breakProp(after);
                var before_p = iter.wb.breakProp(before);
                if (!isIgnorable(after_p)) {
                    last_last_p = last_p;
                    last_p = after_p;
                }
                // WB3 CR × LF
                if (before_p == .CR and after_p == .LF) continue :scan;
                // WB3a (Newline | CR | LF) ÷
                if (isNewline(before_p)) break :scan;
                // WB3b ÷ (Newline | CR | LF)
                if (isNewline(after_p)) break :scan;
                // WB3c ZWJ × \p{Extended_Pictographic}
                if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) {
                    continue :scan;
                }
                // WB3d WSegSpace × WSegSpace
                if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
                // WB4 X (Extend | Format | ZWJ)* → X
                if (isIgnorable(before_p)) {
                    // Look through the ignorables for the code point that
                    // actually governs the left side of the boundary.
                    const maybe_before = sneak.prev();
                    if (maybe_before) |valid_before| {
                        before_p = iter.wb.breakProp(valid_before);
                    } else if (!isIgnorable(after_p)) {
                        // We're done
                        break :scan;
                    }
                }
                if (isIgnorable(after_p)) continue :scan;
                // WB5 AHLetter × AHLetter
                if (isAHLetter(last_p) and isAHLetter(before_p)) {
                    continue :scan;
                }
                // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
                if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) {
                    continue :scan;
                }
                // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
                if (isMidVal(before_p) and isAHLetter(last_p)) {
                    const prev_val = sneak.peek();
                    if (prev_val) |prev_cp| {
                        const prev_p = iter.wb.breakProp(prev_cp);
                        if (isAHLetter(prev_p)) {
                            continue :scan;
                        }
                    }
                }
                // WB7a Hebrew_Letter × Single_Quote
                if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan;
                // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
                if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) {
                    continue :scan;
                }
                // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
                if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
                    const prev_val = sneak.peek();
                    if (prev_val) |prev_cp| {
                        const prev_p = iter.wb.breakProp(prev_cp);
                        if (prev_p == .Hebrew_Letter) {
                            continue :scan;
                        }
                    }
                }
                // WB8 Numeric × Numeric
                if (before_p == .Numeric and last_p == .Numeric) continue :scan;
                // WB9 AHLetter × Numeric
                if (isAHLetter(before_p) and last_p == .Numeric) continue :scan;
                // WB10 Numeric × AHLetter
                if (before_p == .Numeric and isAHLetter(last_p)) continue :scan;
                // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
                if (isMidNum(before_p) and last_p == .Numeric) {
                    const prev_val = sneak.peek();
                    if (prev_val) |prev_cp| {
                        const prev_p = iter.wb.breakProp(prev_cp);
                        if (prev_p == .Numeric) {
                            continue :scan;
                        }
                    }
                }
                // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
                if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) {
                    continue :scan;
                }
                // WB13 Katakana × Katakana
                if (before_p == .Katakana and last_p == .Katakana) continue :scan;
                // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
                if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan;
                // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
                if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan;
                // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
                // NOTE:
                // So here we simply have to know whether a run of flags is even or odd.
                // The whole run. To avoid quadratic behavior (and long flag runs are
                // actually a thing in the wild), we have to count them once, store that
                // on the iterator, and decrement each time we see two, possibly breaking
                // once extra at the beginning. They break up one per flag, once we hit
                // zero, that's all the flags. If we see another flag we do it again.
                if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) {
                    defer {
                        if (iter.flags > 0) iter.flags -= 1;
                    }
                    if (iter.flags == 0) {
                        iter.flags = sneak.countFlags();
                    }
                    if (iter.flags % 2 == 0) {
                        continue :scan;
                    }
                }
                // WB999 Any ÷ Any
                break :scan;
            }
            // iter.before == null: start of input always ends the word.
            break :scan;
        }
        return Word{ .len = word_len, .offset = word_end - word_len };
    }

    pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
        try writer.print(
            "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}",
            .{ iter.before, iter.after, iter.flags },
        );
    }

    /// Returns the first non-ignorable code point still unread in `cp_iter`,
    /// restoring `cp_iter` before returning.
    fn peekPast(iter: *ReverseIterator) ?CodePoint {
        const save_cp = iter.cp_iter;
        defer iter.cp_iter = save_cp;
        while (iter.cp_iter.peek()) |peeked| {
            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
            _ = iter.cp_iter.prev();
        }
        return null;
    }

    /// Shifts the window one code point backward.
    fn advance(iter: *ReverseIterator) void {
        iter.after = iter.before;
        iter.before = iter.cp_iter.prev();
    }
};

//| Implementation Details

/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator {
    var idx: uoffset = @intCast(index);
    // Find the next lead byte:
    while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
    if (idx == string.len) return words.reverseIterator(string);
    var iter: ReverseIterator = undefined;
    iter.wb = words;
    iter.flags = 0;
    // We need to populate the CodePoints, and the codepoint iterator.
    // Consider "abc| def" with the cursor as |.
    // We need `before` to be `c` and `after` to be ' ',
    // and `cp_iter.prev()` to be `b`.
    var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
    iter.after = cp_iter.prev();
    iter.before = cp_iter.prev();
    iter.cp_iter = cp_iter;
    return iter;
}

/// Initialize a forward Iterator at the provided index. Used in
/// `iterateAfterWord`. An index equal to `string.len` yields an exhausted
/// iterator.
fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator {
    var idx: uoffset = @intCast(index);
    if (idx == string.len) {
        return .{
            .cp_iter = .{ .bytes = string, .i = idx },
            .this = null,
            .that = null,
            .wb = words,
        };
    }
    // Back up over continuation bytes to the lead byte at or before `index`.
    while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
    if (idx == 0) return words.iterator(string);
    var iter: Iterator = undefined;
    iter.wb = words;
    // We need to populate the CodePoints, and the codepoint iterator.
    // Consider "abc |def" with the cursor as |.
    // We need `this` to be ` ` and `that` to be 'd',
    // and `cp_iter.next()` to be `d`.
    idx -= 1;
    while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
    // "abc| def"
    var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx };
    iter.this = cp_iter.next();
    iter.that = cp_iter.next();
    iter.cp_iter = cp_iter;
    return iter;
}

/// Returns a SneakIterator holding a copy of `iter`'s code point iterator, so
/// lookbehind never disturbs `iter` itself.
fn sneaky(iter: *const ReverseIterator) SneakIterator {
    return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
}

/// Backward lookbehind helper that skips ignorable code points
/// (Extend | Format | ZWJ).
const SneakIterator = struct {
    cp_iter: ReverseCodepointIterator,
    wb: *const Words,

    /// Returns the nearest preceding non-ignorable code point without
    /// consuming anything.
    fn peek(iter: *SneakIterator) ?CodePoint {
        const save_cp = iter.cp_iter;
        defer iter.cp_iter = save_cp;
        while (iter.cp_iter.peek()) |peeked| {
            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
            _ = iter.cp_iter.prev();
        }
        return null;
    }

    /// Counts the Regional_Indicator code points immediately preceding the
    /// current position, skipping ignorables, without consuming anything.
    fn countFlags(iter: *SneakIterator) usize {
        var flags: usize = 0;
        const save_cp = iter.cp_iter;
        defer iter.cp_iter = save_cp;
        while (iter.cp_iter.prev()) |cp| {
            const prop = iter.wb.breakProp(cp);
            if (isIgnorable(prop)) continue;
            if (prop == .Regional_Indicator) {
                flags += 1;
            } else break;
        }
        return flags;
    }

    /// Consumes code points backward and returns the first non-ignorable one.
    fn prev(iter: *SneakIterator) ?CodePoint {
        while (iter.cp_iter.prev()) |peeked| {
            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
        }
        return null;
    }
};

/// Inflates the embedded, raw-deflate-compressed "wbp" resource and fills
/// `wb.s1` and `wb.s2` from it. Each table is stored as a `u16` length
/// followed by its entries (`u16` for stage 1, one byte each for stage 2),
/// read in the target's native endianness. On error, any table already
/// allocated here is freed again.
inline fn setupImpl(wb: *Words, allocator: Allocator) !void {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("wbp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    const stage_1_len: u16 = try reader.readInt(u16, endian);
    wb.s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(wb.s1);
    for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);

    const stage_2_len: u16 = try reader.readInt(u16, endian);
    wb.s2 = try allocator.alloc(u5, stage_2_len);
    errdefer allocator.free(wb.s2);
    for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
}

//| Predicates

/// True for CR, LF and Newline.
inline fn isNewline(wbp: WordBreakProperty) bool {
    return wbp == .CR or wbp == .LF or wbp == .Newline;
}

/// True for the properties WB4 skips over: Format, Extend and ZWJ.
inline fn isIgnorable(wbp: WordBreakProperty) bool {
    return switch (wbp) {
        .Format, .Extend, .ZWJ => true,
        else => false,
    };
}

/// True for ALetter and Hebrew_Letter.
inline fn isAHLetter(wbp: WordBreakProperty) bool {
    return wbp == .ALetter or wbp == .Hebrew_Letter;
}

/// True for MidLetter | MidNumLetQ, where MidNumLetQ is MidNumLet | Single_Quote.
inline fn isMidVal(wbp: WordBreakProperty) bool {
    return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
}

/// True for MidNum | MidNumLetQ, where MidNumLetQ is MidNumLet | Single_Quote.
inline fn isMidNum(wbp: WordBreakProperty) bool {
    return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
}

/// True for the properties that join across ExtendNumLet (WB13a / WB13b).
inline fn isExtensible(wbp: WordBreakProperty) bool {
    return switch (wbp) {
        .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
        else => false,
    };
}

test "Word Break Properties" {
    const wb = try Words.init(testing.allocator);
    defer wb.deinit(testing.allocator);
    try testing.expectEqual(.CR, wb.breakProperty('\r'));
    try testing.expectEqual(.LF, wb.breakProperty('\n'));
    try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש'));
    try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
}

test "ext_pict" {
    try testing.expect(ext_pict.isMatch("👇"));
    try testing.expect(ext_pict.isMatch("\u{2701}"));
}

test "Words" {
    const wb = try Words.init(testing.allocator);
    defer wb.deinit(testing.allocator);
    const word_str = "Metonym Μετωνύμιο メトニム";
    var w_iter = wb.iterator(word_str);
    try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str));
    // Spaces are "words" too!
    try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str));
    const in_greek = w_iter.next().?;
    for (in_greek.offset..in_greek.offset + in_greek.len) |i| {
        const at_index = wb.wordAtIndex(word_str, i).bytes(word_str);
        try testing.expectEqualStrings("Μετωνύμιο", at_index);
    }
    _ = w_iter.next();
    try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str));
}

test wordAtIndex {
    const wb = try Words.init(testing.allocator);
    defer wb.deinit(testing.allocator);
    const t_string = "first second third";
    const second = wb.wordAtIndex(t_string, 8);
    try testing.expectEqualStrings("second", second.bytes(t_string));
    const third = wb.wordAtIndex(t_string, 14);
    try testing.expectEqualStrings("third", third.bytes(t_string));
    {
        const first = wb.wordAtIndex(t_string, 3);
        try testing.expectEqualStrings("first", first.bytes(t_string));
    }
    {
        const first = wb.wordAtIndex(t_string, 0);
        try testing.expectEqualStrings("first", first.bytes(t_string));
    }
    const last = wb.wordAtIndex(t_string, 14);
    try testing.expectEqualStrings("third", last.bytes(t_string));
}

const testr = "don't a:ka fin!";

test "reversal" {
    const wb = try Words.init(testing.allocator);
    defer wb.deinit(testing.allocator);
    {
        var fwd = wb.iterator(testr);
        var this_word: ?Word = fwd.next();

        while (this_word) |this| : (this_word = fwd.next()) {
            var back = fwd.reverseIterator();
            const that_word = back.prev();
            if (that_word) |that| {
                try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
            } else {
                try testing.expect(false);
            }
        }
    }
    {
        var back = wb.reverseIterator(testr);
        var this_word: ?Word = back.prev();

        while (this_word) |this| : (this_word = back.prev()) {
            var fwd = back.forwardIterator();
            const that_word = fwd.next();
            if (that_word) |that| {
                try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
            } else {
                try testing.expect(false);
            }
        }
    }
}

fn testAllocations(allocator: Allocator) !void {
    const wb = try Words.init(allocator);
    wb.deinit(allocator);
}

test "allocation safety" {
    try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{});
}

const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
const Allocator = mem.Allocator;
const assert = std.debug.assert;
const testing = std.testing;

const uoffset = code_point.uoffset;

const code_point = @import("code_point");
const CodepointIterator = code_point.Iterator;
const ReverseCodepointIterator = code_point.ReverseIterator;
const CodePoint = code_point.CodePoint;

const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;