diff options
| author | 2024-02-11 19:08:34 -0400 | |
|---|---|---|
| committer | 2024-02-11 19:08:34 -0400 | |
| commit | ed5ce42ba8fc67db8a8bb385490ccec7a218a7e3 (patch) | |
| tree | 807136431dd297ccc273895f09bc7152e5347bb2 /src/Grapheme.zig | |
| download | zg-ed5ce42ba8fc67db8a8bb385490ccec7a218a7e3.tar.gz zg-ed5ce42ba8fc67db8a8bb385490ccec7a218a7e3.tar.xz zg-ed5ce42ba8fc67db8a8bb385490ccec7a218a7e3.zip | |
init
Diffstat (limited to 'src/Grapheme.zig')
| -rw-r--r-- | src/Grapheme.zig | 220 |
1 files changed, 220 insertions, 0 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig new file mode 100644 index 0000000..a8a7638 --- /dev/null +++ b/src/Grapheme.zig | |||
| @@ -0,0 +1,220 @@ | |||
| 1 | //! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | ||
| 2 | |||
| 3 | const std = @import("std"); | ||
| 4 | const unicode = std.unicode; | ||
| 5 | |||
| 6 | const CodePoint = @import("ziglyph").CodePoint; | ||
| 7 | const CodePointIterator = CodePoint.CodePointIterator; | ||
| 8 | const emoji = @import("ziglyph").emoji; | ||
| 9 | |||
| 10 | const gbp = @import("gbp"); | ||
| 11 | |||
| 12 | pub const Grapheme = @This(); | ||
| 13 | |||
| 14 | len: usize, | ||
| 15 | offset: usize, | ||
| 16 | |||
/// `eql` reports whether the bytes of this grapheme cluster, located in `src`,
/// are equal to `other`.
pub fn eql(self: Grapheme, src: []const u8, other: []const u8) bool {
    const cluster_bytes = self.slice(src);
    return std.mem.eql(u8, cluster_bytes, other);
}
| 21 | |||
/// `slice` returns the sub-slice of `src` covered by this grapheme cluster,
/// i.e. `len` bytes starting at `offset`.
pub fn slice(self: Grapheme, src: []const u8) []const u8 {
    const start = self.offset;
    const end = start + self.len;
    return src[start..end];
}
| 26 | |||
/// `GraphemeIterator` iterates a string of UTF-8 encoded bytes one grapheme
/// cluster at a time.
pub const GraphemeIterator = struct {
    /// Two-slot window over the code point stream: `buf[0]` is the current
    /// code point and `buf[1]` is the one-code-point lookahead.
    buf: [2]?CodePoint = .{ null, null },
    cp_iter: CodePointIterator,

    const Self = @This();

    /// Creates an iterator over `str` and pre-loads the lookahead slot.
    /// Assumes `str` is valid UTF-8.
    pub fn init(str: []const u8) Self {
        var iter = Self{ .cp_iter = .{ .bytes = str } };
        iter.buf[1] = iter.cp_iter.next();

        return iter;
    }

    /// Slides the window forward by one code point.
    fn advance(self: *Self) void {
        self.buf[0] = self.buf[1];
        self.buf[1] = self.cp_iter.next();
    }

    /// Returns the next grapheme cluster, or `null` once the input is
    /// exhausted.
    pub fn next(self: *Self) ?Grapheme {
        self.advance();

        // Nothing left to read.
        const first = self.buf[0] orelse return null;
        var cluster = Grapheme{ .len = first.len, .offset = first.offset };

        // The last code point forms a cluster on its own.
        const second = self.buf[1] orelse return cluster;

        // The break state starts fresh for every cluster.
        var state: u3 = 0;
        if (graphemeBreak(first.code, second.code, &state)) return cluster;

        // Keep absorbing code points until a break is reported or the input
        // runs out.
        while (true) {
            self.advance();
            const current = self.buf[0] orelse break;

            cluster.len += current.len;

            // At the end of input, 0 stands in for the missing lookahead.
            const upcoming: u21 = if (self.buf[1]) |ncp| ncp.code else 0;
            if (graphemeBreak(current.code, upcoming, &state)) break;
        }

        return cluster;
    }
};
| 80 | |||
| 81 | // Predicates | ||
/// Returns true for CR, LF, and any code point `gbp` classifies as Control.
fn isBreaker(cp: u21) bool {
    return switch (cp) {
        '\x0d', '\x0a' => true,
        else => gbp.isControl(cp),
    };
}
| 85 | |||
/// Returns true for Extend and SpacingMark code points (per `gbp`) and for
/// U+200D.
fn isIgnorable(cp: u21) bool {
    if (gbp.isExtend(cp)) return true;
    if (gbp.isSpacingmark(cp)) return true;
    return cp == '\u{200d}';
}
| 89 | |||
// Verifies that `GraphemeIterator` can run at comptime and splits "Héllo"
// into exactly the expected clusters.
test "Segmentation comptime GraphemeIterator" {
    const want = [_][]const u8{ "H", "é", "l", "l", "o" };

    comptime {
        const src = "Héllo";
        var ct_iter = GraphemeIterator.init(src);
        var i = 0;
        while (ct_iter.next()) |grapheme| : (i += 1) {
            try std.testing.expect(grapheme.eql(src, want[i]));
        }

        // Without this check an iterator that stops early (or yields nothing)
        // would pass vacuously, since the loop body only runs per cluster.
        try std.testing.expect(i == want.len);
    }
}
| 102 | |||
// Counts the grapheme clusters produced for two identical emoji ZWJ sequences
// joined by U+200D, separated by U+200B, and placed back to back.
test "Segmentation ZWJ and ZWSP emoji sequences" {
    const emoji_seq = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";

    const cases = [_]struct { src: []const u8, want: usize }{
        // Joined with ZWJ: everything is one cluster.
        .{ .src = emoji_seq ++ "\u{200D}" ++ emoji_seq, .want = 1 },
        // Separated by ZWSP: sequence, ZWSP, sequence.
        .{ .src = emoji_seq ++ "\u{200B}" ++ emoji_seq, .want = 3 },
        // No joiner: two separate clusters.
        .{ .src = emoji_seq ++ emoji_seq, .want = 2 },
    };

    for (cases) |case| {
        var iter = GraphemeIterator.init(case.src);
        var count: usize = 0;
        while (iter.next()) |_| count += 1;
        try std.testing.expectEqual(case.want, count);
    }
}
| 125 | |||
| 126 | // Grapheme break state. | ||
/// Returns true when the Extended_Pictographic flag (bit 0) is set in `state`.
fn hasXpic(state: *const u3) bool {
    return (state.* & 0b001) != 0;
}
| 130 | |||
/// Sets the Extended_Pictographic flag (bit 0) in `state`, leaving the other
/// bits untouched.
fn setXpic(state: *u3) void {
    state.* = state.* | 0b001;
}
| 134 | |||
/// Clears the Extended_Pictographic flag (bit 0) in `state`, leaving the
/// other bits untouched.
///
/// Uses AND-with-inverted-mask rather than XOR so the call is idempotent:
/// clearing an already-clear flag must not set it.
fn unsetXpic(state: *u3) void {
    state.* &= ~@as(u3, 0b001);
}
| 138 | |||
/// Returns true when the regional-indicator flag (bit 1) is set in `state`.
fn hasRegional(state: *const u3) bool {
    return (state.* & 0b010) != 0;
}
| 142 | |||
/// Sets the regional-indicator flag (bit 1) in `state`, leaving the other
/// bits untouched.
fn setRegional(state: *u3) void {
    state.* = state.* | 0b010;
}
| 146 | |||
/// Clears the regional-indicator flag (bit 1) in `state`, leaving the other
/// bits untouched.
///
/// Uses AND-with-inverted-mask rather than XOR so the call is idempotent:
/// clearing an already-clear flag must not set it.
fn unsetRegional(state: *u3) void {
    state.* &= ~@as(u3, 0b010);
}
| 150 | |||
/// `graphemeBreak` decides whether a grapheme cluster boundary falls between
/// the adjacent code points `cp1` and `cp2`; it returns true only when a
/// break is required there.
///
/// `state` carries flags from one call to the next and should start out as 0.
/// When walking a sequence of code points, this function must be called
/// IN ORDER on ALL potential breaks in the string, because earlier calls
/// update `state` for later ones.
///
/// Modeled after the API of utf8proc's `utf8proc_grapheme_break_stateful`.
/// https://github.com/JuliaStrings/utf8proc/blob/2bbb1ba932f727aad1fab14fafdbc89ff9dc4604/utf8proc.h#L599-L617
pub fn graphemeBreak(
    cp1: u21,
    cp2: u21,
    state: *u3,
) bool {
    // GB11 bookkeeping: remember that an Extended_Pictographic was seen so a
    // later `ZWJ x Extended_Pictographic` pair can be joined.
    if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state);

    // GB3: CR x LF
    if (cp1 == '\r' and cp2 == '\n') return false;

    // GB4: always break after CR, LF, or Control.
    if (isBreaker(cp1)) return true;

    // GB6: Hangul L x (L | V | LV | LVT)
    if (gbp.isL(cp1) and
        (gbp.isL(cp2) or gbp.isV(cp2) or gbp.isLv(cp2) or gbp.isLvt(cp2))) return false;

    // GB7: Hangul (LV | V) x (V | T)
    if ((gbp.isLv(cp1) or gbp.isV(cp1)) and
        (gbp.isV(cp2) or gbp.isT(cp2))) return false;

    // GB8: Hangul (LVT | T) x T
    if ((gbp.isLvt(cp1) or gbp.isT(cp1)) and gbp.isT(cp2)) return false;

    // GB9: x (Extend | ZWJ), GB9a: x SpacingMark
    if (gbp.isExtend(cp2) or gbp.isZwj(cp2) or gbp.isSpacingmark(cp2)) return false;

    // GB9b: Prepend x (unless `cp2` is CR, LF, or Control)
    if (gbp.isPrepend(cp1) and !isBreaker(cp2)) return false;

    // GB12, GB13: RI x RI. The first RI pair is joined and flagged in
    // `state`; when the flag is already set, it is cleared and a break is
    // reported, so regional indicators group two at a time.
    if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) {
        const pair_complete = hasRegional(state);
        if (pair_complete) unsetRegional(state) else setRegional(state);
        return pair_complete;
    }

    // GB11: Emoji Extend* ZWJ x Emoji
    if (hasXpic(state) and gbp.isZwj(cp1) and emoji.isExtendedPictographic(cp2)) {
        unsetXpic(state);
        return false;
    }

    // No rule above applied: break.
    return true;
}