diff options
Diffstat (limited to 'src/Graphemes.zig')
| -rw-r--r-- | src/Graphemes.zig | 155 |
1 files changed, 63 insertions, 92 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 81d874c..d14b6ab 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig | |||
| @@ -3,70 +3,46 @@ | |||
| 3 | //! Code for handling graphemes: fragments of string which should be | 3 | //! Code for handling graphemes: fragments of string which should be |
| 4 | //! treated as one unit. Like Farmer Bob here: 👨🏻🌾 | 4 | //! treated as one unit. Like Farmer Bob here: 👨🏻🌾 |
| 5 | 5 | ||
| 6 | s1: []u16 = undefined, | ||
| 7 | s2: []u16 = undefined, | ||
| 8 | s3: []u8 = undefined, | ||
| 9 | |||
| 10 | const Graphemes = @This(); | 6 | const Graphemes = @This(); |
| 11 | 7 | ||
| 12 | pub fn init(allocator: Allocator) Allocator.Error!Graphemes { | 8 | const Data = struct { |
| 13 | var graphemes = Graphemes{}; | 9 | s1: []const u16 = undefined, |
| 14 | try graphemes.setup(allocator); | 10 | s2: []const u7 = undefined, |
| 15 | return graphemes; | 11 | s3: []const u8 = undefined, |
| 16 | } | 12 | }; |
| 17 | |||
| 18 | pub fn setup(graphemes: *Graphemes, allocator: Allocator) Allocator.Error!void { | ||
| 19 | const in_bytes = @embedFile("gbp"); | ||
| 20 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 21 | var reader = in_fbs.reader(); | ||
| 22 | |||
| 23 | const endian = builtin.cpu.arch.endian(); | ||
| 24 | |||
| 25 | const s1_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 26 | graphemes.s1 = try allocator.alloc(u16, s1_len); | ||
| 27 | errdefer allocator.free(graphemes.s1); | ||
| 28 | for (0..s1_len) |i| graphemes.s1[i] = reader.readInt(u16, endian) catch unreachable; | ||
| 29 | |||
| 30 | const s2_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 31 | graphemes.s2 = try allocator.alloc(u16, s2_len); | ||
| 32 | errdefer allocator.free(graphemes.s2); | ||
| 33 | for (0..s2_len) |i| graphemes.s2[i] = reader.readInt(u16, endian) catch unreachable; | ||
| 34 | |||
| 35 | const s3_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 36 | graphemes.s3 = try allocator.alloc(u8, s3_len); | ||
| 37 | errdefer allocator.free(graphemes.s3); | ||
| 38 | _ = reader.readAll(graphemes.s3) catch unreachable; | ||
| 39 | } | ||
| 40 | 13 | ||
| 41 | pub fn deinit(graphemes: *const Graphemes, allocator: Allocator) void { | 14 | const graphemes = graphemes: { |
| 42 | allocator.free(graphemes.s1); | 15 | const data = @import("gbp"); |
| 43 | allocator.free(graphemes.s2); | 16 | break :graphemes Data{ |
| 44 | allocator.free(graphemes.s3); | 17 | .s1 = &data.s1, |
| 45 | } | 18 | .s2 = &data.s2, |
| 19 | .s3 = &data.s3, | ||
| 20 | }; | ||
| 21 | }; | ||
| 46 | 22 | ||
| 47 | /// Lookup the grapheme break property for a code point. | 23 | /// Lookup the grapheme break property for a code point. |
| 48 | pub fn gbp(graphemes: Graphemes, cp: u21) Gbp { | 24 | pub fn gbp(cp: u21) Gbp { |
| 49 | return @enumFromInt(graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 4); | 25 | return @enumFromInt(graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 4); |
| 50 | } | 26 | } |
| 51 | 27 | ||
| 52 | /// Lookup the indic syllable type for a code point. | 28 | /// Lookup the indic syllable type for a code point. |
| 53 | pub fn indic(graphemes: Graphemes, cp: u21) Indic { | 29 | pub fn indic(cp: u21) Indic { |
| 54 | return @enumFromInt((graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); | 30 | return @enumFromInt((graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); |
| 55 | } | 31 | } |
| 56 | 32 | ||
| 57 | /// Lookup the emoji property for a code point. | 33 | /// Lookup the emoji property for a code point. |
| 58 | pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { | 34 | pub fn isEmoji(cp: u21) bool { |
| 59 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; | 35 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; |
| 60 | } | 36 | } |
| 61 | 37 | ||
| 62 | /// Returns an iterator over the graphemes in `string`. | 38 | /// Returns an iterator over the graphemes in `string`. |
| 63 | pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { | 39 | pub fn iterator(string: []const u8) Iterator { |
| 64 | return Iterator.init(string, graphemes); | 40 | return Iterator.init(string); |
| 65 | } | 41 | } |
| 66 | 42 | ||
| 67 | /// Returns a reverse iterator over the graphemes in `string`. | 43 | /// Returns a reverse iterator over the graphemes in `string`. |
| 68 | pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { | 44 | pub fn reverseIterator(string: []const u8) ReverseIterator { |
| 69 | return ReverseIterator.init(string, graphemes); | 45 | return ReverseIterator.init(string); |
| 70 | } | 46 | } |
| 71 | 47 | ||
| 72 | /// Indic syllable type. | 48 | /// Indic syllable type. |
| @@ -81,6 +57,7 @@ pub const Indic = enum { | |||
| 81 | /// Grapheme break property. | 57 | /// Grapheme break property. |
| 82 | pub const Gbp = enum { | 58 | pub const Gbp = enum { |
| 83 | none, | 59 | none, |
| 60 | |||
| 84 | Control, | 61 | Control, |
| 85 | CR, | 62 | CR, |
| 86 | Extend, | 63 | Extend, |
| @@ -117,7 +94,7 @@ pub const Grapheme = struct { | |||
| 117 | /// Returns the `Grapheme` at `string[index]`, which does not have to be a | 94 | /// Returns the `Grapheme` at `string[index]`, which does not have to be a |
| 118 | /// valid start of a codepoint. Asserts the string is not empty. Index must be | 95 | /// valid start of a codepoint. Asserts the string is not empty. Index must be |
| 119 | /// less than `string.len`. Always returns a `Grapheme`. | 96 | /// less than `string.len`. Always returns a `Grapheme`. |
| 120 | pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme { | 97 | pub fn graphemeAtIndex(string: []const u8, index: usize) Grapheme { |
| 121 | assert(string.len != 0); | 98 | assert(string.len != 0); |
| 122 | if (index == 0 or (index > 0 and | 99 | if (index == 0 or (index > 0 and |
| 123 | string[index] < 0x80 and | 100 | string[index] < 0x80 and |
| @@ -125,7 +102,7 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u | |||
| 125 | (string[index - 1] != '\r' and string[index] != '\n')) | 102 | (string[index - 1] != '\r' and string[index] != '\n')) |
| 126 | { | 103 | { |
| 127 | // There's always a grapheme break between two ASCII code points (except CRLF) | 104 | // There's always a grapheme break between two ASCII code points (except CRLF) |
| 128 | var iter = graphemes.iterator(string[index..]); | 105 | var iter = Graphemes.iterator(string[index..]); |
| 129 | const next = iter.next().?; | 106 | const next = iter.next().?; |
| 130 | return Grapheme{ | 107 | return Grapheme{ |
| 131 | .len = next.len, | 108 | .len = next.len, |
| @@ -134,14 +111,14 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u | |||
| 134 | } // Otherwise it gets hairy. | 111 | } // Otherwise it gets hairy. |
| 135 | const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; | 112 | const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; |
| 136 | if (idx == string.len) { | 113 | if (idx == string.len) { |
| 137 | var iter = graphemes.reverseIterator(string); | 114 | var iter = Graphemes.reverseIterator(string); |
| 138 | return iter.prev().?; | 115 | return iter.prev().?; |
| 139 | } | 116 | } |
| 140 | // We're on a valid codepoint boundary, we go back from here | 117 | // We're on a valid codepoint boundary, we go back from here |
| 141 | var r_iter = graphemes.reverseIterAtIndex(string, idx); | 118 | var r_iter = Graphemes.reverseIterAtIndex(string, idx); |
| 142 | if (r_iter.prev()) |g| { | 119 | if (r_iter.prev()) |g| { |
| 143 | if (g.offset == 0) { | 120 | if (g.offset == 0) { |
| 144 | var iter = graphemes.iterator(string); | 121 | var iter = Graphemes.iterator(string); |
| 145 | while (iter.next()) |g2| { | 122 | while (iter.next()) |g2| { |
| 146 | if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; | 123 | if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; |
| 147 | } | 124 | } |
| @@ -151,7 +128,7 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u | |||
| 151 | // we in fact need to be. | 128 | // we in fact need to be. |
| 152 | _ = r_iter.prev(); | 129 | _ = r_iter.prev(); |
| 153 | while (r_iter.pending != .none) : (_ = r_iter.prev()) {} | 130 | while (r_iter.pending != .none) : (_ = r_iter.prev()) {} |
| 154 | var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); | 131 | var iter = Graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); |
| 155 | while (iter.next()) |g| { | 132 | while (iter.next()) |g| { |
| 156 | if (g.offset <= idx and idx < g.offset + g.len) return g; | 133 | if (g.offset <= idx and idx < g.offset + g.len) return g; |
| 157 | } | 134 | } |
| @@ -159,23 +136,22 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u | |||
| 159 | } | 136 | } |
| 160 | 137 | ||
| 161 | /// Return a (forward) iterator of `string` after `grapheme`. | 138 | /// Return a (forward) iterator of `string` after `grapheme`. |
| 162 | pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator { | 139 | pub fn iterateAfterGrapheme(string: []const u8, grapheme: Grapheme) Iterator { |
| 163 | return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); | 140 | return Graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); |
| 164 | } | 141 | } |
| 165 | 142 | ||
| 166 | /// Return a reverse iterator of `string` before `grapheme`. | 143 | /// Return a reverse iterator of `string` before `grapheme`. |
| 167 | pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator { | 144 | pub fn iterateBeforeGrapheme(string: []const u8, grapheme: Grapheme) ReverseIterator { |
| 168 | // This bit of weirdness is because reverse iterators are "advance last", | 145 | // This bit of weirdness is because reverse iterators are "advance last", |
| 169 | // while forward iterators are "advance first". This leaves some room for | 146 | // while forward iterators are "advance first". This leaves some room for |
| 170 | // further optimization, if anyone dares. | 147 | // further optimization, if anyone dares. |
| 171 | var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); | 148 | var r_iter = Graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); |
| 172 | _ = r_iter.prev(); | 149 | _ = r_iter.prev(); |
| 173 | return r_iter; | 150 | return r_iter; |
| 174 | } | 151 | } |
| 175 | 152 | ||
| 176 | fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator { | 153 | fn reverseIterAtIndex(string: []const u8, idx: uoffset) ReverseIterator { |
| 177 | var r_iter: ReverseIterator = undefined; | 154 | var r_iter: ReverseIterator = undefined; |
| 178 | r_iter.data = graphemes; | ||
| 179 | var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; | 155 | var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; |
| 180 | r_iter.buf[1] = rcp_iter.prev(); | 156 | r_iter.buf[1] = rcp_iter.prev(); |
| 181 | r_iter.buf[0] = rcp_iter.prev(); | 157 | r_iter.buf[0] = rcp_iter.prev(); |
| @@ -184,9 +160,8 @@ fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoff | |||
| 184 | return r_iter; | 160 | return r_iter; |
| 185 | } | 161 | } |
| 186 | 162 | ||
| 187 | fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator { | 163 | fn iterAtIndex(string: []const u8, idx: uoffset) Iterator { |
| 188 | var iter: Iterator = undefined; | 164 | var iter: Iterator = undefined; |
| 189 | iter.data = graphemes; | ||
| 190 | iter.buf[0] = first: { | 165 | iter.buf[0] = first: { |
| 191 | if (idx == string.len) break :first null; | 166 | if (idx == string.len) break :first null; |
| 192 | var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; | 167 | var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; |
| @@ -202,13 +177,12 @@ fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) It | |||
| 202 | pub const Iterator = struct { | 177 | pub const Iterator = struct { |
| 203 | buf: [2]?CodePoint = .{ null, null }, | 178 | buf: [2]?CodePoint = .{ null, null }, |
| 204 | cp_iter: CodePointIterator, | 179 | cp_iter: CodePointIterator, |
| 205 | data: *const Graphemes, | ||
| 206 | 180 | ||
| 207 | const Self = @This(); | 181 | const Self = @This(); |
| 208 | 182 | ||
| 209 | /// Assumes `str` is valid UTF-8. | 183 | /// Assumes `str` is valid UTF-8. |
| 210 | pub fn init(str: []const u8, data: *const Graphemes) Self { | 184 | pub fn init(str: []const u8) Self { |
| 211 | var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; | 185 | var self = Self{ .cp_iter = .{ .bytes = str } }; |
| 212 | self.advance(); | 186 | self.advance(); |
| 213 | return self; | 187 | return self; |
| 214 | } | 188 | } |
| @@ -237,7 +211,6 @@ pub const Iterator = struct { | |||
| 237 | if (graphemeBreak( | 211 | if (graphemeBreak( |
| 238 | self.buf[0].?.code, | 212 | self.buf[0].?.code, |
| 239 | self.buf[1].?.code, | 213 | self.buf[1].?.code, |
| 240 | self.data, | ||
| 241 | &state, | 214 | &state, |
| 242 | )) return Grapheme{ .len = gc_len, .offset = gc_start }; | 215 | )) return Grapheme{ .len = gc_len, .offset = gc_start }; |
| 243 | 216 | ||
| @@ -250,7 +223,6 @@ pub const Iterator = struct { | |||
| 250 | if (graphemeBreak( | 223 | if (graphemeBreak( |
| 251 | self.buf[0].?.code, | 224 | self.buf[0].?.code, |
| 252 | if (self.buf[1]) |ncp| ncp.code else 0, | 225 | if (self.buf[1]) |ncp| ncp.code else 0, |
| 253 | self.data, | ||
| 254 | &state, | 226 | &state, |
| 255 | )) break; | 227 | )) break; |
| 256 | } | 228 | } |
| @@ -275,7 +247,6 @@ pub const Iterator = struct { | |||
| 275 | pub const ReverseIterator = struct { | 247 | pub const ReverseIterator = struct { |
| 276 | buf: [2]?CodePoint = .{ null, null }, | 248 | buf: [2]?CodePoint = .{ null, null }, |
| 277 | cp_iter: CodePointReverseIterator, | 249 | cp_iter: CodePointReverseIterator, |
| 278 | data: *const Graphemes, | ||
| 279 | /// Codepoint read from `cp_iter` but not returned by `previous` | 250 | /// Codepoint read from `cp_iter` but not returned by `previous` |
| 280 | pending: Pending = .none, | 251 | pending: Pending = .none, |
| 281 | 252 | ||
| @@ -289,8 +260,8 @@ pub const ReverseIterator = struct { | |||
| 289 | 260 | ||
| 290 | const Self = @This(); | 261 | const Self = @This(); |
| 291 | 262 | ||
| 292 | pub fn init(str: []const u8, data: *const Graphemes) Self { | 263 | pub fn init(str: []const u8) Self { |
| 293 | var self: Self = .{ .cp_iter = .init(str), .data = data }; | 264 | var self: Self = .{ .cp_iter = .init(str) }; |
| 294 | self.advance(); | 265 | self.advance(); |
| 295 | self.advance(); | 266 | self.advance(); |
| 296 | return self; | 267 | return self; |
| @@ -352,7 +323,6 @@ pub const ReverseIterator = struct { | |||
| 352 | if (graphemeBreak( | 323 | if (graphemeBreak( |
| 353 | self.buf[0].?.code, | 324 | self.buf[0].?.code, |
| 354 | self.buf[1].?.code, | 325 | self.buf[1].?.code, |
| 355 | self.data, | ||
| 356 | &state, | 326 | &state, |
| 357 | )) break; | 327 | )) break; |
| 358 | 328 | ||
| @@ -374,7 +344,7 @@ pub const ReverseIterator = struct { | |||
| 374 | 344 | ||
| 375 | const codepoint = self.buf[0].?; | 345 | const codepoint = self.buf[0].?; |
| 376 | 346 | ||
| 377 | switch (self.data.indic(codepoint.code)) { | 347 | switch (Graphemes.indic(codepoint.code)) { |
| 378 | .Extend, .Linker => { | 348 | .Extend, .Linker => { |
| 379 | self.advance(); | 349 | self.advance(); |
| 380 | continue :indic; | 350 | continue :indic; |
| @@ -387,7 +357,7 @@ pub const ReverseIterator = struct { | |||
| 387 | if (self.buf[0]) |cp1| { | 357 | if (self.buf[0]) |cp1| { |
| 388 | state.indic = true; | 358 | state.indic = true; |
| 389 | 359 | ||
| 390 | if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; | 360 | if (graphemeBreak(cp1.code, self.buf[1].?.code, &state)) break; |
| 391 | 361 | ||
| 392 | if (!state.indic) { | 362 | if (!state.indic) { |
| 393 | continue :indic; | 363 | continue :indic; |
| @@ -426,12 +396,12 @@ pub const ReverseIterator = struct { | |||
| 426 | 396 | ||
| 427 | const codepoint = self.buf[0].?; | 397 | const codepoint = self.buf[0].?; |
| 428 | 398 | ||
| 429 | if (self.data.gbp(codepoint.code) == .Extend) { | 399 | if (Graphemes.gbp(codepoint.code) == .Extend) { |
| 430 | self.advance(); | 400 | self.advance(); |
| 431 | continue :emoji; | 401 | continue :emoji; |
| 432 | } | 402 | } |
| 433 | 403 | ||
| 434 | if (self.data.isEmoji(codepoint.code)) { | 404 | if (Graphemes.isEmoji(codepoint.code)) { |
| 435 | // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* | 405 | // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* |
| 436 | emoji_offset = codepoint.offset; | 406 | emoji_offset = codepoint.offset; |
| 437 | self.advance(); | 407 | self.advance(); |
| @@ -462,7 +432,7 @@ pub const ReverseIterator = struct { | |||
| 462 | if (state.regional) { | 432 | if (state.regional) { |
| 463 | var ri_count: usize = 0; | 433 | var ri_count: usize = 0; |
| 464 | while (self.buf[0] != null and | 434 | while (self.buf[0] != null and |
| 465 | self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) | 435 | Graphemes.gbp(self.buf[0].?.code) == .Regional_Indicator) |
| 466 | { | 436 | { |
| 467 | ri_count += 1; | 437 | ri_count += 1; |
| 468 | self.advance(); | 438 | self.advance(); |
| @@ -500,10 +470,13 @@ pub const IterState = packed struct(u3) { | |||
| 500 | indic: bool = false, | 470 | indic: bool = false, |
| 501 | }; | 471 | }; |
| 502 | 472 | ||
| 473 | // TODO: isBreaker is also expensive given the data is already available, | ||
| 474 | // and should be "semantically inlined" wherever it belongs. | ||
| 475 | |||
| 503 | // Predicates | 476 | // Predicates |
| 504 | fn isBreaker(cp: u21, data: *const Graphemes) bool { | 477 | fn isBreaker(cp: u21) bool { |
| 505 | // Extract relevant properties. | 478 | // Extract relevant properties. |
| 506 | const cp_gbp_prop = data.gbp(cp); | 479 | const cp_gbp_prop = Graphemes.gbp(cp); |
| 507 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | 480 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; |
| 508 | } | 481 | } |
| 509 | 482 | ||
| @@ -516,17 +489,20 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool { | |||
| 516 | pub fn graphemeBreak( | 489 | pub fn graphemeBreak( |
| 517 | cp1: u21, | 490 | cp1: u21, |
| 518 | cp2: u21, | 491 | cp2: u21, |
| 519 | data: *const Graphemes, | ||
| 520 | state: *IterState, | 492 | state: *IterState, |
| 521 | ) bool { | 493 | ) bool { |
| 494 | // TODO: it's silly to index the same field three times and | ||
| 495 | // just extract different bits from the data. Optimizable? Maybe | ||
| 496 | // but it's silly to rely on that. | ||
| 497 | // | ||
| 522 | // Extract relevant properties. | 498 | // Extract relevant properties. |
| 523 | const cp1_gbp_prop = data.gbp(cp1); | 499 | const cp1_gbp_prop = Graphemes.gbp(cp1); |
| 524 | const cp1_indic_prop = data.indic(cp1); | 500 | const cp1_indic_prop = Graphemes.indic(cp1); |
| 525 | const cp1_is_emoji = data.isEmoji(cp1); | 501 | const cp1_is_emoji = Graphemes.isEmoji(cp1); |
| 526 | 502 | ||
| 527 | const cp2_gbp_prop = data.gbp(cp2); | 503 | const cp2_gbp_prop = Graphemes.gbp(cp2); |
| 528 | const cp2_indic_prop = data.indic(cp2); | 504 | const cp2_indic_prop = Graphemes.indic(cp2); |
| 529 | const cp2_is_emoji = data.isEmoji(cp2); | 505 | const cp2_is_emoji = Graphemes.isEmoji(cp2); |
| 530 | 506 | ||
| 531 | // GB11: Emoji Extend* ZWJ x Emoji | 507 | // GB11: Emoji Extend* ZWJ x Emoji |
| 532 | if (!state.xpic and cp1_is_emoji) state.xpic = true; | 508 | if (!state.xpic and cp1_is_emoji) state.xpic = true; |
| @@ -537,7 +513,7 @@ pub fn graphemeBreak( | |||
| 537 | if (cp1 == '\r' and cp2 == '\n') return false; | 513 | if (cp1 == '\r' and cp2 == '\n') return false; |
| 538 | 514 | ||
| 539 | // GB4: Control | 515 | // GB4: Control |
| 540 | if (isBreaker(cp1, data)) return true; | 516 | if (isBreaker(cp1)) return true; |
| 541 | 517 | ||
| 542 | // GB11: Emoji Extend* ZWJ x Emoji | 518 | // GB11: Emoji Extend* ZWJ x Emoji |
| 543 | if (state.xpic and | 519 | if (state.xpic and |
| @@ -555,7 +531,7 @@ pub fn graphemeBreak( | |||
| 555 | if (cp2_gbp_prop == .SpacingMark) return false; | 531 | if (cp2_gbp_prop == .SpacingMark) return false; |
| 556 | 532 | ||
| 557 | // GB9b: Prepend x | 533 | // GB9b: Prepend x |
| 558 | if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false; | 534 | if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false; |
| 559 | 535 | ||
| 560 | // GB12, GB13: RI x RI | 536 | // GB12, GB13: RI x RI |
| 561 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { | 537 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { |
| @@ -620,25 +596,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 620 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | 596 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; |
| 621 | const no_joiner = seq_1 ++ seq_2; | 597 | const no_joiner = seq_1 ++ seq_2; |
| 622 | 598 | ||
| 623 | const graphemes = try Graphemes.init(std.testing.allocator); | ||
| 624 | defer graphemes.deinit(std.testing.allocator); | ||
| 625 | |||
| 626 | { | 599 | { |
| 627 | var iter = graphemes.iterator(with_zwj); | 600 | var iter = Graphemes.iterator(with_zwj); |
| 628 | var i: usize = 0; | 601 | var i: usize = 0; |
| 629 | while (iter.next()) |_| : (i += 1) {} | 602 | while (iter.next()) |_| : (i += 1) {} |
| 630 | try std.testing.expectEqual(@as(usize, 1), i); | 603 | try std.testing.expectEqual(@as(usize, 1), i); |
| 631 | } | 604 | } |
| 632 | 605 | ||
| 633 | { | 606 | { |
| 634 | var iter = graphemes.iterator(with_zwsp); | 607 | var iter = Graphemes.iterator(with_zwsp); |
| 635 | var i: usize = 0; | 608 | var i: usize = 0; |
| 636 | while (iter.next()) |_| : (i += 1) {} | 609 | while (iter.next()) |_| : (i += 1) {} |
| 637 | try std.testing.expectEqual(@as(usize, 3), i); | 610 | try std.testing.expectEqual(@as(usize, 3), i); |
| 638 | } | 611 | } |
| 639 | 612 | ||
| 640 | { | 613 | { |
| 641 | var iter = graphemes.iterator(no_joiner); | 614 | var iter = Graphemes.iterator(no_joiner); |
| 642 | var i: usize = 0; | 615 | var i: usize = 0; |
| 643 | while (iter.next()) |_| : (i += 1) {} | 616 | while (iter.next()) |_| : (i += 1) {} |
| 644 | try std.testing.expectEqual(@as(usize, 2), i); | 617 | try std.testing.expectEqual(@as(usize, 2), i); |
| @@ -647,10 +620,8 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 647 | 620 | ||
| 648 | test "Iterator.peek" { | 621 | test "Iterator.peek" { |
| 649 | const peek_seq = "aΔ👨🏻🌾→"; | 622 | const peek_seq = "aΔ👨🏻🌾→"; |
| 650 | const data = try Graphemes.init(std.testing.allocator); | ||
| 651 | defer data.deinit(std.testing.allocator); | ||
| 652 | 623 | ||
| 653 | var iter = data.iterator(peek_seq); | 624 | var iter = Graphemes.iterator(peek_seq); |
| 654 | const peek_a = iter.peek().?; | 625 | const peek_a = iter.peek().?; |
| 655 | const next_a = iter.next().?; | 626 | const next_a = iter.next().?; |
| 656 | try std.testing.expectEqual(peek_a, next_a); | 627 | try std.testing.expectEqual(peek_a, next_a); |