diff options
Diffstat (limited to 'src/grapheme.zig')
| -rw-r--r-- | src/grapheme.zig | 73 |
1 files changed, 32 insertions, 41 deletions
diff --git a/src/grapheme.zig b/src/grapheme.zig index 3fdf10b..7125b5b 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig | |||
| @@ -1,9 +1,10 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const mem = std.mem; | ||
| 2 | const unicode = std.unicode; | 3 | const unicode = std.unicode; |
| 3 | 4 | ||
| 4 | const CodePoint = @import("code_point").CodePoint; | 5 | const CodePoint = @import("code_point").CodePoint; |
| 5 | const CodePointIterator = @import("code_point").Iterator; | 6 | const CodePointIterator = @import("code_point").Iterator; |
| 6 | const gbp = @import("gbp"); | 7 | pub const Data = @import("GraphemeData"); |
| 7 | 8 | ||
| 8 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | 9 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. |
| 9 | pub const Grapheme = struct { | 10 | pub const Grapheme = struct { |
| @@ -21,12 +22,13 @@ pub const Grapheme = struct { | |||
| 21 | pub const Iterator = struct { | 22 | pub const Iterator = struct { |
| 22 | buf: [2]?CodePoint = .{ null, null }, | 23 | buf: [2]?CodePoint = .{ null, null }, |
| 23 | cp_iter: CodePointIterator, | 24 | cp_iter: CodePointIterator, |
| 25 | data: *Data, | ||
| 24 | 26 | ||
| 25 | const Self = @This(); | 27 | const Self = @This(); |
| 26 | 28 | ||
| 27 | /// Assumes `str` is valid UTF-8. | 29 | /// Assumes `str` is valid UTF-8. |
| 28 | pub fn init(str: []const u8) Self { | 30 | pub fn init(str: []const u8, data: *Data) Self { |
| 29 | var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; | 31 | var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; |
| 30 | self.advance(); | 32 | self.advance(); |
| 31 | return self; | 33 | return self; |
| 32 | } | 34 | } |
| @@ -55,6 +57,7 @@ pub const Iterator = struct { | |||
| 55 | if (graphemeBreak( | 57 | if (graphemeBreak( |
| 56 | self.buf[0].?.code, | 58 | self.buf[0].?.code, |
| 57 | self.buf[1].?.code, | 59 | self.buf[1].?.code, |
| 60 | self.data, | ||
| 58 | &state, | 61 | &state, |
| 59 | )) return Grapheme{ .len = gc_len, .offset = gc_start }; | 62 | )) return Grapheme{ .len = gc_len, .offset = gc_start }; |
| 60 | 63 | ||
| @@ -67,6 +70,7 @@ pub const Iterator = struct { | |||
| 67 | if (graphemeBreak( | 70 | if (graphemeBreak( |
| 68 | self.buf[0].?.code, | 71 | self.buf[0].?.code, |
| 69 | if (self.buf[1]) |ncp| ncp.code else 0, | 72 | if (self.buf[1]) |ncp| ncp.code else 0, |
| 73 | self.data, | ||
| 70 | &state, | 74 | &state, |
| 71 | )) break; | 75 | )) break; |
| 72 | } | 76 | } |
| @@ -76,18 +80,12 @@ pub const Iterator = struct { | |||
| 76 | }; | 80 | }; |
| 77 | 81 | ||
| 78 | // Predicates | 82 | // Predicates |
| 79 | fn isBreaker(cp: u21) bool { | 83 | fn isBreaker(cp: u21, data: *Data) bool { |
| 80 | // Extract relevant properties. | 84 | // Extract relevant properties. |
| 81 | const cp_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; | 85 | const cp_gbp_prop = data.gbp(cp); |
| 82 | const cp_gbp_prop: gbp.Gbp = @enumFromInt(cp_props_byte >> 4); | ||
| 83 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | 86 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; |
| 84 | } | 87 | } |
| 85 | 88 | ||
| 86 | fn isIgnorable(cp: u21) bool { | ||
| 87 | const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; | ||
| 88 | return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; | ||
| 89 | } | ||
| 90 | |||
| 91 | // Grapheme break state. | 89 | // Grapheme break state. |
| 92 | const State = struct { | 90 | const State = struct { |
| 93 | bits: u3 = 0, | 91 | bits: u3 = 0, |
| @@ -135,18 +133,17 @@ const State = struct { | |||
| 135 | pub fn graphemeBreak( | 133 | pub fn graphemeBreak( |
| 136 | cp1: u21, | 134 | cp1: u21, |
| 137 | cp2: u21, | 135 | cp2: u21, |
| 136 | data: *Data, | ||
| 138 | state: *State, | 137 | state: *State, |
| 139 | ) bool { | 138 | ) bool { |
| 140 | // Extract relevant properties. | 139 | // Extract relevant properties. |
| 141 | const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; | 140 | const cp1_gbp_prop = data.gbp(cp1); |
| 142 | const cp1_gbp_prop: gbp.Gbp = @enumFromInt(cp1_props_byte >> 4); | 141 | const cp1_indic_prop = data.indic(cp1); |
| 143 | const cp1_indic_prop: gbp.Indic = @enumFromInt((cp1_props_byte >> 1) & 0x7); | 142 | const cp1_is_emoji = data.isEmoji(cp1); |
| 144 | const cp1_is_emoji = cp1_props_byte & 1 == 1; | ||
| 145 | 143 | ||
| 146 | const cp2_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; | 144 | const cp2_gbp_prop = data.gbp(cp2); |
| 147 | const cp2_gbp_prop: gbp.Gbp = @enumFromInt(cp2_props_byte >> 4); | 145 | const cp2_indic_prop = data.indic(cp2); |
| 148 | const cp2_indic_prop: gbp.Indic = @enumFromInt((cp2_props_byte >> 1) & 0x7); | 146 | const cp2_is_emoji = data.isEmoji(cp2); |
| 149 | const cp2_is_emoji = cp2_props_byte & 1 == 1; | ||
| 150 | 147 | ||
| 151 | // GB11: Emoji Extend* ZWJ x Emoji | 148 | // GB11: Emoji Extend* ZWJ x Emoji |
| 152 | if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); | 149 | if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); |
| @@ -157,7 +154,7 @@ pub fn graphemeBreak( | |||
| 157 | if (cp1 == '\r' and cp2 == '\n') return false; | 154 | if (cp1 == '\r' and cp2 == '\n') return false; |
| 158 | 155 | ||
| 159 | // GB4: Control | 156 | // GB4: Control |
| 160 | if (isBreaker(cp1)) return true; | 157 | if (isBreaker(cp1, data)) return true; |
| 161 | 158 | ||
| 162 | // GB11: Emoji Extend* ZWJ x Emoji | 159 | // GB11: Emoji Extend* ZWJ x Emoji |
| 163 | if (state.hasXpic() and | 160 | if (state.hasXpic() and |
| @@ -175,7 +172,7 @@ pub fn graphemeBreak( | |||
| 175 | if (cp2_gbp_prop == .SpacingMark) return false; | 172 | if (cp2_gbp_prop == .SpacingMark) return false; |
| 176 | 173 | ||
| 177 | // GB9b: Prepend x | 174 | // GB9b: Prepend x |
| 178 | if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false; | 175 | if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false; |
| 179 | 176 | ||
| 180 | // GB12, GB13: RI x RI | 177 | // GB12, GB13: RI x RI |
| 181 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { | 178 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { |
| @@ -240,6 +237,9 @@ test "Segmentation GraphemeIterator" { | |||
| 240 | var buf_reader = std.io.bufferedReader(file.reader()); | 237 | var buf_reader = std.io.bufferedReader(file.reader()); |
| 241 | var input_stream = buf_reader.reader(); | 238 | var input_stream = buf_reader.reader(); |
| 242 | 239 | ||
| 240 | var data = try Data.init(allocator); | ||
| 241 | defer data.deinit(); | ||
| 242 | |||
| 243 | var buf: [4096]u8 = undefined; | 243 | var buf: [4096]u8 = undefined; |
| 244 | var line_no: usize = 1; | 244 | var line_no: usize = 1; |
| 245 | 245 | ||
| @@ -282,7 +282,7 @@ test "Segmentation GraphemeIterator" { | |||
| 282 | } | 282 | } |
| 283 | 283 | ||
| 284 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); | 284 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); |
| 285 | var iter = Iterator.init(all_bytes.items); | 285 | var iter = Iterator.init(all_bytes.items, &data); |
| 286 | 286 | ||
| 287 | // Check. | 287 | // Check. |
| 288 | for (want.items) |want_gc| { | 288 | for (want.items) |want_gc| { |
| @@ -295,19 +295,6 @@ test "Segmentation GraphemeIterator" { | |||
| 295 | } | 295 | } |
| 296 | } | 296 | } |
| 297 | 297 | ||
| 298 | test "Segmentation comptime GraphemeIterator" { | ||
| 299 | const want = [_][]const u8{ "H", "é", "l", "l", "o" }; | ||
| 300 | |||
| 301 | comptime { | ||
| 302 | const src = "Héllo"; | ||
| 303 | var ct_iter = Iterator.init(src); | ||
| 304 | var i = 0; | ||
| 305 | while (ct_iter.next()) |grapheme| : (i += 1) { | ||
| 306 | try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]); | ||
| 307 | } | ||
| 308 | } | ||
| 309 | } | ||
| 310 | |||
| 311 | test "Segmentation ZWJ and ZWSP emoji sequences" { | 298 | test "Segmentation ZWJ and ZWSP emoji sequences" { |
| 312 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 299 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| 313 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 300 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| @@ -315,18 +302,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 315 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | 302 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; |
| 316 | const no_joiner = seq_1 ++ seq_2; | 303 | const no_joiner = seq_1 ++ seq_2; |
| 317 | 304 | ||
| 318 | var ct_iter = Iterator.init(with_zwj); | 305 | var data = try Data.init(std.testing.allocator); |
| 306 | defer data.deinit(); | ||
| 307 | |||
| 308 | var iter = Iterator.init(with_zwj, &data); | ||
| 309 | |||
| 319 | var i: usize = 0; | 310 | var i: usize = 0; |
| 320 | while (ct_iter.next()) |_| : (i += 1) {} | 311 | while (iter.next()) |_| : (i += 1) {} |
| 321 | try std.testing.expectEqual(@as(usize, 1), i); | 312 | try std.testing.expectEqual(@as(usize, 1), i); |
| 322 | 313 | ||
| 323 | ct_iter = Iterator.init(with_zwsp); | 314 | iter = Iterator.init(with_zwsp, &data); |
| 324 | i = 0; | 315 | i = 0; |
| 325 | while (ct_iter.next()) |_| : (i += 1) {} | 316 | while (iter.next()) |_| : (i += 1) {} |
| 326 | try std.testing.expectEqual(@as(usize, 3), i); | 317 | try std.testing.expectEqual(@as(usize, 3), i); |
| 327 | 318 | ||
| 328 | ct_iter = Iterator.init(no_joiner); | 319 | iter = Iterator.init(no_joiner, &data); |
| 329 | i = 0; | 320 | i = 0; |
| 330 | while (ct_iter.next()) |_| : (i += 1) {} | 321 | while (iter.next()) |_| : (i += 1) {} |
| 331 | try std.testing.expectEqual(@as(usize, 2), i); | 322 | try std.testing.expectEqual(@as(usize, 2), i); |
| 332 | } | 323 | } |