diff options
| author | 2024-02-13 13:50:22 -0400 | |
|---|---|---|
| committer | 2024-02-13 13:50:22 -0400 | |
| commit | bbe4bc5a08a042b47a3474e8e9caf20e216634a8 (patch) | |
| tree | 3a2d19765f560481d74b6c2abb3696db573a70e3 /src | |
| parent | Merge table (diff) | |
| download | zg-bbe4bc5a08a042b47a3474e8e9caf20e216634a8.tar.gz zg-bbe4bc5a08a042b47a3474e8e9caf20e216634a8.tar.xz zg-bbe4bc5a08a042b47a3474e8e9caf20e216634a8.zip | |
Passing ziglyph tests
Diffstat (limited to 'src')
| -rw-r--r-- | src/Grapheme.zig | 248 | ||||
| -rw-r--r-- | src/gbp_gen.zig | 14 | ||||
| -rw-r--r-- | src/main.zig | 4 |
3 files changed, 250 insertions, 16 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index a8a7638..1e9606f 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -3,10 +3,12 @@ | |||
| 3 | const std = @import("std"); | 3 | const std = @import("std"); |
| 4 | const unicode = std.unicode; | 4 | const unicode = std.unicode; |
| 5 | 5 | ||
| 6 | const CodePoint = @import("ziglyph").CodePoint; | 6 | const ziglyph = @import("ziglyph"); |
| 7 | const CodePoint = ziglyph.CodePoint; | ||
| 7 | const CodePointIterator = CodePoint.CodePointIterator; | 8 | const CodePointIterator = CodePoint.CodePointIterator; |
| 8 | const emoji = @import("ziglyph").emoji; | 9 | const readCodePoint = CodePoint.readCodePoint; |
| 9 | 10 | const emoji = ziglyph.emoji; | |
| 11 | // const gbp = ziglyph.grapheme_break; | ||
| 10 | const gbp = @import("gbp"); | 12 | const gbp = @import("gbp"); |
| 11 | 13 | ||
| 12 | pub const Grapheme = @This(); | 14 | pub const Grapheme = @This(); |
| @@ -78,6 +80,171 @@ pub const GraphemeIterator = struct { | |||
| 78 | } | 80 | } |
| 79 | }; | 81 | }; |
| 80 | 82 | ||
| 83 | /// `StreamingGraphemeIterator` iterates a `std.io.Reader` one grapheme cluster at-a-time. | ||
| 84 | /// Note that, given the streaming context, each grapheme cluster is returned as a slice of bytes. | ||
| 85 | pub fn StreamingGraphemeIterator(comptime T: type) type { | ||
| 86 | return struct { | ||
| 87 | allocator: std.mem.Allocator, | ||
| 88 | buf: [2]?u21 = [_]?u21{ null, null }, | ||
| 89 | reader: T, | ||
| 90 | |||
| 91 | const Self = @This(); | ||
| 92 | |||
| 93 | pub fn init(allocator: std.mem.Allocator, reader: anytype) !Self { | ||
| 94 | var self = Self{ .allocator = allocator, .reader = reader }; | ||
| 95 | self.buf[1] = try readCodePoint(self.reader); | ||
| 96 | |||
| 97 | return self; | ||
| 98 | } | ||
| 99 | |||
| 100 | /// Caller must free returned bytes with `allocator` passed to `init`. | ||
| 101 | pub fn next(self: *Self) !?[]u8 { | ||
| 102 | const code = (try self.advance()) orelse return null; | ||
| 103 | |||
| 104 | var all_bytes = std.ArrayList(u8).init(self.allocator); | ||
| 105 | errdefer all_bytes.deinit(); | ||
| 106 | |||
| 107 | try encode_and_append(code, &all_bytes); | ||
| 108 | |||
| 109 | // If at end | ||
| 110 | if (self.buf[1] == null) return try all_bytes.toOwnedSlice(); | ||
| 111 | |||
| 112 | // Instant breakers | ||
| 113 | // CR | ||
| 114 | if (code == '\x0d') { | ||
| 115 | if (self.buf[1].? == '\x0a') { | ||
| 116 | // CRLF | ||
| 117 | try encode_and_append(self.buf[1].?, &all_bytes); | ||
| 118 | _ = self.advance() catch unreachable; | ||
| 119 | } | ||
| 120 | |||
| 121 | return try all_bytes.toOwnedSlice(); | ||
| 122 | } | ||
| 123 | // LF | ||
| 124 | if (code == '\x0a') return try all_bytes.toOwnedSlice(); | ||
| 125 | // Control | ||
| 126 | if (gbp.isControl(code)) return try all_bytes.toOwnedSlice(); | ||
| 127 | |||
| 128 | // Common chars | ||
| 129 | if (code < 0xa9) { | ||
| 130 | // Extend / ignorables loop | ||
| 131 | while (self.buf[1]) |next_cp| { | ||
| 132 | if (next_cp >= 0x300 and isIgnorable(next_cp)) { | ||
| 133 | try encode_and_append(next_cp, &all_bytes); | ||
| 134 | _ = self.advance() catch unreachable; | ||
| 135 | } else { | ||
| 136 | break; | ||
| 137 | } | ||
| 138 | } | ||
| 139 | |||
| 140 | return try all_bytes.toOwnedSlice(); | ||
| 141 | } | ||
| 142 | |||
| 143 | if (emoji.isExtendedPictographic(code)) { | ||
| 144 | var after_zwj = false; | ||
| 145 | |||
| 146 | // Extend / ignorables loop | ||
| 147 | while (self.buf[1]) |next_cp| { | ||
| 148 | if (next_cp >= 0x300 and | ||
| 149 | after_zwj and | ||
| 150 | emoji.isExtendedPictographic(next_cp)) | ||
| 151 | { | ||
| 152 | try encode_and_append(next_cp, &all_bytes); | ||
| 153 | _ = self.advance() catch unreachable; | ||
| 154 | after_zwj = false; | ||
| 155 | } else if (next_cp >= 0x300 and isIgnorable(next_cp)) { | ||
| 156 | try encode_and_append(next_cp, &all_bytes); | ||
| 157 | _ = self.advance() catch unreachable; | ||
| 158 | if (next_cp == '\u{200d}') after_zwj = true; | ||
| 159 | } else { | ||
| 160 | break; | ||
| 161 | } | ||
| 162 | } | ||
| 163 | |||
| 164 | return try all_bytes.toOwnedSlice(); | ||
| 165 | } | ||
| 166 | |||
| 167 | if (0x1100 <= code and code <= 0xd7c6) { | ||
| 168 | const next_cp = self.buf[1].?; | ||
| 169 | |||
| 170 | if (gbp.isL(code)) { | ||
| 171 | if (next_cp >= 0x1100 and | ||
| 172 | (gbp.isL(next_cp) or | ||
| 173 | gbp.isV(next_cp) or | ||
| 174 | gbp.isLv(next_cp) or | ||
| 175 | gbp.isLvt(next_cp))) | ||
| 176 | { | ||
| 177 | try encode_and_append(next_cp, &all_bytes); | ||
| 178 | _ = self.advance() catch unreachable; | ||
| 179 | } | ||
| 180 | } else if (gbp.isLv(code) or gbp.isV(code)) { | ||
| 181 | if (next_cp >= 0x1100 and | ||
| 182 | (gbp.isV(next_cp) or | ||
| 183 | gbp.isT(next_cp))) | ||
| 184 | { | ||
| 185 | try encode_and_append(next_cp, &all_bytes); | ||
| 186 | _ = self.advance() catch unreachable; | ||
| 187 | } | ||
| 188 | } else if (gbp.isLvt(code) or gbp.isT(code)) { | ||
| 189 | if (next_cp >= 0x1100 and gbp.isT(next_cp)) { | ||
| 190 | try encode_and_append(next_cp, &all_bytes); | ||
| 191 | _ = self.advance() catch unreachable; | ||
| 192 | } | ||
| 193 | } | ||
| 194 | } else if (0x600 <= code and code <= 0x11f02) { | ||
| 195 | if (gbp.isPrepend(code)) { | ||
| 196 | const next_cp = self.buf[1].?; | ||
| 197 | |||
| 198 | if (isBreaker(next_cp)) { | ||
| 199 | return try all_bytes.toOwnedSlice(); | ||
| 200 | } else { | ||
| 201 | try encode_and_append(next_cp, &all_bytes); | ||
| 202 | _ = self.advance() catch unreachable; | ||
| 203 | } | ||
| 204 | } | ||
| 205 | } else if (0x1f1e6 <= code and code <= 0x1f1ff) { | ||
| 206 | if (gbp.isRegionalIndicator(code)) { | ||
| 207 | const next_cp = self.buf[1].?; | ||
| 208 | |||
| 209 | if (next_cp >= 0x1f1e6 and gbp.isRegionalIndicator(next_cp)) { | ||
| 210 | try encode_and_append(next_cp, &all_bytes); | ||
| 211 | _ = self.advance() catch unreachable; | ||
| 212 | } | ||
| 213 | } | ||
| 214 | } | ||
| 215 | |||
| 216 | // Extend / ignorables loop | ||
| 217 | while (self.buf[1]) |next_cp| { | ||
| 218 | if (next_cp >= 0x300 and isIgnorable(next_cp)) { | ||
| 219 | try encode_and_append(next_cp, &all_bytes); | ||
| 220 | _ = self.advance() catch unreachable; | ||
| 221 | } else { | ||
| 222 | break; | ||
| 223 | } | ||
| 224 | } | ||
| 225 | |||
| 226 | return try all_bytes.toOwnedSlice(); | ||
| 227 | } | ||
| 228 | |||
| 229 | fn advance(self: *Self) !?u21 { | ||
| 230 | self.buf[0] = self.buf[1]; | ||
| 231 | self.buf[1] = try readCodePoint(self.reader); | ||
| 232 | |||
| 233 | return self.buf[0]; | ||
| 234 | } | ||
| 235 | |||
| 236 | fn peek(self: Self) ?u21 { | ||
| 237 | return self.buf[1]; | ||
| 238 | } | ||
| 239 | |||
| 240 | fn encode_and_append(cp: u21, list: *std.ArrayList(u8)) !void { | ||
| 241 | var tmp: [4]u8 = undefined; | ||
| 242 | const len = try unicode.utf8Encode(cp, &tmp); | ||
| 243 | try list.appendSlice(tmp[0..len]); | ||
| 244 | } | ||
| 245 | }; | ||
| 246 | } | ||
| 247 | |||
| 81 | // Predicates | 248 | // Predicates |
| 82 | fn isBreaker(cp: u21) bool { | 249 | fn isBreaker(cp: u21) bool { |
| 83 | return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); | 250 | return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); |
| @@ -100,6 +267,22 @@ test "Segmentation comptime GraphemeIterator" { | |||
| 100 | } | 267 | } |
| 101 | } | 268 | } |
| 102 | 269 | ||
| 270 | test "Simple StreamingGraphemeIterator" { | ||
| 271 | var buf = "abe\u{301}😹".*; | ||
| 272 | var fis = std.io.fixedBufferStream(&buf); | ||
| 273 | const reader = fis.reader(); | ||
| 274 | var iter = try StreamingGraphemeIterator(@TypeOf(reader)).init(std.testing.allocator, reader); | ||
| 275 | const want = [_][]const u8{ "a", "b", "e\u{301}", "😹" }; | ||
| 276 | |||
| 277 | for (want) |str| { | ||
| 278 | const gc = (try iter.next()).?; | ||
| 279 | defer std.testing.allocator.free(gc); | ||
| 280 | try std.testing.expectEqualStrings(gc, str); | ||
| 281 | } | ||
| 282 | |||
| 283 | try std.testing.expectEqual(@as(?[]u8, null), try iter.next()); | ||
| 284 | } | ||
| 285 | |||
| 103 | test "Segmentation ZWJ and ZWSP emoji sequences" { | 286 | test "Segmentation ZWJ and ZWSP emoji sequences" { |
| 104 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 287 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| 105 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 288 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| @@ -218,3 +401,62 @@ pub fn graphemeBreak( | |||
| 218 | 401 | ||
| 219 | return true; | 402 | return true; |
| 220 | } | 403 | } |
| 404 | |||
| 405 | test "Segmentation GraphemeIterator" { | ||
| 406 | const allocator = std.testing.allocator; | ||
| 407 | var file = try std.fs.cwd().openFile("GraphemeBreakTest.txt", .{}); | ||
| 408 | defer file.close(); | ||
| 409 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 410 | var input_stream = buf_reader.reader(); | ||
| 411 | |||
| 412 | var buf: [4096]u8 = undefined; | ||
| 413 | var line_no: usize = 1; | ||
| 414 | |||
| 415 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { | ||
| 416 | // Skip comments or empty lines. | ||
| 417 | if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; | ||
| 418 | |||
| 419 | // Clean up. | ||
| 420 | var line = std.mem.trimLeft(u8, raw, "÷ "); | ||
| 421 | if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { | ||
| 422 | line = line[0..octo]; | ||
| 423 | } | ||
| 424 | // Iterate over fields. | ||
| 425 | var want = std.ArrayList(Grapheme).init(allocator); | ||
| 426 | defer want.deinit(); | ||
| 427 | |||
| 428 | var all_bytes = std.ArrayList(u8).init(allocator); | ||
| 429 | defer all_bytes.deinit(); | ||
| 430 | |||
| 431 | var graphemes = std.mem.split(u8, line, " ÷ "); | ||
| 432 | var bytes_index: usize = 0; | ||
| 433 | |||
| 434 | while (graphemes.next()) |field| { | ||
| 435 | var code_points = std.mem.split(u8, field, " "); | ||
| 436 | var cp_buf: [4]u8 = undefined; | ||
| 437 | var cp_index: usize = 0; | ||
| 438 | var gc_len: usize = 0; | ||
| 439 | |||
| 440 | while (code_points.next()) |code_point| { | ||
| 441 | if (std.mem.eql(u8, code_point, "×")) continue; | ||
| 442 | const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); | ||
| 443 | const len = try unicode.utf8Encode(cp, &cp_buf); | ||
| 444 | try all_bytes.appendSlice(cp_buf[0..len]); | ||
| 445 | cp_index += len; | ||
| 446 | gc_len += len; | ||
| 447 | } | ||
| 448 | |||
| 449 | try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); | ||
| 450 | bytes_index += cp_index; | ||
| 451 | } | ||
| 452 | |||
| 453 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); | ||
| 454 | var iter = GraphemeIterator.init(all_bytes.items); | ||
| 455 | |||
| 456 | // Check. | ||
| 457 | for (want.items) |w| { | ||
| 458 | const g = (iter.next()).?; | ||
| 459 | try std.testing.expect(w.eql(all_bytes.items, all_bytes.items[g.offset .. g.offset + g.len])); | ||
| 460 | } | ||
| 461 | } | ||
| 462 | } | ||
diff --git a/src/gbp_gen.zig b/src/gbp_gen.zig index afc54fc..7e27f35 100644 --- a/src/gbp_gen.zig +++ b/src/gbp_gen.zig | |||
| @@ -62,8 +62,6 @@ pub fn main() !void { | |||
| 62 | var blocks_map = BlockMap.init(allocator); | 62 | var blocks_map = BlockMap.init(allocator); |
| 63 | defer blocks_map.deinit(); | 63 | defer blocks_map.deinit(); |
| 64 | 64 | ||
| 65 | const no_prop = std.math.maxInt(u16); | ||
| 66 | |||
| 67 | var stage1 = std.ArrayList(u16).init(allocator); | 65 | var stage1 = std.ArrayList(u16).init(allocator); |
| 68 | defer stage1.deinit(); | 66 | defer stage1.deinit(); |
| 69 | 67 | ||
| @@ -101,12 +99,7 @@ pub fn main() !void { | |||
| 101 | try stage2.appendSlice(block[0..block_len]); | 99 | try stage2.appendSlice(block[0..block_len]); |
| 102 | } | 100 | } |
| 103 | 101 | ||
| 104 | if (prop == .none) { | 102 | try stage1.append(gop.value_ptr.*); |
| 105 | try stage1.append(no_prop); | ||
| 106 | } else { | ||
| 107 | try stage1.append(gop.value_ptr.*); | ||
| 108 | } | ||
| 109 | |||
| 110 | block_len = 0; | 103 | block_len = 0; |
| 111 | } | 104 | } |
| 112 | 105 | ||
| @@ -120,8 +113,6 @@ pub fn main() !void { | |||
| 120 | const writer = out_buf.writer(); | 113 | const writer = out_buf.writer(); |
| 121 | 114 | ||
| 122 | const prop_code = | 115 | const prop_code = |
| 123 | \\const std = @import("std"); | ||
| 124 | \\ | ||
| 125 | \\const Prop = enum { | 116 | \\const Prop = enum { |
| 126 | \\ none, | 117 | \\ none, |
| 127 | \\ | 118 | \\ |
| @@ -161,11 +152,8 @@ pub fn main() !void { | |||
| 161 | try writer.writeAll("};\n"); | 152 | try writer.writeAll("};\n"); |
| 162 | 153 | ||
| 163 | const code = | 154 | const code = |
| 164 | \\const no_prop = std.math.maxInt(u16); | ||
| 165 | \\ | ||
| 166 | \\inline fn getProp(cp: u21) Prop { | 155 | \\inline fn getProp(cp: u21) Prop { |
| 167 | \\ const stage_1_index = cp >> 8; | 156 | \\ const stage_1_index = cp >> 8; |
| 168 | \\ if (stage_1[stage_1_index] == no_prop) return .none; | ||
| 169 | \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff); | 157 | \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff); |
| 170 | \\ const stage_3_index = stage_2[stage_2_index]; | 158 | \\ const stage_3_index = stage_2[stage_2_index]; |
| 171 | \\ return stage_3[stage_3_index]; | 159 | \\ return stage_3[stage_3_index]; |
diff --git a/src/main.zig b/src/main.zig index ca167e8..8335530 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -23,3 +23,7 @@ pub fn main() !void { | |||
| 23 | 23 | ||
| 24 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); | 24 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); |
| 25 | } | 25 | } |
| 26 | |||
| 27 | test { | ||
| 28 | _ = @import("Grapheme.zig"); | ||
| 29 | } | ||