From b2be2562b908d5563a8e71c44cbad577e4df4201 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Wed, 14 Feb 2024 21:39:37 -0400 Subject: gbp and indic direct array access --- codegen/grapheme_break.zig | 60 +++---------------------------- codegen/indic.zig | 32 +++-------------- src/Grapheme.zig | 89 ++++++++++++++++++++++++---------------------- 3 files changed, 55 insertions(+), 126 deletions(-) diff --git a/codegen/grapheme_break.zig b/codegen/grapheme_break.zig index ace875c..95237f3 100644 --- a/codegen/grapheme_break.zig +++ b/codegen/grapheme_break.zig @@ -112,7 +112,7 @@ pub fn main() !void { const writer = out_buf.writer(); const prop_code = - \\const Prop = enum { + \\pub const Prop = enum { \\ none, \\ \\ control, @@ -132,75 +132,23 @@ pub fn main() !void { try writer.writeAll(prop_code); - try writer.print("const stage_1 = [{}]u16{{", .{stage1.items.len}); + try writer.print("pub const stage_1 = [{}]u16{{", .{stage1.items.len}); for (stage1.items) |v| { _ = try writer.print("{},", .{v}); } try writer.writeAll("};\n"); - try writer.print("const stage_2 = [{}]u4{{", .{stage2.items.len}); + try writer.print("pub const stage_2 = [{}]u4{{", .{stage2.items.len}); for (stage2.items) |v| { _ = try writer.print("{},", .{v}); } try writer.writeAll("};\n"); - try writer.print("const stage_3 = [{}]Prop{{", .{stage3.items.len}); + try writer.print("pub const stage_3 = [{}]Prop{{", .{stage3.items.len}); for (stage3.items) |v| { _ = try writer.print(".{s},", .{@tagName(v)}); } try writer.writeAll("};\n"); - const code = - \\inline fn getProp(cp: u21) Prop { - \\ const stage_1_index = cp >> 8; - \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff); - \\ const stage_3_index = stage_2[stage_2_index]; - \\ return stage_3[stage_3_index]; - \\} - \\ - \\pub inline fn isControl(cp: u21) bool { - \\ return getProp(cp) == .control; - \\} - \\ - \\pub inline fn isExtend(cp: u21) bool { - \\ return getProp(cp) == .extend; - \\} - \\ - \\pub inline fn isL(cp: u21) bool { - \\ return getProp(cp) == .hangul_l; - \\} - \\pub inline fn isLv(cp: u21) bool { - \\ return getProp(cp) == .hangul_lv; - \\} - \\pub inline fn isLvt(cp: u21) bool { - \\ return getProp(cp) == .hangul_lvt; - \\} - \\pub inline fn isV(cp: u21) bool { - \\ return getProp(cp) == .hangul_v; - \\} - \\pub inline fn isT(cp: u21) bool { - \\ return getProp(cp) == .hangul_t; - \\} - \\ - \\pub inline fn isPrepend(cp: u21) bool { - \\ return getProp(cp) == .prepend; - \\} - \\ - \\pub inline fn isRegionalIndicator(cp: u21) bool { - \\ return getProp(cp) == .regional; - \\} - \\ - \\pub inline fn isSpacingmark(cp: u21) bool { - \\ return getProp(cp) == .spacing; - \\} - \\ - \\pub inline fn isZwj(cp: u21) bool { - \\ return getProp(cp) == .zwj; - \\} - \\ - ; - - try writer.writeAll(code); - try out_buf.flush(); } diff --git a/codegen/indic.zig b/codegen/indic.zig index 871f1c5..07bcd92 100644 --- a/codegen/indic.zig +++ b/codegen/indic.zig @@ -130,7 +130,7 @@ pub fn main() !void { const writer = out_buf.writer(); const prop_code = - \\const Prop = enum { + \\pub const Prop = enum { \\ none, \\ \\ Consonant, @@ -142,47 +142,23 @@ pub fn main() !void { try writer.writeAll(prop_code); - try writer.print("const stage_1 = [{}]u16{{", .{stage1.items.len}); + try writer.print("pub const stage_1 = [{}]u16{{", .{stage1.items.len}); for (stage1.items) |v| { _ = try writer.print("{},", .{v}); } try writer.writeAll("};\n"); - try writer.print("const stage_2 = [{}]u3{{", .{stage2.items.len}); + try writer.print("pub const stage_2 = [{}]u3{{", .{stage2.items.len}); for (stage2.items) |v| { _ = try writer.print("{},", .{v}); } try writer.writeAll("};\n"); - try writer.print("const stage_3 = [{}]Prop{{", .{stage3.items.len}); + try writer.print("pub const stage_3 = [{}]Prop{{", .{stage3.items.len}); for (stage3.items) |v| { _ = try writer.print(".{s},", .{@tagName(v)}); } try writer.writeAll("};\n"); - const code = - \\inline fn getProp(cp: u21) Prop { - \\ const stage_1_index = cp >> 8; - \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff); - \\ const stage_3_index = stage_2[stage_2_index]; - \\ return stage_3[stage_3_index]; - \\} - \\ - \\pub inline fn isConsonant(cp: u21) bool { - \\ return getProp(cp) == .Consonant; - \\} - \\ - \\pub inline fn isExtend(cp: u21) bool { - \\ return getProp(cp) == .Extend; - \\} - \\ - \\pub inline fn isLinker(cp: u21) bool { - \\ return getProp(cp) == .Linker; - \\} - \\ - ; - - try writer.writeAll(code); - try out_buf.flush(); } diff --git a/src/Grapheme.zig b/src/Grapheme.zig index a0ec900..56eecbe 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig @@ -82,26 +82,27 @@ pub const GraphemeIterator = struct { }; // Predicates -fn isBreaker(cp: u21) bool { - return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); +inline fn isBreaker(cp: u21) bool { + return cp == '\x0d' or cp == '\x0a' or gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]] == .control; } -fn isIgnorable(cp: u21) bool { - return gbp.isExtend(cp) or gbp.isSpacingmark(cp) or cp == '\u{200d}'; +inline fn isIgnorable(cp: u21) bool { + const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; + return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; } -// test "Segmentation comptime GraphemeIterator" { -// const want = [_][]const u8{ "H", "é", "l", "l", "o" }; -// -// comptime { -// const src = "Héllo"; -// var ct_iter = GraphemeIterator.init(src); -// var i = 0; -// while (ct_iter.next()) |grapheme| : (i += 1) { -// try std.testing.expect(grapheme.eql(src, want[i])); -// } -// } -// } +test "Segmentation comptime GraphemeIterator" { + const want = [_][]const u8{ "H", "é", "l", "l", "o" }; + + comptime { + const src = "Héllo"; + var ct_iter = GraphemeIterator.init(src); + var i = 0; + while (ct_iter.next()) |grapheme| : (i += 1) { + try std.testing.expect(grapheme.eql(src, want[i])); + } + } +} test "Segmentation ZWJ and ZWSP emoji sequences" { const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; @@ -172,7 +173,9 @@ pub fn graphemeBreak( // GB11: Emoji Extend* ZWJ x Emoji if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); // GB9c: Indic Conjunct Break - if (!hasIndic(state) and indic.isConsonant(cp1)) setIndic(state); + const cp1_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; + const cp2_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; + if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state); // GB3: CR x LF if (cp1 == '\r' and cp2 == '\n') return false; @@ -181,35 +184,37 @@ pub fn graphemeBreak( if (isBreaker(cp1)) return true; // GB6: Hangul L x (L|V|LV|VT) - if (gbp.isL(cp1)) { - if (gbp.isL(cp2) or - gbp.isV(cp2) or - gbp.isLv(cp2) or - gbp.isLvt(cp2)) return false; + const cp1_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; + const cp2_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; + if (cp1_gbp_prop == .hangul_l) { + if (cp2_gbp_prop == .hangul_l or + cp2_gbp_prop == .hangul_v or + cp2_gbp_prop == .hangul_lv or + cp2_gbp_prop == .hangul_lvt) return false; } // GB7: Hangul (LV | V) x (V | T) - if (gbp.isLv(cp1) or gbp.isV(cp1)) { - if (gbp.isV(cp2) or - gbp.isT(cp2)) return false; + if (cp1_gbp_prop == .hangul_lv or cp1_gbp_prop == .hangul_v) { + if (cp2_gbp_prop == .hangul_v or + cp2_gbp_prop == .hangul_t) return false; } // GB8: Hangul (LVT | T) x T - if (gbp.isLvt(cp1) or gbp.isT(cp1)) { - if (gbp.isT(cp2)) return false; + if (cp1_gbp_prop == .hangul_lvt or cp1_gbp_prop == .hangul_t) { + if (cp2_gbp_prop == .hangul_t) return false; } // GB9b: x (Extend | ZWJ) - if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) return false; + if (cp2_gbp_prop == .extend or cp2_gbp_prop == .zwj) return false; // GB9a: x Spacing - if (gbp.isSpacingmark(cp2)) return false; + if (cp2_gbp_prop == .spacing) return false; // GB9b: Prepend x - if (gbp.isPrepend(cp1) and !isBreaker(cp2)) return false; + if (cp1_gbp_prop == .prepend and !isBreaker(cp2)) return false; // GB12, GB13: RI x RI - if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) { + if (cp1_gbp_prop == .regional and cp2_gbp_prop == .regional) { if (hasRegional(state)) { unsetRegional(state); return true; @@ -221,7 +226,7 @@ pub fn graphemeBreak( // GB11: Emoji Extend* ZWJ x Emoji if (hasXpic(state) and - gbp.isZwj(cp1) and + cp1_gbp_prop == .zwj and emoji.isExtendedPictographic(cp2)) { unsetXpic(state); @@ -230,37 +235,37 @@ pub fn graphemeBreak( // GB9c: Indic Conjunct Break if (hasIndic(state) and - indic.isConsonant(cp1) and - indic.isExtend(cp2)) + cp1_indic_prop == .Consonant and + cp2_indic_prop == .Extend) { return false; } if (hasIndic(state) and - indic.isConsonant(cp1) and - indic.isLinker(cp2)) + cp1_indic_prop == .Consonant and + cp2_indic_prop == .Linker) { return false; } if (hasIndic(state) and - indic.isExtend(cp1) and - indic.isLinker(cp2)) + cp1_indic_prop == .Extend and + cp2_indic_prop == .Linker) { return false; } if (hasIndic(state) and - indic.isLinker(cp1) and - indic.isConsonant(cp2)) + cp1_indic_prop == .Linker and + cp2_indic_prop == .Consonant) { unsetIndic(state); return false; } if (hasIndic(state) and - gbp.isZwj(cp1) and - indic.isConsonant(cp2)) + cp1_gbp_prop == .zwj and + cp2_indic_prop == .Consonant) { unsetIndic(state); return false; -- cgit v1.2.3