From 490fd008e29420e5b317fd5ef7526f3cc92ba2eb Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Sat, 17 Feb 2024 11:31:52 -0400 Subject: display_width tweaks --- src/Grapheme.zig | 136 ++++++++++++++++++++++++++------------------------ src/display_width.zig | 30 ++++++----- src/main.zig | 15 +++--- 3 files changed, 97 insertions(+), 84 deletions(-) (limited to 'src') diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 6892a2a..910aec5 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig @@ -56,7 +56,7 @@ pub const GraphemeIterator = struct { const gc_start = self.buf[0].?.offset; var gc_len: usize = self.buf[0].?.len; - var state: u3 = 0; + var state = State{}; if (graphemeBreak( self.buf[0].?.code, @@ -95,36 +95,42 @@ fn isIgnorable(cp: u21) bool { } // Grapheme break state. -// Extended Pictographic (emoji) -fn hasXpic(state: *const u3) bool { - return state.* & 1 == 1; -} -fn setXpic(state: *u3) void { - state.* |= 1; -} -fn unsetXpic(state: *u3) void { - state.* ^= 1; -} -// Regional Indicatior (flags) -fn hasRegional(state: *const u3) bool { - return state.* & 2 == 2; -} -fn setRegional(state: *u3) void { - state.* |= 2; -} -fn unsetRegional(state: *u3) void { - state.* ^= 2; -} -// Indic Conjunct -fn hasIndic(state: *const u3) bool { - return state.* & 4 == 4; -} -fn setIndic(state: *u3) void { - state.* |= 4; -} -fn unsetIndic(state: *u3) void { - state.* ^= 4; -} +const State = struct { + bits: u3 = 0, + + // Extended Pictographic (emoji) + fn hasXpic(self: State) bool { + return self.bits & 1 == 1; + } + fn setXpic(self: *State) void { + self.bits |= 1; + } + fn unsetXpic(self: *State) void { + self.bits ^= 1; + } + + // Regional Indicatior (flags) + fn hasRegional(self: State) bool { + return self.bits & 2 == 2; + } + fn setRegional(self: *State) void { + self.bits |= 2; + } + fn unsetRegional(self: *State) void { + self.bits ^= 2; + } + + // Indic Conjunct + fn hasIndic(self: State) bool { + return self.bits & 4 == 4; + } + fn setIndic(self: *State) void { + self.bits |= 4; + } + fn unsetIndic(self: *State) void { + self.bits ^= 4; + } +}; /// `graphemeBreak` returns true only if a grapheme break point is required /// between `cp1` and `cp2`. `state` should start out as 0. If calling @@ -135,7 +141,7 @@ fn unsetIndic(state: *u3) void { pub fn graphemeBreak( cp1: u21, cp2: u21, - state: *u3, + state: *State, ) bool { // Extract relevant properties. const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; @@ -149,9 +155,9 @@ pub fn graphemeBreak( const cp2_is_emoji = cp2_props_byte & 1 == 1; // GB11: Emoji Extend* ZWJ x Emoji - if (!hasXpic(state) and cp1_is_emoji) setXpic(state); + if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); // GB9c: Indic Conjunct Break - if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state); + if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); // GB3: CR x LF if (cp1 == '\r' and cp2 == '\n') return false; @@ -159,23 +165,13 @@ pub fn graphemeBreak( // GB4: Control if (isBreaker(cp1)) return true; - // GB6: Hangul L x (L|V|LV|VT) - if (cp1_gbp_prop == .L) { - if (cp2_gbp_prop == .L or - cp2_gbp_prop == .V or - cp2_gbp_prop == .LV or - cp2_gbp_prop == .LVT) return false; - } - - // GB7: Hangul (LV | V) x (V | T) - if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) { - if (cp2_gbp_prop == .V or - cp2_gbp_prop == .T) return false; - } - - // GB8: Hangul (LVT | T) x T - if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) { - if (cp2_gbp_prop == .T) return false; + // GB11: Emoji Extend* ZWJ x Emoji + if (state.hasXpic() and + cp1_gbp_prop == .ZWJ and + cp2_is_emoji) + { + state.unsetXpic(); + return false; } // GB9b: x (Extend | ZWJ) @@ -189,44 +185,54 @@ pub fn graphemeBreak( // GB12, GB13: RI x RI if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { - if (hasRegional(state)) { - unsetRegional(state); + if (state.hasRegional()) { + state.unsetRegional(); return true; } else { - setRegional(state); + state.setRegional(); return false; } } - // GB11: Emoji Extend* ZWJ x Emoji - if (hasXpic(state) and - cp1_gbp_prop == .ZWJ and - cp2_is_emoji) - { - unsetXpic(state); - return false; + // GB6: Hangul L x (L|V|LV|VT) + if (cp1_gbp_prop == .L) { + if (cp2_gbp_prop == .L or + cp2_gbp_prop == .V or + cp2_gbp_prop == .LV or + cp2_gbp_prop == .LVT) return false; + } + + // GB7: Hangul (LV | V) x (V | T) + if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) { + if (cp2_gbp_prop == .V or + cp2_gbp_prop == .T) return false; + } + + // GB8: Hangul (LVT | T) x T + if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) { + if (cp2_gbp_prop == .T) return false; } // GB9c: Indic Conjunct Break - if (hasIndic(state) and + if (state.hasIndic() and cp1_indic_prop == .Consonant and (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) { return false; } - if (hasIndic(state) and + if (state.hasIndic() and cp1_indic_prop == .Extend and cp2_indic_prop == .Linker) { return false; } - if (hasIndic(state) and + if (state.hasIndic() and (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and cp2_indic_prop == .Consonant) { - unsetIndic(state); + state.unsetIndic(); return false; } diff --git a/src/display_width.zig b/src/display_width.zig index aed0ef1..2ac7093 100644 --- a/src/display_width.zig +++ b/src/display_width.zig @@ -42,17 +42,10 @@ fn isAsciiOnly(str: []const u8) bool { pub fn strWidth(str: []const u8) usize { var total: isize = 0; + // ASCII fast path if (isAsciiOnly(str)) { - for (str) |b| { - // Backspace and delete - if (b == 0x8 or b == 0x7f) { - total -= 1; - } else if (b >= 0x20) { - total += 1; - } - } - - return if (total > 0) @intCast(total) else 0; + for (str) |b| total += codePointWidth(b); + return @intCast(@max(0, total)); } var giter = GraphemeIterator.init(str); @@ -72,14 +65,17 @@ pub fn strWidth(str: []const u8) usize { } // Only adding width of first non-zero-width code point. - if (gc_total == 0) gc_total = w; + if (gc_total == 0) { + gc_total = w; + break; + } } } total += gc_total; } - return if (total > 0) @intCast(total) else 0; + return @intCast(@max(0, total)); } test "display_width Width" { @@ -147,4 +143,14 @@ test "display_width Width" { // The following passes but as a mere coincidence. const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; try testing.expectEqual(@as(usize, 2), strWidth(kannada_2)); + + // From Rust https://github.com/jameslanska/unicode-display-width + try testing.expectEqual(@as(usize, 15), strWidth("πŸ”₯πŸ—‘πŸ©πŸ‘©πŸ»β€πŸš€β°πŸ’ƒπŸΌπŸ”¦πŸ‘πŸ»")); + try testing.expectEqual(@as(usize, 2), strWidth("πŸ¦€")); + try testing.expectEqual(@as(usize, 2), strWidth("πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘§")); + try testing.expectEqual(@as(usize, 2), strWidth("πŸ‘©β€πŸ”¬")); + try testing.expectEqual(@as(usize, 9), strWidth("sane text")); + try testing.expectEqual(@as(usize, 9), strWidth("αΊ’ΜŒΓ‘Μ²lΝ”ΜΜžΜ„Μ‘ΝŒgΜ–Μ˜Μ˜Μ”Μ”Ν’ΝžΝoΜͺΜ”TΜ’Μ™Μ«ΜˆΜΝžeΜ¬ΝˆΝ•ΝŒΜΝ‘x̺̍ṭ̓̓ͅ")); + try testing.expectEqual(@as(usize, 17), strWidth("μŠ¬λΌλ°” μš°ν¬λΌμ΄λ‚˜")); + try testing.expectEqual(@as(usize, 1), strWidth("\u{378}")); } diff --git a/src/main.zig b/src/main.zig index bb188ff..38ba343 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,12 +1,12 @@ const std = @import("std"); // const GraphemeIterator = @import("ziglyph").GraphemeIterator; -const GraphemeIterator = @import("Grapheme").GraphemeIterator; +// const GraphemeIterator = @import("Grapheme").GraphemeIterator; // const codePointWidth = @import("ziglyph").display_width.codePointWidth; // const codePointWidth = @import("display_width").codePointWidth; // const strWidth = @import("ziglyph").display_width.strWidth; -// const strWidth = @import("display_width").strWidth; -// const CodePointIterator = @import("CodePoint").CodePointIterator; +const strWidth = @import("display_width").strWidth; +const CodePointIterator = @import("CodePoint").CodePointIterator; pub fn main() !void { var gpa = std.heap.GeneralPurposeAllocator(.{}){}; @@ -17,16 +17,17 @@ pub fn main() !void { defer allocator.free(input); var result: usize = 0; - var iter = GraphemeIterator.init(input); + // var result: isize = 0; + // var iter = GraphemeIterator.init(input); // var iter = CodePointIterator{ .bytes = input }; - // var iter = std.mem.splitScalar(u8, input, '\n'); + var iter = std.mem.splitScalar(u8, input, '\n'); var timer = try std.time.Timer.start(); // for (0..50) |_| { // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); - while (iter.next()) |_| result += 1; - // while (iter.next()) |line| result += strWidth(line); + // while (iter.next()) |_| result += 1; + while (iter.next()) |line| result += strWidth(line); // iter.cp_iter.i = 0; // } -- cgit v1.2.3