diff options
| -rw-r--r-- | src/Grapheme.zig | 136 | ||||
| -rw-r--r-- | src/display_width.zig | 30 | ||||
| -rw-r--r-- | src/main.zig | 15 |
3 files changed, 97 insertions, 84 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 6892a2a..910aec5 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -56,7 +56,7 @@ pub const GraphemeIterator = struct { | |||
| 56 | 56 | ||
| 57 | const gc_start = self.buf[0].?.offset; | 57 | const gc_start = self.buf[0].?.offset; |
| 58 | var gc_len: usize = self.buf[0].?.len; | 58 | var gc_len: usize = self.buf[0].?.len; |
| 59 | var state: u3 = 0; | 59 | var state = State{}; |
| 60 | 60 | ||
| 61 | if (graphemeBreak( | 61 | if (graphemeBreak( |
| 62 | self.buf[0].?.code, | 62 | self.buf[0].?.code, |
| @@ -95,36 +95,42 @@ fn isIgnorable(cp: u21) bool { | |||
| 95 | } | 95 | } |
| 96 | 96 | ||
| 97 | // Grapheme break state. | 97 | // Grapheme break state. |
| 98 | // Extended Pictographic (emoji) | 98 | const State = struct { |
| 99 | fn hasXpic(state: *const u3) bool { | 99 | bits: u3 = 0, |
| 100 | return state.* & 1 == 1; | 100 | |
| 101 | } | 101 | // Extended Pictographic (emoji) |
| 102 | fn setXpic(state: *u3) void { | 102 | fn hasXpic(self: State) bool { |
| 103 | state.* |= 1; | 103 | return self.bits & 1 == 1; |
| 104 | } | 104 | } |
| 105 | fn unsetXpic(state: *u3) void { | 105 | fn setXpic(self: *State) void { |
| 106 | state.* ^= 1; | 106 | self.bits |= 1; |
| 107 | } | 107 | } |
| 108 | // Regional Indicator (flags) | 108 | fn unsetXpic(self: *State) void { |
| 109 | fn hasRegional(state: *const u3) bool { | 109 | self.bits ^= 1; |
| 110 | return state.* & 2 == 2; | 110 | } |
| 111 | } | 111 | |
| 112 | fn setRegional(state: *u3) void { | 112 | // Regional Indicator (flags) |
| 113 | state.* |= 2; | 113 | fn hasRegional(self: State) bool { |
| 114 | } | 114 | return self.bits & 2 == 2; |
| 115 | fn unsetRegional(state: *u3) void { | 115 | } |
| 116 | state.* ^= 2; | 116 | fn setRegional(self: *State) void { |
| 117 | } | 117 | self.bits |= 2; |
| 118 | // Indic Conjunct | 118 | } |
| 119 | fn hasIndic(state: *const u3) bool { | 119 | fn unsetRegional(self: *State) void { |
| 120 | return state.* & 4 == 4; | 120 | self.bits ^= 2; |
| 121 | } | 121 | } |
| 122 | fn setIndic(state: *u3) void { | 122 | |
| 123 | state.* |= 4; | 123 | // Indic Conjunct |
| 124 | } | 124 | fn hasIndic(self: State) bool { |
| 125 | fn unsetIndic(state: *u3) void { | 125 | return self.bits & 4 == 4; |
| 126 | state.* ^= 4; | 126 | } |
| 127 | } | 127 | fn setIndic(self: *State) void { |
| 128 | self.bits |= 4; | ||
| 129 | } | ||
| 130 | fn unsetIndic(self: *State) void { | ||
| 131 | self.bits ^= 4; | ||
| 132 | } | ||
| 133 | }; | ||
| 128 | 134 | ||
| 129 | /// `graphemeBreak` returns true only if a grapheme break point is required | 135 | /// `graphemeBreak` returns true only if a grapheme break point is required |
| 130 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling | 136 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling |
| @@ -135,7 +141,7 @@ fn unsetIndic(state: *u3) void { | |||
| 135 | pub fn graphemeBreak( | 141 | pub fn graphemeBreak( |
| 136 | cp1: u21, | 142 | cp1: u21, |
| 137 | cp2: u21, | 143 | cp2: u21, |
| 138 | state: *u3, | 144 | state: *State, |
| 139 | ) bool { | 145 | ) bool { |
| 140 | // Extract relevant properties. | 146 | // Extract relevant properties. |
| 141 | const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; | 147 | const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; |
| @@ -149,9 +155,9 @@ pub fn graphemeBreak( | |||
| 149 | const cp2_is_emoji = cp2_props_byte & 1 == 1; | 155 | const cp2_is_emoji = cp2_props_byte & 1 == 1; |
| 150 | 156 | ||
| 151 | // GB11: Emoji Extend* ZWJ x Emoji | 157 | // GB11: Emoji Extend* ZWJ x Emoji |
| 152 | if (!hasXpic(state) and cp1_is_emoji) setXpic(state); | 158 | if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); |
| 153 | // GB9c: Indic Conjunct Break | 159 | // GB9c: Indic Conjunct Break |
| 154 | if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state); | 160 | if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); |
| 155 | 161 | ||
| 156 | // GB3: CR x LF | 162 | // GB3: CR x LF |
| 157 | if (cp1 == '\r' and cp2 == '\n') return false; | 163 | if (cp1 == '\r' and cp2 == '\n') return false; |
| @@ -159,23 +165,13 @@ pub fn graphemeBreak( | |||
| 159 | // GB4: Control | 165 | // GB4: Control |
| 160 | if (isBreaker(cp1)) return true; | 166 | if (isBreaker(cp1)) return true; |
| 161 | 167 | ||
| 162 | // GB6: Hangul L x (L|V|LV|LVT) | 168 | // GB11: Emoji Extend* ZWJ x Emoji |
| 163 | if (cp1_gbp_prop == .L) { | 169 | if (state.hasXpic() and |
| 164 | if (cp2_gbp_prop == .L or | 170 | cp1_gbp_prop == .ZWJ and |
| 165 | cp2_gbp_prop == .V or | 171 | cp2_is_emoji) |
| 166 | cp2_gbp_prop == .LV or | 172 | { |
| 167 | cp2_gbp_prop == .LVT) return false; | 173 | state.unsetXpic(); |
| 168 | } | 174 | return false; |
| 169 | |||
| 170 | // GB7: Hangul (LV | V) x (V | T) | ||
| 171 | if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) { | ||
| 172 | if (cp2_gbp_prop == .V or | ||
| 173 | cp2_gbp_prop == .T) return false; | ||
| 174 | } | ||
| 175 | |||
| 176 | // GB8: Hangul (LVT | T) x T | ||
| 177 | if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) { | ||
| 178 | if (cp2_gbp_prop == .T) return false; | ||
| 179 | } | 175 | } |
| 180 | 176 | ||
| 181 | // GB9b: x (Extend | ZWJ) | 177 | // GB9b: x (Extend | ZWJ) |
| @@ -189,44 +185,54 @@ pub fn graphemeBreak( | |||
| 189 | 185 | ||
| 190 | // GB12, GB13: RI x RI | 186 | // GB12, GB13: RI x RI |
| 191 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { | 187 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { |
| 192 | if (hasRegional(state)) { | 188 | if (state.hasRegional()) { |
| 193 | unsetRegional(state); | 189 | state.unsetRegional(); |
| 194 | return true; | 190 | return true; |
| 195 | } else { | 191 | } else { |
| 196 | setRegional(state); | 192 | state.setRegional(); |
| 197 | return false; | 193 | return false; |
| 198 | } | 194 | } |
| 199 | } | 195 | } |
| 200 | 196 | ||
| 201 | // GB11: Emoji Extend* ZWJ x Emoji | 197 | // GB6: Hangul L x (L|V|LV|LVT) |
| 202 | if (hasXpic(state) and | 198 | if (cp1_gbp_prop == .L) { |
| 203 | cp1_gbp_prop == .ZWJ and | 199 | if (cp2_gbp_prop == .L or |
| 204 | cp2_is_emoji) | 200 | cp2_gbp_prop == .V or |
| 205 | { | 201 | cp2_gbp_prop == .LV or |
| 206 | unsetXpic(state); | 202 | cp2_gbp_prop == .LVT) return false; |
| 207 | return false; | 203 | } |
| 204 | |||
| 205 | // GB7: Hangul (LV | V) x (V | T) | ||
| 206 | if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) { | ||
| 207 | if (cp2_gbp_prop == .V or | ||
| 208 | cp2_gbp_prop == .T) return false; | ||
| 209 | } | ||
| 210 | |||
| 211 | // GB8: Hangul (LVT | T) x T | ||
| 212 | if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) { | ||
| 213 | if (cp2_gbp_prop == .T) return false; | ||
| 208 | } | 214 | } |
| 209 | 215 | ||
| 210 | // GB9c: Indic Conjunct Break | 216 | // GB9c: Indic Conjunct Break |
| 211 | if (hasIndic(state) and | 217 | if (state.hasIndic() and |
| 212 | cp1_indic_prop == .Consonant and | 218 | cp1_indic_prop == .Consonant and |
| 213 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) | 219 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) |
| 214 | { | 220 | { |
| 215 | return false; | 221 | return false; |
| 216 | } | 222 | } |
| 217 | 223 | ||
| 218 | if (hasIndic(state) and | 224 | if (state.hasIndic() and |
| 219 | cp1_indic_prop == .Extend and | 225 | cp1_indic_prop == .Extend and |
| 220 | cp2_indic_prop == .Linker) | 226 | cp2_indic_prop == .Linker) |
| 221 | { | 227 | { |
| 222 | return false; | 228 | return false; |
| 223 | } | 229 | } |
| 224 | 230 | ||
| 225 | if (hasIndic(state) and | 231 | if (state.hasIndic() and |
| 226 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and | 232 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and |
| 227 | cp2_indic_prop == .Consonant) | 233 | cp2_indic_prop == .Consonant) |
| 228 | { | 234 | { |
| 229 | unsetIndic(state); | 235 | state.unsetIndic(); |
| 230 | return false; | 236 | return false; |
| 231 | } | 237 | } |
| 232 | 238 | ||
diff --git a/src/display_width.zig b/src/display_width.zig index aed0ef1..2ac7093 100644 --- a/src/display_width.zig +++ b/src/display_width.zig | |||
| @@ -42,17 +42,10 @@ fn isAsciiOnly(str: []const u8) bool { | |||
| 42 | pub fn strWidth(str: []const u8) usize { | 42 | pub fn strWidth(str: []const u8) usize { |
| 43 | var total: isize = 0; | 43 | var total: isize = 0; |
| 44 | 44 | ||
| 45 | // ASCII fast path | ||
| 45 | if (isAsciiOnly(str)) { | 46 | if (isAsciiOnly(str)) { |
| 46 | for (str) |b| { | 47 | for (str) |b| total += codePointWidth(b); |
| 47 | // Backspace and delete | 48 | return @intCast(@max(0, total)); |
| 48 | if (b == 0x8 or b == 0x7f) { | ||
| 49 | total -= 1; | ||
| 50 | } else if (b >= 0x20) { | ||
| 51 | total += 1; | ||
| 52 | } | ||
| 53 | } | ||
| 54 | |||
| 55 | return if (total > 0) @intCast(total) else 0; | ||
| 56 | } | 49 | } |
| 57 | 50 | ||
| 58 | var giter = GraphemeIterator.init(str); | 51 | var giter = GraphemeIterator.init(str); |
| @@ -72,14 +65,17 @@ pub fn strWidth(str: []const u8) usize { | |||
| 72 | } | 65 | } |
| 73 | 66 | ||
| 74 | // Only adding width of first non-zero-width code point. | 67 | // Only adding width of first non-zero-width code point. |
| 75 | if (gc_total == 0) gc_total = w; | 68 | if (gc_total == 0) { |
| 69 | gc_total = w; | ||
| 70 | break; | ||
| 71 | } | ||
| 76 | } | 72 | } |
| 77 | } | 73 | } |
| 78 | 74 | ||
| 79 | total += gc_total; | 75 | total += gc_total; |
| 80 | } | 76 | } |
| 81 | 77 | ||
| 82 | return if (total > 0) @intCast(total) else 0; | 78 | return @intCast(@max(0, total)); |
| 83 | } | 79 | } |
| 84 | 80 | ||
| 85 | test "display_width Width" { | 81 | test "display_width Width" { |
| @@ -147,4 +143,14 @@ test "display_width Width" { | |||
| 147 | // The following passes but as a mere coincidence. | 143 | // The following passes but as a mere coincidence. |
| 148 | const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; | 144 | const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; |
| 149 | try testing.expectEqual(@as(usize, 2), strWidth(kannada_2)); | 145 | try testing.expectEqual(@as(usize, 2), strWidth(kannada_2)); |
| 146 | |||
| 147 | // From Rust https://github.com/jameslanska/unicode-display-width | ||
| 148 | try testing.expectEqual(@as(usize, 15), strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻")); | ||
| 149 | try testing.expectEqual(@as(usize, 2), strWidth("🦀")); | ||
| 150 | try testing.expectEqual(@as(usize, 2), strWidth("👨‍👩‍👧‍👧")); | ||
| 151 | try testing.expectEqual(@as(usize, 2), strWidth("👩‍🔬")); | ||
| 152 | try testing.expectEqual(@as(usize, 9), strWidth("sane text")); | ||
| 153 | try testing.expectEqual(@as(usize, 9), strWidth("αΊΜΓ‘Μ²lΝΜΜΜΜΝgΜΜΜΜΜΝ’ΝΝoΜͺΜTΜ’ΜΜ«ΜΜΝeΜ¬ΝΝΝΜΝxΜΊΜαΉΜΜΝ ")); | ||
| 154 | try testing.expectEqual(@as(usize, 17), strWidth("슬라바 우크라이나")); | ||
| 155 | try testing.expectEqual(@as(usize, 1), strWidth("\u{378}")); | ||
| 150 | } | 156 | } |
diff --git a/src/main.zig b/src/main.zig index bb188ff..38ba343 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -1,12 +1,12 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | 2 | ||
| 3 | // const GraphemeIterator = @import("ziglyph").GraphemeIterator; | 3 | // const GraphemeIterator = @import("ziglyph").GraphemeIterator; |
| 4 | const GraphemeIterator = @import("Grapheme").GraphemeIterator; | 4 | // const GraphemeIterator = @import("Grapheme").GraphemeIterator; |
| 5 | // const codePointWidth = @import("ziglyph").display_width.codePointWidth; | 5 | // const codePointWidth = @import("ziglyph").display_width.codePointWidth; |
| 6 | // const codePointWidth = @import("display_width").codePointWidth; | 6 | // const codePointWidth = @import("display_width").codePointWidth; |
| 7 | // const strWidth = @import("ziglyph").display_width.strWidth; | 7 | // const strWidth = @import("ziglyph").display_width.strWidth; |
| 8 | // const strWidth = @import("display_width").strWidth; | 8 | const strWidth = @import("display_width").strWidth; |
| 9 | // const CodePointIterator = @import("CodePoint").CodePointIterator; | 9 | const CodePointIterator = @import("CodePoint").CodePointIterator; |
| 10 | 10 | ||
| 11 | pub fn main() !void { | 11 | pub fn main() !void { |
| 12 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; | 12 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; |
| @@ -17,16 +17,17 @@ pub fn main() !void { | |||
| 17 | defer allocator.free(input); | 17 | defer allocator.free(input); |
| 18 | 18 | ||
| 19 | var result: usize = 0; | 19 | var result: usize = 0; |
| 20 | var iter = GraphemeIterator.init(input); | 20 | // var result: isize = 0; |
| 21 | // var iter = GraphemeIterator.init(input); | ||
| 21 | // var iter = CodePointIterator{ .bytes = input }; | 22 | // var iter = CodePointIterator{ .bytes = input }; |
| 22 | // var iter = std.mem.splitScalar(u8, input, '\n'); | 23 | var iter = std.mem.splitScalar(u8, input, '\n'); |
| 23 | 24 | ||
| 24 | var timer = try std.time.Timer.start(); | 25 | var timer = try std.time.Timer.start(); |
| 25 | 26 | ||
| 26 | // for (0..50) |_| { | 27 | // for (0..50) |_| { |
| 27 | // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); | 28 | // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); |
| 28 | while (iter.next()) |_| result += 1; | 29 | // while (iter.next()) |_| result += 1; |
| 29 | // while (iter.next()) |line| result += strWidth(line); | 30 | while (iter.next()) |line| result += strWidth(line); |
| 30 | // iter.cp_iter.i = 0; | 31 | // iter.cp_iter.i = 0; |
| 31 | // } | 32 | // } |
| 32 | 33 | ||