summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/Grapheme.zig136
-rw-r--r--src/display_width.zig30
-rw-r--r--src/main.zig15
3 files changed, 97 insertions, 84 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index 6892a2a..910aec5 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -56,7 +56,7 @@ pub const GraphemeIterator = struct {
56 56
57 const gc_start = self.buf[0].?.offset; 57 const gc_start = self.buf[0].?.offset;
58 var gc_len: usize = self.buf[0].?.len; 58 var gc_len: usize = self.buf[0].?.len;
59 var state: u3 = 0; 59 var state = State{};
60 60
61 if (graphemeBreak( 61 if (graphemeBreak(
62 self.buf[0].?.code, 62 self.buf[0].?.code,
@@ -95,36 +95,42 @@ fn isIgnorable(cp: u21) bool {
95} 95}
96 96
97// Grapheme break state. 97// Grapheme break state.
98// Extended Pictographic (emoji) 98const State = struct {
99fn hasXpic(state: *const u3) bool { 99 bits: u3 = 0,
100 return state.* & 1 == 1; 100
101} 101 // Extended Pictographic (emoji)
102fn setXpic(state: *u3) void { 102 fn hasXpic(self: State) bool {
103 state.* |= 1; 103 return self.bits & 1 == 1;
104} 104 }
105fn unsetXpic(state: *u3) void { 105 fn setXpic(self: *State) void {
106 state.* ^= 1; 106 self.bits |= 1;
107} 107 }
108// Regional Indicatior (flags) 108 fn unsetXpic(self: *State) void {
109fn hasRegional(state: *const u3) bool { 109 self.bits ^= 1;
110 return state.* & 2 == 2; 110 }
111} 111
112fn setRegional(state: *u3) void { 112 // Regional Indicatior (flags)
113 state.* |= 2; 113 fn hasRegional(self: State) bool {
114} 114 return self.bits & 2 == 2;
115fn unsetRegional(state: *u3) void { 115 }
116 state.* ^= 2; 116 fn setRegional(self: *State) void {
117} 117 self.bits |= 2;
118// Indic Conjunct 118 }
119fn hasIndic(state: *const u3) bool { 119 fn unsetRegional(self: *State) void {
120 return state.* & 4 == 4; 120 self.bits ^= 2;
121} 121 }
122fn setIndic(state: *u3) void { 122
123 state.* |= 4; 123 // Indic Conjunct
124} 124 fn hasIndic(self: State) bool {
125fn unsetIndic(state: *u3) void { 125 return self.bits & 4 == 4;
126 state.* ^= 4; 126 }
127} 127 fn setIndic(self: *State) void {
128 self.bits |= 4;
129 }
130 fn unsetIndic(self: *State) void {
131 self.bits ^= 4;
132 }
133};
128 134
129/// `graphemeBreak` returns true only if a grapheme break point is required 135/// `graphemeBreak` returns true only if a grapheme break point is required
130/// between `cp1` and `cp2`. `state` should start out as 0. If calling 136/// between `cp1` and `cp2`. `state` should start out as 0. If calling
@@ -135,7 +141,7 @@ fn unsetIndic(state: *u3) void {
135pub fn graphemeBreak( 141pub fn graphemeBreak(
136 cp1: u21, 142 cp1: u21,
137 cp2: u21, 143 cp2: u21,
138 state: *u3, 144 state: *State,
139) bool { 145) bool {
140 // Extract relevant properties. 146 // Extract relevant properties.
141 const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; 147 const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]];
@@ -149,9 +155,9 @@ pub fn graphemeBreak(
149 const cp2_is_emoji = cp2_props_byte & 1 == 1; 155 const cp2_is_emoji = cp2_props_byte & 1 == 1;
150 156
151 // GB11: Emoji Extend* ZWJ x Emoji 157 // GB11: Emoji Extend* ZWJ x Emoji
152 if (!hasXpic(state) and cp1_is_emoji) setXpic(state); 158 if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
153 // GB9c: Indic Conjunct Break 159 // GB9c: Indic Conjunct Break
154 if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state); 160 if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic();
155 161
156 // GB3: CR x LF 162 // GB3: CR x LF
157 if (cp1 == '\r' and cp2 == '\n') return false; 163 if (cp1 == '\r' and cp2 == '\n') return false;
@@ -159,23 +165,13 @@ pub fn graphemeBreak(
159 // GB4: Control 165 // GB4: Control
160 if (isBreaker(cp1)) return true; 166 if (isBreaker(cp1)) return true;
161 167
162 // GB6: Hangul L x (L|V|LV|VT) 168 // GB11: Emoji Extend* ZWJ x Emoji
163 if (cp1_gbp_prop == .L) { 169 if (state.hasXpic() and
164 if (cp2_gbp_prop == .L or 170 cp1_gbp_prop == .ZWJ and
165 cp2_gbp_prop == .V or 171 cp2_is_emoji)
166 cp2_gbp_prop == .LV or 172 {
167 cp2_gbp_prop == .LVT) return false; 173 state.unsetXpic();
168 } 174 return false;
169
170 // GB7: Hangul (LV | V) x (V | T)
171 if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) {
172 if (cp2_gbp_prop == .V or
173 cp2_gbp_prop == .T) return false;
174 }
175
176 // GB8: Hangul (LVT | T) x T
177 if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) {
178 if (cp2_gbp_prop == .T) return false;
179 } 175 }
180 176
181 // GB9b: x (Extend | ZWJ) 177 // GB9b: x (Extend | ZWJ)
@@ -189,44 +185,54 @@ pub fn graphemeBreak(
189 185
190 // GB12, GB13: RI x RI 186 // GB12, GB13: RI x RI
191 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { 187 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
192 if (hasRegional(state)) { 188 if (state.hasRegional()) {
193 unsetRegional(state); 189 state.unsetRegional();
194 return true; 190 return true;
195 } else { 191 } else {
196 setRegional(state); 192 state.setRegional();
197 return false; 193 return false;
198 } 194 }
199 } 195 }
200 196
201 // GB11: Emoji Extend* ZWJ x Emoji 197 // GB6: Hangul L x (L|V|LV|VT)
202 if (hasXpic(state) and 198 if (cp1_gbp_prop == .L) {
203 cp1_gbp_prop == .ZWJ and 199 if (cp2_gbp_prop == .L or
204 cp2_is_emoji) 200 cp2_gbp_prop == .V or
205 { 201 cp2_gbp_prop == .LV or
206 unsetXpic(state); 202 cp2_gbp_prop == .LVT) return false;
207 return false; 203 }
204
205 // GB7: Hangul (LV | V) x (V | T)
206 if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) {
207 if (cp2_gbp_prop == .V or
208 cp2_gbp_prop == .T) return false;
209 }
210
211 // GB8: Hangul (LVT | T) x T
212 if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) {
213 if (cp2_gbp_prop == .T) return false;
208 } 214 }
209 215
210 // GB9c: Indic Conjunct Break 216 // GB9c: Indic Conjunct Break
211 if (hasIndic(state) and 217 if (state.hasIndic() and
212 cp1_indic_prop == .Consonant and 218 cp1_indic_prop == .Consonant and
213 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) 219 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
214 { 220 {
215 return false; 221 return false;
216 } 222 }
217 223
218 if (hasIndic(state) and 224 if (state.hasIndic() and
219 cp1_indic_prop == .Extend and 225 cp1_indic_prop == .Extend and
220 cp2_indic_prop == .Linker) 226 cp2_indic_prop == .Linker)
221 { 227 {
222 return false; 228 return false;
223 } 229 }
224 230
225 if (hasIndic(state) and 231 if (state.hasIndic() and
226 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and 232 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
227 cp2_indic_prop == .Consonant) 233 cp2_indic_prop == .Consonant)
228 { 234 {
229 unsetIndic(state); 235 state.unsetIndic();
230 return false; 236 return false;
231 } 237 }
232 238
diff --git a/src/display_width.zig b/src/display_width.zig
index aed0ef1..2ac7093 100644
--- a/src/display_width.zig
+++ b/src/display_width.zig
@@ -42,17 +42,10 @@ fn isAsciiOnly(str: []const u8) bool {
42pub fn strWidth(str: []const u8) usize { 42pub fn strWidth(str: []const u8) usize {
43 var total: isize = 0; 43 var total: isize = 0;
44 44
45 // ASCII fast path
45 if (isAsciiOnly(str)) { 46 if (isAsciiOnly(str)) {
46 for (str) |b| { 47 for (str) |b| total += codePointWidth(b);
47 // Backspace and delete 48 return @intCast(@max(0, total));
48 if (b == 0x8 or b == 0x7f) {
49 total -= 1;
50 } else if (b >= 0x20) {
51 total += 1;
52 }
53 }
54
55 return if (total > 0) @intCast(total) else 0;
56 } 49 }
57 50
58 var giter = GraphemeIterator.init(str); 51 var giter = GraphemeIterator.init(str);
@@ -72,14 +65,17 @@ pub fn strWidth(str: []const u8) usize {
72 } 65 }
73 66
74 // Only adding width of first non-zero-width code point. 67 // Only adding width of first non-zero-width code point.
75 if (gc_total == 0) gc_total = w; 68 if (gc_total == 0) {
69 gc_total = w;
70 break;
71 }
76 } 72 }
77 } 73 }
78 74
79 total += gc_total; 75 total += gc_total;
80 } 76 }
81 77
82 return if (total > 0) @intCast(total) else 0; 78 return @intCast(@max(0, total));
83} 79}
84 80
85test "display_width Width" { 81test "display_width Width" {
@@ -147,4 +143,14 @@ test "display_width Width" {
147 // The following passes but as a mere coincidence. 143 // The following passes but as a mere coincidence.
148 const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; 144 const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}";
149 try testing.expectEqual(@as(usize, 2), strWidth(kannada_2)); 145 try testing.expectEqual(@as(usize, 2), strWidth(kannada_2));
146
147 // From Rust https://github.com/jameslanska/unicode-display-width
148 try testing.expectEqual(@as(usize, 15), strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻"));
149 try testing.expectEqual(@as(usize, 2), strWidth("🦀"));
150 try testing.expectEqual(@as(usize, 2), strWidth("👨‍👩‍👧‍👧"));
151 try testing.expectEqual(@as(usize, 2), strWidth("👩‍🔬"));
152 try testing.expectEqual(@as(usize, 9), strWidth("sane text"));
153 try testing.expectEqual(@as(usize, 9), strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ"));
154 try testing.expectEqual(@as(usize, 17), strWidth("슬라바 우크라이나"));
155 try testing.expectEqual(@as(usize, 1), strWidth("\u{378}"));
150} 156}
diff --git a/src/main.zig b/src/main.zig
index bb188ff..38ba343 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,12 +1,12 @@
1const std = @import("std"); 1const std = @import("std");
2 2
3// const GraphemeIterator = @import("ziglyph").GraphemeIterator; 3// const GraphemeIterator = @import("ziglyph").GraphemeIterator;
4const GraphemeIterator = @import("Grapheme").GraphemeIterator; 4// const GraphemeIterator = @import("Grapheme").GraphemeIterator;
5// const codePointWidth = @import("ziglyph").display_width.codePointWidth; 5// const codePointWidth = @import("ziglyph").display_width.codePointWidth;
6// const codePointWidth = @import("display_width").codePointWidth; 6// const codePointWidth = @import("display_width").codePointWidth;
7// const strWidth = @import("ziglyph").display_width.strWidth; 7// const strWidth = @import("ziglyph").display_width.strWidth;
8// const strWidth = @import("display_width").strWidth; 8const strWidth = @import("display_width").strWidth;
9// const CodePointIterator = @import("CodePoint").CodePointIterator; 9const CodePointIterator = @import("CodePoint").CodePointIterator;
10 10
11pub fn main() !void { 11pub fn main() !void {
12 var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 12 var gpa = std.heap.GeneralPurposeAllocator(.{}){};
@@ -17,16 +17,17 @@ pub fn main() !void {
17 defer allocator.free(input); 17 defer allocator.free(input);
18 18
19 var result: usize = 0; 19 var result: usize = 0;
20 var iter = GraphemeIterator.init(input); 20 // var result: isize = 0;
21 // var iter = GraphemeIterator.init(input);
21 // var iter = CodePointIterator{ .bytes = input }; 22 // var iter = CodePointIterator{ .bytes = input };
22 // var iter = std.mem.splitScalar(u8, input, '\n'); 23 var iter = std.mem.splitScalar(u8, input, '\n');
23 24
24 var timer = try std.time.Timer.start(); 25 var timer = try std.time.Timer.start();
25 26
26 // for (0..50) |_| { 27 // for (0..50) |_| {
27 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); 28 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code));
28 while (iter.next()) |_| result += 1; 29 // while (iter.next()) |_| result += 1;
29 // while (iter.next()) |line| result += strWidth(line); 30 while (iter.next()) |line| result += strWidth(line);
30 // iter.cp_iter.i = 0; 31 // iter.cp_iter.i = 0;
31 // } 32 // }
32 33