summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/WordBreak.zig228
1 files changed, 228 insertions, 0 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 9044740..84fd1f7 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -71,11 +71,234 @@ pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
71 allocator.free(wordbreak.s2); 71 allocator.free(wordbreak.s2);
72} 72}
73 73
74/// Represents a Unicode word span, as an offset into the source string
75/// and the length of the word.
76pub const Word = struct {
77 offset: u32,
78 len: u32,
79
80 /// Returns a slice of the word given the source string.
81 pub fn bytes(self: Word, src: []const u8) []const u8 {
82 return src[self.offset..][0..self.len];
83 }
84};
85
74/// Returns the word break property type for `cp`. 86/// Returns the word break property type for `cp`.
75pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { 87pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
76 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 88 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
77} 89}
78 90
91const IterState = packed struct {
92 mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
93 mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
94 quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter
95 regional: bool, // [^RI] (RI RI)* RI × RI
96
97 pub const initial: IterState = .{
98 .mid_punct = false,
99 .mid_num = false,
100 .quote_heb = false,
101 .regional = false,
102 };
103};
104
105pub const Iterator = struct {
106 this: ?CodePoint = null,
107 that: ?CodePoint = null,
108 cache: ?WordBreakProperty = null,
109 cp_iter: CodepointIterator,
110 wb: *const WordBreak,
111
112 /// Assumes `str` is valid UTF-8.
113 pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
114 var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
115 wb_iter.advance();
116 return wb;
117 }
118
119 pub fn next(iter: *Iterator) ?Word {
120 iter.advance();
121
122 // Done?
123 if (iter.this == null) return null;
124 // Last?
125 if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
126
127 const word_start = iter.this.?.offset;
128 var word_len: u32 = 0;
129
130 var state: IterState = .initial;
131
132 scan: while (true) : (iter.advance()) {
133 const this = iter.this.?;
134 word_len += this.len;
135 if (iter.that) |that| {
136 const that_p = iter.wb.breakProperty(that.code);
137 const this_p = this_p: {
138 if (!isIgnorable(that_p) and iter.cache != null) {
139 defer iter.cache = null;
140 break :this_p iter.cache.?;
141 } else {
142 break :this_p iter.wb.breakProperty(this.code);
143 }
144 };
145 // WB3 CR × LF
146 if (this_p == .CR and that_p == .LF) continue :scan;
147 // WB3a (Newline | CR | LF) ÷
148 if (isNewline(this_p)) break :scan;
149 // WB3b ÷ (Newline | CR | LF)
150 if (isNewline(that_p)) break :scan;
151 // WB3c ZWJ × \p{Extended_Pictographic}
152 // The right way to do this one is a RuneSet, TODO: circle back
153 // WB3d WSegSpace × WSegSpace
154 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
155 // WB4 X (Extend | Format | ZWJ)* → X
156 if (isIgnorable(that_p)) {
157 if (iter.cache == null) {
158 iter.cache = this_p;
159 }
160 continue :scan;
161 }
162 if (isAHLetter(this_p)) {
163 // WB5 AHLetter × AHLetter
164 if (isAHLetter(that_p)) continue :scan;
165 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
166 if (isMidVal(that_p)) {
167 const next_val = iter.cp_iter.peek();
168 if (next_val) |next_cp| {
169 const next_p = iter.wb.breakProperty(next_cp.code);
170 if (isAHLetter(next_p)) {
171 state.mid_punct = true;
172 continue :scan;
173 }
174 } else break :scan;
175 }
176 }
177 // AHLetter (MidLetter | MidNumLetQ) × AHLetter
178 if (state.mid_punct) {
179 // Should always be true:
180 assert(isMidVal(this_p));
181 assert(isAHLetter(that_p));
182 state.mid_punct = false;
183 continue :scan;
184 }
185 if (this_p == .Hebrew_Letter) {
186 // WB7a Hebrew_Letter × Single_Quote
187 if (that_p == .Single_Quote) continue :scan;
188 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
189 if (that_p == .Double_Quote) {
190 const next_val = iter.cp_iter.peek();
191 if (next_val) |next_cp| {
192 const next_p = iter.wb.breakProperty(next_cp.code);
193 if (next_p == .Hebrew_Letter) {
194 state.quote_heb = true;
195 continue :scan;
196 }
197 } else break :scan;
198 }
199 }
200 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
201 if (state.quote_heb) {
202 // Should always be true:
203 assert(this_p == .Double_Quote);
204 assert(that_p == .Hebrew_Letter);
205 state.quote_heb = false;
206 continue :scan;
207 }
208 // WB8 Numeric × Numeric
209 if (this_p == .Numeric and that_p == .Numeric) continue :scan;
210 // WB9 AHLetter × Numeric
211 if (isAHLetter(this_p) and that_p == .Numeric) continue :scan;
212 // WB10 Numeric × AHLetter
213 if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
214 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
215 if (this_p == .Numeric and isMidVal(that_p)) {
216 const next_val = iter.cp_iter.peek();
217 if (next_val) |next_cp| {
218 const next_p = iter.wb.breakProperty(next_cp.code);
219 if (next_p == .Numeric) {
220 state.mid_num = true;
221 continue :scan;
222 }
223 } else break :scan;
224 }
225 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
226 if (state.mid_num) {
227 assert(isMidVal(this_p));
228 assert(that_p == .Numeric);
229 state.mid_num = false;
230 continue :scan;
231 }
232 // WB13 Katakana × Katakana
233 if (this_p == .Katakana and that_p == .Katakana) continue :scan;
234 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
235 if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
236 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
237 if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
238 // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI
239 if (that_p == .Regional_Indicator) {
240 if (this_p == .Regional_Indicator) {
241 if (state.regional) {
242 state.regional = false;
243 continue :scan;
244 } else {
245 break :scan;
246 }
247 } else {
248 const next_val = iter.cp_iter.peek();
249 if (next_val) |next_cp| {
250 const next_p = iter.wb.breakProperty(next_cp.code);
251 if (next_p == .Regional_Indicator) {
252 state.regional = true;
253 continue :scan;
254 }
255 } else break :scan;
256 }
257 }
258 // WB999 Any ÷ Any
259 break :scan;
260 } else { // iter.that == null
261 break :scan;
262 }
263 }
264
265 return Word{ .len = word_len, .offset = word_start };
266 }
267
268 fn advance(wb_iter: *Iterator) void {
269 wb_iter.this = wb_iter.that;
270 wb_iter.that = wb_iter.cp_iter.next();
271 }
272};
273
274//| Predicates
275
276inline fn isNewline(wbp: WordBreakProperty) bool {
277 return wbp == .CR or wbp == .LF or wbp == .Newline;
278}
279
280inline fn isIgnorable(wbp: WordBreakProperty) bool {
281 return switch (wbp) {
282 .Format, .Extend, .ZWJ => true,
283 else => false,
284 };
285}
286
287inline fn isAHLetter(wbp: WordBreakProperty) bool {
288 return wbp == .ALetter or wbp == .Hebrew_Letter;
289}
290
291inline fn isMidVal(wbp: WordBreakProperty) bool {
292 return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
293}
294
295inline fn isExtensible(wbp: WordBreakProperty) bool {
296 return switch (wbp) {
297 .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
298 else => false,
299 };
300}
301
79test "Word Break Properties" { 302test "Word Break Properties" {
80 const wb = try WordBreak.init(testing.allocator); 303 const wb = try WordBreak.init(testing.allocator);
81 defer wb.deinit(testing.allocator); 304 defer wb.deinit(testing.allocator);
@@ -99,4 +322,9 @@ const builtin = @import("builtin");
99const compress = std.compress; 322const compress = std.compress;
100const mem = std.mem; 323const mem = std.mem;
101const Allocator = mem.Allocator; 324const Allocator = mem.Allocator;
325const assert = std.debug.assert;
102const testing = std.testing; 326const testing = std.testing;
327
328const code_point = @import("code_point");
329const CodepointIterator = code_point.Iterator;
330const CodePoint = code_point.CodePoint;