summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--build.zig7
-rw-r--r--codegen/dwp.zig24
-rw-r--r--codegen/gbp.zig36
-rw-r--r--src/Graphemes.zig155
-rw-r--r--src/unicode_tests.zig12
5 files changed, 117 insertions, 117 deletions
diff --git a/build.zig b/build.zig
index 5678cd1..aab8516 100644
--- a/build.zig
+++ b/build.zig
@@ -52,7 +52,7 @@ pub fn build(b: *std.Build) void {
52 gbp_gen_exe.root_module.addAnonymousImport("GraphemeBreakProperty.txt", .{ .root_source_file = b.path("data/unicode/auxiliary/GraphemeBreakProperty.txt") }); 52 gbp_gen_exe.root_module.addAnonymousImport("GraphemeBreakProperty.txt", .{ .root_source_file = b.path("data/unicode/auxiliary/GraphemeBreakProperty.txt") });
53 gbp_gen_exe.root_module.addAnonymousImport("emoji-data.txt", .{ .root_source_file = b.path("data/unicode/emoji/emoji-data.txt") }); 53 gbp_gen_exe.root_module.addAnonymousImport("emoji-data.txt", .{ .root_source_file = b.path("data/unicode/emoji/emoji-data.txt") });
54 const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe); 54 const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe);
55 const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.bin.z"); 55 const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.zig");
56 56
57 const wbp_gen_exe = b.addExecutable(.{ 57 const wbp_gen_exe = b.addExecutable(.{
58 .name = "wbp", 58 .name = "wbp",
@@ -78,7 +78,7 @@ pub fn build(b: *std.Build) void {
78 dwp_gen_exe.root_module.addAnonymousImport("DerivedGeneralCategory.txt", .{ .root_source_file = b.path("data/unicode/extracted/DerivedGeneralCategory.txt") }); 78 dwp_gen_exe.root_module.addAnonymousImport("DerivedGeneralCategory.txt", .{ .root_source_file = b.path("data/unicode/extracted/DerivedGeneralCategory.txt") });
79 dwp_gen_exe.root_module.addOptions("options", dwp_options); 79 dwp_gen_exe.root_module.addOptions("options", dwp_options);
80 const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe); 80 const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe);
81 const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z"); 81 const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.zig");
82 82
83 // Normalization properties 83 // Normalization properties
84 const canon_gen_exe = b.addExecutable(.{ 84 const canon_gen_exe = b.addExecutable(.{
@@ -514,6 +514,9 @@ pub fn build(b: *std.Build) void {
514 514
515 const run_unicode_tests = b.addRunArtifact(unicode_tests); 515 const run_unicode_tests = b.addRunArtifact(unicode_tests);
516 516
517 const test_unicode_step = b.step("unicode", "Rune unicode tests");
518 test_unicode_step.dependOn(&run_unicode_tests.step);
519
517 const test_step = b.step("test", "Run all module tests"); 520 const test_step = b.step("test", "Run all module tests");
518 test_step.dependOn(&run_unicode_tests.step); 521 test_step.dependOn(&run_unicode_tests.step);
519 test_step.dependOn(&code_point_tr.step); 522 test_step.dependOn(&code_point_tr.step);
diff --git a/codegen/dwp.zig b/codegen/dwp.zig
index 75ac68e..b4d1ed0 100644
--- a/codegen/dwp.zig
+++ b/codegen/dwp.zig
@@ -235,12 +235,24 @@ pub fn main() anyerror!void {
235 defer out_file.close(); 235 defer out_file.close();
236 var writer = out_file.writer(&write_buf); 236 var writer = out_file.writer(&write_buf);
237 237
238 const endian = builtin.cpu.arch.endian(); 238 try writer.interface.print(
239 try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); 239 \\//! This file is auto-generated. Do not edit.
240 for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); 240 \\
241 241 \\pub const s1: [{}]u16 = .{{
242 try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); 242 , .{stage1.items.len});
243 for (stage2.items) |i| try writer.interface.writeInt(i8, i, endian); 243 for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
244
245 try writer.interface.print(
246 \\
247 \\}};
248 \\
249 \\pub const s2: [{}]i4 = .{{
250 , .{stage2.items.len});
251 for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
252
253 try writer.interface.writeAll(
254 \\};
255 );
244 256
245 try writer.interface.flush(); 257 try writer.interface.flush();
246} 258}
diff --git a/codegen/gbp.zig b/codegen/gbp.zig
index 1d06e9a..117847f 100644
--- a/codegen/gbp.zig
+++ b/codegen/gbp.zig
@@ -240,16 +240,32 @@ pub fn main() anyerror!void {
240 defer out_file.close(); 240 defer out_file.close();
241 var writer = out_file.writer(&write_buf); 241 var writer = out_file.writer(&write_buf);
242 242
243 const endian = builtin.cpu.arch.endian(); 243 try writer.interface.print(
244 try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); 244 \\//! This file is auto-generated. Do not edit.
245 for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); 245 \\
246 246 \\pub const s1: [{}]u16 = .{{
247 try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); 247 , .{stage1.items.len});
248 for (stage2.items) |i| try writer.interface.writeInt(u16, i, endian); 248 for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
249 249
250 const props_bytes = stage3.keys(); 250 try writer.interface.print(
251 try writer.interface.writeInt(u16, @intCast(props_bytes.len), endian); 251 \\
252 try writer.interface.writeAll(props_bytes); 252 \\}};
253 \\
254 \\pub const s2: [{}]u7 = .{{
255 , .{stage2.items.len});
256 for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
257
258 const keys = stage3.keys();
259
260 try writer.interface.print(
261 \\}};
262 \\
263 \\pub const s3: [{}]u8 = .{{
264 , .{keys.len});
265 for (keys) |entry| try writer.interface.print("{}, ", .{entry});
266 try writer.interface.writeAll(
267 \\};
268 );
253 269
254 try writer.interface.flush(); 270 try writer.interface.flush();
255} 271}
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 81d874c..d14b6ab 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -3,70 +3,46 @@
3//! Code for handling graphemes: fragments of string which should be 3//! Code for handling graphemes: fragments of string which should be
4//! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾 4//! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾
5 5
6s1: []u16 = undefined,
7s2: []u16 = undefined,
8s3: []u8 = undefined,
9
10const Graphemes = @This(); 6const Graphemes = @This();
11 7
12pub fn init(allocator: Allocator) Allocator.Error!Graphemes { 8const Data = struct {
13 var graphemes = Graphemes{}; 9 s1: []const u16 = undefined,
14 try graphemes.setup(allocator); 10 s2: []const u7 = undefined,
15 return graphemes; 11 s3: []const u8 = undefined,
16} 12};
17
18pub fn setup(graphemes: *Graphemes, allocator: Allocator) Allocator.Error!void {
19 const in_bytes = @embedFile("gbp");
20 var in_fbs = std.io.fixedBufferStream(in_bytes);
21 var reader = in_fbs.reader();
22
23 const endian = builtin.cpu.arch.endian();
24
25 const s1_len: u16 = reader.readInt(u16, endian) catch unreachable;
26 graphemes.s1 = try allocator.alloc(u16, s1_len);
27 errdefer allocator.free(graphemes.s1);
28 for (0..s1_len) |i| graphemes.s1[i] = reader.readInt(u16, endian) catch unreachable;
29
30 const s2_len: u16 = reader.readInt(u16, endian) catch unreachable;
31 graphemes.s2 = try allocator.alloc(u16, s2_len);
32 errdefer allocator.free(graphemes.s2);
33 for (0..s2_len) |i| graphemes.s2[i] = reader.readInt(u16, endian) catch unreachable;
34
35 const s3_len: u16 = reader.readInt(u16, endian) catch unreachable;
36 graphemes.s3 = try allocator.alloc(u8, s3_len);
37 errdefer allocator.free(graphemes.s3);
38 _ = reader.readAll(graphemes.s3) catch unreachable;
39}
40 13
41pub fn deinit(graphemes: *const Graphemes, allocator: Allocator) void { 14const graphemes = graphemes: {
42 allocator.free(graphemes.s1); 15 const data = @import("gbp");
43 allocator.free(graphemes.s2); 16 break :graphemes Data{
44 allocator.free(graphemes.s3); 17 .s1 = &data.s1,
45} 18 .s2 = &data.s2,
19 .s3 = &data.s3,
20 };
21};
46 22
47/// Lookup the grapheme break property for a code point. 23/// Lookup the grapheme break property for a code point.
48pub fn gbp(graphemes: Graphemes, cp: u21) Gbp { 24pub fn gbp(cp: u21) Gbp {
49 return @enumFromInt(graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 4); 25 return @enumFromInt(graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 4);
50} 26}
51 27
52/// Lookup the indic syllable type for a code point. 28/// Lookup the indic syllable type for a code point.
53pub fn indic(graphemes: Graphemes, cp: u21) Indic { 29pub fn indic(cp: u21) Indic {
54 return @enumFromInt((graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); 30 return @enumFromInt((graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7);
55} 31}
56 32
57/// Lookup the emoji property for a code point. 33/// Lookup the emoji property for a code point.
58pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { 34pub fn isEmoji(cp: u21) bool {
59 return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; 35 return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
60} 36}
61 37
62/// Returns an iterator over the graphemes in `string`. 38/// Returns an iterator over the graphemes in `string`.
63pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { 39pub fn iterator(string: []const u8) Iterator {
64 return Iterator.init(string, graphemes); 40 return Iterator.init(string);
65} 41}
66 42
67/// Returns a reverse iterator over the graphemes in `string`. 43/// Returns a reverse iterator over the graphemes in `string`.
68pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { 44pub fn reverseIterator(string: []const u8) ReverseIterator {
69 return ReverseIterator.init(string, graphemes); 45 return ReverseIterator.init(string);
70} 46}
71 47
72/// Indic syllable type. 48/// Indic syllable type.
@@ -81,6 +57,7 @@ pub const Indic = enum {
81/// Grapheme break property. 57/// Grapheme break property.
82pub const Gbp = enum { 58pub const Gbp = enum {
83 none, 59 none,
60
84 Control, 61 Control,
85 CR, 62 CR,
86 Extend, 63 Extend,
@@ -117,7 +94,7 @@ pub const Grapheme = struct {
117/// Returns the `Grapheme` at `string[index]`, which does not have to be a 94/// Returns the `Grapheme` at `string[index]`, which does not have to be a
118/// valid start of a codepoint. Asserts the string is not empty. Index must be 95/// valid start of a codepoint. Asserts the string is not empty. Index must be
119/// less than `string.len`. Always returns a `Grapheme`. 96/// less than `string.len`. Always returns a `Grapheme`.
120pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme { 97pub fn graphemeAtIndex(string: []const u8, index: usize) Grapheme {
121 assert(string.len != 0); 98 assert(string.len != 0);
122 if (index == 0 or (index > 0 and 99 if (index == 0 or (index > 0 and
123 string[index] < 0x80 and 100 string[index] < 0x80 and
@@ -125,7 +102,7 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u
125 (string[index - 1] != '\r' and string[index] != '\n')) 102 (string[index - 1] != '\r' and string[index] != '\n'))
126 { 103 {
127 // There's always a grapheme break between two ASCII code points (except CRLF) 104 // There's always a grapheme break between two ASCII code points (except CRLF)
128 var iter = graphemes.iterator(string[index..]); 105 var iter = Graphemes.iterator(string[index..]);
129 const next = iter.next().?; 106 const next = iter.next().?;
130 return Grapheme{ 107 return Grapheme{
131 .len = next.len, 108 .len = next.len,
@@ -134,14 +111,14 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u
134 } // Otherwise it gets hairy. 111 } // Otherwise it gets hairy.
135 const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; 112 const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset;
136 if (idx == string.len) { 113 if (idx == string.len) {
137 var iter = graphemes.reverseIterator(string); 114 var iter = Graphemes.reverseIterator(string);
138 return iter.prev().?; 115 return iter.prev().?;
139 } 116 }
140 // We're on a valid codepoint boundary, we go back from here 117 // We're on a valid codepoint boundary, we go back from here
141 var r_iter = graphemes.reverseIterAtIndex(string, idx); 118 var r_iter = Graphemes.reverseIterAtIndex(string, idx);
142 if (r_iter.prev()) |g| { 119 if (r_iter.prev()) |g| {
143 if (g.offset == 0) { 120 if (g.offset == 0) {
144 var iter = graphemes.iterator(string); 121 var iter = Graphemes.iterator(string);
145 while (iter.next()) |g2| { 122 while (iter.next()) |g2| {
146 if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; 123 if (g2.offset <= idx and idx < g2.offset + g2.len) return g2;
147 } 124 }
@@ -151,7 +128,7 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u
151 // we in fact need to be. 128 // we in fact need to be.
152 _ = r_iter.prev(); 129 _ = r_iter.prev();
153 while (r_iter.pending != .none) : (_ = r_iter.prev()) {} 130 while (r_iter.pending != .none) : (_ = r_iter.prev()) {}
154 var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); 131 var iter = Graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0);
155 while (iter.next()) |g| { 132 while (iter.next()) |g| {
156 if (g.offset <= idx and idx < g.offset + g.len) return g; 133 if (g.offset <= idx and idx < g.offset + g.len) return g;
157 } 134 }
@@ -159,23 +136,22 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u
159} 136}
160 137
161/// Return a (forward) iterator of `string` after `grapheme`. 138/// Return a (forward) iterator of `string` after `grapheme`.
162pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator { 139pub fn iterateAfterGrapheme(string: []const u8, grapheme: Grapheme) Iterator {
163 return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); 140 return Graphemes.iterAtIndex(string, grapheme.offset + grapheme.len);
164} 141}
165 142
166/// Return a reverse iterator of `string` before `grapheme`. 143/// Return a reverse iterator of `string` before `grapheme`.
167pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator { 144pub fn iterateBeforeGrapheme(string: []const u8, grapheme: Grapheme) ReverseIterator {
168 // This bit of weirdness is because reverse iterators are "advance last", 145 // This bit of weirdness is because reverse iterators are "advance last",
169 // while forward iterators are "advance first". This leaves some room for 146 // while forward iterators are "advance first". This leaves some room for
170 // further optimization, if anyone dares. 147 // further optimization, if anyone dares.
171 var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); 148 var r_iter = Graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1);
172 _ = r_iter.prev(); 149 _ = r_iter.prev();
173 return r_iter; 150 return r_iter;
174} 151}
175 152
176fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator { 153fn reverseIterAtIndex(string: []const u8, idx: uoffset) ReverseIterator {
177 var r_iter: ReverseIterator = undefined; 154 var r_iter: ReverseIterator = undefined;
178 r_iter.data = graphemes;
179 var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; 155 var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
180 r_iter.buf[1] = rcp_iter.prev(); 156 r_iter.buf[1] = rcp_iter.prev();
181 r_iter.buf[0] = rcp_iter.prev(); 157 r_iter.buf[0] = rcp_iter.prev();
@@ -184,9 +160,8 @@ fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoff
184 return r_iter; 160 return r_iter;
185} 161}
186 162
187fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator { 163fn iterAtIndex(string: []const u8, idx: uoffset) Iterator {
188 var iter: Iterator = undefined; 164 var iter: Iterator = undefined;
189 iter.data = graphemes;
190 iter.buf[0] = first: { 165 iter.buf[0] = first: {
191 if (idx == string.len) break :first null; 166 if (idx == string.len) break :first null;
192 var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; 167 var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
@@ -202,13 +177,12 @@ fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) It
202pub const Iterator = struct { 177pub const Iterator = struct {
203 buf: [2]?CodePoint = .{ null, null }, 178 buf: [2]?CodePoint = .{ null, null },
204 cp_iter: CodePointIterator, 179 cp_iter: CodePointIterator,
205 data: *const Graphemes,
206 180
207 const Self = @This(); 181 const Self = @This();
208 182
209 /// Assumes `str` is valid UTF-8. 183 /// Assumes `str` is valid UTF-8.
210 pub fn init(str: []const u8, data: *const Graphemes) Self { 184 pub fn init(str: []const u8) Self {
211 var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; 185 var self = Self{ .cp_iter = .{ .bytes = str } };
212 self.advance(); 186 self.advance();
213 return self; 187 return self;
214 } 188 }
@@ -237,7 +211,6 @@ pub const Iterator = struct {
237 if (graphemeBreak( 211 if (graphemeBreak(
238 self.buf[0].?.code, 212 self.buf[0].?.code,
239 self.buf[1].?.code, 213 self.buf[1].?.code,
240 self.data,
241 &state, 214 &state,
242 )) return Grapheme{ .len = gc_len, .offset = gc_start }; 215 )) return Grapheme{ .len = gc_len, .offset = gc_start };
243 216
@@ -250,7 +223,6 @@ pub const Iterator = struct {
250 if (graphemeBreak( 223 if (graphemeBreak(
251 self.buf[0].?.code, 224 self.buf[0].?.code,
252 if (self.buf[1]) |ncp| ncp.code else 0, 225 if (self.buf[1]) |ncp| ncp.code else 0,
253 self.data,
254 &state, 226 &state,
255 )) break; 227 )) break;
256 } 228 }
@@ -275,7 +247,6 @@ pub const Iterator = struct {
275pub const ReverseIterator = struct { 247pub const ReverseIterator = struct {
276 buf: [2]?CodePoint = .{ null, null }, 248 buf: [2]?CodePoint = .{ null, null },
277 cp_iter: CodePointReverseIterator, 249 cp_iter: CodePointReverseIterator,
278 data: *const Graphemes,
279 /// Codepoint read from `cp_iter` but not returned by `previous` 250 /// Codepoint read from `cp_iter` but not returned by `previous`
280 pending: Pending = .none, 251 pending: Pending = .none,
281 252
@@ -289,8 +260,8 @@ pub const ReverseIterator = struct {
289 260
290 const Self = @This(); 261 const Self = @This();
291 262
292 pub fn init(str: []const u8, data: *const Graphemes) Self { 263 pub fn init(str: []const u8) Self {
293 var self: Self = .{ .cp_iter = .init(str), .data = data }; 264 var self: Self = .{ .cp_iter = .init(str) };
294 self.advance(); 265 self.advance();
295 self.advance(); 266 self.advance();
296 return self; 267 return self;
@@ -352,7 +323,6 @@ pub const ReverseIterator = struct {
352 if (graphemeBreak( 323 if (graphemeBreak(
353 self.buf[0].?.code, 324 self.buf[0].?.code,
354 self.buf[1].?.code, 325 self.buf[1].?.code,
355 self.data,
356 &state, 326 &state,
357 )) break; 327 )) break;
358 328
@@ -374,7 +344,7 @@ pub const ReverseIterator = struct {
374 344
375 const codepoint = self.buf[0].?; 345 const codepoint = self.buf[0].?;
376 346
377 switch (self.data.indic(codepoint.code)) { 347 switch (Graphemes.indic(codepoint.code)) {
378 .Extend, .Linker => { 348 .Extend, .Linker => {
379 self.advance(); 349 self.advance();
380 continue :indic; 350 continue :indic;
@@ -387,7 +357,7 @@ pub const ReverseIterator = struct {
387 if (self.buf[0]) |cp1| { 357 if (self.buf[0]) |cp1| {
388 state.indic = true; 358 state.indic = true;
389 359
390 if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; 360 if (graphemeBreak(cp1.code, self.buf[1].?.code, &state)) break;
391 361
392 if (!state.indic) { 362 if (!state.indic) {
393 continue :indic; 363 continue :indic;
@@ -426,12 +396,12 @@ pub const ReverseIterator = struct {
426 396
427 const codepoint = self.buf[0].?; 397 const codepoint = self.buf[0].?;
428 398
429 if (self.data.gbp(codepoint.code) == .Extend) { 399 if (Graphemes.gbp(codepoint.code) == .Extend) {
430 self.advance(); 400 self.advance();
431 continue :emoji; 401 continue :emoji;
432 } 402 }
433 403
434 if (self.data.isEmoji(codepoint.code)) { 404 if (Graphemes.isEmoji(codepoint.code)) {
435 // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* 405 // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)*
436 emoji_offset = codepoint.offset; 406 emoji_offset = codepoint.offset;
437 self.advance(); 407 self.advance();
@@ -462,7 +432,7 @@ pub const ReverseIterator = struct {
462 if (state.regional) { 432 if (state.regional) {
463 var ri_count: usize = 0; 433 var ri_count: usize = 0;
464 while (self.buf[0] != null and 434 while (self.buf[0] != null and
465 self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) 435 Graphemes.gbp(self.buf[0].?.code) == .Regional_Indicator)
466 { 436 {
467 ri_count += 1; 437 ri_count += 1;
468 self.advance(); 438 self.advance();
@@ -500,10 +470,13 @@ pub const IterState = packed struct(u3) {
500 indic: bool = false, 470 indic: bool = false,
501}; 471};
502 472
473// TODO: isBreaker is also expensive given the data is already available,
474// and should be "semantically inlined" wherever it belongs.
475
503// Predicates 476// Predicates
504fn isBreaker(cp: u21, data: *const Graphemes) bool { 477fn isBreaker(cp: u21) bool {
505 // Extract relevant properties. 478 // Extract relevant properties.
506 const cp_gbp_prop = data.gbp(cp); 479 const cp_gbp_prop = Graphemes.gbp(cp);
507 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; 480 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
508} 481}
509 482
@@ -516,17 +489,20 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool {
516pub fn graphemeBreak( 489pub fn graphemeBreak(
517 cp1: u21, 490 cp1: u21,
518 cp2: u21, 491 cp2: u21,
519 data: *const Graphemes,
520 state: *IterState, 492 state: *IterState,
521) bool { 493) bool {
494 // TODO: it's silly to index the same field three times and
495 // just extract different bits from the data. Optimizable? Maybe
496 // but it's silly to rely on that.
497 //
522 // Extract relevant properties. 498 // Extract relevant properties.
523 const cp1_gbp_prop = data.gbp(cp1); 499 const cp1_gbp_prop = Graphemes.gbp(cp1);
524 const cp1_indic_prop = data.indic(cp1); 500 const cp1_indic_prop = Graphemes.indic(cp1);
525 const cp1_is_emoji = data.isEmoji(cp1); 501 const cp1_is_emoji = Graphemes.isEmoji(cp1);
526 502
527 const cp2_gbp_prop = data.gbp(cp2); 503 const cp2_gbp_prop = Graphemes.gbp(cp2);
528 const cp2_indic_prop = data.indic(cp2); 504 const cp2_indic_prop = Graphemes.indic(cp2);
529 const cp2_is_emoji = data.isEmoji(cp2); 505 const cp2_is_emoji = Graphemes.isEmoji(cp2);
530 506
531 // GB11: Emoji Extend* ZWJ x Emoji 507 // GB11: Emoji Extend* ZWJ x Emoji
532 if (!state.xpic and cp1_is_emoji) state.xpic = true; 508 if (!state.xpic and cp1_is_emoji) state.xpic = true;
@@ -537,7 +513,7 @@ pub fn graphemeBreak(
537 if (cp1 == '\r' and cp2 == '\n') return false; 513 if (cp1 == '\r' and cp2 == '\n') return false;
538 514
539 // GB4: Control 515 // GB4: Control
540 if (isBreaker(cp1, data)) return true; 516 if (isBreaker(cp1)) return true;
541 517
542 // GB11: Emoji Extend* ZWJ x Emoji 518 // GB11: Emoji Extend* ZWJ x Emoji
543 if (state.xpic and 519 if (state.xpic and
@@ -555,7 +531,7 @@ pub fn graphemeBreak(
555 if (cp2_gbp_prop == .SpacingMark) return false; 531 if (cp2_gbp_prop == .SpacingMark) return false;
556 532
557 // GB9b: Prepend x 533 // GB9b: Prepend x
558 if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false; 534 if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false;
559 535
560 // GB12, GB13: RI x RI 536 // GB12, GB13: RI x RI
561 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { 537 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
@@ -620,25 +596,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
620 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; 596 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
621 const no_joiner = seq_1 ++ seq_2; 597 const no_joiner = seq_1 ++ seq_2;
622 598
623 const graphemes = try Graphemes.init(std.testing.allocator);
624 defer graphemes.deinit(std.testing.allocator);
625
626 { 599 {
627 var iter = graphemes.iterator(with_zwj); 600 var iter = Graphemes.iterator(with_zwj);
628 var i: usize = 0; 601 var i: usize = 0;
629 while (iter.next()) |_| : (i += 1) {} 602 while (iter.next()) |_| : (i += 1) {}
630 try std.testing.expectEqual(@as(usize, 1), i); 603 try std.testing.expectEqual(@as(usize, 1), i);
631 } 604 }
632 605
633 { 606 {
634 var iter = graphemes.iterator(with_zwsp); 607 var iter = Graphemes.iterator(with_zwsp);
635 var i: usize = 0; 608 var i: usize = 0;
636 while (iter.next()) |_| : (i += 1) {} 609 while (iter.next()) |_| : (i += 1) {}
637 try std.testing.expectEqual(@as(usize, 3), i); 610 try std.testing.expectEqual(@as(usize, 3), i);
638 } 611 }
639 612
640 { 613 {
641 var iter = graphemes.iterator(no_joiner); 614 var iter = Graphemes.iterator(no_joiner);
642 var i: usize = 0; 615 var i: usize = 0;
643 while (iter.next()) |_| : (i += 1) {} 616 while (iter.next()) |_| : (i += 1) {}
644 try std.testing.expectEqual(@as(usize, 2), i); 617 try std.testing.expectEqual(@as(usize, 2), i);
@@ -647,10 +620,8 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
647 620
648test "Iterator.peek" { 621test "Iterator.peek" {
649 const peek_seq = "aΔ👨🏻‍🌾→"; 622 const peek_seq = "aΔ👨🏻‍🌾→";
650 const data = try Graphemes.init(std.testing.allocator);
651 defer data.deinit(std.testing.allocator);
652 623
653 var iter = data.iterator(peek_seq); 624 var iter = Graphemes.iterator(peek_seq);
654 const peek_a = iter.peek().?; 625 const peek_a = iter.peek().?;
655 const next_a = iter.next().?; 626 const next_a = iter.next().?;
656 try std.testing.expectEqual(peek_a, next_a); 627 try std.testing.expectEqual(peek_a, next_a);
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index e2a5a96..946c197 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -118,8 +118,6 @@ test "Segmentation GraphemeIterator" {
118 const allocator = std.testing.allocator; 118 const allocator = std.testing.allocator;
119 119
120 var reader = std.io.Reader.fixed(@embedFile("GraphemeBreakTest.txt")); 120 var reader = std.io.Reader.fixed(@embedFile("GraphemeBreakTest.txt"));
121 const graph = try Graphemes.init(allocator);
122 defer graph.deinit(allocator);
123 121
124 var line_iter: IterRead = .{ .read = &reader }; 122 var line_iter: IterRead = .{ .read = &reader };
125 123
@@ -161,7 +159,7 @@ test "Segmentation GraphemeIterator" {
161 const this_str = all_bytes.items; 159 const this_str = all_bytes.items;
162 160
163 { 161 {
164 var iter = graph.iterator(this_str); 162 var iter = Graphemes.iterator(this_str);
165 163
166 // Check. 164 // Check.
167 for (want.items, 1..) |want_gc, idx| { 165 for (want.items, 1..) |want_gc, idx| {
@@ -171,7 +169,7 @@ test "Segmentation GraphemeIterator" {
171 got_gc.bytes(this_str), 169 got_gc.bytes(this_str),
172 ); 170 );
173 for (got_gc.offset..got_gc.offset + got_gc.len) |i| { 171 for (got_gc.offset..got_gc.offset + got_gc.len) |i| {
174 const this_gc = graph.graphemeAtIndex(this_str, i); 172 const this_gc = Graphemes.graphemeAtIndex(this_str, i);
175 std.testing.expectEqualSlices( 173 std.testing.expectEqualSlices(
176 u8, 174 u8,
177 got_gc.bytes(this_str), 175 got_gc.bytes(this_str),
@@ -181,7 +179,7 @@ test "Segmentation GraphemeIterator" {
181 return err; 179 return err;
182 }; 180 };
183 } 181 }
184 var after_iter = graph.iterateAfterGrapheme(this_str, got_gc); 182 var after_iter = Graphemes.iterateAfterGrapheme(this_str, got_gc);
185 if (after_iter.next()) |next_gc| { 183 if (after_iter.next()) |next_gc| {
186 if (iter.peek()) |next_peek| { 184 if (iter.peek()) |next_peek| {
187 std.testing.expectEqualSlices( 185 std.testing.expectEqualSlices(
@@ -202,7 +200,7 @@ test "Segmentation GraphemeIterator" {
202 } 200 }
203 } 201 }
204 { 202 {
205 var iter = graph.reverseIterator(this_str); 203 var iter = Graphemes.reverseIterator(this_str);
206 204
207 // Check. 205 // Check.
208 var i: usize = want.items.len; 206 var i: usize = want.items.len;
@@ -226,7 +224,7 @@ test "Segmentation GraphemeIterator" {
226 ); 224 );
227 return err; 225 return err;
228 }; 226 };
229 var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc); 227 var before_iter = Graphemes.iterateBeforeGrapheme(this_str, got_gc);
230 if (before_iter.prev()) |prev_gc| { 228 if (before_iter.prev()) |prev_gc| {
231 if (iter.peek()) |prev_peek| { 229 if (iter.peek()) |prev_peek| {
232 std.testing.expectEqualSlices( 230 std.testing.expectEqualSlices(