summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sam Atman, 2025-05-23 18:46:30 -0400
committer: Sam Atman, 2025-05-23 18:46:30 -0400
commit: c9a1b3392973ee30e6a9a532f1da8605619b5b06 (patch)
tree: 1198b2fcb544bcef9f634cf507d848d82548f00a
parent: Add iterateBefore and iterateAfter (diff)
download: zg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.tar.gz
zg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.tar.xz
zg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.zip
Make offset size configurable
Hopefully I can talk users out of taking advantage of this configuration, but I'll have better luck with that if it's available.
-rw-r--r-- build.zig 54
-rw-r--r-- src/Graphemes.zig 20
-rw-r--r-- src/Words.zig 14
-rw-r--r-- src/code_point.zig 16
-rw-r--r-- src/unicode_tests.zig 10
5 files changed, 66 insertions, 48 deletions
diff --git a/build.zig b/build.zig
index 8cfa039..648571b 100644
--- a/build.zig
+++ b/build.zig
@@ -11,7 +11,34 @@ pub fn build(b: *std.Build) void {
11 .optimize = optimize, 11 .optimize = optimize,
12 }); 12 });
13 13
14 // Code generation 14 //| Options
15
16 // Display width
17 const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false;
18 const dwp_options = b.addOptions();
19 dwp_options.addOption(bool, "cjk", cjk);
20
21 // Visible Controls
22 const c0_width = b.option(
23 i4,
24 "c0_width",
25 "C0 controls have this width (default: 0, <BS> <Del> default -1)",
26 );
27 dwp_options.addOption(?i4, "c0_width", c0_width);
28 const c1_width = b.option(
29 i4,
30 "c1_width",
31 "C1 controls have this width (default: 0)",
32 );
33 dwp_options.addOption(?i4, "c1_width", c1_width);
34
35 //| Offset size
36 const fat_offset = b.option(bool, "fat_offset", "Offsets in Iterators and data structures will be u64") orelse false;
37 const size_config = b.addOptions();
38 size_config.addOption(bool, "fat_offset", fat_offset);
39
40 //| Code generation
41
15 // Grapheme break 42 // Grapheme break
16 const gbp_gen_exe = b.addExecutable(.{ 43 const gbp_gen_exe = b.addExecutable(.{
17 .name = "gbp", 44 .name = "gbp",
@@ -31,32 +58,13 @@ pub fn build(b: *std.Build) void {
31 const run_wbp_gen_exe = b.addRunArtifact(wbp_gen_exe); 58 const run_wbp_gen_exe = b.addRunArtifact(wbp_gen_exe);
32 const wbp_gen_out = run_wbp_gen_exe.addOutputFileArg("wbp.bin.z"); 59 const wbp_gen_out = run_wbp_gen_exe.addOutputFileArg("wbp.bin.z");
33 60
34 // Display width
35 const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false;
36 const options = b.addOptions();
37 options.addOption(bool, "cjk", cjk);
38
39 // Visible Controls
40 const c0_width = b.option(
41 i4,
42 "c0_width",
43 "C0 controls have this width (default: 0, <BS> <Del> default -1)",
44 );
45 options.addOption(?i4, "c0_width", c0_width);
46 const c1_width = b.option(
47 i4,
48 "c1_width",
49 "C1 controls have this width (default: 0)",
50 );
51 options.addOption(?i4, "c1_width", c1_width);
52
53 const dwp_gen_exe = b.addExecutable(.{ 61 const dwp_gen_exe = b.addExecutable(.{
54 .name = "dwp", 62 .name = "dwp",
55 .root_source_file = b.path("codegen/dwp.zig"), 63 .root_source_file = b.path("codegen/dwp.zig"),
56 .target = b.graph.host, 64 .target = b.graph.host,
57 .optimize = .Debug, 65 .optimize = .Debug,
58 }); 66 });
59 dwp_gen_exe.root_module.addOptions("options", options); 67 dwp_gen_exe.root_module.addOptions("options", dwp_options);
60 const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe); 68 const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe);
61 const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z"); 69 const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z");
62 70
@@ -199,6 +207,7 @@ pub fn build(b: *std.Build) void {
199 .target = target, 207 .target = target,
200 .optimize = optimize, 208 .optimize = optimize,
201 }); 209 });
210 code_point.addOptions("config", size_config);
202 211
203 const code_point_t = b.addTest(.{ 212 const code_point_t = b.addTest(.{
204 .name = "code_point", 213 .name = "code_point",
@@ -216,6 +225,7 @@ pub fn build(b: *std.Build) void {
216 }); 225 });
217 graphemes.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); 226 graphemes.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out });
218 graphemes.addImport("code_point", code_point); 227 graphemes.addImport("code_point", code_point);
228 graphemes.addOptions("config", size_config);
219 229
220 const grapheme_t = b.addTest(.{ 230 const grapheme_t = b.addTest(.{
221 .name = "Graphemes", 231 .name = "Graphemes",
@@ -267,7 +277,7 @@ pub fn build(b: *std.Build) void {
267 display_width.addImport("ascii", ascii); 277 display_width.addImport("ascii", ascii);
268 display_width.addImport("code_point", code_point); 278 display_width.addImport("code_point", code_point);
269 display_width.addImport("Graphemes", graphemes); 279 display_width.addImport("Graphemes", graphemes);
270 display_width.addOptions("options", options); // For testing 280 display_width.addOptions("options", dwp_options); // For testing
271 281
272 const display_width_t = b.addTest(.{ 282 const display_width_t = b.addTest(.{
273 .name = "display_width", 283 .name = "display_width",
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 0338c04..49fdbf3 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -5,9 +5,11 @@ const Allocator = mem.Allocator;
5const compress = std.compress; 5const compress = std.compress;
6const unicode = std.unicode; 6const unicode = std.unicode;
7 7
8const CodePoint = @import("code_point").CodePoint; 8const code_point = @import("code_point");
9const CodePointIterator = @import("code_point").Iterator; 9const CodePoint = code_point.CodePoint;
10const CodePointReverseIterator = @import("code_point").ReverseIterator; 10const CodePointIterator = code_point.Iterator;
11const CodePointReverseIterator = code_point.ReverseIterator;
12const uoffset = code_point.uoffset;
11 13
12s1: []u16 = undefined, 14s1: []u16 = undefined,
13s2: []u16 = undefined, 15s2: []u16 = undefined,
@@ -104,8 +106,8 @@ pub const Gbp = enum {
104 106
105/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. 107/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
106pub const Grapheme = struct { 108pub const Grapheme = struct {
107 len: u32, 109 len: uoffset,
108 offset: u32, 110 offset: uoffset,
109 111
110 /// `bytes` returns the slice of bytes that correspond to 112 /// `bytes` returns the slice of bytes that correspond to
111 /// this grapheme cluster in `src`. 113 /// this grapheme cluster in `src`.
@@ -199,7 +201,7 @@ pub const ReverseIterator = struct {
199 /// Count of pending RI codepoints, it is an even number 201 /// Count of pending RI codepoints, it is an even number
200 ri_count: usize, 202 ri_count: usize,
201 /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji 203 /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
202 extend_end: u32, 204 extend_end: uoffset,
203 }; 205 };
204 206
205 const Self = @This(); 207 const Self = @This();
@@ -219,7 +221,7 @@ pub const ReverseIterator = struct {
219 pub fn prev(self: *Self) ?Grapheme { 221 pub fn prev(self: *Self) ?Grapheme {
220 if (self.buf[1] == null) return null; 222 if (self.buf[1] == null) return null;
221 223
222 const grapheme_end: u32 = end: { 224 const grapheme_end: uoffset = end: {
223 const codepoint = self.buf[1].?; 225 const codepoint = self.buf[1].?;
224 226
225 switch (self.pending) { 227 switch (self.pending) {
@@ -270,7 +272,7 @@ pub const ReverseIterator = struct {
270 if (!state.hasIndic()) { 272 if (!state.hasIndic()) {
271 273
272 // BUF: [?Any, Extend | Linker] Consonant 274 // BUF: [?Any, Extend | Linker] Consonant
273 var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; 275 var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
274 276
275 indic: while (true) { 277 indic: while (true) {
276 if (self.buf[0] == null) { 278 if (self.buf[0] == null) {
@@ -321,7 +323,7 @@ pub const ReverseIterator = struct {
321 323
322 if (!state.hasXpic()) { 324 if (!state.hasXpic()) {
323 // BUF: [?Any, ZWJ] Emoji 325 // BUF: [?Any, ZWJ] Emoji
324 var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; 326 var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
325 327
326 // Look for previous Emoji 328 // Look for previous Emoji
327 emoji: while (true) { 329 emoji: while (true) {
diff --git a/src/Words.zig b/src/Words.zig
index 1d10b2a..1707881 100644
--- a/src/Words.zig
+++ b/src/Words.zig
@@ -53,8 +53,8 @@ pub fn deinit(words: *const Words, allocator: mem.Allocator) void {
53/// Represents a Unicode word span, as an offset into the source string 53/// Represents a Unicode word span, as an offset into the source string
54/// and the length of the word. 54/// and the length of the word.
55pub const Word = struct { 55pub const Word = struct {
56 offset: u32, 56 offset: uoffset,
57 len: u32, 57 len: uoffset,
58 58
59 /// Returns a slice of the word given the source string. 59 /// Returns a slice of the word given the source string.
60 pub fn bytes(word: Word, src: []const u8) []const u8 { 60 pub fn bytes(word: Word, src: []const u8) []const u8 {
@@ -183,7 +183,7 @@ pub const Iterator = struct {
183 if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; 183 if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
184 184
185 const word_start = iter.this.?.offset; 185 const word_start = iter.this.?.offset;
186 var word_len: u32 = 0; 186 var word_len: uoffset = 0;
187 187
188 // State variables. 188 // State variables.
189 var last_p: WordBreakProperty = .none; 189 var last_p: WordBreakProperty = .none;
@@ -364,7 +364,7 @@ pub const ReverseIterator = struct {
364 if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; 364 if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };
365 365
366 const word_end = iter.after.?.offset + iter.after.?.len; 366 const word_end = iter.after.?.offset + iter.after.?.len;
367 var word_len: u32 = 0; 367 var word_len: uoffset = 0;
368 368
369 // State variables. 369 // State variables.
370 var last_p: WordBreakProperty = .none; 370 var last_p: WordBreakProperty = .none;
@@ -518,7 +518,7 @@ pub const ReverseIterator = struct {
518 518
519/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. 519/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
520fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { 520fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator {
521 var idx: u32 = @intCast(index); 521 var idx: uoffset = @intCast(index);
522 // Find the next lead byte: 522 // Find the next lead byte:
523 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} 523 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
524 if (idx == string.len) return words.reverseIterator(string); 524 if (idx == string.len) return words.reverseIterator(string);
@@ -537,7 +537,7 @@ fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) Rever
537} 537}
538 538
539fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { 539fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator {
540 var idx: u32 = @intCast(index); 540 var idx: uoffset = @intCast(index);
541 if (idx == string.len) { 541 if (idx == string.len) {
542 return .{ 542 return .{
543 .cp_iter = .{ .bytes = string, .i = idx }, 543 .cp_iter = .{ .bytes = string, .i = idx },
@@ -746,6 +746,8 @@ const Allocator = mem.Allocator;
746const assert = std.debug.assert; 746const assert = std.debug.assert;
747const testing = std.testing; 747const testing = std.testing;
748 748
749const uoffset = code_point.uoffset;
750
749const code_point = @import("code_point"); 751const code_point = @import("code_point");
750const CodepointIterator = code_point.Iterator; 752const CodepointIterator = code_point.Iterator;
751const ReverseCodepointIterator = code_point.ReverseIterator; 753const ReverseCodepointIterator = code_point.ReverseIterator;
diff --git a/src/code_point.zig b/src/code_point.zig
index 9a84080..8bd3d5b 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -4,12 +4,14 @@
4//! Represents invalid data according to the Replacement of Maximal 4//! Represents invalid data according to the Replacement of Maximal
5//! Subparts algorithm. 5//! Subparts algorithm.
6 6
7pub const uoffset = if (@import("config").fat_offset) u64 else u32;
8
7/// `CodePoint` represents a Unicode code point by its code, 9/// `CodePoint` represents a Unicode code point by its code,
8/// length, and offset in the source bytes. 10/// length, and offset in the source bytes.
9pub const CodePoint = struct { 11pub const CodePoint = struct {
10 code: u21, 12 code: u21,
11 len: u3, 13 len: u3,
12 offset: u32, 14 offset: uoffset,
13 15
14 /// Return the slice of this codepoint, given the original string. 16 /// Return the slice of this codepoint, given the original string.
15 pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { 17 pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 {
@@ -27,8 +29,8 @@ pub const CodePoint = struct {
27 29
28/// This function is deprecated and will be removed in a later release. 30/// This function is deprecated and will be removed in a later release.
29/// Use `decodeAtIndex` or `decodeAtCursor`. 31/// Use `decodeAtIndex` or `decodeAtCursor`.
30pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { 32pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
31 var off: u32 = 0; 33 var off: uoffset = 0;
32 var maybe_code = decodeAtCursor(bytes, &off); 34 var maybe_code = decodeAtCursor(bytes, &off);
33 if (maybe_code) |*code| { 35 if (maybe_code) |*code| {
34 code.offset = offset; 36 code.offset = offset;
@@ -38,14 +40,14 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
38} 40}
39 41
40/// Decode the CodePoint, if any, at `bytes[idx]`. 42/// Decode the CodePoint, if any, at `bytes[idx]`.
41pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint { 43pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint {
42 var off = idx; 44 var off = idx;
43 return decodeAtCursor(bytes, &off); 45 return decodeAtCursor(bytes, &off);
44} 46}
45 47
46/// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the 48/// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the
47/// cursor will point at the next potential codepoint index. 49/// cursor will point at the next potential codepoint index.
48pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { 50pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint {
49 // EOS 51 // EOS
50 if (cursor.* >= bytes.len) return null; 52 if (cursor.* >= bytes.len) return null;
51 53
@@ -161,7 +163,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
161/// `Iterator` iterates a string one `CodePoint` at-a-time. 163/// `Iterator` iterates a string one `CodePoint` at-a-time.
162pub const Iterator = struct { 164pub const Iterator = struct {
163 bytes: []const u8, 165 bytes: []const u8,
164 i: u32 = 0, 166 i: uoffset = 0,
165 167
166 pub fn init(bytes: []const u8) Iterator { 168 pub fn init(bytes: []const u8) Iterator {
167 return .{ .bytes = bytes, .i = 0 }; 169 return .{ .bytes = bytes, .i = 0 };
@@ -257,7 +259,7 @@ const class_mask: [12]u8 = .{
257 259
258pub const ReverseIterator = struct { 260pub const ReverseIterator = struct {
259 bytes: []const u8, 261 bytes: []const u8,
260 i: ?u32, 262 i: ?uoffset,
261 263
262 pub fn init(str: []const u8) ReverseIterator { 264 pub fn init(str: []const u8) ReverseIterator {
263 var r_iter: ReverseIterator = undefined; 265 var r_iter: ReverseIterator = undefined;
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 195fdcb..c463dcc 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -141,12 +141,12 @@ test "Segmentation GraphemeIterator" {
141 defer all_bytes.deinit(); 141 defer all_bytes.deinit();
142 142
143 var graphemes = std.mem.splitSequence(u8, line, " ÷ "); 143 var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
144 var bytes_index: u32 = 0; 144 var bytes_index: uoffset = 0;
145 145
146 while (graphemes.next()) |field| { 146 while (graphemes.next()) |field| {
147 var code_points = std.mem.splitScalar(u8, field, ' '); 147 var code_points = std.mem.splitScalar(u8, field, ' ');
148 var cp_buf: [4]u8 = undefined; 148 var cp_buf: [4]u8 = undefined;
149 var cp_index: u32 = 0; 149 var cp_index: uoffset = 0;
150 var gc_len: u8 = 0; 150 var gc_len: u8 = 0;
151 151
152 while (code_points.next()) |code_point| { 152 while (code_points.next()) |code_point| {
@@ -231,12 +231,12 @@ test "Segmentation Word Iterator" {
231 defer all_bytes.deinit(); 231 defer all_bytes.deinit();
232 232
233 var words = std.mem.splitSequence(u8, line, " ÷ "); 233 var words = std.mem.splitSequence(u8, line, " ÷ ");
234 var bytes_index: u32 = 0; 234 var bytes_index: uoffset = 0;
235 235
236 while (words.next()) |field| { 236 while (words.next()) |field| {
237 var code_points = std.mem.splitScalar(u8, field, ' '); 237 var code_points = std.mem.splitScalar(u8, field, ' ');
238 var cp_buf: [4]u8 = undefined; 238 var cp_buf: [4]u8 = undefined;
239 var cp_index: u32 = 0; 239 var cp_index: uoffset = 0;
240 var gc_len: u8 = 0; 240 var gc_len: u8 = 0;
241 241
242 while (code_points.next()) |code_point| { 242 while (code_points.next()) |code_point| {
@@ -425,6 +425,8 @@ const debug = std.debug;
425const testing = std.testing; 425const testing = std.testing;
426const unicode = std.unicode; 426const unicode = std.unicode;
427 427
428const uoffset = @FieldType(Word, "offset");
429
428const Grapheme = @import("Graphemes").Grapheme; 430const Grapheme = @import("Graphemes").Grapheme;
429const Graphemes = @import("Graphemes"); 431const Graphemes = @import("Graphemes");
430const GraphemeIterator = @import("Graphemes").Iterator; 432const GraphemeIterator = @import("Graphemes").Iterator;