From e3dbcc70688321e48ac31599105c51edac2736af Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 11 May 2025 16:30:47 -0400 Subject: Add WordBreakPropertyData Passes some simple lookup tests. --- build.zig | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'build.zig') diff --git a/build.zig b/build.zig index 58fd3e7..f89e90c 100644 --- a/build.zig +++ b/build.zig @@ -22,6 +22,15 @@ pub fn build(b: *std.Build) void { const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe); const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.bin.z"); + const wbp_gen_exe = b.addExecutable(.{ + .name = "wbp", + .root_source_file = b.path("codegen/wbp.zig"), + .target = b.graph.host, + .optimize = .Debug, + }); + const run_wbp_gen_exe = b.addRunArtifact(wbp_gen_exe); + const wbp_gen_out = run_wbp_gen_exe.addOutputFileArg("wbp.bin.z"); + // Display width const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false; const options = b.addOptions(); @@ -183,6 +192,7 @@ pub fn build(b: *std.Build) void { const props_gen_out = run_props_gen_exe.addOutputFileArg("props.bin.z"); // Modules we provide + // Code points const code_point = b.addModule("code_point", .{ .root_source_file = b.path("src/code_point.zig"), @@ -215,6 +225,23 @@ pub fn build(b: *std.Build) void { }); const grapheme_tr = b.addRunArtifact(grapheme_t); + // Word Breaking + const word_break = b.addModule("WordBreak", .{ + .root_source_file = b.path("src/WordBreak.zig"), + .target = target, + .optimize = optimize, + }); + word_break.addAnonymousImport("wbp", .{ .root_source_file = wbp_gen_out }); + word_break.addImport("code_point", code_point); + + const word_break_t = b.addTest(.{ + .name = "WordBreak", + .root_module = word_break, + .target = target, + .optimize = optimize, + }); + const word_break_tr = b.addRunArtifact(word_break_t); + // ASCII utilities const ascii = b.addModule("ascii", .{ .root_source_file = b.path("src/ascii.zig"), @@ -452,6 +479,7 @@ pub fn build(b: *std.Build) void { test_step.dependOn(&code_point_tr.step); test_step.dependOn(&display_width_tr.step); test_step.dependOn(&grapheme_tr.step); + test_step.dependOn(&word_break_tr.step); test_step.dependOn(&ascii_tr.step); test_step.dependOn(&ccc_data_tr.step); test_step.dependOn(&canon_data_tr.step); -- cgit v1.2.3 From cf8d8fe5d640511f6c4134fdaa36e930232ca7da Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 15:22:37 -0400 Subject: Begin conformance test I'm not sure the details of this strategy can actually be made to work. But, something can. --- build.zig | 1 + 1 file changed, 1 insertion(+) (limited to 'build.zig') diff --git a/build.zig b/build.zig index f89e90c..387b4c3 100644 --- a/build.zig +++ b/build.zig @@ -471,6 +471,7 @@ pub fn build(b: *std.Build) void { }); unicode_tests.root_module.addImport("Graphemes", graphemes); unicode_tests.root_module.addImport("Normalize", norm); + unicode_tests.root_module.addImport("WordBreak", word_break); const run_unicode_tests = b.addRunArtifact(unicode_tests); -- cgit v1.2.3 From aa20bebade8eeb3ca75199dc252feb3edb203fb1 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 16 May 2025 12:06:36 -0400 Subject: Words module In keeping with the new nomenclature, we're calling the module "Words", not "WordBreak". The latter is Unicode jargon, the module provides word iterators. Words are the figure, word breaks are the ground. --- build.zig | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'build.zig') diff --git a/build.zig b/build.zig index 387b4c3..8cfa039 100644 --- a/build.zig +++ b/build.zig @@ -226,21 +226,21 @@ pub fn build(b: *std.Build) void { const grapheme_tr = b.addRunArtifact(grapheme_t); // Word Breaking - const word_break = b.addModule("WordBreak", .{ - .root_source_file = b.path("src/WordBreak.zig"), + const words = b.addModule("Words", .{ + .root_source_file = b.path("src/Words.zig"), .target = target, .optimize = optimize, }); - word_break.addAnonymousImport("wbp", .{ .root_source_file = wbp_gen_out }); - word_break.addImport("code_point", code_point); + words.addAnonymousImport("wbp", .{ .root_source_file = wbp_gen_out }); + words.addImport("code_point", code_point); - const word_break_t = b.addTest(.{ + const words_t = b.addTest(.{ .name = "WordBreak", - .root_module = word_break, + .root_module = words, .target = target, .optimize = optimize, }); - const word_break_tr = b.addRunArtifact(word_break_t); + const words_tr = b.addRunArtifact(words_t); // ASCII utilities const ascii = b.addModule("ascii", .{ @@ -471,7 +471,7 @@ pub fn build(b: *std.Build) void { }); unicode_tests.root_module.addImport("Graphemes", graphemes); unicode_tests.root_module.addImport("Normalize", norm); - unicode_tests.root_module.addImport("WordBreak", word_break); + unicode_tests.root_module.addImport("Words", words); const run_unicode_tests = b.addRunArtifact(unicode_tests); @@ -480,7 +480,7 @@ pub fn build(b: *std.Build) void { test_step.dependOn(&code_point_tr.step); test_step.dependOn(&display_width_tr.step); test_step.dependOn(&grapheme_tr.step); - test_step.dependOn(&word_break_tr.step); + test_step.dependOn(&words_tr.step); test_step.dependOn(&ascii_tr.step); test_step.dependOn(&ccc_data_tr.step); test_step.dependOn(&canon_data_tr.step); -- cgit v1.2.3 From c9a1b3392973ee30e6a9a532f1da8605619b5b06 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 18:46:30 -0400 Subject: Make offset size configurable Hopefully I can talk users out of taking advantage of this configuration but I'll have better luck with that if it's available. --- build.zig | 54 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 22 deletions(-) (limited to 'build.zig') diff --git a/build.zig b/build.zig index 8cfa039..648571b 100644 --- a/build.zig +++ b/build.zig @@ -11,7 +11,34 @@ pub fn build(b: *std.Build) void { .optimize = optimize, }); - // Code generation + //| Options + + // Display width + const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false; + const dwp_options = b.addOptions(); + dwp_options.addOption(bool, "cjk", cjk); + + // Visible Controls + const c0_width = b.option( + i4, + "c0_width", + "C0 controls have this width (default: 0, default -1)", + ); + dwp_options.addOption(?i4, "c0_width", c0_width); + const c1_width = b.option( + i4, + "c1_width", + "C1 controls have this width (default: 0)", + ); + dwp_options.addOption(?i4, "c1_width", c1_width); + + //| Offset size + const fat_offset = b.option(bool, "fat_offset", "Offsets in Iterators and data structures will be u64") orelse false; + const size_config = b.addOptions(); + size_config.addOption(bool, "fat_offset", fat_offset); + + //| Code generation + // Grapheme break const gbp_gen_exe = b.addExecutable(.{ .name = "gbp", @@ -31,32 +58,13 @@ pub fn build(b: *std.Build) void { const run_wbp_gen_exe = b.addRunArtifact(wbp_gen_exe); const wbp_gen_out = run_wbp_gen_exe.addOutputFileArg("wbp.bin.z"); - // Display width - const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false; - const options = b.addOptions(); - options.addOption(bool, "cjk", cjk); - - // Visible Controls - const c0_width = b.option( - i4, - "c0_width", - "C0 controls have this width (default: 0, default -1)", - ); - options.addOption(?i4, "c0_width", c0_width); - const c1_width = b.option( - i4, - "c1_width", - "C1 controls have this width (default: 0)", - ); - options.addOption(?i4, "c1_width", c1_width); - const dwp_gen_exe = b.addExecutable(.{ .name = "dwp", .root_source_file = b.path("codegen/dwp.zig"), .target = b.graph.host, .optimize = .Debug, }); - dwp_gen_exe.root_module.addOptions("options", options); + dwp_gen_exe.root_module.addOptions("options", dwp_options); const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe); const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z"); @@ -199,6 +207,7 @@ pub fn build(b: *std.Build) void { .target = target, .optimize = optimize, }); + code_point.addOptions("config", size_config); const code_point_t = b.addTest(.{ .name = "code_point", @@ -216,6 +225,7 @@ pub fn build(b: *std.Build) void { }); graphemes.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); graphemes.addImport("code_point", code_point); + graphemes.addOptions("config", size_config); const grapheme_t = b.addTest(.{ .name = "Graphemes", @@ -267,7 +277,7 @@ pub fn build(b: *std.Build) void { display_width.addImport("ascii", ascii); display_width.addImport("code_point", code_point); display_width.addImport("Graphemes", graphemes); - display_width.addOptions("options", options); // For testing + display_width.addOptions("options", dwp_options); // For testing const display_width_t = b.addTest(.{ .name = "display_width", -- cgit v1.2.3 From f4a174e27052e38aec09840e9195981cc2f24c88 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 19:01:57 -0400 Subject: Document "fat_offset" in README --- build.zig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'build.zig') diff --git a/build.zig b/build.zig index 648571b..ca0eeef 100644 --- a/build.zig +++ b/build.zig @@ -14,7 +14,7 @@ pub fn build(b: *std.Build) void { //| Options // Display width - const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false; + const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2)") orelse false; const dwp_options = b.addOptions(); dwp_options.addOption(bool, "cjk", cjk); @@ -33,7 +33,7 @@ pub fn build(b: *std.Build) void { dwp_options.addOption(?i4, "c1_width", c1_width); //| Offset size - const fat_offset = b.option(bool, "fat_offset", "Offsets in Iterators and data structures will be u64") orelse false; + const fat_offset = b.option(bool, "fat_offset", "Offsets in iterators and data structures will be u64") orelse false; const size_config = b.addOptions(); size_config.addOption(bool, "fat_offset", fat_offset); -- cgit v1.2.3