summaryrefslogtreecommitdiff
path: root/src/unicode_tests.zig
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-15 16:06:35 -0400
committerGravatar Sam Atman2025-05-15 16:06:35 -0400
commit6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3 (patch)
tree61f83982aaab67f862524d46b073fcfee01d56cf /src/unicode_tests.zig
parentwordAtIndex passes conformance (diff)
parentMerge branch 'work-branch' into HEAD (diff)
downloadzg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.tar.gz
zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.tar.xz
zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.zip
Merge commit 'b5d955f' into develop-next
Diffstat (limited to 'src/unicode_tests.zig')
-rw-r--r--src/unicode_tests.zig74
1 files changed, 74 insertions, 0 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 8b02e98..0204b92 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -175,6 +175,80 @@ test "Segmentation GraphemeIterator" {
175 } 175 }
176} 176}
177 177
// Conformance test for the reverse grapheme iterator.
//
// Each sample line of GraphemeBreakTest.txt is converted into a UTF-8 byte
// string plus the list of expected grapheme spans (offset + length). The spans
// are then compared, last to first, with what `reverseIterator(...).prev()`
// yields, and finally the iterator is required to be exhausted.
test "Segmentation ReverseGraphemeIterator" {
    const allocator = std.testing.allocator;
    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
    defer file.close();
    var buf_reader = std.io.bufferedReader(file.reader());
    var input_stream = buf_reader.reader();

    const data = try Graphemes.init(allocator);
    defer data.deinit(allocator);

    var buf: [4096]u8 = undefined;
    var line_no: usize = 1;

    // `line_no` is advanced in the continue expression, so it stays accurate
    // for skipped lines as well.
    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
        // Skip comments or empty lines.
        if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;

        // Clean up: strip the leading "÷ " and cut the line at the final
        // " ÷\t#" so that only the fields remain.
        var line = std.mem.trimLeft(u8, raw, "÷ ");
        if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
            line = line[0..octo];
        }

        // Expected grapheme spans, in forward order.
        var want = std.ArrayList(Grapheme).init(allocator);
        defer want.deinit();

        // UTF-8 encoding of every code point on the line, concatenated.
        var all_bytes = std.ArrayList(u8).init(allocator);
        defer all_bytes.deinit();

        // Iterate over fields; each " ÷ "-separated field is one expected grapheme.
        var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
        var bytes_index: u32 = 0;

        while (graphemes.next()) |field| {
            var code_points = std.mem.splitScalar(u8, field, ' ');
            var cp_buf: [4]u8 = undefined;
            var cp_index: u32 = 0;
            var gc_len: u8 = 0;

            while (code_points.next()) |code_point| {
                // "×" tokens carry no code point; every other token is parsed
                // as a hexadecimal code point.
                if (std.mem.eql(u8, code_point, "×")) continue;
                const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
                const len = try unicode.utf8Encode(cp, &cp_buf);
                try all_bytes.appendSlice(cp_buf[0..len]);
                cp_index += len;
                gc_len += len;
            }

            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
            bytes_index += cp_index;
        }

        var iter = data.reverseIterator(all_bytes.items);

        // Check: walk the expectations from the last grapheme to the first.
        var i: usize = want.items.len;
        while (i > 0) {
            i -= 1;
            const want_gc = want.items[i];
            const got_gc = iter.prev() orelse {
                std.debug.print("line {d} grapheme {d}: expected {any} found null\n", .{ line_no, i, want_gc });
                return error.TestExpectedEqual;
            };
            std.testing.expectEqualStrings(
                want_gc.bytes(all_bytes.items),
                got_gc.bytes(all_bytes.items),
            ) catch |err| {
                std.debug.print("line {d} grapheme {d}: expected {any} found {any}\n", .{ line_no, i, want_gc, got_gc });
                return err;
            };
        }

        // All expected graphemes have been matched, so the iterator must now
        // be exhausted; anything further is a spurious extra grapheme.
        if (iter.prev()) |extra_gc| {
            std.debug.print("line {d}: expected end of iteration found {any}\n", .{ line_no, extra_gc });
            return error.TestExpectedEqual;
        }
    }
}
251
178test "Segmentation Word Iterator" { 252test "Segmentation Word Iterator" {
179 const allocator = std.testing.allocator; 253 const allocator = std.testing.allocator;
180 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); 254 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});