summary | refs | log | tree | commit | diff
path: root/src/grapheme.zig
diff options
context:
space:
mode:
author: Jose Colon Rodriguez, 2024-03-28 22:19:50 -0400
committer: Jose Colon Rodriguez, 2024-03-28 22:19:50 -0400
commita2c4b7a57fe6b64bdd7c71305d408e5030af3157 (patch)
treec7af1ed4381ab0eeea52e2a9081cb19469b8c0e6 /src/grapheme.zig
parentMerged NumericData into PropsData (diff)
downloadzg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.tar.gz
zg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.tar.xz
zg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.zip
Split out Unicode tests to separate file
Diffstat (limited to 'src/grapheme.zig')
-rw-r--r--src/grapheme.zig65
1 files changed, 0 insertions, 65 deletions
diff --git a/src/grapheme.zig b/src/grapheme.zig
index ad43cfd..f4cc68c 100644
--- a/src/grapheme.zig
+++ b/src/grapheme.zig
@@ -230,71 +230,6 @@ pub fn graphemeBreak(
230 return true; 230 return true;
231} 231}
232 232
233test "Segmentation GraphemeIterator" { // Runs Iterator over every case in GraphemeBreakTest.txt and compares the clusters it yields with the expected ones.
234 const allocator = std.testing.allocator;
235 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); // Path is relative to the current working directory.
236 defer file.close();
237 var buf_reader = std.io.bufferedReader(file.reader());
238 var input_stream = buf_reader.reader();
239
240 const data = try GraphemeData.init(allocator);
241 defer data.deinit();
242
243 var buf: [4096]u8 = undefined; // Line buffer; a line longer than this makes readUntilDelimiterOrEof return an error.
244 var line_no: usize = 1; // Only referenced by the commented-out debug print below.
245
246 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
247 // Skip empty lines and lines starting with '#' or '@'.
248 if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
249
250 // Clean up: strip the leading "÷ " bytes and cut the line at the trailing " ÷\t#" comment, if present.
251 var line = std.mem.trimLeft(u8, raw, "÷ "); // trimLeft treats its last argument as a set of bytes to strip.
252 if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
253 line = line[0..octo];
254 }
255 // Iterate over fields: each " ÷ "-separated field becomes one expected Grapheme in `want`.
256 var want = std.ArrayList(Grapheme).init(allocator);
257 defer want.deinit();
258
259 var all_bytes = std.ArrayList(u8).init(allocator); // UTF-8 encoding of every code point on this line, in order.
260 defer all_bytes.deinit();
261
262 var graphemes = std.mem.split(u8, line, " ÷ ");
263 var bytes_index: u32 = 0; // Byte offset into all_bytes where the current field starts.
264
265 while (graphemes.next()) |field| {
266 var code_points = std.mem.split(u8, field, " ");
267 var cp_buf: [4]u8 = undefined; // Scratch space for one UTF-8 encoded code point.
268 var cp_index: u32 = 0; // Bytes encoded for this field; added to bytes_index afterwards.
269 var gc_len: u8 = 0; // Same byte count as cp_index, kept as u8 for Grapheme.len.
270
271 while (code_points.next()) |code_point| {
272 if (std.mem.eql(u8, code_point, "×")) continue; // "×" tokens inside a field carry no code point; skip them.
273 const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); // Code points are written in hexadecimal.
274 const len = try unicode.utf8Encode(cp, &cp_buf);
275 try all_bytes.appendSlice(cp_buf[0..len]);
276 cp_index += len;
277 gc_len += len;
278 }
279
280 try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
281 bytes_index += cp_index;
282 }
283
284 // Debug aid (disabled): std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
285 var iter = Iterator.init(all_bytes.items, &data);
286
287 // Check: every expected cluster must match the next cluster yielded by the iterator, byte for byte.
288 for (want.items) |want_gc| {
289 const got_gc = (iter.next()).?; // `.?` asserts the iterator has not ended before all expected clusters were seen.
290 try std.testing.expectEqualStrings(
291 want_gc.bytes(all_bytes.items),
292 got_gc.bytes(all_bytes.items),
293 );
294 } // NOTE(review): nothing here checks that iter is exhausted after the last expected cluster.
295 }
296}
297
298test "Segmentation ZWJ and ZWSP emoji sequences" { 233test "Segmentation ZWJ and ZWSP emoji sequences" {
299 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 234 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
300 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 235 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";