diff options
| author | 2024-08-23 17:22:10 +0200 | |
|---|---|---|
| committer | 2024-08-23 17:22:10 +0200 | |
| commit | 8586e44de92c827d6a19a7700121c8b21d3687b1 (patch) | |
| tree | a0cc5a181a851f58a1cdea505bf1096970eb3e8d /xs_unicode.h | |
| parent | Updated TODO. (diff) | |
| download | snac2-8586e44de92c827d6a19a7700121c8b21d3687b1.tar.gz snac2-8586e44de92c827d6a19a7700121c8b21d3687b1.tar.xz snac2-8586e44de92c827d6a19a7700121c8b21d3687b1.zip | |
Some optimizations.
Diffstat (limited to 'xs_unicode.h')
| -rw-r--r-- | xs_unicode.h | 116 |
1 files changed, 114 insertions, 2 deletions
diff --git a/xs_unicode.h b/xs_unicode.h index 2e9a754..a5a1dcb 100644 --- a/xs_unicode.h +++ b/xs_unicode.h | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | unsigned int xs_utf8_dec(const char **str); | 9 | unsigned int xs_utf8_dec(const char **str); |
| 10 | int xs_unicode_width(unsigned int cpoint); | 10 | int xs_unicode_width(unsigned int cpoint); |
| 11 | int xs_is_surrogate(unsigned int cpoint); | 11 | int xs_is_surrogate(unsigned int cpoint); |
| 12 | int xs_is_diacritic(unsigned int cpoint); | ||
| 12 | unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2); | 13 | unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2); |
| 13 | unsigned int xs_surrogate_enc(unsigned int cpoint); | 14 | unsigned int xs_surrogate_enc(unsigned int cpoint); |
| 14 | unsigned int *_xs_unicode_upper_search(unsigned int cpoint); | 15 | unsigned int *_xs_unicode_upper_search(unsigned int cpoint); |
| @@ -22,7 +23,12 @@ | |||
| 22 | int xs_unicode_is_alpha(unsigned int cpoint); | 23 | int xs_unicode_is_alpha(unsigned int cpoint); |
| 23 | 24 | ||
| 24 | #ifdef _XS_H | 25 | #ifdef _XS_H |
| 26 | xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset); | ||
| 25 | xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint); | 27 | xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint); |
| 28 | xs_str *xs_utf8_to_upper(const char *str); | ||
| 29 | xs_str *xs_utf8_to_lower(const char *str); | ||
| 30 | xs_str *xs_utf8_to_nfd(const char *str); | ||
| 31 | xs_str *xs_utf8_to_nfc(const char *str); | ||
| 26 | #endif | 32 | #endif |
| 27 | 33 | ||
| 28 | #ifdef XS_IMPLEMENTATION | 34 | #ifdef XS_IMPLEMENTATION |
| @@ -144,6 +150,12 @@ int xs_unicode_width(unsigned int cpoint) | |||
| 144 | } | 150 | } |
| 145 | 151 | ||
| 146 | 152 | ||
| 153 | int xs_is_diacritic(unsigned int cpoint) /* returns non-zero if cpoint lies in the inclusive range 0x300..0x36f, zero otherwise */ | ||
| 154 | { | ||
| 155 | return cpoint >= 0x300 && cpoint <= 0x36f; /* U+0300..U+036F is the Unicode "Combining Diacritical Marks" block; combining marks outside this range are not matched */ | ||
| 156 | } | ||
| 157 | |||
| 158 | |||
| 147 | /** surrogate pairs **/ | 159 | /** surrogate pairs **/ |
| 148 | 160 | ||
| 149 | int xs_is_surrogate(unsigned int cpoint) | 161 | int xs_is_surrogate(unsigned int cpoint) |
| @@ -172,14 +184,27 @@ unsigned int xs_surrogate_enc(unsigned int cpoint) | |||
| 172 | 184 | ||
| 173 | #ifdef _XS_H | 185 | #ifdef _XS_H |
| 174 | 186 | ||
| 175 | xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint) | 187 | xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset) |
| 176 | /* encodes an Unicode codepoint to utf-8 into str */ | 188 | /* encodes a Unicode codepoint to utf-8 and inserts it into str at position *offset; *offset is then increased by the length returned by xs_utf8_enc(); returns the string returned by xs_insert_m() */ |
| 177 | { | 189 | { |
| 178 | char tmp[4]; | 190 | char tmp[4]; |
| 179 | 191 | ||
| 180 | int c = xs_utf8_enc(tmp, cpoint); | 192 | int c = xs_utf8_enc(tmp, cpoint); |
| 181 | 193 | ||
| 182 | return xs_append_m(str, tmp, c); | 194 | str = xs_insert_m(str, *offset, tmp, c); |
| 195 | |||
| 196 | *offset += c; /* move the caller's offset past the c chars just inserted, so consecutive calls write one after another */ | ||
| 197 | |||
| 198 | return str; | ||
| 199 | } | ||
| 200 | |||
| 201 | |||
| 202 | xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint) | ||
| 203 | /* encodes a Unicode codepoint to utf-8 and appends it to str, by calling xs_utf8_insert() with the offset set to strlen(str); returns what xs_utf8_insert() returns */ | ||
| 204 | { | ||
| 205 | int offset = strlen(str); /* insertion point is the current end of the NUL-terminated string; the updated offset is discarded */ | ||
| 206 | |||
| 207 | return xs_utf8_insert(str, cpoint, &offset); | ||
| 183 | } | 208 | } |
| 184 | 209 | ||
| 185 | #endif /* _XS_H */ | 210 | #endif /* _XS_H */ |
| @@ -232,6 +257,9 @@ unsigned int *_xs_unicode_lower_search(unsigned int cpoint) | |||
| 232 | unsigned int xs_unicode_to_lower(unsigned int cpoint) | 257 | unsigned int xs_unicode_to_lower(unsigned int cpoint) |
| 233 | /* returns the cpoint to lowercase */ | 258 | /* returns the cpoint to lowercase */ |
| 234 | { | 259 | { |
| 260 | if (cpoint < 0x80) | ||
| 261 | return tolower(cpoint); | ||
| 262 | |||
| 235 | unsigned int *p = _xs_unicode_upper_search(cpoint); | 263 | unsigned int *p = _xs_unicode_upper_search(cpoint); |
| 236 | 264 | ||
| 237 | return p == NULL ? cpoint : p[1]; | 265 | return p == NULL ? cpoint : p[1]; |
| @@ -241,6 +269,9 @@ unsigned int xs_unicode_to_lower(unsigned int cpoint) | |||
| 241 | unsigned int xs_unicode_to_upper(unsigned int cpoint) | 269 | unsigned int xs_unicode_to_upper(unsigned int cpoint) |
| 242 | /* returns the cpoint to uppercase */ | 270 | /* returns the cpoint to uppercase */ |
| 243 | { | 271 | { |
| 272 | if (cpoint < 0x80) | ||
| 273 | return toupper(cpoint); | ||
| 274 | |||
| 244 | unsigned int *p = _xs_unicode_lower_search(cpoint); | 275 | unsigned int *p = _xs_unicode_lower_search(cpoint); |
| 245 | 276 | ||
| 246 | return p == NULL ? cpoint : p[0]; | 277 | return p == NULL ? cpoint : p[0]; |
| @@ -317,6 +348,87 @@ int xs_unicode_is_alpha(unsigned int cpoint) | |||
| 317 | } | 348 | } |
| 318 | 349 | ||
| 319 | 350 | ||
| 351 | #ifdef _XS_H | ||
| 352 | |||
| 353 | xs_str *xs_utf8_to_upper(const char *str) /* returns the xs_str created here, filled with every codepoint decoded from str after passing it through xs_unicode_to_upper() */ | ||
| 354 | { | ||
| 355 | xs_str *s = xs_str_new(NULL); | ||
| 356 | unsigned int cpoint; | ||
| 357 | int offset = 0; /* write position inside s, handed to xs_utf8_insert() on every iteration */ | ||
| 358 | |||
| 359 | while ((cpoint = xs_utf8_dec(&str))) { /* str is passed by address to the decoder; the loop stops when it yields 0 */ | ||
| 360 | cpoint = xs_unicode_to_upper(cpoint); | ||
| 361 | s = xs_utf8_insert(s, cpoint, &offset); | ||
| 362 | } | ||
| 363 | |||
| 364 | return s; | ||
| 365 | } | ||
| 366 | |||
| 367 | |||
| 368 | xs_str *xs_utf8_to_lower(const char *str) /* returns the xs_str created here, filled with every codepoint decoded from str after passing it through xs_unicode_to_lower() */ | ||
| 369 | { | ||
| 370 | xs_str *s = xs_str_new(NULL); | ||
| 371 | unsigned int cpoint; | ||
| 372 | int offset = 0; /* write position inside s, handed to xs_utf8_insert() on every iteration */ | ||
| 373 | |||
| 374 | while ((cpoint = xs_utf8_dec(&str))) { /* str is passed by address to the decoder; the loop stops when it yields 0 */ | ||
| 375 | cpoint = xs_unicode_to_lower(cpoint); | ||
| 376 | s = xs_utf8_insert(s, cpoint, &offset); | ||
| 377 | } | ||
| 378 | |||
| 379 | return s; | ||
| 380 | } | ||
| 381 | |||
| 382 | |||
| 383 | xs_str *xs_utf8_to_nfd(const char *str) /* returns the xs_str created here: each codepoint of str for which xs_unicode_nfd() succeeds is written as its base followed by its diacritic, any other codepoint is copied as is */ | ||
| 384 | { | ||
| 385 | xs_str *s = xs_str_new(NULL); | ||
| 386 | unsigned int cpoint; | ||
| 387 | int offset = 0; /* write position inside s, handed to xs_utf8_insert() on every call */ | ||
| 388 | |||
| 389 | while ((cpoint = xs_utf8_dec(&str))) { /* the loop stops when the decoder yields 0 */ | ||
| 390 | unsigned int base; | ||
| 391 | unsigned int diac; | ||
| 392 | |||
| 393 | if (xs_unicode_nfd(cpoint, &base, &diac)) { /* NOTE(review): exactly one base and one mark are emitted per codepoint; presumably no multi-mark or recursive decomposition -- confirm against xs_unicode_nfd() */ | ||
| 394 | s = xs_utf8_insert(s, base, &offset); | ||
| 395 | s = xs_utf8_insert(s, diac, &offset); | ||
| 396 | } | ||
| 397 | else | ||
| 398 | s = xs_utf8_insert(s, cpoint, &offset); | ||
| 399 | } | ||
| 400 | |||
| 401 | return s; | ||
| 402 | } | ||
| 403 | |||
| 404 | |||
| 405 | xs_str *xs_utf8_to_nfc(const char *str) /* returns the xs_str created here: a codepoint accepted by xs_is_diacritic() is merged into the preceding pending codepoint when xs_unicode_nfc() succeeds, everything else is copied as is */ | ||
| 406 | { | ||
| 407 | xs_str *s = xs_str_new(NULL); | ||
| 408 | unsigned int cpoint; | ||
| 409 | unsigned int base = 0; /* pending codepoint not yet written to s; 0 means there is none */ | ||
| 410 | int offset = 0; /* write position inside s, handed to xs_utf8_insert() on every call */ | ||
| 411 | |||
| 412 | while ((cpoint = xs_utf8_dec(&str))) { /* the loop stops when the decoder yields 0 */ | ||
| 413 | if (xs_is_diacritic(cpoint)) { | ||
| 414 | if (xs_unicode_nfc(base, cpoint, &base)) /* presumably stores the composed codepoint back into base on success -- confirm against xs_unicode_nfc() */ | ||
| 415 | continue; /* the mark was consumed; keep base pending, nothing is written yet */ | ||
| 416 | } | ||
| 417 | |||
| 418 | if (base) /* flush the pending codepoint before cpoint takes its place */ | ||
| 419 | s = xs_utf8_insert(s, base, &offset); | ||
| 420 | |||
| 421 | base = cpoint; | ||
| 422 | } | ||
| 423 | |||
| 424 | if (base) /* flush the last pending codepoint, if any */ | ||
| 425 | s = xs_utf8_insert(s, base, &offset); | ||
| 426 | |||
| 427 | return s; | ||
| 428 | } | ||
| 429 | |||
| 430 | #endif /* _XS_H */ | ||
| 431 | |||
| 320 | #endif /* _XS_UNICODE_TBL_H */ | 432 | #endif /* _XS_UNICODE_TBL_H */ |
| 321 | 433 | ||
| 322 | #endif /* XS_IMPLEMENTATION */ | 434 | #endif /* XS_IMPLEMENTATION */ |