diff options
| -rw-r--r-- | xs_json.h | 7 | ||||
| -rw-r--r-- | xs_unicode.h | 116 | ||||
| -rw-r--r-- | xs_version.h | 2 |
3 files changed, 120 insertions, 5 deletions
| @@ -208,6 +208,7 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t) | |||
| 208 | { | 208 | { |
| 209 | int c; | 209 | int c; |
| 210 | xs_val *v = NULL; | 210 | xs_val *v = NULL; |
| 211 | int offset; | ||
| 211 | 212 | ||
| 212 | *t = JS_ERROR; | 213 | *t = JS_ERROR; |
| 213 | 214 | ||
| @@ -236,6 +237,7 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t) | |||
| 236 | *t = JS_STRING; | 237 | *t = JS_STRING; |
| 237 | 238 | ||
| 238 | v = xs_str_new(NULL); | 239 | v = xs_str_new(NULL); |
| 240 | offset = 0; | ||
| 239 | 241 | ||
| 240 | while ((c = fgetc(f)) != '"' && c != EOF && *t != JS_ERROR) { | 242 | while ((c = fgetc(f)) != '"' && c != EOF && *t != JS_ERROR) { |
| 241 | if (c == '\\') { | 243 | if (c == '\\') { |
| @@ -274,11 +276,12 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t) | |||
| 274 | break; | 276 | break; |
| 275 | } | 277 | } |
| 276 | 278 | ||
| 277 | v = xs_utf8_cat(v, cp); | 279 | v = xs_utf8_insert(v, cp, &offset); |
| 278 | } | 280 | } |
| 279 | else { | 281 | else { |
| 280 | char cc = c; | 282 | char cc = c; |
| 281 | v = xs_append_m(v, &cc, 1); | 283 | v = xs_insert_m(v, offset, &cc, 1); |
| 284 | offset++; | ||
| 282 | } | 285 | } |
| 283 | } | 286 | } |
| 284 | 287 | ||
diff --git a/xs_unicode.h b/xs_unicode.h index 2e9a754..a5a1dcb 100644 --- a/xs_unicode.h +++ b/xs_unicode.h | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | unsigned int xs_utf8_dec(const char **str); | 9 | unsigned int xs_utf8_dec(const char **str); |
| 10 | int xs_unicode_width(unsigned int cpoint); | 10 | int xs_unicode_width(unsigned int cpoint); |
| 11 | int xs_is_surrogate(unsigned int cpoint); | 11 | int xs_is_surrogate(unsigned int cpoint); |
| 12 | int xs_is_diacritic(unsigned int cpoint); | ||
| 12 | unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2); | 13 | unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2); |
| 13 | unsigned int xs_surrogate_enc(unsigned int cpoint); | 14 | unsigned int xs_surrogate_enc(unsigned int cpoint); |
| 14 | unsigned int *_xs_unicode_upper_search(unsigned int cpoint); | 15 | unsigned int *_xs_unicode_upper_search(unsigned int cpoint); |
| @@ -22,7 +23,12 @@ | |||
| 22 | int xs_unicode_is_alpha(unsigned int cpoint); | 23 | int xs_unicode_is_alpha(unsigned int cpoint); |
| 23 | 24 | ||
| 24 | #ifdef _XS_H | 25 | #ifdef _XS_H |
| 26 | xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset); | ||
| 25 | xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint); | 27 | xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint); |
| 28 | xs_str *xs_utf8_to_upper(const char *str); | ||
| 29 | xs_str *xs_utf8_to_lower(const char *str); | ||
| 30 | xs_str *xs_utf8_to_nfd(const char *str); | ||
| 31 | xs_str *xs_utf8_to_nfc(const char *str); | ||
| 26 | #endif | 32 | #endif |
| 27 | 33 | ||
| 28 | #ifdef XS_IMPLEMENTATION | 34 | #ifdef XS_IMPLEMENTATION |
| @@ -144,6 +150,12 @@ int xs_unicode_width(unsigned int cpoint) | |||
| 144 | } | 150 | } |
| 145 | 151 | ||
| 146 | 152 | ||
| 153 | int xs_is_diacritic(unsigned int cpoint) | ||
| 154 | { | ||
| 155 | return cpoint >= 0x300 && cpoint <= 0x36f; | ||
| 156 | } | ||
| 157 | |||
| 158 | |||
| 147 | /** surrogate pairs **/ | 159 | /** surrogate pairs **/ |
| 148 | 160 | ||
| 149 | int xs_is_surrogate(unsigned int cpoint) | 161 | int xs_is_surrogate(unsigned int cpoint) |
| @@ -172,14 +184,27 @@ unsigned int xs_surrogate_enc(unsigned int cpoint) | |||
| 172 | 184 | ||
| 173 | #ifdef _XS_H | 185 | #ifdef _XS_H |
| 174 | 186 | ||
| 175 | xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint) | 187 | xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset) |
| 176 | /* encodes a Unicode codepoint to utf-8 into str */ | 188 | /* encodes a Unicode codepoint to utf-8 into str */ |
| 177 | { | 189 | { |
| 178 | char tmp[4]; | 190 | char tmp[4]; |
| 179 | 191 | ||
| 180 | int c = xs_utf8_enc(tmp, cpoint); | 192 | int c = xs_utf8_enc(tmp, cpoint); |
| 181 | 193 | ||
| 182 | return xs_append_m(str, tmp, c); | 194 | str = xs_insert_m(str, *offset, tmp, c); |
| 195 | |||
| 196 | *offset += c; | ||
| 197 | |||
| 198 | return str; | ||
| 199 | } | ||
| 200 | |||
| 201 | |||
| 202 | xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint) | ||
| 203 | /* encodes an Unicode codepoint to utf-8 into str */ | ||
| 204 | { | ||
| 205 | int offset = strlen(str); | ||
| 206 | |||
| 207 | return xs_utf8_insert(str, cpoint, &offset); | ||
| 183 | } | 208 | } |
| 184 | 209 | ||
| 185 | #endif /* _XS_H */ | 210 | #endif /* _XS_H */ |
| @@ -232,6 +257,9 @@ unsigned int *_xs_unicode_lower_search(unsigned int cpoint) | |||
| 232 | unsigned int xs_unicode_to_lower(unsigned int cpoint) | 257 | unsigned int xs_unicode_to_lower(unsigned int cpoint) |
| 233 | /* returns the cpoint to lowercase */ | 258 | /* returns the cpoint to lowercase */ |
| 234 | { | 259 | { |
| 260 | if (cpoint < 0x80) | ||
| 261 | return tolower(cpoint); | ||
| 262 | |||
| 235 | unsigned int *p = _xs_unicode_upper_search(cpoint); | 263 | unsigned int *p = _xs_unicode_upper_search(cpoint); |
| 236 | 264 | ||
| 237 | return p == NULL ? cpoint : p[1]; | 265 | return p == NULL ? cpoint : p[1]; |
| @@ -241,6 +269,9 @@ unsigned int xs_unicode_to_lower(unsigned int cpoint) | |||
| 241 | unsigned int xs_unicode_to_upper(unsigned int cpoint) | 269 | unsigned int xs_unicode_to_upper(unsigned int cpoint) |
| 242 | /* returns the cpoint to uppercase */ | 270 | /* returns the cpoint to uppercase */ |
| 243 | { | 271 | { |
| 272 | if (cpoint < 0x80) | ||
| 273 | return toupper(cpoint); | ||
| 274 | |||
| 244 | unsigned int *p = _xs_unicode_lower_search(cpoint); | 275 | unsigned int *p = _xs_unicode_lower_search(cpoint); |
| 245 | 276 | ||
| 246 | return p == NULL ? cpoint : p[0]; | 277 | return p == NULL ? cpoint : p[0]; |
| @@ -317,6 +348,87 @@ int xs_unicode_is_alpha(unsigned int cpoint) | |||
| 317 | } | 348 | } |
| 318 | 349 | ||
| 319 | 350 | ||
| 351 | #ifdef _XS_H | ||
| 352 | |||
| 353 | xs_str *xs_utf8_to_upper(const char *str) | ||
| 354 | { | ||
| 355 | xs_str *s = xs_str_new(NULL); | ||
| 356 | unsigned int cpoint; | ||
| 357 | int offset = 0; | ||
| 358 | |||
| 359 | while ((cpoint = xs_utf8_dec(&str))) { | ||
| 360 | cpoint = xs_unicode_to_upper(cpoint); | ||
| 361 | s = xs_utf8_insert(s, cpoint, &offset); | ||
| 362 | } | ||
| 363 | |||
| 364 | return s; | ||
| 365 | } | ||
| 366 | |||
| 367 | |||
| 368 | xs_str *xs_utf8_to_lower(const char *str) | ||
| 369 | { | ||
| 370 | xs_str *s = xs_str_new(NULL); | ||
| 371 | unsigned int cpoint; | ||
| 372 | int offset = 0; | ||
| 373 | |||
| 374 | while ((cpoint = xs_utf8_dec(&str))) { | ||
| 375 | cpoint = xs_unicode_to_lower(cpoint); | ||
| 376 | s = xs_utf8_insert(s, cpoint, &offset); | ||
| 377 | } | ||
| 378 | |||
| 379 | return s; | ||
| 380 | } | ||
| 381 | |||
| 382 | |||
| 383 | xs_str *xs_utf8_to_nfd(const char *str) | ||
| 384 | { | ||
| 385 | xs_str *s = xs_str_new(NULL); | ||
| 386 | unsigned int cpoint; | ||
| 387 | int offset = 0; | ||
| 388 | |||
| 389 | while ((cpoint = xs_utf8_dec(&str))) { | ||
| 390 | unsigned int base; | ||
| 391 | unsigned int diac; | ||
| 392 | |||
| 393 | if (xs_unicode_nfd(cpoint, &base, &diac)) { | ||
| 394 | s = xs_utf8_insert(s, base, &offset); | ||
| 395 | s = xs_utf8_insert(s, diac, &offset); | ||
| 396 | } | ||
| 397 | else | ||
| 398 | s = xs_utf8_insert(s, cpoint, &offset); | ||
| 399 | } | ||
| 400 | |||
| 401 | return s; | ||
| 402 | } | ||
| 403 | |||
| 404 | |||
| 405 | xs_str *xs_utf8_to_nfc(const char *str) | ||
| 406 | { | ||
| 407 | xs_str *s = xs_str_new(NULL); | ||
| 408 | unsigned int cpoint; | ||
| 409 | unsigned int base = 0; | ||
| 410 | int offset = 0; | ||
| 411 | |||
| 412 | while ((cpoint = xs_utf8_dec(&str))) { | ||
| 413 | if (xs_is_diacritic(cpoint)) { | ||
| 414 | if (xs_unicode_nfc(base, cpoint, &base)) | ||
| 415 | continue; | ||
| 416 | } | ||
| 417 | |||
| 418 | if (base) | ||
| 419 | s = xs_utf8_insert(s, base, &offset); | ||
| 420 | |||
| 421 | base = cpoint; | ||
| 422 | } | ||
| 423 | |||
| 424 | if (base) | ||
| 425 | s = xs_utf8_insert(s, base, &offset); | ||
| 426 | |||
| 427 | return s; | ||
| 428 | } | ||
| 429 | |||
| 430 | #endif /* _XS_H */ | ||
| 431 | |||
| 320 | #endif /* _XS_UNICODE_TBL_H */ | 432 | #endif /* _XS_UNICODE_TBL_H */ |
| 321 | 433 | ||
| 322 | #endif /* XS_IMPLEMENTATION */ | 434 | #endif /* XS_IMPLEMENTATION */ |
diff --git a/xs_version.h b/xs_version.h index 4318c7e..ce88558 100644 --- a/xs_version.h +++ b/xs_version.h | |||
| @@ -1 +1 @@ | |||
| /* c6eca9593f9b3d6791cba600e5950f682fdb36cf 2024-08-12T16:08:37+02:00 */ | /* cc9ebd36ae640e4701277327fbba9996143076f6 2024-08-23T17:17:08+02:00 */ | ||