diff options
| author | 2023-12-27 12:54:38 +0100 | |
|---|---|---|
| committer | 2023-12-27 12:54:38 +0100 | |
| commit | bf435af788d387b3d97fd744e3b1f6a73795beb8 (patch) | |
| tree | 6d193edd88ef3818bffd9278ddab0248e1108ef3 /xs_unicode.h | |
| parent | Also log the job_fifo len in status.txt. (diff) | |
| download | snac2-bf435af788d387b3d97fd744e3b1f6a73795beb8.tar.gz snac2-bf435af788d387b3d97fd744e3b1f6a73795beb8.tar.xz snac2-bf435af788d387b3d97fd744e3b1f6a73795beb8.zip | |
Backport from xs.
Diffstat (limited to 'xs_unicode.h')
| -rw-r--r-- | xs_unicode.h | 192 |
1 file changed, 113 insertions, 79 deletions
diff --git a/xs_unicode.h b/xs_unicode.h index f5880f0..c666479 100644 --- a/xs_unicode.h +++ b/xs_unicode.h | |||
| @@ -5,7 +5,6 @@ | |||
| 5 | #define _XS_UNICODE_H | 5 | #define _XS_UNICODE_H |
| 6 | 6 | ||
| 7 | int _xs_utf8_enc(char buf[4], unsigned int cpoint); | 7 | int _xs_utf8_enc(char buf[4], unsigned int cpoint); |
| 8 | xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint); | ||
| 9 | unsigned int xs_utf8_dec(char **str); | 8 | unsigned int xs_utf8_dec(char **str); |
| 10 | int xs_unicode_width(unsigned int cpoint); | 9 | int xs_unicode_width(unsigned int cpoint); |
| 11 | int xs_is_surrogate(unsigned int cpoint); | 10 | int xs_is_surrogate(unsigned int cpoint); |
| @@ -21,13 +20,20 @@ | |||
| 21 | int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint); | 20 | int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint); |
| 22 | int xs_unicode_is_alpha(unsigned int cpoint); | 21 | int xs_unicode_is_alpha(unsigned int cpoint); |
| 23 | 22 | ||
| 23 | #ifdef _XS_H | ||
| 24 | xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint); | ||
| 25 | #endif | ||
| 26 | |||
| 24 | #ifdef XS_IMPLEMENTATION | 27 | #ifdef XS_IMPLEMENTATION |
| 25 | 28 | ||
| 29 | #ifndef countof | ||
| 30 | #define countof(a) (sizeof((a)) / sizeof((*a))) | ||
| 31 | #endif | ||
| 26 | 32 | ||
| 27 | int _xs_utf8_enc(char buf[4], unsigned int cpoint) | 33 | int _xs_utf8_enc(char buf[4], unsigned int cpoint) |
| 28 | /* encodes an Unicode codepoint to utf-8 into buf and returns the size in bytes */ | 34 | /* encodes an Unicode codepoint to utf-8 into buf and returns the size in bytes */ |
| 29 | { | 35 | { |
| 30 | unsigned char *p = (unsigned char *)buf; | 36 | char *p = buf; |
| 31 | 37 | ||
| 32 | if (cpoint < 0x80) /* 1 byte char */ | 38 | if (cpoint < 0x80) /* 1 byte char */ |
| 33 | *p++ = cpoint & 0xff; | 39 | *p++ = cpoint & 0xff; |
| @@ -48,27 +54,16 @@ int _xs_utf8_enc(char buf[4], unsigned int cpoint) | |||
| 48 | *p++ = 0x80 | (cpoint & 0x3f); | 54 | *p++ = 0x80 | (cpoint & 0x3f); |
| 49 | } | 55 | } |
| 50 | 56 | ||
| 51 | return p - (unsigned char *)buf; | 57 | return p - buf; |
| 52 | } | ||
| 53 | |||
| 54 | |||
| 55 | xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint) | ||
| 56 | /* encodes an Unicode codepoint to utf-8 into str */ | ||
| 57 | { | ||
| 58 | char tmp[4]; | ||
| 59 | |||
| 60 | int c = _xs_utf8_enc(tmp, cpoint); | ||
| 61 | |||
| 62 | return xs_append_m(str, tmp, c); | ||
| 63 | } | 58 | } |
| 64 | 59 | ||
| 65 | 60 | ||
| 66 | unsigned int xs_utf8_dec(char **str) | 61 | unsigned int xs_utf8_dec(char **str) |
| 67 | /* decodes an utf-8 char inside str and updates the pointer */ | 62 | /* decodes an utf-8 char inside str and updates the pointer */ |
| 68 | { | 63 | { |
| 69 | unsigned char *p = (unsigned char *)*str; | 64 | char *p = *str; |
| 70 | unsigned int cpoint = 0; | 65 | unsigned int cpoint = 0; |
| 71 | int c = *p++; | 66 | unsigned char c = *p++; |
| 72 | int cb = 0; | 67 | int cb = 0; |
| 73 | 68 | ||
| 74 | if ((c & 0x80) == 0) { /* 1 byte char */ | 69 | if ((c & 0x80) == 0) { /* 1 byte char */ |
| @@ -91,30 +86,19 @@ unsigned int xs_utf8_dec(char **str) | |||
| 91 | } | 86 | } |
| 92 | 87 | ||
| 93 | /* process the continuation bytes */ | 88 | /* process the continuation bytes */ |
| 94 | while (cb--) { | 89 | while (cb > 0 && *p && (*p & 0xc0) == 0x80) |
| 95 | if ((*p & 0xc0) == 0x80) | 90 | cpoint |= (*p++ & 0x3f) << (--cb * 6); |
| 96 | cpoint |= (*p++ & 0x3f) << (cb * 6); | ||
| 97 | else { | ||
| 98 | cpoint = 0xfffd; | ||
| 99 | break; | ||
| 100 | } | ||
| 101 | } | ||
| 102 | 91 | ||
| 103 | *str = (char *)p; | 92 | /* incomplete or broken? */ |
| 104 | return cpoint; | 93 | if (cb) |
| 105 | } | 94 | cpoint = 0xfffd; |
| 106 | |||
| 107 | |||
| 108 | static int int_range_cmp(const void *p1, const void *p2) | ||
| 109 | { | ||
| 110 | const unsigned int *a = p1; | ||
| 111 | const unsigned int *b = p2; | ||
| 112 | 95 | ||
| 113 | return *a < b[0] ? -1 : *a > b[1] ? 1 : 0; | 96 | *str = p; |
| 97 | return cpoint; | ||
| 114 | } | 98 | } |
| 115 | 99 | ||
| 116 | 100 | ||
| 117 | /* intentionally dead simple */ | 101 | /** Unicode character width: intentionally dead simple **/ |
| 118 | 102 | ||
| 119 | static unsigned int xs_unicode_width_table[] = { | 103 | static unsigned int xs_unicode_width_table[] = { |
| 120 | 0x300, 0x36f, 0, /* diacritics */ | 104 | 0x300, 0x36f, 0, /* diacritics */ |
| @@ -132,12 +116,23 @@ static unsigned int xs_unicode_width_table[] = { | |||
| 132 | int xs_unicode_width(unsigned int cpoint) | 116 | int xs_unicode_width(unsigned int cpoint) |
| 133 | /* returns the width in columns of a Unicode codepoint (somewhat simplified) */ | 117 | /* returns the width in columns of a Unicode codepoint (somewhat simplified) */ |
| 134 | { | 118 | { |
| 135 | unsigned int *r = bsearch(&cpoint, xs_unicode_width_table, | 119 | int b = 0; |
| 136 | sizeof(xs_unicode_width_table) / (sizeof(unsigned int) * 3), | 120 | int t = countof(xs_unicode_width_table) / 3 - 1; |
| 137 | sizeof(unsigned int) * 3, | 121 | |
| 138 | int_range_cmp); | 122 | while (t >= b) { |
| 123 | int n = (b + t) / 2; | ||
| 124 | unsigned int *p = &xs_unicode_width_table[n * 3]; | ||
| 125 | |||
| 126 | if (cpoint < p[0]) | ||
| 127 | t = n - 1; | ||
| 128 | else | ||
| 129 | if (cpoint > p[1]) | ||
| 130 | b = n + 1; | ||
| 131 | else | ||
| 132 | return p[2]; | ||
| 133 | } | ||
| 139 | 134 | ||
| 140 | return r ? r[2] : 1; | 135 | return 1; |
| 141 | } | 136 | } |
| 142 | 137 | ||
| 143 | 138 | ||
| @@ -167,38 +162,56 @@ unsigned int xs_surrogate_enc(unsigned int cpoint) | |||
| 167 | } | 162 | } |
| 168 | 163 | ||
| 169 | 164 | ||
| 170 | #ifdef _XS_UNICODE_TBL_H | 165 | #ifdef _XS_H |
| 171 | |||
| 172 | /* include xs_unicode_tbl.h before this one to use these functions */ | ||
| 173 | 166 | ||
| 174 | static int int_cmp(const void *p1, const void *p2) | 167 | xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint) |
| 168 | /* encodes an Unicode codepoint to utf-8 into str */ | ||
| 175 | { | 169 | { |
| 176 | const unsigned int *a = p1; | 170 | char tmp[4]; |
| 177 | const unsigned int *b = p2; | 171 | |
| 172 | int c = _xs_utf8_enc(tmp, cpoint); | ||
| 178 | 173 | ||
| 179 | return *a < *b ? -1 : *a > *b ? 1 : 0; | 174 | return xs_append_m(str, tmp, c); |
| 180 | } | 175 | } |
| 181 | 176 | ||
| 177 | #endif /* _XS_H */ | ||
| 178 | |||
| 179 | |||
| 180 | #ifdef _XS_UNICODE_TBL_H | ||
| 181 | |||
| 182 | /* include xs_unicode_tbl.h before this one to use these functions */ | ||
| 182 | 183 | ||
| 183 | unsigned int *_xs_unicode_upper_search(unsigned int cpoint) | 184 | unsigned int *_xs_unicode_upper_search(unsigned int cpoint) |
| 184 | /* searches for an uppercase codepoint in the case fold table */ | 185 | /* searches for an uppercase codepoint in the case fold table */ |
| 185 | { | 186 | { |
| 186 | return bsearch(&cpoint, xs_unicode_case_fold_table, | 187 | int b = 0; |
| 187 | sizeof(xs_unicode_case_fold_table) / (sizeof(unsigned int) * 2), | 188 | int t = countof(xs_unicode_case_fold_table) / 2 + 1; |
| 188 | sizeof(unsigned int) * 2, | 189 | |
| 189 | int_cmp); | 190 | while (t >= b) { |
| 191 | int n = (b + t) / 2; | ||
| 192 | unsigned int *p = &xs_unicode_case_fold_table[n * 2]; | ||
| 193 | |||
| 194 | if (cpoint < p[0]) | ||
| 195 | t = n - 1; | ||
| 196 | else | ||
| 197 | if (cpoint > p[0]) | ||
| 198 | b = n + 1; | ||
| 199 | else | ||
| 200 | return p; | ||
| 201 | } | ||
| 202 | |||
| 203 | return NULL; | ||
| 190 | } | 204 | } |
| 191 | 205 | ||
| 192 | 206 | ||
| 193 | unsigned int *_xs_unicode_lower_search(unsigned int cpoint) | 207 | unsigned int *_xs_unicode_lower_search(unsigned int cpoint) |
| 194 | /* searches for a lowercase codepoint in the case fold table */ | 208 | /* searches for a lowercase codepoint in the case fold table */ |
| 195 | { | 209 | { |
| 196 | unsigned int *p = xs_unicode_case_fold_table + 1; | 210 | unsigned int *p = xs_unicode_case_fold_table; |
| 197 | unsigned int *e = xs_unicode_case_fold_table + | 211 | unsigned int *e = p + countof(xs_unicode_case_fold_table); |
| 198 | sizeof(xs_unicode_case_fold_table) / sizeof(unsigned int); | ||
| 199 | 212 | ||
| 200 | while (p < e) { | 213 | while (p < e) { |
| 201 | if (cpoint == *p) | 214 | if (cpoint == p[1]) |
| 202 | return p; | 215 | return p; |
| 203 | 216 | ||
| 204 | p += 2; | 217 | p += 2; |
| @@ -208,38 +221,49 @@ unsigned int *_xs_unicode_lower_search(unsigned int cpoint) | |||
| 208 | } | 221 | } |
| 209 | 222 | ||
| 210 | 223 | ||
| 211 | unsigned int xs_unicode_to_upper(unsigned int cpoint) | 224 | unsigned int xs_unicode_to_lower(unsigned int cpoint) |
| 212 | /* returns the cpoint to uppercase */ | 225 | /* returns the cpoint to lowercase */ |
| 213 | { | 226 | { |
| 214 | unsigned int *p = _xs_unicode_lower_search(cpoint); | 227 | unsigned int *p = _xs_unicode_upper_search(cpoint); |
| 215 | 228 | ||
| 216 | return p == NULL ? cpoint : p[-1]; | 229 | return p == NULL ? cpoint : p[1]; |
| 217 | } | 230 | } |
| 218 | 231 | ||
| 219 | 232 | ||
| 220 | unsigned int xs_unicode_to_lower(unsigned int cpoint) | 233 | unsigned int xs_unicode_to_upper(unsigned int cpoint) |
| 221 | /* returns the cpoint to lowercase */ | 234 | /* returns the cpoint to uppercase */ |
| 222 | { | 235 | { |
| 223 | unsigned int *p = _xs_unicode_upper_search(cpoint); | 236 | unsigned int *p = _xs_unicode_lower_search(cpoint); |
| 224 | 237 | ||
| 225 | return p == NULL ? cpoint : p[1]; | 238 | return p == NULL ? cpoint : p[0]; |
| 226 | } | 239 | } |
| 227 | 240 | ||
| 228 | 241 | ||
| 229 | int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac) | 242 | int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac) |
| 230 | /* applies unicode Normalization Form D */ | 243 | /* applies unicode Normalization Form D */ |
| 231 | { | 244 | { |
| 232 | unsigned int *r = bsearch(&cpoint, xs_unicode_nfd_table, | 245 | int b = 0; |
| 233 | sizeof(xs_unicode_nfd_table) / (sizeof(unsigned int) * 3), | 246 | int t = countof(xs_unicode_nfd_table) / 3 - 1; |
| 234 | sizeof(unsigned int) * 3, | 247 | |
| 235 | int_cmp); | 248 | while (t >= b) { |
| 236 | 249 | int n = (b + t) / 2; | |
| 237 | if (r != NULL) { | 250 | unsigned int *p = &xs_unicode_nfd_table[n * 3]; |
| 238 | *base = r[1]; | 251 | |
| 239 | *diac = r[2]; | 252 | int c = cpoint - p[0]; |
| 253 | |||
| 254 | if (c < 0) | ||
| 255 | t = n - 1; | ||
| 256 | else | ||
| 257 | if (c > 0) | ||
| 258 | b = n + 1; | ||
| 259 | else { | ||
| 260 | *base = p[1]; | ||
| 261 | *diac = p[2]; | ||
| 262 | return 1; | ||
| 263 | } | ||
| 240 | } | 264 | } |
| 241 | 265 | ||
| 242 | return !!r; | 266 | return 0; |
| 243 | } | 267 | } |
| 244 | 268 | ||
| 245 | 269 | ||
| @@ -247,8 +271,7 @@ int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint) | |||
| 247 | /* applies unicode Normalization Form C */ | 271 | /* applies unicode Normalization Form C */ |
| 248 | { | 272 | { |
| 249 | unsigned int *p = xs_unicode_nfd_table; | 273 | unsigned int *p = xs_unicode_nfd_table; |
| 250 | unsigned int *e = xs_unicode_nfd_table + | 274 | unsigned int *e = p + countof(xs_unicode_nfd_table); |
| 251 | sizeof(xs_unicode_nfd_table) / sizeof(unsigned int); | ||
| 252 | 275 | ||
| 253 | while (p < e) { | 276 | while (p < e) { |
| 254 | if (p[1] == base && p[2] == diac) { | 277 | if (p[1] == base && p[2] == diac) { |
| @@ -266,12 +289,23 @@ int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint) | |||
| 266 | int xs_unicode_is_alpha(unsigned int cpoint) | 289 | int xs_unicode_is_alpha(unsigned int cpoint) |
| 267 | /* checks if a codepoint is an alpha (i.e. a letter) */ | 290 | /* checks if a codepoint is an alpha (i.e. a letter) */ |
| 268 | { | 291 | { |
| 269 | unsigned int *r = bsearch(&cpoint, xs_unicode_alpha_table, | 292 | int b = 0; |
| 270 | sizeof(xs_unicode_alpha_table) / (sizeof(unsigned int) * 2), | 293 | int t = countof(xs_unicode_alpha_table) / 2 - 1; |
| 271 | sizeof(unsigned int) * 2, | 294 | |
| 272 | int_range_cmp); | 295 | while (t >= b) { |
| 296 | int n = (b + t) / 2; | ||
| 297 | unsigned int *p = &xs_unicode_alpha_table[n * 2]; | ||
| 298 | |||
| 299 | if (cpoint < p[0]) | ||
| 300 | t = n - 1; | ||
| 301 | else | ||
| 302 | if (cpoint > p[1]) | ||
| 303 | b = n + 1; | ||
| 304 | else | ||
| 305 | return 1; | ||
| 306 | } | ||
| 273 | 307 | ||
| 274 | return !!r; | 308 | return 0; |
| 275 | } | 309 | } |
| 276 | 310 | ||
| 277 | 311 | ||