diff options
| author | 2023-08-03 08:42:38 +0200 | |
|---|---|---|
| committer | 2023-08-03 08:42:38 +0200 | |
| commit | 2137d2f13310aca3cef6a0fc7735fdf4aac53e8c (patch) | |
| tree | 69d426ca8e14ff509a9cdd94d443b81bf975e7a0 /xs_unicode.h | |
| parent | Fixed a log message level. (diff) | |
| download | penes-snac2-2137d2f13310aca3cef6a0fc7735fdf4aac53e8c.tar.gz penes-snac2-2137d2f13310aca3cef6a0fc7735fdf4aac53e8c.tar.xz penes-snac2-2137d2f13310aca3cef6a0fc7735fdf4aac53e8c.zip | |
Backport from xs.
Diffstat (limited to '')
| -rw-r--r-- | xs_unicode.h | 131 |
1 file changed, 119 insertions, 12 deletions
diff --git a/xs_unicode.h b/xs_unicode.h index d45b52e..48cd660 100644 --- a/xs_unicode.h +++ b/xs_unicode.h | |||
| @@ -5,8 +5,15 @@ | |||
| 5 | #define _XS_UNICODE_H | 5 | #define _XS_UNICODE_H |
| 6 | 6 | ||
| 7 | xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint); | 7 | xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint); |
| 8 | char *xs_utf8_dec(const char *str, unsigned int *cpoint); | 8 | unsigned int xs_utf8_dec(char **str); |
| 9 | 9 | unsigned int *_xs_unicode_upper_search(unsigned int cpoint); | |
| 10 | unsigned int *_xs_unicode_lower_search(unsigned int cpoint); | ||
| 11 | #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint)) | ||
| 12 | #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint)) | ||
| 13 | unsigned int xs_unicode_to_upper(unsigned int cpoint); | ||
| 14 | unsigned int xs_unicode_to_lower(unsigned int cpoint); | ||
| 15 | int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac); | ||
| 16 | int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint); | ||
| 10 | 17 | ||
| 11 | #ifdef XS_IMPLEMENTATION | 18 | #ifdef XS_IMPLEMENTATION |
| 12 | 19 | ||
| @@ -50,46 +57,146 @@ xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint) | |||
| 50 | } | 57 | } |
| 51 | 58 | ||
| 52 | 59 | ||
| 53 | char *xs_utf8_dec(const char *str, unsigned int *cpoint) | 60 | unsigned int xs_utf8_dec(char **str) |
| 54 | /* decodes an utf-8 char inside str into cpoint and returns the next position */ | 61 | /* decodes an utf-8 char inside str and updates the pointer */ |
| 55 | { | 62 | { |
| 56 | unsigned char *p = (unsigned char *)str; | 63 | unsigned char *p = (unsigned char *)*str; |
| 64 | unsigned int cpoint = 0; | ||
| 57 | int c = *p++; | 65 | int c = *p++; |
| 58 | int cb = 0; | 66 | int cb = 0; |
| 59 | 67 | ||
| 60 | if ((c & 0x80) == 0) { /* 1 byte char */ | 68 | if ((c & 0x80) == 0) { /* 1 byte char */ |
| 61 | *cpoint = c; | 69 | cpoint = c; |
| 62 | } | 70 | } |
| 63 | else | 71 | else |
| 64 | if ((c & 0xe0) == 0xc0) { /* 2 byte char */ | 72 | if ((c & 0xe0) == 0xc0) { /* 2 byte char */ |
| 65 | *cpoint = (c & 0x1f) << 6; | 73 | cpoint = (c & 0x1f) << 6; |
| 66 | cb = 1; | 74 | cb = 1; |
| 67 | } | 75 | } |
| 68 | else | 76 | else |
| 69 | if ((c & 0xf0) == 0xe0) { /* 3 byte char */ | 77 | if ((c & 0xf0) == 0xe0) { /* 3 byte char */ |
| 70 | *cpoint = (c & 0x0f) << 12; | 78 | cpoint = (c & 0x0f) << 12; |
| 71 | cb = 2; | 79 | cb = 2; |
| 72 | } | 80 | } |
| 73 | else | 81 | else |
| 74 | if ((c & 0xf8) == 0xf0) { /* 4 byte char */ | 82 | if ((c & 0xf8) == 0xf0) { /* 4 byte char */ |
| 75 | *cpoint = (c & 0x07) << 18; | 83 | cpoint = (c & 0x07) << 18; |
| 76 | cb = 3; | 84 | cb = 3; |
| 77 | } | 85 | } |
| 78 | 86 | ||
| 79 | /* process the continuation bytes */ | 87 | /* process the continuation bytes */ |
| 80 | while (cb--) { | 88 | while (cb--) { |
| 81 | if ((*p & 0xc0) == 0x80) | 89 | if ((*p & 0xc0) == 0x80) |
| 82 | *cpoint |= (*p++ & 0x3f) << (cb * 6); | 90 | cpoint |= (*p++ & 0x3f) << (cb * 6); |
| 83 | else { | 91 | else { |
| 84 | *cpoint = 0xfffd; | 92 | cpoint = 0xfffd; |
| 85 | break; | 93 | break; |
| 86 | } | 94 | } |
| 87 | } | 95 | } |
| 88 | 96 | ||
| 89 | return (char *)p; | 97 | *str = (char *)p; |
| 98 | return cpoint; | ||
| 99 | } | ||
| 100 | |||
| 101 | |||
| 102 | #ifdef _XS_UNICODE_TBL_H | ||
| 103 | |||
| 104 | /* include xs_unicode_tbl.h before to use these functions */ | ||
| 105 | |||
| 106 | static int int_cmp(const void *p1, const void *p2) | ||
| 107 | { | ||
| 108 | const unsigned int *a = p1; | ||
| 109 | const unsigned int *b = p2; | ||
| 110 | |||
| 111 | return *a < *b ? -1 : *a > *b ? 1 : 0; | ||
| 112 | } | ||
| 113 | |||
| 114 | |||
| 115 | unsigned int *_xs_unicode_upper_search(unsigned int cpoint) | ||
| 116 | /* searches for an uppercase codepoint in the case fold table */ | ||
| 117 | { | ||
| 118 | return bsearch(&cpoint, xs_unicode_case_fold_table, | ||
| 119 | sizeof(xs_unicode_case_fold_table) / (sizeof(unsigned int) * 2), | ||
| 120 | sizeof(unsigned int) * 2, | ||
| 121 | int_cmp); | ||
| 122 | } | ||
| 123 | |||
| 124 | |||
| 125 | unsigned int *_xs_unicode_lower_search(unsigned int cpoint) | ||
| 126 | /* searches for a lowercase codepoint in the case fold table */ | ||
| 127 | { | ||
| 128 | unsigned int *p = xs_unicode_case_fold_table + 1; | ||
| 129 | unsigned int *e = xs_unicode_case_fold_table + | ||
| 130 | sizeof(xs_unicode_case_fold_table) / sizeof(unsigned int); | ||
| 131 | |||
| 132 | while (p < e) { | ||
| 133 | if (cpoint == *p) | ||
| 134 | return p; | ||
| 135 | |||
| 136 | p += 2; | ||
| 137 | } | ||
| 138 | |||
| 139 | return NULL; | ||
| 140 | } | ||
| 141 | |||
| 142 | |||
| 143 | unsigned int xs_unicode_to_upper(unsigned int cpoint) | ||
| 144 | /* returns the cpoint to uppercase */ | ||
| 145 | { | ||
| 146 | unsigned int *p = _xs_unicode_lower_search(cpoint); | ||
| 147 | |||
| 148 | return p == NULL ? cpoint : p[-1]; | ||
| 149 | } | ||
| 150 | |||
| 151 | |||
| 152 | unsigned int xs_unicode_to_lower(unsigned int cpoint) | ||
| 153 | /* returns the cpoint to lowercase */ | ||
| 154 | { | ||
| 155 | unsigned int *p = _xs_unicode_upper_search(cpoint); | ||
| 156 | |||
| 157 | return p == NULL ? cpoint : p[1]; | ||
| 90 | } | 158 | } |
| 91 | 159 | ||
| 92 | 160 | ||
| 161 | int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac) | ||
| 162 | /* applies unicode Normalization Form D */ | ||
| 163 | { | ||
| 164 | unsigned int *r = bsearch(&cpoint, xs_unicode_nfd_table, | ||
| 165 | sizeof(xs_unicode_nfd_table) / (sizeof(unsigned int) * 3), | ||
| 166 | sizeof(unsigned int) * 3, | ||
| 167 | int_cmp); | ||
| 168 | |||
| 169 | if (r != NULL) { | ||
| 170 | *base = r[1]; | ||
| 171 | *diac = r[2]; | ||
| 172 | } | ||
| 173 | |||
| 174 | return !!r; | ||
| 175 | } | ||
| 176 | |||
| 177 | |||
| 178 | int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint) | ||
| 179 | /* applies unicode Normalization Form C */ | ||
| 180 | { | ||
| 181 | unsigned int *p = xs_unicode_nfd_table; | ||
| 182 | unsigned int *e = xs_unicode_nfd_table + | ||
| 183 | sizeof(xs_unicode_nfd_table) / sizeof(unsigned int); | ||
| 184 | |||
| 185 | while (p < e) { | ||
| 186 | if (p[1] == base && p[2] == diac) { | ||
| 187 | *cpoint = p[0]; | ||
| 188 | return 1; | ||
| 189 | } | ||
| 190 | |||
| 191 | p += 3; | ||
| 192 | } | ||
| 193 | |||
| 194 | return 0; | ||
| 195 | } | ||
| 196 | |||
| 197 | |||
| 198 | #endif /* _XS_UNICODE_TBL_H */ | ||
| 199 | |||
| 93 | #endif /* XS_IMPLEMENTATION */ | 200 | #endif /* XS_IMPLEMENTATION */ |
| 94 | 201 | ||
| 95 | #endif /* _XS_UNICODE_H */ | 202 | #endif /* _XS_UNICODE_H */ |