summary | refs | log | tree | commit | diff
path: root/xs_unicode.h
diff options
context:
space:
mode:
Diffstat (limited to 'xs_unicode.h')
-rw-r--r-- xs_unicode.h 192
1 files changed, 113 insertions, 79 deletions
diff --git a/xs_unicode.h b/xs_unicode.h
index f5880f0..c666479 100644
--- a/xs_unicode.h
+++ b/xs_unicode.h
@@ -5,7 +5,6 @@
5#define _XS_UNICODE_H 5#define _XS_UNICODE_H
6 6
7 int _xs_utf8_enc(char buf[4], unsigned int cpoint); 7 int _xs_utf8_enc(char buf[4], unsigned int cpoint);
8 xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint);
9 unsigned int xs_utf8_dec(char **str); 8 unsigned int xs_utf8_dec(char **str);
10 int xs_unicode_width(unsigned int cpoint); 9 int xs_unicode_width(unsigned int cpoint);
11 int xs_is_surrogate(unsigned int cpoint); 10 int xs_is_surrogate(unsigned int cpoint);
@@ -21,13 +20,20 @@
21 int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint); 20 int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);
22 int xs_unicode_is_alpha(unsigned int cpoint); 21 int xs_unicode_is_alpha(unsigned int cpoint);
23 22
23#ifdef _XS_H
24 xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint);
25#endif
26
24#ifdef XS_IMPLEMENTATION 27#ifdef XS_IMPLEMENTATION
25 28
29#ifndef countof
30#define countof(a) (sizeof((a)) / sizeof((*a)))
31#endif
26 32
27int _xs_utf8_enc(char buf[4], unsigned int cpoint) 33int _xs_utf8_enc(char buf[4], unsigned int cpoint)
28/* encodes an Unicode codepoint to utf-8 into buf and returns the size in bytes */ 34/* encodes an Unicode codepoint to utf-8 into buf and returns the size in bytes */
29{ 35{
30 unsigned char *p = (unsigned char *)buf; 36 char *p = buf;
31 37
32 if (cpoint < 0x80) /* 1 byte char */ 38 if (cpoint < 0x80) /* 1 byte char */
33 *p++ = cpoint & 0xff; 39 *p++ = cpoint & 0xff;
@@ -48,27 +54,16 @@ int _xs_utf8_enc(char buf[4], unsigned int cpoint)
48 *p++ = 0x80 | (cpoint & 0x3f); 54 *p++ = 0x80 | (cpoint & 0x3f);
49 } 55 }
50 56
51 return p - (unsigned char *)buf; 57 return p - buf;
52}
53
54
55xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint)
56/* encodes an Unicode codepoint to utf-8 into str */
57{
58 char tmp[4];
59
60 int c = _xs_utf8_enc(tmp, cpoint);
61
62 return xs_append_m(str, tmp, c);
63} 58}
64 59
65 60
66unsigned int xs_utf8_dec(char **str) 61unsigned int xs_utf8_dec(char **str)
67/* decodes an utf-8 char inside str and updates the pointer */ 62/* decodes an utf-8 char inside str and updates the pointer */
68{ 63{
69 unsigned char *p = (unsigned char *)*str; 64 char *p = *str;
70 unsigned int cpoint = 0; 65 unsigned int cpoint = 0;
71 int c = *p++; 66 unsigned char c = *p++;
72 int cb = 0; 67 int cb = 0;
73 68
74 if ((c & 0x80) == 0) { /* 1 byte char */ 69 if ((c & 0x80) == 0) { /* 1 byte char */
@@ -91,30 +86,19 @@ unsigned int xs_utf8_dec(char **str)
91 } 86 }
92 87
93 /* process the continuation bytes */ 88 /* process the continuation bytes */
94 while (cb--) { 89 while (cb > 0 && *p && (*p & 0xc0) == 0x80)
95 if ((*p & 0xc0) == 0x80) 90 cpoint |= (*p++ & 0x3f) << (--cb * 6);
96 cpoint |= (*p++ & 0x3f) << (cb * 6);
97 else {
98 cpoint = 0xfffd;
99 break;
100 }
101 }
102 91
103 *str = (char *)p; 92 /* incomplete or broken? */
104 return cpoint; 93 if (cb)
105} 94 cpoint = 0xfffd;
106
107
108static int int_range_cmp(const void *p1, const void *p2)
109{
110 const unsigned int *a = p1;
111 const unsigned int *b = p2;
112 95
113 return *a < b[0] ? -1 : *a > b[1] ? 1 : 0; 96 *str = p;
97 return cpoint;
114} 98}
115 99
116 100
117/* intentionally dead simple */ 101/** Unicode character width: intentionally dead simple **/
118 102
119static unsigned int xs_unicode_width_table[] = { 103static unsigned int xs_unicode_width_table[] = {
120 0x300, 0x36f, 0, /* diacritics */ 104 0x300, 0x36f, 0, /* diacritics */
@@ -132,12 +116,23 @@ static unsigned int xs_unicode_width_table[] = {
132int xs_unicode_width(unsigned int cpoint) 116int xs_unicode_width(unsigned int cpoint)
133/* returns the width in columns of a Unicode codepoint (somewhat simplified) */ 117/* returns the width in columns of a Unicode codepoint (somewhat simplified) */
134{ 118{
135 unsigned int *r = bsearch(&cpoint, xs_unicode_width_table, 119 int b = 0;
136 sizeof(xs_unicode_width_table) / (sizeof(unsigned int) * 3), 120 int t = countof(xs_unicode_width_table) / 3 - 1;
137 sizeof(unsigned int) * 3, 121
138 int_range_cmp); 122 while (t >= b) {
123 int n = (b + t) / 2;
124 unsigned int *p = &xs_unicode_width_table[n * 3];
125
126 if (cpoint < p[0])
127 t = n - 1;
128 else
129 if (cpoint > p[1])
130 b = n + 1;
131 else
132 return p[2];
133 }
139 134
140 return r ? r[2] : 1; 135 return 1;
141} 136}
142 137
143 138
@@ -167,38 +162,56 @@ unsigned int xs_surrogate_enc(unsigned int cpoint)
167} 162}
168 163
169 164
170#ifdef _XS_UNICODE_TBL_H 165#ifdef _XS_H
171
172/* include xs_unicode_tbl.h before this one to use these functions */
173 166
174static int int_cmp(const void *p1, const void *p2) 167xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint)
168/* encodes an Unicode codepoint to utf-8 into str */
175{ 169{
176 const unsigned int *a = p1; 170 char tmp[4];
177 const unsigned int *b = p2; 171
172 int c = _xs_utf8_enc(tmp, cpoint);
178 173
179 return *a < *b ? -1 : *a > *b ? 1 : 0; 174 return xs_append_m(str, tmp, c);
180} 175}
181 176
177#endif /* _XS_H */
178
179
180#ifdef _XS_UNICODE_TBL_H
181
182/* include xs_unicode_tbl.h before this one to use these functions */
182 183
183unsigned int *_xs_unicode_upper_search(unsigned int cpoint) 184unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
184/* searches for an uppercase codepoint in the case fold table */ 185/* searches for an uppercase codepoint in the case fold table */
185{ 186{
186 return bsearch(&cpoint, xs_unicode_case_fold_table, 187 int b = 0;
187 sizeof(xs_unicode_case_fold_table) / (sizeof(unsigned int) * 2), 188 int t = countof(xs_unicode_case_fold_table) / 2 + 1;
188 sizeof(unsigned int) * 2, 189
189 int_cmp); 190 while (t >= b) {
191 int n = (b + t) / 2;
192 unsigned int *p = &xs_unicode_case_fold_table[n * 2];
193
194 if (cpoint < p[0])
195 t = n - 1;
196 else
197 if (cpoint > p[0])
198 b = n + 1;
199 else
200 return p;
201 }
202
203 return NULL;
190} 204}
191 205
192 206
193unsigned int *_xs_unicode_lower_search(unsigned int cpoint) 207unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
194/* searches for a lowercase codepoint in the case fold table */ 208/* searches for a lowercase codepoint in the case fold table */
195{ 209{
196 unsigned int *p = xs_unicode_case_fold_table + 1; 210 unsigned int *p = xs_unicode_case_fold_table;
197 unsigned int *e = xs_unicode_case_fold_table + 211 unsigned int *e = p + countof(xs_unicode_case_fold_table);
198 sizeof(xs_unicode_case_fold_table) / sizeof(unsigned int);
199 212
200 while (p < e) { 213 while (p < e) {
201 if (cpoint == *p) 214 if (cpoint == p[1])
202 return p; 215 return p;
203 216
204 p += 2; 217 p += 2;
@@ -208,38 +221,49 @@ unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
208} 221}
209 222
210 223
211unsigned int xs_unicode_to_upper(unsigned int cpoint) 224unsigned int xs_unicode_to_lower(unsigned int cpoint)
212/* returns the cpoint to uppercase */ 225/* returns the cpoint to lowercase */
213{ 226{
214 unsigned int *p = _xs_unicode_lower_search(cpoint); 227 unsigned int *p = _xs_unicode_upper_search(cpoint);
215 228
216 return p == NULL ? cpoint : p[-1]; 229 return p == NULL ? cpoint : p[1];
217} 230}
218 231
219 232
220unsigned int xs_unicode_to_lower(unsigned int cpoint) 233unsigned int xs_unicode_to_upper(unsigned int cpoint)
221/* returns the cpoint to lowercase */ 234/* returns the cpoint to uppercase */
222{ 235{
223 unsigned int *p = _xs_unicode_upper_search(cpoint); 236 unsigned int *p = _xs_unicode_lower_search(cpoint);
224 237
225 return p == NULL ? cpoint : p[1]; 238 return p == NULL ? cpoint : p[0];
226} 239}
227 240
228 241
229int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac) 242int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
230/* applies unicode Normalization Form D */ 243/* applies unicode Normalization Form D */
231{ 244{
232 unsigned int *r = bsearch(&cpoint, xs_unicode_nfd_table, 245 int b = 0;
233 sizeof(xs_unicode_nfd_table) / (sizeof(unsigned int) * 3), 246 int t = countof(xs_unicode_nfd_table) / 3 - 1;
234 sizeof(unsigned int) * 3, 247
235 int_cmp); 248 while (t >= b) {
236 249 int n = (b + t) / 2;
237 if (r != NULL) { 250 unsigned int *p = &xs_unicode_nfd_table[n * 3];
238 *base = r[1]; 251
239 *diac = r[2]; 252 int c = cpoint - p[0];
253
254 if (c < 0)
255 t = n - 1;
256 else
257 if (c > 0)
258 b = n + 1;
259 else {
260 *base = p[1];
261 *diac = p[2];
262 return 1;
263 }
240 } 264 }
241 265
242 return !!r; 266 return 0;
243} 267}
244 268
245 269
@@ -247,8 +271,7 @@ int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
247/* applies unicode Normalization Form C */ 271/* applies unicode Normalization Form C */
248{ 272{
249 unsigned int *p = xs_unicode_nfd_table; 273 unsigned int *p = xs_unicode_nfd_table;
250 unsigned int *e = xs_unicode_nfd_table + 274 unsigned int *e = p + countof(xs_unicode_nfd_table);
251 sizeof(xs_unicode_nfd_table) / sizeof(unsigned int);
252 275
253 while (p < e) { 276 while (p < e) {
254 if (p[1] == base && p[2] == diac) { 277 if (p[1] == base && p[2] == diac) {
@@ -266,12 +289,23 @@ int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
266int xs_unicode_is_alpha(unsigned int cpoint) 289int xs_unicode_is_alpha(unsigned int cpoint)
267/* checks if a codepoint is an alpha (i.e. a letter) */ 290/* checks if a codepoint is an alpha (i.e. a letter) */
268{ 291{
269 unsigned int *r = bsearch(&cpoint, xs_unicode_alpha_table, 292 int b = 0;
270 sizeof(xs_unicode_alpha_table) / (sizeof(unsigned int) * 2), 293 int t = countof(xs_unicode_alpha_table) / 2 - 1;
271 sizeof(unsigned int) * 2, 294
272 int_range_cmp); 295 while (t >= b) {
296 int n = (b + t) / 2;
297 unsigned int *p = &xs_unicode_alpha_table[n * 2];
298
299 if (cpoint < p[0])
300 t = n - 1;
301 else
302 if (cpoint > p[1])
303 b = n + 1;
304 else
305 return 1;
306 }
273 307
274 return !!r; 308 return 0;
275} 309}
276 310
277 311