1 files changed, 463 insertions, 0 deletions
diff --git a/src/common/utf8.cpp b/src/common/utf8.cpp
new file mode 100644
index 000000000..9aa8088ef
--- /dev/null
+++ b/src/common/utf8.cpp
@@ -0,0 +1,463 @@
+/*
+  Basic UTF-8 manipulation routines
+  by Jeff Bezanson
+  placed in the public domain Fall 2005
+  This code is designed to provide the utilities you need to manipulate
+  UTF-8 as an internal string encoding. These functions do not perform the
+  error checking normally needed when handling UTF-8 data, so if you happen
+  to be from the Unicode Consortium you will want to flay me alive.
+  I do this because error checking can be performed at the boundaries (I/O),
+  with these routines reserved for higher performance on data known to be
+  valid.
+*/
+#ifdef _WIN32
+#include <windows.h>
+#undef min
+#undef max
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#include <algorithm>
+#include <string>
+#include "common_types.h"
+#include "utf8.h"
+/* True when c can begin a UTF-8 sequence, i.e. it is NOT a continuation
+   byte of the form 10xxxxxx. */
+inline bool isutf(char c) {
+    const int marker_bits = c & 0xC0;  /* keep only the two top bits */
+    return marker_bits != 0x80;
+}
+static const u32 offsetsFromUTF8[6] = {  /* indexed by the number of trailing bytes in a sequence */
+  0x00000000UL, 0x00003080UL, 0x000E2080UL,  /* 0, 1 and 2 trailing bytes */
+  0x03C82080UL, 0xFA082080UL, 0x82082080UL   /* 3, 4 and 5 trailing bytes */
+};  /* subtracted from the shifted sum of a sequence's raw bytes (see u8_toucs, u8_nextchar) */
+static const u8 trailingBytesForUTF8[256] = {  /* indexed by the first byte of a sequence: how many bytes follow it */
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0x00-0x1F */
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0x20-0x3F */
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0x40-0x5F */
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0x60-0x7F */
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0x80-0x9F (bytes isutf() rejects) */
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0xA0-0xBF (bytes isutf() rejects) */
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0xC0-0xDF */
+        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,  /* 0xE0-0xEF: 2, 0xF0-0xF7: 3, 0xF8-0xFB: 4, 0xFC-0xFF: 5 */
+};
+/* Byte length of the UTF-8 sequence whose first byte is s[0]: the lead byte
+   itself plus the trailing-byte count looked up for it. */
+int u8_seqlen(const char *s)
+{
+  const unsigned char lead = (unsigned char)*s;
+  const int trailing = trailingBytesForUTF8[lead];
+  return trailing + 1;
+}
+/* Converts UTF-8 to UCS-4 (one u32 per character) without validating the input.
+   dest  = output buffer
+   sz    = dest size in # of wide characters, terminator included
+   src   = UTF-8 input
+   srcsz = source size in bytes, or -1 if 0-terminated
+   returns # characters converted
+   dest is always 0-terminated when sz > 0, even if there isn't enough room
+   for all the characters; nothing at all is written when sz <= 0.
+   if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
+   Lead bytes that announce 4 or 5 trailing bytes (not valid UTF-8) are folded
+   the same way as the others so that the input position always advances.
+*/
+int u8_toucs(u32 *dest, int sz, const char *src, int srcsz)
+{
+  const bool nul_terminated = (srcsz == -1);
+  /* only form the end pointer when a real length was supplied */
+  const char *src_end = nul_terminated ? NULL : src + srcsz;
+  int i = 0;
+  if (sz <= 0)
+    return 0;  /* no room even for the terminator */
+  while (i < sz-1) {
+    if (nul_terminated) {
+      if (*src == 0)
+        break;
+    }
+    else if (src >= src_end) {
+      break;  /* input exhausted: never read the byte at src_end */
+    }
+    const int nb = trailingBytesForUTF8[(unsigned char)*src];
+    if (!nul_terminated && src + nb >= src_end)
+      break;  /* the sequence would run past the end of the input */
+    u32 ch = 0;
+    /* fold in every byte except the last, making room for 6 more bits each time */
+    for (int k = 0; k < nb; k++) {
+      ch += (unsigned char)*src++;
+      ch <<= 6;
+    }
+    ch += (unsigned char)*src++;
+    ch -= offsetsFromUTF8[nb];  /* strip the lead/continuation marker bits */
+    dest[i++] = ch;
+  }
+  dest[i] = 0;
+  return i;
+}
+/* Encodes UCS-4 characters as UTF-8.
+   dest  = output buffer, sz = size of dest in bytes
+   src   = input characters
+   srcsz = number of source characters, or -1 (any negative value) if 0-terminated
+   returns # characters consumed from src
+   Conversion stops, without writing a partial sequence, as soon as the next
+   character does not fit. dest is only '\0'-terminated if all the characters
+   fit and there is still a byte left for the NUL.
+   Values of 0x110000 and above produce no output but are still counted.
+   the destination string will never be bigger than the source string.
+*/
+int u8_toutf8(char *dest, int sz, u32 *src, int srcsz)
+{
+  char *const limit = dest + sz;
+  char *out = dest;
+  int n = 0;
+  for (;;) {
+    const bool more = (srcsz < 0) ? (src[n] != 0) : (n < srcsz);
+    if (!more)
+      break;
+    const u32 cp = src[n];
+    /* bytes needed for this code point; 0 means "not encodable, skip" */
+    int need;
+    if (cp < 0x80)
+      need = 1;
+    else if (cp < 0x800)
+      need = 2;
+    else if (cp < 0x10000)
+      need = 3;
+    else if (cp < 0x110000)
+      need = 4;
+    else
+      need = 0;
+    if (need > 0 && limit - out < need)
+      return n;
+    switch (need) {
+    case 1:
+      *out++ = (char)cp;
+      break;
+    case 2:
+      *out++ = (cp>>6) | 0xC0;
+      *out++ = (cp & 0x3F) | 0x80;
+      break;
+    case 3:
+      *out++ = (cp>>12) | 0xE0;
+      *out++ = ((cp>>6) & 0x3F) | 0x80;
+      *out++ = (cp & 0x3F) | 0x80;
+      break;
+    case 4:
+      *out++ = (cp>>18) | 0xF0;
+      *out++ = ((cp>>12) & 0x3F) | 0x80;
+      *out++ = ((cp>>6) & 0x3F) | 0x80;
+      *out++ = (cp & 0x3F) | 0x80;
+      break;
+    default:
+      break;
+    }
+    n++;
+  }
+  if (out < limit)
+    *out = '\0';
+  return n;
+}
+/* Encodes the single code point ch as UTF-8 into dest; no terminator is
+   written. Returns the number of bytes stored (1 to 4), or 0 without touching
+   dest when ch is 0x110000 or larger. */
+int u8_wc_toutf8(char *dest, u32 ch)
+{
+  /* marker OR-ed into the first byte, indexed by sequence length */
+  static const unsigned char lead_marker[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
+  int len;
+  if (ch >= 0x110000)
+    return 0;
+  if (ch >= 0x10000)
+    len = 4;
+  else if (ch >= 0x800)
+    len = 3;
+  else if (ch >= 0x80)
+    len = 2;
+  else
+    len = 1;
+  /* continuation bytes carry 6 bits each and are filled from the last one backwards */
+  for (int k = len-1; k > 0; k--) {
+    dest[k] = (ch & 0x3F) | 0x80;
+    ch >>= 6;
+  }
+  dest[0] = ch | lead_marker[len];
+  return len;
+}
+/* charnum => byte offset
+   Walks charnum characters forward from the start of str; the walk ends early
+   if the terminating NUL is reached first. */
+int u8_offset(const char *str, int charnum)
+{
+  int pos = 0;
+  for (int left = charnum; left > 0 && str[pos]; left--) {
+    /* step over the first byte, then over at most three continuation bytes */
+    pos++;
+    for (int k = 0; k < 3 && !isutf(str[pos]); k++)
+      pos++;
+  }
+  return pos;
+}
+/* byte offset => charnum
+   Counts the characters of s that start before byte `offset`, stopping at the
+   terminating NUL if it comes first. */
+int u8_charnum(const char *s, int offset)
+{
+  int seen = 0;
+  for (int pos = 0; pos < offset && s[pos]; seen++) {
+    /* step over the first byte, then over at most three continuation bytes */
+    pos++;
+    for (int k = 0; k < 3 && !isutf(s[pos]); k++)
+      pos++;
+  }
+  return seen;
+}
+/* number of characters in the 0-terminated string s
+   The terminator is tested before every u8_nextchar call so the decoder is
+   never asked to start reading at (or beyond) the end of the string.
+   Counting also stops at the first sequence that decodes to 0. */
+int u8_strlen(const char *s)
+{
+  int count = 0;
+  int i = 0;
+  while (s[i] != 0 && u8_nextchar(s, &i) != 0)
+    count++;
+  return count;
+}
+/* reads the next utf-8 sequence out of a string, updating an index
+   s  = 0-terminated UTF-8 string
+   *i = byte index of the sequence to read; on return it indexes the byte
+        that follows the sequence
+   returns the decoded code point
+   When s[*i] is the terminator the result is 0 and *i steps past the NUL,
+   without looking at the byte behind it.
+   No validation is performed. At most 6 bytes are folded into one character
+   so that the offsetsFromUTF8 lookup stays inside the table. */
+u32 u8_nextchar(const char *s, int *i)
+{
+  u32 ch = 0;
+  int sz = 0;
+  if (s[*i] == 0) {
+    (*i)++;
+    return 0;
+  }
+  do {
+    ch <<= 6;
+    ch += (unsigned char)s[(*i)++];
+    sz++;
+  } while (sz < 6 && s[*i] && !isutf(s[*i]));
+  ch -= offsetsFromUTF8[sz-1];  /* strip the lead/continuation marker bits */
+  return ch;
+}
+/* Moves *i forward to the next character of s: one byte is always skipped,
+   followed by up to three continuation bytes. */
+void u8_inc(const char *s, int *i)
+{
+  int pos = *i + 1;
+  for (int k = 0; k < 3 && !isutf(s[pos]); k++)
+    pos++;
+  *i = pos;
+}
+/* Moves *i back to the previous character of s: one byte is always stepped
+   over, followed by up to three more while they are continuation bytes.
+   No check is made against the start of the string. */
+void u8_dec(const char *s, int *i)
+{
+  int pos = *i - 1;
+  for (int k = 0; k < 3 && !isutf(s[pos]); k++)
+    pos--;
+  *i = pos;
+}
+/* Returns 1 when c is one of '0'..'7', otherwise 0. */
+int octal_digit(char c)
+{
+  if (c < '0')
+    return 0;
+  return (c <= '7') ? 1 : 0;
+}
+/* Returns 1 when c is a hexadecimal digit (0-9, A-F or a-f), otherwise 0. */
+int hex_digit(char c)
+{
+  if (c >= '0' && c <= '9')
+    return 1;
+  if (c >= 'a' && c <= 'f')
+    return 1;
+  if (c >= 'A' && c <= 'F')
+    return 1;
+  return 0;
+}
+/* Decodes one escape sequence.
+   str  = text immediately AFTER the backslash
+   dest = receives the resulting character
+   returns number of input characters processed; this is 0 only when str is
+   empty (the backslash was the last character), in which case *dest is the
+   backslash itself so the caller does not step over the terminator.
+   Recognised: \n \t \r \b \f \v \a, 1-3 octal digits, \x + up to 2 hex
+   digits, \u + up to 4 and \U + up to 8. Any other character, and x/u/U
+   with no hex digit behind it, stands for itself. */
+int u8_read_escape_sequence(const char *str, u32 *dest)
+{
+  char digs[9] = "\0\0\0\0\0\0\0\0";  /* collected digits, always NUL-terminated */
+  int dno = 0, i = 1;
+  int base = 0, max_digits = 0;       /* base 0 = not a numeric escape */
+  /* go through unsigned char so a byte >= 0x80 is not sign-extended */
+  u32 ch = (unsigned char)str[0];
+  if (str[0] == '\0') {
+    *dest = '\\';
+    return 0;
+  }
+  switch (str[0]) {
+  case 'n': ch = '\n'; break;
+  case 't': ch = '\t'; break;
+  case 'r': ch = '\r'; break;
+  case 'b': ch = '\b'; break;
+  case 'f': ch = '\f'; break;
+  case 'v': ch = '\v'; break;
+  case 'a': ch = '\a'; break;
+  case 'x': base = 16; max_digits = 2; break;
+  case 'u': base = 16; max_digits = 4; break;
+  case 'U': base = 16; max_digits = 8; break;
+  default:
+    if (octal_digit(str[0])) {
+      /* the octal digits start at str[0] itself, there is no prefix letter */
+      base = 8;
+      max_digits = 3;
+      i = 0;
+    }
+    break;
+  }
+  if (base != 0) {
+    while (dno < max_digits &&
+           (base == 8 ? octal_digit(str[i]) : hex_digit(str[i]))) {
+      digs[dno++] = str[i++];
+    }
+    /* strtoul: eight hex digits do not fit a 32-bit signed long */
+    if (dno > 0)
+      ch = (u32)strtoul(digs, NULL, base);
+  }
+  *dest = ch;
+  return i;
+}
+/* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8
+   example: u8_unescape(mybuf, 256, "hello\\u220e")
+   note the double backslash is needed if called on a C string literal
+   buf = output buffer, sz = its size in bytes, src = 0-terminated input
+   returns the number of bytes written to buf (terminator not counted)
+   buf is only '\0'-terminated if there is a byte left for it.
+   Bytes that are not part of an escape are copied unchanged, so UTF-8 that
+   is already present in src is preserved. A backslash that is the very last
+   character of src is copied literally. */
+int u8_unescape(char *buf, int sz, char *src)
+{
+  int c = 0;
+  while (*src && c < sz) {
+    char temp[4];
+    int amt;
+    if (*src != '\\' || src[1] == '\0') {
+      /* plain byte (or lone trailing backslash): pass through as is */
+      temp[0] = *src++;
+      amt = 1;
+    }
+    else {
+      u32 ch;
+      src++;  /* skip the backslash */
+      src += u8_read_escape_sequence(src, &ch);
+      amt = u8_wc_toutf8(temp, ch);
+    }
+    if (amt > sz-c)
+      break;  /* the encoded character does not fit any more */
+    memcpy(&buf[c], temp, amt);
+    c += amt;
+  }
+  if (c < sz)
+    buf[c] = '\0';
+  return c;
+}
+/* Finds the first occurrence of the character ch in the 0-terminated string s.
+   Returns a pointer to the first byte of the match, or NULL if there is none.
+   *charn receives the character index of the match, or the number of
+   characters examined when nothing matched. */
+const char *u8_strchr(const char *s, u32 ch, int *charn)
+{
+  int next = 0;
+  *charn = 0;
+  for (;;) {
+    const int start = next;  /* byte index where the current character begins */
+    if (!s[start])
+      return NULL;
+    if (u8_nextchar(s, &next) == ch)
+      return s + start;
+    ++*charn;
+  }
+}
+/* Finds the first occurrence of the character ch in the first sz bytes of s.
+   Returns a pointer to the first byte of the match, or NULL if there is none.
+   *charn receives the character index of the match, or the number of
+   characters examined when nothing matched.
+   Byte positions are kept in size_t so they cannot wrap before reaching sz,
+   and at most 6 bytes are folded into one character so that the
+   offsetsFromUTF8 lookup stays inside the table. */
+const char *u8_memchr(const char *s, u32 ch, size_t sz, int *charn)
+{
+  size_t i = 0;
+  *charn = 0;
+  while (i < sz) {
+    const size_t start = i;  /* where the current character begins */
+    u32 c = 0;
+    int csz = 0;
+    do {
+      c <<= 6;
+      c += (unsigned char)s[i++];
+      csz++;
+    } while (csz < 6 && i < sz && !isutf(s[i]));
+    c -= offsetsFromUTF8[csz-1];  /* strip the lead/continuation marker bits */
+    if (c == ch) {
+      return &s[start];
+    }
+    (*charn)++;
+  }
+  return NULL;
+}
+/* Returns 1 if the encoding part of a locale name (the text between the
+   first '.' and the next '@', '+', ',' or end of string) is exactly "UTF-8"
+   or "utf8", otherwise 0. A '.' only counts if it appears before any of
+   those terminators.
+   this code based on libutf8 */
+int u8_is_locale_utf8(const char *locale)
+{
+  const char *p = locale;
+  /* look for the '.' that introduces the encoding */
+  while (*p != '.') {
+    if (*p == '\0' || *p == '@' || *p == '+' || *p == ',')
+      return 0;
+    p++;
+  }
+  const char *name = p + 1;
+  const char *stop = name;
+  while (!(*stop == '\0' || *stop == '@' || *stop == '+' || *stop == ','))
+    stop++;
+  if (stop - name == 5)
+    return strncmp(name, "UTF-8", 5) == 0 ? 1 : 0;
+  if (stop - name == 4)
+    return strncmp(name, "utf8", 4) == 0 ? 1 : 0;
+  return 0;
+}
+/* Counts how many values returned by UTF8::next() for utf8string are greater
+   than 127, i.e. how many of its characters lie outside the ASCII range. */
+int UTF8StringNonASCIICount(const char *utf8string) {
+    int nonAscii = 0;
+    for (UTF8 reader(utf8string); !reader.end(); ) {
+        const int codepoint = reader.next();
+        nonAscii += (codepoint > 127) ? 1 : 0;
+    }
+    return nonAscii;
+}
+/* Returns true if utf8string contains at least one character above 127.
+   Scanning stops at the first such character instead of counting all of them. */
+bool UTF8StringHasNonASCII(const char *utf8string) {
+    UTF8 utf(utf8string);
+    while (!utf.end()) {
+        int c = utf.next();
+        if (c > 127)
+            return true;
+    }
+    return false;
+}
+#ifdef _WIN32
+/* Converts a 0-terminated wide-character string to UTF-8 with
+   WideCharToMultiByte. The result is empty when the API reports size 0. */
+std::string ConvertWStringToUTF8(const wchar_t *wstr) {
+    const int wideLen = (int)wcslen(wstr);
+    /* first call: ask how many bytes the UTF-8 form needs */
+    const int byteCount = (int)WideCharToMultiByte(CP_UTF8, 0, wstr, wideLen, 0, 0, NULL, NULL);
+    std::string utf8;
+    if (byteCount == 0)
+        return utf8;
+    utf8.resize(byteCount);
+    /* second call: convert straight into the string's storage */
+    WideCharToMultiByte(CP_UTF8, 0, wstr, wideLen, &utf8[0], byteCount, NULL, NULL);
+    return utf8;
+}
+/* Converts the wstr.size() wide characters of wstr to UTF-8 with
+   WideCharToMultiByte. The result is empty when the API reports size 0. */
+std::string ConvertWStringToUTF8(const std::wstring &wstr) {
+    const wchar_t *wide = wstr.c_str();
+    const int wideLen = (int)wstr.size();
+    /* first call: ask how many bytes the UTF-8 form needs */
+    const int byteCount = (int)WideCharToMultiByte(CP_UTF8, 0, wide, wideLen, 0, 0, NULL, NULL);
+    std::string utf8;
+    if (byteCount == 0)
+        return utf8;
+    utf8.resize(byteCount);
+    /* second call: convert straight into the string's storage */
+    WideCharToMultiByte(CP_UTF8, 0, wide, wideLen, &utf8[0], byteCount, NULL, NULL);
+    return utf8;
+}
+/* Converts the UTF-8 string source into the caller's buffer.
+   dest     = output buffer
+   destSize = capacity of dest in wchar_t elements, terminator included
+   dest is always L'\0'-terminated when destSize > 0. If the converted text
+   does not fit it is truncated to destSize-1 elements (the cut may fall
+   inside a surrogate pair). An empty or unconvertible source yields an
+   empty string. Nothing is written when dest is NULL or destSize is 0. */
+void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, const std::string &source) {
+    if (dest == NULL || destSize == 0)
+        return;
+    dest[0] = L'\0';
+    const int len = (int)source.size();
+    if (len == 0)
+        return;  /* MultiByteToWideChar rejects a zero-length input */
+    const int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.c_str(), len, NULL, 0);
+    if (size <= 0)
+        return;
+    if ((size_t)size < destSize) {
+        /* everything fits, terminator included: convert in place */
+        const int written = (int)MultiByteToWideChar(CP_UTF8, 0, source.c_str(), len, dest, size);
+        dest[written > 0 ? written : 0] = L'\0';
+        return;
+    }
+    /* too long: the API fails outright on a short buffer, so convert into a
+       temporary of the full size and copy the part that fits */
+    std::wstring wide;
+    wide.resize(size);
+    const int written = (int)MultiByteToWideChar(CP_UTF8, 0, source.c_str(), len, &wide[0], size);
+    if (written <= 0)
+        return;
+    const size_t count = std::min((size_t)written, destSize - 1);
+    memcpy(dest, wide.data(), count * sizeof(wchar_t));
+    dest[count] = L'\0';
+}
+/* Converts the UTF-8 string source to a wide string with MultiByteToWideChar.
+   The result is empty when the API reports size 0. */
+std::wstring ConvertUTF8ToWString(const std::string &source) {
+    const char *utf8 = source.c_str();
+    const int byteLen = (int)source.size();
+    /* first call: ask how many wide characters are needed */
+    const int wideCount = (int)MultiByteToWideChar(CP_UTF8, 0, utf8, byteLen, NULL, 0);
+    std::wstring wide;
+    if (wideCount == 0)
+        return wide;
+    wide.resize(wideCount);
+    /* second call: convert straight into the string's storage */
+    MultiByteToWideChar(CP_UTF8, 0, utf8, byteLen, &wide[0], wideCount);
+    return wide;
+}
+#endif
+\ No newline at end of file

diff --git a/src/common/utf8.cpp b/src/common/utf8.cpp new file mode 100644 index 000000000..9aa8088ef --- /dev/null +++ b/src/common/utf8.cpp
@@ -0,0 +1,463 @@
	1	/*
	2	Basic UTF-8 manipulation routines
	3	by Jeff Bezanson
	4	placed in the public domain Fall 2005
	5
	6	This code is designed to provide the utilities you need to manipulate
	7	UTF-8 as an internal string encoding. These functions do not perform the
	8	error checking normally needed when handling UTF-8 data, so if you happen
	9	to be from the Unicode Consortium you will want to flay me alive.
	10	I do this because error checking can be performed at the boundaries (I/O),
	11	with these routines reserved for higher performance on data known to be
	12	valid.
	13	*/
	14
	15	#ifdef _WIN32
	16	#include <windows.h>
	17	#undef min
	18	#undef max
	19	#endif
	20
	21	#include <stdlib.h>
	22	#include <stdio.h>
	23	#include <string.h>
	24	#include <stdarg.h>
	25
	26	#include <algorithm>
	27	#include <string>
	28
	29	#include "common_types.h"
	30	#include "utf8.h"
	31
	32	// is start of UTF sequence
	33	inline bool isutf(char c) {
	34	return (c & 0xC0) != 0x80;
	35	}
	36
	37	static const u32 offsetsFromUTF8[6] = {
	38	0x00000000UL, 0x00003080UL, 0x000E2080UL,
	39	0x03C82080UL, 0xFA082080UL, 0x82082080UL
	40	};
	41
	42	static const u8 trailingBytesForUTF8[256] = {
	43	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	44	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	45	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	46	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	47	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	48	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	49	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	50	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
	51	};
	52
	53	/* returns length of next utf-8 sequence */
	54	int u8_seqlen(const char *s)
	55	{
	56	return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1;
	57	}
	58
	59	/* conversions without error checking
	60	only works for valid UTF-8, i.e. no 5- or 6-byte sequences
	61	srcsz = source size in bytes, or -1 if 0-terminated
	62	sz = dest size in # of wide characters
	63
	64	returns # characters converted
	65	dest will always be L'\0'-terminated, even if there isn't enough room
	66	for all the characters.
	67	if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
	68	*/
	69	int u8_toucs(u32 dest, int sz, const char src, int srcsz)
	70	{
	71	u32 ch;
	72	const char *src_end = src + srcsz;
	73	int nb;
	74	int i=0;
	75
	76	while (i < sz-1) {
	77	nb = trailingBytesForUTF8[(unsigned char)*src];
	78	if (srcsz == -1) {
	79	if (*src == 0)
	80	goto done_toucs;
	81	}
	82	else {
	83	if (src + nb >= src_end)
	84	goto done_toucs;
	85	}
	86	ch = 0;
	87	switch (nb) {
	88	/* these fall through deliberately */
	89	case 3: ch += (unsigned char)*src++; ch <<= 6;
	90	case 2: ch += (unsigned char)*src++; ch <<= 6;
	91	case 1: ch += (unsigned char)*src++; ch <<= 6;
	92	case 0: ch += (unsigned char)*src++;
	93	}
	94	ch -= offsetsFromUTF8[nb];
	95	dest[i++] = ch;
	96	}
	97	done_toucs:
	98	dest[i] = 0;
	99	return i;
	100	}
	101
	102	/* srcsz = number of source characters, or -1 if 0-terminated
	103	sz = size of dest buffer in bytes
	104
	105	returns # characters converted
	106	dest will only be '\0'-terminated if there is enough space. this is
	107	for consistency; imagine there are 2 bytes of space left, but the next
	108	character requires 3 bytes. in this case we could NUL-terminate, but in
	109	general we can't when there's insufficient space. therefore this function
	110	only NUL-terminates if all the characters fit, and there's space for
	111	the NUL as well.
	112	the destination string will never be bigger than the source string.
	113	*/
	114	int u8_toutf8(char dest, int sz, u32 src, int srcsz)
	115	{
	116	u32 ch;
	117	int i = 0;
	118	char *dest_end = dest + sz;
	119
	120	while (srcsz<0 ? src[i]!=0 : i < srcsz) {
	121	ch = src[i];
	122	if (ch < 0x80) {
	123	if (dest >= dest_end)
	124	return i;
	125	*dest++ = (char)ch;
	126	}
	127	else if (ch < 0x800) {
	128	if (dest >= dest_end-1)
	129	return i;
	130	*dest++ = (ch>>6) \| 0xC0;
	131	*dest++ = (ch & 0x3F) \| 0x80;
	132	}
	133	else if (ch < 0x10000) {
	134	if (dest >= dest_end-2)
	135	return i;
	136	*dest++ = (ch>>12) \| 0xE0;
	137	*dest++ = ((ch>>6) & 0x3F) \| 0x80;
	138	*dest++ = (ch & 0x3F) \| 0x80;
	139	}
	140	else if (ch < 0x110000) {
	141	if (dest >= dest_end-3)
	142	return i;
	143	*dest++ = (ch>>18) \| 0xF0;
	144	*dest++ = ((ch>>12) & 0x3F) \| 0x80;
	145	*dest++ = ((ch>>6) & 0x3F) \| 0x80;
	146	*dest++ = (ch & 0x3F) \| 0x80;
	147	}
	148	i++;
	149	}
	150	if (dest < dest_end)
	151	*dest = '\0';
	152	return i;
	153	}
	154
	155	int u8_wc_toutf8(char *dest, u32 ch)
	156	{
	157	if (ch < 0x80) {
	158	dest[0] = (char)ch;
	159	return 1;
	160	}
	161	if (ch < 0x800) {
	162	dest[0] = (ch>>6) \| 0xC0;
	163	dest[1] = (ch & 0x3F) \| 0x80;
	164	return 2;
	165	}
	166	if (ch < 0x10000) {
	167	dest[0] = (ch>>12) \| 0xE0;
	168	dest[1] = ((ch>>6) & 0x3F) \| 0x80;
	169	dest[2] = (ch & 0x3F) \| 0x80;
	170	return 3;
	171	}
	172	if (ch < 0x110000) {
	173	dest[0] = (ch>>18) \| 0xF0;
	174	dest[1] = ((ch>>12) & 0x3F) \| 0x80;
	175	dest[2] = ((ch>>6) & 0x3F) \| 0x80;
	176	dest[3] = (ch & 0x3F) \| 0x80;
	177	return 4;
	178	}
	179	return 0;
	180	}
	181
	182	/* charnum => byte offset */
	183	int u8_offset(const char *str, int charnum)
	184	{
	185	int offs=0;
	186
	187	while (charnum > 0 && str[offs]) {
	188	(void)(isutf(str[++offs]) \|\| isutf(str[++offs]) \|\|
	189	isutf(str[++offs]) \|\| ++offs);
	190	charnum--;
	191	}
	192	return offs;
	193	}
	194
	195	/* byte offset => charnum */
	196	int u8_charnum(const char *s, int offset)
	197	{
	198	int charnum = 0, offs=0;
	199
	200	while (offs < offset && s[offs]) {
	201	(void)(isutf(s[++offs]) \|\| isutf(s[++offs]) \|\|
	202	isutf(s[++offs]) \|\| ++offs);
	203	charnum++;
	204	}
	205	return charnum;
	206	}
	207
	208	/* number of characters */
	209	int u8_strlen(const char *s)
	210	{
	211	int count = 0;
	212	int i = 0;
	213
	214	while (u8_nextchar(s, &i) != 0)
	215	count++;
	216
	217	return count;
	218	}
	219
	220	/* reads the next utf-8 sequence out of a string, updating an index */
	221	u32 u8_nextchar(const char s, int i)
	222	{
	223	u32 ch = 0;
	224	int sz = 0;
	225
	226	do {
	227	ch <<= 6;
	228	ch += (unsigned char)s[(*i)++];
	229	sz++;
	230	} while (s[i] && !isutf(s[i]));
	231	ch -= offsetsFromUTF8[sz-1];
	232
	233	return ch;
	234	}
	235
	236	void u8_inc(const char s, int i)
	237	{
	238	(void)(isutf(s[++(i)]) \|\| isutf(s[++(i)]) \|\|
	239	isutf(s[++(i)]) \|\| ++(i));
	240	}
	241
	242	void u8_dec(const char s, int i)
	243	{
	244	(void)(isutf(s[--(i)]) \|\| isutf(s[--(i)]) \|\|
	245	isutf(s[--(i)]) \|\| --(i));
	246	}
	247
	248	int octal_digit(char c)
	249	{
	250	return (c >= '0' && c <= '7');
	251	}
	252
	253	int hex_digit(char c)
	254	{
	255	return ((c >= '0' && c <= '9') \|\|
	256	(c >= 'A' && c <= 'F') \|\|
	257	(c >= 'a' && c <= 'f'));
	258	}
	259
	260	/* assumes that src points to the character after a backslash
	261	returns number of input characters processed */
	262	int u8_read_escape_sequence(const char str, u32 dest)
	263	{
	264	u32 ch;
	265	char digs[9]="\0\0\0\0\0\0\0\0";
	266	int dno=0, i=1;
	267
	268	ch = (u32)str[0]; /* take literal character */
	269	if (str[0] == 'n')
	270	ch = L'\n';
	271	else if (str[0] == 't')
	272	ch = L'\t';
	273	else if (str[0] == 'r')
	274	ch = L'\r';
	275	else if (str[0] == 'b')
	276	ch = L'\b';
	277	else if (str[0] == 'f')
	278	ch = L'\f';
	279	else if (str[0] == 'v')
	280	ch = L'\v';
	281	else if (str[0] == 'a')
	282	ch = L'\a';
	283	else if (octal_digit(str[0])) {
	284	i = 0;
	285	do {
	286	digs[dno++] = str[i++];
	287	} while (octal_digit(str[i]) && dno < 3);
	288	ch = strtol(digs, NULL, 8);
	289	}
	290	else if (str[0] == 'x') {
	291	while (hex_digit(str[i]) && dno < 2) {
	292	digs[dno++] = str[i++];
	293	}
	294	if (dno > 0)
	295	ch = strtol(digs, NULL, 16);
	296	}
	297	else if (str[0] == 'u') {
	298	while (hex_digit(str[i]) && dno < 4) {
	299	digs[dno++] = str[i++];
	300	}
	301	if (dno > 0)
	302	ch = strtol(digs, NULL, 16);
	303	}
	304	else if (str[0] == 'U') {
	305	while (hex_digit(str[i]) && dno < 8) {
	306	digs[dno++] = str[i++];
	307	}
	308	if (dno > 0)
	309	ch = strtol(digs, NULL, 16);
	310	}
	311	*dest = ch;
	312
	313	return i;
	314	}
	315
	316	/* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8
	317	example: u8_unescape(mybuf, 256, "hello\\u220e")
	318	note the double backslash is needed if called on a C string literal */
	319	int u8_unescape(char buf, int sz, char src)
	320	{
	321	int c=0, amt;
	322	u32 ch;
	323	char temp[4];
	324
	325	while (*src && c < sz) {
	326	if (*src == '\\') {
	327	src++;
	328	amt = u8_read_escape_sequence(src, &ch);
	329	}
	330	else {
	331	ch = (u32)*src;
	332	amt = 1;
	333	}
	334	src += amt;
	335	amt = u8_wc_toutf8(temp, ch);
	336	if (amt > sz-c)
	337	break;
	338	memcpy(&buf[c], temp, amt);
	339	c += amt;
	340	}
	341	if (c < sz)
	342	buf[c] = '\0';
	343	return c;
	344	}
	345
	346	const char u8_strchr(const char s, u32 ch, int *charn)
	347	{
	348	int i = 0, lasti=0;
	349	u32 c;
	350
	351	*charn = 0;
	352	while (s[i]) {
	353	c = u8_nextchar(s, &i);
	354	if (c == ch) {
	355	return &s[lasti];
	356	}
	357	lasti = i;
	358	(*charn)++;
	359	}
	360	return NULL;
	361	}
	362
	363	const char u8_memchr(const char s, u32 ch, size_t sz, int *charn)
	364	{
	365	u32 i = 0, lasti=0;
	366	u32 c;
	367	int csz;
	368
	369	*charn = 0;
	370	while (i < sz) {
	371	c = csz = 0;
	372	do {
	373	c <<= 6;
	374	c += (unsigned char)s[i++];
	375	csz++;
	376	} while (i < sz && !isutf(s[i]));
	377	c -= offsetsFromUTF8[csz-1];
	378
	379	if (c == ch) {
	380	return &s[lasti];
	381	}
	382	lasti = i;
	383	(*charn)++;
	384	}
	385	return NULL;
	386	}
	387
	388	int u8_is_locale_utf8(const char *locale)
	389	{
	390	/* this code based on libutf8 */
	391	const char* cp = locale;
	392
	393	for (; cp != '\0' && cp != '@' && cp != '+' && cp != ','; cp++) {
	394	if (*cp == '.') {
	395	const char* encoding = ++cp;
	396	for (; cp != '\0' && cp != '@' && cp != '+' && cp != ','; cp++)
	397	;
	398	if ((cp-encoding == 5 && !strncmp(encoding, "UTF-8", 5))
	399	\|\| (cp-encoding == 4 && !strncmp(encoding, "utf8", 4)))
	400	return 1; /* it's UTF-8 */
	401	break;
	402	}
	403	}
	404	return 0;
	405	}
	406
	407	int UTF8StringNonASCIICount(const char *utf8string) {
	408	UTF8 utf(utf8string);
	409	int count = 0;
	410	while (!utf.end()) {
	411	int c = utf.next();
	412	if (c > 127)
	413	++count;
	414	}
	415	return count;
	416	}
	417
	418	bool UTF8StringHasNonASCII(const char *utf8string) {
	419	return UTF8StringNonASCIICount(utf8string) > 0;
	420	}
	421
	422	#ifdef _WIN32
	423
	424	std::string ConvertWStringToUTF8(const wchar_t *wstr) {
	425	int len = (int)wcslen(wstr);
	426	int size = (int)WideCharToMultiByte(CP_UTF8, 0, wstr, len, 0, 0, NULL, NULL);
	427	std::string s;
	428	s.resize(size);
	429	if (size > 0) {
	430	WideCharToMultiByte(CP_UTF8, 0, wstr, len, &s[0], size, NULL, NULL);
	431	}
	432	return s;
	433	}
	434
	435	std::string ConvertWStringToUTF8(const std::wstring &wstr) {
	436	int len = (int)wstr.size();
	437	int size = (int)WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), len, 0, 0, NULL, NULL);
	438	std::string s;
	439	s.resize(size);
	440	if (size > 0) {
	441	WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), len, &s[0], size, NULL, NULL);
	442	}
	443	return s;
	444	}
	445
	446	void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, const std::string &source) {
	447	int len = (int)source.size();
	448	int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.c_str(), len, NULL, 0);
	449	MultiByteToWideChar(CP_UTF8, 0, source.c_str(), len, dest, std::min((int)destSize, size));
	450	}
	451
	452	std::wstring ConvertUTF8ToWString(const std::string &source) {
	453	int len = (int)source.size();
	454	int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.c_str(), len, NULL, 0);
	455	std::wstring str;
	456	str.resize(size);
	457	if (size > 0) {
	458	MultiByteToWideChar(CP_UTF8, 0, source.c_str(), len, &str[0], size);
	459	}
	460	return str;
	461	}
	462
	463	#endif \ No newline at end of file