#include #include #include #ifdef WIN32 #include #endif #include "unicode.h" #include "exceptions.h" /// Returns length of wide character in utf-8 #define UTF8_LENGTH(Char) \ ((Char) < 0x80 ? 1 : \ ((Char) < 0x800 ? 2 : \ ((Char) < 0x10000 ? 3 : \ ((Char) < 0x200000 ? 4 : \ ((Char) < 0x4000000 ? 5 : 6))))) #define UTF8_COMPUTE(Char, Mask, Len) \ if (Char < 128) \ { \ Len = 1; \ Mask = 0x7f; \ } \ else if ((Char & 0xe0) == 0xc0) \ { \ Len = 2; \ Mask = 0x1f; \ } \ else if ((Char & 0xf0) == 0xe0) \ { \ Len = 3; \ Mask = 0x0f; \ } \ else if ((Char & 0xf8) == 0xf0) \ { \ Len = 4; \ Mask = 0x07; \ } \ else if ((Char & 0xfc) == 0xf8) \ { \ Len = 5; \ Mask = 0x03; \ } \ else if ((Char & 0xfe) == 0xfc) \ { \ Len = 6; \ Mask = 0x01; \ } \ else \ Len = -1; #ifndef WIN32 static const char utf8_skip_data[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 }; const char * const g_utf8_skip = utf8_skip_data; #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(unsigned char *)(p)]) #define UTF8_GET(Result, Chars, Count, Mask, Len) \ (Result) = (Chars)[0] & (Mask); \ for ((Count) = 1; (Count) < (Len); ++(Count)) \ { \ if (((Chars)[(Count)] & 0xc0) != 0x80) \ { \ (Result) = -1; \ break; \ } \ (Result) <<= 6; \ (Result) |= ((Chars)[(Count)] & 0x3f); \ } /* Like g_utf8_get_char, but take a maximum length * and return (wchar_t)-2 on incomplete trailing character */ static inline wchar_t g_utf8_get_char_extended (const char *p, size_t max_len) { unsigned int i, len; wchar_t wc = (unsigned char) *p; if (wc < 0x80) { return wc; } else if (wc < 0xc0) { return (wchar_t)-1; } else if (wc < 0xe0) { len = 2; wc &= 0x1f; } else if (wc < 0xf0) { len = 3; wc &= 0x0f; } else if (wc < 0xf8) { len = 4; wc &= 0x07; } else if (wc < 0xfc) { len = 5; wc &= 0x03; } else if (wc < 0xfe) { len = 6; wc &= 0x01; } else { return (wchar_t)-1; } if (max_len >= 0 && len > max_len) { for (i = 1; i < max_len; i++) { if ((((unsigned char *)p)[i] & 0xc0) != 0x80) return (wchar_t)-1; } return (wchar_t)-2; } for (i = 1; i < len; ++i) { wchar_t ch = ((unsigned char *)p)[i]; if ((ch & 0xc0) != 0x80) { if (ch) return (wchar_t)-1; else return (wchar_t)-2; } wc <<= 6; wc |= (ch & 0x3f); } if (UTF8_LENGTH(wc) != len) return (wchar_t)-1; return wc; } /** * g_utf8_get_char: * @p: a pointer to Unicode character encoded as UTF-8 * * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. * If @p does not point to a valid UTF-8 encoded character, results are * undefined. If you are not sure that the bytes are complete * valid Unicode characters, you should use g_utf8_get_char_validated() * instead. * * Return value: the resulting character **/ wchar_t g_utf8_get_char (const char *p) { int i, mask = 0, len; wchar_t result; unsigned char c = (unsigned char) *p; UTF8_COMPUTE (c, mask, len); if (len == -1) return (wchar_t)-1; UTF8_GET (result, p, i, mask, len); return result; } /** * g_utf8_to_ucs4: * @str: a UTF-8 encoded string * @len: the maximum length of @str to use. If @len < 0, then * the string is nul-terminated. * @items_read: location to store number of bytes read, or %NULL. * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be * returned in case @str contains a trailing partial * character. If an error occurs then the index of the * invalid input is stored here. * @items_written: location to store number of characters written or %NULL. * The value here stored does not include the trailing 0 * character. * @error: location to store the error occuring, or %NULL to ignore * errors. Any of the errors in #GConvertError other than * %G_CONVERT_ERROR_NO_CONVERSION may occur. * * Convert a string from UTF-8 to a 32-bit fixed width * representation as UCS-4. A trailing 0 will be added to the * string after the converted text. * * Return value: a pointer to a newly allocated UCS-4 string. * This value must be freed with g_free(). If an * error occurs, %NULL will be returned and * @error set. **/ wchar_t * g_utf8_to_ucs4 (const char *str, long len, long *items_read, long *items_written, const wchar_t **error) { wchar_t *result = NULL; int n_chars, i; const char *in; in = str; n_chars = 0; while ((len < 0 || str + len - in > 0) && *in) { wchar_t wc = g_utf8_get_char_extended (in, str + len - in); if (wc & 0x80000000) { if (wc == (wchar_t)-2) { if (items_read) break; else if (error) *error = L"Partial character sequence at end of input"; } else if (error) *error = L"Invalid byte sequence in conversion input"; goto err_out; } n_chars++; in = g_utf8_next_char (in); } result = (wchar_t*)malloc((n_chars + 1) * sizeof(wchar_t)); in = str; for (i=0; i < n_chars; i++) { result[i] = g_utf8_get_char (in); in = g_utf8_next_char (in); } result[i] = 0; if (items_written) *items_written = n_chars; err_out: if (items_read) *items_read = in - str; return result; } /** * g_unichar_to_utf8: * @c: a ISO10646 character code * @outbuf: output buffer, must have at least 6 bytes of space. * If %NULL, the length will be computed and returned * and nothing will be written to @outbuf. * * Converts a single character to UTF-8. * * Return value: number of bytes written **/ int g_unichar_to_utf8 (wchar_t c, char *outbuf) { unsigned int len = 0; int first; int i; if (c < 0x80) { first = 0; len = 1; } else if (c < 0x800) { first = 0xc0; len = 2; } else if (c < 0x10000) { first = 0xe0; len = 3; } else if (c < 0x200000) { first = 0xf0; len = 4; } else if (c < 0x4000000) { first = 0xf8; len = 5; } else { first = 0xfc; len = 6; } if (outbuf) { for (i = len - 1; i > 0; --i) { outbuf[i] = (c & 0x3f) | 0x80; c >>= 6; } outbuf[0] = c | first; } return len; } /** * g_ucs4_to_utf8: * @str: a UCS-4 encoded string * @len: the maximum length of @str to use. If @len < 0, then * the string is terminated with a 0 character. * @items_read: location to store number of characters read read, or %NULL. * @items_written: location to store number of bytes written or %NULL. * The value here stored does not include the trailing 0 * byte. * @error: location to store the error occuring, or %NULL to ignore * errors. Any of the errors in #GConvertError other than * %G_CONVERT_ERROR_NO_CONVERSION may occur. * * Convert a string from a 32-bit fixed width representation as UCS-4. * to UTF-8. The result will be terminated with a 0 byte. * * Return value: a pointer to a newly allocated UTF-8 string. * This value must be freed with g_free(). If an * error occurs, %NULL will be returned and * @error set. **/ char * g_ucs4_to_utf8 (const wchar_t *str, long len, long *items_read, long *items_written, const wchar_t **error) { int result_length; char *result = NULL; char *p; int i; result_length = 0; for (i = 0; len < 0 || i < len ; i++) { if (!str[i]) break; if ((unsigned)str[i] >= 0x80000000) { if (items_read) *items_read = i; if (error) *error = L"Character out of range for UTF-8"; goto err_out; } result_length += UTF8_LENGTH (str[i]); } result = (char*)malloc (result_length + 1); p = result; i = 0; while (p < result + result_length) p += g_unichar_to_utf8 (str[i++], p); *p = '\0'; if (items_written) *items_written = p - result; err_out: if (items_read) *items_read = i; return result; } std::string toUtf8(const std::wstring &str) { long readed, writed; const wchar_t *errMsg = NULL; char *res = g_ucs4_to_utf8(str.c_str(), str.length(), &readed, &writed, &errMsg); if (! res) { if (errMsg) throw Exception(errMsg); else throw Exception(L"Error converting text to UTF-8"); } std::string s(res); free(res); return s; } std::wstring fromUtf8(const std::string &str) { long readed, writed; const wchar_t *errMsg = NULL; wchar_t *res = g_utf8_to_ucs4(str.c_str(), str.length(), &readed, &writed, &errMsg); if (! res) { if (errMsg) throw Exception(errMsg); else throw Exception(L"Error converting text from UTF-8"); } std::wstring s(res); free(res); return s; } #else std::string toUtf8(const std::wstring &str) { if (! str.length()) return ""; int len = str.length(); int bufSize = (len + 1) * 6 + 1; char buf[bufSize]; int res = WideCharToMultiByte(CP_UTF8, 0, str.c_str(), len + 1, buf, bufSize, NULL, NULL); if (! res) throw Exception(L"Error converting UCS-2 to UTF-8"); return buf; } std::wstring fromUtf8(const std::string &str) { if (! str.length()) return L""; int len = str.length(); wchar_t buf[len + 1]; int res = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), len + 1, buf, len + 1); if (! res) throw Exception(L"Error converting UTF-8 to UCS-2"); return buf; } std::string toOem(const std::wstring &str) { if (! str.length()) return ""; int len = str.length(); int bufSize = (len + 1) * 6 + 1; char buf[bufSize]; int res = WideCharToMultiByte(CP_OEMCP, 0, str.c_str(), len + 1, buf, bufSize, NULL, NULL); if (! res) throw Exception(L"Error converting UCS-2 to OEM"); return buf; } std::wstring fromOem(const std::string &str) { if (! str.length()) return L""; int len = str.length(); wchar_t buf[len + 1]; int res = MultiByteToWideChar(CP_OEMCP, 0, str.c_str(), len + 1, buf, len + 1); if (! res) throw Exception(L"Error converting OEM to UCS-2"); return buf; } #endif std::wstring fromUtf8(const char *str, int len) { char *buf = (char*)malloc(len + 1); if (! buf) throw Exception(L"Error allocating memory"); memcpy(buf, str, len); buf[len] = 0; std::string s(buf); free(buf); return fromUtf8(s); } std::string toMbcs(const std::wstring &str) { int len = str.length(); if (! len) return ""; else { int maxSize = MB_CUR_MAX * len; char buf[maxSize + 1]; size_t l = wcstombs(buf, str.c_str(), maxSize); if ((size_t)-1 == -l) { // convert what we can std::string res; for (int i = 0; i < len; i++) { int b = wctomb(buf, str[i]); if (0 < b) { buf[b] = 0; res += buf; } } return res; } else { buf[l] = 0; return buf; } } } std::wstring fromMbcs(const std::string &str) { int maxLen = str.length(); wchar_t ws[maxLen + 1]; size_t cnt = mbstowcs(ws, str.c_str(), maxLen); if (cnt == (size_t)-1) { return L""; } ws[cnt] = 0; return ws; } std::ostream& operator << (std::ostream &stream, const std::wstring &str) { #ifdef WIN32 if ((stream == std::cout) || (stream == std::cerr) || (stream == std::clog)) stream << toOem(str); else #endif stream << toMbcs(str); return stream; } int getUtf8Length(unsigned char c) { int mask, len; UTF8_COMPUTE(c, mask, len); if (-1 == len) { throw Exception(L"Invalid utf-8 character"); } else { return len; } }