599 lines
13 KiB
C++
599 lines
13 KiB
C++
#include <wchar.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#ifdef WIN32
|
|
#include <windows.h>
|
|
#endif
|
|
#include "unicode.h"
|
|
#include "exceptions.h"
|
|
|
|
|
|
/// Number of bytes (1..6) needed to encode the code point Char in UTF-8.
/// Char is evaluated more than once, so it must be free of side effects.
#define UTF8_LENGTH(Char)          \
  ((Char) < 0x80      ? 1 :        \
   (Char) < 0x800     ? 2 :        \
   (Char) < 0x10000   ? 3 :        \
   (Char) < 0x200000  ? 4 :        \
   (Char) < 0x4000000 ? 5 : 6)
|
|
|
|
/*
 * Classify the lead byte Char of a UTF-8 sequence.
 *
 * On return Len holds the total sequence length (1..6) and Mask the bits of
 * the lead byte that carry payload.  If Char cannot start a sequence
 * (continuation byte 0x80..0xbf, or 0xfe/0xff) Len is set to -1 and Mask is
 * left untouched.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe inside an un-braced if/else), and every argument is
 * parenthesized so expressions such as `a | b` are classified as a whole.
 */
#define UTF8_COMPUTE(Char, Mask, Len)        \
  do                                         \
    {                                        \
      if ((Char) < 128)                      \
        {                                    \
          (Len) = 1;                         \
          (Mask) = 0x7f;                     \
        }                                    \
      else if (((Char) & 0xe0) == 0xc0)      \
        {                                    \
          (Len) = 2;                         \
          (Mask) = 0x1f;                     \
        }                                    \
      else if (((Char) & 0xf0) == 0xe0)      \
        {                                    \
          (Len) = 3;                         \
          (Mask) = 0x0f;                     \
        }                                    \
      else if (((Char) & 0xf8) == 0xf0)      \
        {                                    \
          (Len) = 4;                         \
          (Mask) = 0x07;                     \
        }                                    \
      else if (((Char) & 0xfc) == 0xf8)      \
        {                                    \
          (Len) = 5;                         \
          (Mask) = 0x03;                     \
        }                                    \
      else if (((Char) & 0xfe) == 0xfc)      \
        {                                    \
          (Len) = 6;                         \
          (Mask) = 0x01;                     \
        }                                    \
      else                                   \
        {                                    \
          (Len) = -1;                        \
        }                                    \
    }                                        \
  while (0)
|
|
|
|
|
|
|
|
#ifndef WIN32
|
|
|
|
/*
 * Sequence length implied by each possible lead byte.  Bytes that cannot
 * start a sequence (continuation bytes, 0xfe, 0xff) map to 1 so that a
 * scanner always makes progress.
 */
static const char utf8_skip_data[256] = {
  /* 0x00 - 0x7f: ASCII, one byte each */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x80 - 0xbf: continuation bytes, skipped one at a time */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xc0 - 0xdf: two-byte lead bytes */
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xe0 - 0xef: three-byte lead bytes */
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  /* 0xf0 - 0xf7: four, 0xf8 - 0xfb: five, 0xfc - 0xfd: six, 0xfe - 0xff: invalid */
  4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Public alias through which the skip table is indexed. */
const char * const g_utf8_skip = utf8_skip_data;
|
|
|
|
/* Step from p to the start of the next character, using the lead byte at *p
 * to look up the sequence length in g_utf8_skip.  p is evaluated twice. */
#define g_utf8_next_char(p) \
  (char *)((p) + g_utf8_skip[*(const unsigned char *)(p)])
|
|
|
|
/*
 * Decode the Len-byte UTF-8 sequence at Chars into Result.
 *
 * Mask selects the payload bits of the lead byte (as produced by
 * UTF8_COMPUTE); Count is a caller-supplied scratch loop index.  If any of
 * the trailing bytes is not a continuation byte (10xxxxxx), Result is set
 * to -1 and decoding stops.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one statement.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)        \
  do                                                     \
    {                                                    \
      (Result) = (Chars)[0] & (Mask);                    \
      for ((Count) = 1; (Count) < (Len); ++(Count))      \
        {                                                \
          if (((Chars)[(Count)] & 0xc0) != 0x80)         \
            {                                            \
              (Result) = -1;                             \
              break;                                     \
            }                                            \
          (Result) <<= 6;                                \
          (Result) |= ((Chars)[(Count)] & 0x3f);         \
        }                                                \
    }                                                    \
  while (0)
|
|
|
|
/* Like g_utf8_get_char, but take a maximum length
|
|
* and return (wchar_t)-2 on incomplete trailing character
|
|
*/
|
|
static inline wchar_t
|
|
g_utf8_get_char_extended (const char *p,
|
|
size_t max_len)
|
|
{
|
|
unsigned int i, len;
|
|
wchar_t wc = (unsigned char) *p;
|
|
|
|
if (wc < 0x80)
|
|
{
|
|
return wc;
|
|
}
|
|
else if (wc < 0xc0)
|
|
{
|
|
return (wchar_t)-1;
|
|
}
|
|
else if (wc < 0xe0)
|
|
{
|
|
len = 2;
|
|
wc &= 0x1f;
|
|
}
|
|
else if (wc < 0xf0)
|
|
{
|
|
len = 3;
|
|
wc &= 0x0f;
|
|
}
|
|
else if (wc < 0xf8)
|
|
{
|
|
len = 4;
|
|
wc &= 0x07;
|
|
}
|
|
else if (wc < 0xfc)
|
|
{
|
|
len = 5;
|
|
wc &= 0x03;
|
|
}
|
|
else if (wc < 0xfe)
|
|
{
|
|
len = 6;
|
|
wc &= 0x01;
|
|
}
|
|
else
|
|
{
|
|
return (wchar_t)-1;
|
|
}
|
|
|
|
if (max_len >= 0 && len > max_len)
|
|
{
|
|
for (i = 1; i < max_len; i++)
|
|
{
|
|
if ((((unsigned char *)p)[i] & 0xc0) != 0x80)
|
|
return (wchar_t)-1;
|
|
}
|
|
return (wchar_t)-2;
|
|
}
|
|
|
|
for (i = 1; i < len; ++i)
|
|
{
|
|
wchar_t ch = ((unsigned char *)p)[i];
|
|
|
|
if ((ch & 0xc0) != 0x80)
|
|
{
|
|
if (ch)
|
|
return (wchar_t)-1;
|
|
else
|
|
return (wchar_t)-2;
|
|
}
|
|
|
|
wc <<= 6;
|
|
wc |= (ch & 0x3f);
|
|
}
|
|
|
|
if (UTF8_LENGTH(wc) != len)
|
|
return (wchar_t)-1;
|
|
|
|
return wc;
|
|
}
|
|
|
|
/**
|
|
* g_utf8_get_char:
|
|
* @p: a pointer to Unicode character encoded as UTF-8
|
|
*
|
|
* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
|
|
* If @p does not point to a valid UTF-8 encoded character, results are
|
|
* undefined. If you are not sure that the bytes are complete
|
|
* valid Unicode characters, you should use g_utf8_get_char_validated()
|
|
* instead.
|
|
*
|
|
* Return value: the resulting character
|
|
**/
|
|
wchar_t
|
|
g_utf8_get_char (const char *p)
|
|
{
|
|
int i, mask = 0, len;
|
|
wchar_t result;
|
|
unsigned char c = (unsigned char) *p;
|
|
|
|
UTF8_COMPUTE (c, mask, len);
|
|
if (len == -1)
|
|
return (wchar_t)-1;
|
|
UTF8_GET (result, p, i, mask, len);
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
/**
|
|
* g_utf8_to_ucs4:
|
|
* @str: a UTF-8 encoded string
|
|
* @len: the maximum length of @str to use. If @len < 0, then
|
|
* the string is nul-terminated.
|
|
* @items_read: location to store number of bytes read, or %NULL.
|
|
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
|
|
* returned in case @str contains a trailing partial
|
|
* character. If an error occurs then the index of the
|
|
* invalid input is stored here.
|
|
* @items_written: location to store number of characters written or %NULL.
|
|
* The value here stored does not include the trailing 0
|
|
* character.
|
|
* @error: location to store the error occuring, or %NULL to ignore
|
|
* errors. Any of the errors in #GConvertError other than
|
|
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
|
*
|
|
* Convert a string from UTF-8 to a 32-bit fixed width
|
|
* representation as UCS-4. A trailing 0 will be added to the
|
|
* string after the converted text.
|
|
*
|
|
* Return value: a pointer to a newly allocated UCS-4 string.
|
|
* This value must be freed with g_free(). If an
|
|
* error occurs, %NULL will be returned and
|
|
* @error set.
|
|
**/
|
|
wchar_t *
|
|
g_utf8_to_ucs4 (const char *str,
|
|
long len,
|
|
long *items_read,
|
|
long *items_written,
|
|
const wchar_t **error)
|
|
{
|
|
wchar_t *result = NULL;
|
|
int n_chars, i;
|
|
const char *in;
|
|
|
|
in = str;
|
|
n_chars = 0;
|
|
while ((len < 0 || str + len - in > 0) && *in)
|
|
{
|
|
wchar_t wc = g_utf8_get_char_extended (in, str + len - in);
|
|
if (wc & 0x80000000)
|
|
{
|
|
if (wc == (wchar_t)-2)
|
|
{
|
|
if (items_read)
|
|
break;
|
|
else
|
|
if (error)
|
|
*error = L"Partial character sequence at end of input";
|
|
}
|
|
else
|
|
if (error)
|
|
*error = L"Invalid byte sequence in conversion input";
|
|
|
|
goto err_out;
|
|
}
|
|
|
|
n_chars++;
|
|
|
|
in = g_utf8_next_char (in);
|
|
}
|
|
|
|
result = (wchar_t*)malloc((n_chars + 1) * sizeof(wchar_t));
|
|
|
|
in = str;
|
|
for (i=0; i < n_chars; i++)
|
|
{
|
|
result[i] = g_utf8_get_char (in);
|
|
in = g_utf8_next_char (in);
|
|
}
|
|
result[i] = 0;
|
|
|
|
if (items_written)
|
|
*items_written = n_chars;
|
|
|
|
err_out:
|
|
if (items_read)
|
|
*items_read = in - str;
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
 * g_unichar_to_utf8:
 * @c: a character code
 * @outbuf: output buffer with room for at least 6 bytes, or %NULL to
 *          only compute the encoded length without writing anything.
 *
 * Encodes a single character as UTF-8.  No terminating 0 byte is written.
 *
 * Return value: number of bytes the encoding occupies (1..6)
 **/
int
g_unichar_to_utf8 (wchar_t c,
                   char *outbuf)
{
  /* Exclusive upper bounds of the values that fit in 1..5 bytes;
   * everything at or above the last bound takes 6 bytes. */
  static const long upper_bound[5] =
    { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
  /* Marker bits of the lead byte, indexed by (sequence length - 1). */
  static const int lead_marker[6] =
    { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };

  unsigned int len = 1;
  int pos;

  while (len < 6 && !(c < upper_bound[len - 1]))
    ++len;

  if (outbuf)
    {
      /* Fill continuation bytes from the end, six payload bits at a time;
       * whatever remains of c goes into the lead byte. */
      pos = len - 1;
      while (pos > 0)
        {
          outbuf[pos] = (c & 0x3f) | 0x80;
          c >>= 6;
          --pos;
        }
      outbuf[0] = c | lead_marker[len - 1];
    }

  return len;
}
|
|
|
|
/**
|
|
* g_ucs4_to_utf8:
|
|
* @str: a UCS-4 encoded string
|
|
* @len: the maximum length of @str to use. If @len < 0, then
|
|
* the string is terminated with a 0 character.
|
|
* @items_read: location to store number of characters read read, or %NULL.
|
|
* @items_written: location to store number of bytes written or %NULL.
|
|
* The value here stored does not include the trailing 0
|
|
* byte.
|
|
* @error: location to store the error occuring, or %NULL to ignore
|
|
* errors. Any of the errors in #GConvertError other than
|
|
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
|
*
|
|
* Convert a string from a 32-bit fixed width representation as UCS-4.
|
|
* to UTF-8. The result will be terminated with a 0 byte.
|
|
*
|
|
* Return value: a pointer to a newly allocated UTF-8 string.
|
|
* This value must be freed with g_free(). If an
|
|
* error occurs, %NULL will be returned and
|
|
* @error set.
|
|
**/
|
|
char *
|
|
g_ucs4_to_utf8 (const wchar_t *str,
|
|
long len,
|
|
long *items_read,
|
|
long *items_written,
|
|
const wchar_t **error)
|
|
{
|
|
int result_length;
|
|
char *result = NULL;
|
|
char *p;
|
|
int i;
|
|
|
|
result_length = 0;
|
|
for (i = 0; len < 0 || i < len ; i++)
|
|
{
|
|
if (!str[i])
|
|
break;
|
|
|
|
if ((unsigned)str[i] >= 0x80000000)
|
|
{
|
|
if (items_read)
|
|
*items_read = i;
|
|
if (error)
|
|
*error = L"Character out of range for UTF-8";
|
|
goto err_out;
|
|
}
|
|
|
|
result_length += UTF8_LENGTH (str[i]);
|
|
}
|
|
|
|
result = (char*)malloc (result_length + 1);
|
|
p = result;
|
|
|
|
i = 0;
|
|
while (p < result + result_length)
|
|
p += g_unichar_to_utf8 (str[i++], p);
|
|
|
|
*p = '\0';
|
|
|
|
if (items_written)
|
|
*items_written = p - result;
|
|
|
|
err_out:
|
|
if (items_read)
|
|
*items_read = i;
|
|
|
|
return result;
|
|
}
|
|
|
|
std::string toUtf8(const std::wstring &str)
|
|
{
|
|
long readed, writed;
|
|
const wchar_t *errMsg = NULL;
|
|
|
|
char *res = g_ucs4_to_utf8(str.c_str(), str.length(), &readed,
|
|
&writed, &errMsg);
|
|
if (! res) {
|
|
if (errMsg)
|
|
throw Exception(errMsg);
|
|
else
|
|
throw Exception(L"Error converting text to UTF-8");
|
|
}
|
|
|
|
std::string s(res);
|
|
free(res);
|
|
|
|
return s;
|
|
}
|
|
|
|
std::wstring fromUtf8(const std::string &str)
|
|
{
|
|
long readed, writed;
|
|
const wchar_t *errMsg = NULL;
|
|
|
|
wchar_t *res = g_utf8_to_ucs4(str.c_str(), str.length(), &readed, &writed, &errMsg);
|
|
if (! res) {
|
|
if (errMsg)
|
|
throw Exception(errMsg);
|
|
else
|
|
throw Exception(L"Error converting text from UTF-8");
|
|
}
|
|
|
|
std::wstring s(res);
|
|
free(res);
|
|
|
|
return s;
|
|
}
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
std::string toUtf8(const std::wstring &str)
|
|
{
|
|
if (! str.length())
|
|
return "";
|
|
|
|
int len = str.length();
|
|
int bufSize = (len + 1) * 6 + 1;
|
|
char buf[bufSize];
|
|
int res = WideCharToMultiByte(CP_UTF8, 0, str.c_str(), len + 1,
|
|
buf, bufSize, NULL, NULL);
|
|
|
|
if (! res)
|
|
throw Exception(L"Error converting UCS-2 to UTF-8");
|
|
return buf;
|
|
}
|
|
|
|
std::wstring fromUtf8(const std::string &str)
|
|
{
|
|
if (! str.length())
|
|
return L"";
|
|
|
|
int len = str.length();
|
|
wchar_t buf[len + 1];
|
|
|
|
int res = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), len + 1,
|
|
buf, len + 1);
|
|
if (! res)
|
|
throw Exception(L"Error converting UTF-8 to UCS-2");
|
|
return buf;
|
|
}
|
|
|
|
|
|
std::string toOem(const std::wstring &str)
|
|
{
|
|
if (! str.length())
|
|
return "";
|
|
|
|
int len = str.length();
|
|
int bufSize = (len + 1) * 6 + 1;
|
|
char buf[bufSize];
|
|
int res = WideCharToMultiByte(CP_OEMCP, 0, str.c_str(), len + 1,
|
|
buf, bufSize, NULL, NULL);
|
|
|
|
if (! res)
|
|
throw Exception(L"Error converting UCS-2 to OEM");
|
|
return buf;
|
|
}
|
|
|
|
std::wstring fromOem(const std::string &str)
|
|
{
|
|
if (! str.length())
|
|
return L"";
|
|
|
|
int len = str.length();
|
|
wchar_t buf[len + 1];
|
|
|
|
int res = MultiByteToWideChar(CP_OEMCP, 0, str.c_str(), len + 1,
|
|
buf, len + 1);
|
|
if (! res)
|
|
throw Exception(L"Error converting OEM to UCS-2");
|
|
return buf;
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
std::wstring fromUtf8(const char *str, int len)
|
|
{
|
|
char *buf = (char*)malloc(len + 1);
|
|
if (! buf)
|
|
throw Exception(L"Error allocating memory");
|
|
memcpy(buf, str, len);
|
|
buf[len] = 0;
|
|
std::string s(buf);
|
|
free(buf);
|
|
return fromUtf8(s);
|
|
}
|
|
|
|
|
|
// Converts a wide string to the multibyte encoding of the current C locale.
// If the string as a whole cannot be converted, falls back to converting
// character by character and silently drops every character the locale
// cannot represent.
std::string toMbcs(const std::wstring &str)
{
    int len = str.length();
    if (! len)
        return "";

    int maxSize = MB_CUR_MAX * len;
    // Heap-backed buffer: its size grows with the input, so it must not
    // live on the stack (and variable-length arrays are not standard C++).
    std::string buf(maxSize + 1, '\0');

    size_t l = wcstombs(&buf[0], str.c_str(), maxSize);
    // wcstombs signals failure with (size_t)-1; compare against it
    // directly, since using a failed result as an index would write far
    // outside the buffer.
    if (l != (size_t)-1) {
        buf.resize(l);
        return buf;
    }

    // convert what we can
    std::string res;
    for (int i = 0; i < len; i++) {
        int b = wctomb(&buf[0], str[i]);
        if (0 < b) {
            buf[b] = 0;
            res += buf.c_str();
        }
    }
    return res;
}
|
|
|
|
|
|
// Converts a string in the multibyte encoding of the current C locale to a
// wide string.  Returns an empty string if the input contains a byte
// sequence that is invalid in that encoding.
std::wstring fromMbcs(const std::string &str)
{
    int maxLen = str.length();
    // Heap-backed buffer: its size grows with the input, so it must not
    // live on the stack (and variable-length arrays are not standard C++).
    std::wstring ws(maxLen + 1, L'\0');

    size_t cnt = mbstowcs(&ws[0], str.c_str(), maxLen);
    if (cnt == (size_t)-1) {
        return L"";
    }
    ws.resize(cnt);
    return ws;
}
|
|
|
|
|
|
std::ostream& operator << (std::ostream &stream, const std::wstring &str)
|
|
{
|
|
#ifdef WIN32
|
|
if ((stream == std::cout) || (stream == std::cerr) ||
|
|
(stream == std::clog))
|
|
stream << toOem(str);
|
|
else
|
|
#endif
|
|
stream << toMbcs(str);
|
|
return stream;
|
|
}
|
|
|
|
|
|
int getUtf8Length(unsigned char c)
|
|
{
|
|
int mask, len;
|
|
|
|
UTF8_COMPUTE(c, mask, len);
|
|
|
|
if (-1 == len) {
|
|
throw Exception(L"Invalid utf-8 character");
|
|
} else {
|
|
return len;
|
|
}
|
|
}
|
|
|