einstein/unicode.cpp

599 lines
13 KiB
C++

#include <wchar.h>
#include <stdlib.h>
#include <string.h>
#ifdef WIN32
#include <windows.h>
#endif
#include "unicode.h"
#include "exceptions.h"
/// Number of bytes (1..6) needed to store the character code Char as UTF-8.
/// Char appears several times in the expansion, so it must be free of
/// side effects.
#define UTF8_LENGTH(Char)            \
    ((Char) >= 0x4000000 ? 6 :       \
    ((Char) >= 0x200000  ? 5 :       \
    ((Char) >= 0x10000   ? 4 :       \
    ((Char) >= 0x800     ? 3 :       \
    ((Char) >= 0x80      ? 2 : 1)))))
/// Classifies Char as the first byte of a UTF-8 sequence.
///
/// On return Len holds the total sequence length (1..6) and Mask holds the
/// bit mask that extracts the payload bits from the first byte.  If Char is
/// not a valid first byte (a continuation byte, 0xfe or 0xff) Len is set to
/// -1 and Mask is left untouched.
///
/// The body is wrapped in do { } while (0) so that the macro behaves as a
/// single statement (safe inside an un-braced if/else), and every argument
/// is parenthesised so that expressions may be passed.  Char is evaluated
/// several times and must have no side effects.
#define UTF8_COMPUTE(Char, Mask, Len)            \
    do {                                         \
        if ((Char) < 128)                        \
        {                                        \
            (Len) = 1;                           \
            (Mask) = 0x7f;                       \
        }                                        \
        else if (((Char) & 0xe0) == 0xc0)        \
        {                                        \
            (Len) = 2;                           \
            (Mask) = 0x1f;                       \
        }                                        \
        else if (((Char) & 0xf0) == 0xe0)        \
        {                                        \
            (Len) = 3;                           \
            (Mask) = 0x0f;                       \
        }                                        \
        else if (((Char) & 0xf8) == 0xf0)        \
        {                                        \
            (Len) = 4;                           \
            (Mask) = 0x07;                       \
        }                                        \
        else if (((Char) & 0xfc) == 0xf8)        \
        {                                        \
            (Len) = 5;                           \
            (Mask) = 0x03;                       \
        }                                        \
        else if (((Char) & 0xfe) == 0xfc)        \
        {                                        \
            (Len) = 6;                           \
            (Mask) = 0x01;                       \
        }                                        \
        else                                     \
            (Len) = -1;                          \
    } while (0)
#ifndef WIN32
/* Lookup table indexed by the value of the first byte of a UTF-8 sequence.
 * Each entry is the number of bytes to step over to reach the next
 * character: 1 for 0x00-0xBF, 2 for 0xC0-0xDF, 3 for 0xE0-0xEF,
 * 4 for 0xF0-0xF7, 5 for 0xF8-0xFB, 6 for 0xFC-0xFD and 1 for 0xFE-0xFF.
 * Continuation bytes and the invalid bytes 0xFE/0xFF therefore advance by a
 * single byte. */
static const char utf8_skip_data[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
/* Pointer alias for the table above; this is what g_utf8_next_char indexes. */
const char * const g_utf8_skip = utf8_skip_data;
/* Yields a pointer to the character following the one at p by adding the
 * skip-table entry for the byte at p.  No validation is done and p is
 * evaluated twice.  The result is a plain char * even when p points to
 * const data. */
#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(unsigned char *)(p)])
/* Decodes the UTF-8 sequence of Len bytes starting at Chars into Result.
 *
 * Mask selects the payload bits of the first byte (as produced by
 * UTF8_COMPUTE); Count is a caller-supplied integer used as the loop index.
 * Each following byte must have the form 10xxxxxx and contributes six bits.
 * If one does not, Result is set to -1 and decoding stops at that byte.
 *
 * The body is wrapped in do { } while (0) so the two statements it consists
 * of always execute together and the macro can be used as a single statement
 * (for example in an un-braced if/else).
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)             \
    do {                                                      \
        (Result) = (Chars)[0] & (Mask);                       \
        for ((Count) = 1; (Count) < (Len); ++(Count))         \
        {                                                     \
            if (((Chars)[(Count)] & 0xc0) != 0x80)            \
            {                                                 \
                (Result) = -1;                                \
                break;                                        \
            }                                                 \
            (Result) <<= 6;                                   \
            (Result) |= ((Chars)[(Count)] & 0x3f);            \
        }                                                     \
    } while (0)
/* Bounded, validating variant of g_utf8_get_char.
 *
 * Decodes the character starting at p, looking at no more than max_len
 * bytes.  Returns the decoded character, (wchar_t)-1 when the bytes do not
 * form a valid sequence (bad first byte, bad continuation byte, or a longer
 * encoding than the value needs), and (wchar_t)-2 when the sequence is cut
 * short by max_len or by a terminating zero byte.
 */
static inline wchar_t
g_utf8_get_char_extended (const char *p,
                          size_t max_len)
{
    const unsigned char *bytes = (const unsigned char *) p;
    wchar_t wc = bytes[0];
    unsigned int len;

    /* Plain ASCII needs no further work. */
    if (wc < 0x80)
        return wc;

    /* A continuation byte, 0xfe or 0xff can never start a sequence. */
    if (wc < 0xc0 || wc >= 0xfe)
        return (wchar_t)-1;

    /* Derive the sequence length and keep only the payload bits. */
    if (wc < 0xe0)
    {
        len = 2;
        wc &= 0x1f;
    }
    else if (wc < 0xf0)
    {
        len = 3;
        wc &= 0x0f;
    }
    else if (wc < 0xf8)
    {
        len = 4;
        wc &= 0x07;
    }
    else if (wc < 0xfc)
    {
        len = 5;
        wc &= 0x03;
    }
    else
    {
        len = 6;
        wc &= 0x01;
    }

    /* The sequence does not fit into the available bytes: it is "partial"
     * if everything that is present looks like a continuation byte, and
     * invalid otherwise. */
    if (len > max_len)
    {
        for (unsigned int k = 1; k < max_len; k++)
        {
            if ((bytes[k] & 0xc0) != 0x80)
                return (wchar_t)-1;
        }
        return (wchar_t)-2;
    }

    for (unsigned int k = 1; k < len; ++k)
    {
        const wchar_t ch = bytes[k];
        if ((ch & 0xc0) != 0x80)
        {
            /* A zero byte means the string ended in mid-sequence. */
            return ch ? (wchar_t)-1 : (wchar_t)-2;
        }
        wc <<= 6;
        wc |= (ch & 0x3f);
    }

    /* Reject encodings that use more bytes than the value requires. */
    if (UTF8_LENGTH(wc) != len)
        return (wchar_t)-1;
    return wc;
}
/**
 * g_utf8_get_char:
 * @p: a pointer to a character encoded as UTF-8
 *
 * Decodes the UTF-8 sequence that starts at @p.  The sequence length is
 * taken from the first byte and that many bytes are read without any
 * bounds check, stopping early only at a byte that is not a continuation
 * byte.  Use this on input that is already known to be well formed.
 *
 * Return value: the decoded character, or (wchar_t)-1 if the first byte
 *               cannot start a sequence or a following byte is not a
 *               continuation byte.
 **/
wchar_t
g_utf8_get_char (const char *p)
{
    const unsigned char lead = (unsigned char) *p;
    int payload_mask;
    int seq_len;

    /* Classify the first byte. */
    if (lead < 128)
    {
        seq_len = 1;
        payload_mask = 0x7f;
    }
    else if ((lead & 0xe0) == 0xc0)
    {
        seq_len = 2;
        payload_mask = 0x1f;
    }
    else if ((lead & 0xf0) == 0xe0)
    {
        seq_len = 3;
        payload_mask = 0x0f;
    }
    else if ((lead & 0xf8) == 0xf0)
    {
        seq_len = 4;
        payload_mask = 0x07;
    }
    else if ((lead & 0xfc) == 0xf8)
    {
        seq_len = 5;
        payload_mask = 0x03;
    }
    else if ((lead & 0xfe) == 0xfc)
    {
        seq_len = 6;
        payload_mask = 0x01;
    }
    else
    {
        return (wchar_t)-1;
    }

    /* Accumulate six payload bits from every following byte. */
    wchar_t value = p[0] & payload_mask;
    for (int k = 1; k < seq_len; ++k)
    {
        if ((p[k] & 0xc0) != 0x80)
            return (wchar_t)-1;
        value <<= 6;
        value |= (p[k] & 0x3f);
    }
    return value;
}
/**
 * g_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum number of bytes of @str to use. If @len < 0, then
 *       the string is nul-terminated.
 * @items_read: location to store number of bytes read, or %NULL.
 *       If %NULL, a trailing partial character is reported as an
 *       error; otherwise conversion silently stops in front of it.
 *       If an invalid sequence is found, its byte offset is stored
 *       here.  On allocation failure 0 is stored.
 * @items_written: location to store number of characters written or %NULL.
 *       The value stored does not include the trailing 0
 *       character.  It is only written on success.
 * @error: location to store a pointer to a static wide-string
 *       message describing the failure, or %NULL to ignore errors.
 *
 * Convert a string from UTF-8 to a fixed width wchar_t
 * representation. A trailing 0 will be added to the
 * string after the converted text.
 *
 * Return value: a pointer to a string allocated with malloc(), which the
 *       caller must release with free(). If an error occurs,
 *       %NULL is returned and @error (when given) is set.
 **/
wchar_t *
g_utf8_to_ucs4 (const char *str,
                long len,
                long *items_read,
                long *items_written,
                const wchar_t **error)
{
    wchar_t *result = NULL;
    int n_chars, i;
    const char *in;

    /* Pass 1: validate the input and count the characters. */
    in = str;
    n_chars = 0;
    while ((len < 0 || in - str < len) && *in)
    {
        /* For nul-terminated input there is no byte limit; avoid forming
         * the out-of-range pointer "str + len" with a negative len. */
        const size_t remaining = (len < 0) ? (size_t)-1
                                           : (size_t)(len - (in - str));
        wchar_t wc = g_utf8_get_char_extended (in, remaining);
        if (wc & 0x80000000)
        {
            if (wc == (wchar_t)-2)
            {
                /* A caller that asked for items_read can resume later, so
                 * a partial trailing character is not an error for it. */
                if (items_read)
                    break;
                else if (error)
                    *error = L"Partial character sequence at end of input";
            }
            else if (error)
                *error = L"Invalid byte sequence in conversion input";
            goto err_out;
        }
        n_chars++;
        in = g_utf8_next_char (in);
    }

    /* Pass 2: decode into a freshly allocated buffer. */
    in = str;
    result = (wchar_t*)malloc((n_chars + 1) * sizeof(wchar_t));
    if (!result)
    {
        if (error)
            *error = L"Error allocating memory";
        goto err_out;
    }
    for (i = 0; i < n_chars; i++)
    {
        result[i] = g_utf8_get_char (in);
        in = g_utf8_next_char (in);
    }
    result[i] = 0;
    if (items_written)
        *items_written = n_chars;

err_out:
    if (items_read)
        *items_read = in - str;
    return result;
}
/**
 * g_unichar_to_utf8:
 * @c: the character code to encode
 * @outbuf: destination with room for at least 6 bytes, or %NULL to
 *          only compute the encoded length.
 *
 * Encodes one character as UTF-8. No terminating zero byte is written.
 *
 * Return value: the number of bytes the encoding occupies (1..6)
 **/
int
g_unichar_to_utf8 (wchar_t c,
                   char *outbuf)
{
    int lead_bits;
    int nbytes;

    /* Choose the sequence length and the marker bits of the first byte. */
    if (c >= 0x4000000)
    {
        nbytes = 6;
        lead_bits = 0xfc;
    }
    else if (c >= 0x200000)
    {
        nbytes = 5;
        lead_bits = 0xf8;
    }
    else if (c >= 0x10000)
    {
        nbytes = 4;
        lead_bits = 0xf0;
    }
    else if (c >= 0x800)
    {
        nbytes = 3;
        lead_bits = 0xe0;
    }
    else if (c >= 0x80)
    {
        nbytes = 2;
        lead_bits = 0xc0;
    }
    else
    {
        nbytes = 1;
        lead_bits = 0;
    }

    if (outbuf)
    {
        /* Emit the continuation bytes back to front, six bits at a time;
         * whatever remains goes into the first byte. */
        for (char *slot = outbuf + nbytes - 1; slot != outbuf; --slot)
        {
            *slot = (c & 0x3f) | 0x80;
            c >>= 6;
        }
        *outbuf = c | lead_bits;
    }
    return nbytes;
}
/**
 * g_ucs4_to_utf8:
 * @str: a string of fixed width wchar_t characters
 * @len: the maximum length of @str to use, in characters. If @len < 0,
 *       then the string is terminated with a 0 character.
 * @items_read: location to store number of characters read, or %NULL.
 *       If an out-of-range character is found, its index is stored
 *       here.  On allocation failure 0 is stored.
 * @items_written: location to store number of bytes written or %NULL.
 *       The value stored does not include the trailing 0
 *       byte.  It is only written on success.
 * @error: location to store a pointer to a static wide-string
 *       message describing the failure, or %NULL to ignore errors.
 *
 * Convert a string from a fixed width wchar_t representation
 * to UTF-8. The result will be terminated with a 0 byte.
 *
 * Return value: a pointer to a string allocated with malloc(), which the
 *       caller must release with free(). If an error occurs,
 *       %NULL is returned and @error (when given) is set.
 **/
char *
g_ucs4_to_utf8 (const wchar_t *str,
                long len,
                long *items_read,
                long *items_written,
                const wchar_t **error)
{
    int result_length = 0;
    char *result = NULL;
    char *p;
    int i;

    /* Pass 1: validate the input and add up the encoded size. */
    for (i = 0; len < 0 || i < len; i++)
    {
        if (!str[i])
            break;
        if ((unsigned)str[i] >= 0x80000000)
        {
            /* i is the index of the offending character; it is reported
             * through items_read at err_out. */
            if (error)
                *error = L"Character out of range for UTF-8";
            goto err_out;
        }
        result_length += UTF8_LENGTH (str[i]);
    }

    /* Pass 2: encode into a freshly allocated buffer. */
    i = 0;
    result = (char*)malloc (result_length + 1);
    if (!result)
    {
        if (error)
            *error = L"Error allocating memory";
        goto err_out;
    }
    p = result;
    while (p < result + result_length)
        p += g_unichar_to_utf8 (str[i++], p);
    *p = '\0';
    if (items_written)
        *items_written = p - result;

err_out:
    if (items_read)
        *items_read = i;
    return result;
}
/// Converts a wide string to UTF-8 by way of g_ucs4_to_utf8().
/// Throws Exception carrying the converter's message (or a generic one)
/// when the conversion fails.
std::string toUtf8(const std::wstring &str)
{
    const wchar_t *failure = NULL;
    long consumed, produced;

    char *utf8 = g_ucs4_to_utf8(str.c_str(), str.length(), &consumed,
                                &produced, &failure);
    if (utf8) {
        // Copy into a std::string, then release the malloc'ed buffer.
        std::string converted(utf8);
        free(utf8);
        return converted;
    }

    if (failure)
        throw Exception(failure);
    throw Exception(L"Error converting text to UTF-8");
}
/// Converts a UTF-8 string to a wide string by way of g_utf8_to_ucs4().
/// Throws Exception carrying the converter's message (or a generic one)
/// when the conversion fails.
std::wstring fromUtf8(const std::string &str)
{
    const wchar_t *failure = NULL;
    long consumed, produced;

    wchar_t *wide = g_utf8_to_ucs4(str.c_str(), str.length(), &consumed,
                                   &produced, &failure);
    if (wide) {
        // Copy into a std::wstring, then release the malloc'ed buffer.
        std::wstring converted(wide);
        free(wide);
        return converted;
    }

    if (failure)
        throw Exception(failure);
    throw Exception(L"Error converting text from UTF-8");
}
#else
/// Converts a wide string to UTF-8 with WideCharToMultiByte(CP_UTF8).
/// Returns an empty string for empty input and throws Exception when the
/// API reports failure.  The result ends at the first zero character.
std::string toUtf8(const std::wstring &str)
{
    if (str.empty())
        return "";
    const int len = str.length();
    const int bufSize = (len + 1) * 6 + 1;
    // Heap-backed buffer: a variable-length array is not standard C++ and
    // could exhaust the stack for long strings.
    std::string buf(bufSize, '\0');
    // len + 1 makes the API convert the terminating zero as well.
    const int res = WideCharToMultiByte(CP_UTF8, 0, str.c_str(), len + 1,
                                        &buf[0], bufSize, NULL, NULL);
    if (! res)
        throw Exception(L"Error converting UCS-2 to UTF-8");
    return buf.c_str();
}
/// Converts a UTF-8 string to a wide string with
/// MultiByteToWideChar(CP_UTF8).  Returns an empty string for empty input
/// and throws Exception when the API reports failure.  The result ends at
/// the first zero character.
std::wstring fromUtf8(const std::string &str)
{
    if (str.empty())
        return L"";
    const int len = str.length();
    // Heap-backed buffer: a variable-length array is not standard C++ and
    // could exhaust the stack for long strings.
    std::wstring buf(len + 1, L'\0');
    // len + 1 makes the API convert the terminating zero as well.
    const int res = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), len + 1,
                                        &buf[0], len + 1);
    if (! res)
        throw Exception(L"Error converting UTF-8 to UCS-2");
    return buf.c_str();
}
/// Converts a wide string to the OEM code page with
/// WideCharToMultiByte(CP_OEMCP).  Returns an empty string for empty input
/// and throws Exception when the API reports failure.  The result ends at
/// the first zero character.
std::string toOem(const std::wstring &str)
{
    if (str.empty())
        return "";
    const int len = str.length();
    const int bufSize = (len + 1) * 6 + 1;
    // Heap-backed buffer: a variable-length array is not standard C++ and
    // could exhaust the stack for long strings.
    std::string buf(bufSize, '\0');
    // len + 1 makes the API convert the terminating zero as well.
    const int res = WideCharToMultiByte(CP_OEMCP, 0, str.c_str(), len + 1,
                                        &buf[0], bufSize, NULL, NULL);
    if (! res)
        throw Exception(L"Error converting UCS-2 to OEM");
    return buf.c_str();
}
/// Converts a string in the OEM code page to a wide string with
/// MultiByteToWideChar(CP_OEMCP).  Returns an empty string for empty input
/// and throws Exception when the API reports failure.  The result ends at
/// the first zero character.
std::wstring fromOem(const std::string &str)
{
    if (str.empty())
        return L"";
    const int len = str.length();
    // Heap-backed buffer: a variable-length array is not standard C++ and
    // could exhaust the stack for long strings.
    std::wstring buf(len + 1, L'\0');
    // len + 1 makes the API convert the terminating zero as well.
    const int res = MultiByteToWideChar(CP_OEMCP, 0, str.c_str(), len + 1,
                                        &buf[0], len + 1);
    if (! res)
        throw Exception(L"Error converting OEM to UCS-2");
    return buf.c_str();
}
#endif
/// Converts the first len bytes of the UTF-8 buffer str to a wide string.
/// The buffer does not have to be zero-terminated; if it contains a zero
/// byte within the first len bytes, only the part in front of it is
/// converted.  A null pointer or a non-positive len is treated as empty
/// input.  Errors are reported by fromUtf8(const std::string &).
std::wstring fromUtf8(const char *str, int len)
{
    if (! str || len <= 0)
        return fromUtf8(std::string());
    // Build the std::string straight from the caller's buffer instead of
    // going through a temporary malloc'ed copy, which leaked when the
    // string constructor threw.
    const void *nul = memchr(str, 0, len);
    const size_t used = nul ? (size_t)((const char *)nul - str) : (size_t)len;
    return fromUtf8(std::string(str, used));
}
/// Converts a wide string to the multibyte encoding of the current C
/// locale.  If the string as a whole cannot be converted, the characters
/// are converted one by one and those that have no representation are
/// dropped.  Conversion of a fully convertible string ends at the first
/// zero character.
std::string toMbcs(const std::wstring &str)
{
    const int len = str.length();
    if (! len)
        return "";

    const int maxSize = MB_CUR_MAX * len;
    // Heap-backed scratch buffer (one extra byte for a terminator); a
    // variable-length array is not standard C++.
    std::string buf(maxSize + 1, '\0');
    const size_t l = wcstombs(&buf[0], str.c_str(), maxSize);
    if ((size_t)-1 == l) { // convert what we can
        std::string res;
        for (int i = 0; i < len; i++) {
            const int b = wctomb(&buf[0], str[i]);
            if (0 < b) {
                buf[b] = 0;
                res += buf.c_str();
            }
        }
        return res;
    }
    // wcstombs stored exactly l bytes in front of the terminator.
    buf.resize(l);
    return buf;
}
/// Converts a string in the multibyte encoding of the current C locale to
/// a wide string.  Returns an empty string when the input contains an
/// invalid multibyte sequence.  Conversion ends at the first zero byte.
std::wstring fromMbcs(const std::string &str)
{
    const size_t maxLen = str.length();
    // Heap-backed buffer: a variable-length array is not standard C++ and
    // overflowed the stack for large inputs.
    std::wstring ws(maxLen + 1, L'\0');
    const size_t cnt = mbstowcs(&ws[0], str.c_str(), maxLen);
    if (cnt == (size_t)-1) {
        return L"";
    }
    // mbstowcs stored exactly cnt characters.
    ws.resize(cnt);
    return ws;
}
/// Writes a wide string to a narrow stream.  On WIN32 the text is converted
/// with toOem() when the target is std::cout, std::cerr or std::clog; in
/// every other case it is converted with toMbcs().
std::ostream& operator << (std::ostream &stream, const std::wstring &str)
{
#ifdef WIN32
    // Compare object addresses: streams have no operator==, and the old
    // implicit conversion to void* no longer exists since C++11.
    if ((&stream == &std::cout) || (&stream == &std::cerr) ||
            (&stream == &std::clog))
        stream << toOem(str);
    else
#endif
        stream << toMbcs(str);
    return stream;
}
/// Returns the length in bytes (1..6) of the UTF-8 sequence whose first
/// byte is c.  Throws Exception if c cannot start a sequence.
int getUtf8Length(unsigned char c)
{
    if (c < 128)
        return 1;
    if ((c & 0xe0) == 0xc0)
        return 2;
    if ((c & 0xf0) == 0xe0)
        return 3;
    if ((c & 0xf8) == 0xf0)
        return 4;
    if ((c & 0xfc) == 0xf8)
        return 5;
    if ((c & 0xfe) == 0xfc)
        return 6;
    throw Exception(L"Invalid utf-8 character");
}