PostgreSQL UTF8 和 GB18030编码map文件不完整的问题
背景
http://www.postgresql.org/message-id/20150309205145.4031.32069@wrigleys.postgresql.org
这是社区收到的一个 BUG 报告:在 UTF8 编码的数据库中,将 chr(128512) 转换为 GB18030 时报错,原因是该字符的 UTF8 编码在 MAP 表中找不到对应的 GB18030 编码。
Step to reproduce:
In psql:
arjen=> select convert_to(chr(128512), 'GB18030');
Actual output:
ERROR: character with byte sequence 0xf0 0x9f 0x98 0x80 in encoding "UTF8"
has no equivalent in encoding "GB18030"
Expected output:
convert_to
------------
\x9439fc36
(1 row)
对应的 map 码表在
src/backend/utils/mb/Unicode/utf8_to_gb18030.map
and
src/backend/utils/mb/Unicode/gb18030_to_utf8.map
> hello,
> recently i downloaded postgresql-7.3b5,i found the conversion between
> gb18030 and utf was mistake because the map file bwteen them wasn't
> obviously right.the first byte of utf8 encoding with two bytes shoule
> between 0xc0 with 0xfd,the map file didn't accord with this condition
> .please check it ,i wished that postgresql-7.3 can support the GB18030 and
> can run in China.
> best regards
> jenny wang
Thanks for testing GB18030 support. Yes, the map file is completely
broken. I have attached fixed map generation perl script. Please test
it (I do not understand Chinese).
(1) save the perl script in the postgresql-7.3b5 source tree as:
src/backend/utils/mb/Unicode/UCS_to_GB18030.pl.
(2) run it.
cd src/backend/utils/mb/Unicode
./UCS_to_GB18030.pl
(3) the script will generate
src/backend/utils/mb/Unicode/utf8_to_gb18030.map
and
src/backend/utils/mb/Unicode/gb18030_to_utf8.map
(4) If they look good, rebuild PostgreSQL and test it.
--
Tatsuo Ishii
问题出自:
src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
#include "../../Unicode/gb18030_to_utf8.map"
#include "../../Unicode/utf8_to_gb18030.map"
/*
 * utf8_to_gb18030: conversion procedure from UTF8 to GB18030.
 *
 * Function arguments 2, 3 and 4 are read as the source string, the
 * destination buffer and the source length in bytes.  The actual work is
 * delegated to UtfToLocal() with the ULmapGB18030 table and no
 * combined-character map.
 */
Datum
utf8_to_gb18030(PG_FUNCTION_ARGS)
{
	unsigned char *utf8_in = (unsigned char *) PG_GETARG_CSTRING(2);
	unsigned char *gb_out = (unsigned char *) PG_GETARG_CSTRING(3);
	int			nbytes = PG_GETARG_INT32(4);
	/* number of entries in the UTF8 -> GB18030 table */
	int			map_entries = sizeof(ULmapGB18030) / sizeof(pg_utf_to_local);

	CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030);

	/* no combined map for GB18030: pass NULL / 0 for cmap / size2 */
	UtfToLocal(utf8_in, gb_out,
			   ULmapGB18030, NULL,
			   map_entries, 0,
			   PG_GB18030, nbytes);

	PG_RETURN_VOID();
}
UtfToLocal@src/backend/utils/mb/conv.c
/*
* UTF8 ---> local code
*
* utf: input UTF8 string (need not be null-terminated).
* iso: pointer to the output area (must be large enough!)
* map: the conversion map.
* cmap: the conversion map for combined characters.
* (optional)
* size1: the size of the conversion map.
* size2: the size of the conversion map for combined characters
* (optional)
* encoding: the PG identifier for the local encoding.
* len: length of input string.
*/
void
UtfToLocal(const unsigned char *utf, unsigned char *iso,
const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
int size1, int size2, int encoding, int len)
{
uint32 iutf;
uint32 cutf[2];
uint32 code;
pg_utf_to_local *p;
pg_utf_to_local_combined *cp;
int l;
for (; len > 0; len -= l)
{
/* "break" cases all represent errors */
if (*utf == '\0')
break;
l = pg_utf_mblen(utf);
if (len < l)
break;
if (!pg_utf8_islegal(utf, l))
break;
if (l == 1)
{
/* ASCII case is easy */
*iso++ = *utf++;
continue;
}
else if (l == 2)
{
iutf = *utf++ << 8;
iutf |= *utf++;
}
else if (l == 3)
{
iutf = *utf++ << 16;
iutf |= *utf++ << 8;
iutf |= *utf++;
}
else if (l == 4)
{
iutf = *utf++ << 24;
iutf |= *utf++ << 16;
iutf |= *utf++ << 8;
iutf |= *utf++;
}
else
{
elog(ERROR, "unsupported character length %d", l);
iutf = 0; /* keep compiler quiet */
}
/*
* first, try with combined map if possible
*/
if (cmap && len > l)
{
const unsigned char *utf_save = utf;
int len_save = len;
int l_save = l;
len -= l;
l = pg_utf_mblen(utf);
if (len < l)
break;
:
if (!pg_utf8_islegal(utf, l))
break;
cutf[0] = iutf;
if (l == 1)
{
if (len_save > 1)
{
p = bsearch(&cutf[0], map, size1,
sizeof(pg_utf_to_local), compare1);
if (p == NULL)
report_untranslatable_char(PG_UTF8, encoding,
(const char *) (utf_save - l_save), len_save);
iso = set_iso_code(iso, p->code);
}
/* ASCII case is easy */
*iso++ = *utf++;
continue;
}
else if (l == 2)
{
iutf = *utf++ << 8;
iutf |= *utf++;
}
else if (l == 3)
{
iutf = *utf++ << 16;
iutf |= *utf++ << 8;
iutf |= *utf++;
}
else if (l == 4)
{
iutf = *utf++ << 24;
iutf |= *utf++ << 16;
iutf |= *utf++ << 8;
iutf |= *utf++;
}
else
{
elog(ERROR, "unsupported character length %d", l);
iutf = 0; /* keep compiler quiet */
}
cutf[1] = iutf;
cp = bsearch(cutf, cmap, size2,
sizeof(pg_utf_to_local_combined), compare3);
if (cp)
code = cp->code;
else
{
/* not found in combined map. try with ordinary map */
p = bsearch(&cutf[0], map, size1,
sizeof(pg_utf_to_local), compare1);
if (p == NULL)
report_untranslatable_char(PG_UTF8, encoding,
(const char *) (utf_save - l_save), len_save);
iso = set_iso_code(iso, p->code);
p = bsearch(&cutf[1], map, size1,
sizeof(pg_utf_to_local), compare1);
if (p == NULL)
report_untranslatable_char(PG_UTF8, encoding,
(const char *) (utf - l), len);
code = p->code;
}
}
else /* no cmap or no remaining data */
{
p = bsearch(&iutf, map, size1,
sizeof(pg_utf_to_local), compare1);
if (p == NULL)
report_untranslatable_char(PG_UTF8, encoding,
(const char *) (utf - l), len);
code = p->code;
}
iso = set_iso_code(iso, code);
}
if (len > 0)
report_invalid_encoding(PG_UTF8, (const char *) utf, len);
*iso = '\0';
}
MAP表内容
src/backend/utils/mb/Unicode/utf8_to_gb18030.map
/*
 * Excerpt of the UTF8 -> GB18030 lookup table; the "......" line stands
 * for the entries omitted from this listing.  Each entry pairs the bytes
 * of a UTF8 character, packed into one integer, with the corresponding
 * GB18030 code.
 */
static pg_utf_to_local ULmapGB18030[ 63360 ] = {
{0xc280, 0x81308130},
{0xc281, 0x81308131},
......
{0xefbfbb, 0x8431a435},
{0xefbfbc, 0x8431a436},
{0xefbfbd, 0x8431a437},
{0xefbfbe, 0x8431a438},
{0xefbfbf, 0x8431a439}
};
例子:在 UTF8 编码的数据库中,查看汉字“你”的 bytea 表示。
postgres=# select varcharsend('你');
varcharsend
-------------
\xe4bda0
(1 row)
在map文件中可以找到对应的:
{0xe4bda0, 0xc4e3},
在另一个文件(gb18030_to_utf8.map)中可以找到方向相反的映射,即由 GB18030 编码查 UTF8 编码。
我们看看转换是否正常:
postgres=# select convert_to('你', 'GB18030');
convert_to
------------
\xc4e3
(1 row)
对应的正是map文件中的这条记录。
postgres=# \df convert_to
List of functions
Schema | Name | Result data type | Argument data types | Type
------------+------------+------------------+---------------------+--------
pg_catalog | convert_to | bytea | text, name | normal
(1 row)
如果要完成chr(128512)这个转换,必须把对应的记录新增到这个MAP文件中。
postgres=# select varcharsend(chr(128512));
varcharsend
-------------
\xf09f9880
(1 row)
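新增(在两个 map 文件的数组末尾各追加一条记录。注意:数组声明里写死了元素个数,例如 ULmapGB18030[ 63360 ],追加记录后要相应加 1;并且要保持记录有序,因为 UtfToLocal 是用 bsearch 在表中查找的):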
vi src/backend/utils/mb/Unicode/utf8_to_gb18030.map
{0xefbfbf, 0x8431a439},
{0xf09f9880, 0x9439fc36}
vi src/backend/utils/mb/Unicode/gb18030_to_utf8.map
{0x8431a439, 0xefbfbf},
{0x9439fc36, 0xf09f9880}
这里涉及的问题还很多,因为MAP表要补齐的话,工作量还挺大。