PostgreSQL UTF8 和 GB18030编码map文件不完整的问题

4 minute read

背景

http://www.postgresql.org/message-id/20150309205145.4031.32069@wrigleys.postgresql.org

上面的链接是社区收到的一个BUG报告:在UTF8编码的数据库中,将chr(128512)转换为GB18030时报错,原因是该字符的UTF8编码在MAP表中找不到对应的GB18030编码。

Step to reproduce:  
  
In psql:  
  
arjen=> select convert_to(chr(128512), 'GB18030');  
  
Actual output:  
  
ERROR:  character with byte sequence 0xf0 0x9f 0x98 0x80 in encoding "UTF8"  
has no equivalent in encoding "GB18030"  
  
Expected output:  
  
 convert_to  
------------  
 \x9439fc36  
(1 row)  

对应的 map 码表在

    src/backend/utils/mb/Unicode/utf8_to_gb18030.map  
    and  
    src/backend/utils/mb/Unicode/gb18030_to_utf8.map  

其实早在 PostgreSQL 7.3 beta 阶段,就有用户反馈过这两个 map 文件的问题,下面是当时的邮件以及 Tatsuo Ishii 的回复:

> hello,  
>   recently i downloaded postgresql-7.3b5,i found the conversion between  
> gb18030 and utf was mistake  because the map file bwteen them wasn't  
> obviously right.the first byte of utf8 encoding  with two bytes shoule  
> between 0xc0 with 0xfd,the map file didn't accord with this condition  
> .please check it ,i wished that postgresql-7.3 can support the GB18030 and  
> can run in China.  
>                                       best regards  
>                                                         jenny wang  
  
Thanks for testing GB18030 support. Yes, the map file is completely  
broken. I have attached fixed map generation perl script. Please test  
it (I do not understand Chinese).  
  
(1) save the perl script in the postgresql-7.3b5 source tree as:  
    src/backend/utils/mb/Unicode/UCS_to_GB18030.pl.  
  
(2) run it.  
  
    cd src/backend/utils/mb/Unicode  
    ./UCS_to_GB18030.pl  
  
(3) the script will generate  
    src/backend/utils/mb/Unicode/utf8_to_gb18030.map  
    and  
    src/backend/utils/mb/Unicode/gb18030_to_utf8.map  
  
(4) If they look good, rebuild PostgreSQL and test it.  
--  
Tatsuo Ishii  

问题出自:

src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c

#include "../../Unicode/gb18030_to_utf8.map"  
#include "../../Unicode/utf8_to_gb18030.map"  
  
/*  
 * Conversion procedure: UTF8 ---> GB18030.  
 *  
 * Argument 2 is the source string, argument 3 the destination buffer and  
 * argument 4 the source length in bytes.  The actual work is delegated to  
 * UtfToLocal() using the ULmapGB18030 table; no combined-character map is  
 * supplied (NULL / size 0).  
 */  
Datum  
utf8_to_gb18030(PG_FUNCTION_ARGS)  
{  
        int                     len = PG_GETARG_INT32(4);  
        unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);  
        unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);  
        /* number of entries in the UTF8 -> GB18030 map */  
        int                     map_entries = sizeof(ULmapGB18030) / sizeof(pg_utf_to_local);  
  
        CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030);  
  
        UtfToLocal(src, dest, ULmapGB18030, NULL, map_entries, 0, PG_GB18030, len);  
  
        PG_RETURN_VOID();  
}  

其中调用的 UtfToLocal 定义在 src/backend/utils/mb/conv.c 中:

/*  
 * UTF8 ---> local code  
 *  
 * utf: input UTF8 string (need not be null-terminated).  
 * iso: pointer to the output area (must be large enough!)  
 * map: the conversion map.  
 * cmap: the conversion map for combined characters.  
 *                (optional)  
 * size1: the size of the conversion map.  
 * size2: the size of the conversion map for combined characters  
 *                (optional)  
 * encoding: the PG identifier for the local encoding.  
 * len: length of input string.  
 *  
 * Each multibyte UTF8 character is packed, most significant byte first,  
 * into a uint32 and that value is used as the bsearch() key into "map"  
 * (and, as a pair of keys, into "cmap").  Single-byte characters are  
 * copied through unchanged.  
 *  
 * A character that is not found in the map is reported through  
 * report_untranslatable_char(); input that stops early (embedded NUL,  
 * truncated or illegal sequence) is reported through  
 * report_invalid_encoding().  On success the output is null-terminated.  
 */  
void  
UtfToLocal(const unsigned char *utf, unsigned char *iso,  
                   const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,  
                   int size1, int size2, int encoding, int len)  
{  
        uint32          iutf;  
        uint32          cutf[2];  
        uint32          code;  
        pg_utf_to_local *p;  
        pg_utf_to_local_combined *cp;  
        int                     l;  
  
        /* "l" is the byte length of the last character consumed in the body */  
        for (; len > 0; len -= l)  
        {  
                /* "break" cases all represent errors */  
                if (*utf == '\0')  
                        break;  
  
                l = pg_utf_mblen(utf);  
  
                if (len < l)  
                        break;  
  
                if (!pg_utf8_islegal(utf, l))  
                        break;  
  
                if (l == 1)  
                {  
                        /* ASCII case is easy */  
                        *iso++ = *utf++;  
                        continue;  
                }  
                else if (l == 2)  
                {  
                        iutf = *utf++ << 8;  
                        iutf |= *utf++;  
                }  
                else if (l == 3)  
                {  
                        iutf = *utf++ << 16;  
                        iutf |= *utf++ << 8;  
                        iutf |= *utf++;  
                }  
                else if (l == 4)  
                {  
                        iutf = *utf++ << 24;  
                        iutf |= *utf++ << 16;  
                        iutf |= *utf++ << 8;  
                        iutf |= *utf++;  
                }  
                else  
                {  
                        elog(ERROR, "unsupported character length %d", l);  
                        iutf = 0;                       /* keep compiler quiet */  
                }  
  
                /*  
                 * first, try with combined map if possible  
                 */  
                if (cmap && len > l)  
                {  
                        /*  
                         * Remember the state just after the first character so that  
                         * error reports can point back at it (utf_save - l_save is  
                         * the start of the first character).  
                         */  
                        const unsigned char *utf_save = utf;  
                        int                     len_save = len;  
                        int                     l_save = l;  
  
                        /*  
                         * Account for the first character here; the loop increment  
                         * will account for the second one.  
                         */  
                        len -= l;  
  
                        l = pg_utf_mblen(utf);  
                        if (len < l)  
                                break;  
  
                        if (!pg_utf8_islegal(utf, l))  
                                break;  
  
                        cutf[0] = iutf;  
  
                        if (l == 1)  
                        {  
                                /*  
                                 * The second character is ASCII, so it cannot form a  
                                 * combined pair: emit the first character via the  
                                 * ordinary map, then copy the ASCII byte.  
                                 */  
                                if (len_save > 1)  
                                {  
                                        p = bsearch(&cutf[0], map, size1,  
                                                                sizeof(pg_utf_to_local), compare1);  
                                        if (p == NULL)  
                                                report_untranslatable_char(PG_UTF8, encoding,  
                                                           (const char *) (utf_save - l_save), len_save);  
                                        iso = set_iso_code(iso, p->code);  
                                }  
  
                                /* ASCII case is easy */  
                                *iso++ = *utf++;  
                                continue;  
                        }  
                        else if (l == 2)  
                        {  
                                iutf = *utf++ << 8;  
                                iutf |= *utf++;  
                        }  
                        else if (l == 3)  
                        {  
                                iutf = *utf++ << 16;  
                                iutf |= *utf++ << 8;  
                                iutf |= *utf++;  
                        }  
                        else if (l == 4)  
                        {  
                                iutf = *utf++ << 24;  
                                iutf |= *utf++ << 16;  
                                iutf |= *utf++ << 8;  
                                iutf |= *utf++;  
                        }  
                        else  
                        {  
                                elog(ERROR, "unsupported character length %d", l);  
                                iutf = 0;               /* keep compiler quiet */  
                        }  
  
                        /* look the two-character sequence up as one unit */  
                        cutf[1] = iutf;  
                        cp = bsearch(cutf, cmap, size2,  
                                                 sizeof(pg_utf_to_local_combined), compare3);  
                        if (cp)  
                                code = cp->code;  
                        else  
                        {  
                                /* not found in combined map. try with ordinary map */  
                                p = bsearch(&cutf[0], map, size1,  
                                                        sizeof(pg_utf_to_local), compare1);  
                                if (p == NULL)  
                                        report_untranslatable_char(PG_UTF8, encoding,  
                                                           (const char *) (utf_save - l_save), len_save);  
                                iso = set_iso_code(iso, p->code);  
  
                                p = bsearch(&cutf[1], map, size1,  
                                                        sizeof(pg_utf_to_local), compare1);  
                                if (p == NULL)  
                                        report_untranslatable_char(PG_UTF8, encoding,  
                                                                                           (const char *) (utf - l), len);  
                                code = p->code;  
                        }  
                }  
                else    /* no cmap or no remaining data */  
                {  
                        p = bsearch(&iutf, map, size1,  
                                                sizeof(pg_utf_to_local), compare1);  
                        if (p == NULL)  
                                report_untranslatable_char(PG_UTF8, encoding,  
                                                                                   (const char *) (utf - l), len);  
                        code = p->code;  
                }  
                iso = set_iso_code(iso, code);  
        }  
  
        /* leftover input means the loop was left through one of the "break"s */  
        if (len > 0)  
                report_invalid_encoding(PG_UTF8, (const char *) utf, len);  
  
        *iso = '\0';  
}  

MAP表内容

src/backend/utils/mb/Unicode/utf8_to_gb18030.map

static pg_utf_to_local ULmapGB18030[ 63360 ] = {  
  {0xc280, 0x81308130},  
  {0xc281, 0x81308131},  
......  
  {0xefbfbb, 0x8431a435},  
  {0xefbfbc, 0x8431a436},  
  {0xefbfbd, 0x8431a437},  
  {0xefbfbe, 0x8431a438},  
  {0xefbfbf, 0x8431a439}  
};  

例如,先查看汉字“你”在UTF8数据库中的字节(bytea)表示。

postgres=# select varcharsend('你');  
 varcharsend   
-------------  
 \xe4bda0  
(1 row)  

在map文件中可以找到对应的:

  {0xe4bda0, 0xc4e3},  

在另一个文件(gb18030_to_utf8.map)中可以找到方向相反的映射。

我们看看转换是否正常:

postgres=# select convert_to('你', 'GB18030');  
 convert_to   
------------  
 \xc4e3  
(1 row)  

对应的正是map文件中的这条记录。

postgres=# \df convert_to  
                             List of functions  
   Schema   |    Name    | Result data type | Argument data types |  Type    
------------+------------+------------------+---------------------+--------  
 pg_catalog | convert_to | bytea            | text, name          | normal  
(1 row)  

如果要完成chr(128512)这个转换,必须把对应的记录新增到这两个MAP文件中。

postgres=# select varcharsend(chr(128512));  
 varcharsend   
-------------  
 \xf09f9880  
(1 row)  

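新增(注意两点:其一,UtfToLocal 使用 bsearch 查找,MAP 中的记录必须保持按编码值升序排列,这里新记录比现有最后一条大,所以追加在末尾即可;其二,数组声明了固定长度,例如 ULmapGB18030[ 63360 ],新增记录后需要把该长度相应加大):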

vi src/backend/utils/mb/Unicode/utf8_to_gb18030.map  
  {0xefbfbf, 0x8431a439},  
  {0xf09f9880, 0x9439fc36}  
vi src/backend/utils/mb/Unicode/gb18030_to_utf8.map   
  {0x8431a439, 0xefbfbf},  
  {0x9439fc36, 0xf09f9880}  

这里涉及的问题还很多,因为MAP表要补齐的话,工作量还挺大。

Flag Counter

digoal’s 大量PostgreSQL文章入口