PostgreSQL 字符串 collate 与排序、memcpy优化 - 源码分析

7 minute read

背景

事情的起因是这样的,某个用户问我为什么在GP里面查询到的’A’ > ‘a’和PostgreSQL中查询到的结果不一样,一个是false, 一个true.

但是这个原因其实和Greenplum还是PostgreSQL是没关系的。

原因的根源还是collate。

例如,在使用比较操作符时,可以指定需要比较的字符串的collate来查看这里的区别:

postgres=# select 'a' > 'A' collate "en_US";    
 ?column?     
----------    
 f    
(1 row)    
  
postgres=# select 'a' > 'A' collate "C";    
 ?column?     
----------    
 t    
(1 row)    
  
postgres=# select 'a' > 'A' collate "zh_CN";    
 ?column?     
----------    
 f    
(1 row)    

操作符>的源代码 :

使用C作为collate时,实际上是memcmp直接对被比较的字符串的比较,而使用非C的collate则需要COPY字符串的内存,然后使用strcoll_l或strcoll进行比较。

所以建议数据库初始化时使用C,比如 --lc-collate=C --lc-ctype=en_US.utf8

Datum    
text_gt(PG_FUNCTION_ARGS)    
{    
        text       *arg1 = PG_GETARG_TEXT_PP(0);    
        text       *arg2 = PG_GETARG_TEXT_PP(1);    
        bool            result;    
  
        result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);    
  
        PG_FREE_IF_COPY(arg1, 0);    
        PG_FREE_IF_COPY(arg2, 1);    
  
        PG_RETURN_BOOL(result);    
}    
  
/* text_cmp()    
 * Internal comparison function for text strings.    
 * Returns -1, 0 or 1    
 */    
static int    
text_cmp(text *arg1, text *arg2, Oid collid)    
{    
        char       *a1p,    
                           *a2p;    
        int                     len1,    
                                len2;    
  
        a1p = VARDATA_ANY(arg1);    
        a2p = VARDATA_ANY(arg2);    
  
        len1 = VARSIZE_ANY_EXHDR(arg1);    
        len2 = VARSIZE_ANY_EXHDR(arg2);    
  
        return varstr_cmp(a1p, len1, a2p, len2, collid);    
}    
  
/* varstr_cmp()    
 * Comparison function for text strings with given lengths.    
 * Includes locale support, but must copy strings to temporary memory    
 *  to allow null-termination for inputs to strcoll().    
 * Returns an integer less than, equal to, or greater than zero, indicating    
 * whether arg1 is less than, equal to, or greater than arg2.    
 */    
int    
varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)    
{    
    int         result;    
  
    /*    
     * Unfortunately, there is no strncoll(), so in the non-C locale case we    
     * have to do some memory copying.  This turns out to be significantly    
     * slower, so we optimize the case where LC_COLLATE is C.  We also try to    
     * optimize relatively-short strings by avoiding palloc/pfree overhead.    
     */    
    if (lc_collate_is_c(collid))    
    {    
        result = memcmp(arg1, arg2, Min(len1, len2));    
        if ((result == 0) && (len1 != len2))    
            result = (len1 < len2) ? -1 : 1;    
    }    
    else    
    {    
#define STACKBUFLEN     1024    
  
        char        a1buf[STACKBUFLEN];    
        char        a2buf[STACKBUFLEN];    
        char       *a1p,    
                   *a2p;    
  
#ifdef HAVE_LOCALE_T    
        pg_locale_t mylocale = 0;    
#endif    
  
        if (collid != DEFAULT_COLLATION_OID)    
        {    
            if (!OidIsValid(collid))    
            {    
                /*    
                 * This typically means that the parser could not resolve a    
                 * conflict of implicit collations, so report it that way.    
                 */    
                ereport(ERROR,    
                        (errcode(ERRCODE_INDETERMINATE_COLLATION),    
                         errmsg("could not determine which collation to use for string comparison"),    
                         errhint("Use the COLLATE clause to set the collation explicitly.")));    
            }    
#ifdef HAVE_LOCALE_T    
            mylocale = pg_newlocale_from_collation(collid);    
#endif    
        }    
  
#ifdef WIN32    
        /* Win32 does not have UTF-8, so we need to map to UTF-16 */    
        if (GetDatabaseEncoding() == PG_UTF8)    
        {    
            int         a1len;    
            int         a2len;    
            int         r;    
  
            if (len1 >= STACKBUFLEN / 2)    
            {    
                a1len = len1 * 2 + 2;    
                a1p = palloc(a1len);    
            }    
            else    
            {    
                a1len = STACKBUFLEN;    
                a1p = a1buf;    
            }    
            if (len2 >= STACKBUFLEN / 2)    
            {    
                a2len = len2 * 2 + 2;    
                a2p = palloc(a2len);    
            }    
            else    
            {    
                a2len = STACKBUFLEN;    
                a2p = a2buf;    
            }    
  
            /* stupid Microsloth API does not work for zero-length input */    
            if (len1 == 0)    
                r = 0;    
            else    
            {    
                r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,    
                                        (LPWSTR) a1p, a1len / 2);    
                if (!r)    
                    ereport(ERROR,    
                            (errmsg("could not convert string to UTF-16: error code %lu",    
                                    GetLastError())));    
            }    
            ((LPWSTR) a1p)[r] = 0;    
  
            if (len2 == 0)    
                r = 0;    
            else    
            {    
                r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,    
                                        (LPWSTR) a2p, a2len / 2);    
                if (!r)    
                    ereport(ERROR,    
                            (errmsg("could not convert string to UTF-16: error code %lu",    
                                    GetLastError())));    
            }    
            ((LPWSTR) a2p)[r] = 0;    
  
            errno = 0;    
#ifdef HAVE_LOCALE_T    
            if (mylocale)    
                result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale);    
            else    
#endif    
                result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);    
            if (result == 2147483647)   /* _NLSCMPERROR; missing from mingw    
                                         * headers */    
                ereport(ERROR,    
                        (errmsg("could not compare Unicode strings: %m")));    
  
            /*    
             * In some locales wcscoll() can claim that nonidentical strings    
             * are equal.  Believing that would be bad news for a number of    
             * reasons, so we follow Perl's lead and sort "equal" strings    
             * according to strcmp (on the UTF-8 representation).    
             */    
            if (result == 0)    
            {    
                result = memcmp(arg1, arg2, Min(len1, len2));    
                if ((result == 0) && (len1 != len2))    
                    result = (len1 < len2) ? -1 : 1;    
            }    
  
            if (a1p != a1buf)    
                pfree(a1p);    
            if (a2p != a2buf)    
                pfree(a2p);    
  
            return result;    
        }    
#endif   /* WIN32 */    
  
        if (len1 >= STACKBUFLEN)    
            a1p = (char *) palloc(len1 + 1);    
        else    
            a1p = a1buf;    
        if (len2 >= STACKBUFLEN)    
            a2p = (char *) palloc(len2 + 1);    
        else    
            a2p = a2buf;    
  
        memcpy(a1p, arg1, len1);    
        a1p[len1] = '\0';    
        memcpy(a2p, arg2, len2);    
        a2p[len2] = '\0';    
  
#ifdef HAVE_LOCALE_T    
        if (mylocale)    
            result = strcoll_l(a1p, a2p, mylocale);    
        else    
#endif    
            result = strcoll(a1p, a2p);    
  
        /*    
         * In some locales strcoll() can claim that nonidentical strings are    
         * equal.  Believing that would be bad news for a number of reasons,    
         * so we follow Perl's lead and sort "equal" strings according to    
         * strcmp().    
         */    
        if (result == 0)    
            result = strcmp(a1p, a2p);    
  
        if (a1p != a1buf)    
            pfree(a1p);    
        if (a2p != a2buf)    
            pfree(a2p);    
    }    
  
    return result;    
}    

在创建索引时也需要注意这一点,collate一定要和实际SQL中的collate匹配。

postgres=# create table test(id int , info text collate "zh_CN");    
CREATE TABLE    
postgres=# insert into test select generate_series(1,10000),md5(random()::text);    
INSERT 0 10000    
postgres=# \l+    
                                                               List of databases    
   Name    |  Owner   | Encoding | Collate | Ctype |   Access privileges   |  Size   | Tablespace |                Description                     
-----------+----------+----------+---------+-------+-----------------------+---------+------------+--------------------------------------------    
 postgres  | postgres | UTF8     | C       | C     |                       | 7960 kB | pg_default | default administrative connection database    
 template0 | postgres | UTF8     | C       | C     | =c/postgres          +| 7129 kB | pg_default | unmodifiable empty database    
           |          |          |         |       | postgres=CTc/postgres |         |            |     
 template1 | postgres | UTF8     | C       | C     | =c/postgres          +| 7129 kB | pg_default | default template for new databases    
           |          |          |         |       | postgres=CTc/postgres |         |            |     
(3 rows)    
  
postgres=# create index idx_test on test(info);  -- 建索引不指定collate则使用的是表结构中的collate    
CREATE INDEX    
  
postgres=# explain select * from test where info='abc';  -- 查询是不使用collate则使用的是表结构中的collate    
                              QUERY PLAN                                  
----------------------------------------------------------------------    
 Index Scan using idx_test on test  (cost=0.29..8.30 rows=1 width=37)    
   Index Cond: (info = 'abc'::text)    
(2 rows)    
  
postgres=# explain select * from test where info='abc' collate "C";  -- 与索引不一样的collate, 不能走索引    
                      QUERY PLAN                           
-------------------------------------------------------    
 Seq Scan on test  (cost=0.00..209.00 rows=1 width=37)    
   Filter: (info = 'abc'::text COLLATE "C")    
(2 rows)    
  
postgres=# explain select * from test where info='abc' collate "zh_CN";  -- 与索引一样的collate, 能走索引    
                              QUERY PLAN                                  
----------------------------------------------------------------------    
 Index Scan using idx_test on test  (cost=0.29..8.30 rows=1 width=37)    
   Index Cond: (info = 'abc'::text COLLATE "zh_CN")    
(2 rows)    
  
postgres=# drop index idx_test;    
DROP INDEX    
postgres=# create index idx_test on test(info collate "C");    
CREATE INDEX    
postgres=# explain select * from test where info='abc' collate "zh_CN"; -- 与索引不一样的collate, 不能走索引    
                      QUERY PLAN                           
-------------------------------------------------------    
 Seq Scan on test  (cost=0.00..209.00 rows=1 width=37)    
   Filter: (info = 'abc'::text COLLATE "zh_CN")    
(2 rows)    
  
postgres=# explain select * from test where info='abc' collate "C"; -- 与索引一样的collate, 能走索引    
                              QUERY PLAN                                  
----------------------------------------------------------------------    
 Index Scan using idx_test on test  (cost=0.29..8.30 rows=1 width=37)    
   Index Cond: (info = 'abc'::text COLLATE "C")    
(2 rows)    

性能也有一定的差异:

用collate C显然要好一点。

-> pgbench -M prepared -n -r -P 1 -f ./test.sql -c 24 -j 24 -T 10    
progress: 1.0 s, 341151.9 tps, lat 0.068 ms stddev 0.028    
progress: 2.0 s, 343049.3 tps, lat 0.068 ms stddev 0.032    
progress: 3.0 s, 343784.8 tps, lat 0.068 ms stddev 0.025    
progress: 4.0 s, 342717.9 tps, lat 0.068 ms stddev 0.040    
progress: 5.0 s, 343661.1 tps, lat 0.068 ms stddev 0.027    
progress: 6.0 s, 343550.0 tps, lat 0.068 ms stddev 0.026    
progress: 7.0 s, 343515.6 tps, lat 0.068 ms stddev 0.024    
progress: 8.0 s, 343560.9 tps, lat 0.068 ms stddev 0.029    
progress: 9.0 s, 342749.7 tps, lat 0.068 ms stddev 0.044    
progress: 10.0 s, 343844.8 tps, lat 0.068 ms stddev 0.024    
transaction type: Custom query    
scaling factor: 1    
query mode: prepared    
number of clients: 24    
number of threads: 24    
duration: 10 s    
number of transactions actually processed: 3431607    
latency average: 0.068 ms    
latency stddev: 0.031 ms    
tps = 343145.601594 (including connections establishing)    
tps = 343323.296483 (excluding connections establishing)    
statement latencies in milliseconds:    
        0.067984        select 'A'>'a' collate "C";    
  
  
pgbench -M prepared -n -r -P 1 -f ./test.sql -c 24 -j 24 -T 10    
progress: 1.0 s, 330980.7 tps, lat 0.070 ms stddev 0.027    
progress: 2.0 s, 331546.4 tps, lat 0.071 ms stddev 0.024    
progress: 3.0 s, 333500.7 tps, lat 0.070 ms stddev 0.021    
progress: 4.0 s, 333055.7 tps, lat 0.070 ms stddev 0.027    
progress: 5.0 s, 332833.6 tps, lat 0.070 ms stddev 0.041    
progress: 6.0 s, 329527.4 tps, lat 0.071 ms stddev 0.068    
progress: 7.0 s, 330480.3 tps, lat 0.071 ms stddev 0.078    
progress: 8.0 s, 333344.7 tps, lat 0.070 ms stddev 0.023    
progress: 9.0 s, 333602.3 tps, lat 0.070 ms stddev 0.022    
progress: 10.0 s, 332386.7 tps, lat 0.071 ms stddev 0.039    
transaction type: Custom query    
scaling factor: 1    
query mode: prepared    
number of clients: 24    
number of threads: 24    
duration: 10 s    
number of transactions actually processed: 3321288    
latency average: 0.071 ms    
latency stddev: 0.042 ms    
tps = 332067.898747 (including connections establishing)    
tps = 332233.047809 (excluding connections establishing)    
statement latencies in milliseconds:    
        0.070443        select 'A'>'a' collate "zh_CN";    

参考

http://www.postgresql.org/docs/9.5/static/sql-createtable.html

http://www.postgresql.org/docs/9.5/static/sql-altertable.html

http://www.postgresql.org/docs/9.5/static/sql-expressions.html#SQL-SYNTAX-COLLATE-EXPRS

http://www.postgresql.org/docs/9.5/static/sql-createindex.html

Flag Counter

digoal’s 大量PostgreSQL文章入口