PostgreSQL json jsonb 支持的value数据类型,如何构造一个jsonb

6 minute read

背景

PostgreSQL 支持JSON已久,那么JSON的value中支持哪些类型呢?

目前支持string, numeric, boolean (true, false), null。 区分大小写哦。

/*  
 *      Recursive Descent parse routines. There is one for each structural  
 *      element in a json document:  
 *        - scalar (string, number, true, false, null)  
 *        - array  ( [ ] )  
 *        - array element  
 *        - object ( { } )  
 *        - object field  
 */  
static inline void  
parse_scalar(JsonLexContext *lex, JsonSemAction *sem)  
{  
        char       *val = NULL;  
        json_scalar_action sfunc = sem->scalar;  
        char      **valaddr;  
        JsonTokenType tok = lex_peek(lex);  
  
        valaddr = sfunc == NULL ? NULL : &val;  
  
        /* a scalar must be a string, a number, true, false, or null */  
        switch (tok)  
        {  
                case JSON_TOKEN_TRUE:  
                        lex_accept(lex, JSON_TOKEN_TRUE, valaddr);  
                        break;  
                case JSON_TOKEN_FALSE:  
                        lex_accept(lex, JSON_TOKEN_FALSE, valaddr);  
                        break;  
                case JSON_TOKEN_NULL:  
                        lex_accept(lex, JSON_TOKEN_NULL, valaddr);  
                        break;  
                case JSON_TOKEN_NUMBER:  
                        lex_accept(lex, JSON_TOKEN_NUMBER, valaddr);  
                        break;  
                case JSON_TOKEN_STRING:  
                        lex_accept(lex, JSON_TOKEN_STRING, valaddr);  
                        break;  
                default:  
                        report_parse_error(JSON_PARSE_VALUE, lex);  
        }  
  
        if (sfunc != NULL)  
                (*sfunc) (sem->semstate, val, tok);  
}  

例子:

postgres=# select jsonb '{"a": TRUE}';  
ERROR:  invalid input syntax for type json  
LINE 1: select jsonb '{"a": TRUE}';  
                     ^  
DETAIL:  Token "TRUE" is invalid.  
CONTEXT:  JSON data, line 1: {"a": TRUE...  
postgres=# select jsonb '{"a": true}';  
    jsonb      
-------------  
 {"a": true}  
(1 row)  
  
postgres=# select jsonb '{"a": false}';  
    jsonb       
--------------  
 {"a": false}  
(1 row)  
  
postgres=# select jsonb '{"a": NULL}';  
ERROR:  invalid input syntax for type json  
LINE 1: select jsonb '{"a": NULL}';  
                     ^  
DETAIL:  Token "NULL" is invalid.  
CONTEXT:  JSON data, line 1: {"a": NULL...  
postgres=# select jsonb '{"a": null}';  
    jsonb      
-------------  
 {"a": null}  
(1 row)  
  
postgres=# select jsonb '{"a": 10.001}';  
     jsonb       
---------------  
 {"a": 10.001}  
(1 row)  
  
postgres=# select jsonb '{"a": "10.001"}';  
      jsonb        
-----------------  
 {"a": "10.001"}  
(1 row)  

src/backend/utils/adt/json.c

解析json,jsonb时,定义的token类型,这个和JSONB里面能存什么类型没有关系:

/*  
 * The context of the parser is maintained by the recursive descent  
 * mechanism, but is passed explicitly to the error reporting routine  
 * for better diagnostics.  
 */  
typedef enum                                    /* contexts of JSON parser */  
{  
        JSON_PARSE_VALUE,                       /* expecting a value */  
        JSON_PARSE_STRING,                      /* expecting a string (for a field name) */  
        JSON_PARSE_ARRAY_START,         /* saw '[', expecting value or ']' */  
        JSON_PARSE_ARRAY_NEXT,          /* saw array element, expecting ',' or ']' */  
        JSON_PARSE_OBJECT_START,        /* saw '{', expecting label or '}' */  
        JSON_PARSE_OBJECT_LABEL,        /* saw object label, expecting ':' */  
        JSON_PARSE_OBJECT_NEXT         /* saw object value, expecting ',' or '}' */  
        JSON_PARSE_OBJECT_COMMA,        /* saw object ',', expecting next label */  
        JSON_PARSE_END                          /* saw the end of a document, expect nothing */  
} JsonParseContext;  

实际上jsonb里没有类型的概念,你前面看到的TOKEN实际上也就是字符串匹配而已,你可以理解为jsonb内部就是一个有一定规则的字符串。

那么我们怎么来构造一个JSONB类型,并且在使用时可以很好的处理里面的数据类型呢?

这里需要用到PostgreSQL的输入输出函数,一般每个类型都会有对应的in和out函数,例如:

 pg_catalog | anyrange_in                     | anyrange         | cstring, oid, integer                                          | normal  
 pg_catalog | anyrange_out                    | cstring          | anyrange                                                       | normal  
 pg_catalog | timestamptz_out               | cstring                     | timestamp with time zone                                                                                  | normal  
 pg_catalog | timestamptz_in                | timestamp with time zone    | cstring, oid, integer                                                                                     | normal  
/* timestamptz_in()  
 * Convert a string to internal form.  
 */  
Datum  
timestamptz_in(PG_FUNCTION_ARGS)  
{  
        char       *str = PG_GETARG_CSTRING(0);  
  
#ifdef NOT_USED  
        Oid                     typelem = PG_GETARG_OID(1);  
#endif  
        int32           typmod = PG_GETARG_INT32(2);  
  
postgres=# select timestamptz_out(now());  
        timestamptz_out          
-------------------------------  
 2015-09-24 19:44:16.076233+08  
(1 row)  
  
postgres=# select oid,* from pg_type where typname='timestamptz';  
-[ RECORD 1 ]--+---------------------  
oid            | 1184  
typname        | timestamptz  
typnamespace   | 11  
typowner       | 10  
typlen         | 8  
typbyval       | t  
typtype        | b  
typcategory    | D  
typispreferred | t  
typisdefined   | t  
typdelim       | ,  
typrelid       | 0  
typelem        | 0  
typarray       | 1185  
typinput       | timestamptz_in  
typoutput      | timestamptz_out  
typreceive     | timestamptz_recv  
typsend        | timestamptz_send  
typmodin       | timestamptztypmodin  
typmodout      | timestamptztypmodout  
typanalyze     | -  
typalign       | d  
typstorage     | p  
typnotnull     | f  
typbasetype    | 0  
typtypmod      | -1  
typndims       | 0  
typcollation   | 0  
typdefaultbin  |   
typdefault     |   
typacl         |   

typmode表示精度

postgres=# select timestamptz_in('2015-09-24 19:44:16.076233+08',1184,1);  
-[ RECORD 1 ]--+-------------------------  
timestamptz_in | 2015-09-24 19:44:16.1+08  
postgres=# select timestamptz_in('2015-09-24 19:44:16.076233+08',1184,6);  
-[ RECORD 1 ]--+------------------------------  
timestamptz_in | 2015-09-24 19:44:16.076233+08  

使用format格式来构造jsonb类型是比较靠谱的。

代码见varlena.c

/*  
 * Returns a formatted string  
 */  
Datum  
text_format(PG_FUNCTION_ARGS)  
{  
......  

例子

构造一个jsonb的cstring

postgres=# select format('{"K": "%s"}', int8range(1,10));  
     format        
-----------------  
 {"K": "[1,10)"}  
(1 row)  

使用jsonb_in把cstring转换为JSONB

postgres=# select jsonb_in( format('{"K": "%s"}', int8range(1,10))::cstring );  
    jsonb_in       
-----------------  
 {"K": "[1,10)"}  
(1 row)  

取出K的element

postgres=# select jsonb_in( format('{"K": "%s"}', int8range(1,10))::cstring ) ->> 'K';  
 ?column?   
----------  
 [1,10)  
(1 row)  

将element转换回去

postgres=# select (jsonb_in( format('{"K": "%s"}', int8range(1,10))::cstring ) ->> 'K') :: int8range;  
 int8range   
-----------  
 [1,10)  
(1 row)  

再举个例子

postgres=# select format('{"K": "%s"}', '你好'::bytea);  
         format            
-------------------------  
 {"K": "\xe4bda0e5a5bd"}  
(1 row)  
postgres=# select convert_from('\xe4bda0e5a5bd', 'utf8');  
 convert_from   
--------------  
 你好  
(1 row)  
  
postgres=# \set VERBOSITY verbose  
postgres=# select jsonb_in( format('{"K": "%s"}', '你好'::bytea)::cstring );  
ERROR:  22P02: invalid input syntax for type json  
DETAIL:  Escape sequence "\x" is invalid.  
CONTEXT:  JSON data, line 1: {"K": "\x...  
LOCATION:  json_lex_string, json.c:890  

非常遗憾,JSONB会处理escape,而\x是不合法的,报错了。这块代码可以改一改。

代码如下:

/*  
 * The next token in the input stream is known to be a string; lex it.  
 */  
static inline void  
json_lex_string(JsonLexContext *lex)  
{  
......  
                                switch (*s)  
                                {  
                                        case '"':  
                                        case '\\':  
                                        case '/':  
                                                appendStringInfoChar(lex->strval, *s);  
                                                break;  
                                        case 'b':  
                                                appendStringInfoChar(lex->strval, '\b');  
                                                break;  
                                        case 'f':  
                                                appendStringInfoChar(lex->strval, '\f');  
                                                break;  
                                        case 'n':  
                                                appendStringInfoChar(lex->strval, '\n');  
                                                break;  
                                        case 'r':  
                                                appendStringInfoChar(lex->strval, '\r');  
                                                break;  
                                        case 't':  
                                                appendStringInfoChar(lex->strval, '\t');  
                                                break;  
                                        default:  
                                                /* Not a valid string escape, so error out. */  
                                                lex->token_terminator = s + pg_mblen(s);  
                                                ereport(ERROR,  
                                                                (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),  
                                                                 errmsg("invalid input syntax for type json"),  
                                                        errdetail("Escape sequence \"\\%s\" is invalid.",  
                                                                          extract_mb_char(s)),  
                                                                 report_json_context(lex)));  
                                }  

修改后,可以实现存储BYTEA,但是输出时还需要转换一下。

postgres=# select jsonb_in( format('{"K": "%s"}', '你好'::bytea)::cstring ) ->> 'K';  
   ?column?     
--------------  
 e4bda0e5a5bd  
(1 row)  
  
postgres=# select convert_from( byteain (('\x'||(jsonb_in( format('{"K": "%s"}', '你好'::bytea)::cstring ) ->> 'K'))::cstring), 'utf8'::name);  
 convert_from   
--------------  
 你好  
(1 row)  

用format来构造一个字符串,在里面使用%s填充需要的cstring,从而达到构造jsonb的目的。

这种方法同样适用构造JSONB中包含postgis的几何类型。

参考

1. http://www.postgresql.org/docs/9.4/static/functions-string.html

2. src/backend/utils/adt/json.c

/*  
 * Lex one token from the input stream.  
 */  
static inline void  
json_lex(JsonLexContext *lex)  
{  
        char       *s;  
        int                     len;  
  
        /* Skip leading whitespace. */  
        s = lex->token_terminator;  
        len = s - lex->input;  
        while (len < lex->input_length &&  
                   (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))  
        {  
                if (*s == '\n')  
                        ++lex->line_number;  
                ++s;  
                ++len;  
        }  
        lex->token_start = s;  
  
        /* Determine token type. */  
        if (len >= lex->input_length)  
        {  
                lex->token_start = NULL;  
                lex->prev_token_terminator = lex->token_terminator;  
                lex->token_terminator = s;  
                lex->token_type = JSON_TOKEN_END;  
        }  
        else  
                switch (*s)  
                {  
                                /* Single-character token, some kind of punctuation mark. */  
                        case '{':  
                                lex->prev_token_terminator = lex->token_terminator;  
                                lex->token_terminator = s + 1;  
                                lex->token_type = JSON_TOKEN_OBJECT_START;  
                                lex->token_terminator = s + 1;  
                                lex->token_type = JSON_TOKEN_OBJECT_START;  
                                break;  
                        case '}':  
                                lex->prev_token_terminator = lex->token_terminator;  
                                lex->token_terminator = s + 1;  
                                lex->token_type = JSON_TOKEN_OBJECT_END;  
                                break;  
                        case '[':  
                                lex->prev_token_terminator = lex->token_terminator;  
                                lex->token_terminator = s + 1;  
                                lex->token_type = JSON_TOKEN_ARRAY_START;  
                                break;  
                        case ']':  
                                lex->prev_token_terminator = lex->token_terminator;  
                                lex->token_terminator = s + 1;  
                                lex->token_type = JSON_TOKEN_ARRAY_END;  
                                break;  
                        case ',':  
                                lex->prev_token_terminator = lex->token_terminator;  
                                lex->token_terminator = s + 1;  
                                lex->token_type = JSON_TOKEN_COMMA;  
                                break;  
                        case ':':  
                                lex->prev_token_terminator = lex->token_terminator;  
                                lex->token_terminator = s + 1;  
                                lex->token_type = JSON_TOKEN_COLON;  
                                break;  
                        case '"':  
                                /* string */  
                                json_lex_string(lex);  
                                lex->token_type = JSON_TOKEN_STRING;  
                                break;  
                        case '-':  
                                /* Negative number. */  
                                json_lex_number(lex, s + 1, NULL);  
                                lex->token_type = JSON_TOKEN_NUMBER;  
                                break;  
                        case '0':  
                        case '1':  
                        case '2':  
                        case '3':  
                        case '4':  
                        case '5':  
                        case '6':  
                        case '7':  
                        case '8':  
                        case '9':  
                                /* Positive number. */  
                                json_lex_number(lex, s, NULL);  
                                lex->token_type = JSON_TOKEN_NUMBER;  
                                break;  
                        default:  
                                {  
                                        char       *p;  
                                {  
                                        char       *p;  
  
                                        /*  
                                         * We're not dealing with a string, number, legal  
                                         * punctuation mark, or end of string.  The only legal  
                                         * tokens we might find here are true, false, and null,  
                                         * but for error reporting purposes we scan until we see a  
                                         * non-alphanumeric character.  That way, we can report  
                                         * the whole word as an unexpected token, rather than just  
                                         * some unintuitive prefix thereof.  
                                         */  
                                        for (p = s; p - s < lex->input_length - len && JSON_ALPHANUMERIC_CHAR(*p); p++)  
                                                 /* skip */ ;  
  
                                        /*  
                                         * We got some sort of unexpected punctuation or an  
                                         * otherwise unexpected character, so just complain about  
                                         * that one character.  
                                         */  
                                        if (p == s)  
                                        {  
                                                lex->prev_token_terminator = lex->token_terminator;  
                                                lex->token_terminator = s + 1;  
                                                report_invalid_token(lex);  
                                        }  
  
                                        /*  
                                         * We've got a real alphanumeric token here.  If it  
                                         * happens to be true, false, or null, all is well.  If  
                                         * not, error out.  
                                         */  
                                        lex->prev_token_terminator = lex->token_terminator;  
                                        lex->token_terminator = p;  
                                        if (p - s == 4)  
                                        {  
                                                if (memcmp(s, "true", 4) == 0)  
                                                        lex->token_type = JSON_TOKEN_TRUE;  
                                                else if (memcmp(s, "null", 4) == 0)  
                                                        lex->token_type = JSON_TOKEN_NULL;  
                                                else  
                                                        report_invalid_token(lex);  
                                        }  
                                        else if (p - s == 5 && memcmp(s, "false", 5) == 0)  
                                                lex->token_type = JSON_TOKEN_FALSE;  
                                        else  
                                                report_invalid_token(lex);  
  
                                }  
                }                                               /* end of switch */  
}  

Flag Counter

digoal’s 大量PostgreSQL文章入口