PostgreSQL json jsonb 支持的value数据类型,如何构造一个jsonb
背景
PostgreSQL 支持JSON已久,那么JSON的value中支持哪些类型呢?
目前支持string, numeric, boolean (true, false), null。 区分大小写哦。
/*
* Recursive Descent parse routines. There is one for each structural
* element in a json document:
* - scalar (string, number, true, false, null)
* - array ( [ ] )
* - array element
* - object ( { } )
* - object field
*/
static inline void
parse_scalar(JsonLexContext *lex, JsonSemAction *sem)
{
char *val = NULL;
json_scalar_action sfunc = sem->scalar;
char **valaddr;
JsonTokenType tok = lex_peek(lex);
valaddr = sfunc == NULL ? NULL : &val;
/* a scalar must be a string, a number, true, false, or null */
switch (tok)
{
case JSON_TOKEN_TRUE:
lex_accept(lex, JSON_TOKEN_TRUE, valaddr);
break;
case JSON_TOKEN_FALSE:
lex_accept(lex, JSON_TOKEN_FALSE, valaddr);
break;
case JSON_TOKEN_NULL:
lex_accept(lex, JSON_TOKEN_NULL, valaddr);
break;
case JSON_TOKEN_NUMBER:
lex_accept(lex, JSON_TOKEN_NUMBER, valaddr);
break;
case JSON_TOKEN_STRING:
lex_accept(lex, JSON_TOKEN_STRING, valaddr);
break;
default:
report_parse_error(JSON_PARSE_VALUE, lex);
}
if (sfunc != NULL)
(*sfunc) (sem->semstate, val, tok);
}
例子:
postgres=# select jsonb '{"a": TRUE}';
ERROR: invalid input syntax for type json
LINE 1: select jsonb '{"a": TRUE}';
^
DETAIL: Token "TRUE" is invalid.
CONTEXT: JSON data, line 1: {"a": TRUE...
postgres=# select jsonb '{"a": true}';
jsonb
-------------
{"a": true}
(1 row)
postgres=# select jsonb '{"a": false}';
jsonb
--------------
{"a": false}
(1 row)
postgres=# select jsonb '{"a": NULL}';
ERROR: invalid input syntax for type json
LINE 1: select jsonb '{"a": NULL}';
^
DETAIL: Token "NULL" is invalid.
CONTEXT: JSON data, line 1: {"a": NULL...
postgres=# select jsonb '{"a": null}';
jsonb
-------------
{"a": null}
(1 row)
postgres=# select jsonb '{"a": 10.001}';
jsonb
---------------
{"a": 10.001}
(1 row)
postgres=# select jsonb '{"a": "10.001"}';
jsonb
-----------------
{"a": "10.001"}
(1 row)
src/backend/utils/adt/json.c
解析json,jsonb时,定义的token类型,这个和JSONB里面能存什么类型没有关系:
/*
* The context of the parser is maintained by the recursive descent
* mechanism, but is passed explicitly to the error reporting routine
* for better diagnostics.
*/
typedef enum /* contexts of JSON parser */
{
JSON_PARSE_VALUE, /* expecting a value */
JSON_PARSE_STRING, /* expecting a string (for a field name) */
JSON_PARSE_ARRAY_START, /* saw '[', expecting value or ']' */
JSON_PARSE_ARRAY_NEXT, /* saw array element, expecting ',' or ']' */
JSON_PARSE_OBJECT_START, /* saw '{', expecting label or '}' */
JSON_PARSE_OBJECT_LABEL, /* saw object label, expecting ':' */
JSON_PARSE_OBJECT_NEXT /* saw object value, expecting ',' or '}' */
JSON_PARSE_OBJECT_COMMA, /* saw object ',', expecting next label */
JSON_PARSE_END /* saw the end of a document, expect nothing */
} JsonParseContext;
实际上jsonb里没有类型的概念,你前面看到的TOKEN实际上也就是字符串匹配而已,你可以理解为jsonb内部就是一个有一定规则的字符串。
那么我们怎么来构造一个JSONB类型,并且在使用时可以很好的处理里面的数据类型呢?
这里需要用到PostgreSQL的输入输出函数,一般每个类型都会有对应的in和out函数,例如:
pg_catalog | anyrange_in | anyrange | cstring, oid, integer | normal
pg_catalog | anyrange_out | cstring | anyrange | normal
pg_catalog | timestamptz_out | cstring | timestamp with time zone | normal
pg_catalog | timestamptz_in | timestamp with time zone | cstring, oid, integer | normal
/* timestamptz_in()
* Convert a string to internal form.
*/
Datum
timestamptz_in(PG_FUNCTION_ARGS)
{
char *str = PG_GETARG_CSTRING(0);
#ifdef NOT_USED
Oid typelem = PG_GETARG_OID(1);
#endif
int32 typmod = PG_GETARG_INT32(2);
postgres=# select timestamptz_out(now());
timestamptz_out
-------------------------------
2015-09-24 19:44:16.076233+08
(1 row)
postgres=# select oid,* from pg_type where typname='timestamptz';
-[ RECORD 1 ]--+---------------------
oid | 1184
typname | timestamptz
typnamespace | 11
typowner | 10
typlen | 8
typbyval | t
typtype | b
typcategory | D
typispreferred | t
typisdefined | t
typdelim | ,
typrelid | 0
typelem | 0
typarray | 1185
typinput | timestamptz_in
typoutput | timestamptz_out
typreceive | timestamptz_recv
typsend | timestamptz_send
typmodin | timestamptztypmodin
typmodout | timestamptztypmodout
typanalyze | -
typalign | d
typstorage | p
typnotnull | f
typbasetype | 0
typtypmod | -1
typndims | 0
typcollation | 0
typdefaultbin |
typdefault |
typacl |
typmode表示精度
postgres=# select timestamptz_in('2015-09-24 19:44:16.076233+08',1184,1);
-[ RECORD 1 ]--+-------------------------
timestamptz_in | 2015-09-24 19:44:16.1+08
postgres=# select timestamptz_in('2015-09-24 19:44:16.076233+08',1184,6);
-[ RECORD 1 ]--+------------------------------
timestamptz_in | 2015-09-24 19:44:16.076233+08
使用format格式来构造jsonb类型是比较靠谱的。
代码见varlena.c
/*
* Returns a formatted string
*/
Datum
text_format(PG_FUNCTION_ARGS)
{
......
例子
构造一个jsonb的cstring
postgres=# select format('{"K": "%s"}', int8range(1,10));
format
-----------------
{"K": "[1,10)"}
(1 row)
使用jsonb_in把cstring转换为JSONB
postgres=# select jsonb_in( format('{"K": "%s"}', int8range(1,10))::cstring );
jsonb_in
-----------------
{"K": "[1,10)"}
(1 row)
取出K的element
postgres=# select jsonb_in( format('{"K": "%s"}', int8range(1,10))::cstring ) ->> 'K';
?column?
----------
[1,10)
(1 row)
将element转换回去
postgres=# select (jsonb_in( format('{"K": "%s"}', int8range(1,10))::cstring ) ->> 'K') :: int8range;
int8range
-----------
[1,10)
(1 row)
再举个例子
postgres=# select format('{"K": "%s"}', '你好'::bytea);
format
-------------------------
{"K": "\xe4bda0e5a5bd"}
(1 row)
postgres=# select convert_from('\xe4bda0e5a5bd', 'utf8');
convert_from
--------------
你好
(1 row)
postgres=# \set VERBOSITY verbose
postgres=# select jsonb_in( format('{"K": "%s"}', '你好'::bytea)::cstring );
ERROR: 22P02: invalid input syntax for type json
DETAIL: Escape sequence "\x" is invalid.
CONTEXT: JSON data, line 1: {"K": "\x...
LOCATION: json_lex_string, json.c:890
非常遗憾,JSONB会处理escape,而\x是不合法的,报错了。这块代码可以改一改。
代码如下:
/*
* The next token in the input stream is known to be a string; lex it.
*/
static inline void
json_lex_string(JsonLexContext *lex)
{
......
switch (*s)
{
case '"':
case '\\':
case '/':
appendStringInfoChar(lex->strval, *s);
break;
case 'b':
appendStringInfoChar(lex->strval, '\b');
break;
case 'f':
appendStringInfoChar(lex->strval, '\f');
break;
case 'n':
appendStringInfoChar(lex->strval, '\n');
break;
case 'r':
appendStringInfoChar(lex->strval, '\r');
break;
case 't':
appendStringInfoChar(lex->strval, '\t');
break;
default:
/* Not a valid string escape, so error out. */
lex->token_terminator = s + pg_mblen(s);
ereport(ERROR,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type json"),
errdetail("Escape sequence \"\\%s\" is invalid.",
extract_mb_char(s)),
report_json_context(lex)));
}
修改后,可以实现存储BYTEA,但是输出时还需要转换一下。
postgres=# select jsonb_in( format('{"K": "%s"}', '你好'::bytea)::cstring ) ->> 'K';
?column?
--------------
e4bda0e5a5bd
(1 row)
postgres=# select convert_from( byteain (('\x'||(jsonb_in( format('{"K": "%s"}', '你好'::bytea)::cstring ) ->> 'K'))::cstring), 'utf8'::name);
convert_from
--------------
你好
(1 row)
用format来构造一个字符串,在里面使用%s填充需要的cstring,从而达到构造jsonb的目的。
这种方法同样适用构造JSONB中包含postgis的几何类型。
参考
1. http://www.postgresql.org/docs/9.4/static/functions-string.html
2. src/backend/utils/adt/json.c
/*
* Lex one token from the input stream.
*/
static inline void
json_lex(JsonLexContext *lex)
{
char *s;
int len;
/* Skip leading whitespace. */
s = lex->token_terminator;
len = s - lex->input;
while (len < lex->input_length &&
(*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
{
if (*s == '\n')
++lex->line_number;
++s;
++len;
}
lex->token_start = s;
/* Determine token type. */
if (len >= lex->input_length)
{
lex->token_start = NULL;
lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = s;
lex->token_type = JSON_TOKEN_END;
}
else
switch (*s)
{
/* Single-character token, some kind of punctuation mark. */
case '{':
lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = s + 1;
lex->token_type = JSON_TOKEN_OBJECT_START;
lex->token_terminator = s + 1;
lex->token_type = JSON_TOKEN_OBJECT_START;
break;
case '}':
lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = s + 1;
lex->token_type = JSON_TOKEN_OBJECT_END;
break;
case '[':
lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = s + 1;
lex->token_type = JSON_TOKEN_ARRAY_START;
break;
case ']':
lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = s + 1;
lex->token_type = JSON_TOKEN_ARRAY_END;
break;
case ',':
lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = s + 1;
lex->token_type = JSON_TOKEN_COMMA;
break;
case ':':
lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = s + 1;
lex->token_type = JSON_TOKEN_COLON;
break;
case '"':
/* string */
json_lex_string(lex);
lex->token_type = JSON_TOKEN_STRING;
break;
case '-':
/* Negative number. */
json_lex_number(lex, s + 1, NULL);
lex->token_type = JSON_TOKEN_NUMBER;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
/* Positive number. */
json_lex_number(lex, s, NULL);
lex->token_type = JSON_TOKEN_NUMBER;
break;
default:
{
char *p;
{
char *p;
/*
* We're not dealing with a string, number, legal
* punctuation mark, or end of string. The only legal
* tokens we might find here are true, false, and null,
* but for error reporting purposes we scan until we see a
* non-alphanumeric character. That way, we can report
* the whole word as an unexpected token, rather than just
* some unintuitive prefix thereof.
*/
for (p = s; p - s < lex->input_length - len && JSON_ALPHANUMERIC_CHAR(*p); p++)
/* skip */ ;
/*
* We got some sort of unexpected punctuation or an
* otherwise unexpected character, so just complain about
* that one character.
*/
if (p == s)
{
lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = s + 1;
report_invalid_token(lex);
}
/*
* We've got a real alphanumeric token here. If it
* happens to be true, false, or null, all is well. If
* not, error out.
*/
lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = p;
if (p - s == 4)
{
if (memcmp(s, "true", 4) == 0)
lex->token_type = JSON_TOKEN_TRUE;
else if (memcmp(s, "null", 4) == 0)
lex->token_type = JSON_TOKEN_NULL;
else
report_invalid_token(lex);
}
else if (p - s == 5 && memcmp(s, "false", 5) == 0)
lex->token_type = JSON_TOKEN_FALSE;
else
report_invalid_token(lex);
}
} /* end of switch */
}