PostgreSQL 空间、多维 序列 生成方法
背景
数据库的一维序列是很好理解的东西,就是在一个维度上自增。
那么二维、多维序列怎么理解呢?显然就是在多个维度上齐头并进的自增咯。
二维序列
以二维序列为例,应该是这样增长的:
0,0
0,1
1,0
1,1
1,2
2,1
2,2
...
那么如何生成以上二维序列呢?实际上可以利用数据库的多个一维序列来生成。
create sequence seq1;
create sequence seq2;
create or replace function seq_2d() returns point[] as $$
declare
res point[];
begin
select array_cat(res, array[point(nextval('seq1'), nextval('seq2'))]) into res;
select array_cat(res, array[point(currval('seq1')+1, currval('seq2'))]) into res;
select array_cat(res, array[point(currval('seq1'), currval('seq2')+1)]) into res;
return res;
end;
$$ language plpgsql strict;
测试
test=# select seq_2d();
seq_2d
---------------------------
{"(1,1)","(2,1)","(1,2)"}
(1 row)
test=# select seq_2d();
seq_2d
---------------------------
{"(2,2)","(3,2)","(2,3)"}
(1 row)
test=# select seq_2d();
seq_2d
---------------------------
{"(3,3)","(4,3)","(3,4)"}
(1 row)
三维序列
三维序列的生成方法类似:
0,0,0
1,0,0
0,1,0
0,0,1
1,1,0
0,1,1
1,0,1
1,1,1
2,1,1
1,2,1
1,1,2
2,2,1
1,2,2
2,1,2
2,2,2
......
create sequence seq1;
create sequence seq2;
create sequence seq3;
create extension cube;
create or replace function seq_3d() returns cube[] as $$
declare
res cube[];
begin
select array_cat(res, array[cube(array[nextval('seq1'), nextval('seq2'), nextval('seq3')])]) into res;
select array_cat(res, array[cube(array[currval('seq1')+1, currval('seq2'), currval('seq3')])]) into res;
select array_cat(res, array[cube(array[currval('seq1'), currval('seq2')+1, currval('seq3')])]) into res;
select array_cat(res, array[cube(array[currval('seq1'), currval('seq2'), currval('seq3')+1])]) into res;
select array_cat(res, array[cube(array[currval('seq1')+1, currval('seq2')+1, currval('seq3')])]) into res;
select array_cat(res, array[cube(array[currval('seq1'), currval('seq2')+1, currval('seq3')+1])]) into res;
select array_cat(res, array[cube(array[currval('seq1')+1, currval('seq2'), currval('seq3')+1])]) into res;
return res;
end;
$$ language plpgsql strict;
例子
test=# select seq_3d();
seq_3d
---------------------------------------------------------------------------------------
{"(1, 1, 1)","(2, 1, 1)","(1, 2, 1)","(1, 1, 2)","(2, 2, 1)","(1, 2, 2)","(2, 1, 2)"}
(1 row)
test=# select seq_3d();
seq_3d
---------------------------------------------------------------------------------------
{"(2, 2, 2)","(3, 2, 2)","(2, 3, 2)","(2, 2, 3)","(3, 3, 2)","(2, 3, 3)","(3, 2, 3)"}
(1 row)
多维序列
以此类推,可以得到多维序列。
多维数据的空间存放和BRIN块级索引
前面讲到了空间聚集存储,如果数据按空间顺序存放,使用BRIN块级索引,可以在任意维度上得到最好的查询效率,真正做到一个块级索引支持任意列的高效过滤。
例子
create sequence seq1;
create sequence seq2;
create sequence seq3;
create table tbl(c1 int, c2 int, c3 int);
create or replace function cluster_insert() returns void as $$
declare
begin
insert into tbl values (nextval('seq1'), nextval('seq2'), nextval('seq3'));
insert into tbl values (currval('seq1')+1, currval('seq2'), currval('seq3'));
insert into tbl values (currval('seq1'), currval('seq2')+1, currval('seq3'));
insert into tbl values (currval('seq1'), currval('seq2'), currval('seq3')+1);
insert into tbl values (currval('seq1')+1, currval('seq2')+1, currval('seq3'));
insert into tbl values (currval('seq1'), currval('seq2')+1, currval('seq3')+1);
insert into tbl values (currval('seq1')+1, currval('seq2'), currval('seq3')+1);
end;
$$ language plpgsql strict;
压测,写入大量数据
vi test.sql
select count(*) from (select cluster_insert() from generate_series(1,100)) t;
pgbench -M prepared -n -r -P 1 -f ./test.sql -c 32 -j 32 -T 1200
检查多维聚集性
test=# select * from tbl limit 10;
c1 | c2 | c3
---------+---------+---------
1992652 | 1992653 | 1992652
1992573 | 1992574 | 1992578
1992574 | 1992574 | 1992578
1992573 | 1992575 | 1992578
1992573 | 1992574 | 1992579
1992574 | 1992575 | 1992578
1992573 | 1992575 | 1992579
1992574 | 1992574 | 1992579
1992658 | 1992658 | 1992658
1992659 | 1992658 | 1992658
(10 rows)
创建BRIN块级索引。
create index idx on tbl using brin (c1,c2,c3);
test=# \dt+ tbl
List of relations
Schema | Name | Type | Owner | Size | Description
--------+---------------------+-------+----------+------------+-------------
public | tbl | table | postgres | 97 GB |
(1 row)
test=# \di+ idx
List of relations
Schema | Name | Type | Owner | Table | Size | Description
--------+------+-------+----------+-------+--------+-------------
public | idx | index | postgres | tbl | 456 kB |
(1 row)
看看456KB的索引,在97 GB的数据层面,查询效率如何。
任意列、组合查询过滤性。
explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000;
explain (analyze,verbose,timing,costs,buffers) select * from tbl where c2 between 1 and 1000;
explain (analyze,verbose,timing,costs,buffers) select * from tbl where c3 between 1 and 1000;
explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c2 between 100 and 2000;
explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c3 between 100 and 2000;
explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c2 between 100 and 2000 and c3 between 1 and 2000;
test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000;
------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on public.tbl (cost=650.23..31623.80 rows=1 width=12) (actual time=27.302..50.284 rows=6997 loops=1)
Output: c1, c2, c3
Recheck Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000))
Rows Removed by Index Recheck: 229803
Heap Blocks: lossy=1280
Buffers: shared hit=1942
-> Bitmap Index Scan on idx (cost=0.00..650.23 rows=23810 width=0) (actual time=26.881..26.881 rows=12800 loops=1)
Index Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000))
Buffers: shared hit=662
Planning time: 0.095 ms
Execution time: 50.636 ms
(11 rows)
test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c2 between 1 and 1000;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on public.tbl (cost=650.23..31623.80 rows=1 width=12) (actual time=27.886..49.011 rows=6997 loops=1)
Output: c1, c2, c3
Recheck Cond: ((tbl.c2 >= 1) AND (tbl.c2 <= 1000))
Rows Removed by Index Recheck: 229803
Heap Blocks: lossy=1280
Buffers: shared hit=1942
-> Bitmap Index Scan on idx (cost=0.00..650.23 rows=23810 width=0) (actual time=27.512..27.512 rows=12800 loops=1)
Index Cond: ((tbl.c2 >= 1) AND (tbl.c2 <= 1000))
Buffers: shared hit=662
Planning time: 0.040 ms
Execution time: 49.348 ms
(11 rows)
test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c3 between 1 and 1000;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on public.tbl (cost=650.23..31623.80 rows=1 width=12) (actual time=25.238..46.292 rows=6997 loops=1)
Output: c1, c2, c3
Recheck Cond: ((tbl.c3 >= 1) AND (tbl.c3 <= 1000))
Rows Removed by Index Recheck: 229803
Heap Blocks: lossy=1280
Buffers: shared hit=1942
-> Bitmap Index Scan on idx (cost=0.00..650.23 rows=23810 width=0) (actual time=24.875..24.875 rows=12800 loops=1)
Index Cond: ((tbl.c3 >= 1) AND (tbl.c3 <= 1000))
Buffers: shared hit=662
Planning time: 0.044 ms
Execution time: 46.631 ms
(11 rows)
test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c2 between 100 and 2000;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on public.tbl (cost=650.23..31742.85 rows=1 width=12) (actual time=30.018..48.522 rows=6307 loops=1)
Output: c1, c2, c3
Recheck Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c2 >= 100) AND (tbl.c2 <= 2000))
Rows Removed by Index Recheck: 230493
Heap Blocks: lossy=1280
Buffers: shared hit=1942
-> Bitmap Index Scan on idx (cost=0.00..650.23 rows=23810 width=0) (actual time=27.273..27.273 rows=12800 loops=1)
Index Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c2 >= 100) AND (tbl.c2 <= 2000))
Buffers: shared hit=662
Planning time: 0.049 ms
Execution time: 48.829 ms
(11 rows)
test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c3 between 100 and 2000;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on public.tbl (cost=650.23..31742.85 rows=1 width=12) (actual time=27.565..46.347 rows=6307 loops=1)
Output: c1, c2, c3
Recheck Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c3 >= 100) AND (tbl.c3 <= 2000))
Rows Removed by Index Recheck: 230493
Heap Blocks: lossy=1280
Buffers: shared hit=1942
-> Bitmap Index Scan on idx (cost=0.00..650.23 rows=23810 width=0) (actual time=24.799..24.799 rows=12800 loops=1)
Index Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c3 >= 100) AND (tbl.c3 <= 2000))
Buffers: shared hit=662
Planning time: 0.055 ms
Execution time: 46.656 ms
(11 rows)
test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c2 between 100 and 2000 and c3 between 1 and 2000;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on public.tbl (cost=650.23..31861.90 rows=1 width=12) (actual time=28.703..49.599 rows=6307 loops=1)
Output: c1, c2, c3
Recheck Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c2 >= 100) AND (tbl.c2 <= 2000) AND (tbl.c3 >= 1) AND (tbl.c3 <= 2000))
Rows Removed by Index Recheck: 230493
Heap Blocks: lossy=1280
Buffers: shared hit=1942
-> Bitmap Index Scan on idx (cost=0.00..650.23 rows=23810 width=0) (actual time=25.590..25.590 rows=12800 loops=1)
Index Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c2 >= 100) AND (tbl.c2 <= 2000) AND (tbl.c3 >= 1) AND (tbl.c3 <= 2000))
Buffers: shared hit=662
Planning time: 0.114 ms
Execution time: 49.919 ms
(11 rows)
小结
本文介绍了如何创建、生成多维序列。
本文验证了数据如果按照多维序列聚集存放,可以达到块级索引最强过滤性,任意字段都能实现高效率过滤。
如果数据的多列本身不存在相关性,可以参考这篇文档,对数据进行空间重分布存储。得到最强过滤性。