PostgreSQL 空间、多维 序列 生成方法

6 minute read

背景

数据库的一维序列是很好理解的东西,就是在一个维度上自增。

那么二维、多维序列怎么理解呢?显然就是在多个维度上齐头并进的自增咯。

二维序列

以二维序列为例,应该是这样增长的:

0,0  
0,1  
1,0  
1,1  
1,2  
2,1  
2,2  
...  

那么如何生成以上二维序列呢?实际上可以利用数据库的多个一维序列来生成。

create sequence seq1;  
create sequence seq2;  
  
create or replace function seq_2d() returns point[] as $$  
declare  
  res point[];  
begin  
  select array_cat(res, array[point(nextval('seq1'), nextval('seq2'))]) into res;  
  select array_cat(res, array[point(currval('seq1')+1, currval('seq2'))]) into res;   
  select array_cat(res, array[point(currval('seq1'), currval('seq2')+1)]) into res;    
  return res;  
end;  
$$ language plpgsql strict;  

测试

test=# select seq_2d();  
          seq_2d             
---------------------------  
 {"(1,1)","(2,1)","(1,2)"}  
(1 row)  
  
test=# select seq_2d();  
          seq_2d             
---------------------------  
 {"(2,2)","(3,2)","(2,3)"}  
(1 row)  
  
test=# select seq_2d();  
          seq_2d             
---------------------------  
 {"(3,3)","(4,3)","(3,4)"}  
(1 row)  

三维序列

三维序列的生成方法类似:

0,0,0  
1,0,0  
0,1,0  
0,0,1  
1,1,0  
0,1,1  
1,0,1  
  
1,1,1  
2,1,1  
1,2,1  
1,1,2  
2,2,1  
1,2,2  
2,1,2  
  
2,2,2  
......  
create sequence seq1;  
create sequence seq2;  
create sequence seq3;  
create extension cube;  
  
create or replace function seq_3d() returns cube[] as $$  
declare  
  res cube[];  
begin  
  select array_cat(res, array[cube(array[nextval('seq1'), nextval('seq2'), nextval('seq3')])]) into res;  
  select array_cat(res, array[cube(array[currval('seq1')+1, currval('seq2'), currval('seq3')])]) into res;  
  select array_cat(res, array[cube(array[currval('seq1'), currval('seq2')+1, currval('seq3')])]) into res;  
  select array_cat(res, array[cube(array[currval('seq1'), currval('seq2'), currval('seq3')+1])]) into res;  
  select array_cat(res, array[cube(array[currval('seq1')+1, currval('seq2')+1, currval('seq3')])]) into res;  
  select array_cat(res, array[cube(array[currval('seq1'), currval('seq2')+1, currval('seq3')+1])]) into res;  
  select array_cat(res, array[cube(array[currval('seq1')+1, currval('seq2'), currval('seq3')+1])]) into res;  
  return res;  
end;  
$$ language plpgsql strict;  

例子

test=# select seq_3d();  
                                        seq_3d                                           
---------------------------------------------------------------------------------------  
 {"(1, 1, 1)","(2, 1, 1)","(1, 2, 1)","(1, 1, 2)","(2, 2, 1)","(1, 2, 2)","(2, 1, 2)"}  
(1 row)  
  
test=# select seq_3d();  
                                        seq_3d                                           
---------------------------------------------------------------------------------------  
 {"(2, 2, 2)","(3, 2, 2)","(2, 3, 2)","(2, 2, 3)","(3, 3, 2)","(2, 3, 3)","(3, 2, 3)"}  
(1 row)  
  

多维序列

以此类推,可以得到多维序列。

多维数据的空间存放和BRIN块级索引

《PostgreSQL 黑科技 - 空间聚集存储》

前面讲到了空间聚集存储,如果数据按空间顺序存放,使用BRIN块级索引,可以在任意维度上得到最好的查询效率,真正做到一个块级索引支持任意列的高效过滤。

例子

create sequence seq1;  
create sequence seq2;  
create sequence seq3;  
  
create table tbl(c1 int, c2 int, c3 int);  
  
create or replace function cluster_insert() returns void as $$  
declare  
begin  
  insert into tbl values (nextval('seq1'), nextval('seq2'), nextval('seq3'));  
  insert into tbl values (currval('seq1')+1, currval('seq2'), currval('seq3'));  
  insert into tbl values (currval('seq1'), currval('seq2')+1, currval('seq3'));  
  insert into tbl values (currval('seq1'), currval('seq2'), currval('seq3')+1);  
  insert into tbl values (currval('seq1')+1, currval('seq2')+1, currval('seq3'));  
  insert into tbl values (currval('seq1'), currval('seq2')+1, currval('seq3')+1);  
  insert into tbl values (currval('seq1')+1, currval('seq2'), currval('seq3')+1);  
end;  
$$ language plpgsql strict;  

压测,写入大量数据

vi test.sql  
select count(*) from (select cluster_insert() from generate_series(1,100)) t;  
  
pgbench -M prepared -n -r -P 1 -f ./test.sql -c 32 -j 32 -T 1200  

检查多维聚集性

test=# select * from tbl limit 10;  
   c1    |   c2    |   c3      
---------+---------+---------  
 1992652 | 1992653 | 1992652  
 1992573 | 1992574 | 1992578  
 1992574 | 1992574 | 1992578  
 1992573 | 1992575 | 1992578  
 1992573 | 1992574 | 1992579  
 1992574 | 1992575 | 1992578  
 1992573 | 1992575 | 1992579  
 1992574 | 1992574 | 1992579  
 1992658 | 1992658 | 1992658  
 1992659 | 1992658 | 1992658  
(10 rows)  

创建BRIN块级索引。

create index idx on tbl using brin (c1,c2,c3);  
test=# \dt+ tbl  
                             List of relations
 Schema |        Name         | Type  |  Owner   |    Size    | Description 
--------+---------------------+-------+----------+------------+-------------
 public | tbl                 | table | postgres | 97 GB      | 
(1 row)  
  
test=# \di+ idx  
                        List of relations  
 Schema | Name | Type  |  Owner   | Table |  Size  | Description   
--------+------+-------+----------+-------+--------+-------------  
 public | idx  | index | postgres | tbl   | 456 kB |   
(1 row)  

看看456KB的索引,在97 GB的数据层面,查询效率如何。

任意列、组合查询过滤性。

explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000;  
explain (analyze,verbose,timing,costs,buffers) select * from tbl where c2 between 1 and 1000;  
explain (analyze,verbose,timing,costs,buffers) select * from tbl where c3 between 1 and 1000;  
explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c2 between 100 and 2000;  
explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c3 between 100 and 2000;  
explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c2 between 100 and 2000 and c3 between 1 and 2000;  
test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000;
                              
------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on public.tbl  (cost=650.23..31623.80 rows=1 width=12) (actual time=27.302..50.284 rows=6997 loops=1)
   Output: c1, c2, c3
   Recheck Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000))
   Rows Removed by Index Recheck: 229803
   Heap Blocks: lossy=1280
   Buffers: shared hit=1942
   ->  Bitmap Index Scan on idx  (cost=0.00..650.23 rows=23810 width=0) (actual time=26.881..26.881 rows=12800 loops=1)
         Index Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000))
         Buffers: shared hit=662
 Planning time: 0.095 ms
 Execution time: 50.636 ms
(11 rows)

test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c2 between 1 and 1000;
                                                       QUERY PLAN                                                       
------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on public.tbl  (cost=650.23..31623.80 rows=1 width=12) (actual time=27.886..49.011 rows=6997 loops=1)
   Output: c1, c2, c3
   Recheck Cond: ((tbl.c2 >= 1) AND (tbl.c2 <= 1000))
   Rows Removed by Index Recheck: 229803
   Heap Blocks: lossy=1280
   Buffers: shared hit=1942
   ->  Bitmap Index Scan on idx  (cost=0.00..650.23 rows=23810 width=0) (actual time=27.512..27.512 rows=12800 loops=1)
         Index Cond: ((tbl.c2 >= 1) AND (tbl.c2 <= 1000))
         Buffers: shared hit=662
 Planning time: 0.040 ms
 Execution time: 49.348 ms
(11 rows)

test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c3 between 1 and 1000;
                                                       QUERY PLAN                                                       
------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on public.tbl  (cost=650.23..31623.80 rows=1 width=12) (actual time=25.238..46.292 rows=6997 loops=1)
   Output: c1, c2, c3
   Recheck Cond: ((tbl.c3 >= 1) AND (tbl.c3 <= 1000))
   Rows Removed by Index Recheck: 229803
   Heap Blocks: lossy=1280
   Buffers: shared hit=1942
   ->  Bitmap Index Scan on idx  (cost=0.00..650.23 rows=23810 width=0) (actual time=24.875..24.875 rows=12800 loops=1)
         Index Cond: ((tbl.c3 >= 1) AND (tbl.c3 <= 1000))
         Buffers: shared hit=662
 Planning time: 0.044 ms
 Execution time: 46.631 ms
(11 rows)

test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c2 between 100 and 2000;
                                                       QUERY PLAN                                                       
------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on public.tbl  (cost=650.23..31742.85 rows=1 width=12) (actual time=30.018..48.522 rows=6307 loops=1)
   Output: c1, c2, c3
   Recheck Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c2 >= 100) AND (tbl.c2 <= 2000))
   Rows Removed by Index Recheck: 230493
   Heap Blocks: lossy=1280
   Buffers: shared hit=1942
   ->  Bitmap Index Scan on idx  (cost=0.00..650.23 rows=23810 width=0) (actual time=27.273..27.273 rows=12800 loops=1)
         Index Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c2 >= 100) AND (tbl.c2 <= 2000))
         Buffers: shared hit=662
 Planning time: 0.049 ms
 Execution time: 48.829 ms
(11 rows)

test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c3 between 100 and 2000;
                                                       QUERY PLAN                                                       
------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on public.tbl  (cost=650.23..31742.85 rows=1 width=12) (actual time=27.565..46.347 rows=6307 loops=1)
   Output: c1, c2, c3
   Recheck Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c3 >= 100) AND (tbl.c3 <= 2000))
   Rows Removed by Index Recheck: 230493
   Heap Blocks: lossy=1280
   Buffers: shared hit=1942
   ->  Bitmap Index Scan on idx  (cost=0.00..650.23 rows=23810 width=0) (actual time=24.799..24.799 rows=12800 loops=1)
         Index Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c3 >= 100) AND (tbl.c3 <= 2000))
         Buffers: shared hit=662
 Planning time: 0.055 ms
 Execution time: 46.656 ms
(11 rows)

test=# explain (analyze,verbose,timing,costs,buffers) select * from tbl where c1 between 1 and 1000 and c2 between 100 and 2000 and c3 between 1 and 2000;
                                                                QUERY PLAN                                                                
------------------------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on public.tbl  (cost=650.23..31861.90 rows=1 width=12) (actual time=28.703..49.599 rows=6307 loops=1)
   Output: c1, c2, c3
   Recheck Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c2 >= 100) AND (tbl.c2 <= 2000) AND (tbl.c3 >= 1) AND (tbl.c3 <= 2000))
   Rows Removed by Index Recheck: 230493
   Heap Blocks: lossy=1280
   Buffers: shared hit=1942
   ->  Bitmap Index Scan on idx  (cost=0.00..650.23 rows=23810 width=0) (actual time=25.590..25.590 rows=12800 loops=1)
         Index Cond: ((tbl.c1 >= 1) AND (tbl.c1 <= 1000) AND (tbl.c2 >= 100) AND (tbl.c2 <= 2000) AND (tbl.c3 >= 1) AND (tbl.c3 <= 2000))
         Buffers: shared hit=662
 Planning time: 0.114 ms
 Execution time: 49.919 ms
(11 rows)

小结

本文介绍了如何创建、生成多维序列。

本文验证了数据如果按照多维序列聚集存放,可以达到块级索引最强过滤性,任意字段都能实现高效率过滤。

如果数据的多列本身不存在相关性,可以参考这篇文档,对数据进行空间重分布存储。得到最强过滤性。

《PostgreSQL 黑科技 - 空间聚集存储》

Flag Counter

digoal’s 大量PostgreSQL文章入口