create table tbl (  
  id int8,  -- 序列    
  tag1 int[],   -- 数组  
  c1 int,       -- 1-100  
  c2 int,       -- 1-10000  
  c3 timestamp   -- 时间戳  
);  

2、生成随机值函数

取值范围$1-$2, 取$3个随机值的数组  
  
create or replace function gen_rand_ints(int, int, int) returns int[] as $$  
  select array(select (random()*($2-$1))::int+$1 from generate_series(1,$3));  
$$ language sql strict;  
  
postgres=# select gen_rand_ints(10,25,5);  
  gen_rand_ints     
------------------  
 {20,19,24,22,21}  
(1 row)  

3、写入测试数据

-- 写入热点数组，5000万条  
insert into tbl select id, gen_rand_ints(1,1000,10), random()*100, random()*10000, clock_timestamp() from generate_series(1,50000000) t(id);  
  
-- 写入非热点数组，1亿条  
insert into tbl select id, gen_rand_ints(1,1000000,10), random()*100, random()*10000, clock_timestamp() from generate_series(1,100000000) t(id);  

数据样式如下

postgres=# select * from tbl limit 10;  
    id    |                   tag1                    | c1 |  c2  |             c3               
----------+-------------------------------------------+----+------+----------------------------  
 38931521 | {424,448,91,420,382,657,677,60,530,503}   | 59 | 6120 | 2017-09-11 14:32:06.610512  
 38931522 | {66,87,468,207,79,780,307,714,520,149}    | 44 | 7848 | 2017-09-11 14:32:06.610522  
 38931523 | {99,628,798,558,415,74,863,839,522,953}   | 26 | 9032 | 2017-09-11 14:32:06.610531  
 38931524 | {610,935,962,140,438,551,752,503,636,220} | 71 | 7136 | 2017-09-11 14:32:06.61054  
 38931525 | {998,16,428,518,164,868,303,263,496,102}  | 82 | 9102 | 2017-09-11 14:32:06.61055  
 38931526 | {175,683,749,696,637,8,599,247,942,561}   | 39 | 3796 | 2017-09-11 14:32:06.610559  
 38931527 | {112,138,882,747,356,591,461,355,605,888} | 87 | 7684 | 2017-09-11 14:32:06.610568  
 38931528 | {756,175,31,252,276,850,162,450,533,910}  | 15 | 1691 | 2017-09-11 14:32:06.610578  
 38931529 | {917,744,416,860,306,801,240,416,937,122} | 16 | 2927 | 2017-09-11 14:32:06.610587  
 38931530 | {712,623,647,317,511,519,86,267,693,116}  | 52 | 9676 | 2017-09-11 14:32:06.610596  
(10 rows)  

求任意条件下的tag1的TOP N元素。

4、分析表，生成柱状图。

postgres=# analyze tbl;  
ANALYZE  

表大小 16 GB。

postgres=# \dt+ tbl  
                   List of relations  
 Schema | Name | Type  |  Owner   | Size  | Description   
--------+------+-------+----------+-------+-------------  
 public | tbl  | table | postgres | 16 GB |   
(1 row)  

5、求某个条件下的精确TOP N元素，实际上有1000个热点ID，所以返回TOP 10的COUNT结果非常近似，后面在使用估值时，得到的TOP 10可能就没这么准了，但是一定是在1000个ID以内的。

-- 开启32个并行的查询时间  
  
postgres=# select unnest(tag1) tag1, count(*) from tbl where c1 between 1 and 10 group by 1 order by 2 desc limit 10;  
 tag1 | count   
------+-------  
  134 | 50935  
  768 | 50915  
  663 | 50876  
  567 | 50821  
  146 | 50821  
  332 | 50814  
  450 | 50807  
  884 | 50789  
   58 | 50781  
  605 | 50774  
(10 rows)  
  
Time: 23441.247 ms (00:23.441)  
  
-- 不开并行的查询时间  
postgres=# select unnest(tag1) tag1, count(*) from tbl where c1 between 1 and 10 group by 1 order by 2 desc limit 10;  
 tag1 | count   
------+-------  
  134 | 50935  
  768 | 50915  
  663 | 50876  
  567 | 50821  
  146 | 50821  
  332 | 50814  
  450 | 50807  
  884 | 50789  
   58 | 50781  
  605 | 50774  
(10 rows)  
  
Time: 154935.686 ms (02:34.936)  

6、求同样条件下的采样TOP N

采样算法参考文章末尾，PostgreSQL内置了2种采样方法，同时支持扩展采样方法，其中有两个内置的扩展采样方法，实际上内置总共有4种采样方法。

使用块级采样（目前采样不支持并行）。

postgres=# select unnest(tag1) tag1, (count(*))*20      -- 乘以100/采样系数  
from   
(select * from tbl TABLESAMPLE system (5)) t     
where c1 between 1 and 10 group by 1 order by 2 desc limit 10;  
 tag1 | ?column?   
------+----------  
  724 |    53380  
  798 |    52680  
   24 |    52640  
  371 |    52480  
  569 |    52400  
  531 |    52280  
  979 |    52160  
  429 |    52140  
  980 |    52080  
  350 |    51920  
(10 rows)  
  
-- 采样5%，约7秒。  
Time: 6887.745 ms (00:06.888)   
  
postgres=# select unnest(tag1) tag1, (count(*))*50    -- 乘以100/采样系数  
from   
(select * from tbl TABLESAMPLE system (2)) t     
where c1 between 1 and 10 group by 1 order by 2 desc limit 10;  
 tag1 | ?column?   
------+----------  
  324 |    55450  
  435 |    55150  
  720 |    55050  
  943 |    54950  
  475 |    54750  
  958 |    54600  
   13 |    54400  
  742 |    54300  
  739 |    54100  
  301 |    53950  
(10 rows)  
  
-- 采样2%， 约3秒。  
Time: 2720.140 ms (00:02.720)  
  
采样越多，精确度越高  

采样的方法，得到的TOP N是很准确的，因为例子用了1000个随机值，并且每个随机值的概率是一样的，如果返回TOP 1000，那就准确无疑了。

大表例子

重新设计热点，总共写入40亿测试数据：

一级热点，1，5亿

二级热点，2-4，5亿

三级热点，5-10，5亿

四级热点，11-30，5亿

普通数据，1-100000，20亿

1、表结构设计

create table tbl1 (  
  id int8,  -- 序列  
  c1 int8,  -- 目标字段  
  c2 int8,  -- 1-100  
  c3 int8,  -- 1-100000  
  c4 timestamp  -- 时间戳  
);  

2、写入测试数据

nohup psql -c "insert into tbl1 select id, 1, random()*100, random()*100000, clock_timestamp() from generate_series(1,500000000) t(id);" >/dev/null 2>&1 &  
nohup psql -c "insert into tbl1 select id, random()*(4-2)+2, random()*100, random()*100000, clock_timestamp() from generate_series(1,500000000) t(id);" >/dev/null 2>&1 &  
nohup psql -c "insert into tbl1 select id, random()*(10-5)+5, random()*100, random()*100000, clock_timestamp() from generate_series(1,500000000) t(id);" >/dev/null 2>&1 &  
nohup psql -c "insert into tbl1 select id, random()*(30-11)+11, random()*100, random()*100000, clock_timestamp() from generate_series(1,500000000) t(id);" >/dev/null 2>&1 &  
nohup psql -c "insert into tbl1 select id, random()*100000, random()*100, random()*100000, clock_timestamp() from generate_series(1,2000000000) t(id);" >/dev/null 2>&1 &  

3、分析表

postgres=# analyze tbl1;  
ANALYZE  
Time: 502.421 ms  

表大小，254 GB。

postgres=# \dt+ tbl1  
                    List of relations  
 Schema | Name | Type  |  Owner   |  Size  | Description   
--------+------+-------+----------+--------+-------------  
 public | tbl1 | table | postgres | 254 GB |   
(1 row)  

4、精确TOP 30

-- 开启32个并行的查询时间  
  
postgres=# select c1,count(*) from tbl1 where c2 between 1 and 10 group by 1 order by 2 desc limit 30;  
 c1 |  count     
----+----------  
| 49991259  
| 25006580  
| 12502559  
| 12498741  
| 10004285  
| 10002597  
|  9999530  
|  9999215  
|  5003219  
|  4998870  
|  2636193  
|  2635457  
|  2635344  
|  2634693  
|  2633965  
|  2633690  
|  2633526  
|  2633512  
|  2633363  
|  2633260  
|  2633014  
|  2632926  
|  2632779  
|  2632508  
|  2632288  
|  2632216  
|  2631443  
|  2631315  
|  1318483  
|  1318451  
(30 rows)  
  
Time: 20845.738 ms (00:20.846)  
  
-- 不开启并行的查询时间  
  
postgres=# select c1,count(*) from tbl1 where c2 between 1 and 10 group by 1 order by 2 desc limit 30;  
  
 c1 |  count     
----+----------  
| 49991259  
| 25006580  
| 12502559  
| 12498741  
| 10004285  
| 10002597  
|  9999530  
|  9999215  
|  5003219  
|  4998870  
|  2636193  
|  2635457  
|  2635344  
|  2634693  
|  2633965  
|  2633690  
|  2633526  
|  2633512  
|  2633363  
|  2633260  
|  2633014  
|  2632926  
|  2632779  
|  2632508  
|  2632288  
|  2632216  
|  2631443  
|  2631315  
|  1318483  
|  1318451  
(30 rows)  
  
Time: 471112.827 ms (07:51.113)  

5、采样TOP 30

select c1,(count(*))*20 from   -- 乘以100/采样系数  
(select * from tbl1 TABLESAMPLE system (5)) t     
where c2 between 1 and 10 group by 1 order by 2 desc limit 30;  
  
 c1 | ?column?   
----+----------  
| 50068840  
| 25108820  
| 12558680  
| 12513080  
| 10009300  
| 10006260  
| 10005400  
|  9987220  
|  5008280  
|  5007980  
|  2652940  
|  2648640  
|  2646800  
|  2646600  
|  2642480  
|  2642220  
|  2641620  
|  2640500  
|  2639420  
|  2637740  
|  2637320  
|  2636900  
|  2636100  
|  2635120  
|  2634440  
|  2631480  
|  2629880  
|  2624940  
|  1330140  
|  1316480  
(30 rows)  
  
Time: 31884.725 ms (00:31.885)  
  
-- 采样5%，约32秒。  
  
select c1,(count(*))*50 from   -- 乘以100/采样系数  
(select * from tbl1 TABLESAMPLE system (2)) t     
where c2 between 1 and 10 group by 1 order by 2 desc limit 30;  
  
 c1 | ?column?   
----+----------  
| 50173200  
| 24993550  
| 12487100  
| 12474100  
|  9998250  
|  9980450  
|  9973950  
|  9960450  
|  4999050  
|  4995000  
|  2642700  
|  2640900  
|  2640300  
|  2630250  
|  2627500  
|  2623700  
|  2622350  
|  2622000  
|  2621200  
|  2619450  
|  2616200  
|  2616050  
|  2615800  
|  2613200  
|  2612200  
|  2607700  
|  2605900  
|  2604150  
|  1312300  
|  1311950  
(30 rows)  
  
Time: 12942.455 ms (00:12.942)  
  
-- 采样2%，约13秒。  
  
postgres=# select c1,(count(*))*1000 from   -- 乘以100/采样系数  
(select * from tbl1 TABLESAMPLE system (0.1)) t     
where c2 between 1 and 10 group by 1 order by 2 desc limit 30;  
 c1 | ?column?   
----+----------  
| 48077000  
| 25061000  
| 12762000  
| 12262000  
|  9851000  
|  9789000  
|  9718000  
|  9654000  
|  4971000  
|  4885000  
|  2731000  
|  2727000  
|  2710000  
|  2697000  
|  2687000  
|  2681000  
|  2672000  
|  2672000  
|  2670000  
|  2637000  
|  2632000  
|  2628000  
|  2628000  
|  2622000  
|  2618000  
|  2601000  
|  2522000  
|  2513000  
|  1406000  
|  1301000  
(30 rows)  
  
Time: 863.604 ms  
  
-- 采样0.1%，约0.86秒。  

OK，采样千分之一的时候（仅需约扫描254MB数据），只花了不到1秒，就算出了准确的TOP 30，而且准确度相当的高。

如果在Greenplum中支持这个功能，会很爽，一万亿的数据，秒级任意维度钻取透视不是梦。

小结

1、采样与精确查询耗时对比

1.1、求数组元素TOP N

查询	表大小	记录数	求TOP N耗时
精确，32并行	16GB	1.5亿	23秒
精确，非并行	16GB	1.5亿	155秒
采样5%	16GB	1.5亿	7秒
采样2%	16GB	1.5亿	3秒

1.2、求scalar类型TOP N

查询	表大小	记录数	求TOP N耗时
精确，32并行	254GB	40亿	21秒
精确，非并行	254GB	40亿	471秒
采样5%	254GB	40亿	32秒
采样2%	254GB	40亿	13秒
采样0.1%	254GB	40亿	0.86秒

2、采样计算达到了很高的精确度，同时耗费资源很少。虽然并行计算也非常快，但是需要消耗更多的CPU和IO资源，并行度就会大打折扣，除非有足够的资源给你折腾，否则能采用估值计算的时候，还是建议估值计算。

3、估值计算的效率评估：

由于目前估值计算不能采用多核并行，处理速度约每秒254MB，那么要达到1秒内的响应，对于254GB的表，采样设置为0.1%，对于1TB的表，可以将采样设置为0.025%）。那么TB级的表，也能实现任意维度秒级估算。

4、采样方法

4.1 https://www.postgresql.org/docs/9.6/static/sql-select.html

TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ]

A TABLESAMPLE clause after a table_name indicates that the specified sampling_method should be used to retrieve a subset of the rows in that table. This sampling precedes the application of any other filters such as WHERE clauses. The standard PostgreSQL distribution includes two sampling methods, BERNOULLI and SYSTEM, and other sampling methods can be installed in the database via extensions.

The BERNOULLI and SYSTEM sampling methods each accept a single argument which is the fraction of the table to sample, expressed as a percentage between 0 and 100. This argument can be any real-valued expression. (Other sampling methods might accept more or different arguments.) These two methods each return a randomly-chosen sample of the table that will contain approximately the specified percentage of the table’s rows. The BERNOULLI method scans the whole table and selects or ignores individual rows independently with the specified probability. The SYSTEM method does block-level sampling with each block having the specified chance of being selected; all rows in each selected block are returned. The SYSTEM method is significantly faster than the BERNOULLI method when small sampling percentages are specified, but it may return a less-random sample of the table as a result of clustering effects.

The optional REPEATABLE clause specifies a seed number or expression to use for generating random numbers within the sampling method. The seed value can be any non-null floating-point value. Two queries that specify the same seed and argument values will select the same sample of the table, if the table has not been changed meanwhile. But different seed values will usually produce different samples. If REPEATABLE is not given then a new random sample is selected for each query, based upon a system-generated seed. Note that some add-on sampling methods do not accept REPEATABLE, and will always produce new samples on each use.

4.2 https://www.postgresql.org/docs/9.6/static/tsm-system-rows.html

CREATE EXTENSION tsm_system_rows;  
  
SELECT * FROM my_table TABLESAMPLE SYSTEM_ROWS(100);  

4.3 https://www.postgresql.org/docs/9.6/static/tsm-system-time.html

CREATE EXTENSION tsm_system_time;  
  
SELECT * FROM my_table TABLESAMPLE SYSTEM_TIME(1000);  

5、PostgreSQL中还支持很多其他的估值方法，请参考本文开头部分的介绍。

6、如果采样支持并行，可以支持更大的表的更精确的估算，比如1%是一个比较好的估算采样比，对于1T的表就是10G，如果支持并行，预计在3秒内完成从采样到计算的整个过程。

digoal’s 大量PostgreSQL文章入口

Twitter Facebook Google+ LinkedIn

Digoal.zhou

秒级任意维度分析1TB级大表 - 通过采样估值满足高效TOP N等统计分析需求

背景

场景设计

大表例子

小结

digoal’s 大量PostgreSQL文章入口

You May Also Enjoy

PostgreSQL(PPAS 兼容Oracle) 从零开始入门手册 - 珍藏版

PostgreSQL pipelinedb 流计算插件 - IoT应用 - 实时轨迹聚合

PostgreSQL plpgsql 存储过程、函数 - 状态、异常变量打印、异常捕获… - GET [STACKED] DIAGNOSTICS

PostgreSQL datediff 日期间隔（单位转换）兼容SQL用法