A Clever Use of PostgreSQL Recursive Queries - Deduplicating and Spreading Grouped Data
Background
In our business we need to deduplicate and "spread" filtered results, for example so that no more than 10 products of any single brand appear in the result set.
Today this is handled in the application layer, which is inefficient.
Does PostgreSQL offer a better way to deduplicate and spread result sets?
PostgreSQL's SQL is very powerful, and there are several ways to implement this kind of retrieval efficiently, for example:
1. Recursive queries
2. Window functions
3. A user-defined function (UDF)
Example 1 - brand + product is unique
1. Create the table. The brand + product combination is unique, so deduplication is not needed.
create table tbl(
c1 int, -- brand ID
c2 int, -- product ID
c3 int, -- another attribute, used as the filter condition in this example
c4 timestamp, -- timestamp
unique (c1,c2) -- unique constraint
);
2. Insert 50 million rows of test data.
insert into tbl
select random()*10000,
random()*1000000,
random()*10,
clock_timestamp()
from generate_series(1,50000000)
on conflict (c1,c2) do nothing;
3. Create an index to speed up retrieval: the filter column goes first and the brand column c1 goes last (this enables a SKIP-INDEX-SCAN-like ordered access path).
create index idx_tbl_1 on tbl(c3,c1);
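To see what this column order buys us, here is a minimal probe (a sketch using only the table and index defined above) that finds the next qualifying brand; it is the building block the recursive queries below repeat, and it should execute as a cheap index-only scan:
-- smallest brand ID greater than the previously processed one, among rows matching c3=1
-- (42 stands in for the previously processed brand ID; any value works)
select c1
from tbl
where c3 = 1
  and c1 > 42
order by c3, c1
limit 1;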
Within the set of rows matching a given condition, return at most 10 product rows for each brand.
Suppose the filter condition is either
c3=1 and c1<100
or
c3=1
Using recursion
with recursive
tmp as (select c1 from tbl where c3=1 and c1<100 order by c3,c1 limit 1),
skip as (
(
-- initial query
select tbl.*, t_max.c1 as t_max_c1 from tbl ,
(select c1 from tbl where c3=1 and c1<100 and c1 > (select c1 from tmp) order by c3,c1 limit 1) t_max
where tbl.c3=1 and tbl.c1<100 and tbl.c1=(select c1 from tmp) and tbl.c1<t_max.c1
order by tbl.c3,tbl.c1 limit 10
)
union all
(
select * from tbl,
(select c1 from tbl where c3=1 and c1<100 and c1 > (select t_max_c1 from skip limit 1) order by c3,c1 limit 1) t_max
where tbl.c3=1 and tbl.c1<100
and tbl.c1=(select t_max_c1 from skip limit 1) -- recursive condition
and tbl.* is not null -- recursion termination condition
order by tbl.c3,tbl.c1 limit 10
)
)
select * from skip;
However, a recursive query does not allow the work table to be referenced inside a subquery:
ERROR: recursive reference to query "skip" must not appear within a subquery
LINE 8: and tbl.c1>(select c1 from skip limit 1)
^
Rewrite the SQL as follows, using LATERAL to reference tables that appear earlier in the FROM list.
with recursive
-- first brand matching the condition
tmp as (select c1 from tbl where c3=1 and c1<100 order by c3,c1 limit 1),
skip as (
(
-- initial query; it also computes and outputs the next brand ID
select tbl.*,
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 and c1<100 + 1 -- next qualifying brand (the bound must be the original condition + 1, otherwise the last qualifying brand would never be output)
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- SKIP-INDEX-SCAN-like access
) t_max
where tbl.c3=1 and tbl.c1<100 -- matches the condition
and tbl.c1=(select c1 from tmp) -- brand ID equals the "next brand" computed in the previous step
order by tbl.c3,tbl.c1 limit 10 -- return at most 10 products per brand
)
union all
(
-- recursive part
select tbl.*,
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand to process in this step, computed in the previous step
LATERAL (select c1 from tbl
where c3=1 and c1<100 + 1 -- next qualifying brand (the bound must be the original condition + 1, otherwise the last qualifying brand would never be output)
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select * from tbl -- 10 products of the brand currently being processed
where tbl.c3=1 and tbl.c1<100
and tbl.c1=s.t_max_c1 -- the current brand comes from t_max_c1 in the previous work-table row
and tbl.* is not null
order by tbl.c3,tbl.c1 limit 10 -- 10 products per brand
) tbl
)
)
select * from skip;
Execution plan
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
CTE Scan on skip (cost=97.93..99.03 rows=55 width=24) (actual time=0.096..6.381 rows=1000 loops=1)
Output: skip.c1, skip.c2, skip.c3, skip.c4, skip.t_max_c1
Buffers: shared hit=1710
CTE tmp
-> Limit (cost=0.44..1.31 rows=1 width=8) (actual time=0.057..0.057 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Buffers: shared hit=4
-> Index Only Scan using idx_tbl_1 on public.tbl (cost=0.44..39795.81 rows=45681 width=8) (actual time=0.056..0.056 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Index Cond: ((tbl.c3 = 1) AND (tbl.c1 < 100))
Heap Fetches: 1
Buffers: shared hit=4
CTE skip
-> Recursive Union (cost=0.92..96.62 rows=55 width=24) (actual time=0.094..5.947 rows=1000 loops=1)
Buffers: shared hit=1710
-> Limit (cost=0.92..8.64 rows=5 width=24) (actual time=0.093..0.111 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl_2.c1
Buffers: shared hit=21
InitPlan 2 (returns $2)
-> CTE Scan on tmp (cost=0.00..0.02 rows=1 width=4) (actual time=0.000..0.001 rows=1 loops=1)
Output: tmp.c1
-> Nested Loop (cost=0.90..8.62 rows=5 width=24) (actual time=0.092..0.108 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl_2.c1
Buffers: shared hit=21
-> Limit (cost=0.46..1.43 rows=1 width=8) (actual time=0.077..0.077 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Buffers: shared hit=8
InitPlan 3 (returns $3)
-> CTE Scan on tmp tmp_1 (cost=0.00..0.02 rows=1 width=4) (actual time=0.060..0.060 rows=1 loops=1)
Output: tmp_1.c1
Buffers: shared hit=4
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_2 (cost=0.44..24434.73 rows=25242 width=8) (actual time=0.077..0.077 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Index Cond: ((tbl_2.c3 = 1) AND (tbl_2.c1 < 101) AND (tbl_2.c1 > $3))
Heap Fetches: 1
Buffers: shared hit=8
-> Index Scan using idx_tbl_1 on public.tbl tbl_1 (cost=0.44..7.13 rows=5 width=20) (actual time=0.012..0.026 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Index Cond: ((tbl_1.c3 = 1) AND (tbl_1.c1 < 100) AND (tbl_1.c1 = $2))
Buffers: shared hit=13
-> Nested Loop (cost=0.88..8.69 rows=5 width=24) (actual time=0.038..0.055 rows=10 loops=100)
Output: tbl_4.c1, tbl_4.c2, tbl_4.c3, tbl_4.c4, tbl_3.c1
Buffers: shared hit=1689
-> Nested Loop (cost=0.44..1.46 rows=1 width=8) (actual time=0.025..0.026 rows=1 loops=100)
Output: skip_1.t_max_c1, tbl_3.c1
Buffers: shared hit=400
-> Limit (cost=0.00..0.02 rows=1 width=4) (actual time=0.000..0.001 rows=1 loops=100)
Output: skip_1.t_max_c1
-> WorkTable Scan on skip skip_1 (cost=0.00..1.00 rows=50 width=4) (actual time=0.000..0.000 rows=1 loops=100)
Output: skip_1.t_max_c1
-> Limit (cost=0.44..1.41 rows=1 width=8) (actual time=0.024..0.024 rows=1 loops=100)
Output: tbl_3.c1, tbl_3.c3
Buffers: shared hit=400
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_3 (cost=0.44..24434.73 rows=25242 width=8) (actual time=0.024..0.024 rows=1 loops=100)
Output: tbl_3.c1, tbl_3.c3
Index Cond: ((tbl_3.c3 = 1) AND (tbl_3.c1 < 101) AND (tbl_3.c1 > skip_1.t_max_c1))
Heap Fetches: 99
Buffers: shared hit=400
-> Limit (cost=0.44..7.13 rows=5 width=20) (actual time=0.012..0.026 rows=10 loops=99)
Output: tbl_4.c1, tbl_4.c2, tbl_4.c3, tbl_4.c4
Buffers: shared hit=1289
-> Index Scan using idx_tbl_1 on public.tbl tbl_4 (cost=0.44..7.13 rows=5 width=20) (actual time=0.012..0.024 rows=10 loops=99)
Output: tbl_4.c1, tbl_4.c2, tbl_4.c3, tbl_4.c4
Index Cond: ((tbl_4.c3 = 1) AND (tbl_4.c1 < 100) AND (tbl_4.c1 = skip_1.t_max_c1))
Filter: (tbl_4.* IS NOT NULL)
Buffers: shared hit=1289
Planning Time: 0.442 ms
Execution Time: 6.566 ms
(68 rows)
Returning 1,000 products across 100 brands takes 6.5 milliseconds.
Returning about 100,000 products across roughly 10,000 brands takes 418 milliseconds (query and result below):
with recursive
-- first brand matching the condition
tmp as (select c1 from tbl where c3=1 order by c3,c1 limit 1),
skip as (
(
-- initial query; it also computes and outputs the next brand ID
select tbl.*,
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 -- matches the condition
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- SKIP-INDEX-SCAN-like access
) t_max
where tbl.c3=1 -- matches the condition
and tbl.c1=(select c1 from tmp) -- brand ID equals the "next brand" computed in the previous step
order by tbl.c3,tbl.c1 limit 10 -- return at most 10 products per brand
)
union all
(
-- recursive part
select tbl.*,
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand to process in this step, computed in the previous step
LATERAL (select c1 from tbl
where c3=1 -- matches the condition
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select * from tbl -- 10 products of the brand currently being processed
where tbl.c3=1
and tbl.c1=s.t_max_c1 -- the current brand comes from t_max_c1 in the previous work-table row
and tbl.* is not null
order by tbl.c3,tbl.c1 limit 10 -- 10 products per brand
) tbl
)
)
select count(*) from skip;
count
--------
100000
(1 row)
Time: 417.975 ms
Execution plan
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=142.06..142.07 rows=1 width=8)
CTE tmp
-> Limit (cost=0.44..1.31 rows=1 width=8)
-> Index Only Scan using idx_tbl_1 on tbl (cost=0.44..39795.81 rows=45681 width=8)
Index Cond: ((c3 = 1) AND (c1 < 100))
CTE skip
-> Recursive Union (cost=0.92..138.27 rows=110 width=24)
-> Limit (cost=0.92..12.17 rows=10 width=24)
InitPlan 2 (returns $2)
-> CTE Scan on tmp (cost=0.00..0.02 rows=1 width=4)
-> Nested Loop (cost=0.90..567.65 rows=504 width=24)
-> Limit (cost=0.46..0.53 rows=1 width=8)
InitPlan 3 (returns $3)
-> CTE Scan on tmp tmp_1 (cost=0.00..0.02 rows=1 width=4)
-> Index Only Scan using idx_tbl_1 on tbl tbl_2 (cost=0.44..122423.77 rows=1682778 width=8)
Index Cond: ((c3 = 1) AND (c1 > $3))
-> Index Scan using idx_tbl_1 on tbl tbl_1 (cost=0.44..562.07 rows=504 width=20)
Index Cond: ((c3 = 1) AND (c1 = $2))
-> Nested Loop (cost=0.88..12.39 rows=10 width=24)
-> Nested Loop (cost=0.44..0.56 rows=1 width=8)
-> Limit (cost=0.00..0.02 rows=1 width=4)
-> WorkTable Scan on skip skip_1 (cost=0.00..2.00 rows=100 width=4)
-> Limit (cost=0.44..0.51 rows=1 width=8)
-> Index Only Scan using idx_tbl_1 on tbl tbl_3 (cost=0.44..122423.77 rows=1682778 width=8)
Index Cond: ((c3 = 1) AND (c1 > skip_1.t_max_c1))
-> Limit (cost=0.44..11.63 rows=10 width=20)
-> Index Scan using idx_tbl_1 on tbl tbl_4 (cost=0.44..562.07 rows=502 width=20)
Index Cond: ((c3 = 1) AND (c1 = skip_1.t_max_c1))
Filter: (tbl_4.* IS NOT NULL)
-> CTE Scan on skip (cost=0.00..2.20 rows=110 width=0)
(30 rows)
Using a UDF
create or replace function get_res() returns setof tbl as $$
declare
v_c1 int;
begin
-- initial recursion condition
select c1 into v_c1 from tbl where c3=1 and c1<100 order by c1 limit 1;
-- initial statement
return query select * from tbl where c3=1 and c1<100 and c1=v_c1 order by c1 limit 10;
loop
-- recursion condition
select c1 into v_c1 from tbl where c3=1 and c1<100 and c1>v_c1 order by c1 limit 1;
if not found then
return;
end if;
-- return the rows for this recursion step
return query select * from tbl where c3=1 and c1<100 and c1=v_c1 order by c1 limit 10;
end loop;
end;
$$ language plpgsql strict;
Response time: returning 1,000 products across 100 brands takes 8.5 milliseconds.
postgres=# select count(*) from get_res();
count
-------
1000
(1 row)
Time: 8.536 ms
Using a UDF to return 100,010 rows across 10,001 brands.
create or replace function get_res() returns setof tbl as $$
declare
v_c1 int;
begin
-- initial recursion condition
select c1 into v_c1 from tbl where c3=1 order by c1 limit 1;
-- initial statement
return query select * from tbl where c3=1 and c1=v_c1 order by c1 limit 10;
loop
-- recursion condition
select c1 into v_c1 from tbl where c3=1 and c1>v_c1 order by c1 limit 1;
if not found then
return;
end if;
-- return the rows for this recursion step
return query select * from tbl where c3=1 and c1=v_c1 order by c1 limit 10;
end loop;
end;
$$ language plpgsql strict;
Returning 100,010 products across 10,001 brands takes 502 milliseconds.
postgres=# select count(*) from get_res();
count
--------
100010
(1 row)
Time: 502.320 ms
Using window functions
select c1,c2,c3,c4 from
(select c1,c2,c3,c4,row_number() over (partition by c1) as rn from tbl where c3=1 and c1<100) t
where t.rn<=10 ;
Execution plan
postgres=# explain (analyze,verbose,timing,costs,buffers) select c1,c2,c3,c4 from
(select c1,c2,c3,c4,row_number() over (partition by c1) as rn from tbl where c3=1 and c1<100) t
where t.rn<=10 ;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------------------
Subquery Scan on t (cost=0.44..43017.65 rows=16190 width=20) (actual time=0.054..67.736 rows=1000 loops=1)
Output: t.c1, t.c2, t.c3, t.c4
Filter: (t.rn <= 10)
Rows Removed by Filter: 48739
Buffers: shared hit=49617
-> WindowAgg (cost=0.44..42410.51 rows=48571 width=28) (actual time=0.052..62.473 rows=49739 loops=1)
Output: tbl.c1, tbl.c2, tbl.c3, tbl.c4, row_number() OVER (?)
Buffers: shared hit=49617
-> Index Scan using idx_tbl_1 on public.tbl (cost=0.44..41681.95 rows=48571 width=20) (actual time=0.045..43.892 rows=49739 loops=1)
Output: tbl.c1, tbl.c2, tbl.c3, tbl.c4
Index Cond: ((tbl.c3 = 1) AND (tbl.c1 < 100))
Buffers: shared hit=49617
Planning Time: 0.194 ms
Execution Time: 67.857 ms
(14 rows)
Response time: returning 1,000 products across 100 brands takes 68 milliseconds.
Returning 100,010 products across 10,001 brands takes 4,473 milliseconds.
select count(*) from
(select c1,c2,c3,c4,row_number() over (partition by c1) as rn from tbl where c3=1) t
where t.rn<=10 ;
count
--------
100010
(1 row)
Time: 4473.348 ms (00:04.473)
Example 2 - brand + product is not unique
1. Create the table. Here the brand + product combination is not unique, so duplicates have to be removed.
create table tbl(
c1 int, -- brand ID
c2 int, -- product ID
c3 int, -- another attribute, used as the filter condition in this example
c4 timestamp -- timestamp
);
2. Insert 50 million rows of test data.
insert into tbl
select random()*10000,
random()*100, -- use a small range of product IDs to create duplicates
random()*10,
clock_timestamp()
from generate_series(1,50000000);
3. Create an index to speed up retrieval: the filter column goes first and the brand column c1 goes last (this enables a SKIP-INDEX-SCAN-like ordered access path).
create index idx_tbl_1 on tbl(c3,c1,c2);
Recursion
with recursive
-- first brand matching the condition
tmp as (select c1 from tbl where c3=1 order by c3,c1 limit 1),
skip as (
(
-- initial query; it also computes and outputs the next brand ID
select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.*, -- deduplicate product IDs within a brand
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 -- matches the condition
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- SKIP-INDEX-SCAN-like access
) t_max
where tbl.c3=1 -- matches the condition
and tbl.c1=(select c1 from tmp) -- brand ID equals the "next brand" computed in the previous step
order by tbl.c3,tbl.c1,tbl.c2 limit 10 -- at most 10 products per brand; the sort order must match the DISTINCT ON clause
)
union all
(
-- recursive part
select tbl.*, -- product IDs within a brand are deduplicated in the LATERAL subquery below
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand to process in this step, computed in the previous step
LATERAL (select c1 from tbl
where c3=1 -- matches the condition
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) * from tbl -- 10 products of the brand currently being processed
where tbl.c3=1
and tbl.c1=s.t_max_c1 -- the current brand comes from t_max_c1 in the previous work-table row
and tbl.* is not null
order by tbl.c3,tbl.c1,tbl.c2 limit 10 -- 10 products per brand; the sort order must match the DISTINCT ON clause
) tbl
)
)
select * from skip
union all
-- last matching brand (the recursion above stops before emitting it, so it is appended here)
select tbl.*, t.* as t_max_c1 from
(select c1 from tbl where c3=1 order by c3,c1 desc limit 1) t,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.* from tbl
where
c3=1
and c1=t.c1
order by tbl.c3,tbl.c1,tbl.c2 limit 10) as tbl
;
select count(*) from
(
with recursive
-- first brand matching the condition
tmp as (select c1 from tbl where c3=1 order by c3,c1 limit 1),
skip as (
(
-- initial query; it also computes and outputs the next brand ID
select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.*, -- deduplicate product IDs within a brand
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 -- matches the condition
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- SKIP-INDEX-SCAN-like access
) t_max
where tbl.c3=1 -- matches the condition
and tbl.c1=(select c1 from tmp) -- brand ID equals the "next brand" computed in the previous step
order by tbl.c3,tbl.c1,tbl.c2 limit 10 -- at most 10 products per brand; the sort order must match the DISTINCT ON clause
)
union all
(
-- recursive part
select tbl.*, -- product IDs within a brand are deduplicated in the LATERAL subquery below
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand to process in this step, computed in the previous step
LATERAL (select c1 from tbl
where c3=1 -- matches the condition
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) * from tbl -- 10 products of the brand currently being processed
where tbl.c3=1
and tbl.c1=s.t_max_c1 -- the current brand comes from t_max_c1 in the previous work-table row
and tbl.* is not null
order by tbl.c3,tbl.c1,tbl.c2 limit 10 -- 10 products per brand; the sort order must match the DISTINCT ON clause
) tbl
)
)
select * from skip
union all
-- last matching brand (the recursion above stops before emitting it, so it is appended here)
select tbl.*, t.* as t_max_c1 from
(select c1 from tbl where c3=1 order by c3,c1 desc limit 1) t,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.* from tbl
where
c3=1
and c1=t.c1
order by tbl.c3,tbl.c1,tbl.c2 limit 10) as tbl
) t;
count
--------
100010
(1 row)
Time: 726.006 ms
Execution plan
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=155.74..155.75 rows=1 width=8) (actual time=931.733..931.734 rows=1 loops=1)
Output: count(*)
Buffers: shared hit=513268
-> Append (cost=139.05..154.24 rows=120 width=24) (actual time=0.079..919.313 rows=100010 loops=1)
Buffers: shared hit=513268
CTE tmp
-> Limit (cost=0.44..0.48 rows=1 width=8) (actual time=0.039..0.040 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Buffers: shared hit=4
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_2 (cost=0.44..180668.28 rows=5048333 width=8) (actual time=0.039..0.039 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Index Cond: (tbl_2.c3 = 1)
Heap Fetches: 1
Buffers: shared hit=4
CTE skip
-> Recursive Union (cost=0.92..138.57 rows=110 width=24) (actual time=0.077..861.314 rows=100000 loops=1)
Buffers: shared hit=513238
-> Limit (cost=0.92..12.22 rows=10 width=24) (actual time=0.077..0.140 rows=10 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4, t_max.c1
Buffers: shared hit=42
InitPlan 2 (returns $2)
-> CTE Scan on tmp (cost=0.00..0.02 rows=1 width=4) (actual time=0.042..0.043 rows=1 loops=1)
Output: tmp.c1
Buffers: shared hit=4
-> Unique (cost=0.90..570.17 rows=504 width=24) (actual time=0.076..0.137 rows=10 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4, t_max.c1
Buffers: shared hit=42
-> Nested Loop (cost=0.90..568.91 rows=504 width=24) (actual time=0.075..0.130 rows=31 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4, t_max.c1
Buffers: shared hit=42
-> Index Scan using idx_tbl_1 on public.tbl tbl_3 (cost=0.44..562.07 rows=504 width=20) (actual time=0.059..0.096 rows=31 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4
Index Cond: ((tbl_3.c3 = 1) AND (tbl_3.c1 = $2))
Buffers: shared hit=38
-> Materialize (cost=0.46..0.55 rows=1 width=4) (actual time=0.001..0.001 rows=1 loops=31)
Output: t_max.c1
Buffers: shared hit=4
-> Subquery Scan on t_max (cost=0.46..0.54 rows=1 width=4) (actual time=0.013..0.013 rows=1 loops=1)
Output: t_max.c1
Buffers: shared hit=4
-> Limit (cost=0.46..0.53 rows=1 width=8) (actual time=0.013..0.013 rows=1 loops=1)
Output: tbl_4.c1, tbl_4.c3
Buffers: shared hit=4
InitPlan 3 (returns $3)
-> CTE Scan on tmp tmp_1 (cost=0.00..0.02 rows=1 width=4) (actual time=0.000..0.001 rows=1 loops=1)
Output: tmp_1.c1
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_4 (cost=0.44..122423.77 rows=1682778 width=8) (actual time=0.012..0.012 rows=1 loops=1)
Output: tbl_4.c1, tbl_4.c3
Index Cond: ((tbl_4.c3 = 1) AND (tbl_4.c1 > $3))
Heap Fetches: 1
Buffers: shared hit=4
-> Nested Loop (cost=0.88..12.42 rows=10 width=24) (actual time=0.030..0.083 rows=10 loops=10000)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4, tbl_5.c1
Buffers: shared hit=513196
-> Nested Loop (cost=0.44..0.56 rows=1 width=8) (actual time=0.019..0.019 rows=1 loops=10000)
Output: skip_1.t_max_c1, tbl_5.c1
Buffers: shared hit=40011
-> Limit (cost=0.00..0.02 rows=1 width=4) (actual time=0.000..0.000 rows=1 loops=10000)
Output: skip_1.t_max_c1
-> WorkTable Scan on skip skip_1 (cost=0.00..2.00 rows=100 width=4) (actual time=0.000..0.000 rows=1 loops=10000)
Output: skip_1.t_max_c1
-> Limit (cost=0.44..0.51 rows=1 width=8) (actual time=0.018..0.018 rows=1 loops=10000)
Output: tbl_5.c1, tbl_5.c3
Buffers: shared hit=40011
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_5 (cost=0.44..122423.77 rows=1682778 width=8) (actual time=0.018..0.018 rows=1 loops=10000)
Output: tbl_5.c1, tbl_5.c3
Index Cond: ((tbl_5.c3 = 1) AND (tbl_5.c1 > skip_1.t_max_c1))
Heap Fetches: 9999
Buffers: shared hit=40011
-> Limit (cost=0.44..11.65 rows=10 width=20) (actual time=0.011..0.061 rows=10 loops=9999)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4
Buffers: shared hit=473185
-> Unique (cost=0.44..563.32 rows=502 width=20) (actual time=0.011..0.059 rows=10 loops=9999)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4
Buffers: shared hit=473185
-> Index Scan using idx_tbl_1 on public.tbl tbl_6 (cost=0.44..562.07 rows=502 width=20) (actual time=0.010..0.053 rows=44 loops=9999)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4
Index Cond: ((tbl_6.c3 = 1) AND (tbl_6.c1 = skip_1.t_max_c1))
Filter: (tbl_6.* IS NOT NULL)
Buffers: shared hit=473185
-> CTE Scan on skip (cost=0.00..2.20 rows=110 width=24) (actual time=0.079..901.862 rows=100000 loops=1)
Output: skip.c1, skip.c2, skip.c3, skip.c4, skip.t_max_c1
Buffers: shared hit=513238
-> Nested Loop (cost=0.88..12.29 rows=10 width=24) (actual time=0.057..0.086 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl.c1
Buffers: shared hit=30
-> Limit (cost=0.44..0.48 rows=1 width=8) (actual time=0.039..0.039 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Buffers: shared hit=4
-> Index Only Scan Backward using idx_tbl_1 on public.tbl (cost=0.44..180668.28 rows=5048333 width=8) (actual time=0.038..0.038 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Index Cond: (tbl.c3 = 1)
Heap Fetches: 1
Buffers: shared hit=4
-> Limit (cost=0.44..11.61 rows=10 width=20) (actual time=0.015..0.041 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Buffers: shared hit=26
-> Unique (cost=0.44..563.33 rows=504 width=20) (actual time=0.014..0.039 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Buffers: shared hit=26
-> Index Scan using idx_tbl_1 on public.tbl tbl_1 (cost=0.44..562.07 rows=504 width=20) (actual time=0.013..0.033 rows=23 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Index Cond: ((tbl_1.c3 = 1) AND (tbl_1.c1 = tbl.c1))
Buffers: shared hit=26
Planning Time: 0.648 ms
Execution Time: 933.819 ms
(106 rows)
UDF
create or replace function get_res() returns setof tbl as $$
declare
v_c1 int;
begin
-- initial recursion condition
select c1 into v_c1 from tbl where c3=1 order by c1 limit 1;
-- initial statement
return query select distinct on (c1,c2) * from tbl where c3=1 and c1=v_c1 order by c1,c2 limit 10;
loop
-- recursion condition
select c1 into v_c1 from tbl where c3=1 and c1>v_c1 order by c1 limit 1;
if not found then
return;
end if;
-- return the rows for this recursion step
return query select distinct on (c1,c2) * from tbl where c3=1 and c1=v_c1 order by c1,c2 limit 10;
end loop;
end;
$$ language plpgsql strict;
postgres=# select count(*) from get_res();
count
--------
100010
(1 row)
Time: 772.746 ms
Using window functions
select c1,c2,c3,c4 from
(
select c1,c2,c3,c4,row_number() over (partition by c1) as rn from
(select c1,c2,c3,c4,row_number() over (partition by c1,c2) as rn from tbl where c3=1) t
where t.rn=1
) t
where rn<=10;
select count(*) from
(
select c1,c2,c3,c4,row_number() over (partition by c1) as rn from
(select c1,c2,c3,c4,row_number() over (partition by c1,c2) as rn from tbl where c3=1) t
where t.rn=1
) t
where rn<=10;
count
--------
100010
(1 row)
Time: 4933.848 ms (00:04.934)
Example 3 - brand + product is not unique, and each execution should return a different set of products
Recursion + random ordering
Wrap each per-brand query in an outer query that orders the rows randomly.
with recursive
-- first brand matching the condition
tmp as (select c1 from tbl where c3=1 order by c3,c1 limit 1),
skip as (
(
-- initial query; it also computes and outputs the next brand ID
select * from (
select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.*, -- deduplicate product IDs within a brand
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 -- matches the condition
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- SKIP-INDEX-SCAN-like access
) t_max
where tbl.c3=1 -- matches the condition
and tbl.c1=(select c1 from tmp) -- brand ID equals the "next brand" computed in the previous step
order by tbl.c3,tbl.c1,tbl.c2
) t1
order by random() limit 10 -- at most 10 products per brand, picked at random
)
union all
(
-- recursive part
select * from (
select tbl.*, -- product IDs within a brand are deduplicated in the LATERAL subquery below
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand to process in this step, computed in the previous step
LATERAL (select c1 from tbl
where c3=1 -- matches the condition
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) * from tbl -- all deduplicated products of the brand currently being processed
where tbl.c3=1
and tbl.c1=s.t_max_c1 -- the current brand comes from t_max_c1 in the previous work-table row
and tbl.* is not null
order by tbl.c3,tbl.c1,tbl.c2 -- the sort order must match the DISTINCT ON clause
) tbl
) t2
order by random() limit 10
)
)
select * from skip
union all
-- last matching brand (the recursion above stops before emitting it, so it is appended here)
select * from (
select tbl.*, t.* as t_max_c1 from
(select c1 from tbl where c3=1 order by c3,c1 desc limit 1) t,
LATERAL ( select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.* from tbl
where
c3=1
and c1=t.c1
order by tbl.c3,tbl.c1,tbl.c2
) tbl
order by random() limit 10) as t3
;
select count(*) from
(
with recursive
-- first brand matching the condition
tmp as (select c1 from tbl where c3=1 order by c3,c1 limit 1),
skip as (
(
-- initial query; it also computes and outputs the next brand ID
select * from (
select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.*, -- deduplicate product IDs within a brand
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 -- matches the condition
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- SKIP-INDEX-SCAN-like access
) t_max
where tbl.c3=1 -- matches the condition
and tbl.c1=(select c1 from tmp) -- brand ID equals the "next brand" computed in the previous step
order by tbl.c3,tbl.c1,tbl.c2
) t1
order by random() limit 10 -- at most 10 products per brand, picked at random
)
union all
(
-- recursive part
select * from (
select tbl.*, -- product IDs within a brand are deduplicated in the LATERAL subquery below
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand to process in this step, computed in the previous step
LATERAL (select c1 from tbl
where c3=1 -- matches the condition
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) * from tbl -- all deduplicated products of the brand currently being processed
where tbl.c3=1
and tbl.c1=s.t_max_c1 -- the current brand comes from t_max_c1 in the previous work-table row
and tbl.* is not null
order by tbl.c3,tbl.c1,tbl.c2 -- the sort order must match the DISTINCT ON clause
) tbl
) t2
order by random() limit 10
)
)
select * from skip
union all
-- last matching brand (the recursion above stops before emitting it, so it is appended here)
select * from (
select tbl.*, t.* as t_max_c1 from
(select c1 from tbl where c3=1 order by c3,c1 desc limit 1) t,
LATERAL ( select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.* from tbl
where
c3=1
and c1=t.c1
order by tbl.c3,tbl.c1,tbl.c2
) tbl
order by random() limit 10) as t3
) t;
count
--------
100010
(1 row)
Time: 4758.684 ms (00:04.759)
Each loop iteration costs roughly 0.5 milliseconds more than the non-random version; over about 10,000 iterations that adds up to roughly 5 seconds.
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=7041.09..7041.10 rows=1 width=8) (actual time=5911.323..5911.323 rows=1 loops=1)
Output: count(*)
Buffers: shared hit=5074419
-> Append (cost=6450.62..7039.59 rows=120 width=24) (actual time=0.656..5899.867 rows=100010 loops=1)
Buffers: shared hit=5074419
CTE tmp
-> Limit (cost=0.44..0.48 rows=1 width=8) (actual time=0.040..0.040 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Buffers: shared hit=4
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_2 (cost=0.44..180668.28 rows=5048333 width=8) (actual time=0.039..0.039 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Index Cond: (tbl_2.c3 = 1)
Heap Fetches: 1
Buffers: shared hit=4
CTE skip
-> Recursive Union (cost=587.38..6450.14 rows=110 width=24) (actual time=0.654..5846.618 rows=100000 loops=1)
Buffers: shared hit=5074144
-> Subquery Scan on "*SELECT* 1" (cost=587.38..587.51 rows=10 width=24) (actual time=0.653..0.668 rows=10 loops=1)
Output: "*SELECT* 1".c1, "*SELECT* 1".c2, "*SELECT* 1".c3, "*SELECT* 1".c4, "*SELECT* 1".t_max_c1
Buffers: shared hit=285
-> Limit (cost=587.38..587.41 rows=10 width=32) (actual time=0.652..0.665 rows=10 loops=1)
Output: t1.c1, t1.c2, t1.c3, t1.c4, t1.t_max_c1, (random())
Buffers: shared hit=285
Compared with the non-random version, an explicit sort step is added here:
-> Sort (cost=587.38..588.64 rows=504 width=32) (actual time=0.651..0.652 rows=10 loops=1)
Output: t1.c1, t1.c2, t1.c3, t1.c4, t1.t_max_c1, (random())
Sort Key: (random())
Sort Method: top-N heapsort Memory: 26kB
Buffers: shared hit=285
-> Subquery Scan on t1 (cost=0.92..576.49 rows=504 width=32) (actual time=0.076..0.622 rows=97 loops=1)
Output: t1.c1, t1.c2, t1.c3, t1.c4, t1.t_max_c1, random()
Buffers: shared hit=285
Compared with the non-random version, the whole index range of the current brand has to be scanned here, so it takes longer:
-> Unique (cost=0.92..570.19 rows=504 width=24) (actual time=0.074..0.598 rows=97 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4, t_max.c1
Buffers: shared hit=285
InitPlan 2 (returns $2)
-> CTE Scan on tmp (cost=0.00..0.02 rows=1 width=4) (actual time=0.043..0.043 rows=1 loops=1)
Output: tmp.c1
Buffers: shared hit=4
-> Nested Loop (cost=0.90..568.91 rows=504 width=24) (actual time=0.073..0.552 rows=274 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4, t_max.c1
Buffers: shared hit=285
-> Index Scan using idx_tbl_1 on public.tbl tbl_3 (cost=0.44..562.07 rows=504 width=20) (actual time=0.058..0.380 rows=274 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4
Index Cond: ((tbl_3.c3 = 1) AND (tbl_3.c1 = $2))
Buffers: shared hit=281
-> Materialize (cost=0.46..0.55 rows=1 width=4) (actual time=0.000..0.000 rows=1 loops=274)
Output: t_max.c1
Buffers: shared hit=4
-> Subquery Scan on t_max (cost=0.46..0.54 rows=1 width=4) (actual time=0.013..0.013 rows=1 loops=1)
Output: t_max.c1
Buffers: shared hit=4
-> Limit (cost=0.46..0.53 rows=1 width=8) (actual time=0.012..0.012 rows=1 loops=1)
Output: tbl_4.c1, tbl_4.c3
Buffers: shared hit=4
InitPlan 3 (returns $3)
-> CTE Scan on tmp tmp_1 (cost=0.00..0.02 rows=1 width=4) (actual time=0.000..0.001 rows=1 loops=1)
Output: tmp_1.c1
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_4 (cost=0.44..122423.77 rows=1682778 width=8) (actual time=0.012..0.012 rows=1 loops=1)
Output: tbl_4.c1, tbl_4.c3
Index Cond: ((tbl_4.c3 = 1) AND (tbl_4.c1 > $3))
Heap Fetches: 1
Buffers: shared hit=4
-> Subquery Scan on "*SELECT* 2" (cost=586.03..586.15 rows=10 width=24) (actual time=0.577..0.581 rows=10 loops=10000)
Output: "*SELECT* 2".c1, "*SELECT* 2".c2, "*SELECT* 2".c3, "*SELECT* 2".c4, "*SELECT* 2".t_max_c1
Buffers: shared hit=5073859
-> Limit (cost=586.03..586.05 rows=10 width=32) (actual time=0.576..0.578 rows=10 loops=10000)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4, tbl_5.c1, (random())
Buffers: shared hit=5073859
-> Sort (cost=586.03..587.28 rows=502 width=32) (actual time=0.576..0.577 rows=10 loops=10000)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4, tbl_5.c1, (random())
Sort Key: (random())
Sort Method: quicksort Memory: 25kB
Buffers: shared hit=5073859
-> Nested Loop (cost=0.88..575.18 rows=502 width=32) (actual time=0.032..0.553 rows=100 loops=10000)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4, tbl_5.c1, random()
Buffers: shared hit=5073859
-> Nested Loop (cost=0.44..0.56 rows=1 width=8) (actual time=0.020..0.021 rows=1 loops=10000)
Output: skip_1.t_max_c1, tbl_5.c1
Buffers: shared hit=40011
-> Limit (cost=0.00..0.02 rows=1 width=4) (actual time=0.001..0.001 rows=1 loops=10000)
Output: skip_1.t_max_c1
-> WorkTable Scan on skip skip_1 (cost=0.00..2.00 rows=100 width=4) (actual time=0.000..0.000 rows=1 loops=10000)
Output: skip_1.t_max_c1
-> Limit (cost=0.44..0.51 rows=1 width=8) (actual time=0.019..0.019 rows=1 loops=10000)
Output: tbl_5.c1, tbl_5.c3
Buffers: shared hit=40011
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_5 (cost=0.44..122423.77 rows=1682778 width=8) (actual time=0.019..0.019 rows=1 loops=10000)
Output: tbl_5.c1, tbl_5.c3
Index Cond: ((tbl_5.c3 = 1) AND (tbl_5.c1 > skip_1.t_max_c1))
Heap Fetches: 9999
Buffers: shared hit=40011
-> Unique (cost=0.44..563.32 rows=502 width=20) (actual time=0.011..0.508 rows=100 loops=9999)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4
Buffers: shared hit=5033848
-> Index Scan using idx_tbl_1 on public.tbl tbl_6 (cost=0.44..562.07 rows=502 width=20) (actual time=0.011..0.444 rows=500 loops=9999)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4
Index Cond: ((tbl_6.c3 = 1) AND (tbl_6.c1 = skip_1.t_max_c1))
Filter: (tbl_6.* IS NOT NULL)
Buffers: shared hit=5033848
-> CTE Scan on skip (cost=0.00..2.20 rows=110 width=24) (actual time=0.655..5882.899 rows=100000 loops=1)
Output: skip.c1, skip.c2, skip.c3, skip.c4, skip.t_max_c1
Buffers: shared hit=5074144
-> Subquery Scan on t3 (cost=586.04..586.17 rows=10 width=24) (actual time=0.339..0.344 rows=10 loops=1)
Output: t3.c1, t3.c2, t3.c3, t3.c4, t3.c1_1
Buffers: shared hit=275
-> Limit (cost=586.04..586.07 rows=10 width=32) (actual time=0.338..0.340 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl.c1, (random())
Buffers: shared hit=275
-> Sort (cost=586.04..587.30 rows=504 width=32) (actual time=0.337..0.338 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl.c1, (random())
Sort Key: (random())
Sort Method: top-N heapsort Memory: 26kB
Buffers: shared hit=275
-> Nested Loop (cost=0.88..575.15 rows=504 width=32) (actual time=0.058..0.311 rows=93 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl.c1, random()
Buffers: shared hit=275
-> Limit (cost=0.44..0.48 rows=1 width=8) (actual time=0.041..0.041 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Buffers: shared hit=4
-> Index Only Scan Backward using idx_tbl_1 on public.tbl (cost=0.44..180668.28 rows=5048333 width=8) (actual time=0.041..0.041 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Index Cond: (tbl.c3 = 1)
Heap Fetches: 1
Buffers: shared hit=4
-> Unique (cost=0.44..563.33 rows=504 width=20) (actual time=0.014..0.246 rows=93 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Buffers: shared hit=271
-> Index Scan using idx_tbl_1 on public.tbl tbl_1 (cost=0.44..562.07 rows=504 width=20) (actual time=0.013..0.201 rows=268 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Index Cond: ((tbl_1.c3 = 1) AND (tbl_1.c1 = tbl.c1))
Buffers: shared hit=271
Planning Time: 0.686 ms
Execution Time: 5913.352 ms
(133 rows)
UDF
create or replace function get_res() returns setof tbl as $$
declare
v_c1 int;
begin
-- initial recursion condition
select c1 into v_c1 from tbl where c3=1 order by c1 limit 1;
-- initial statement
return query select * from (select distinct on (c1,c2) * from tbl where c3=1 and c1=v_c1 order by c1,c2) t order by random() limit 10;
loop
-- recursion condition
select c1 into v_c1 from tbl where c3=1 and c1>v_c1 order by c1 limit 1;
if not found then
return;
end if;
-- return the rows for this recursion step
return query select * from (select distinct on (c1,c2) * from tbl where c3=1 and c1=v_c1 order by c1,c2) t order by random() limit 10;
end loop;
end;
$$ language plpgsql strict;
postgres=# select count(*) from get_res();
count
--------
100010
(1 row)
Time: 3888.789 ms (00:03.889)
Window query
select count(*) from
(
select c1,c2,c3,c4,row_number() over (partition by c1 order by random()) as rn from
(select c1,c2,c3,c4,row_number() over (partition by c1,c2) as rn from tbl where c3=1) t
where t.rn=1
) t
where rn<=10;
count
--------
100010
(1 row)
Time: 5723.719 ms (00:05.724)
Performance summary
case | data volume | recursive SQL (ms) | UDF (ms) | window query (ms)
---|---|---|---|---
fetch 10 products for each of 100 brands (no dedup) | 50M rows, ~10,000 brands | 6.5 | 8.5 | 68
fetch 10 products for each of 10,000 brands (no dedup) | 50M rows, ~10,000 brands | 418 | 502 | 4473
fetch 10 products for each of 10,000 brands (dedup) | 50M rows, ~10,000 brands | 726 | 772 | 4933
fetch 10 products for each of 10,000 brands (dedup + random) | 50M rows, ~10,000 brands | 4758 | 3888 | 5723
Summary
Using recursion, we picked 10 products per brand out of 50 million rows, returning roughly 100,000 rows in about 418 milliseconds. With 1,000-row pages, a single page comes back in about 6.5 milliseconds.
To get the best result, the goal is to scan as few data blocks as possible, spend as little CPU on filtering as possible, and eliminate redundant scanning. That is why we used a SKIP-INDEX-SCAN-like approach and pushed the SQL to its limit: each recursion step scans only the rows it actually needs. (This works best when the recursion level is sparse; in this example the number of recursion steps equals the number of brands.)
《PostgreSQL Oracle 兼容性之 - INDEX SKIP SCAN (递归查询变态优化) 非驱动列索引扫描优化》
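As a compact reference, the following is a minimal, hedged sketch of the skip-scan recursion pattern used throughout this article, written against a hypothetical table t(grp int, val int) with an index on (grp, val); the column names and the per-group limit of 10 are placeholders to adapt. As in Example 1, the last group is not emitted by the recursion itself and would need to be appended separately if required.
-- minimal skip-scan recursion template (hypothetical table t(grp int, val int), index on (grp, val))
with recursive skip as (
  (
    -- anchor: rows of the first group, plus the key of the next group
    select g.*, nxt.grp as next_grp
    from (select grp from t order by grp limit 1) t0,
         lateral (select grp from t where grp > t0.grp order by grp limit 1) nxt,
         lateral (select * from t where t.grp = t0.grp order by grp, val limit 10) g
  )
  union all
  (
    -- recursive step: jump to the next group using the key carried in the work table
    select g.*, nxt.grp as next_grp
    from (select next_grp from skip limit 1) s,
         lateral (select grp from t where grp > s.next_grp order by grp limit 1) nxt,
         lateral (select * from t where t.grp = s.next_grp order by grp, val limit 10) g
  )
)
select * from skip;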
Similar scenarios and optimization techniques are covered in the following article:
《PostgreSQL 递归SQL的几个应用 - 极客与正常人的思维》
References
《PostgreSQL SELECT 的高级用法(CTE, LATERAL, ORDINALITY, WINDOW, SKIP LOCKED, DISTINCT, GROUPING SETS, …)》