PostgreSQL 10 内置分区 vs pg_pathman perf profiling
背景
PostgreSQL 10内置分区的性能不如pg_pathman分区插件的性能。有非常大的优化空间,那么是什么导致了分区的性能问题呢?
编译PostgreSQL 10.0
1、编译、打开debug
CFLAGS="-g -ggdb -fno-omit-frame-pointer" ./configure --prefix=/home/digoal/pgsql10.0
CFLAGS="-g -ggdb -fno-omit-frame-pointer" make world -j 128
CFLAGS="-g -ggdb -fno-omit-frame-pointer" make install-world
2、初始化集群
initdb -D $PGDATA -U postgres -E SQL_ASCII --locale=C
3、配置postgresql.conf
listen_addresses = '0.0.0.0'
port = 1921
max_connections = 400
superuser_reserved_connections = 3
unix_socket_directories = '.'
shared_buffers = 128GB
work_mem = 1024MB
maintenance_work_mem = 4GB
dynamic_shared_memory_type = posix
vacuum_cost_delay = 0
bgwriter_delay = 10ms
bgwriter_flush_after = 0
max_worker_processes = 128
max_parallel_workers_per_gather = 0
max_parallel_workers = 128
wal_level = minimal
synchronous_commit = off
full_page_writes = off
wal_buffers = 1GB
wal_writer_delay = 10ms
wal_writer_flush_after = 0
checkpoint_timeout = 55min
max_wal_size = 128GB
min_wal_size = 80MB
checkpoint_completion_target = 0.01
checkpoint_flush_after = 0
max_wal_senders = 0
parallel_tuple_cost = 0
parallel_setup_cost = 0
min_parallel_table_scan_size = 0
min_parallel_index_scan_size = 0
effective_cache_size = 400GB
log_destination = 'csvlog'
logging_collector = on
log_truncate_on_rotation = on
log_timezone = 'PRC'
log_autovacuum_min_duration = 0
datestyle = 'iso, mdy'
timezone = 'PRC'
lc_messages = 'C'
lc_monetary = 'C'
lc_numeric = 'C'
lc_time = 'C'
default_text_search_config = 'pg_catalog.english'
创建range和list分区,用于测试
1. range 分区表
1.1 主表
create table t_range(id int, info text, crt_time timestamp) partition by range (id);
1.2 分区
do language plpgsql $$
declare
i int;
begin
for i in 0..127 loop
execute 'create table t_range_'||i||' partition of t_range for values from ('||i*100||') to ('||(i+1)*100||')';
end loop;
end;
$$;
2. list 分区表
2.1 主表
create table t_list(id int, info text, crt_time timestamp) partition by list ( mod(hashtext(info), 128) );
2.2 分区
do language plpgsql $$
declare
i int;
begin
for i in 0..127 loop
execute 'create table t_list_'||i||' partition of t_list for values in ('||i||')';
end loop;
end;
$$;
创建测试脚本
1、测试范围分区
vi test1.sql
\set id random(0,12799)
insert into t_range values (:id, 'test', now());
2、测试LIST分区
vi test2.sql
\set id random(0,127)
insert into t_list values (:id, 'test', now());
压测
pgbench -M prepared -n -r -P 1 -f ./test1.sql -c 64 -j 64 -T 12800
pgbench -M prepared -n -r -P 1 -f ./test2.sql -c 64 -j 64 -T 12800
progress: 14.0 s, 21361.0 tps, lat 2.996 ms stddev 0.848
progress: 15.0 s, 21302.0 tps, lat 3.004 ms stddev 0.868
progress: 16.0 s, 21349.0 tps, lat 2.998 ms stddev 0.852
progress: 17.0 s, 21366.9 tps, lat 2.996 ms stddev 0.858
progress: 18.0 s, 21346.0 tps, lat 2.998 ms stddev 0.856
收集profiling
perf record -ag
生成perf报告
1、
perf report -g
结果
Samples: 1M of event 'cycles', Event count (approx.): 662397686992
Children Self Command Shared Object Symbol
+ 88.63% 0.00% postgres libc-2.17.so [.] __libc_start_main
+ 88.63% 0.00% postgres postgres [.] startup_hacks
+ 88.63% 0.00% postgres postgres [.] PostmasterMain
+ 88.49% 0.00% postgres postgres [.] ServerLoop
+ 88.49% 0.00% postgres postgres [.] BackendStartup
+ 88.47% 0.00% postgres postgres [.] ExitPostmaster
+ 88.42% 0.04% postgres postgres [.] PostgresMain
+ 74.92% 0.02% postgres postgres [.] exec_execute_message
+ 74.77% 0.01% postgres postgres [.] PortalRun
+ 74.75% 0.02% postgres postgres [.] PortalRunMulti
+ 74.57% 0.01% postgres postgres [.] ProcessQuery
+ 71.56% 0.01% postgres postgres [.] ExecutorStart
+ 71.54% 0.01% postgres postgres [.] standard_ExecutorStart
+ 71.46% 0.02% postgres postgres [.] InitPlan
+ 71.26% 0.02% postgres postgres [.] ExecInitNode
+ 71.23% 0.05% postgres postgres [.] ExecInitModifyTable
+ 70.34% 0.23% postgres postgres [.] ExecSetupPartitionTupleRouting
+ 45.89% 0.23% postgres postgres [.] find_all_inheritors
+ 42.42% 0.28% postgres postgres [.] find_inheritance_children
+ 23.59% 0.08% postgres postgres [.] LockRelationOid
+ 23.33% 0.04% postgres postgres [.] LockAcquire
+ 22.88% 0.79% postgres postgres [.] LockAcquireExtended
+ 12.59% 1.07% postgres postgres [.] InitResultRelInfo
+ 12.10% 0.26% postgres postgres [.] LWLockAcquire
+ 11.50% 0.08% postgres postgres [.] RelationGetPartitionQual
+ 10.99% 0.01% postgres postgres [.] CommitTransaction
+ 10.89% 1.14% postgres postgres [.] _copyList
+ 10.58% 0.05% postgres postgres [.] systable_getnext
+ 10.51% 0.05% postgres postgres [.] index_getnext
+ 10.50% 6.88% postgres postgres [.] hash_search_with_hash_value
+ 10.35% 0.01% postgres postgres [.] ResourceOwnerRelease
+ 10.34% 0.03% postgres postgres [.] ResourceOwnerReleaseInternal
+ 10.29% 0.01% postgres postgres [.] ProcReleaseLocks
+ 10.22% 1.84% postgres postgres [.] LockReleaseAll
+ 9.46% 0.52% postgres postgres [.] LWLockRelease
+ 9.00% 0.21% postgres postgres [.] index_fetch_heap
+ 8.60% 0.01% postgres [kernel.kallsyms] [k] system_call_fastpath
+ 8.34% 0.23% postgres postgres [.] SearchSysCache
+ 7.98% 0.27% swapper [kernel.kallsyms] [k] cpu_startup_entry
+ 7.86% 0.00% swapper [kernel.kallsyms] [k] start_secondary
+ 7.73% 0.04% postgres [kernel.kallsyms] [k] sys_futex
+ 7.68% 0.05% postgres [kernel.kallsyms] [k] do_futex
+ 7.43% 5.59% postgres postgres [.] AllocSetAlloc
+ 7.27% 3.52% postgres postgres [.] SearchCatCache
+ 7.03% 0.22% postgres postgres [.] _copyOpExpr
+ 6.15% 0.36% postgres postgres [.] LWLockWakeup
+ 5.93% 0.38% postgres postgres [.] RelationGetPartitionDispatchInfo
+ 5.62% 0.14% postgres postgres [.] LockBuffer
+ 5.50% 0.28% postgres postgres [.] hash_search
2、
perf report --stdio -g
结果
45.89% 0.23% postgres postgres [.] find_all_inheritors
|
---find_all_inheritors
|
|--99.99%-- ExecSetupPartitionTupleRouting
| ExecInitModifyTable
42.42% 0.28% postgres postgres [.] find_inheritance_children
|
---find_inheritance_children
|
|--99.97%-- find_all_inheritors
| ExecSetupPartitionTupleRouting
| ExecInitModifyTable
| ExecInitNode
23.59% 0.08% postgres postgres [.] LockRelationOid
|
---LockRelationOid
|
|--98.67%-- find_inheritance_children
| find_all_inheritors
| ExecSetupPartitionTupleRouting
| ExecInitModifyTable
23.33% 0.04% postgres postgres [.] LockAcquire
|
---LockAcquire
|
|--98.59%-- LockRelationOid
| |
| |--98.82%-- find_inheritance_children
| | find_all_inheritors
| | ExecSetupPartitionTupleRouting
| | ExecInitModifyTable
| | ExecInitNode
11.50% 0.08% postgres postgres [.] RelationGetPartitionQual
|
---RelationGetPartitionQual
|
|--99.75%-- InitResultRelInfo
| |
| |--99.99%-- ExecSetupPartitionTupleRouting
| | ExecInitModifyTable
| | ExecInitNode
| | InitPlan
| | standard_ExecutorStart
| | ExecutorStart
11.42% 0.07% postgres postgres [.] generate_partition_qual
|
---generate_partition_qual
|
|--99.90%-- RelationGetPartitionQual
| InitResultRelInfo
| ExecSetupPartitionTupleRouting
| ExecInitModifyTable
| ExecInitNode
| InitPlan
| standard_ExecutorStart
| ExecutorStart
pg_pathman perf profiling
postgres=# CREATE EXTENSION pg_pathman;
CREATE EXTENSION
postgres=# create table tbl_range(id int not null, info text, crt_time timestamp);
CREATE TABLE
postgres=# select create_range_partitions('tbl_range', 'id', 0, 100, 128);
create_range_partitions
-------------------------
128
(1 row)
压测,pg_pathman 是native partition 的10几倍性能。
progress: 11.0 s, 262159.5 tps, lat 0.244 ms stddev 0.135
progress: 12.0 s, 260462.8 tps, lat 0.246 ms stddev 0.124
progress: 13.0 s, 259761.5 tps, lat 0.246 ms stddev 0.130
progress: 14.0 s, 271007.1 tps, lat 0.236 ms stddev 0.118
profiling
perf report -g
Samples: 1M of event 'cycles', Event count (approx.): 529675912799
Children Self Command Shared Object Symbol
+ 78.90% 0.00% postgres libc-2.17.so [.] __libc_start_main
+ 78.90% 0.00% postgres postgres [.] startup_hacks
+ 78.90% 0.00% postgres postgres [.] PostmasterMain
+ 78.00% 0.00% postgres postgres [.] ServerLoop
+ 78.00% 0.00% postgres postgres [.] BackendStartup
+ 78.00% 0.00% postgres postgres [.] ExitPostmaster
+ 77.72% 0.24% postgres postgres [.] PostgresMain
+ 48.13% 0.13% postgres postgres [.] exec_execute_message
+ 47.16% 0.07% postgres postgres [.] PortalRun
+ 47.03% 0.11% postgres postgres [.] PortalRunMulti
+ 44.80% 0.07% postgres postgres [.] ProcessQuery
+ 29.89% 0.01% postgres postgres [.] ExecutorRun
+ 29.87% 0.06% postgres postgres [.] standard_ExecutorRun
+ 29.78% 0.04% postgres postgres [.] ExecutePlan
+ 29.51% 0.03% postgres postgres [.] ExecProcNode
+ 29.47% 0.04% postgres postgres [.] ExecProcNodeFirst
+ 29.41% 0.14% postgres postgres [.] ExecModifyTable
+ 20.70% 0.13% postgres postgres [.] ExecInsert
+ 12.84% 0.02% postgres postgres [.] finish_xact_command
+ 12.80% 0.05% postgres postgres [.] CommitTransactionCommand
+ 12.21% 0.08% postgres postgres [.] CommitTransaction
+ 12.19% 0.05% postgres postgres [.] ExecutorStart
+ 12.09% 0.04% postgres postgres [.] standard_ExecutorStart
+ 11.57% 0.11% postgres postgres [.] InitPlan
+ 10.91% 0.10% postgres postgres [.] ExecConstraints
+ 10.75% 0.07% postgres postgres [.] ExecRelCheck
+ 10.19% 0.16% postgres postgres [.] ExecInitNode
+ 10.04% 0.25% postgres postgres [.] ExecInitModifyTable
+ 10.03% 0.01% postgres [kernel.kallsyms] [k] system_call_fastpath
+ 9.07% 0.11% postgres postgres [.] heap_insert
+ 8.82% 0.11% postgres postgres [.] ExecInitCustomScan
+ 8.46% 0.22% postgres postgres [.] exec_bind_message
+ 8.32% 0.03% postgres postgres [.] ExecProcNode
+ 8.27% 0.06% postgres postgres [.] ExecCustomScan
+ 8.14% 0.09% postgres pg_pathman.so [.] partition_filter_exec
+ 7.07% 0.25% swapper [kernel.kallsyms] [k] cpu_startup_entry
+ 6.99% 0.00% swapper [kernel.kallsyms] [k] start_secondary
+ 6.45% 0.04% postgres postgres [.] stringToNode
+ 6.45% 0.04% postgres pg_pathman.so [.] select_partition_for_insert
+ 6.40% 0.16% postgres postgres [.] nodeRead
+ 6.15% 0.21% postgres postgres [.] parseNodeString
+ 6.02% 0.00% pgbench libpthread-2.17.so [.] start_thread
+ 5.98% 0.04% postgres postgres [.] _readBoolExpr
+ 5.92% 0.05% postgres pg_pathman.so [.] partition_filter_begin
+ 5.21% 0.08% postgres postgres [.] _readOpExpr
+ 4.77% 0.05% postgres postgres [.] XLogInsert
+ 4.74% 0.05% postgres [kernel.kallsyms] [k] sys_futex
+ 4.68% 0.06% postgres [kernel.kallsyms] [k] do_futex
+ 4.64% 0.24% postgres postgres [.] LWLockAcquire
+ 4.42% 4.38% postgres postgres [.] pg_strtok
+ 4.34% 0.10% postgres postgres [.] XLogInsertRecord
+ 4.22% 0.01% pgbench [kernel.kallsyms] [k] system_call_fastpath
+ 4.10% 0.08% postgres postgres [.] RecordTransactionCommit
perf report --stdio -g
分析
从现象看,PG 10 native partition应该是find_all_inheritors效率问题,relation锁时间过长。性能只有pg_pathman的十几分之一。
native partition 的罪魁祸首:
/*
* find_all_inheritors -
* Returns a list of relation OIDs including the given rel plus
* all relations that inherit from it, directly or indirectly.
* Optionally, it also returns the number of parents found for
* each such relation within the inheritance tree rooted at the
* given rel.
*
* The specified lock type is acquired on all child relations (but not on the
* given rel; caller should already have locked it). If lockmode is NoLock
* then no locks are acquired, but caller must beware of race conditions
* against possible DROPs of child relations.
*/
List *
find_all_inheritors(Oid parentrelId, LOCKMODE lockmode, List **numparents)
{
/* hash table for O(1) rel_oid -> rel_numparents cell lookup */
HTAB *seen_rels;
HASHCTL ctl;
List *rels_list,
*rel_numparents;
ListCell *l;
memset(&ctl, 0, sizeof(ctl));
ctl.keysize = sizeof(Oid);
ctl.entrysize = sizeof(SeenRelsEntry);
ctl.hcxt = CurrentMemoryContext;
seen_rels = hash_create("find_all_inheritors temporary table",
32, /* start small and extend */
&ctl,
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
/*
* We build a list starting with the given rel and adding all direct and
* indirect children. We can use a single list as both the record of
* already-found rels and the agenda of rels yet to be scanned for more
* children. This is a bit tricky but works because the foreach() macro
* doesn't fetch the next list element until the bottom of the loop.
*/
rels_list = list_make1_oid(parentrelId);
rel_numparents = list_make1_int(0);
foreach(l, rels_list)
{
Oid currentrel = lfirst_oid(l);
List *currentchildren;
ListCell *lc;
/* Get the direct children of this rel */
currentchildren = find_inheritance_children(currentrel, lockmode);
/*
* Add to the queue only those children not already seen. This avoids
* making duplicate entries in case of multiple inheritance paths from
* the same parent. (It'll also keep us from getting into an infinite
* loop, though theoretically there can't be any cycles in the
* inheritance graph anyway.)
*/
foreach(lc, currentchildren)
{
Oid child_oid = lfirst_oid(lc);
bool found;
SeenRelsEntry *hash_entry;
hash_entry = hash_search(seen_rels, &child_oid, HASH_ENTER, &found);
if (found)
{
/* if the rel is already there, bump number-of-parents counter */
lfirst_int(hash_entry->numparents_cell)++;
}
else
{
/* if it's not there, add it. expect 1 parent, initially. */
rels_list = lappend_oid(rels_list, child_oid);
rel_numparents = lappend_int(rel_numparents, 1);
hash_entry->numparents_cell = rel_numparents->tail;
}
}
}
if (numparents)
*numparents = rel_numparents;
else
list_free(rel_numparents);
hash_destroy(seen_rels);
return rels_list;
}
native partition的执行计划:
postgres=# explain (analyze,verbose,timing,costs,buffers) insert into t_range values (1);
QUERY PLAN
-------------------------------------------------------------------------------------------------------
Insert on public.t_range (cost=0.00..0.01 rows=1 width=44) (actual time=0.060..0.060 rows=0 loops=1)
Buffers: shared read=2 dirtied=1
-> Result (cost=0.00..0.01 rows=1 width=44) (actual time=0.001..0.001 rows=1 loops=1)
Output: 1, NULL::text, NULL::timestamp without time zone
Planning time: 0.036 ms
Execution time: 6.330 ms
(6 rows)
pg_pathman的执行计划:
postgres=# explain (analyze,verbose,timing,costs,buffers) insert into tbl_range values (1);
QUERY PLAN
------------------------------------------------------------------------------------------------------------------
Insert on public.tbl_range (cost=0.00..0.01 rows=1 width=44) (actual time=0.214..0.214 rows=0 loops=1)
Buffers: shared hit=19
-> Custom Scan (PartitionFilter) (cost=0.00..0.01 rows=1 width=44) (actual time=0.067..0.068 rows=1 loops=1)
Output: id, info, crt_time
Buffers: shared hit=8
-> Result (cost=0.00..0.01 rows=1 width=44) (actual time=0.001..0.001 rows=1 loops=1)
Output: 1, NULL::text, NULL::timestamp without time zone
Planning time: 5.177 ms
Execution time: 0.287 ms
(9 rows)
参考
《PostgreSQL 10.0 preview 功能增强 - 内置分区表》
《[未完待续] PostgreSQL sharding 套件(pg_pathman, postgres_fdw, logical replication)》
《PostgreSQL 9.5+ 高效分区表实现 - pg_pathman》
《pg_pathman extension for postgresql partitioning》