PostgreSQL standby 在万兆网环境中缘何 延迟? 如何解决?
背景
在一个这样的环境中测试PostgreSQL 9.5 rc1
主库
32C
256G 内存
18TB aliflash ssd (约50万 写iops处理能力)
xfs 文件系统
10GB 网卡 * 2 bond
CentOS 6.x x64
备库
32C
256G 内存
18TB aliflash ssd (约50万 写iops处理能力)
pg_xlog ext4 文件系统
pgdata ZFS 文件系统
10GB 网卡 * 2 bond
CentOS 6.x x64
主备数据库参数:
listen_addresses = '0.0.0.0' # what IP address(es) to listen on;
port = 1921 # (change requires restart)
max_connections = 1000 # (change requires restart)
unix_socket_directories = '.' # comma-separated list of directories
shared_buffers = 32GB # min 128kB
maintenance_work_mem = 512MB # min 1MB
autovacuum_work_mem = 512MB # min 1MB, or -1 to use maintenance_work_mem
dynamic_shared_memory_type = posix # the default is the first option
bgwriter_delay = 10ms # 10-10000ms between rounds
wal_level = hot_standby # minimal, archive, hot_standby, or logical
fsync = on # turns forced synchronization on or off
synchronous_commit = off # synchronization level;
max_wal_size = 32GB
max_wal_senders = 10 # max number of walsender processes
max_replication_slots = 10 # max number of replication slots
hot_standby = on # "on" allows queries during recovery
wal_receiver_status_interval = 1s # send replies at least this often
hot_standby_feedback = on # send info from standby to prevent
random_page_cost = 1.0 # same scale as above
effective_cache_size = 256GB
log_destination = 'csvlog' # Valid values are combinations of
logging_collector = on # Enable capturing of stderr and csvlog
log_checkpoints = on
log_connections = on
log_disconnections = on
log_error_verbosity = verbose # terse, default, or verbose messages
log_timezone = 'PRC'
datestyle = 'iso, mdy'
timezone = 'PRC'
lc_messages = 'C' # locale for system error message
lc_monetary = 'C' # locale for monetary formatting
lc_numeric = 'C' # locale for number formatting
lc_time = 'C' # locale for time formatting
default_text_search_config = 'pg_catalog.english'
测试模型
pgbench -i -s 1000
数据量 17TB
测试
pgbench -M prepared -n -r -P 1 -c 48 -j 48 -T 100000
300秒后主备延迟达到了39GB
postgres=# select pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_insert_location(),sent_location)),pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_insert_location(),replay_location)),* from pg_stat_replication ;
-[ RECORD 1 ]----+------------------------------
pg_size_pretty | 39 GB
pg_size_pretty | 39 GB
pid | 57170
usesysid | 16384
usename | replica
application_name | walreceiver
client_addr | 10.151.163.41
client_hostname |
client_port | 40941
backend_start | 2015-12-28 19:18:50.015165+08
backend_xmin |
state | catchup
sent_location | 19D/9A2A0000
write_location | 19D/99000000
flush_location | 19D/99000000
replay_location | 19D/97FFCAF0
sync_priority | 0
sync_state | async
600秒后,延迟达到了58GB
postgres=# select pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_insert_location(),sent_location)),pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_insert_location(),replay_location)),* from pg_stat_replication ;
-[ RECORD 1 ]----+------------------------------
pg_size_pretty | 58 GB
pg_size_pretty | 58 GB
pid | 57170
usesysid | 16384
usename | replica
application_name | walreceiver
client_addr | 10.151.163.41
client_hostname |
client_port | 40941
backend_start | 2015-12-28 19:18:50.015165+08
backend_xmin |
state | catchup
sent_location | 1A8/6FAC0000
write_location | 1A8/6E000000
flush_location | 1A8/6E000000
replay_location | 1A8/6CFFC160
sync_priority | 0
sync_state | async
网络传输始终保持在170MB/s左右。也就是说网络根本不是瓶颈。
07:24:55 PM IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s
07:24:56 PM eth0 10.10 9.09 1.25 1.53 0.00 0.00 0.00
07:24:56 PM eth1 1321.21 116254.55 85.99 171845.97 0.00 0.00 0.00
07:24:56 PM bond0 1331.31 116263.64 87.23 171847.50 0.00 0.00 0.00
07:24:56 PM lo 119.19 119.19 83.20 83.20 0.00 0.00 0.00
备库的iostat
sda是存放XLOG的盘
dfa,dfb,dfc是zfs,存放pgdata
Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util
sda 0.00 352.00 0.00 124.00 0.00 656448.00 5293.94 1.75 14.21 4.18 51.80
sdb 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dfa 0.00 0.00 874.00 694.00 223744.00 134352.00 228.38 0.35 0.22 0.20 32.10
dfb 0.00 0.00 744.00 841.00 189984.00 128520.00 200.95 0.31 0.19 0.18 28.50
dfc 0.00 0.00 854.00 643.00 217664.00 133376.00 234.50 0.38 0.25 0.23 34.60
从util来看,磁盘也不是瓶颈。
观察主备的CPU开销也正常。
问题出在哪里呢?
通过oprofile可以得知在fsync上耗费较多。
原因是备库xlog落盘调用了fsync,而主库XLOG是异步的。
代码如下,
src/backend/replication/walreceiver.c
/*
* Flush the log to disk.
*
* If we're in the midst of dying, it's unwise to do anything that might throw
* an error, so we skip sending a reply in that case.
*/
static void
XLogWalRcvFlush(bool dying)
{
if (LogstreamResult.Flush < LogstreamResult.Write)
{
/* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = WalRcv;
issue_xlog_fsync(recvFile, recvSegNo);
......
src/backend/access/transam/xlog.c
/*
* Issue appropriate kind of fsync (if any) for an XLOG output file.
*
* 'fd' is a file descriptor for the XLOG file to be fsync'd.
* 'log' and 'seg' are for error reporting purposes.
*/
void
issue_xlog_fsync(int fd, XLogSegNo segno)
{
switch (sync_method)
{
case SYNC_METHOD_FSYNC:
if (pg_fsync_no_writethrough(fd) != 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not fsync log file %s: %m",
XLogFileNameP(ThisTimeLineID, segno))));
break;
#ifdef HAVE_FSYNC_WRITETHROUGH
case SYNC_METHOD_FSYNC_WRITETHROUGH:
if (pg_fsync_writethrough(fd) != 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not fsync write-through log file %s: %m",
XLogFileNameP(ThisTimeLineID, segno))));
break;
#endif
#ifdef HAVE_FDATASYNC
case SYNC_METHOD_FDATASYNC:
if (pg_fdatasync(fd) != 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not fdatasync log file %s: %m",
XLogFileNameP(ThisTimeLineID, segno))));
break;
#endif
case SYNC_METHOD_OPEN:
case SYNC_METHOD_OPEN_DSYNC:
/* write synced it already */
break;
default:
elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
break;
}
}
src/backend/storage/file/fd.c
/*
* pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
*
* Not all platforms have fdatasync; treat as fsync if not available.
*/
int
pg_fdatasync(int fd)
{
if (enableFsync)
{
#ifdef HAVE_FDATASYNC
return fdatasync(fd);
#else
return fsync(fd);
#endif
}
else
return 0;
}
备库关闭fsync后,问题得到解决。
fsync关闭后是不安全的,因为影响所有的pg fsync操作,包括检查点,数据文件。
所以要改进一下。
配置备库postgresql.conf
fsync=off
重启备库。
重新测试,问题消除。