PostgreSQL standby 在万兆网环境中缘何延迟? 如何解决?

4 minute read

背景

在一个这样的环境中测试PostgreSQL 9.5 rc1

主库

32C  
256G 内存  
18TB aliflash ssd (约50万 写iops处理能力)   
xfs   文件系统  
10GB 网卡 * 2 bond  
CentOS 6.x x64  

备库

32C  
256G 内存  
18TB aliflash ssd (约50万 写iops处理能力)   
pg_xlog   ext4   文件系统  
pgdata    ZFS   文件系统  
10GB 网卡 * 2 bond  
CentOS 6.x x64  

主备数据库参数:

listen_addresses = '0.0.0.0'            # what IP address(es) to listen on;  
port = 1921                             # (change requires restart)  
max_connections = 1000                  # (change requires restart)  
unix_socket_directories = '.'   # comma-separated list of directories  
shared_buffers = 32GB                   # min 128kB  
maintenance_work_mem = 512MB            # min 1MB  
autovacuum_work_mem = 512MB             # min 1MB, or -1 to use maintenance_work_mem  
dynamic_shared_memory_type = posix      # the default is the first option  
bgwriter_delay = 10ms                   # 10-10000ms between rounds  
wal_level = hot_standby  # minimal, archive, hot_standby, or logical  
fsync = on                              # turns forced synchronization on or off  
synchronous_commit = off                # synchronization level;  
max_wal_size = 32GB  
max_wal_senders = 10            # max number of walsender processes  
max_replication_slots = 10      # max number of replication slots  
hot_standby = on                        # "on" allows queries during recovery  
wal_receiver_status_interval = 1s       # send replies at least this often  
hot_standby_feedback = on               # send info from standby to prevent  
random_page_cost = 1.0                  # same scale as above  
effective_cache_size = 256GB  
log_destination = 'csvlog'              # Valid values are combinations of  
logging_collector = on          # Enable capturing of stderr and csvlog  
log_checkpoints = on  
log_connections = on  
log_disconnections = on  
log_error_verbosity = verbose           # terse, default, or verbose messages  
log_timezone = 'PRC'  
datestyle = 'iso, mdy'  
timezone = 'PRC'  
lc_messages = 'C'                       # locale for system error message  
lc_monetary = 'C'                       # locale for monetary formatting  
lc_numeric = 'C'                        # locale for number formatting  
lc_time = 'C'                           # locale for time formatting  
default_text_search_config = 'pg_catalog.english'  

测试模型

pgbench -i -s 1000

数据量 17TB

测试

pgbench -M prepared -n -r -P 1 -c 48 -j 48 -T 100000

300秒后主备延迟达到了39GB

postgres=# select pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_insert_location(),sent_location)),pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_insert_location(),replay_location)),* from pg_stat_replication ;  
-[ RECORD 1 ]----+------------------------------  
pg_size_pretty   | 39 GB  
pg_size_pretty   | 39 GB  
pid              | 57170  
usesysid         | 16384  
usename          | replica  
application_name | walreceiver  
client_addr      | 10.151.163.41  
client_hostname  |   
client_port      | 40941  
backend_start    | 2015-12-28 19:18:50.015165+08  
backend_xmin     |   
state            | catchup  
sent_location    | 19D/9A2A0000  
write_location   | 19D/99000000  
flush_location   | 19D/99000000  
replay_location  | 19D/97FFCAF0  
sync_priority    | 0  
sync_state       | async  

600秒后，延迟达到了58GB

postgres=# select pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_insert_location(),sent_location)),pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_insert_location(),replay_location)),* from pg_stat_replication ;  
-[ RECORD 1 ]----+------------------------------  
pg_size_pretty   | 58 GB  
pg_size_pretty   | 58 GB  
pid              | 57170  
usesysid         | 16384  
usename          | replica  
application_name | walreceiver  
client_addr      | 10.151.163.41  
client_hostname  |   
client_port      | 40941  
backend_start    | 2015-12-28 19:18:50.015165+08  
backend_xmin     |   
state            | catchup  
sent_location    | 1A8/6FAC0000  
write_location   | 1A8/6E000000  
flush_location   | 1A8/6E000000  
replay_location  | 1A8/6CFFC160  
sync_priority    | 0  
sync_state       | async  

网络传输始终保持在170MB/s左右。也就是说网络根本不是瓶颈。

24:55 PM     IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s  
24:56 PM      eth0     10.10      9.09      1.25      1.53      0.00      0.00      0.00  
24:56 PM      eth1   1321.21 116254.55     85.99 171845.97      0.00      0.00      0.00  
24:56 PM     bond0   1331.31 116263.64     87.23 171847.50      0.00      0.00      0.00  
24:56 PM        lo    119.19    119.19     83.20     83.20      0.00      0.00      0.00  

备库的iostat

sda是存放XLOG的盘

dfa,dfb,dfc是zfs，存放pgdata

Device:         rrqm/s   wrqm/s     r/s     w/s   rsec/s   wsec/s avgrq-sz avgqu-sz   await  svctm  %util  
sda               0.00   352.00    0.00  124.00     0.00 656448.00  5293.94     1.75   14.21   4.18  51.80  
sdb               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00  
dfa               0.00     0.00  874.00  694.00 223744.00 134352.00   228.38     0.35    0.22   0.20  32.10  
dfb               0.00     0.00  744.00  841.00 189984.00 128520.00   200.95     0.31    0.19   0.18  28.50  
dfc               0.00     0.00  854.00  643.00 217664.00 133376.00   234.50     0.38    0.25   0.23  34.60  

从util来看，磁盘也不是瓶颈。

观察主备的CPU开销也正常。

问题出在哪里呢？

通过oprofile可以得知在fsync上耗费较多。

原因是备库xlog落盘调用了fsync，而主库XLOG是异步的。

代码如下，

src/backend/replication/walreceiver.c

/*  
 * Flush the log to disk.  
 *  
 * If we're in the midst of dying, it's unwise to do anything that might throw  
 * an error, so we skip sending a reply in that case.  
 */  
static void  
XLogWalRcvFlush(bool dying)  
{  
        if (LogstreamResult.Flush < LogstreamResult.Write)  
        {  
                /* use volatile pointer to prevent code rearrangement */  
                volatile WalRcvData *walrcv = WalRcv;  
  
                issue_xlog_fsync(recvFile, recvSegNo);  
......  

src/backend/access/transam/xlog.c

/*  
 * Issue appropriate kind of fsync (if any) for an XLOG output file.  
 *  
 * 'fd' is a file descriptor for the XLOG file to be fsync'd.  
 * 'log' and 'seg' are for error reporting purposes.  
 */  
void  
issue_xlog_fsync(int fd, XLogSegNo segno)  
{  
        switch (sync_method)  
        {  
                case SYNC_METHOD_FSYNC:  
                        if (pg_fsync_no_writethrough(fd) != 0)  
                                ereport(PANIC,  
                                                (errcode_for_file_access(),  
                                                 errmsg("could not fsync log file %s: %m",  
                                                                XLogFileNameP(ThisTimeLineID, segno))));  
                        break;  
#ifdef HAVE_FSYNC_WRITETHROUGH  
                case SYNC_METHOD_FSYNC_WRITETHROUGH:  
                        if (pg_fsync_writethrough(fd) != 0)  
                                ereport(PANIC,  
                                                (errcode_for_file_access(),  
                                          errmsg("could not fsync write-through log file %s: %m",  
                                                         XLogFileNameP(ThisTimeLineID, segno))));  
                        break;  
#endif  
#ifdef HAVE_FDATASYNC  
                case SYNC_METHOD_FDATASYNC:  
                        if (pg_fdatasync(fd) != 0)  
                                ereport(PANIC,  
                                                (errcode_for_file_access(),  
                                                 errmsg("could not fdatasync log file %s: %m",  
                                                                XLogFileNameP(ThisTimeLineID, segno))));  
                        break;  
#endif  
                case SYNC_METHOD_OPEN:  
                case SYNC_METHOD_OPEN_DSYNC:  
                        /* write synced it already */  
                        break;  
                default:  
                        elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);  
                        break;  
        }  
}  

src/backend/storage/file/fd.c

/*  
 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off  
 *  
 * Not all platforms have fdatasync; treat as fsync if not available.  
 */  
int  
pg_fdatasync(int fd)  
{  
        if (enableFsync)  
        {  
#ifdef HAVE_FDATASYNC  
                return fdatasync(fd);  
#else  
                return fsync(fd);  
#endif  
        }  
        else  
                return 0;  
}  

备库关闭fsync后，问题得到解决。

fsync关闭后是不安全的，因为影响所有的pg fsync操作，包括检查点，数据文件。

所以要改进一下。

配置备库postgresql.conf

fsync=off

重启备库。

重新测试，问题消除。

digoal’s 大量PostgreSQL文章入口

Twitter Facebook Google+ LinkedIn

Digoal.zhou

PostgreSQL standby 在万兆网环境中缘何延迟? 如何解决?

背景

digoal’s 大量PostgreSQL文章入口

You May Also Enjoy

PostgreSQL(PPAS 兼容Oracle) 从零开始入门手册 - 珍藏版

PostgreSQL pipelinedb 流计算插件 - IoT应用 - 实时轨迹聚合

PostgreSQL plpgsql 存储过程、函数 - 状态、异常变量打印、异常捕获… - GET [STACKED] DIAGNOSTICS

PostgreSQL datediff 日期间隔（单位转换）兼容SQL用法