PostgreSQL can continue when postmaster killed

5 minute read

背景

数据库主进程挂了,原有的连接还能继续操作数据库,你信吗?PostgreSQL可以做到,并且原来的进程对数据库的操作是持久化的,不会丢数据哦。这得益于PostgreSQL的进程结构设计。而且postmaster进程只负责了简单的工作,例如监听端口。

(有人会问了,wal writer, backgroup writer进程都挂了,数据还能持久化?)没错,因为backend process也可以完成这些操作,所以不用担心数据丢失。

创建测试表

postgres=# drop table test;  
DROP TABLE  
postgres=# create table test(id int);  
CREATE TABLE  

将postgres主进程杀掉

postgres@digoal-> ps -ewf|grep postgres  
root      1870  1787  0 16:26 pts/1    00:00:00 su - postgres  
postgres  1872  1870  0 16:26 pts/1    00:00:00 -bash  
root      2216  1753  0 16:43 pts/0    00:00:00 su - postgres  
postgres  2218  2216  0 16:43 pts/0    00:00:00 -bash  
postgres  2521     1  0 16:55 pts/1    00:00:00 /opt/pgsql9.4.4/bin/postgres  
postgres  2523  2521  0 16:55 ?        00:00:00 postgres: logger process      
postgres  2539  2521  0 16:55 ?        00:00:00 postgres: checkpointer process     
postgres  2540  2521  0 16:55 ?        00:00:00 postgres: writer process      
postgres  2541  2521  0 16:55 ?        00:00:00 postgres: wal writer process     
postgres  2542  2521  0 16:55 ?        00:00:00 postgres: autovacuum launcher process     
postgres  2543  2521  0 16:55 ?        00:00:00 postgres: stats collector process     
postgres  2544  2218  0 16:55 pts/0    00:00:00 psql  
postgres  2546  2521  0 16:55 ?        00:00:00 postgres: postgres postgres [local] idle  
postgres  2547  1872  0 16:56 pts/1    00:00:00 ps -ewf  
postgres  2548  1872  0 16:56 pts/1    00:00:00 grep postgres  
postgres@digoal-> kill -9 2521  
postgres@digoal-> ps -ewf|grep postgres  
root      1870  1787  0 16:26 pts/1    00:00:00 su - postgres  
postgres  1872  1870  0 16:26 pts/1    00:00:00 -bash  
root      2216  1753  0 16:43 pts/0    00:00:00 su - postgres  
postgres  2218  2216  0 16:43 pts/0    00:00:00 -bash  
postgres  2523     1  0 16:55 ?        00:00:00 postgres: logger process      
postgres  2544  2218  0 16:55 pts/0    00:00:00 psql  
postgres  2546     1  0 16:55 ?        00:00:00 postgres: postgres postgres [local] idle  
postgres  2550  1872  1 16:56 pts/1    00:00:00 ps -ewf  
postgres  2551  1872  0 16:56 pts/1    00:00:00 grep postgres  

杀掉主进程后,只剩下backend process和logger进程,(当然wal buffer和shared buffer还在)。

然后在backend process对应的会话中写入记录。可以正常操作。

postgres=# set synchronous_commit=off;  
SET  
postgres=# insert into test values (1);  
INSERT 0 1  
postgres=# select ctid,* from test;  
 ctid  | id   
-------+----  
 (0,1) |  1  
(1 row)  
postgres=# \q  

退出会话后,所有相关的进程都不在了,logger也退出了。

postgres@digoal-> ps -ewf|grep postgres  
root      1870  1787  0 16:26 pts/1    00:00:00 su - postgres  
postgres  1872  1870  0 16:26 pts/1    00:00:00 -bash  
root      2216  1753  0 16:43 pts/0    00:00:00 su - postgres  
postgres  2218  2216  0 16:43 pts/0    00:00:00 -bash  
postgres  2553  1872  0 16:56 pts/1    00:00:00 ps -ewf  
postgres  2554  1872  0 16:56 pts/1    00:00:00 grep postgres  

启动数据库。

postgres@digoal-> pg_ctl start  
pg_ctl: another server might be running; trying to start server anyway  
server starting  
postgres@digoal-> LOG:  00000: redirecting log output to logging collector process  
HINT:  Future log output will appear in directory "pg_log".  
LOCATION:  SysLogger_Start, syslogger.c:645  

查看不到之前插入的数据,原因是那个事务是异步的,而wal writter process进程当时已经不在了,backend process虽然可以完成flush wal buffer的功能,但是不像wal writter进程是周期性刷的,而是在申请不到BUFFER时才会触发刷BUFFER的动作。

所以一条记录就这样丢失了。

接下来,我们使用同步事务,可以保证数据不丢失。

postgres@digoal-> ps -ewf|grep postgres  
root      2216  1753  0 16:43 pts/0    00:00:00 su - postgres  
postgres  2218  2216  0 16:43 pts/0    00:00:00 -bash  
root      2681  1787  0 17:04 pts/1    00:00:00 su - postgres  
postgres  2683  2681  0 17:04 pts/1    00:00:00 -bash  
postgres  2839     1  0 17:18 pts/1    00:00:00 /opt/pgsql9.4.4/bin/postgres  
postgres  2841  2839  0 17:18 ?        00:00:00 postgres: logger process      
postgres  2843  2839  0 17:18 ?        00:00:00 postgres: checkpointer process     
postgres  2844  2839  0 17:18 ?        00:00:00 postgres: writer process      
postgres  2845  2839  0 17:18 ?        00:00:00 postgres: wal writer process     
postgres  2846  2839  0 17:18 ?        00:00:00 postgres: autovacuum launcher process     
postgres  2847  2839  0 17:18 ?        00:00:00 postgres: stats collector process     
postgres  2848  2218  0 17:18 pts/0    00:00:00 psql  
postgres  2850  2839  0 17:18 ?        00:00:00 postgres: postgres postgres [local] idle  
postgres  2854  2683  2 17:18 pts/1    00:00:00 ps -ewf  
postgres  2855  2683  1 17:18 pts/1    00:00:00 [bash]  
postgres@digoal-> kill -9 2839  

使用同步事务写入数据并退出。

postgres=# set synchronous_commit=on;  
SET  
postgres=# truncate test;  
TRUNCATE TABLE  
postgres=# insert into test values (1);  
INSERT 0 1  
postgres=# \q  

启动数据库

$ pg_ctl start  

可以看到,数据是持久化存储的。

postgres@digoal-> psql  
psql (9.4.4)  
Type "help" for help.  
postgres=# select ctid,* from test;  
 ctid  | id   
-------+----  
 (0,1) |  1  
(1 row)  

注意,虽然backend process可以写wal buffer和shared buffer, 但是不能执行checkpoint, 因为这个操作是checkpoint做的,backend process只会告知它。当我们在postgres主进程被杀掉后,如果执行一个比较大的操作导致触发checkpoint的话,会在日志中看到这样的信息。

2015-08-03 17:20:25.824 CST,"postgres","postgres",2850,"[local]",55bf31ee.b22,3,"INSERT",2015-08-03 17:18:38 CST,2/11,7955331,LOG,00000,"could not signal for checkpoint: No such process",,,,,,"insert into test select generate_series(1,10000000);",,"RequestCheckpoint, checkpointer.c:1028","psql"  

包括autovacuum, stat collecter process都不在了,所以这些操作也会失败。

例如

postgres=# vacuum test;  
VACUUM  
postgres=# checkpoint;  
ERROR:  could not signal for checkpoint: No such process  
postgres=# \set VERBOSITY verbose  
postgres=# checkpoint;  
ERROR:  XX000: could not signal for checkpoint: No such process  
LOCATION:  RequestCheckpoint, checkpointer.c:1028  

可以看到对应的日志:

2015-08-03 17:32:04.651 CST,"postgres","postgres",3005,"[local]",55bf3302.bbd,5,"VACUUM",2015-08-03 17:23:14 CST,2/8,0,LOG,00000,"using stale statistics instead of current ones because stats collector is not responding",,,,,,"vacuum test;",,"backend_read_statsfile, pgstat.c:4476","psql"  

统计信息进程没了,所以统计信息也无法获取。

postgres=# select * from pg_stat_all_tables where relid='test'::regclass;  
-[ RECORD 1 ]-------+-------  
relid               | 186262  
schemaname          | public  
relname             | test  
seq_scan            | 0  
seq_tup_read        | 0  
idx_scan            |   
idx_tup_fetch       |   
n_tup_ins           | 0  
n_tup_upd           | 0  
n_tup_del           | 0  
n_tup_hot_upd       | 0  
n_live_tup          | 0  
n_dead_tup          | 0  
n_mod_since_analyze | 0  
last_vacuum         |   
last_autovacuum     |   
last_analyze        |   
last_autoanalyze    |   
vacuum_count        | 0  
autovacuum_count    | 0  
analyze_count       | 0  
autoanalyze_count   | 0  
2015-08-03 17:37:55.056 CST,"postgres","postgres",3005,"[local]",55bf3302.bbd,9,"SELECT",2015-08-03 17:23:14 CST,2/23,0,LOG,00000,"using stale statistics instead of current ones because stats collector is not responding",,,,,,"select * from pg_stat_all_tables where relid='test'::regclass;",,"backend_read_statsfile, pgstat.c:4476","psql"  

这里还引发一个问题,如果我们使用长连接来监控数据库状态的话,无法了解主进程是否健康,所以最好还是用短连接来监控数据库,至少可以判断认证这块还有主进程是否是正常的。不过短连接也有一定的问题,就是可能数据库的连接被占满了,无法获得连接。有利有弊,长连接+短连接的方式监控可能更加全面。

其他

关于crash自动重启的参数:

restart_after_crash (boolean)  
  
When set to true, which is the default, PostgreSQL will automatically reinitialize after a backend crash.   
  
Leaving this value set to true is normally the best way to maximize the availability of the database.   
  
However, in some circumstances, such as when PostgreSQL is being invoked by clusterware,   
it may be useful to disable the restart so that the clusterware can gain control and take any actions it deems appropriate.  

对应的代码,某些场景会导致数据库重启。

src/backend/postmaster/postmaster.c

/*  
 * Advance the postmaster's state machine and take actions as appropriate  
 *  
 * This is common code for pmdie(), reaper() and sigusr1_handler(), which  
 * receive the signals that might mean we need to change state.  
 */  
static void  
PostmasterStateMachine(void)  
{  
......  
        /*  
         * If recovery failed, or the user does not want an automatic restart  
         * after backend crashes, wait for all non-syslogger children to exit, and  
         * then exit postmaster. We don't try to reinitialize when recovery fails,  
         * because more than likely it will just fail again and we will keep  
         * trying forever.  
         */  
        if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash))  
                ExitPostmaster(1);  
  
        /*  
         * If we need to recover from a crash, wait for all non-syslogger children  
         * to exit, then reset shmem and StartupDataBase.  
         */  
        if (FatalError && pmState == PM_NO_CHILDREN)  
        {  
                ereport(LOG,  
                                (errmsg("all server processes terminated; reinitializing")));  
  
                /* allow background workers to immediately restart */  
                ResetBackgroundWorkerCrashTimes();  
  
                shmem_exit(1);  
                reset_shared(PostPortNumber);  
  
                StartupPID = StartupDataBase();  // 重启数据库,postmaster pid不会变化  
                Assert(StartupPID != 0);  
                pmState = PM_STARTUP;  
                /* crash recovery started, reset SIGKILL flag */  
                AbortStartTime = 0;  
        }  
......  
  
#define StartupDataBase()           StartChildProcess(StartupProcess)  
  
  
/*  
 * StartChildProcess -- start an auxiliary process for the postmaster  
 *  
 * xlop determines what kind of child will be started.  All child types  
 * initially go to AuxiliaryProcessMain, which will handle common setup.  
 *  
 * Return value of StartChildProcess is subprocess' PID, or 0 if failed  
 * to start subprocess.  
 */  
static pid_t  
StartChildProcess(AuxProcType type)  
......  

例如autovacuum 进程被kill. 断开所有backend process, 重启autovacuum lanucher

Flag Counter

digoal’s 大量PostgreSQL文章入口