PostgreSQL promote 激活 源码分析
背景
PostgreSQL standby可以通过两种方法来激活成为主库:
一种是trigger file,配置在recovery.conf中。
另一种是pg_ctl promote发送SIGUSR1信号给postmaster进程。
同时,PostgreSQL支持快速激活和非快速激活,
fast promote
fallback promote。
fast promote,开启数据库读写前,不需要做检查点。而是推到开启读写之后执行一个CHECKPOINT_FORCE检查点。
fallback_promote,在开启数据库读写前,需要先做一个检查点,现在这个模式已经不对用户开放,需要修改代码,只是用作调试。
代码分析如下:
激活过程,根据fast_promote变量判断是否需要先做检查点,再激活。
src/backend/access/transam/xlog.c
if (InRecovery)
{
/*
* Perform a checkpoint to update all our recovery activity to disk.
*
* Note that we write a shutdown checkpoint rather than an on-line
* one. This is not particularly critical, but since we may be
* assigning a new TLI, using a shutdown checkpoint allows us to have
* the rule that TLI only changes in shutdown checkpoints, which
* allows some extra error checking in xlog_redo.
*
* In fast promotion, only create a lightweight end-of-recovery record
* instead of a full checkpoint. A checkpoint is requested later,
* after we're fully out of recovery mode and already accepting
* queries.
*/
if (bgwriterLaunched)
{
if (fast_promote) // 如果是快速promote,在打开数据库读写前,不需要创建检查点。
{
checkPointLoc = ControlFile->prevCheckPoint;
/*
* Confirm the last checkpoint is available for us to recover
* from if we fail. Note that we don't check for the secondary
* checkpoint since that isn't available in most base backups.
*/
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
if (record != NULL)
{
fast_promoted = true;
/*
* Insert a special WAL record to mark the end of
* recovery, since we aren't doing a checkpoint. That
* means that the checkpointer process may likely be in
* the middle of a time-smoothed restartpoint and could
* continue to be for minutes after this. That sounds
* strange, but the effect is roughly the same and it
* would be stranger to try to come out of the
* restartpoint and then checkpoint. We request a
* checkpoint later anyway, just for safety.
*/
CreateEndOfRecoveryRecord();
}
}
if (!fast_promoted) // 如果是fallback_promote模式,须先创建一个检查点,再开启读写模式。
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_IMMEDIATE |
CHECKPOINT_WAIT);
}
else
CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
/*
* And finally, execute the recovery_end_command, if any.
*/
if (recoveryEndCommand)
ExecuteRecoveryCommand(recoveryEndCommand,
"recovery_end_command",
true);
}
/*
* Clean up any (possibly bogus) future WAL segments on the old timeline.
*/
if (ArchiveRecoveryRequested)
RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
/*
* Preallocate additional log files, if wanted.
*/
PreallocXlogFiles(EndOfLog);
/*
* Okay, we're officially UP.
*/
InRecovery = false; // 开启读写模式
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->state = DB_IN_PRODUCTION; // 改写控制文件的数据库状态
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile(); // 更新控制文件
LWLockRelease(ControlFileLock);
/* start the archive_timeout timer running */
XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
/* also initialize latestCompletedXid, to nextXid - 1 */
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
LWLockRelease(ProcArrayLock);
/*
* Start up the commit log and subtrans, if not already done for hot
* standby.
*/
if (standbyState == STANDBY_DISABLED)
{
StartupCLOG();
StartupSUBTRANS(oldestActiveXID);
}
/*
* Perform end of recovery actions for any SLRUs that need it.
*/
TrimCLOG();
TrimMultiXact();
/* Reload shared-memory state for prepared transactions */
RecoverPreparedTransactions();
/*
* Shutdown the recovery environment. This must occur after
* RecoverPreparedTransactions(), see notes for lock_twophase_recover()
*/
if (standbyState != STANDBY_DISABLED)
ShutdownRecoveryTransactionEnvironment();
/* Shut down xlogreader */
if (readFile >= 0)
{
close(readFile);
readFile = -1;
}
XLogReaderFree(xlogreader);
/*
* If any of the critical GUCs have changed, log them before we allow
* backends to write WAL.
*/
LocalSetXLogInsertAllowed();
XLogReportParameters();
/*
* All done. Allow backends to write WAL. (Although the bool flag is
* probably atomic in itself, we use the info_lck here to ensure that
* there are no race conditions concerning visibility of other recent
* updates to shared memory.)
*/
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->SharedRecoveryInProgress = false;
SpinLockRelease(&xlogctl->info_lck);
}
/*
* If there were cascading standby servers connected to us, nudge any wal
* sender processes to notice that we've been promoted.
*/
WalSndWakeup();
/*
* If this was a fast promotion, request an (online) checkpoint now. This
* isn't required for consistency, but the last restartpoint might be far
* back, and in case of a crash, recovering from it might take a longer
* than is appropriate now that we're not in standby mode anymore.
*/
if (fast_promoted) // 如果是快速promote,在这里执行一个检查点。
RequestCheckpoint(CHECKPOINT_FORCE);
......
通过pg_ctl命令行工具,向postmaster发SIGUSR1信号,通知它激活数据库。
首先会写一个promote文件,告诉postmaster,是fast_promote。
src/bin/pg_ctl/pg_ctl.c
/*
* promote
*/
static void
do_promote(void)
{
FILE *prmfile;
pgpid_t pid;
struct stat statbuf;
pid = get_pgpid(false);
if (pid == 0) /* no pid file */
{
write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file);
write_stderr(_("Is server running?\n"));
exit(1);
}
else if (pid < 0) /* standalone backend, not postmaster */
{
pid = -pid;
write_stderr(_("%s: cannot promote server; "
"single-user server is running (PID: %ld)\n"),
progname, pid);
exit(1);
}
/* If recovery.conf doesn't exist, the server is not in standby mode */
if (stat(recovery_file, &statbuf) != 0)
{
write_stderr(_("%s: cannot promote server; "
"server is not in standby mode\n"),
progname);
exit(1);
}
/*
* For 9.3 onwards, "fast" promotion is performed. Promotion with a full
* checkpoint is still possible by writing a file called
* "fallback_promote" instead of "promote"
*/
snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data); // touch 一个PROMOTE_SIGNAL_FILE文件
if ((prmfile = fopen(promote_file, "w")) == NULL)
{
write_stderr(_("%s: could not create promote signal file \"%s\": %s\n"),
progname, promote_file, strerror(errno));
exit(1);
}
if (fclose(prmfile))
{
write_stderr(_("%s: could not write promote signal file \"%s\": %s\n"),
progname, promote_file, strerror(errno));
exit(1);
}
sig = SIGUSR1;
if (kill((pid_t) pid, sig) != 0)
{
write_stderr(_("%s: could not send promote signal (PID: %ld): %s\n"),
progname, pid, strerror(errno));
if (unlink(promote_file) != 0)
write_stderr(_("%s: could not remove promote signal file \"%s\": %s\n"),
progname, promote_file, strerror(errno));
exit(1);
}
print_msg(_("server promoting\n"));
}
数据恢复时,检查standby是否收到promote请求或是否存在trigger文件。
如果是promote请求,则检查有没有promote文件,或者fallback_promote文件,如果有promote文件,则是fast_promote请求。如果有fallback_promote文件,则不是fast_promote请求(实际上根本不可能检测到fallback_promote文件,因为没有写这个文件的操作)。所以通过pg_ctl promote来激活,一定是fast promote的,即不需要先做检查点再激活。
如果检查到trigger文件,同样也是fast promote激活模式。
src/backend/access/transam/xlog.c
#define PROMOTE_SIGNAL_FILE "promote"
#define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
/*
* Check to see whether the user-specified trigger file exists and whether a
* promote request has arrived. If either condition holds, return true.
*/
static bool
CheckForStandbyTrigger(void)
{
struct stat stat_buf;
static bool triggered = false;
if (triggered)
return true;
if (IsPromoteTriggered()) // 检查是否收到pg_ctl promote信号
{
/*
* In 9.1 and 9.2 the postmaster unlinked the promote file inside the
* signal handler. It now leaves the file in place and lets the
* Startup process do the unlink. This allows Startup to know whether
* it should create a full checkpoint before starting up (fallback
* mode). Fast promotion takes precedence.
*/
if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0) // 先检查promote文件是否存在
{
unlink(PROMOTE_SIGNAL_FILE);
unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
fast_promote = true; // 快速promote
}
else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0) // 否则再检查fallback_promote文件是否存在
{
unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
fast_promote = false; // 先执行checkpoint再promote
}
ereport(LOG, (errmsg("received promote request")));
ResetPromoteTriggered();
triggered = true;
return true;
}
if (TriggerFile == NULL) // 检查recovery.conf是否配置了trigger_file
return false;
if (stat(TriggerFile, &stat_buf) == 0)
{
ereport(LOG,
(errmsg("trigger file found: %s", TriggerFile)));
unlink(TriggerFile);
triggered = true;
fast_promote = true; // 快速promote
return true;
}
else if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not stat trigger file \"%s\": %m",
TriggerFile)));
return false;
}
src/backend/postmaster/startup.c
pqsignal(SIGUSR2, StartupProcTriggerHandler); // 注册SIGUSR2信号处理函数
/* SIGUSR2: set flag to finish recovery */
static void
StartupProcTriggerHandler(SIGNAL_ARGS)
{
int save_errno = errno;
promote_triggered = true;
WakeupRecovery();
errno = save_errno;
}
bool
IsPromoteTriggered(void)
{
return promote_triggered;
}
postmaster收到sigusr1信号后,检查是否收到promote信号,判断当前的状态是否处于恢复中的任意状态,然后向startup进程发一个SIGUSR2的信号,触发promote。
src/backend/postmaster/postmaster.c
pqsignal(SIGUSR1, sigusr1_handler); /* message from child process */ // 注册SIGUSR1信号处理函数
/*
* sigusr1_handler - handle signal conditions from child processes
*/
static void
sigusr1_handler(SIGNAL_ARGS)
{
......
if (CheckPromoteSignal() && StartupPID != 0 &&
(pmState == PM_STARTUP || pmState == PM_RECOVERY ||
pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY))
{
/* Tell startup process to finish recovery */
signal_child(StartupPID, SIGUSR2); // 向startup进程发SIGUSR2信号,通知它处理promote
}
......
src/backend/access/transam/xlog.c
/*
* Check to see if a promote request has arrived. Should be
* called by postmaster after receiving SIGUSR1.
*/
bool
CheckPromoteSignal(void)
{
struct stat stat_buf;
if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
return true;
return false;
}
最后提一点, 9.3以前,曾经出现过pg_ctl promote -m 来指定是否需要fast promote或者fallback promote。
《PostgreSQL 9.3 add Fast promote mode skips checkpoint at end of recovery》