PostgreSQL pg_current_xlog_insert_location() & pg_current_xlog_location()

3 minute read

背景

PostgreSQL 的 pg_current_xlog_insert_location() 和 pg_current_xlog_location() 是两个获取XLOG位置的函数, 它们有什么不同呢?

答案是pg_current_xlog_insert_location指写入wal buffer的位置.

pg_current_xlog_location返回已经write到wal文件的位置(数据已写出到内核, 但不一定已经fsync到磁盘).

使用异步提交可以看到这个差异.

synchronous_commit = off
wal_writer_delay = 10000ms 

然后开启一个窗口执行一个大批量的写入操作.

在另外的窗口查看这两个函数的结果, write位置明显要小于insert位置.

digoal=# select pg_current_xlog_insert_location(),pg_current_xlog_location();
 pg_current_xlog_insert_location | pg_current_xlog_location 
---------------------------------+--------------------------
 4/FCD58288                      | 4/FCD50048
(1 row)

源码

digoal=# select prosrc, proname from pg_proc where proname ~ 'pg_current_xlog';
             prosrc              |             proname             
---------------------------------+---------------------------------
 pg_current_xlog_location        | pg_current_xlog_location
 pg_current_xlog_insert_location | pg_current_xlog_insert_location
(2 rows)

src/backend/access/transam/xlogfuncs.c

/*
 * Report the current WAL write location (same format as pg_start_backup etc)
 *
 * This is useful for determining how much of WAL is visible to an external
 * archiving process.  Note that the data before this point is written out
 * to the kernel, but is not necessarily synced to disk.
 */
Datum
pg_current_xlog_location(PG_FUNCTION_ARGS)
{
        XLogRecPtr      current_recptr;
        char            location[MAXFNAMELEN];

        if (RecoveryInProgress())
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                 errmsg("recovery is in progress"),
                                 errhint("WAL control functions cannot be executed during recovery.")));

        current_recptr = GetXLogWriteRecPtr();

        snprintf(location, sizeof(location), "%X/%X",
                         (uint32) (current_recptr >> 32), (uint32) current_recptr);
        PG_RETURN_TEXT_P(cstring_to_text(location));
}

/*
 * Report the current WAL insert location (same format as pg_start_backup etc)
 *
 * This function is mostly for debugging purposes.
 */
Datum
pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
{
        XLogRecPtr      current_recptr;
        char            location[MAXFNAMELEN];

        if (RecoveryInProgress())
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                 errmsg("recovery is in progress"),
                                 errhint("WAL control functions cannot be executed during recovery.")));

        current_recptr = GetXLogInsertRecPtr();

        snprintf(location, sizeof(location), "%X/%X",
                         (uint32) (current_recptr >> 32), (uint32) current_recptr);
        PG_RETURN_TEXT_P(cstring_to_text(location));
}

src/backend/access/transam/xlog.c

/*
 * Get latest WAL insert pointer
 *
 * Takes WALInsertLock in shared mode, derives the current insert position
 * from the shared XLogCtl->Insert state via the INSERT_RECPTR macro, then
 * releases the lock and returns the position.
 *
 * NOTE(review): shared mode presumably suffices because this only reads the
 * insert state; confirm against the lock's usage by inserters.
 */
XLogRecPtr
GetXLogInsertRecPtr(void)
{
        XLogCtlInsert *Insert = &XLogCtl->Insert;
        XLogRecPtr      current_recptr;

        LWLockAcquire(WALInsertLock, LW_SHARED);
        /* macro stores its result into current_recptr (first argument) */
        INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
        LWLockRelease(WALInsertLock);

        return current_recptr;
}

/*
 * Get latest WAL write pointer
 *
 * Copies the shared XLogCtl->LogwrtResult into LogwrtResult (a variable
 * declared outside this function) while holding the info_lck spinlock, and
 * returns the Write field of that copy.
 */
XLogRecPtr
GetXLogWriteRecPtr(void)
{
        {
                /* use volatile pointer to prevent code rearrangement */
                volatile XLogCtlData *xlogctl = XLogCtl;

                /* hold the spinlock only for the duration of the struct copy */
                SpinLockAcquire(&xlogctl->info_lck);
                LogwrtResult = xlogctl->LogwrtResult;
                SpinLockRelease(&xlogctl->info_lck);
        }

        /* report from the copy just taken, after the lock is released */
        return LogwrtResult.Write;
}

/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
 * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 * WALWriteLock.  To update it, you need to hold both locks.  The point of
 * this arrangement is that the value can be examined by code that already
 * holds WALWriteLock without needing to grab info_lck as well.  In addition
 * to the shared variable, each backend has a private copy of LogwrtResult,
 * which is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALInsertLock: must be held to insert a record into the WAL buffers.
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 * only one checkpointer at a time; currently, with all checkpoints done by
 * the checkpointer, this is just pro forma).
 *
 *----------
 */

/*
 * Shared state data for XLogInsert.
 *
 * Holds the current insertion position within the WAL buffer cache (block
 * index, page header pointer, and byte position), the redo point used for
 * insertions, and the flags/counters that control full-page writes and
 * online backups.
 *
 * NOTE(review): these fields appear to be protected by WALInsertLock
 * (GetXLogInsertRecPtr reads curridx while holding it) -- confirm for the
 * remaining fields.
 */
typedef struct XLogCtlInsert
{
        XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
        int                     curridx;                /* current block index in cache */
        XLogPageHeader currpage;        /* points to header of block in cache */
        char       *currpos;            /* current insertion point in cache */
        XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
        bool            forcePageWrites;        /* forcing full-page writes for PITR? */

        /*
         * fullPageWrites is the master copy used by all backends to determine
         * whether to write full-page to WAL, instead of using process-local one.
         * This is required because, when full_page_writes is changed by SIGHUP,
         * we must WAL-log it before it actually affects WAL-logging by backends.
         * Checkpointer sets at startup or after SIGHUP.
         */
        bool            fullPageWrites;

        /*
         * exclusiveBackup is true if a backup started with pg_start_backup() is
         * in progress, and nonExclusiveBackups is a counter indicating the number
         * of streaming base backups currently in progress. forcePageWrites is set
         * to true when either of these is non-zero. lastBackupStart is the latest
         * checkpoint redo location used as a starting point for an online backup.
         */
        bool            exclusiveBackup;
        int                     nonExclusiveBackups;
        XLogRecPtr      lastBackupStart;
} XLogCtlInsert;

Flag Counter

digoal’s 大量PostgreSQL文章入口