Systemtap EXP: PostgreSQL IN-BUILD mark Class 5 - read write relation

4 minute read

背景

本文介绍relation 读写相关的探针: 包含读relation以及写relation(例如index, table,tmp table或者他们的fsm, vm文件等).  
从这些探针中我们可以得到一下信息 :   
1. 读relation开始: forkNum, blocknum, tbs_oid, db_oid, pg_class.relfilenode, (read to local or shared buffer)  
这些信息和buffer类的探针相似, 这里就不介绍了, 请参考 :   

http://blog.163.com/digoal@126/blog/static/1638770402013916488761/

2. 读relation结束: 比开始多了2个变量, 真实的从relation读到的字节数以及期望的字节数.如果两者不相等则意味着问题.  
3. 写relation开始: 同读relation开始.  
4. 写relation结束: 比开始多了2个变量, 真实的写到relation的字节数以及期望的字节数. 如果两者不相等则意味着问题.  

探针的详细信息如下 :

name parameter desc
smgr-md-read-start (ForkNumber, BlockNumber, Oid, Oid, Oid, int) Probe that fires when beginning to read a block from a relation. arg0 and arg1 contain the fork and block numbers of the page. arg2, arg3, and arg4 contain the tablespace, database, and relation OIDs identifying the relation. arg5 is the ID of the backend which created the temporary relation for a local buffer, or InvalidBackendId (-1) for a shared buffer.
smgr-md-read-done (ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int) Probe that fires when a block read is complete. arg0 and arg1 contain the fork and block numbers of the page. arg2, arg3, and arg4 contain the tablespace, database, and relation OIDs identifying the relation. arg5 is the ID of the backend which created the temporary relation for a local buffer, or InvalidBackendId (-1) for a shared buffer. arg6 is the number of bytes actually read, while arg7 is the number requested (if these are different it indicates trouble).
smgr-md-write-start (ForkNumber, BlockNumber, Oid, Oid, Oid, int) Probe that fires when beginning to write a block to a relation. arg0 and arg1 contain the fork and block numbers of the page. arg2, arg3, and arg4 contain the tablespace, database, and relation OIDs identifying the relation. arg5 is the ID of the backend which created the temporary relation for a local buffer, or InvalidBackendId (-1) for a shared buffer.
smgr-md-write-done (ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int) Probe that fires when a block write is complete. arg0 and arg1 contain the fork and block numbers of the page. arg2, arg3, and arg4 contain the tablespace, database, and relation OIDs identifying the relation. arg5 is the ID of the backend which created the temporary relation for a local buffer, or InvalidBackendId (-1) for a shared buffer. arg6 is the number of bytes actually written, while arg7 is the number requested (if these are different it indicates trouble).

探针在probes.h总的定义信息 :

/* TRACE_POSTGRESQL_SMGR_MD_READ_START ( int, unsigned int, unsigned int, unsigned int, unsigned int, int) */  
#if defined STAP_SDT_V1  
#define TRACE_POSTGRESQL_SMGR_MD_READ_START_ENABLED() __builtin_expect (smgr__md__read__start_semaphore, 0)  
#define postgresql_smgr__md__read__start_semaphore smgr__md__read__start_semaphore  
#else  
#define TRACE_POSTGRESQL_SMGR_MD_READ_START_ENABLED() __builtin_expect (postgresql_smgr__md__read__start_semaphore, 0)  
#endif  
__extension__ extern unsigned short postgresql_smgr__md__read__start_semaphore __attribute__ ((unused)) __attribute__ ((section (".probes")));  
#define TRACE_POSTGRESQL_SMGR_MD_READ_START(arg1,arg2,arg3,arg4,arg5,arg6) \  
DTRACE_PROBE6(postgresql,smgr__md__read__start,arg1,arg2,arg3,arg4,arg5,arg6)  
  
/* TRACE_POSTGRESQL_SMGR_MD_READ_DONE ( int, unsigned int, unsigned int, unsigned int, unsigned int, int, int, int) */  
#if defined STAP_SDT_V1  
#define TRACE_POSTGRESQL_SMGR_MD_READ_DONE_ENABLED() __builtin_expect (smgr__md__read__done_semaphore, 0)  
#define postgresql_smgr__md__read__done_semaphore smgr__md__read__done_semaphore  
#else  
#define TRACE_POSTGRESQL_SMGR_MD_READ_DONE_ENABLED() __builtin_expect (postgresql_smgr__md__read__done_semaphore, 0)  
#endif  
__extension__ extern unsigned short postgresql_smgr__md__read__done_semaphore __attribute__ ((unused)) __attribute__ ((section (".probes")));  
#define TRACE_POSTGRESQL_SMGR_MD_READ_DONE(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8) \  
DTRACE_PROBE8(postgresql,smgr__md__read__done,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8)  
  
/* TRACE_POSTGRESQL_SMGR_MD_WRITE_START ( int, unsigned int, unsigned int, unsigned int, unsigned int, int) */  
#if defined STAP_SDT_V1  
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_START_ENABLED() __builtin_expect (smgr__md__write__start_semaphore, 0)  
#define postgresql_smgr__md__write__start_semaphore smgr__md__write__start_semaphore  
#else  
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_START_ENABLED() __builtin_expect (postgresql_smgr__md__write__start_semaphore, 0)  
#endif  
__extension__ extern unsigned short postgresql_smgr__md__write__start_semaphore __attribute__ ((unused)) __attribute__ ((section (".probes")));  
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_START(arg1,arg2,arg3,arg4,arg5,arg6) \  
DTRACE_PROBE6(postgresql,smgr__md__write__start,arg1,arg2,arg3,arg4,arg5,arg6)  
  
/* TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE ( int, unsigned int, unsigned int, unsigned int, unsigned int, int, int, int) */  
#if defined STAP_SDT_V1  
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE_ENABLED() __builtin_expect (smgr__md__write__done_semaphore, 0)  
#define postgresql_smgr__md__write__done_semaphore smgr__md__write__done_semaphore  
#else  
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE_ENABLED() __builtin_expect (postgresql_smgr__md__write__done_semaphore, 0)  
#endif  
__extension__ extern unsigned short postgresql_smgr__md__write__done_semaphore __attribute__ ((unused)) __attribute__ ((section (".probes")));  
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8) \  
DTRACE_PROBE8(postgresql,smgr__md__write__done,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8)  
  
探针在源码中的信息 :   
src/backend/storage/smgr/md.c  
/*  
 *      mdread() -- Read the specified block from a relation.  
 */  
void  
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,  
           char *buffer)  
{  
...  
  
        TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,  
                                                                                reln->smgr_rnode.node.spcNode,  
                                                                                reln->smgr_rnode.node.dbNode,  
                                                                                reln->smgr_rnode.node.relNode,  
                                                                                reln->smgr_rnode.backend);  
...  
        nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);  
  
        TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,  
                                                                           reln->smgr_rnode.node.spcNode,  
                                                                           reln->smgr_rnode.node.dbNode,  
                                                                           reln->smgr_rnode.node.relNode,  
                                                                           reln->smgr_rnode.backend,  
                                                                           nbytes,  
                                                                           BLCKSZ);  
...  
/*  
 *      mdwrite() -- Write the supplied block at the appropriate location.  
 *  
 *              This is to be used only for updating already-existing blocks of a  
 *              relation (ie, those before the current EOF).  To extend a relation,  
 *              use mdextend().  
 */  
void  
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,  
                char *buffer, bool skipFsync)  
{  
        off_t           seekpos;  
        int                     nbytes;  
        MdfdVec    *v;  
  
        /* This assert is too expensive to have on normally ... */  
#ifdef CHECK_WRITE_VS_EXTEND  
        Assert(blocknum < mdnblocks(reln, forknum));  
#endif  
  
        TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,  
                                                                                 reln->smgr_rnode.node.spcNode,  
                                                                                 reln->smgr_rnode.node.dbNode,  
                                                                                 reln->smgr_rnode.node.relNode,  
                                                                                 reln->smgr_rnode.backend);  
...  
        nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);  
  
        TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,  
                                                                                reln->smgr_rnode.node.spcNode,  
                                                                                reln->smgr_rnode.node.dbNode,  
                                                                                reln->smgr_rnode.node.relNode,  
                                                                                reln->smgr_rnode.backend,  
                                                                                nbytes,  
                                                                                BLCKSZ);  
使用举例 :   
1.  输出读和写relation的信息, 每个数据块1条记录.  
[root@db-172-16-3-150 postgresql-9.3.1]stap -e '  
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__read__done") {  
  printdln("***", pn(), $arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8)  
}  
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done") {  
  printdln("***", pn(), $arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8)  
}'  
  
SQL对应的输出如下 :   
digoal=# explain (analyze,verbose,costs,buffers,timing) select count(*) from test ;  
                                                         QUERY PLAN                                                            
-----------------------------------------------------------------------------------------------------------------------------  
 Aggregate  (cost=106075.99..106076.00 rows=1 width=0) (actual time=1561.052..1561.053 rows=1 loops=1)  
   Output: count(*)  
   Buffers: shared hit=47319  
   ->  Seq Scan on public.test  (cost=0.00..94324.59 rows=4700559 width=0) (actual time=0.011..883.486 rows=4676559 loops=1)  
         Output: id, info, crt_time  
         Buffers: shared hit=47319  
 Total runtime: 1561.094 ms  
(7 rows)  
shared buffer命中, 本例stap无输出  
  
digoal=# create table t1(id int, info text);  
CREATE TABLE  
无输出  
  
digoal=# insert into t1 select generate_series(1,100000),md5(random()::text);  
INSERT 0 100000  
输出:  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__read__done")***1***2***1663***16384***24726***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__read__done")***1***0***1663***16384***24726***-1***8192***8192  
  
digoal=# explain (analyze,verbose,costs,buffers,timing) select count(*) from t1 ;  
                                                   QUERY PLAN                                                     
----------------------------------------------------------------------------------------------------------------  
 Aggregate  (cost=21.50..21.51 rows=1 width=0) (actual time=0.316..0.316 rows=1 loops=1)  
   Output: count(*)  
   Buffers: shared hit=9  
   ->  Seq Scan on public.t1  (cost=0.00..19.00 rows=1000 width=0) (actual time=0.010..0.164 rows=1000 loops=1)  
         Output: id, info  
         Buffers: shared hit=9  
 Total runtime: 0.353 ms  
(7 rows)  
shared buffer命中, 本例stap无输出  
  
digoal=# checkpoint;  
CHECKPOINT  
输出, checkpoint涉及一些系统表的写.  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***0***1663***16384***12658***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***12658***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***5***1663***16384***12658***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***7***1663***16384***12658***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12660***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12685***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***12682***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12637***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12638***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***12661***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***10***1663***16384***12650***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***43***1663***16384***12647***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***12638***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12684***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***12634***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***18***1663***16384***12629***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12633***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***1***2***1663***16384***12631***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***6***1663***16384***12631***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***19***1663***16384***12629***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***8***1663***16384***24726***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***7***1663***16384***24726***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***6***1663***16384***24726***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***5***1663***16384***24726***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***4***1663***16384***24726***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***3***1663***16384***24726***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***24726***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***24726***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***0***1663***16384***24726***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***16***1663***16384***12649***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***7***1663***16384***12631***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***7***1663***16384***12635***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***27***1663***16384***12767***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***46***1663***16384***12764***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***20***1663***16384***12766***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***5***1663***16384***12767***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***3***1663***16384***12649***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***15***1663***16384***12649***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***21***1663***16384***12767***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***23***1663***16384***12766***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***44***1663***16384***12647***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***11***1663***16384***12631***-1***8192***8192  
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***0***1663***16384***12669***-1***8192***8192  
相关系统表列举 :   
digoal=# select oid,relname from pg_class where relfilenode in (12658,12660,12682,12685,12637,12638,12661,12650,12631);  
 oid  |          relname            
------+---------------------------  
 2840 | pg_toast_2619  
 2679 | pg_index_indexrelid_index  
 2610 | pg_index  
(3 rows)  
当然也涉及了t1表的9个数据块的flush, 在stap输出中下划线标出.  
digoal=# select relfilenode from pg_class where relname='t1';  
 relfilenode   
-------------  
       24726  
(1 row)  

参考

1. http://www.postgresql.org/docs/9.3/static/dynamic-trace.html

2. src/backend/storage/smgr/md.c

3. http://blog.163.com/digoal@126/blog/static/1638770402013916488761/

Flag Counter

digoal’s 大量PostgreSQL文章入口