Systemtap EXP: PostgreSQL IN-BUILD mark Class 5 - read write relation
背景
本文介绍relation 读写相关的探针: 包含读relation以及写relation(例如index, table,tmp table或者他们的fsm, vm文件等).
从这些探针中我们可以得到一下信息 :
1. 读relation开始: forkNum, blocknum, tbs_oid, db_oid, pg_class.relfilenode, (read to local or shared buffer)
这些信息和buffer类的探针相似, 这里就不介绍了, 请参考 :
http://blog.163.com/digoal@126/blog/static/1638770402013916488761/
2. 读relation结束: 比开始多了2个变量, 真实的从relation读到的字节数以及期望的字节数.如果两者不相等则意味着问题.
3. 写relation开始: 同读relation开始.
4. 写relation结束: 比开始多了2个变量, 真实的写到relation的字节数以及期望的字节数. 如果两者不相等则意味着问题.
探针的详细信息如下 :
name | parameter | desc |
---|---|---|
smgr-md-read-start | (ForkNumber, BlockNumber, Oid, Oid, Oid, int) | Probe that fires when beginning to read a block from a relation. arg0 and arg1 contain the fork and block numbers of the page. arg2, arg3, and arg4 contain the tablespace, database, and relation OIDs identifying the relation. arg5 is the ID of the backend which created the temporary relation for a local buffer, or InvalidBackendId (-1) for a shared buffer. |
smgr-md-read-done | (ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int) | Probe that fires when a block read is complete. arg0 and arg1 contain the fork and block numbers of the page. arg2, arg3, and arg4 contain the tablespace, database, and relation OIDs identifying the relation. arg5 is the ID of the backend which created the temporary relation for a local buffer, or InvalidBackendId (-1) for a shared buffer. arg6 is the number of bytes actually read, while arg7 is the number requested (if these are different it indicates trouble). |
smgr-md-write-start | (ForkNumber, BlockNumber, Oid, Oid, Oid, int) | Probe that fires when beginning to write a block to a relation. arg0 and arg1 contain the fork and block numbers of the page. arg2, arg3, and arg4 contain the tablespace, database, and relation OIDs identifying the relation. arg5 is the ID of the backend which created the temporary relation for a local buffer, or InvalidBackendId (-1) for a shared buffer. |
smgr-md-write-done | (ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int) | Probe that fires when a block write is complete. arg0 and arg1 contain the fork and block numbers of the page. arg2, arg3, and arg4 contain the tablespace, database, and relation OIDs identifying the relation. arg5 is the ID of the backend which created the temporary relation for a local buffer, or InvalidBackendId (-1) for a shared buffer. arg6 is the number of bytes actually written, while arg7 is the number requested (if these are different it indicates trouble). |
探针在probes.h总的定义信息 :
/* TRACE_POSTGRESQL_SMGR_MD_READ_START ( int, unsigned int, unsigned int, unsigned int, unsigned int, int) */
#if defined STAP_SDT_V1
#define TRACE_POSTGRESQL_SMGR_MD_READ_START_ENABLED() __builtin_expect (smgr__md__read__start_semaphore, 0)
#define postgresql_smgr__md__read__start_semaphore smgr__md__read__start_semaphore
#else
#define TRACE_POSTGRESQL_SMGR_MD_READ_START_ENABLED() __builtin_expect (postgresql_smgr__md__read__start_semaphore, 0)
#endif
__extension__ extern unsigned short postgresql_smgr__md__read__start_semaphore __attribute__ ((unused)) __attribute__ ((section (".probes")));
#define TRACE_POSTGRESQL_SMGR_MD_READ_START(arg1,arg2,arg3,arg4,arg5,arg6) \
DTRACE_PROBE6(postgresql,smgr__md__read__start,arg1,arg2,arg3,arg4,arg5,arg6)
/* TRACE_POSTGRESQL_SMGR_MD_READ_DONE ( int, unsigned int, unsigned int, unsigned int, unsigned int, int, int, int) */
#if defined STAP_SDT_V1
#define TRACE_POSTGRESQL_SMGR_MD_READ_DONE_ENABLED() __builtin_expect (smgr__md__read__done_semaphore, 0)
#define postgresql_smgr__md__read__done_semaphore smgr__md__read__done_semaphore
#else
#define TRACE_POSTGRESQL_SMGR_MD_READ_DONE_ENABLED() __builtin_expect (postgresql_smgr__md__read__done_semaphore, 0)
#endif
__extension__ extern unsigned short postgresql_smgr__md__read__done_semaphore __attribute__ ((unused)) __attribute__ ((section (".probes")));
#define TRACE_POSTGRESQL_SMGR_MD_READ_DONE(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8) \
DTRACE_PROBE8(postgresql,smgr__md__read__done,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8)
/* TRACE_POSTGRESQL_SMGR_MD_WRITE_START ( int, unsigned int, unsigned int, unsigned int, unsigned int, int) */
#if defined STAP_SDT_V1
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_START_ENABLED() __builtin_expect (smgr__md__write__start_semaphore, 0)
#define postgresql_smgr__md__write__start_semaphore smgr__md__write__start_semaphore
#else
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_START_ENABLED() __builtin_expect (postgresql_smgr__md__write__start_semaphore, 0)
#endif
__extension__ extern unsigned short postgresql_smgr__md__write__start_semaphore __attribute__ ((unused)) __attribute__ ((section (".probes")));
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_START(arg1,arg2,arg3,arg4,arg5,arg6) \
DTRACE_PROBE6(postgresql,smgr__md__write__start,arg1,arg2,arg3,arg4,arg5,arg6)
/* TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE ( int, unsigned int, unsigned int, unsigned int, unsigned int, int, int, int) */
#if defined STAP_SDT_V1
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE_ENABLED() __builtin_expect (smgr__md__write__done_semaphore, 0)
#define postgresql_smgr__md__write__done_semaphore smgr__md__write__done_semaphore
#else
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE_ENABLED() __builtin_expect (postgresql_smgr__md__write__done_semaphore, 0)
#endif
__extension__ extern unsigned short postgresql_smgr__md__write__done_semaphore __attribute__ ((unused)) __attribute__ ((section (".probes")));
#define TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8) \
DTRACE_PROBE8(postgresql,smgr__md__write__done,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8)
探针在源码中的信息 :
src/backend/storage/smgr/md.c
/*
* mdread() -- Read the specified block from a relation.
*/
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer)
{
...
TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
reln->smgr_rnode.node.spcNode,
reln->smgr_rnode.node.dbNode,
reln->smgr_rnode.node.relNode,
reln->smgr_rnode.backend);
...
nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
reln->smgr_rnode.node.spcNode,
reln->smgr_rnode.node.dbNode,
reln->smgr_rnode.node.relNode,
reln->smgr_rnode.backend,
nbytes,
BLCKSZ);
...
/*
* mdwrite() -- Write the supplied block at the appropriate location.
*
* This is to be used only for updating already-existing blocks of a
* relation (ie, those before the current EOF). To extend a relation,
* use mdextend().
*/
void
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer, bool skipFsync)
{
off_t seekpos;
int nbytes;
MdfdVec *v;
/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
Assert(blocknum < mdnblocks(reln, forknum));
#endif
TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
reln->smgr_rnode.node.spcNode,
reln->smgr_rnode.node.dbNode,
reln->smgr_rnode.node.relNode,
reln->smgr_rnode.backend);
...
nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
reln->smgr_rnode.node.spcNode,
reln->smgr_rnode.node.dbNode,
reln->smgr_rnode.node.relNode,
reln->smgr_rnode.backend,
nbytes,
BLCKSZ);
使用举例 :
1. 输出读和写relation的信息, 每个数据块1条记录.
[root@db-172-16-3-150 postgresql-9.3.1]stap -e '
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__read__done") {
printdln("***", pn(), $arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8)
}
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done") {
printdln("***", pn(), $arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8)
}'
SQL对应的输出如下 :
digoal=# explain (analyze,verbose,costs,buffers,timing) select count(*) from test ;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=106075.99..106076.00 rows=1 width=0) (actual time=1561.052..1561.053 rows=1 loops=1)
Output: count(*)
Buffers: shared hit=47319
-> Seq Scan on public.test (cost=0.00..94324.59 rows=4700559 width=0) (actual time=0.011..883.486 rows=4676559 loops=1)
Output: id, info, crt_time
Buffers: shared hit=47319
Total runtime: 1561.094 ms
(7 rows)
shared buffer命中, 本例stap无输出
digoal=# create table t1(id int, info text);
CREATE TABLE
无输出
digoal=# insert into t1 select generate_series(1,100000),md5(random()::text);
INSERT 0 100000
输出:
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__read__done")***1***2***1663***16384***24726***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__read__done")***1***0***1663***16384***24726***-1***8192***8192
digoal=# explain (analyze,verbose,costs,buffers,timing) select count(*) from t1 ;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------
Aggregate (cost=21.50..21.51 rows=1 width=0) (actual time=0.316..0.316 rows=1 loops=1)
Output: count(*)
Buffers: shared hit=9
-> Seq Scan on public.t1 (cost=0.00..19.00 rows=1000 width=0) (actual time=0.010..0.164 rows=1000 loops=1)
Output: id, info
Buffers: shared hit=9
Total runtime: 0.353 ms
(7 rows)
shared buffer命中, 本例stap无输出
digoal=# checkpoint;
CHECKPOINT
输出, checkpoint涉及一些系统表的写.
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***0***1663***16384***12658***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***12658***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***5***1663***16384***12658***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***7***1663***16384***12658***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12660***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12685***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***12682***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12637***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12638***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***12661***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***10***1663***16384***12650***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***43***1663***16384***12647***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***12638***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12684***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***12634***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***18***1663***16384***12629***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***12633***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***1***2***1663***16384***12631***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***6***1663***16384***12631***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***19***1663***16384***12629***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***8***1663***16384***24726***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***7***1663***16384***24726***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***6***1663***16384***24726***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***5***1663***16384***24726***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***4***1663***16384***24726***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***3***1663***16384***24726***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***2***1663***16384***24726***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***1***1663***16384***24726***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***0***1663***16384***24726***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***16***1663***16384***12649***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***7***1663***16384***12631***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***7***1663***16384***12635***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***27***1663***16384***12767***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***46***1663***16384***12764***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***20***1663***16384***12766***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***5***1663***16384***12767***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***3***1663***16384***12649***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***15***1663***16384***12649***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***21***1663***16384***12767***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***23***1663***16384***12766***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***44***1663***16384***12647***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***11***1663***16384***12631***-1***8192***8192
process("/home/pg93/pgsql9.3.1/bin/postgres").mark("smgr__md__write__done")***0***0***1663***16384***12669***-1***8192***8192
相关系统表列举 :
digoal=# select oid,relname from pg_class where relfilenode in (12658,12660,12682,12685,12637,12638,12661,12650,12631);
oid | relname
------+---------------------------
2840 | pg_toast_2619
2679 | pg_index_indexrelid_index
2610 | pg_index
(3 rows)
当然也涉及了t1表的9个数据块的flush, 在stap输出中下划线标出.
digoal=# select relfilenode from pg_class where relname='t1';
relfilenode
-------------
24726
(1 row)
参考
1. http://www.postgresql.org/docs/9.3/static/dynamic-trace.html
2. src/backend/storage/smgr/md.c
3. http://blog.163.com/digoal@126/blog/static/1638770402013916488761/