Systemtap(1.8) BUG? : delete from statistics(aggregates) type stored in array elements

5 minute read

背景

在使用systemtap分析数据库单条查询产生系统调用读和写信息时, 发现systemtap统计类型存在的一个BUG.  
只要统计类型存储在数组的元素中, 就会触发这个BUG, 直接使用全局变量则不会发生这个BUG.  
BUG的表现如 :   
var[1] <<< 123  
delete var[1]  
表现为var[1]其中存储的统计流数据未被清除.  
stap 版本  
[root@db-172-16-3-39 ~]# stap -V  
Systemtap translator/driver (version 1.8/0.152 non-git sources)  
Copyright (C) 2005-2012 Red Hat, Inc. and others  
This is free software; see the source for copying conditions.  
enabled features: AVAHI LIBRPM LIBSQLITE3 NSS BOOST_SHARED_PTR TR1_UNORDERED_MAP NLS  

经过测试发现2.4的版本已经修复了这个BUG, 请参考下一篇blog

http://blog.163.com/digoal@126/blog/static/163877040201391391613269/

stap 1.8 触发BUG的测试如下 :   
stap脚本 :   
[root@db-172-16-3-39 ~]# vi test.stp  
global var1[50000], var2[50000], var3[50000]  
  
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("query__start") {  
  pid=pid()  
  delete var1[pid]  
  delete var2[pid]  
  delete var3[pid]  
  var3[pid] = user_string($arg1)  
  printf("pid:%u, query start: %s, read: %u,%u, write: %u,%u\n", pid, var3[pid], @sum(var1[pid]), @count(var1[pid]), @sum(var2[pid]), @count(var2[pid]))  
}  
  
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("query__done") {  
  pid=pid()  
  printf("pid:%u, query done: %s, read: %u,%u, write: %u,%u\n", pid, var3[pid], @sum(var1[pid]), @count(var1[pid]), @sum(var2[pid]), @count(var2[pid]))  
  delete var1[pid]  
  delete var2[pid]  
  delete var3[pid]  
}  
  
probe syscall.read.return {  
  if (execname()=="postgres") {  
    pid=pid()  
    var1[pid] <<< $count  
  }  
}  
  
probe syscall.write.return {  
  if (execname()=="postgres") {  
    pid=pid()  
    var2[pid] <<< $count  
  }  
}  
  
probe end {  
  delete var1  
  delete var2  
  delete var3  
}  
执行stap :   
[root@db-172-16-3-39 ~]# stap -D MAXSKIPPED=1 ./test.stp   
在数据库中进行如下操作 :   
digoal=# drop table t;  
DROP TABLE  
digoal=# create table t(id int, info text, crt_time timestamp);  
CREATE TABLE  
digoal=# insert into t select generate_series(1,1000000),md5(random()::text),now();  
INSERT 0 1000000  
digoal=# select count(*) from t;  
  count    
---------  
 1000000  
(1 row)  
  
digoal=# select count(*) from t;  
  count    
---------  
 1000000  
(1 row)  
  
digoal=# create index idx_t_1 on t(id);  
CREATE INDEX  
digoal=# select count(*) from t;  
  count    
---------  
 1000000  
(1 row)  
  
digoal=# select count(*) from t;  
  count    
---------  
 1000000  
(1 row)  
  
digoal=# select count(*) from t;  
  count    
---------  
 1000000  
(1 row)  
  
digoal=# checkpoint;  
CHECKPOINT  
digoal=# checkpoint;  
CHECKPOINT  
stap的输出如下 :   
pid:8981, query start: drop table t;, read: 0,0, write: 0,0  
pid:8981, query done: drop table t;, read: 0,0, write: 65751,2  
pid:8981, query start: create table t(id int, info text, crt_time timestamp);, read: 0,0, write: 65751,2  
pid:8981, query done: create table t(id int, info text, crt_time timestamp);, read: 0,0, write: 82391,5  
pid:8981, query start: insert into t select generate_series(1,1000000),md5(random()::text),now();, read: 0,0, write: 65751,2  
pid:8981, query done: insert into t select generate_series(1,1000000),md5(random()::text),now();, read: 32768,4, write: 76669143,9353  
pid:8981, query start: select count(*) from t;, read: 16384,2, write: 2129920,260  
pid:8981, query done: select count(*) from t;, read: 16384,2, write: 2129920,260  
pid:8981, query start: select count(*) from t;, read: 16384,2, write: 2129920,260  
pid:8981, query done: select count(*) from t;, read: 16384,2, write: 2129920,260  
pid:8981, query start: create index idx_t_1 on t(id);, read: 16384,2, write: 2129920,260  
pid:8981, query done: create index idx_t_1 on t(id);, read: 16384,2, write: 24715496,3018  
pid:8981, query start: select count(*) from t;, read: 16384,2, write: 2129920,260  
pid:8981, query done: select count(*) from t;, read: 24576,3, write: 2129920,260  
pid:8981, query start: select count(*) from t;, read: 16384,2, write: 2129920,260  
pid:8981, query done: select count(*) from t;, read: 16384,2, write: 2129920,260  
pid:8981, query start: select count(*) from t;, read: 16384,2, write: 2129920,260  
pid:8981, query done: select count(*) from t;, read: 16384,2, write: 2129920,260  
pid:8981, query start: checkpoint;, read: 16384,2, write: 2129920,260  
pid:8981, query done: checkpoint;, read: 16384,2, write: 2129920,260  
pid:8981, query start: checkpoint;, read: 16384,2, write: 2129920,260  
pid:8981, query done: checkpoint;, read: 16384,2, write: 2129920,260  
显然, 后面的一些输出中read和write的统计信息都没有被清除.   
如果直接使用全局变量的话, 就不会发生这样的问题.  
修改后的stap脚本如下 :   
global var1, var2, var3[50000]  
  
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("query__start") {  
  pid=pid()  
  delete var1  
  delete var2  
  delete var3[pid]  
  var3[pid] = user_string($arg1)  
  printf("pid:%u, query start: %s, read: %u,%u, write: %u,%u\n", pid, var3[pid], @sum(var1), @count(var1), @sum(var2), @count(var2))  
}  
  
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("query__done") {  
  pid=pid()  
  printf("pid:%u, query done: %s, read: %u,%u, write: %u,%u\n", pid, var3[pid], @sum(var1), @count(var1), @sum(var2), @count(var2))  
  delete var1  
  delete var2  
  delete var3[pid]  
}  
  
probe syscall.read.return {  
  pid=pid()  
  if (pid==$1) {  
    var1 <<< $count  
  }  
}  
  
probe syscall.write.return {  
  pid=pid()  
  if (pid==$1) {  
    var2 <<< $count  
  }  
}  
  
probe end {  
  delete var1  
  delete var2  
  delete var3  
}  
  
查询数据库当前连接的pid :   
digoal=# select pg_backend_pid();  
 pg_backend_pid   
----------------  
           8981  
(1 row)  
  
将这个参数传给stap, 启动 :   
[root@db-172-16-3-39 ~]# stap -D MAXSKIPPED=1 ./test.stp 8981  
在数据库中执行如下操作 :   
digoal=# drop table t;  
DROP TABLE  
digoal=# create table t(id int, info text, crt_time timestamp);  
CREATE TABLE  
digoal=# insert into t select generate_series(1,1000000),md5(random()::text),now();  
INSERT 0 1000000  
digoal=# select count(*) from t;  
  count    
---------  
 1000000  
(1 row)  
  
digoal=# select count(*) from t;  
  count    
---------  
 1000000  
(1 row)  
  
digoal=# create index idx_t_1 on t(id);  
CREATE INDEX  
digoal=# select count(*) from t;  
  count    
---------  
 1000000  
(1 row)  
  
digoal=# select count(*) from t;  
  count    
---------  
 1000000  
(1 row)  
  
digoal=# select count(*) from t;  
  count    
---------  
 1000000  
(1 row)  
  
digoal=# checkpoint;  
CHECKPOINT  
digoal=# checkpoint;  
CHECKPOINT  
  
stap的输出如下, 已经正常了 :   
pid:8981, query start: drop table t;, read: 0,0, write: 0,0  
pid:8981, query done: drop table t;, read: 0,0, write: 65751,2  
pid:8981, query start: create table t(id int, info text, crt_time timestamp);, read: 0,0, write: 0,0  
pid:8981, query done: create table t(id int, info text, crt_time timestamp);, read: 0,0, write: 8448,2  
pid:8981, query start: insert into t select generate_series(1,1000000),md5(random()::text),now();, read: 0,0, write: 0,0  
pid:8981, query done: insert into t select generate_series(1,1000000),md5(random()::text),now();, read: 32768,4, write: 76603392,9351  
pid:8981, query start: select count(*) from t;, read: 0,0, write: 0,0  
pid:8981, query done: select count(*) from t;, read: 0,0, write: 0,0  
pid:8981, query start: select count(*) from t;, read: 0,0, write: 0,0  
pid:8981, query done: select count(*) from t;, read: 0,0, write: 0,0  
pid:8981, query start: create index idx_t_1 on t(id);, read: 0,0, write: 0,0  
pid:8981, query done: create index idx_t_1 on t(id);, read: 0,0, write: 22585576,2758  
pid:8981, query start: select count(*) from t;, read: 0,0, write: 0,0  
pid:8981, query done: select count(*) from t;, read: 8192,1, write: 0,0  
pid:8981, query start: select count(*) from t;, read: 0,0, write: 0,0  
pid:8981, query done: select count(*) from t;, read: 0,0, write: 0,0  
pid:8981, query start: select count(*) from t;, read: 0,0, write: 0,0  
pid:8981, query done: select count(*) from t;, read: 0,0, write: 0,0  
pid:8981, query start: checkpoint;, read: 0,0, write: 0,0  
pid:8981, query done: checkpoint;, read: 0,0, write: 0,0  
pid:8981, query start: checkpoint;, read: 0,0, write: 0,0  
pid:8981, query done: checkpoint;, read: 0,0, write: 0,0  

参考

1. http://www.postgresql.org/docs/9.3/static/dynamic-trace.html

2. syscalls2.stp

3. fs/read_write.c

4.

cd /usr/share/systemtap/tapset  
[root@db-172-16-3-39 tapset]# less syscalls2.stp  
  
# read _______________________________________________________  
# ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)  
probe syscall.read = kernel.function("sys_read").call  
{  
        name = "read"  
        fd = $fd  
        buf_uaddr = $buf  
        count = $count  
        argstr = sprintf("%d, %p, %d", $fd, $buf, $count)  
}  
probe syscall.read.return = kernel.function("sys_read").return  
{  
        name = "read"  
        retstr = return_str(1, $return)  
}  
  
# write ______________________________________________________  
#  
# ssize_t sys_write(unsigned int fd,  
#            const char __user * buf,  
#            size_t count)  
#  
probe syscall.write = kernel.function("sys_write").call  
{  
        name = "write"  
        fd = $fd  
        buf_uaddr = $buf  
        count = $count  
        argstr = sprintf("%d, %s, %d", $fd, text_strn(user_string($buf), syscall_string_trunc, 1), $count)  
}  
probe syscall.write.return = kernel.function("sys_write").return  
{  
        name = "write"  
        retstr = return_str(1, $return)  
}  
  
  
[root@db-172-16-3-39 tapset]# stap -l 'kernel.function("sys_read")'  
kernel.function("sys_read@fs/read_write.c:379")  
[root@db-172-16-3-39 tapset]# stap -l 'kernel.function("sys_write")'  
kernel.function("sys_write@fs/read_write.c:397")  
  
[root@db-172-16-3-39 src]# locate read_write.c  
/usr/src/debug/kernel-2.6.18/linux-2.6.18-348.12.1.el5.x86_64/fs/read_write.c  
  
[root@db-172-16-3-39 src]# less /usr/src/debug/kernel-2.6.18/linux-2.6.18-348.12.1.el5.x86_64/fs/read_write.c  
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)  
{  
        struct file *file;  
        ssize_t ret = -EBADF;  
        int fput_needed;  
  
        file = fget_light(fd, &fput_needed);  
        if (file) {  
                loff_t pos = file_pos_read(file);  
                ret = vfs_read(file, buf, count, &pos);  
                file_pos_write(file, pos);  
                fput_light(file, fput_needed);  
        }  
  
        return ret;  
}  
EXPORT_SYMBOL_GPL(sys_read);  
  
asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)  
{  
        struct file *file;  
        ssize_t ret = -EBADF;  
        int fput_needed;  
  
        file = fget_light(fd, &fput_needed);  
        if (file) {  
                loff_t pos = file_pos_read(file);  
                ret = vfs_write(file, buf, count, &pos);  
                file_pos_write(file, pos);  
                fput_light(file, fput_needed);  
        }  
  
        return ret;  
}  
  
EXPORT_SYMBOL_GPL(sys_write);  

Flag Counter

digoal’s 大量PostgreSQL文章入口