Systemtap(1.8) BUG? : delete from statistics(aggregates) type stored in array elements
背景
在使用systemtap分析数据库单条查询产生系统调用读和写信息时, 发现systemtap统计类型存在的一个BUG.
只要统计类型存储在数组的元素中, 就会触发这个BUG, 直接使用全局变量则不会发生这个BUG.
BUG的表现如 :
var[1] <<< 123
delete var[1]
表现为var[1]其中存储的统计流数据未被清除.
stap 版本
[root@db-172-16-3-39 ~]# stap -V
Systemtap translator/driver (version 1.8/0.152 non-git sources)
Copyright (C) 2005-2012 Red Hat, Inc. and others
This is free software; see the source for copying conditions.
enabled features: AVAHI LIBRPM LIBSQLITE3 NSS BOOST_SHARED_PTR TR1_UNORDERED_MAP NLS
经过测试发现2.4的版本已经修复了这个BUG, 请参考下一篇blog
http://blog.163.com/digoal@126/blog/static/163877040201391391613269/
stap 1.8 触发BUG的测试如下 :
stap脚本 :
[root@db-172-16-3-39 ~]# vi test.stp
global var1[50000], var2[50000], var3[50000]
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("query__start") {
pid=pid()
delete var1[pid]
delete var2[pid]
delete var3[pid]
var3[pid] = user_string($arg1)
printf("pid:%u, query start: %s, read: %u,%u, write: %u,%u\n", pid, var3[pid], @sum(var1[pid]), @count(var1[pid]), @sum(var2[pid]), @count(var2[pid]))
}
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("query__done") {
pid=pid()
printf("pid:%u, query done: %s, read: %u,%u, write: %u,%u\n", pid, var3[pid], @sum(var1[pid]), @count(var1[pid]), @sum(var2[pid]), @count(var2[pid]))
delete var1[pid]
delete var2[pid]
delete var3[pid]
}
probe syscall.read.return {
if (execname()=="postgres") {
pid=pid()
var1[pid] <<< $count
}
}
probe syscall.write.return {
if (execname()=="postgres") {
pid=pid()
var2[pid] <<< $count
}
}
probe end {
delete var1
delete var2
delete var3
}
执行stap :
[root@db-172-16-3-39 ~]# stap -D MAXSKIPPED=1 ./test.stp
在数据库中进行如下操作 :
digoal=# drop table t;
DROP TABLE
digoal=# create table t(id int, info text, crt_time timestamp);
CREATE TABLE
digoal=# insert into t select generate_series(1,1000000),md5(random()::text),now();
INSERT 0 1000000
digoal=# select count(*) from t;
count
---------
1000000
(1 row)
digoal=# select count(*) from t;
count
---------
1000000
(1 row)
digoal=# create index idx_t_1 on t(id);
CREATE INDEX
digoal=# select count(*) from t;
count
---------
1000000
(1 row)
digoal=# select count(*) from t;
count
---------
1000000
(1 row)
digoal=# select count(*) from t;
count
---------
1000000
(1 row)
digoal=# checkpoint;
CHECKPOINT
digoal=# checkpoint;
CHECKPOINT
stap的输出如下 :
pid:8981, query start: drop table t;, read: 0,0, write: 0,0
pid:8981, query done: drop table t;, read: 0,0, write: 65751,2
pid:8981, query start: create table t(id int, info text, crt_time timestamp);, read: 0,0, write: 65751,2
pid:8981, query done: create table t(id int, info text, crt_time timestamp);, read: 0,0, write: 82391,5
pid:8981, query start: insert into t select generate_series(1,1000000),md5(random()::text),now();, read: 0,0, write: 65751,2
pid:8981, query done: insert into t select generate_series(1,1000000),md5(random()::text),now();, read: 32768,4, write: 76669143,9353
pid:8981, query start: select count(*) from t;, read: 16384,2, write: 2129920,260
pid:8981, query done: select count(*) from t;, read: 16384,2, write: 2129920,260
pid:8981, query start: select count(*) from t;, read: 16384,2, write: 2129920,260
pid:8981, query done: select count(*) from t;, read: 16384,2, write: 2129920,260
pid:8981, query start: create index idx_t_1 on t(id);, read: 16384,2, write: 2129920,260
pid:8981, query done: create index idx_t_1 on t(id);, read: 16384,2, write: 24715496,3018
pid:8981, query start: select count(*) from t;, read: 16384,2, write: 2129920,260
pid:8981, query done: select count(*) from t;, read: 24576,3, write: 2129920,260
pid:8981, query start: select count(*) from t;, read: 16384,2, write: 2129920,260
pid:8981, query done: select count(*) from t;, read: 16384,2, write: 2129920,260
pid:8981, query start: select count(*) from t;, read: 16384,2, write: 2129920,260
pid:8981, query done: select count(*) from t;, read: 16384,2, write: 2129920,260
pid:8981, query start: checkpoint;, read: 16384,2, write: 2129920,260
pid:8981, query done: checkpoint;, read: 16384,2, write: 2129920,260
pid:8981, query start: checkpoint;, read: 16384,2, write: 2129920,260
pid:8981, query done: checkpoint;, read: 16384,2, write: 2129920,260
显然, 后面的一些输出中read和write的统计信息都没有被清除.
如果直接使用全局变量的话, 就不会发生这样的问题.
修改后的stap脚本如下 :
global var1, var2, var3[50000]
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("query__start") {
pid=pid()
delete var1
delete var2
delete var3[pid]
var3[pid] = user_string($arg1)
printf("pid:%u, query start: %s, read: %u,%u, write: %u,%u\n", pid, var3[pid], @sum(var1), @count(var1), @sum(var2), @count(var2))
}
probe process("/home/pg93/pgsql9.3.1/bin/postgres").mark("query__done") {
pid=pid()
printf("pid:%u, query done: %s, read: %u,%u, write: %u,%u\n", pid, var3[pid], @sum(var1), @count(var1), @sum(var2), @count(var2))
delete var1
delete var2
delete var3[pid]
}
probe syscall.read.return {
pid=pid()
if (pid==$1) {
var1 <<< $count
}
}
probe syscall.write.return {
pid=pid()
if (pid==$1) {
var2 <<< $count
}
}
probe end {
delete var1
delete var2
delete var3
}
查询数据库当前连接的pid :
digoal=# select pg_backend_pid();
pg_backend_pid
----------------
8981
(1 row)
将这个参数传给stap, 启动 :
[root@db-172-16-3-39 ~]# stap -D MAXSKIPPED=1 ./test.stp 8981
在数据库中执行如下操作 :
digoal=# drop table t;
DROP TABLE
digoal=# create table t(id int, info text, crt_time timestamp);
CREATE TABLE
digoal=# insert into t select generate_series(1,1000000),md5(random()::text),now();
INSERT 0 1000000
digoal=# select count(*) from t;
count
---------
1000000
(1 row)
digoal=# select count(*) from t;
count
---------
1000000
(1 row)
digoal=# create index idx_t_1 on t(id);
CREATE INDEX
digoal=# select count(*) from t;
count
---------
1000000
(1 row)
digoal=# select count(*) from t;
count
---------
1000000
(1 row)
digoal=# select count(*) from t;
count
---------
1000000
(1 row)
digoal=# checkpoint;
CHECKPOINT
digoal=# checkpoint;
CHECKPOINT
stap的输出如下, 已经正常了 :
pid:8981, query start: drop table t;, read: 0,0, write: 0,0
pid:8981, query done: drop table t;, read: 0,0, write: 65751,2
pid:8981, query start: create table t(id int, info text, crt_time timestamp);, read: 0,0, write: 0,0
pid:8981, query done: create table t(id int, info text, crt_time timestamp);, read: 0,0, write: 8448,2
pid:8981, query start: insert into t select generate_series(1,1000000),md5(random()::text),now();, read: 0,0, write: 0,0
pid:8981, query done: insert into t select generate_series(1,1000000),md5(random()::text),now();, read: 32768,4, write: 76603392,9351
pid:8981, query start: select count(*) from t;, read: 0,0, write: 0,0
pid:8981, query done: select count(*) from t;, read: 0,0, write: 0,0
pid:8981, query start: select count(*) from t;, read: 0,0, write: 0,0
pid:8981, query done: select count(*) from t;, read: 0,0, write: 0,0
pid:8981, query start: create index idx_t_1 on t(id);, read: 0,0, write: 0,0
pid:8981, query done: create index idx_t_1 on t(id);, read: 0,0, write: 22585576,2758
pid:8981, query start: select count(*) from t;, read: 0,0, write: 0,0
pid:8981, query done: select count(*) from t;, read: 8192,1, write: 0,0
pid:8981, query start: select count(*) from t;, read: 0,0, write: 0,0
pid:8981, query done: select count(*) from t;, read: 0,0, write: 0,0
pid:8981, query start: select count(*) from t;, read: 0,0, write: 0,0
pid:8981, query done: select count(*) from t;, read: 0,0, write: 0,0
pid:8981, query start: checkpoint;, read: 0,0, write: 0,0
pid:8981, query done: checkpoint;, read: 0,0, write: 0,0
pid:8981, query start: checkpoint;, read: 0,0, write: 0,0
pid:8981, query done: checkpoint;, read: 0,0, write: 0,0
参考
1. http://www.postgresql.org/docs/9.3/static/dynamic-trace.html
2. syscalls2.stp
3. fs/read_write.c
4.
cd /usr/share/systemtap/tapset
[root@db-172-16-3-39 tapset]# less syscalls2.stp
# read _______________________________________________________
# ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
probe syscall.read = kernel.function("sys_read").call
{
name = "read"
fd = $fd
buf_uaddr = $buf
count = $count
argstr = sprintf("%d, %p, %d", $fd, $buf, $count)
}
probe syscall.read.return = kernel.function("sys_read").return
{
name = "read"
retstr = return_str(1, $return)
}
# write ______________________________________________________
#
# ssize_t sys_write(unsigned int fd,
# const char __user * buf,
# size_t count)
#
probe syscall.write = kernel.function("sys_write").call
{
name = "write"
fd = $fd
buf_uaddr = $buf
count = $count
argstr = sprintf("%d, %s, %d", $fd, text_strn(user_string($buf), syscall_string_trunc, 1), $count)
}
probe syscall.write.return = kernel.function("sys_write").return
{
name = "write"
retstr = return_str(1, $return)
}
[root@db-172-16-3-39 tapset]# stap -l 'kernel.function("sys_read")'
kernel.function("sys_read@fs/read_write.c:379")
[root@db-172-16-3-39 tapset]# stap -l 'kernel.function("sys_write")'
kernel.function("sys_write@fs/read_write.c:397")
[root@db-172-16-3-39 src]# locate read_write.c
/usr/src/debug/kernel-2.6.18/linux-2.6.18-348.12.1.el5.x86_64/fs/read_write.c
[root@db-172-16-3-39 src]# less /usr/src/debug/kernel-2.6.18/linux-2.6.18-348.12.1.el5.x86_64/fs/read_write.c
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
file = fget_light(fd, &fput_needed);
if (file) {
loff_t pos = file_pos_read(file);
ret = vfs_read(file, buf, count, &pos);
file_pos_write(file, pos);
fput_light(file, fput_needed);
}
return ret;
}
EXPORT_SYMBOL_GPL(sys_read);
asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
file = fget_light(fd, &fput_needed);
if (file) {
loff_t pos = file_pos_read(file);
ret = vfs_write(file, buf, count, &pos);
file_pos_write(file, pos);
fput_light(file, fput_needed);
}
return ret;
}
EXPORT_SYMBOL_GPL(sys_write);