Systemtap examples, DISK IO - 1 Summarizing Disk Read/Write Traffic
背景
本文开始给大家讲一下linux系统中disk io相关的跟踪范例.
例子来自disktop.stp 脚本, 该脚本作者的用意是输出系统中IO请求字节数前10的进程信息以及对应的块设备信息.
但是这个脚本存在一个严重的BUG, 输出时对应的设备名其实是没有意义的, 它只是当时的设备号, 因为一个进程可能会对多个块设备进行IO操作, 但是脚本中记录的仅仅是最后一次io请求的块设备名.
后面会讲如何修改这个文件, 修复bug.
脚本内容以及注解 :
[root@db-172-16-3-150 network]# cd /usr/share/systemtap/testsuite/systemtap.examples/io
[root@db-172-16-3-150 network]# cat disktop.stp
#!/usr/bin/stap
#
# Copyright (C) 2007 Oracle Corp.
#
# Get the status of reading/writing disk every 5 seconds,
# output top ten entries
#
# This is free software,GNU General Public License (GPL);
# either version 2, or (at your option) any later version.
#
# Usage:
# ./disktop.stp
#
global io_stat,device
global read_bytes,write_bytes
// 定义几个全局变量, io_stat存储累加的读, 写的字节数
// device存储设备名, bug就在这里. 它存储的是vfs.read.return或vfs.write.return当时的设备名,
// 同pid(),execname(),uid(),ppid()操作的块设备一下次是可能会变化的.
// read_bytes和write_bytes是全局的读写字节数, 不区分进程号.
probe vfs.read.return {
if ($return>0) {
if (devname!="N/A") {/*skip read from cache*/
io_stat[pid(),execname(),uid(),ppid(),"R"] += $return
device[pid(),execname(),uid(),ppid(),"R"] = devname
read_bytes += $return
}
}
}
// devname != "N/A" 过滤掉cache的块操作. 其他的都是块设备上的io操作.
probe vfs.write.return {
if ($return>0) {
if (devname!="N/A") { /*skip update cache*/
io_stat[pid(),execname(),uid(),ppid(),"W"] += $return
device[pid(),execname(),uid(),ppid(),"W"] = devname
write_bytes += $return
}
}
}
// devname != "N/A" 过滤掉cache的块操作. 其他的都是块设备上的io操作.
probe timer.ms(5000) {
/* skip non-read/write disk */
if (read_bytes+write_bytes) {
printf("\n%-25s, %-8s%4dKb/sec, %-7s%6dKb, %-7s%6dKb\n\n",
ctime(gettimeofday_s()),
"Average:", ((read_bytes+write_bytes)/1024)/5,
"Read:",read_bytes/1024,
"Write:",write_bytes/1024)
// 输出全局读写字节数, 速率.
// ctime用于把unix epoch second转换成年月日时分秒
/* print header */
printf("%8s %8s %8s %25s %8s %4s %12s\n",
"UID","PID","PPID","CMD","DEVICE","T","BYTES")
}
// 输出头信息, 包含用户id, 进程id, 父进程id, 命令名, 块设备名, 读写类型, 字节数
// 接下来的输出与之对应
/* print top ten I/O */
foreach ([process,cmd,userid,parent,action] in io_stat- limit 10)
printf("%8d %8d %8d %25s %8s %4s %12d\n",
userid,process,parent,cmd,
device[process,cmd,userid,parent,action],
action,io_stat[process,cmd,userid,parent,action])
// 输出io请求前10的进程和设备信息.
// 前面说了, 这里的设备没有意义
/* clear data */
delete io_stat
delete device
read_bytes = 0
write_bytes = 0
// 输出后清除全局变量的数据
}
probe end{
delete io_stat
delete device
delete read_bytes
delete write_bytes
}
执行输出举例 :
这里使用两个块设备的读写来给大家看看脚本的bug.
[root@db-172-16-3-150 io]# df -h
Filesystem Size Used Avail Use% Mounted on
/dev/sdc1 29G 6.5G 21G 24% /
tmpfs 48G 0 48G 0% /dev/shm
/dev/sdc3 98G 24G 70G 26% /opt
/dev/sdd1 183G 5.4G 169G 4% /ssd1
/dev/sde1 469G 1.7G 444G 1% /ssd2
/dev/sda1 221G 195M 209G 1% /ssd3
/dev/sdb1 221G 9.6G 200G 5% /ssd4
我们用到的是/dev/sda1和/dev/sdb1
digoal=# \db
List of tablespaces
Name | Owner | Location
------------+----------+-----------------------
pg_default | postgres |
pg_global | postgres |
tbs_digoal | postgres | /ssd3/pg93/tbs_digoal
tbs_idx | postgres | /ssd4/pg93/tbs_idx
(4 rows)
创建表和索引, 分别存在两个设备上.
digoal=# create table test(id int primary key, info text, crt_time timestamp) tablespace tbs_digoal;
CREATE TABLE
digoal=# create index idx_test_info on test(info) tablespace tbs_idx;
CREATE INDEX
创建测试用的函数
digoal=# CREATE OR REPLACE FUNCTION public.f_test(i_id integer)
RETURNS void
LANGUAGE plpgsql
STRICT
AS $function$
declare
begin
update test set info=md5(random()::text), crt_time=clock_timestamp() where id=i_id;
if not found then
insert into test(id,info,crt_time) values(i_id,md5(random()::text),clock_timestamp());
end if;
return;
exception when others then
return;
end;
$function$;
创建测试用的bench 脚本
pg93@db-172-16-3-150-> vi test.sql
\setrandom id 1 5000000
select f_test(:id);
进行bench测试, 维持60秒
pg93@db-172-16-3-150-> pgbench -M prepared -n -r -f ./test.sql -c 1 -T 60 -h $PGDATA -p 1921 -U postgres digoal
测试过程中查看一下相关进程号和进程信息
[root@db-172-16-3-150 io]# ps -ewf|grep pg93
pg93 9171 1 0 10:39 pts/3 00:00:00 /home/pg93/pgsql9.3.1/bin/postgres
pg93 9172 9171 0 10:39 ? 00:00:00 postgres: logger process
pg93 9174 9171 0 10:39 ? 00:00:00 postgres: checkpointer process
pg93 9175 9171 0 10:39 ? 00:00:00 postgres: writer process
pg93 9176 9171 0 10:39 ? 00:00:01 postgres: wal writer process
pg93 9177 9171 0 10:39 ? 00:00:00 postgres: autovacuum launcher process
pg93 9178 9171 0 10:39 ? 00:00:00 postgres: archiver process last was 0000000100000010000000ED
pg93 9179 9171 0 10:39 ? 00:00:00 postgres: stats collector process
pg93 9214 9000 22 10:42 pts/3 00:00:11 pgbench -M prepared -n -r -f ./test.sql -c 1 -T 60 -h /ssd2/pg93/pg_root -p 1921 -U postgres digoal
pg93 9216 9171 80 10:42 ? 00:00:40 postgres: postgres digoal [local] SELECT
开启disktop.stp, 收集块设备io统计信息, 从输出来看, bench的postgres进程9216只写了1个块设备, checkpoint也只写了1个块设备, 这即是disktop.stp中的bug, 记录的只是最后一次块操作的设备名.
[root@db-172-16-3-150 io]# stap disktop.stp
Fri Nov 15 02:42:34 2013 , Average:6711Kb/sec, Read: 7Kb, Write: 33552Kb
UID PID PPID CMD DEVICE T BYTES
500 9176 9171 postgres sde1 W 31064064
500 9216 9171 postgres sda1 W 3293184
500 9223 9178 sh sdc1 R 3620
500 9223 9178 date sdc1 R 3533
500 9223 9178 postgres sdc1 R 788
500 9172 9171 postgres sde1 W 29
Fri Nov 15 02:42:39 2013 , Average:3754Kb/sec, Read: 135Kb, Write: 18637Kb
UID PID PPID CMD DEVICE T BYTES
500 9176 9171 postgres sde1 W 15630336
500 9216 9171 postgres sda1 W 3448832
502 9224 1670 postgres sdd1 R 136419
502 1678 1670 postgres sdd1 W 5547
502 1676 1670 postgres sdd1 R 1968
下面我把disktop.stp修改一下, 在device中存储按块设备的统计, 原来的io_stat保持不变. 脚本如下
[root@db-172-16-3-150 io]# cat disktop_digoal.stp
#!/usr/bin/stap
global io_stat,device
global read_bytes,write_bytes
probe vfs.read.return {
if ($return>0) {
if (devname!="N/A") {/*skip read from cache*/
io_stat[pid(),execname(),uid(),ppid(),"R"] += $return
device[pid(),execname(),uid(),ppid(),"R", devname] += $return
read_bytes += $return
}
}
}
probe vfs.write.return {
if ($return>0) {
if (devname!="N/A") { /*skip update cache*/
io_stat[pid(),execname(),uid(),ppid(),"W"] += $return
device[pid(),execname(),uid(),ppid(),"W", devname] += $return
write_bytes += $return
}
}
}
probe timer.ms(5000) {
/* skip non-read/write disk */
if (read_bytes+write_bytes) {
printf("\n%-25s, %-8s%4dKb/sec, %-7s%6dKb, %-7s%6dKb\n\n",
ctime(gettimeofday_s()),
"Average:", ((read_bytes+write_bytes)/1024)/5,
"Read:",read_bytes/1024,
"Write:",write_bytes/1024)
/* print header */
printf("%8s %8s %8s %25s %8s %4s %12s\n",
"UID","PID","PPID","CMD","DEVICE","T","BYTES")
}
/* print top ten I/O */
foreach ([process,cmd,userid,parent,action] in io_stat- limit 10) {
foreach ([a,b,c,d,e,f] in device) {
if (a==process && b==cmd && c==userid && d==parent && e==action)
printf("%8d %8d %8d %25s %8s %4s %12d\n",
userid,process,parent,cmd,
f,
action,device[process,cmd,userid,parent,action,f])
}
}
/* clear data */
delete io_stat
delete device
read_bytes = 0
write_bytes = 0
}
probe end{
delete io_stat
delete device
delete read_bytes
delete write_bytes
}
修改后重复以上测试 :
[root@db-172-16-3-150 systemtap.examples]# ps -ewf|grep pg93
pg93 9614 1 0 11:17 pts/1 00:00:00 /home/pg93/pgsql9.3.1/bin/postgres
pg93 9615 9614 0 11:17 ? 00:00:00 postgres: logger process
pg93 9617 9614 0 11:17 ? 00:00:00 postgres: checkpointer process
pg93 9618 9614 0 11:17 ? 00:00:00 postgres: writer process
pg93 9619 9614 0 11:17 ? 00:00:00 postgres: wal writer process
pg93 9620 9614 0 11:17 ? 00:00:00 postgres: autovacuum launcher process
pg93 9621 9614 0 11:17 ? 00:00:00 postgres: archiver process last was 0000000100000010000000FB
pg93 9622 9614 0 11:17 ? 00:00:00 postgres: stats collector process
pg93 9632 9000 16 11:18 pts/3 00:00:00 pgbench -M prepared -n -r -f ./test.sql -c 1 -T 60 -h /ssd2/pg93/pg_root -p 1921 -U postgres digoal
pg93 9634 9614 99 11:18 ? 00:00:03 postgres: postgres digoal [local] SELECT
可以看到, 修改后的脚本, 输出的设备名有意义了, 同一个进程9249, 也就是bench进程, 写了3个块设备的数据. sdb1 sda1 sde1,
分别是global, table, index所在的块设备. 同意可以看到wal writer进程只写了pg_xlog所在的块设备.
[root@db-172-16-3-150 io]# stap disktop_digoal.stp
Fri Nov 15 03:18:36 2013 , Average:4103Kb/sec, Read: 1711Kb, Write: 18808Kb
UID PID PPID CMD DEVICE T BYTES
500 9619 9614 postgres sdd1 W 16220160
500 9634 9614 postgres sda1 W 1564672
500 9634 9614 postgres sdb1 W 1343488
500 9634 9614 postgres sde1 W 131072
500 9634 9614 postgres sda1 R 1744896
500 9652 9621 sh sdc1 R 3620
500 9652 9621 date sdc1 R 3533
500 9652 9621 postgres sdc1 R 788
500 9615 9614 postgres sde1 W 29
Fri Nov 15 03:18:41 2013 , Average:3780Kb/sec, Read: 1175Kb, Write: 17725Kb
UID PID PPID CMD DEVICE T BYTES
500 9619 9614 postgres sdd1 W 15204352
500 9634 9614 postgres sda1 W 1589248
500 9634 9614 postgres sdb1 W 1220608
500 9634 9614 postgres sde1 W 131072
500 9634 9614 postgres sda1 R 1064960
502 9653 1670 postgres sdd1 R 136419
502 1678 1670 postgres sdd1 W 5547
502 1676 1670 postgres sdd1 R 1968
本文用到的2个probe alias原型.
/usr/share/systemtap/tapset/vfs.stp
probe vfs.read.return = kernel.function("vfs_read").return
{
name = "vfs.read"
retstr = sprintf("%d", $return)
file = $file
pos = $pos
buf = $buf
bytes_to_read = $count
dev = __file_dev($file)
devname = __find_bdevname(dev, __file_bdev($file))
ino = __file_ino($file)
ret = $return
bytes_read = $return > 0 ? $return : 0
error = $return < 0 ? $return : 0
error_str = error ? errno_str(error) : ""
}
probe vfs.write.return = kernel.function("vfs_write").return
{
name = "vfs.write"
retstr = sprintf("%d", $return)
file = $file
pos = $pos
buf = $buf
bytes_to_write = $count
dev = __file_dev($file)
devname = __find_bdevname(dev, __file_bdev($file))
ino = __file_ino($file)
ret = $return
bytes_written = $return > 0 ? $return : 0
error = $return < 0 ? $return : 0
error_str = error ? errno_str(error) : ""
}
$return为.return事件的上下文变量, 即事件函数返回值, 如本例用到的vfs_read.return, 对应的函数vfs_read@fs/read_write.c的返回类型为ssize_t, 代码中为ret, systemtap将使用$return来代表这个ret.
[root@db-172-16-3-150 io]# stap -L 'kernel.function("vfs_read")'
kernel.function("vfs_read@fs/read_write.c:294") $file:struct file* $buf:char* $count:size_t $pos:loff_t*
[root@db-172-16-3-150 io]# stap -L 'kernel.function("vfs_read").return'
kernel.function("vfs_read@fs/read_write.c:294").return $return:ssize_t $file:struct file* $buf:char* $count:size_t $pos:loff_t*
这两个函数的源码 :
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/fs/read_write.c
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_READ))
return -EBADF;
if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
return -EFAULT;
ret = rw_verify_area(READ, file, pos, count);
if (ret >= 0) {
count = ret;
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos);
else
ret = do_sync_read(file, buf, count, pos);
if (ret > 0) {
fsnotify_access(file->f_path.dentry);
add_rchar(current, ret);
}
inc_syscr(current);
}
return ret;
}
EXPORT_SYMBOL(vfs_read);
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
if (ret >= 0) {
count = ret;
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
if (ret > 0) {
fsnotify_modify(file->f_path.dentry);
add_wchar(current, ret);
}
inc_syscw(current);
}
return ret;
}
EXPORT_SYMBOL(vfs_write);
参考
1. https://sourceware.org/systemtap/SystemTap_Beginners_Guide/mainsect-disk.html
2. https://sourceware.org/systemtap/examples/
3. /usr/share/systemtap/testsuite/systemtap.examples
4. systemtap-testsuite
5. /usr/share/systemtap/testsuite/systemtap.examples/index.txt
6. /usr/share/systemtap/testsuite/systemtap.examples/keyword-index.txt
7. /usr/share/systemtap/tapset
8. https://sourceware.org/systemtap/tapsets/API-ctime.html
9. https://sourceware.org/systemtap/tapsets/API-gettimeofday-s.html