Systemtap examples, DISK IO - 3 Track Cumulative IO
背景
例子来自traceio.stp 脚本, 该脚本通过事件vfs.read.return和vfs.write.return统计进程的累计读写调用情况. 输出读写前10的进程信息以及累计的读, 写的字节数信息.
注意原始的例子中没有对devname做过滤, 所以读写包含了cache的操作, 也没有对return值是否大于0做过滤.
我稍微修改了一下, 增加了对return大于0的过滤, 如果读者有兴趣过滤cache, 可以再增加devname的过滤.
修改后的脚本内容以及注解 :
[root@db-172-16-3-150 network]# cd /usr/share/systemtap/testsuite/systemtap.examples/io
[root@db-172-16-3-150 io]# cat traceio.stp
#!/usr/bin/stap
# traceio.stp
# Copyright (C) 2007 Red Hat, Inc., Eugene Teo <eteo@redhat.com>
# Copyright (C) 2009 Kai Meyer <kai@unixlords.com>
# Fixed a bug that allows this to run longer
# And added the humanreadable function
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
global reads, writes, total_io
// reads 进程读字节数
// writes 进程写字节数
// total_io 进程读写字节数
probe vfs.read.return {
if ($return > 0) {
reads[pid(),execname()] += $return
total_io[pid(),execname()] += $return
}
}
// 读者可以使用devname != "N/A"过滤cache.
probe vfs.write.return {
if ($return > 0) {
writes[pid(),execname()] += $return
total_io[pid(),execname()] += $return
}
}
// 读者可以使用devname != "N/A"过滤cache.
function humanreadable(bytes) {
if (bytes > 1024*1024*1024) {
return sprintf("%d GiB", bytes/1024/1024/1024)
} else if (bytes > 1024*1024) {
return sprintf("%d MiB", bytes/1024/1024)
} else if (bytes > 1024) {
return sprintf("%d KiB", bytes/1024)
} else {
return sprintf("%d B", bytes)
}
}
// humanreadable函数把字节数转换成更可读的字符串.
// 但是请注意, 因为systemtap只支持整形, 转换后的精度不高.
probe timer.s(1) {
foreach([p,e] in total_io- limit 10) // 按照读写字节数倒序输出前10读写最大的进程的读, 写信息.
printf("%8d %15s r: %12s w: %12s\n",
p, e, humanreadable(reads[p,e]),
humanreadable(writes[p,e]))
printf("\n")
# Note we don't zero out reads, writes and total_io,
# so the values are cumulative since the script started.
}
// 每秒输出一次
执行输出举例 :
[root@db-172-16-3-150 io]# stap traceio.stp
10992 postgres r: 116 B w: 9 MiB
5804 postgres r: 0 B w: 472 KiB
10991 postgres r: 14 B w: 14 B
5820 stapio r: 12 B w: 0 B
1706 avahi-daemon r: 4 B w: 4 B
10992 postgres r: 236 B w: 18 MiB
5804 postgres r: 0 B w: 888 KiB
5823 sh r: 4 KiB w: 0 B
5823 date r: 3 KiB w: 29 B
1903 automount r: 2 KiB w: 0 B
5823 postgres r: 788 B w: 0 B
3992 sshd r: 292 B w: 340 B
5820 stapio r: 298 B w: 286 B
10991 postgres r: 30 B w: 30 B
10988 postgres r: 29 B w: 29 B
10992 postgres r: 349 B w: 27 MiB
5804 postgres r: 0 B w: 1 MiB
5823 sh r: 4 KiB w: 0 B
5823 date r: 3 KiB w: 29 B
1903 automount r: 2 KiB w: 0 B
3992 sshd r: 874 B w: 968 B
5820 stapio r: 869 B w: 857 B
5823 postgres r: 788 B w: 0 B
10991 postgres r: 44 B w: 44 B
10988 postgres r: 29 B w: 29 B
本文用到的几个probe alias原型(包含对应的call原型).
/usr/share/systemtap/tapset/vfs.stp
probe vfs.read = kernel.function("vfs_read")
{
file = $file
pos = $pos
buf = $buf
bytes_to_read = $count
dev = __file_dev($file)
devname = __find_bdevname(dev, __file_bdev($file))
ino = __file_ino($file)
name = "vfs.read"
argstr = sprintf("%d, %d, %p", $count, $pos, $buf)
}
probe vfs.read.return = kernel.function("vfs_read").return
{
name = "vfs.read"
retstr = sprintf("%d", $return)
file = $file
pos = $pos
buf = $buf
bytes_to_read = $count
dev = __file_dev($file)
devname = __find_bdevname(dev, __file_bdev($file))
ino = __file_ino($file)
ret = $return
bytes_read = $return > 0 ? $return : 0
error = $return < 0 ? $return : 0
error_str = error ? errno_str(error) : ""
}
probe vfs.write = kernel.function("vfs_write")
{
file = $file
pos = $pos
buf = $buf
bytes_to_write = $count
dev = __file_dev($file)
devname = __find_bdevname(dev, __file_bdev($file))
ino = __file_ino($file)
name = "vfs.write"
argstr = sprintf("%d, %d, %p", $count, $pos, $buf)
}
probe vfs.write.return = kernel.function("vfs_write").return
{
name = "vfs.write"
retstr = sprintf("%d", $return)
file = $file
pos = $pos
buf = $buf
bytes_to_write = $count
dev = __file_dev($file)
devname = __find_bdevname(dev, __file_bdev($file))
ino = __file_ino($file)
ret = $return
bytes_written = $return > 0 ? $return : 0
error = $return < 0 ? $return : 0
error_str = error ? errno_str(error) : ""
}
这几个函数的源码位置信息: (源码这里就不贴出来了)
[root@db-172-16-3-150 io]stap -L 'kernel.function("vfs_read")'
kernel.function("vfs_read@fs/read_write.c:294") $file:struct file* $buf:char* $count:size_t $pos:loff_t*
[root@db-172-16-3-150 io]# stap -L 'kernel.function("vfs_read").return'
kernel.function("vfs_read@fs/read_write.c:294").return $return:ssize_t $file:struct file* $buf:char* $count:size_t $pos:loff_t*
[root@db-172-16-3-150 io]# stap -L 'kernel.function("vfs_write")'
kernel.function("vfs_write@fs/read_write.c:349") $file:struct file* $buf:char const* $count:size_t $pos:loff_t*
[root@db-172-16-3-150 io]# stap -L 'kernel.function("vfs_write").return'
kernel.function("vfs_write@fs/read_write.c:349").return $return:ssize_t $file:struct file* $buf:char const* $count:size_t $pos:loff_t*
参考
1. https://sourceware.org/systemtap/SystemTap_Beginners_Guide/mainsect-disk.html
2. https://sourceware.org/systemtap/examples/
3. /usr/share/systemtap/testsuite/systemtap.examples
4. systemtap-testsuite
5. /usr/share/systemtap/testsuite/systemtap.examples/index.txt
6. /usr/share/systemtap/testsuite/systemtap.examples/keyword-index.txt
7. /usr/share/systemtap/tapset
8. https://sourceware.org/systemtap/tapsets/API-ctime.html
9. https://sourceware.org/systemtap/tapsets/API-gettimeofday-s.html