Systemtap examples, Profiling - 4 Monitoring Polling Applications
背景
例子来自timeout.stp 脚本, 输出系统中指定的系统调用(poll, select, epoll, itimer, futex, nanosleep, signal)等待次数排行前20的进程以及各系统调用的等待次数的统计信息.
脚本内容以及注解 :
[root@db-172-16-3-150 network]# cd /usr/share/systemtap/testsuite/systemtap.examples/profiling
[root@db-172-16-3-150 profiling]# cat timeout.stp
#!/usr/bin/stap
# Copyright (C) 2009 Red Hat, Inc.
# Written by Ulrich Drepper <drepper@redhat.com>
# Modified by William Cohen <wcohen@redhat.com>
global process, timeout_count, to
global poll_timeout, epoll_timeout, select_timeout, itimer_timeout
global nanosleep_timeout, futex_timeout, signal_timeout
probe syscall.poll, syscall.epoll_wait { // epoll_wait : wait for an I/O event on an epoll file descriptor
if (timeout) to[pid()]=timeout // 如果timeout大于0 , 说明有等待. 设置to[pid()]=timeout.
// 后面会用到这个标记判断poll和epoll_wait系统调用是否发生超时.
}
probe syscall.poll.return { // wait for some event on a file descriptor
p = pid()
if ($return == 0 && to[p] > 0 ) { // 返回值=0并且poll, epoll_wait的系统调用等待时间大于0表示发生等待.
poll_timeout[p]++ // 增加一次进程p的系统调用poll的超时计数
timeout_count[p]++ // 增加一次进程p的超时计数
process[p] = execname() // 记录pid对应的命令名
delete to[p] // 删除to[p],
}
}
probe syscall.epoll_wait.return { // wait for an I/O event on an epoll file descriptor
p = pid()
if ($return == 0 && to[p] > 0 ) {
epoll_timeout[p]++ // 增加一次进程p的系统调用epoll的超时计数
timeout_count[p]++ // 增加一次进程p的超时计数
process[p] = execname() // 记录pid对应的命令名
delete to[p] // 删除to[p],
}
}
probe syscall.select.return { // synchronous I/O multiplexing, allow a program to monitor multiple file descriptors,
// waiting until one or more of the file descriptors become "ready" for some class of I/O operation
if ($return == 0) { // select 系统调用返回0时, 说明调用正常.
p = pid()
select_timeout[p]++
timeout_count[p]++
process[p] = execname()
}
}
probe syscall.futex.return { // Fast Userspace Locking system call,
// The futex() system call provides a method for a program to wait for a value at a given address to change
// method to wake up anyone waiting on a particular address (while the addresses for the same memory in separate
// processes may not be equal, the kernel maps them internally so the same memory mapped in different locations
// will correspond for futex() calls). It is typically used to implement the contended case of a lock in shared
// memory, as described in futex(7).
if (errno_str($return) == "ETIMEDOUT") { // futex系统调用返回值$return.
// 使用errno_str转换成字符串, 如果等于ETIMEDOUT说明发生了等待.
// Returns 0 if the process was woken by a FUTEX_WAKE call. In case of timeout, the operation fails with
// the error ETIMEDOUT.
p = pid()
futex_timeout[p]++
timeout_count[p]++
process[p] = execname()
}
}
probe syscall.nanosleep.return { // high-resolution sleep
// nanosleep() suspends the execution of the calling thread until either at least the time specified in *req has
// elapsed, or the delivery of a signal that triggers the invocation of a handler in the calling thread or that
// terminates the process.
if ($return == 0) {
p = pid()
nanosleep_timeout[p]++ //
timeout_count[p]++
process[p] = execname()
}
}
probe kernel.function("it_real_fn") { // The timer is automagically restarted, when interval != 0
p = pid()
itimer_timeout[p]++
timeout_count[p]++
process[p] = execname()
}
probe syscall.rt_sigtimedwait.return { // synchronously wait for queued signals.
if (errno_str($return) == "EAGAIN") { // rt_sigtimedwait系统调用返回值$return.
// 使用errno_str转换成字符串, 如果等于EAGAIN说明发生了等待.
// EAGAIN No signal in set was delivered within the timeout period specified to sigtimedwait().
p = pid()
signal_timeout[p]++
timeout_count[p]++
process[p] = execname()
}
}
probe syscall.exit { // 当系统调用结束时, 清除计数
p = pid()
if (p in process) { // 当p在process这个数组中时, 清除所有相关的计数.
delete process[p]
delete timeout_count[p]
delete poll_timeout[p]
delete epoll_timeout[p]
delete select_timeout[p]
delete itimer_timeout[p]
delete futex_timeout[p]
delete nanosleep_timeout[p]
delete signal_timeout[p]
}
}
probe timer.s(1) {
ansi_clear_screen() // 清屏
printf (" pid | poll select epoll itimer futex nanosle signal| process\n") // 输出头
foreach (p in timeout_count- limit 20) { // timeout_count用于排序, 输出等待次数前20位的进程
printf ("%5d |%7d %7d %7d %7d %7d %7d %7d| %-.38s\n", p,
poll_timeout[p], select_timeout[p],
epoll_timeout[p], itimer_timeout[p],
futex_timeout[p], nanosleep_timeout[p],
signal_timeout[p], process[p])
}
}
// 间隔1秒输出poll, select, epoll, itimer, futex, nanosleep, signal等系统调用在这1秒内的等待次数.
执行输出举例 :
[root@db-172-16-3-150 profiling]# stap timeout.stp
pid | poll select epoll itimer futex nanosle signal| process
7121 | 7 0 0 0 0 0 0| postgres
7120 | 5 0 0 0 0 0 0| postgres
1778 | 1 0 0 0 0 0 0| avahi-daemon
1372 | 0 0 0 0 0 1 0| multipathd
本文用到的probe alias原型 :
/usr/share/systemtap/tapset/syscalls.stp
/usr/share/systemtap/tapset/syscalls2.stp
# epoll_wait _________________________________________________
#
# long sys_epoll_wait(int epfd, struct epoll_event __user *events,
# int maxevents, int timeout)
# long compat_sys_epoll_wait(int epfd,
# struct compat_epoll_event __user *events,
# int maxevents, int timeout)
#
probe syscall.epoll_wait = kernel.function("compat_sys_epoll_wait").call ?,
kernel.function("sys_epoll_wait").call ?
{
name = "epoll_wait"
epfd = $epfd
events_uaddr = $events
maxevents = $maxevents
timeout = $timeout
argstr = sprintf("%d, %p, %d, %d", $epfd, $events, $maxevents, $timeout)
}
probe syscall.epoll_wait.return = kernel.function("compat_sys_epoll_wait").return ?,
kernel.function("sys_epoll_wait").return ?
{
name = "epoll_wait"
retstr = return_str(1, $return)
}
# futex ______________________________________________________
# long sys_futex(u32 __user *uaddr,
# int op,
# int val,
# struct timespec __user *utime,
# u32 __user *uaddr2,
# int val3)
# long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
# struct compat_timespec __user *utime, u32 __user *uaddr2,
# u32 val3)
#
probe syscall.futex = kernel.function("sys_futex").call ?
{
name = "futex"
futex_uaddr = $uaddr
op = $op
val = $val
utime_uaddr = $utime
uaddr2_uaddr = $uaddr2
val3 = $val3
if ($op == 0)
argstr = sprintf("%p, %s, %d, %s", $uaddr, _futex_op_str($op),
$val, _struct_timespec_u($utime, 1))
else
argstr = sprintf("%p, %s, %d", $uaddr, _futex_op_str($op),
$val)
}
probe syscall.futex.return = kernel.function("sys_futex").return ?
{
name = "futex"
retstr = return_str(1, $return)
}
# exit _______________________________________________________
# long sys_exit(int error_code)
probe syscall.exit = kernel.function("do_exit").call
{
name = "exit"
status = $code
argstr = sprint($code)
}
# poll _______________________________________________________
#
# long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout)
#
probe syscall.poll = kernel.function("sys_poll").call
{
name = "poll"
ufds_uaddr = $ufds
nfds = $nfds
timeout = (@defined($timeout_msecs) ? $timeout_msecs : $timeout) // 在sys_poll中timeout正表示milliseconds, 负表示无限
argstr = sprintf("%p, %d, %d", $ufds, $nfds,
(@defined($timeout_msecs) ? $timeout_msecs : $timeout))
}
probe syscall.poll.return = kernel.function("sys_poll").return
{
name = "poll"
retstr = return_str(1, $return)
}
# ppoll _______________________________________________________
#
# long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
# struct timespec __user *tsp, const sigset_t __user *sigmask,
# size_t sigsetsize)
#
probe syscall.ppoll = kernel.function("sys_ppoll").call ?
{
name = "ppoll"
argstr = sprintf("%p, %d, %s, %p, %d",
$ufds,
$nfds,
_struct_timespec_u($tsp, 1),
$sigmask,
$sigsetsize)
}
probe syscall.ppoll.return = kernel.function("sys_ppoll").return ?
{
name = "ppoll"
retstr = return_str(1, $return)
}
# select _____________________________________________________
# long sys_select(int n,
# fd_set __user *inp,
# fd_set __user *outp,
# fd_set __user *exp,
# struct timeval __user *tvp)
#
probe syscall.select = kernel.function("sys_select").call
{
name = "select"
n = $n
readfds_uaddr = $inp
writefds_uaddr = $outp
exceptfds_uaddr = $exp
timeout_uaddr = $tvp
argstr = sprintf("%d, %p, %p, %p, %s", $n, $inp, $outp, $exp,
_struct_timeval_u($tvp, 1))
}
probe syscall.select.return = kernel.function("sys_select").return
{
name = "select"
retstr = return_str(1, $return)
}
# rt_sigtimedwait ____________________________________________
#
# long sys_rt_sigtimedwait(const sigset_t __user *uthese,
# siginfo_t __user *uinfo,
# const struct timespec __user *uts,
# size_t sigsetsize)
# long compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
# struct compat_siginfo __user *uinfo,
# struct compat_timespec __user *uts, compat_size_t sigsetsize)
#
probe syscall.rt_sigtimedwait = kernel.function("compat_sys_rt_sigtimedwait").call ?,
kernel.function("sys_rt_sigtimedwait").call
{
name = "rt_sigtimedwait"
uthese_uaddr = $uthese
uinfo_uaddr = $uinfo
uts_uaddr = $uts
sigsetsize = $sigsetsize
argstr = sprintf("%p, %p, %p, %d", $uthese, $uinfo, $uts, $sigsetsize)
}
probe syscall.rt_sigtimedwait.return = kernel.function("compat_sys_rt_sigtimedwait").return ?,
kernel.function("sys_rt_sigtimedwait").return
{
name = "rt_sigtimedwait"
retstr = return_str(1, $return)
}
# nanosleep __________________________________________________
#
# long sys_nanosleep(struct timespec __user *rqtp,
# struct timespec __user *rmtp)
# long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
# struct compat_timespec __user *rmtp)
#
probe syscall.nanosleep = kernel.function("sys_nanosleep").call
{
name = "nanosleep"
req_uaddr = $rqtp
rem_uaddr = $rmtp
argstr = sprintf("%s, %p", _struct_timespec_u($rqtp, 1), $rmtp)
}
probe syscall.nanosleep.return = kernel.function("sys_nanosleep").return
{
name = "nanosleep"
retstr = return_str(1, $return)
}
# ppoll _______________________________________________________
#
# long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
# struct timespec __user *tsp, const sigset_t __user *sigmask,
# size_t sigsetsize)
#
probe syscall.ppoll = kernel.function("sys_ppoll").call ?
{
name = "ppoll"
argstr = sprintf("%p, %d, %s, %p, %d",
$ufds,
$nfds,
_struct_timespec_u($tsp, 1),
$sigmask,
$sigsetsize)
}
probe syscall.ppoll.return = kernel.function("sys_ppoll").return ?
{
name = "ppoll"
retstr = return_str(1, $return)
}
本文用到的内核函数以及上下文变量 :
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("it_real_fn")'
kernel.function("it_real_fn@kernel/itimer.c:121") $timer:struct hrtimer*
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_epoll_wait").call'
kernel.function("sys_epoll_wait@fs/eventpoll.c:1710").call $epfd:int $events:struct epoll_event* $maxevents:int $timeout:int
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_epoll_wait").return'
kernel.function("sys_epoll_wait@fs/eventpoll.c:1710").return $return:long int $epfd:int $events:struct epoll_event* $maxevents:int $timeout:int
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_futex").call'
kernel.function("sys_futex@kernel/futex.c:2692").call $uaddr:u32* $op:int $val:u32 $utime:struct timespec* $uaddr2:u32* $val3:u32 $ts:struct timespec $t:ktime_t
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_futex").return'
kernel.function("sys_futex@kernel/futex.c:2692").return $return:long int $uaddr:u32* $op:int $val:u32 $utime:struct timespec* $uaddr2:u32* $val3:u32 $ts:struct timespec $t:ktime_t
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("do_exit").call'
kernel.function("do_exit@kernel/exit.c:897").call $code:long int $tsk:struct task_struct*
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_poll").call'
kernel.function("sys_poll@fs/select.c:899").call $ufds:struct pollfd* $nfds:unsigned int $timeout_msecs:int $end_time:struct timespec
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_poll").return'
kernel.function("sys_poll@fs/select.c:899").return $return:long int $ufds:struct pollfd* $nfds:unsigned int $timeout_msecs:int $end_time:struct timespec
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_ppoll").call'
kernel.function("sys_ppoll@fs/select.c:935").call $ufds:struct pollfd* $nfds:unsigned int $tsp:struct timespec* $sigmask:sigset_t const* $sigsetsize:size_t $ksigmask:sigset_t $sigsaved:sigset_t $ts:struct timespec $end_time:struct timespec
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_ppoll").return'
kernel.function("sys_ppoll@fs/select.c:935").return $return:long int $ufds:struct pollfd* $nfds:unsigned int $tsp:struct timespec* $sigmask:sigset_t const* $sigsetsize:size_t $ksigmask:sigset_t $sigsaved:sigset_t $ts:struct timespec $end_time:struct timespec
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_select").call'
kernel.function("sys_select@fs/select.c:597").call $n:int $inp:fd_set* $outp:fd_set* $exp:fd_set* $tvp:struct timeval* $end_time:struct timespec $tv:struct timeval
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_select").return'
kernel.function("sys_select@fs/select.c:597").return $return:long int $n:int $inp:fd_set* $outp:fd_set* $exp:fd_set* $tvp:struct timeval* $end_time:struct timespec $tv:struct timeval
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_rt_sigtimedwait").return'
kernel.function("sys_rt_sigtimedwait@kernel/signal.c:2215").return $return:long int $uthese:sigset_t const* $uinfo:siginfo_t* $uts:struct timespec const* $sigsetsize:size_t $these:sigset_t $ts:struct timespec $info:siginfo_t
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_rt_sigtimedwait").call'
kernel.function("sys_rt_sigtimedwait@kernel/signal.c:2215").call $uthese:sigset_t const* $uinfo:siginfo_t* $uts:struct timespec const* $sigsetsize:size_t $these:sigset_t $ts:struct timespec $info:siginfo_t
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_nanosleep").call'
kernel.function("sys_nanosleep@kernel/hrtimer.c:1601").call $rqtp:struct timespec* $rmtp:struct timespec* $tu:struct timespec
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_nanosleep").return'
kernel.function("sys_nanosleep@kernel/hrtimer.c:1601").return $return:long int $rqtp:struct timespec* $rmtp:struct timespec* $tu:struct timespec
系统调用源代码截取 :
syscall.poll
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/fs/select.c
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
int, timeout_msecs)
{
struct timespec end_time, *to = NULL;
int ret;
if (timeout_msecs >= 0) {
to = &end_time;
poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
}
ret = do_sys_poll(ufds, nfds, to);
if (ret == -EINTR) {
struct restart_block *restart_block;
restart_block = ¤t_thread_info()->restart_block;
restart_block->fn = do_restart_poll;
restart_block->poll.ufds = ufds;
restart_block->poll.nfds = nfds;
if (timeout_msecs >= 0) {
restart_block->poll.tv_sec = end_time.tv_sec;
restart_block->poll.tv_nsec = end_time.tv_nsec;
restart_block->poll.has_timeout = 1;
} else
restart_block->poll.has_timeout = 0;
ret = -ERESTART_RESTARTBLOCK;
}
return ret;
}
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/kernel/itimer.c
/*
* The timer is automagically restarted, when interval != 0
*/
enum hrtimer_restart it_real_fn(struct hrtimer *timer)
{
struct signal_struct *sig =
container_of(timer, struct signal_struct, real_timer);
trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
return HRTIMER_NORESTART;
}
参考
1. https://sourceware.org/systemtap/SystemTap_Beginners_Guide/mainsect-profiling.html
2. https://sourceware.org/systemtap/examples/
3. /usr/share/systemtap/testsuite/systemtap.examples
4. systemtap-testsuite
5. /usr/share/systemtap/testsuite/systemtap.examples/index.txt
6. /usr/share/systemtap/testsuite/systemtap.examples/keyword-index.txt
7. /usr/share/systemtap/tapset
8.
man syscalls
man poll
man ppoll
man select
man epoll_wait
man futex
man nanosleep
man rt_sigtimedwait
9. https://sourceware.org/systemtap/tapsets/API-errno-str.html