Systemtap examples, Profiling - 4 Monitoring Polling Applications

8 minute read

背景

例子来自timeout.stp 脚本, 输出系统中指定的系统调用(poll, select, epoll, itimer, futex, nanosleep, signal)等待次数排行前20的进程以及各系统调用的等待次数的统计信息.  
脚本内容以及注解 :   
[root@db-172-16-3-150 network]# cd /usr/share/systemtap/testsuite/systemtap.examples/profiling  
[root@db-172-16-3-150 profiling]# cat timeout.stp  
#!/usr/bin/stap  
# Copyright (C) 2009 Red Hat, Inc.  
# Written by Ulrich Drepper <drepper@redhat.com>  
# Modified by William Cohen <wcohen@redhat.com>  
  
global process, timeout_count, to  
global poll_timeout, epoll_timeout, select_timeout, itimer_timeout  
global nanosleep_timeout, futex_timeout, signal_timeout  
  
probe syscall.poll, syscall.epoll_wait {    //  epoll_wait : wait for an I/O event on an epoll file descriptor  
  if (timeout) to[pid()]=timeout    // 如果timeout大于0 , 说明有等待. 设置to[pid()]=timeout.    
    //  后面会用到这个标记判断poll和epoll_wait系统调用是否发生超时.  
}  
  
probe syscall.poll.return {    // wait for some event on a file descriptor  
  p = pid()  
  if ($return == 0 && to[p] > 0 ) {  // 返回值=0并且poll, epoll_wait的系统调用等待时间大于0表示发生等待.  
    poll_timeout[p]++    // 增加一次进程p的系统调用poll的超时计数  
    timeout_count[p]++    // 增加一次进程p的超时计数  
    process[p] = execname()  // 记录pid对应的命令名  
    delete to[p]  // 删除to[p],   
  }  
}  
  
probe syscall.epoll_wait.return {    //  wait for an I/O event on an epoll file descriptor  
  p = pid()  
  if ($return == 0 && to[p] > 0 ) {  
    epoll_timeout[p]++   // 增加一次进程p的系统调用epoll的超时计数  
    timeout_count[p]++    // 增加一次进程p的超时计数  
    process[p] = execname()    // 记录pid对应的命令名  
    delete to[p]   // 删除to[p],   
  }  
}  
  
probe syscall.select.return {    //  synchronous I/O multiplexing, allow a program to monitor multiple file descriptors,   
                                                   //  waiting until one or more of the file descriptors become "ready" for some class of I/O operation  
  if ($return == 0) {     // select 系统调用返回0时, 说明调用正常.   
    p = pid()  
    select_timeout[p]++  
    timeout_count[p]++  
    process[p] = execname()  
  }  
}  
  
probe syscall.futex.return {    // Fast Userspace Locking system call,   
//  The futex() system call provides a method for a program to wait for a value at a given address to change  
//  method to wake up anyone waiting on a particular address (while the addresses for the same memory  in  separate  
//  processes  may  not  be equal, the kernel maps them internally so the same memory mapped in different locations  
//  will correspond for futex() calls).  It is typically used to implement the contended case of a lock  in  shared  
//  memory, as described in futex(7).  
  if (errno_str($return) == "ETIMEDOUT") {    //  futex系统调用返回值$return.   
    //  使用errno_str转换成字符串, 如果等于ETIMEDOUT说明发生了等待.  
    //  Returns  0  if the process was woken by a FUTEX_WAKE call.  In case of timeout, the operation fails with  
    //  the error ETIMEDOUT.  
    p = pid()  
    futex_timeout[p]++  
    timeout_count[p]++  
    process[p] = execname()  
  }  
}  
  
probe syscall.nanosleep.return {  // high-resolution sleep  
//  nanosleep()  suspends  the execution of the calling thread until either at least the time specified in *req has  
//  elapsed, or the delivery of a signal that triggers the invocation of a handler in the calling  thread  or  that  
//  terminates the process.  
  if ($return == 0) {  
    p = pid()  
    nanosleep_timeout[p]++    //   
    timeout_count[p]++  
    process[p] = execname()  
  }  
}  
  
probe kernel.function("it_real_fn") {    //  The timer is automagically restarted, when interval != 0  
  p = pid()  
  itimer_timeout[p]++  
  timeout_count[p]++  
  process[p] = execname()  
}  
  
probe syscall.rt_sigtimedwait.return {   // synchronously wait for queued signals.  
  if (errno_str($return) == "EAGAIN") {    //  rt_sigtimedwait系统调用返回值$return.   
    //  使用errno_str转换成字符串, 如果等于EAGAIN说明发生了等待.  
    //  EAGAIN No signal in set was delivered within the timeout period specified to sigtimedwait().  
    p = pid()  
    signal_timeout[p]++  
    timeout_count[p]++  
    process[p] = execname()  
  }  
}  
  
probe syscall.exit {    // 当系统调用结束时, 清除计数  
  p = pid()  
  if (p in process) {   // 当p在process这个数组中时, 清除所有相关的计数.  
    delete process[p]  
    delete timeout_count[p]  
    delete poll_timeout[p]  
    delete epoll_timeout[p]  
    delete select_timeout[p]  
    delete itimer_timeout[p]  
    delete futex_timeout[p]  
    delete nanosleep_timeout[p]  
    delete signal_timeout[p]  
  }  
}  
  
probe timer.s(1) {  
  ansi_clear_screen()    //  清屏  
  printf ("  pid |   poll  select   epoll  itimer   futex nanosle  signal| process\n")    // 输出头  
  foreach (p in timeout_count- limit 20) {    //  timeout_count用于排序, 输出等待次数前20位的进程  
     printf ("%5d |%7d %7d %7d %7d %7d %7d %7d| %-.38s\n", p,  
              poll_timeout[p], select_timeout[p],  
              epoll_timeout[p], itimer_timeout[p],  
              futex_timeout[p], nanosleep_timeout[p],  
              signal_timeout[p], process[p])  
  }  
}  
// 间隔1秒输出poll, select, epoll, itimer, futex, nanosleep, signal等系统调用在这1秒内的等待次数.  
  
执行输出举例 :   
[root@db-172-16-3-150 profiling]# stap timeout.stp  
  pid |   poll  select   epoll  itimer   futex nanosle  signal| process  
 7121 |      7       0       0       0       0       0       0| postgres  
 7120 |      5       0       0       0       0       0       0| postgres  
 1778 |      1       0       0       0       0       0       0| avahi-daemon  
 1372 |      0       0       0       0       0       1       0| multipathd  
  
本文用到的probe alias原型 :   
/usr/share/systemtap/tapset/syscalls.stp  
/usr/share/systemtap/tapset/syscalls2.stp  
# epoll_wait _________________________________________________  
#  
# long sys_epoll_wait(int epfd, struct epoll_event __user *events,  
#               int maxevents, int timeout)  
# long compat_sys_epoll_wait(int epfd,  
#               struct compat_epoll_event __user *events,  
#               int maxevents, int timeout)  
#  
probe syscall.epoll_wait = kernel.function("compat_sys_epoll_wait").call ?,  
                           kernel.function("sys_epoll_wait").call ?  
{  
        name = "epoll_wait"  
        epfd = $epfd  
        events_uaddr = $events  
        maxevents = $maxevents  
        timeout = $timeout  
        argstr = sprintf("%d, %p, %d, %d", $epfd, $events, $maxevents, $timeout)  
}  
probe syscall.epoll_wait.return = kernel.function("compat_sys_epoll_wait").return ?,  
                                  kernel.function("sys_epoll_wait").return ?  
{  
        name = "epoll_wait"  
        retstr = return_str(1, $return)  
}  
  
# futex ______________________________________________________  
# long sys_futex(u32 __user *uaddr,  
#           int op,  
#           int val,  
#           struct timespec __user *utime,  
#           u32 __user *uaddr2,  
#           int val3)  
# long compat_sys_futex(u32 __user *uaddr, int op, u32 val,  
#               struct compat_timespec __user *utime, u32 __user *uaddr2,  
#               u32 val3)  
#  
probe syscall.futex = kernel.function("sys_futex").call ?  
{  
        name = "futex"  
        futex_uaddr = $uaddr  
        op = $op  
        val = $val  
        utime_uaddr = $utime  
        uaddr2_uaddr = $uaddr2  
        val3 = $val3  
        if ($op == 0)  
                argstr = sprintf("%p, %s, %d, %s", $uaddr, _futex_op_str($op),  
                        $val, _struct_timespec_u($utime, 1))  
        else  
                argstr = sprintf("%p, %s, %d", $uaddr, _futex_op_str($op),  
                        $val)  
}  
probe syscall.futex.return = kernel.function("sys_futex").return ?  
{  
        name = "futex"  
        retstr = return_str(1, $return)  
}  
  
# exit _______________________________________________________  
# long sys_exit(int error_code)  
probe syscall.exit = kernel.function("do_exit").call  
{  
        name = "exit"  
        status = $code  
        argstr = sprint($code)  
}  
  
# poll _______________________________________________________  
#  
# long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout)  
#  
probe syscall.poll = kernel.function("sys_poll").call  
{  
        name = "poll"  
        ufds_uaddr = $ufds  
        nfds = $nfds  
        timeout = (@defined($timeout_msecs) ? $timeout_msecs : $timeout)  // 在sys_poll中timeout正表示milliseconds, 负表示无限  
        argstr = sprintf("%p, %d, %d", $ufds, $nfds,  
                (@defined($timeout_msecs) ? $timeout_msecs : $timeout))  
}  
probe syscall.poll.return = kernel.function("sys_poll").return  
{  
        name = "poll"  
        retstr = return_str(1, $return)  
}  
  
# ppoll _______________________________________________________  
#  
# long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,  
#       struct timespec __user *tsp, const sigset_t __user *sigmask,  
#       size_t sigsetsize)  
#  
probe syscall.ppoll = kernel.function("sys_ppoll").call ?  
{  
        name = "ppoll"  
        argstr = sprintf("%p, %d, %s, %p, %d",  
                $ufds,  
                $nfds,  
                _struct_timespec_u($tsp, 1),  
                $sigmask,  
                $sigsetsize)  
}  
probe syscall.ppoll.return = kernel.function("sys_ppoll").return ?  
{  
        name = "ppoll"  
        retstr = return_str(1, $return)  
}  
  
# select _____________________________________________________  
# long sys_select(int n,  
#               fd_set __user *inp,  
#               fd_set __user *outp,  
#               fd_set __user *exp,  
#               struct timeval __user *tvp)  
#  
probe syscall.select = kernel.function("sys_select").call  
{  
        name = "select"  
        n = $n  
        readfds_uaddr = $inp  
        writefds_uaddr = $outp  
        exceptfds_uaddr = $exp  
        timeout_uaddr = $tvp  
        argstr = sprintf("%d, %p, %p, %p, %s", $n, $inp, $outp, $exp,  
                _struct_timeval_u($tvp, 1))  
}  
probe syscall.select.return = kernel.function("sys_select").return  
{  
        name = "select"  
        retstr = return_str(1, $return)  
}  
  
# rt_sigtimedwait ____________________________________________  
#  
# long sys_rt_sigtimedwait(const sigset_t __user *uthese,  
#                  siginfo_t __user *uinfo,  
#                  const struct timespec __user *uts,  
#                  size_t sigsetsize)  
# long compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,  
#               struct compat_siginfo __user *uinfo,  
#               struct compat_timespec __user *uts, compat_size_t sigsetsize)  
#  
probe syscall.rt_sigtimedwait = kernel.function("compat_sys_rt_sigtimedwait").call ?,  
                                kernel.function("sys_rt_sigtimedwait").call  
{  
        name = "rt_sigtimedwait"  
        uthese_uaddr = $uthese  
        uinfo_uaddr = $uinfo  
        uts_uaddr = $uts  
        sigsetsize = $sigsetsize  
        argstr = sprintf("%p, %p, %p, %d", $uthese, $uinfo, $uts, $sigsetsize)  
}  
probe syscall.rt_sigtimedwait.return = kernel.function("compat_sys_rt_sigtimedwait").return ?,  
                                       kernel.function("sys_rt_sigtimedwait").return  
{  
        name = "rt_sigtimedwait"  
        retstr = return_str(1, $return)  
}  
  
# nanosleep __________________________________________________  
#  
# long sys_nanosleep(struct timespec __user *rqtp,  
#               struct timespec __user *rmtp)  
# long compat_sys_nanosleep(struct compat_timespec __user *rqtp,  
#               struct compat_timespec __user *rmtp)  
#  
probe syscall.nanosleep = kernel.function("sys_nanosleep").call  
{  
        name = "nanosleep"  
        req_uaddr = $rqtp  
        rem_uaddr = $rmtp  
        argstr = sprintf("%s, %p", _struct_timespec_u($rqtp, 1), $rmtp)  
}  
probe syscall.nanosleep.return = kernel.function("sys_nanosleep").return  
{  
        name = "nanosleep"  
        retstr = return_str(1, $return)  
}  
  
# ppoll _______________________________________________________  
#  
# long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,  
#       struct timespec __user *tsp, const sigset_t __user *sigmask,  
#       size_t sigsetsize)  
#  
probe syscall.ppoll = kernel.function("sys_ppoll").call ?  
{  
        name = "ppoll"  
        argstr = sprintf("%p, %d, %s, %p, %d",  
                $ufds,  
                $nfds,  
                _struct_timespec_u($tsp, 1),  
                $sigmask,  
                $sigsetsize)  
}  
probe syscall.ppoll.return = kernel.function("sys_ppoll").return ?  
{  
        name = "ppoll"  
        retstr = return_str(1, $return)  
}  
本文用到的内核函数以及上下文变量 :   
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("it_real_fn")'  
kernel.function("it_real_fn@kernel/itimer.c:121") $timer:struct hrtimer*  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_epoll_wait").call'  
kernel.function("sys_epoll_wait@fs/eventpoll.c:1710").call $epfd:int $events:struct epoll_event* $maxevents:int $timeout:int  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_epoll_wait").return'  
kernel.function("sys_epoll_wait@fs/eventpoll.c:1710").return $return:long int $epfd:int $events:struct epoll_event* $maxevents:int $timeout:int  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_futex").call'  
kernel.function("sys_futex@kernel/futex.c:2692").call $uaddr:u32* $op:int $val:u32 $utime:struct timespec* $uaddr2:u32* $val3:u32 $ts:struct timespec $t:ktime_t  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_futex").return'  
kernel.function("sys_futex@kernel/futex.c:2692").return $return:long int $uaddr:u32* $op:int $val:u32 $utime:struct timespec* $uaddr2:u32* $val3:u32 $ts:struct timespec $t:ktime_t  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("do_exit").call'  
kernel.function("do_exit@kernel/exit.c:897").call $code:long int $tsk:struct task_struct*  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_poll").call'  
kernel.function("sys_poll@fs/select.c:899").call $ufds:struct pollfd* $nfds:unsigned int $timeout_msecs:int $end_time:struct timespec  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_poll").return'  
kernel.function("sys_poll@fs/select.c:899").return $return:long int $ufds:struct pollfd* $nfds:unsigned int $timeout_msecs:int $end_time:struct timespec  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_ppoll").call'  
kernel.function("sys_ppoll@fs/select.c:935").call $ufds:struct pollfd* $nfds:unsigned int $tsp:struct timespec* $sigmask:sigset_t const* $sigsetsize:size_t $ksigmask:sigset_t $sigsaved:sigset_t $ts:struct timespec $end_time:struct timespec  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_ppoll").return'  
kernel.function("sys_ppoll@fs/select.c:935").return $return:long int $ufds:struct pollfd* $nfds:unsigned int $tsp:struct timespec* $sigmask:sigset_t const* $sigsetsize:size_t $ksigmask:sigset_t $sigsaved:sigset_t $ts:struct timespec $end_time:struct timespec  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_select").call'  
kernel.function("sys_select@fs/select.c:597").call $n:int $inp:fd_set* $outp:fd_set* $exp:fd_set* $tvp:struct timeval* $end_time:struct timespec $tv:struct timeval  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_select").return'  
kernel.function("sys_select@fs/select.c:597").return $return:long int $n:int $inp:fd_set* $outp:fd_set* $exp:fd_set* $tvp:struct timeval* $end_time:struct timespec $tv:struct timeval  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_rt_sigtimedwait").return'  
kernel.function("sys_rt_sigtimedwait@kernel/signal.c:2215").return $return:long int $uthese:sigset_t const* $uinfo:siginfo_t* $uts:struct timespec const* $sigsetsize:size_t $these:sigset_t $ts:struct timespec $info:siginfo_t  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_rt_sigtimedwait").call'  
kernel.function("sys_rt_sigtimedwait@kernel/signal.c:2215").call $uthese:sigset_t const* $uinfo:siginfo_t* $uts:struct timespec const* $sigsetsize:size_t $these:sigset_t $ts:struct timespec $info:siginfo_t  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_nanosleep").call'  
kernel.function("sys_nanosleep@kernel/hrtimer.c:1601").call $rqtp:struct timespec* $rmtp:struct timespec* $tu:struct timespec  
[root@db-172-16-3-150 ~]# stap -L 'kernel.function("sys_nanosleep").return'  
kernel.function("sys_nanosleep@kernel/hrtimer.c:1601").return $return:long int $rqtp:struct timespec* $rmtp:struct timespec* $tu:struct timespec  
系统调用源代码截取 :   
syscall.poll  
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/fs/select.c  
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,  
                int, timeout_msecs)  
{  
        struct timespec end_time, *to = NULL;  
        int ret;  
  
        if (timeout_msecs >= 0) {  
                to = &end_time;  
                poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,  
                        NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));  
        }  
  
        ret = do_sys_poll(ufds, nfds, to);  
  
        if (ret == -EINTR) {  
                struct restart_block *restart_block;  
  
                restart_block = &current_thread_info()->restart_block;  
                restart_block->fn = do_restart_poll;  
                restart_block->poll.ufds = ufds;  
                restart_block->poll.nfds = nfds;  
  
                if (timeout_msecs >= 0) {  
                        restart_block->poll.tv_sec = end_time.tv_sec;  
                        restart_block->poll.tv_nsec = end_time.tv_nsec;  
                        restart_block->poll.has_timeout = 1;  
                } else  
                        restart_block->poll.has_timeout = 0;  
  
                ret = -ERESTART_RESTARTBLOCK;  
        }  
        return ret;  
}  
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/kernel/itimer.c  
/*  
 * The timer is automagically restarted, when interval != 0  
 */  
enum hrtimer_restart it_real_fn(struct hrtimer *timer)  
{  
        struct signal_struct *sig =  
                container_of(timer, struct signal_struct, real_timer);  
  
        trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);  
        kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);  
  
        return HRTIMER_NORESTART;  
}  

参考

1. https://sourceware.org/systemtap/SystemTap_Beginners_Guide/mainsect-profiling.html

2. https://sourceware.org/systemtap/examples/

3. /usr/share/systemtap/testsuite/systemtap.examples

4. systemtap-testsuite

5. /usr/share/systemtap/testsuite/systemtap.examples/index.txt

6. /usr/share/systemtap/testsuite/systemtap.examples/keyword-index.txt

7. /usr/share/systemtap/tapset

8.

man syscalls  
man poll  
man ppoll  
man select  
man epoll_wait  
man futex  
man nanosleep  
man rt_sigtimedwait  

9. https://sourceware.org/systemtap/tapsets/API-errno-str.html

Flag Counter

digoal’s 大量PostgreSQL文章入口