Systemtap examples, DISK IO - 1 Summarizing Disk Read/Write Traffic

7 minute read

背景

本文开始给大家讲一下linux系统中disk io相关的跟踪范例.  
例子来自disktop.stp 脚本, 该脚本作者的用意是输出系统中IO请求字节数前10的进程信息以及对应的块设备信息.  
但是这个脚本存在一个严重的BUG, 输出时对应的设备名其实是没有意义的, 它只是当时的设备号, 因为一个进程可能会对多个块设备进行IO操作, 但是脚本中记录的仅仅是最后一次io请求的块设备名.  
后面会讲如何修改这个文件, 修复bug.  
脚本内容以及注解 :   
[root@db-172-16-3-150 network]# cd /usr/share/systemtap/testsuite/systemtap.examples/io  
[root@db-172-16-3-150 network]# cat disktop.stp  
#!/usr/bin/stap   
#  
# Copyright (C) 2007 Oracle Corp.  
#  
# Get the status of reading/writing disk every 5 seconds,  
# output top ten entries   
#  
# This is free software,GNU General Public License (GPL);  
# either version 2, or (at your option) any later version.  
#  
# Usage:  
#  ./disktop.stp  
#  
  
global io_stat,device  
global read_bytes,write_bytes  
// 定义几个全局变量, io_stat存储累加的读, 写的字节数  
// device存储设备名, bug就在这里. 它存储的是vfs.read.return或vfs.write.return当时的设备名,   
// 同pid(),execname(),uid(),ppid()操作的块设备一下次是可能会变化的.  
// read_bytes和write_bytes是全局的读写字节数, 不区分进程号.  
  
probe vfs.read.return {  
  if ($return>0) {  
    if (devname!="N/A") {/*skip read from cache*/  
      io_stat[pid(),execname(),uid(),ppid(),"R"] += $return  
      device[pid(),execname(),uid(),ppid(),"R"] = devname  
      read_bytes += $return  
    }  
  }  
}  
// devname != "N/A" 过滤掉cache的块操作. 其他的都是块设备上的io操作.  
  
probe vfs.write.return {  
  if ($return>0) {  
    if (devname!="N/A") { /*skip update cache*/  
      io_stat[pid(),execname(),uid(),ppid(),"W"] += $return  
      device[pid(),execname(),uid(),ppid(),"W"] = devname  
      write_bytes += $return  
    }  
  }  
}  
// devname != "N/A" 过滤掉cache的块操作. 其他的都是块设备上的io操作.  
  
probe timer.ms(5000) {  
  /* skip non-read/write disk */  
  if (read_bytes+write_bytes) {  
  
    printf("\n%-25s, %-8s%4dKb/sec, %-7s%6dKb, %-7s%6dKb\n\n",  
           ctime(gettimeofday_s()),  
           "Average:", ((read_bytes+write_bytes)/1024)/5,  
           "Read:",read_bytes/1024,  
           "Write:",write_bytes/1024)  
// 输出全局读写字节数, 速率.  
// ctime用于把unix epoch second转换成年月日时分秒  
  
    /* print header */  
    printf("%8s %8s %8s %25s %8s %4s %12s\n",  
           "UID","PID","PPID","CMD","DEVICE","T","BYTES")  
  }  
// 输出头信息, 包含用户id, 进程id, 父进程id, 命令名, 块设备名, 读写类型, 字节数  
// 接下来的输出与之对应  
  
  /* print top ten I/O */  
  foreach ([process,cmd,userid,parent,action] in io_stat- limit 10)  
    printf("%8d %8d %8d %25s %8s %4s %12d\n",  
           userid,process,parent,cmd,  
           device[process,cmd,userid,parent,action],  
           action,io_stat[process,cmd,userid,parent,action])  
// 输出io请求前10的进程和设备信息.  
// 前面说了, 这里的设备没有意义  
  
  /* clear data */  
  delete io_stat  
  delete device  
  read_bytes = 0  
  write_bytes = 0    
// 输出后清除全局变量的数据  
}  
  
probe end{  
  delete io_stat  
  delete device  
  delete read_bytes  
  delete write_bytes  
}  
  
执行输出举例 :   
这里使用两个块设备的读写来给大家看看脚本的bug.  
[root@db-172-16-3-150 io]# df -h  
Filesystem            Size  Used Avail Use% Mounted on  
/dev/sdc1              29G  6.5G   21G  24% /  
tmpfs                  48G     0   48G   0% /dev/shm  
/dev/sdc3              98G   24G   70G  26% /opt  
/dev/sdd1             183G  5.4G  169G   4% /ssd1  
/dev/sde1             469G  1.7G  444G   1% /ssd2  
/dev/sda1             221G  195M  209G   1% /ssd3  
/dev/sdb1             221G  9.6G  200G   5% /ssd4  
我们用到的是/dev/sda1和/dev/sdb1  
digoal=# \db  
              List of tablespaces  
    Name    |  Owner   |       Location          
------------+----------+-----------------------  
 pg_default | postgres |   
 pg_global  | postgres |   
 tbs_digoal | postgres | /ssd3/pg93/tbs_digoal  
 tbs_idx    | postgres | /ssd4/pg93/tbs_idx  
(4 rows)  
创建表和索引, 分别存在两个设备上.  
digoal=# create table test(id int primary key, info text, crt_time timestamp) tablespace tbs_digoal;  
CREATE TABLE  
digoal=# create index idx_test_info on test(info) tablespace tbs_idx;  
CREATE INDEX  
创建测试用的函数  
digoal=# CREATE OR REPLACE FUNCTION public.f_test(i_id integer)  
 RETURNS void  
 LANGUAGE plpgsql  
 STRICT  
AS $function$  
declare  
begin  
  update test set info=md5(random()::text), crt_time=clock_timestamp() where id=i_id;  
  if not found then  
    insert into test(id,info,crt_time) values(i_id,md5(random()::text),clock_timestamp());  
  end if;  
  return;  
  exception when others then  
    return;  
end;  
$function$;  
创建测试用的bench 脚本  
pg93@db-172-16-3-150-> vi test.sql  
\setrandom id 1 5000000  
select f_test(:id);  
进行bench测试, 维持60秒  
pg93@db-172-16-3-150-> pgbench -M prepared -n -r -f ./test.sql -c 1 -T 60 -h $PGDATA -p 1921 -U postgres digoal  
测试过程中查看一下相关进程号和进程信息  
[root@db-172-16-3-150 io]# ps -ewf|grep pg93    
pg93      9171     1  0 10:39 pts/3    00:00:00 /home/pg93/pgsql9.3.1/bin/postgres  
pg93      9172  9171  0 10:39 ?        00:00:00 postgres: logger process            
pg93      9174  9171  0 10:39 ?        00:00:00 postgres: checkpointer process      
pg93      9175  9171  0 10:39 ?        00:00:00 postgres: writer process            
pg93      9176  9171  0 10:39 ?        00:00:01 postgres: wal writer process        
pg93      9177  9171  0 10:39 ?        00:00:00 postgres: autovacuum launcher process     
pg93      9178  9171  0 10:39 ?        00:00:00 postgres: archiver process   last was 0000000100000010000000ED  
pg93      9179  9171  0 10:39 ?        00:00:00 postgres: stats collector process     
pg93      9214  9000 22 10:42 pts/3    00:00:11 pgbench -M prepared -n -r -f ./test.sql -c 1 -T 60 -h /ssd2/pg93/pg_root -p 1921 -U postgres digoal  
pg93      9216  9171 80 10:42 ?        00:00:40 postgres: postgres digoal [local] SELECT  
  
开启disktop.stp, 收集块设备io统计信息, 从输出来看, bench的postgres进程9216只写了1个块设备, checkpoint也只写了1个块设备, 这即是disktop.stp中的bug, 记录的只是最后一次块操作的设备名.   
[root@db-172-16-3-150 io]# stap disktop.stp   
Fri Nov 15 02:42:34 2013 , Average:6711Kb/sec, Read:       7Kb, Write:  33552Kb  
     UID      PID     PPID                       CMD   DEVICE    T        BYTES  
     500     9176     9171                  postgres     sde1    W     31064064  
     500     9216     9171                  postgres     sda1    W      3293184  
     500     9223     9178                        sh     sdc1    R         3620  
     500     9223     9178                      date     sdc1    R         3533  
     500     9223     9178                  postgres     sdc1    R          788  
     500     9172     9171                  postgres     sde1    W           29  
  
Fri Nov 15 02:42:39 2013 , Average:3754Kb/sec, Read:     135Kb, Write:  18637Kb  
     UID      PID     PPID                       CMD   DEVICE    T        BYTES  
     500     9176     9171                  postgres     sde1    W     15630336  
     500     9216     9171                  postgres     sda1    W      3448832  
     502     9224     1670                  postgres     sdd1    R       136419  
     502     1678     1670                  postgres     sdd1    W         5547  
     502     1676     1670                  postgres     sdd1    R         1968  
  
下面我把disktop.stp修改一下, 在device中存储按块设备的统计, 原来的io_stat保持不变. 脚本如下  
[root@db-172-16-3-150 io]# cat disktop_digoal.stp   
#!/usr/bin/stap   
  
global io_stat,device  
global read_bytes,write_bytes  
  
probe vfs.read.return {  
  if ($return>0) {  
    if (devname!="N/A") {/*skip read from cache*/  
      io_stat[pid(),execname(),uid(),ppid(),"R"] += $return  
      device[pid(),execname(),uid(),ppid(),"R", devname] += $return  
      read_bytes += $return  
    }  
  }  
}  
  
probe vfs.write.return {  
  if ($return>0) {  
    if (devname!="N/A") { /*skip update cache*/  
      io_stat[pid(),execname(),uid(),ppid(),"W"] += $return  
      device[pid(),execname(),uid(),ppid(),"W", devname] += $return  
      write_bytes += $return  
    }  
  }  
}  
  
probe timer.ms(5000) {  
  /* skip non-read/write disk */  
  if (read_bytes+write_bytes) {  
  
    printf("\n%-25s, %-8s%4dKb/sec, %-7s%6dKb, %-7s%6dKb\n\n",  
           ctime(gettimeofday_s()),  
           "Average:", ((read_bytes+write_bytes)/1024)/5,  
           "Read:",read_bytes/1024,  
           "Write:",write_bytes/1024)  
  
    /* print header */  
    printf("%8s %8s %8s %25s %8s %4s %12s\n",  
           "UID","PID","PPID","CMD","DEVICE","T","BYTES")  
  }  
  /* print top ten I/O */  
  foreach ([process,cmd,userid,parent,action] in io_stat- limit 10) {  
    foreach ([a,b,c,d,e,f] in device) {  
      if (a==process && b==cmd && c==userid && d==parent && e==action)   
        printf("%8d %8d %8d %25s %8s %4s %12d\n",  
           userid,process,parent,cmd,  
           f,  
           action,device[process,cmd,userid,parent,action,f])  
    }  
  }  
  
  /* clear data */  
  delete io_stat  
  delete device  
  read_bytes = 0  
  write_bytes = 0    
}  
  
probe end{  
  delete io_stat  
  delete device  
  delete read_bytes  
  delete write_bytes  
}  
修改后重复以上测试 :   
[root@db-172-16-3-150 systemtap.examples]# ps -ewf|grep pg93  
pg93      9614     1  0 11:17 pts/1    00:00:00 /home/pg93/pgsql9.3.1/bin/postgres  
pg93      9615  9614  0 11:17 ?        00:00:00 postgres: logger process            
pg93      9617  9614  0 11:17 ?        00:00:00 postgres: checkpointer process      
pg93      9618  9614  0 11:17 ?        00:00:00 postgres: writer process            
pg93      9619  9614  0 11:17 ?        00:00:00 postgres: wal writer process        
pg93      9620  9614  0 11:17 ?        00:00:00 postgres: autovacuum launcher process     
pg93      9621  9614  0 11:17 ?        00:00:00 postgres: archiver process   last was 0000000100000010000000FB  
pg93      9622  9614  0 11:17 ?        00:00:00 postgres: stats collector process     
pg93      9632  9000 16 11:18 pts/3    00:00:00 pgbench -M prepared -n -r -f ./test.sql -c 1 -T 60 -h /ssd2/pg93/pg_root -p 1921 -U postgres digoal  
pg93      9634  9614 99 11:18 ?        00:00:03 postgres: postgres digoal [local] SELECT  
可以看到, 修改后的脚本, 输出的设备名有意义了, 同一个进程9249, 也就是bench进程, 写了3个块设备的数据. sdb1 sda1 sde1,  
分别是global, table, index所在的块设备. 同意可以看到wal writer进程只写了pg_xlog所在的块设备.  
[root@db-172-16-3-150 io]# stap disktop_digoal.stp   
Fri Nov 15 03:18:36 2013 , Average:4103Kb/sec, Read:    1711Kb, Write:  18808Kb  
     UID      PID     PPID                       CMD   DEVICE    T        BYTES  
     500     9619     9614                  postgres     sdd1    W     16220160  
     500     9634     9614                  postgres     sda1    W      1564672  
     500     9634     9614                  postgres     sdb1    W      1343488  
     500     9634     9614                  postgres     sde1    W       131072  
     500     9634     9614                  postgres     sda1    R      1744896  
     500     9652     9621                        sh     sdc1    R         3620  
     500     9652     9621                      date     sdc1    R         3533  
     500     9652     9621                  postgres     sdc1    R          788  
     500     9615     9614                  postgres     sde1    W           29  
  
Fri Nov 15 03:18:41 2013 , Average:3780Kb/sec, Read:    1175Kb, Write:  17725Kb  
     UID      PID     PPID                       CMD   DEVICE    T        BYTES  
     500     9619     9614                  postgres     sdd1    W     15204352  
     500     9634     9614                  postgres     sda1    W      1589248  
     500     9634     9614                  postgres     sdb1    W      1220608  
     500     9634     9614                  postgres     sde1    W       131072  
     500     9634     9614                  postgres     sda1    R      1064960  
     502     9653     1670                  postgres     sdd1    R       136419  
     502     1678     1670                  postgres     sdd1    W         5547  
     502     1676     1670                  postgres     sdd1    R         1968  
  
本文用到的2个probe alias原型.  
/usr/share/systemtap/tapset/vfs.stp  
probe vfs.read.return = kernel.function("vfs_read").return  
{  
        name = "vfs.read"  
        retstr = sprintf("%d", $return)  
  
        file = $file  
        pos = $pos  
        buf = $buf  
        bytes_to_read = $count  
        dev = __file_dev($file)  
        devname = __find_bdevname(dev, __file_bdev($file))  
        ino = __file_ino($file)  
  
        ret = $return  
        bytes_read = $return > 0 ? $return : 0  
        error = $return < 0 ? $return : 0  
        error_str = error ? errno_str(error) : ""  
}  
  
probe vfs.write.return = kernel.function("vfs_write").return  
{  
        name = "vfs.write"  
        retstr = sprintf("%d", $return)  
  
        file = $file  
        pos = $pos  
        buf = $buf  
        bytes_to_write = $count  
        dev = __file_dev($file)  
        devname = __find_bdevname(dev, __file_bdev($file))  
        ino = __file_ino($file)  
  
        ret = $return  
        bytes_written = $return > 0 ? $return : 0  
        error = $return < 0 ? $return : 0  
        error_str = error ? errno_str(error) : ""  
}  
$return为.return事件的上下文变量, 即事件函数返回值, 如本例用到的vfs_read.return, 对应的函数vfs_read@fs/read_write.c的返回类型为ssize_t, 代码中为ret, systemtap将使用$return来代表这个ret.  
[root@db-172-16-3-150 io]# stap -L 'kernel.function("vfs_read")'  
kernel.function("vfs_read@fs/read_write.c:294") $file:struct file* $buf:char* $count:size_t $pos:loff_t*  
[root@db-172-16-3-150 io]# stap -L 'kernel.function("vfs_read").return'  
kernel.function("vfs_read@fs/read_write.c:294").return $return:ssize_t $file:struct file* $buf:char* $count:size_t $pos:loff_t*  
  
这两个函数的源码 :   
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/fs/read_write.c  
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)  
{  
        ssize_t ret;  
  
        if (!(file->f_mode & FMODE_READ))  
                return -EBADF;  
        if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))  
                return -EINVAL;  
        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))  
                return -EFAULT;  
  
        ret = rw_verify_area(READ, file, pos, count);  
        if (ret >= 0) {  
                count = ret;  
                if (file->f_op->read)  
                        ret = file->f_op->read(file, buf, count, pos);  
                else  
                        ret = do_sync_read(file, buf, count, pos);  
                if (ret > 0) {  
                        fsnotify_access(file->f_path.dentry);  
                        add_rchar(current, ret);  
                }  
                inc_syscr(current);  
        }  
  
        return ret;  
}  
EXPORT_SYMBOL(vfs_read);  
  
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)  
{  
        ssize_t ret;  
  
        if (!(file->f_mode & FMODE_WRITE))  
                return -EBADF;  
        if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))  
                return -EINVAL;  
        if (unlikely(!access_ok(VERIFY_READ, buf, count)))  
                return -EFAULT;  
  
        ret = rw_verify_area(WRITE, file, pos, count);  
        if (ret >= 0) {  
                count = ret;  
                if (file->f_op->write)  
                        ret = file->f_op->write(file, buf, count, pos);  
                else  
                        ret = do_sync_write(file, buf, count, pos);  
                if (ret > 0) {  
                        fsnotify_modify(file->f_path.dentry);  
                        add_wchar(current, ret);  
                }  
                inc_syscw(current);  
        }  
  
        return ret;  
}  
EXPORT_SYMBOL(vfs_write);  

参考

1. https://sourceware.org/systemtap/SystemTap_Beginners_Guide/mainsect-disk.html

2. https://sourceware.org/systemtap/examples/

3. /usr/share/systemtap/testsuite/systemtap.examples

4. systemtap-testsuite

5. /usr/share/systemtap/testsuite/systemtap.examples/index.txt

6. /usr/share/systemtap/testsuite/systemtap.examples/keyword-index.txt

7. /usr/share/systemtap/tapset

8. https://sourceware.org/systemtap/tapsets/API-ctime.html

9. https://sourceware.org/systemtap/tapsets/API-gettimeofday-s.html

Flag Counter

digoal’s 大量PostgreSQL文章入口