Systemtap examples, DISK IO - 7 Periodically Print I/O Block Time

7 minute read

背景

例子来自ioblktime.stp 脚本, 该脚本用以统计每个块设备的io块请求情况, 请求次数, io操作耗费的时间以systemtap的统计变量进程存储, 可以得到块设备上io块请求的平均时间, 总时间, 最大, 最小时间, 次数, 柱状图等等.  
因为内核支持合并块操作, 所以通过ioblock.request触发的块请求, 可能被合并掉, 因此通过对函数kernel.trace("block_bio_frontmerge"), kernel.trace("block_bio_backmerge")的事件跟踪, 可以得到块设备上精确的块操作统计情况.  
脚本内容以及注解 :   
[root@db-172-16-3-150 network]# cd /usr/share/systemtap/testsuite/systemtap.examples/io  
[root@db-172-16-3-150 io]# cat ioblktime.stp  
#!/usr/bin/stap  
  
global req_time, etimes  
// req_time数组, 索引为上下文变量bio的内存地址. 值为当前时间  
// etimes数组, 索引为设备名, R|W字符串. 存储的值为ioblock的整个操作耗时的统计变量.  
// 如果数组元素超过默认的2048可能会导致运行错误退出, 那么可使用stap -D MAXMAPENTRIES=10240或者更大的值来规避这个错误,  
// 或者指定数组的长度global req_time[100000] 或者是使用rotate数组global req_time%[10000]  
  
probe ioblock.request  
{  
  req_time[$bio] = gettimeofday_us()  
}  
// 当ioblock请求发生时, 记录下内存地址和当前时间  
  
probe ioblock.end  
{  
  t = gettimeofday_us()  // 当前事件  
  s =  req_time[$bio]  // 根据bio找到ioblock.request事件记录的请求开始时间.  
  delete req_time[$bio]  // 删除请求开始时间  
  if (s) {  // 当请求未被合并时, 记录此次io操作的耗时.  
    etimes[devname, bio_rw_str(rw)] <<< t - s  
  }  
}  
// 当io操作结束时, 记录下io操作耗时, 存放至统计数组.  
  
/* for time being delete things that get merged with others */  
probe kernel.trace("block_bio_frontmerge"),  
      kernel.trace("block_bio_backmerge")  
{  
  delete req_time[$bio]  // 发生io合并时, 删除请求开始事件.  
}  
  
probe timer.s(10), end {  
  ansi_clear_screen()  // 清屏  
  printf("%10s %3s %10s %10s %10s\n",  
         "device", "rw", "total (us)", "count", "avg (us)")  // 输出头信息  
  foreach ([dev,rw] in etimes - limit 20) {  
    printf("%10s %3s %10d %10d %10d\n", dev, rw,  
           @sum(etimes[dev,rw]), @count(etimes[dev,rw]), @avg(etimes[dev,rw]))  
  }  
// 输出实际的io操作次数排名前20位的信息. 包含设备名, 读写, 总耗时, 实际io操作次数, 平均每次io的耗时.  
  delete etimes  
// 删除统计变量数组, 下10秒重新统计  
}  
  
执行输出举例 :   
[root@db-172-16-3-150 io]# stap ioblktime.stp  
    device  rw total (us)      count   avg (us)  
       sdd   W    5831844       2884       2022  
      sdd1   W      12091       2868          4  
       sdb   W       9210          8       1151  
      sdb1   W        279          6         46  
       sda   W      43526          4      10881  
       sde   W        580          3        193  
  
本文用到的几个probe alias原型(包含对应的call原型).  
[root@db-172-16-3-150 io]# less /usr/share/systemtap/tapset/ioblock.stp  
/**  
 *  probe ioblock.request - Fires whenever making a generic block I/O request.  
 *  
 *  @name      - name of the probe point  
 *  @devname   - block device name  
 *  @ino       - i-node number of the mapped file  
 *  @sector    - beginning sector for the entire bio  
 *  @flags     - see below  
 *      BIO_UPTODATE    0       ok after I/O completion  
 *      BIO_RW_BLOCK    1       RW_AHEAD set, and read/write would block  
 *      BIO_EOF         2       out-out-bounds error  
 *      BIO_SEG_VALID   3       nr_hw_seg valid   
 *      BIO_CLONED      4       doesn't own data  
 *      BIO_BOUNCED     5       bio is a bounce bio  
 *      BIO_USER_MAPPED 6       contains user pages  
 *      BIO_EOPNOTSUPP  7       not supported  
 *    
 *  @rw        - binary trace for read/write request  
 *  @vcnt      - bio vector count which represents number of array element (page, offset, length) which make up this I/O request  
 *  @idx       - offset into the bio vector array  
 *  @phys_segments - number of segments in this bio after physical address coalescing is performed  
 *  @hw_segments -   number of segments after physical and DMA remapping hardware coalescing is performed  
 *  @size      - total size in bytes  
 *  @bdev      - target block device  
 *  @bdev_contains - points to the device object which contains the partition (when bio structure represents a partition)  
 *  @p_start_sect -  points to the start sector of the partition structure of the device  
 *  
 * Context:  
 *  The process makes block I/O request  
 */  
probe ioblock.request = kernel.function ("generic_make_request")  
{  
        name = "ioblock.request"  
        devname = __bio_devname($bio)  
        ino = __bio_ino($bio)  
  
        sector = $bio->bi_sector  
        flags = $bio->bi_flags  
        rw = $bio->bi_rw  
        vcnt = $bio->bi_vcnt  
        idx = $bio->bi_idx  
        phys_segments = $bio->bi_phys_segments  
        hw_segments = (@defined($bio->bi_hw_segments)  
                       ? $bio->bi_hw_segments : 0)  
        size = $bio->bi_size  
  
        bdev = $bio->bi_bdev  
        bdev_contains = $bio->bi_bdev->bd_contains  
        p_start_sect = __bio_start_sect($bio)  
}  
  
/**  
 * probe ioblock.end - Fires whenever a block I/O transfer is complete.  
 *  
 *  @name      - name of the probe point  
 *  @devname   - block device name  
 *  @ino       - i-node number of the mapped file  
 *  @bytes_done - number of bytes transferred  
 *  @sector    - beginning sector for the entire bio  
 *  @flags     - see below  
 *      BIO_UPTODATE    0       ok after I/O completion  
 *      BIO_RW_BLOCK    1       RW_AHEAD set, and read/write would block  
 *      BIO_EOF         2       out-out-bounds error  
 *      BIO_SEG_VALID   3       nr_hw_seg valid  
 *      BIO_CLONED      4       doesn't own data  
 *      BIO_BOUNCED     5       bio is a bounce bio  
 *      BIO_USER_MAPPED 6       contains user pages  
 *      BIO_EOPNOTSUPP  7       not supported  
 *  @error     - 0 on success  
 *  @rw        - binary trace for read/write request  
 *  @vcnt      - bio vector count which represents number of array element (page, offset, length) which makes up this I/O request  
 *  @idx       - offset into the bio vector array  
 *  @phys_segments - number of segments in this bio after physical address coalescing is performed.  
 *  @hw_segments -   number of segments after physical and DMA remapping hardware coalescing is performed  
 *  @size      - total size in bytes  
 *  
 * Context:  
 *  The process signals the transfer is done.  
 */  
probe ioblock.end = kernel.function("bio_endio")  
{  
        name = "ioblock.end"  
        devname = __bio_devname($bio)  
        ino = __bio_ino($bio)  
  
        bytes_done = (@defined($bytes_done) ? $bytes_done : $bio->bi_size)  
        error = $error  
  
        sector = $bio->bi_sector  
        flags = $bio->bi_flags  
        rw = $bio->bi_rw  
        vcnt = $bio->bi_vcnt  
        idx = $bio->bi_idx  
        phys_segments = $bio->bi_phys_segments  
        hw_segments = (@defined($bio->bi_hw_segments)  
                       ? $bio->bi_hw_segments : 0)  
        size = $bio->bi_size  
}  
  
// 把ioblock.end的rw变量转换成R,W字符串  
/* returns 0 for read, 1 for write */  
function bio_rw_num:long(rw:long)  
%{ /* pure */  
    long rw = (long)STAP_ARG_rw;  
    STAP_RETVALUE = (rw & REQ_WRITE);  
%}  
  
/* returns R for read, W for write */  
function bio_rw_str(rw:long)  
{  
    return bio_rw_num(rw) == BIO_READ ? "R" : "W"  
}  
  
这几个函数的源码位置信息:   
[root@db-172-16-3-150 io]# stap -L 'kernel.function ("generic_make_request")'  
kernel.function("generic_make_request@block/blk-core.c:1770") $bio:struct bio*  
[root@db-172-16-3-150 io]# stap -L 'kernel.function ("bio_endio")'  
kernel.function("bio_endio@fs/bio.c:1425") $bio:struct bio* $error:int  
[root@db-172-16-3-150 io]# stap -L 'kernel.trace("block_bio_backmerge")'  
kernel.trace("block_bio_backmerge") $q:struct request_queue* $bio:struct bio*  
[root@db-172-16-3-150 io]# stap -L 'kernel.trace("block_bio_frontmerge")'  
kernel.trace("block_bio_frontmerge") $q:struct request_queue* $bio:struct bio*  
  
源码 :   
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/block/blk-core.c  
/*  
 * We only want one ->make_request_fn to be active at a time,  
 * else stack usage with stacked devices could be a problem.  
 * So use current->bio_{list,tail} to keep a list of requests  
 * submited by a make_request_fn function.  
 * current->bio_tail is also used as a flag to say if  
 * generic_make_request is currently active in this task or not.  
 * If it is NULL, then no make_request is active.  If it is non-NULL,  
 * then a make_request is active, and new requests should be added  
 * at the tail  
 */  
void generic_make_request(struct bio *bio)  
{  
        if (current->bio_tail) {  
                /* make_request is active */  
                *(current->bio_tail) = bio;  
                bio->bi_next = NULL;  
                current->bio_tail = &bio->bi_next;  
                return;  
        }  
        /* following loop may be a bit non-obvious, and so deserves some  
         * explanation.  
         * Before entering the loop, bio->bi_next is NULL (as all callers  
         * ensure that) so we have a list with a single bio.  
         * We pretend that we have just taken it off a longer list, so  
         * we assign bio_list to the next (which is NULL) and bio_tail  
         * to &bio_list, thus initialising the bio_list of new bios to be  
         * added.  __generic_make_request may indeed add some more bios  
         * through a recursive call to generic_make_request.  If it  
         * did, we find a non-NULL value in bio_list and re-enter the loop  
         * from the top.  In this case we really did just take the bio  
         * of the top of the list (no pretending) and so fixup bio_list and  
         * bio_tail or bi_next, and call into __generic_make_request again.  
         *  
         * The loop was structured like this to make only one call to  
         * __generic_make_request (which is important as it is large and  
         * inlined) and to keep the structure simple.  
         */  
        BUG_ON(bio->bi_next);  
        do {  
                current->bio_list = bio->bi_next;  
                if (bio->bi_next == NULL)  
                        current->bio_tail = &current->bio_list;  
                else  
                        bio->bi_next = NULL;  
                __generic_make_request(bio);  
                bio = current->bio_list;  
        } while (bio);  
        current->bio_tail = NULL; /* deactivate */  
}  
EXPORT_SYMBOL(generic_make_request);  
  
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/fs/bio.c  
/**  
 * bio_endio - end I/O on a bio  
 * @bio:        bio  
 * @error:      error, if any  
 *  
 * Description:  
 *   bio_endio() will end I/O on the whole bio. bio_endio() is the  
 *   preferred way to end I/O on a bio, it takes care of clearing  
 *   BIO_UPTODATE on error. @error is 0 on success, and and one of the  
 *   established -Exxxx (-EIO, for instance) error values in case  
 *   something went wrong. Noone should call bi_end_io() directly on a  
 *   bio unless they own it and thus know that it has an end_io  
 *   function.  
 **/  
void bio_endio(struct bio *bio, int error)  
{  
        if (error)  
                clear_bit(BIO_UPTODATE, &bio->bi_flags);  
        else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))  
                error = -EIO;  
  
        if (bio->bi_end_io)  
                bio->bi_end_io(bio, error);  
}  
EXPORT_SYMBOL(bio_endio);  
  
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/include/trace/events/block.h  
DECLARE_EVENT_CLASS(block_bio,  
  
        TP_PROTO(struct request_queue *q, struct bio *bio),  
  
        TP_ARGS(q, bio),  
  
        TP_STRUCT__entry(  
                __field( dev_t,         dev                     )  
                __field( sector_t,      sector                  )  
                __field( unsigned int,  nr_sector               )  
                __array( char,          rwbs,   RWBS_LEN        )  
                __array( char,          comm,   TASK_COMM_LEN   )  
        ),  
  
        TP_fast_assign(  
                __entry->dev            = bio->bi_bdev->bd_dev;  
                __entry->sector         = bio->bi_sector;  
                __entry->nr_sector      = bio->bi_size >> 9;  
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);  
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);  
        ),  
  
        TP_printk("%d,%d %s %llu + %u [%s]",  
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,  
                  (unsigned long long)__entry->sector,  
                  __entry->nr_sector, __entry->comm)  
);  
DEFINE_EVENT(block_bio, block_bio_backmerge,  
  
        TP_PROTO(struct request_queue *q, struct bio *bio),  
  
        TP_ARGS(q, bio)  
);  
  
DEFINE_EVENT(block_bio, block_bio_frontmerge,  
  
        TP_PROTO(struct request_queue *q, struct bio *bio),  
  
        TP_ARGS(q, bio)  
);  

参考

1. https://sourceware.org/systemtap/SystemTap_Beginners_Guide/mainsect-disk.html

2. https://sourceware.org/systemtap/examples/

3. /usr/share/systemtap/testsuite/systemtap.examples

4. systemtap-testsuite

5. /usr/share/systemtap/testsuite/systemtap.examples/index.txt

6. /usr/share/systemtap/testsuite/systemtap.examples/keyword-index.txt

7. /usr/share/systemtap/tapset

8. https://sourceware.org/systemtap/tapsets/API-ansi-clear-screen.html

Flag Counter

digoal’s 大量PostgreSQL文章入口