Systemtap examples, DISK IO - 7 Periodically Print I/O Block Time
背景
例子来自ioblktime.stp 脚本, 该脚本用以统计每个块设备的io块请求情况, 请求次数, io操作耗费的时间以systemtap的统计变量进程存储, 可以得到块设备上io块请求的平均时间, 总时间, 最大, 最小时间, 次数, 柱状图等等.
因为内核支持合并块操作, 所以通过ioblock.request触发的块请求, 可能被合并掉, 因此通过对函数kernel.trace("block_bio_frontmerge"), kernel.trace("block_bio_backmerge")的事件跟踪, 可以得到块设备上精确的块操作统计情况.
脚本内容以及注解 :
[root@db-172-16-3-150 network]# cd /usr/share/systemtap/testsuite/systemtap.examples/io
[root@db-172-16-3-150 io]# cat ioblktime.stp
#!/usr/bin/stap
global req_time, etimes
// req_time数组, 索引为上下文变量bio的内存地址. 值为当前时间
// etimes数组, 索引为设备名, R|W字符串. 存储的值为ioblock的整个操作耗时的统计变量.
// 如果数组元素超过默认的2048可能会导致运行错误退出, 那么可使用stap -D MAXMAPENTRIES=10240或者更大的值来规避这个错误,
// 或者指定数组的长度global req_time[100000] 或者是使用rotate数组global req_time%[10000]
probe ioblock.request
{
req_time[$bio] = gettimeofday_us()
}
// 当ioblock请求发生时, 记录下内存地址和当前时间
probe ioblock.end
{
t = gettimeofday_us() // 当前事件
s = req_time[$bio] // 根据bio找到ioblock.request事件记录的请求开始时间.
delete req_time[$bio] // 删除请求开始时间
if (s) { // 当请求未被合并时, 记录此次io操作的耗时.
etimes[devname, bio_rw_str(rw)] <<< t - s
}
}
// 当io操作结束时, 记录下io操作耗时, 存放至统计数组.
/* for time being delete things that get merged with others */
probe kernel.trace("block_bio_frontmerge"),
kernel.trace("block_bio_backmerge")
{
delete req_time[$bio] // 发生io合并时, 删除请求开始事件.
}
probe timer.s(10), end {
ansi_clear_screen() // 清屏
printf("%10s %3s %10s %10s %10s\n",
"device", "rw", "total (us)", "count", "avg (us)") // 输出头信息
foreach ([dev,rw] in etimes - limit 20) {
printf("%10s %3s %10d %10d %10d\n", dev, rw,
@sum(etimes[dev,rw]), @count(etimes[dev,rw]), @avg(etimes[dev,rw]))
}
// 输出实际的io操作次数排名前20位的信息. 包含设备名, 读写, 总耗时, 实际io操作次数, 平均每次io的耗时.
delete etimes
// 删除统计变量数组, 下10秒重新统计
}
执行输出举例 :
[root@db-172-16-3-150 io]# stap ioblktime.stp
device rw total (us) count avg (us)
sdd W 5831844 2884 2022
sdd1 W 12091 2868 4
sdb W 9210 8 1151
sdb1 W 279 6 46
sda W 43526 4 10881
sde W 580 3 193
本文用到的几个probe alias原型(包含对应的call原型).
[root@db-172-16-3-150 io]# less /usr/share/systemtap/tapset/ioblock.stp
/**
* probe ioblock.request - Fires whenever making a generic block I/O request.
*
* @name - name of the probe point
* @devname - block device name
* @ino - i-node number of the mapped file
* @sector - beginning sector for the entire bio
* @flags - see below
* BIO_UPTODATE 0 ok after I/O completion
* BIO_RW_BLOCK 1 RW_AHEAD set, and read/write would block
* BIO_EOF 2 out-out-bounds error
* BIO_SEG_VALID 3 nr_hw_seg valid
* BIO_CLONED 4 doesn't own data
* BIO_BOUNCED 5 bio is a bounce bio
* BIO_USER_MAPPED 6 contains user pages
* BIO_EOPNOTSUPP 7 not supported
*
* @rw - binary trace for read/write request
* @vcnt - bio vector count which represents number of array element (page, offset, length) which make up this I/O request
* @idx - offset into the bio vector array
* @phys_segments - number of segments in this bio after physical address coalescing is performed
* @hw_segments - number of segments after physical and DMA remapping hardware coalescing is performed
* @size - total size in bytes
* @bdev - target block device
* @bdev_contains - points to the device object which contains the partition (when bio structure represents a partition)
* @p_start_sect - points to the start sector of the partition structure of the device
*
* Context:
* The process makes block I/O request
*/
probe ioblock.request = kernel.function ("generic_make_request")
{
name = "ioblock.request"
devname = __bio_devname($bio)
ino = __bio_ino($bio)
sector = $bio->bi_sector
flags = $bio->bi_flags
rw = $bio->bi_rw
vcnt = $bio->bi_vcnt
idx = $bio->bi_idx
phys_segments = $bio->bi_phys_segments
hw_segments = (@defined($bio->bi_hw_segments)
? $bio->bi_hw_segments : 0)
size = $bio->bi_size
bdev = $bio->bi_bdev
bdev_contains = $bio->bi_bdev->bd_contains
p_start_sect = __bio_start_sect($bio)
}
/**
* probe ioblock.end - Fires whenever a block I/O transfer is complete.
*
* @name - name of the probe point
* @devname - block device name
* @ino - i-node number of the mapped file
* @bytes_done - number of bytes transferred
* @sector - beginning sector for the entire bio
* @flags - see below
* BIO_UPTODATE 0 ok after I/O completion
* BIO_RW_BLOCK 1 RW_AHEAD set, and read/write would block
* BIO_EOF 2 out-out-bounds error
* BIO_SEG_VALID 3 nr_hw_seg valid
* BIO_CLONED 4 doesn't own data
* BIO_BOUNCED 5 bio is a bounce bio
* BIO_USER_MAPPED 6 contains user pages
* BIO_EOPNOTSUPP 7 not supported
* @error - 0 on success
* @rw - binary trace for read/write request
* @vcnt - bio vector count which represents number of array element (page, offset, length) which makes up this I/O request
* @idx - offset into the bio vector array
* @phys_segments - number of segments in this bio after physical address coalescing is performed.
* @hw_segments - number of segments after physical and DMA remapping hardware coalescing is performed
* @size - total size in bytes
*
* Context:
* The process signals the transfer is done.
*/
probe ioblock.end = kernel.function("bio_endio")
{
name = "ioblock.end"
devname = __bio_devname($bio)
ino = __bio_ino($bio)
bytes_done = (@defined($bytes_done) ? $bytes_done : $bio->bi_size)
error = $error
sector = $bio->bi_sector
flags = $bio->bi_flags
rw = $bio->bi_rw
vcnt = $bio->bi_vcnt
idx = $bio->bi_idx
phys_segments = $bio->bi_phys_segments
hw_segments = (@defined($bio->bi_hw_segments)
? $bio->bi_hw_segments : 0)
size = $bio->bi_size
}
// 把ioblock.end的rw变量转换成R,W字符串
/* returns 0 for read, 1 for write */
function bio_rw_num:long(rw:long)
%{ /* pure */
long rw = (long)STAP_ARG_rw;
STAP_RETVALUE = (rw & REQ_WRITE);
%}
/* returns R for read, W for write */
function bio_rw_str(rw:long)
{
return bio_rw_num(rw) == BIO_READ ? "R" : "W"
}
这几个函数的源码位置信息:
[root@db-172-16-3-150 io]# stap -L 'kernel.function ("generic_make_request")'
kernel.function("generic_make_request@block/blk-core.c:1770") $bio:struct bio*
[root@db-172-16-3-150 io]# stap -L 'kernel.function ("bio_endio")'
kernel.function("bio_endio@fs/bio.c:1425") $bio:struct bio* $error:int
[root@db-172-16-3-150 io]# stap -L 'kernel.trace("block_bio_backmerge")'
kernel.trace("block_bio_backmerge") $q:struct request_queue* $bio:struct bio*
[root@db-172-16-3-150 io]# stap -L 'kernel.trace("block_bio_frontmerge")'
kernel.trace("block_bio_frontmerge") $q:struct request_queue* $bio:struct bio*
源码 :
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/block/blk-core.c
/*
* We only want one ->make_request_fn to be active at a time,
* else stack usage with stacked devices could be a problem.
* So use current->bio_{list,tail} to keep a list of requests
* submited by a make_request_fn function.
* current->bio_tail is also used as a flag to say if
* generic_make_request is currently active in this task or not.
* If it is NULL, then no make_request is active. If it is non-NULL,
* then a make_request is active, and new requests should be added
* at the tail
*/
void generic_make_request(struct bio *bio)
{
if (current->bio_tail) {
/* make_request is active */
*(current->bio_tail) = bio;
bio->bi_next = NULL;
current->bio_tail = &bio->bi_next;
return;
}
/* following loop may be a bit non-obvious, and so deserves some
* explanation.
* Before entering the loop, bio->bi_next is NULL (as all callers
* ensure that) so we have a list with a single bio.
* We pretend that we have just taken it off a longer list, so
* we assign bio_list to the next (which is NULL) and bio_tail
* to &bio_list, thus initialising the bio_list of new bios to be
* added. __generic_make_request may indeed add some more bios
* through a recursive call to generic_make_request. If it
* did, we find a non-NULL value in bio_list and re-enter the loop
* from the top. In this case we really did just take the bio
* of the top of the list (no pretending) and so fixup bio_list and
* bio_tail or bi_next, and call into __generic_make_request again.
*
* The loop was structured like this to make only one call to
* __generic_make_request (which is important as it is large and
* inlined) and to keep the structure simple.
*/
BUG_ON(bio->bi_next);
do {
current->bio_list = bio->bi_next;
if (bio->bi_next == NULL)
current->bio_tail = ¤t->bio_list;
else
bio->bi_next = NULL;
__generic_make_request(bio);
bio = current->bio_list;
} while (bio);
current->bio_tail = NULL; /* deactivate */
}
EXPORT_SYMBOL(generic_make_request);
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/fs/bio.c
/**
* bio_endio - end I/O on a bio
* @bio: bio
* @error: error, if any
*
* Description:
* bio_endio() will end I/O on the whole bio. bio_endio() is the
* preferred way to end I/O on a bio, it takes care of clearing
* BIO_UPTODATE on error. @error is 0 on success, and and one of the
* established -Exxxx (-EIO, for instance) error values in case
* something went wrong. Noone should call bi_end_io() directly on a
* bio unless they own it and thus know that it has an end_io
* function.
**/
void bio_endio(struct bio *bio, int error)
{
if (error)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
error = -EIO;
if (bio->bi_end_io)
bio->bi_end_io(bio, error);
}
EXPORT_SYMBOL(bio_endio);
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/include/trace/events/block.h
DECLARE_EVENT_CLASS(block_bio,
TP_PROTO(struct request_queue *q, struct bio *bio),
TP_ARGS(q, bio),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( sector_t, sector )
__field( unsigned int, nr_sector )
__array( char, rwbs, RWBS_LEN )
__array( char, comm, TASK_COMM_LEN )
),
TP_fast_assign(
__entry->dev = bio->bi_bdev->bd_dev;
__entry->sector = bio->bi_sector;
__entry->nr_sector = bio->bi_size >> 9;
blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
),
TP_printk("%d,%d %s %llu + %u [%s]",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
(unsigned long long)__entry->sector,
__entry->nr_sector, __entry->comm)
);
DEFINE_EVENT(block_bio, block_bio_backmerge,
TP_PROTO(struct request_queue *q, struct bio *bio),
TP_ARGS(q, bio)
);
DEFINE_EVENT(block_bio, block_bio_frontmerge,
TP_PROTO(struct request_queue *q, struct bio *bio),
TP_ARGS(q, bio)
);
参考
1. https://sourceware.org/systemtap/SystemTap_Beginners_Guide/mainsect-disk.html
2. https://sourceware.org/systemtap/examples/
3. /usr/share/systemtap/testsuite/systemtap.examples
4. systemtap-testsuite
5. /usr/share/systemtap/testsuite/systemtap.examples/index.txt
6. /usr/share/systemtap/testsuite/systemtap.examples/keyword-index.txt
7. /usr/share/systemtap/tapset
8. https://sourceware.org/systemtap/tapsets/API-ansi-clear-screen.html