Systemtap kernel.trace(*) events source code

3 minute read

背景

我们知道内核trace event可以使用stap -l或-L查看, 而trace的源代码则可以在/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/include/trace/events中查找到.  
(请注意你的内核版本, 不同版本的目录地址可能不同)  
  
使用stap -l或者stap -L可以列出系统支持的所有kernel.trace, 如下.  
[root@db-172-16-3-150 events]# stap -l 'kernel.trace("**")'  
kernel.trace("__extent_writepage")  
kernel.trace("block_bio_backmerge")  
kernel.trace("block_bio_bounce")  
kernel.trace("block_bio_complete")  
kernel.trace("block_bio_frontmerge")  
kernel.trace("block_bio_queue")  
kernel.trace("block_getrq")  
kernel.trace("block_plug")  
kernel.trace("block_remap")  
kernel.trace("block_rq_abort")  
kernel.trace("block_rq_complete")  
kernel.trace("block_rq_insert")  
kernel.trace("block_rq_issue")  
kernel.trace("block_rq_remap")  
kernel.trace("block_rq_requeue")  
kernel.trace("block_sleeprq")  
kernel.trace("block_split")  
kernel.trace("block_unplug_io")  
kernel.trace("block_unplug_timer")  
kernel.trace("btrfs_chunk_alloc")  
kernel.trace("btrfs_chunk_free")  
kernel.trace("btrfs_cow_block")  
kernel.trace("btrfs_delayed_data_ref")  
kernel.trace("btrfs_delayed_ref_head")  
kernel.trace("btrfs_delayed_tree_ref")  
kernel.trace("btrfs_failed_cluster_setup")  
kernel.trace("btrfs_find_cluster")  
kernel.trace("btrfs_get_extent")  
kernel.trace("btrfs_inode_evict")  
kernel.trace("btrfs_inode_new")  
kernel.trace("btrfs_inode_request")  
kernel.trace("btrfs_ordered_extent_add")  
kernel.trace("btrfs_ordered_extent_put")  
kernel.trace("btrfs_ordered_extent_remove")  
kernel.trace("btrfs_ordered_extent_start")  
kernel.trace("btrfs_reserve_extent")  
kernel.trace("btrfs_reserve_extent_cluster")  
kernel.trace("btrfs_reserved_extent_alloc")  
kernel.trace("btrfs_reserved_extent_free")  
kernel.trace("btrfs_setup_cluster")  
kernel.trace("btrfs_space_reservation")  
kernel.trace("btrfs_sync_file")  
kernel.trace("btrfs_sync_fs")  
kernel.trace("btrfs_transaction_commit")  
kernel.trace("btrfs_writepage_end_io_hook")  
kernel.trace("consume_skb")  
kernel.trace("ext3_alloc_new_reservation")  
kernel.trace("ext3_allocate_blocks")  
kernel.trace("ext3_allocate_inode")  
kernel.trace("ext3_delete_inode")  
kernel.trace("ext3_direct_IO_enter")  
kernel.trace("ext3_direct_IO_exit")  
kernel.trace("ext3_discard_blocks")  
kernel.trace("ext3_discard_reservation")  
kernel.trace("ext3_forget")  
kernel.trace("ext3_free_blocks")  
kernel.trace("ext3_free_inode")  
kernel.trace("ext3_get_blocks_enter")  
kernel.trace("ext3_get_blocks_exit")  
kernel.trace("ext3_invalidatepage")  
kernel.trace("ext3_journalled_write_end")  
kernel.trace("ext3_journalled_writepage")  
kernel.trace("ext3_load_inode")  
kernel.trace("ext3_mark_inode_dirty")  
kernel.trace("ext3_ordered_write_end")  
kernel.trace("ext3_ordered_writepage")  
kernel.trace("ext3_read_block_bitmap")  
kernel.trace("ext3_readpage")  
kernel.trace("ext3_releasepage")  
kernel.trace("ext3_request_blocks")  
kernel.trace("ext3_request_inode")  
kernel.trace("ext3_reserved")  
kernel.trace("ext3_rsv_window_add")  
kernel.trace("ext3_sync_file_enter")  
kernel.trace("ext3_sync_file_exit")  
kernel.trace("ext3_sync_fs")  
kernel.trace("ext3_truncate_enter")  
kernel.trace("ext3_truncate_exit")  
kernel.trace("ext3_unlink_enter")  
kernel.trace("ext3_unlink_exit")  
kernel.trace("ext3_write_begin")  
kernel.trace("ext3_writeback_write_end")  
kernel.trace("ext3_writeback_writepage")  
kernel.trace("ext4_alloc_da_blocks")  
kernel.trace("ext4_allocate_blocks")  
kernel.trace("ext4_allocate_inode")  
kernel.trace("ext4_da_write_begin")  
kernel.trace("ext4_da_write_end")  
kernel.trace("ext4_da_write_pages")  
kernel.trace("ext4_da_writepages")  
kernel.trace("ext4_da_writepages_result")  
kernel.trace("ext4_discard_blocks")  
kernel.trace("ext4_discard_preallocations")  
kernel.trace("ext4_free_blocks")  
kernel.trace("ext4_free_inode")  
kernel.trace("ext4_journalled_write_end")  
kernel.trace("ext4_mb_discard_preallocations")  
kernel.trace("ext4_mb_new_group_pa")  
kernel.trace("ext4_mb_new_inode_pa")  
kernel.trace("ext4_mb_release_group_pa")  
kernel.trace("ext4_mb_release_inode_pa")  
kernel.trace("ext4_mballoc_alloc")  
kernel.trace("ext4_mballoc_discard")  
kernel.trace("ext4_mballoc_free")  
kernel.trace("ext4_mballoc_prealloc")  
kernel.trace("ext4_ordered_write_end")  
kernel.trace("ext4_request_blocks")  
kernel.trace("ext4_request_inode")  
kernel.trace("ext4_sync_file")  
kernel.trace("ext4_sync_fs")  
kernel.trace("ext4_trim_all_free")  
kernel.trace("ext4_trim_extent")  
kernel.trace("ext4_write_begin")  
kernel.trace("ext4_writeback_write_end")  
kernel.trace("ext4_writepage")  
... 略  
kernel.trace("xfs_lookup")  
kernel.trace("xfs_map_blocks_alloc")  
kernel.trace("xfs_map_blocks_found")  
kernel.trace("xfs_pagecache_inval")  
kernel.trace("xfs_perag_clear_reclaim")  
kernel.trace("xfs_perag_get")  
kernel.trace("xfs_perag_get_tag")  
kernel.trace("xfs_perag_put")  
kernel.trace("xfs_perag_set_reclaim")  
kernel.trace("xfs_readdir")  
kernel.trace("xfs_readlink")  
kernel.trace("xfs_releasepage")  
kernel.trace("xfs_remove")  
kernel.trace("xfs_rename")  
kernel.trace("xfs_reset_dqcounts")  
kernel.trace("xfs_setattr")  
kernel.trace("xfs_swap_extent_after")  
kernel.trace("xfs_swap_extent_before")  
kernel.trace("xfs_symlink")  
kernel.trace("xfs_trans_bhold")  
kernel.trace("xfs_trans_bhold_release")  
kernel.trace("xfs_trans_binval")  
kernel.trace("xfs_trans_bjoin")  
kernel.trace("xfs_trans_brelse")  
kernel.trace("xfs_trans_commit_lsn")  
kernel.trace("xfs_trans_get_buf")  
kernel.trace("xfs_trans_get_buf_recur")  
kernel.trace("xfs_trans_getsb")  
kernel.trace("xfs_trans_getsb_recur")  
kernel.trace("xfs_trans_log_buf")  
kernel.trace("xfs_trans_read_buf")  
kernel.trace("xfs_trans_read_buf_io")  
kernel.trace("xfs_trans_read_buf_recur")  
kernel.trace("xfs_trans_read_buf_shut")  
kernel.trace("xfs_unwritten_convert")  
kernel.trace("xfs_vm_bmap")  
kernel.trace("xfs_writepage")  
stap -L的输出还包含每个trace的上下文变量信息 :   
[root@db-172-16-3-150 events]# stap -L 'kernel.trace("**")'|less  
kernel.trace("__extent_writepage") $page:struct page* $inode:struct inode* $wbc:struct writeback_control*  
kernel.trace("block_bio_backmerge") $q:struct request_queue* $bio:struct bio*  
kernel.trace("block_bio_bounce") $q:struct request_queue* $bio:struct bio*  
kernel.trace("block_bio_complete") $q:struct request_queue* $bio:struct bio*  
kernel.trace("block_bio_frontmerge") $q:struct request_queue* $bio:struct bio*  
kernel.trace("block_bio_queue") $q:struct request_queue* $bio:struct bio*  
kernel.trace("block_getrq") $q:struct request_queue* $bio:struct bio* $rw:int  
这些trace的source在/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/include/trace/events中可找到,  
例如block_bio_backmerge trace :   
[root@db-172-16-3-150 events]# grep -rn block_bio_backmerge /usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/include/trace/events  
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/include/trace/events/block.h:203:DEFINE_EVENT(block_bio, block_bio_backmerge,  
对应的source code.  
/*  
 * Definition of the block_bio_backmerge trace event, as located by the  
 * grep above (include/trace/events/block.h, line 203).  
 *  
 * TP_PROTO gives the prototype of the tracepoint: a request queue and a  
 * bio.  TP_ARGS names the same two arguments, q and bio; these are the  
 * $q and $bio context variables that stap -L lists for this trace.  
 *  
 * NOTE(review): the first macro argument, block_bio, is presumably the  
 * event class/template shared by the other block_bio_* events -- confirm  
 * against the class declaration in block.h.  
 */  
DEFINE_EVENT(block_bio, block_bio_backmerge,  
  
        TP_PROTO(struct request_queue *q, struct bio *bio),  
  
        TP_ARGS(q, bio)  
);  
通过trace name, 追踪这个trace在哪些函数中被调用了.  
[root@db-172-16-3-150 events]# grep -rn block_bio_backmerge /usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/  
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/include/trace/events/block.h:203:DEFINE_EVENT(block_bio, block_bio_backmerge,  
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/block/blk-core.c:1421:         trace_block_bio_backmerge(q, bio);  
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/kernel/trace/blktrace.c:946:   ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);  
/usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/kernel/trace/blktrace.c:982:   unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);  
例如以上输出的block/blk-core.c文件中调用了这个trace, 打开这个文件可以看到, 它是在函数blk_queue_bio中被调用的.  
[root@db-172-16-3-150 events]# less /usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/block/blk-core.c  
/*  
 * blk_queue_bio - queue @bio on request queue @q.  
 *  
 * Asks the elevator (elv_merge) whether @bio can be merged into an  
 * existing request.  On a back or front merge the bio is linked into that  
 * request and the matching block_bio_backmerge / block_bio_frontmerge  
 * tracepoint is hit.  Otherwise a new request is obtained with  
 * get_request_wait(), initialised from the bio and added to the elevator.  
 *  
 * Always returns 0.  Error conditions are signalled by completing the bio  
 * with bio_endio(): -EOPNOTSUPP for a BIO_RW_BARRIER bio, -ENODEV when  
 * get_request_wait() returns NULL.  
 */  
int blk_queue_bio(struct request_queue *q, struct bio *bio)  
{  
        struct request *req;  
        int el_ret;  
        unsigned int bytes = bio->bi_size;  
        const unsigned short prio = bio_prio(bio);  
        const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);  
        const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);  
        const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;  
        int where = ELEVATOR_INSERT_SORT;  
        int rw_flags;  
  
        /* BIO_RW_BARRIER is deprecated */  
        if (WARN_ONCE(bio_rw_flagged(bio, BIO_RW_BARRIER),  
                "block: BARRIER is deprecated, use FLUSH/FUA instead\n")) {  
                bio_endio(bio, -EOPNOTSUPP);  
                return 0;  
        }  
  
        /*  
         * low level driver can indicate that it wants pages above a  
         * certain limit bounced to low memory (ie for highmem, or even  
         * ISA dma in theory)  
         */  
        blk_queue_bounce(q, &bio);  
  
        spin_lock_irq(q->queue_lock);  
  
        /* Flush/FUA bios skip the merge attempt and use the FLUSH insert position. */  
        if (bio->bi_rw & (BIO_FLUSH | BIO_FUA)) {  
                where = ELEVATOR_INSERT_FLUSH;  
                goto get_rq;  
        }  
  
        /* Nothing queued, so there is nothing to merge with. */  
        if (elv_queue_empty(q))  
                goto get_rq;  
  
        el_ret = elv_merge(q, &req, bio);  
        switch (el_ret) {  
        case ELEVATOR_BACK_MERGE:  
                BUG_ON(!rq_mergeable(req));  
  
                if (!ll_back_merge_fn(q, req, bio))  
                        break;  
// Right here: this is the call site of the block_bio_backmerge tracepoint.  
// It is reached only when ll_back_merge_fn() returned non-zero, i.e. just  
// before the bio is appended to the tail of the request below.  
                trace_block_bio_backmerge(q, bio);  
  
                if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)  
                        blk_rq_set_mixed_merge(req);  
  
                /* Append the bio at the tail of the request's bio list. */  
                req->biotail->bi_next = bio;  
                req->biotail = bio;  
                req->__data_len += bytes;  
                req->ioprio = ioprio_best(req->ioprio, prio);  
                if (!blk_rq_cpu_valid(req))  
                        req->cpu = bio->bi_comp_cpu;  
                drive_stat_acct(req, 0);  
                elv_bio_merged(q, req, bio);  
                if (!attempt_back_merge(q, req))  
                        elv_merged_request(q, req, el_ret);  
                goto out;  
  
        case ELEVATOR_FRONT_MERGE:  
                BUG_ON(!rq_mergeable(req));  
  
                if (!ll_front_merge_fn(q, req, bio))  
                        break;  
  
                trace_block_bio_frontmerge(q, bio);  
  
                if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {  
                        blk_rq_set_mixed_merge(req);  
                        req->cmd_flags &= ~REQ_FAILFAST_MASK;  
                        req->cmd_flags |= ff;  
                }  
  
                /* Link the bio in at the head of the request's bio list. */  
                bio->bi_next = req->bio;  
                req->bio = bio;  
  
                /*  
                 * may not be valid. if the low level driver said  
                 * it didn't need a bounce buffer then it better  
                 * not touch req->buffer either...  
                 */  
                req->buffer = bio_data(bio);  
                /*  
                 * The merge may happen across partitions  
                 * We must update in_flight value accordingly  
                 */  
                blk_account_io_front_merge(req, bio->bi_sector);  
                req->__sector = bio->bi_sector;  
                req->__data_len += bytes;  
                req->ioprio = ioprio_best(req->ioprio, prio);  
                if (!blk_rq_cpu_valid(req))  
                        req->cpu = bio->bi_comp_cpu;  
                drive_stat_acct(req, 0);  
                elv_bio_merged(q, req, bio);  
                if (!attempt_front_merge(q, req))  
                        elv_merged_request(q, req, el_ret);  
                goto out;  
  
        /* ELV_NO_MERGE: elevator says don't/can't merge. */  
        default:  
                ;  
        }  
  
get_rq:  
        /*  
         * This sync check and mask will be re-done in init_request_from_bio(),  
         * but we need to set it earlier to expose the sync flag to the  
         * rq allocator and io schedulers.  
         */  
        rw_flags = bio_data_dir(bio);  
        if (sync)  
                rw_flags |= REQ_SYNC;  
  
        /*  
         * Grab a free request. This might sleep but can not fail.  
         * Returns with the queue unlocked.  
         */  
        req = get_request_wait(q, rw_flags, bio);  
        if (unlikely(!req)) {  
                bio_endio(bio, -ENODEV);        /* @q is dead */  
                goto out_unlock;  
        }  
  
        /*  
         * After dropping the lock and possibly sleeping here, our request  
         * may now be mergeable after it had proven unmergeable (above).  
         * We don't worry about that case for efficiency. It won't happen  
         * often, and the elevators are able to handle it.  
         */  
        init_request_from_bio(req, bio);  
  
        spin_lock_irq(q->queue_lock);  
        if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||  
            bio_flagged(bio, BIO_CPU_AFFINE))  
                req->cpu = raw_smp_processor_id();  
              
        if (queue_should_plug(q) && elv_queue_empty(q))  
                blk_plug_device(q);  
  
        /* insert the request into the elevator */  
        drive_stat_acct(req, 1);  
        __elv_add_request(q, req, where, 0);  
out:  
        /* Reached by both merge cases and by the new-request path. */  
        if (unplug || !queue_should_plug(q))  
                __generic_unplug_device(q);  
out_unlock:  
        spin_unlock_irq(q->queue_lock);  
        return 0;  
}  
EXPORT_SYMBOL_GPL(blk_queue_bio);       /* for device mapper only */  

参考

1. /usr/src/debug/kernel-2.6.32-358.el6/linux-2.6.32-358.el6.x86_64/include/trace/events

Flag Counter

digoal’s 大量PostgreSQL文章入口