diff --git a/README.md b/README.md index 045e8777bb14..f731c0a2b7f8 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,7 @@ pair of .c and .py files, and some are directories of files. - tools/[uthreads](tools/lib/uthreads.py): Trace thread creation events in Java and raw pthreads. [Examples](tools/lib/uthreads_example.txt). - tools/[vfscount](tools/vfscount.py): Count VFS calls. [Examples](tools/vfscount_example.txt). - tools/[vfsstat](tools/vfsstat.py): Count some VFS calls, with column output. [Examples](tools/vfsstat_example.txt). +- tools/[virtiostat](tools/virtiostat.py): Show VIRTIO device IO statistics. [Examples](tools/virtiostat_example.txt). - tools/[wakeuptime](tools/wakeuptime.py): Summarize sleep to wakeup time by waker kernel stack. [Examples](tools/wakeuptime_example.txt). - tools/[xfsdist](tools/xfsdist.py): Summarize XFS operation latency distribution as a histogram. [Examples](tools/xfsdist_example.txt). - tools/[xfsslower](tools/xfsslower.py): Trace slow XFS operations. [Examples](tools/xfsslower_example.txt). diff --git a/man/man8/virtiostat.8 b/man/man8/virtiostat.8 new file mode 100644 index 000000000000..93c3152dfad8 --- /dev/null +++ b/man/man8/virtiostat.8 @@ -0,0 +1,56 @@ +.TH virtiostat 8 "2021-02-15" "USER COMMANDS" +.SH NAME +virtiostat \- Trace VIRTIO devices input/output statistics. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B virtiostat [\-h] [\-T] [\-D] [INTERVAL] [COUNT] +.SH DESCRIPTION +This tool traces VIRTIO devices input/output statistics. It works in lower +layer of VIRTIO base driver, so it could trace all the devices of VIRTIO +family. For example, we can't get IO statistics of 9p-fs in a guest virtual +machine by iostat command, but we can analyze IO statistics by virtiostat. +The outputing result shows In/Out SGs(scatter list operation) to represent +positive correlation IOPS, and In/Out BW to represent throughput. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-T +Include a time column on output (HH:MM:SS). +.TP +\-D +Show debug infomation of bpf text. +.TP +INTERVAL +Print output every interval seconds. +.TP +COUNT +Total count of trace in seconds. +.SH EXAMPLES +.TP +Trace virtio device statistics and print 1 second summaries, 10 times: +# +.B virtiostat 1 10 +.SH OVERHEAD +This traces the kernel virtqueue_add_sgs, virtqueue_add_outbuf, +virtqueue_add_inbuf, virtqueue_add_inbuf_ctx functions. +The rate of this depends on all the VIRTIO devices IOPS. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +zhenwei pi +.SH SEE ALSO +iostat(1), iftop(8), funccount(8) diff --git a/tools/virtiostat.py b/tools/virtiostat.py new file mode 100755 index 000000000000..b36a7f5a6562 --- /dev/null +++ b/tools/virtiostat.py @@ -0,0 +1,238 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# virtiostat Show virtio devices input/output statistics. +# For Linux, uses BCC, eBPF. +# +# USAGE: virtiostat [-h] [-T] [-D] [INTERVAL] [COUNT] +# +# Copyright (c) 2021 zhenwei pi +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 13-Feb-2021 zhenwei pi Created this. + +from __future__ import print_function +from bcc import BPF +from time import sleep, strftime +import argparse + +# arguments +examples = """examples: + ./virtiostat # print 3(default) second summaries + ./virtiostat 1 10 # print 1 second summaries, 10 times + ./virtiostat -T # show timestamps + ./virtiostat -D # show debug bpf text +""" +parser = argparse.ArgumentParser( + description="Show virtio devices input/output statistics", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("interval", nargs="?", default=3, + help="output interval, in seconds") +parser.add_argument("count", nargs="?", default=99999999, + help="number of outputs") +parser.add_argument("-T", "--timestamp", action="store_true", + help="show timestamp on output") +parser.add_argument("-D", "--debug", action="store_true", + help="print BPF program before starting (for debugging purposes)") +parser.add_argument("--ebpf", action="store_true", + help=argparse.SUPPRESS) +args = parser.parse_args() + +# define BPF program +bpf_text = """ +#ifndef KBUILD_MODNAME +#define KBUILD_MODNAME "bcc" +#endif +#include +#include + +/* typically virtio scsi has max SGs of 6 */ +#define VIRTIO_MAX_SGS 6 +/* typically virtio blk has max SEG of 128 */ +#define SG_MAX 128 + +typedef struct virtio_stat { + char driver[16]; + char dev[12]; + char vqname[12]; + u32 in_sgs; + u32 out_sgs; + u64 in_bw; + u64 out_bw; +} virtio_stat_t; + +BPF_HASH(stats, u64, virtio_stat_t); + +static struct scatterlist *__sg_next(struct scatterlist *sgp) +{ + struct scatterlist sg; + + bpf_probe_read_kernel(&sg, sizeof(sg), sgp); + if (sg_is_last(&sg)) + return NULL; + + sgp++; + + bpf_probe_read_kernel(&sg, sizeof(sg), sgp); + if (unlikely(sg_is_chain(&sg))) + sgp = sg_chain_ptr(&sg); + + return sgp; +} + +static u64 count_len(struct scatterlist **sgs, unsigned int num) +{ + u64 length = 0; + unsigned int i, n; + struct scatterlist *sgp = NULL; + + for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) { + for (n = 0, sgp = sgs[i]; sgp && (n < SG_MAX); sgp = __sg_next(sgp)) { + length += sgp->length; + n++; + } + + /* Suggested by Yonghong Song: + * IndVarSimplifyPass with clang 12 may cause verifier failure: + * ; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) { // Line 60 + * 90: 15 08 15 00 00 00 00 00 if r8 == 0 goto +21 + * 91: bf 81 00 00 00 00 00 00 r1 = r8 + * 92: 07 01 00 00 ff ff ff ff r1 += -1 + * 93: 67 01 00 00 20 00 00 00 r1 <<= 32 + * 94: 77 01 00 00 20 00 00 00 r1 >>= 32 + * 95: b7 02 00 00 05 00 00 00 r2 = 5 + * 96: 2d 12 01 00 00 00 00 00 if r2 > r1 goto +1 + * 97: b7 08 00 00 06 00 00 00 r8 = 6 + * 98: b7 02 00 00 00 00 00 00 r2 = 0 + * 99: b7 09 00 00 00 00 00 00 r9 = 0 + * 100: 7b 8a 68 ff 00 00 00 00 *(u64 *)(r10 - 152) = r8 + * 101: 05 00 35 00 00 00 00 00 goto +53 + * Note that r1 is refined by r8 is saved to stack for later use. + * This will give verifier u64_max loop bound and eventually cause + * verification failure. Workaround with the below asm code. + */ +#if __clang_major__ >= 7 + asm volatile("" : "=r"(i) : "0"(i)); +#endif + } + + return length; +} + +static void record(struct virtqueue *vq, struct scatterlist **sgs, + unsigned int out_sgs, unsigned int in_sgs) +{ + virtio_stat_t newvs = {0}; + virtio_stat_t *vs; + u64 key = (u64)vq; + u64 in_bw = 0; + + /* Workaround: separate two count_len() calls, one here and the + * other below. Otherwise, compiler may generate some spills which + * harms verifier pruning. This happens in llvm12, but not llvm4. + * Below code works on both cases. + */ + if (in_sgs) + in_bw = count_len(sgs + out_sgs, in_sgs); + + vs = stats.lookup(&key); + if (!vs) { + bpf_probe_read_kernel(newvs.driver, sizeof(newvs.driver), vq->vdev->dev.driver->name); + bpf_probe_read_kernel(newvs.dev, sizeof(newvs.dev), vq->vdev->dev.kobj.name); + bpf_probe_read_kernel(newvs.vqname, sizeof(newvs.vqname), vq->name); + newvs.out_sgs = out_sgs; + newvs.in_sgs = in_sgs; + if (out_sgs) + newvs.out_bw = count_len(sgs, out_sgs); + newvs.in_bw = in_bw; + stats.update(&key, &newvs); + } else { + vs->out_sgs += out_sgs; + vs->in_sgs += in_sgs; + if (out_sgs) + vs->out_bw += count_len(sgs, out_sgs); + vs->in_bw += in_bw; + } +} + +int trace_virtqueue_add_sgs(struct pt_regs *ctx, struct virtqueue *vq, + struct scatterlist **sgs, unsigned int out_sgs, + unsigned int in_sgs, void *data, gfp_t gfp) + +{ + record(vq, sgs, out_sgs, in_sgs); + + return 0; +} + +int trace_virtqueue_add_outbuf(struct pt_regs *ctx, struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, gfp_t gfp) +{ + record(vq, &sg, 1, 0); + + return 0; +} + +int trace_virtqueue_add_inbuf(struct pt_regs *ctx, struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, gfp_t gfp) +{ + record(vq, &sg, 0, 1); + + return 0; +} + +int trace_virtqueue_add_inbuf_ctx(struct pt_regs *ctx, struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, void *_ctx, gfp_t gfp) +{ + record(vq, &sg, 0, 1); + + return 0; +} +""" + +# debug mode: print bpf text +if args.debug: + print(bpf_text) + +# dump mode: print bpf text and exit +if args.ebpf: + print(bpf_text) + exit() + +# load BPF program +b = BPF(text=bpf_text) +b.attach_kprobe(event="virtqueue_add_sgs", fn_name="trace_virtqueue_add_sgs") +b.attach_kprobe(event="virtqueue_add_outbuf", fn_name="trace_virtqueue_add_outbuf") +b.attach_kprobe(event="virtqueue_add_inbuf", fn_name="trace_virtqueue_add_inbuf") +b.attach_kprobe(event="virtqueue_add_inbuf_ctx", fn_name="trace_virtqueue_add_inbuf_ctx") + +print("Tracing virtio devices statistics ... Hit Ctrl-C to end.") + +# start main loop +exiting = 0 if args.interval else 1 +seconds = 0 +while (1): + try: + sleep(int(args.interval)) + seconds = seconds + int(args.interval) + except KeyboardInterrupt: + exiting = 1 + + if args.timestamp: + print("%-8s\n" % strftime("%H:%M:%S"), end="") + else: + print("--------", end="\n") + + print("%14s %8s %10s %7s %7s %14s %14s" % ("Driver", "Device", "VQ Name", "In SGs", "Out SGs", "In BW", "Out BW")) + stats = b.get_table("stats") + for k, v in sorted(stats.items(), key=lambda vs: vs[1].dev): + print("%14s %8s %10s %7d %7d %14d %14d" % (v.driver, v.dev, v.vqname, v.in_sgs, v.out_sgs, v.in_bw, v.out_bw)) + + stats.clear() + + if exiting or seconds >= int(args.count): + exit() diff --git a/tools/virtiostat_example.txt b/tools/virtiostat_example.txt new file mode 100644 index 000000000000..7eff46397713 --- /dev/null +++ b/tools/virtiostat_example.txt @@ -0,0 +1,46 @@ +Demonstrations of virtiostat, the Linux eBPF/bcc version. + + +This program traces virtio devices to analyze the IO operations and +throughput. For example, guest side mounts a 9p fs, and we can't get +io statistics by `iostat` command any more. In this scenario, we can +only get statistics from VIRTIO layer instead of block layer. + +Example +#./virtiostat -T +Tracing virtio devices statistics ... Hit Ctrl-C to end. +14:48:30 + Driver Device VQ Name In SGs Out SGs In BW Out BW + virtio_net virtio0 input.0 260669 0 406743040 0 + virtio_net virtio0 output.0 0 9873 0 833344 + virtio_blk virtio4 req.0 28 46 448 278976 + 9pnet_virtio virtio6 requests 99083 99354 1883687 137537263 +14:48:33 + Driver Device VQ Name In SGs Out SGs In BW Out BW + virtio_net virtio0 input.0 260718 0 406819328 0 + virtio_net virtio0 output.0 0 7139 0 562355 + virtio_blk virtio4 req.0 11 18 176 110768 + 9pnet_virtio virtio6 requests 91520 91141 1737364 125320785 + + +Full USAGE: + +#./virtiostat -h +usage: virtiostat.py [-h] [-T] [-D] [interval] [count] + +Show virtio devices input/output statistics + +positional arguments: + interval output interval, in seconds + count number of outputs + +optional arguments: + -h, --help show this help message and exit + -T, --timestamp show timestamp on output + -D, --debug print BPF program before starting (for debugging purposes) + +examples: + ./virtiostat # print 3(default) second summaries + ./virtiostat 1 10 # print 1 second summaries, 10 times + ./virtiostat -T # show timestamps + ./virtiostat -D # show debug bpf text