[DTrace snippets for ZFS at the vnode (file) level] Snippets for observing core ZFS filesystem functions such as zfs_read and zfs_write, used to measure filesystem latency and I/O sizes.
dtrace -qn '
/*
 * This script is intended for measuring basic characteristics of
 * the zio_{read,write} functions in ZFS. These functions are a good
 * proxy for how the underlying storage is performing under current load.
 * Resulting data is CSV-formatted for easy digestion by various command-line
 * tools and import into R and other statistical tools.
 * Totals are normalized to a per-second rate, i.e. amount of time spent
 * in function for each sample is per second of real time.
 */
int start, interval, rmin, rmax, wmin, wmax, rdata, wdata;
int cnt[string] ;
int ldelta[string] ;
signed long lastema[string] ; /* exp. moving avg. associative array */
/*
 * Weight for EMA calculation W = 2 ÷ (N + 1), where N is number of periods.
 * Because we cannot do math with floats in dtrace, we assume that a 20 period
 * EMA is 2 ÷ (20 + 1) ~= 0.0952 to four decimal points. Multiplying by 10000
 * results in chosen weight.
 */
inline int WEIGHT = 952 ;
BEGIN {
printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
"delta", "rlat", "rlat.ema", "rminlat", "rmaxlat", "rcnt",
"read.tot.tm", "read.range", "wlat", "wlat.ema", "wminlat",
"wmaxlat", "wcnt", "write.tot.tm", "write.range") ;
start = timestamp ;
}
/*
 * To avoid losing information, due to lack of floating point arithmetic
 * support in dtrace, we avoid conversion from nanoseconds to microseconds
 * in the return clauses, where we compute delta between entry and return
 * triggers. This should be easy to accomplish later on, if necessary in the
 * data processing pipeline.
 */
::zio_read:entry,::zio_write:entry { self->s = timestamp ; }
/*
 * Guard on self->s as well: a return probe may fire for a call that was
 * already in flight when tracing started, in which case self->s is 0 and
 * timestamp - self->s would produce a bogus, near-timestamp-sized delta.
 */
::zio_read:return /self->s && timestamp > self->s/ {
this->delta = timestamp - self->s ;
rmin = rmin == 0 ? this->delta : rmin > this->delta ? this->delta : rmin ;
rmax = rmax == 0 ? this->delta : rmax < this->delta ? this->delta : rmax ;
@rlat = avg(this->delta) ;
@rminlat = min(this->delta) ;
@rmaxlat = max(this->delta) ;
@rcnt = count() ;
@rtottm = sum(this->delta) ;
rdata = 1 ; cnt[probefunc]++ ;
}
::zio_write:return /self->s && timestamp > self->s/ {
this->delta = timestamp - self->s ;
wmin = wmin == 0 ? this->delta : wmin > this->delta ? this->delta : wmin ;
wmax = wmax == 0 ? this->delta : wmax < this->delta ? this->delta : wmax ;
@wlat = avg(this->delta) ;
@wminlat = min(this->delta) ;
@wmaxlat = max(this->delta) ;
@wcnt = count() ;
@wtottm = sum(this->delta) ;
wdata = 1 ; cnt[probefunc]++ ;
}
/*
 * Consider this a required first interval to seed EMA,
 * and do not use this sample for additional computations.
 */
::zio_read:return,::zio_write:return
/self->s && lastema[probefunc] == 0 && this->delta/ {
lastema[probefunc] = this->delta ;
self->s = 0 ;
}
/* 1/10th of the time do EMA computation. We are purposely skipping over
 * some observations to make the data smoother.
 */
::zio_read:return,::zio_write:return
/self->s && lastema[probefunc] > 0 && this->delta > 0
&& (cnt[probefunc] % 10) == 0/ {
/* If count is > 10000, reset it to 0, start over. */
cnt[probefunc] = cnt[probefunc] > 10000 ? 0 : cnt[probefunc] ;
/*
 * Compute exponential moving average, and due to lack of a better means
 * store values in a avg() aggregate, which is not ideal, but it is better
 * than nothing at all.
 */
this->ema = ((WEIGHT * this->delta) +
((10000 - WEIGHT) * lastema[probefunc])) / 10000 ;
lastema[probefunc] = this->ema ;
self->s = 0 ;
}
::zio_read:return /this->ema/ {
@rlatema = avg(this->ema) ;
this->ema = 0 ;
}
::zio_write:return /this->ema/ {
@wlatema = avg(this->ema) ;
this->ema = 0 ;
}
tick-5sec /!rdata && !wdata/ {
/*
 * If the function calls did not happen, we print NA to make it clear
 * that values are not available, as opposed to printing 0s. The header
 * declares 15 columns, so print the time offset followed by 14 NA fields
 * to keep rows aligned with the header.
 */
this->toffset = (timestamp - start) / 1000000000 ;
printf("%d,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA\n",
this->toffset) ;
interval = this->toffset ;
}
tick-5sec /rdata || wdata/ {
/*
 * We are only using a sum here because we need something that will store
 * results as an aggregated variable, which we can print with printa.
 */
this->toffset = (timestamp - start) / 1000000000 ;
@rrng = sum(rdata > 0 ? rmax - rmin : 0) ;
@wrng = sum(wdata > 0 ? wmax - wmin : 0) ;
normalize(@rtottm, this->toffset - interval) ;
normalize(@wtottm, this->toffset - interval) ;
normalize(@rcnt, this->toffset - interval) ;
normalize(@wcnt, this->toffset - interval) ;
printf("%d,", this->toffset) ;
printa("%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@rlat, @rlatema, @rminlat, @rmaxlat, @rcnt, @rtottm, @rrng,
@wlat, @wlatema, @wminlat, @wmaxlat, @wcnt, @wtottm, @wrng) ;
trunc(@rlat) ; trunc(@rlatema) ; trunc(@rminlat) ; trunc(@rmaxlat) ;
trunc(@rcnt) ; trunc(@rtottm) ; trunc(@rrng) ;
trunc(@wlat) ; trunc(@wlatema) ; trunc(@wminlat) ; trunc(@wmaxlat) ;
trunc(@wcnt) ; trunc(@wtottm) ; trunc(@wrng) ;
/*
 * In D the comma operator does not chain assignments, so the original
 * "rmin, rmax, rdata, wmin, wmax, wdata = 0" only cleared wdata.
 * Each variable must be reset individually.
 */
rmin = 0 ; rmax = 0 ; rdata = 0 ;
wmin = 0 ; wmax = 0 ; wdata = 0 ;
interval = this->toffset ;
}'
dtrace -qn '
#pragma D option dynvarsize=8m
int gotdata ;
BEGIN {
printf("pool,io.type,lt100us,lt1000us,lt10000us,lt100ms,lt1000ms,gt1000ms,blkszle1k,blksz2k,blksz4k,blksz8k,blksz16k,blksz32k,blksz64k,blksz128k,blksz256k,blkszgt256k,io.count\n") ;
x["zfs_read"] = "R" ;
x["zfs_write"] = "W" ;
}
/*
 * NOTE(review): strtok treats its second argument as a SET of delimiter
 * characters, not a literal substring, so "/storage/" splits v_path on any
 * of the characters "/", "s", "t", "o", "r", "a", "g", "e". Confirm the
 * first token really is the pool name for the paths in use here; the
 * strstr(..., "p") predicate suggests pools are expected to contain "p".
 */
::zfs_read:entry,::zfs_write:entry
/strstr(strtok(args[0]->v_path, "/storage/"), "p") != NULL/ {
self->b = args[1]->uio_resid ; /* Number of bytes to read or write */
self->p = strtok(args[0]->v_path, "/storage/") ; /* Should be pool name */
self->s = timestamp ;
gotdata = 1 ;
}
::zfs_read:return,::zfs_write:return /self->s/ {
/*
 * Delta is converted from nanoseconds to microseconds, since we
 * expect few IOs to complete in a matter of a few nanoseconds,
 * operating with microseconds instead makes sense.
 */
this->delta = (timestamp - self->s) / 1000 ;
this->n = self->p ;
@cnt[stringof(this->n), x[probefunc]] = count();
/*
 * Histogram with 6 bins representing distribution of latencies
 * for issued IOs. Bins range from <=100us up to >1000ms.
 * We use the sum() function to increment by 1 each time a value
 * falls within a bucket, otherwise we leave the bin unchanged.
 */
@lt100us[stringof(this->n), x[probefunc]] =
sum(this->delta <= 100 ? 1 : 0) ;
@lt1000us[stringof(this->n), x[probefunc]] =
sum(this->delta > 100 && this->delta <= 1000 ? 1 : 0) ;
@lt10000us[stringof(this->n), x[probefunc]] =
sum(this->delta > 1000 && this->delta <= 10000 ? 1 : 0) ;
@lt100ms[stringof(this->n), x[probefunc]] =
sum(this->delta > 10000 && this->delta <= 100000 ? 1 : 0) ;
@lt1000ms[stringof(this->n), x[probefunc]] =
sum(this->delta > 100000 && this->delta <= 1000000 ? 1 : 0) ;
@gt1000ms[stringof(this->n), x[probefunc]] =
sum(this->delta > 1000000 ? 1 : 0) ;
/*
 * Histogram with 10 bins representing distribution of IO sizes
 * for issued IOs. Bins range from <=1K up to >256K.
 * We use the sum() function to increment by 1 each time a value
 * falls within a bucket, otherwise we leave the bin unchanged.
 */
@blkszle1k[stringof(this->n), x[probefunc]] =
sum(self->b <= 0x400 ? 1 : 0) ;
@blksz2k[stringof(this->n), x[probefunc]] =
sum(self->b > 0x400 && self->b <= 0x800 ? 1 : 0) ;
@blksz4k[stringof(this->n), x[probefunc]] =
sum(self->b > 0x800 && self->b <= 0x1000 ? 1 : 0) ;
@blksz8k[stringof(this->n), x[probefunc]] =
sum(self->b > 0x1000 && self->b <= 0x2000 ? 1 : 0) ;
@blksz16k[stringof(this->n), x[probefunc]] =
sum(self->b > 0x2000 && self->b <= 0x4000 ? 1 : 0) ;
@blksz32k[stringof(this->n), x[probefunc]] =
sum(self->b > 0x4000 && self->b <= 0x8000 ? 1 : 0) ;
@blksz64k[stringof(this->n), x[probefunc]] =
sum(self->b > 0x8000 && self->b <= 0x10000 ? 1 : 0) ;
@blksz128k[stringof(this->n), x[probefunc]] =
sum(self->b > 0x10000 && self->b <= 0x20000 ? 1 : 0) ;
@blksz256k[stringof(this->n), x[probefunc]] =
sum(self->b > 0x20000 && self->b <= 0x40000 ? 1 : 0) ;
@blkszgt256k[stringof(this->n), x[probefunc]] =
sum(self->b > 0x40000 ? 1 : 0) ;
self->b = 0 ;
self->p = 0 ;
self->s = 0 ;
}
/* Emit one CSV row per (pool, io.type) each second, then clear all bins. */
tick-1sec /gotdata/ {
printa("%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@lt100us, @lt1000us, @lt10000us, @lt100ms, @lt1000ms,
@gt1000ms, @blkszle1k, @blksz2k, @blksz4k, @blksz8k, @blksz16k,
@blksz32k, @blksz64k, @blksz128k, @blksz256k, @blkszgt256k, @cnt
) ;
trunc(@lt100us) ; trunc(@lt1000us) ; trunc(@lt10000us) ;
trunc(@lt100ms) ; trunc(@lt1000ms) ; trunc(@gt1000ms) ;
trunc(@blkszle1k) ; trunc(@blksz2k) ; trunc(@blksz4k) ;
trunc(@blksz8k) ; trunc(@blksz16k) ; trunc(@blksz32k) ;
trunc(@blksz64k) ; trunc(@blksz128k) ; trunc(@blksz256k) ;
trunc(@blkszgt256k) ; trunc(@cnt) ;
gotdata = 0 ;
}'
dtrace -xdynvarsize=10M -qn '
BEGIN {
MSEC = 1000000;
MICROSEC = 1000;
threshold = 50 * MSEC;
div = "--------------------------------------------------------------------------------";
}
fbt:zfs::entry {
self->start[probefunc] = timestamp;
}
/*
 * Record an observation only when this thread saw the matching entry
 * (self->start[probefunc] != 0) and the elapsed time exceeds the
 * threshold. Without the start check, an unmatched return (entry fired
 * before tracing was enabled) computes timestamp - 0 and always
 * "exceeds" the threshold, polluting the histogram with garbage.
 */
fbt:zfs::return
/self->start[probefunc] && (timestamp - self->start[probefunc]) > threshold/ {
@["delta (ms)", probefunc] = quantize(
(timestamp - self->start[probefunc]) / MSEC);
}
/*
 * Always release the thread-local slot, even for calls under the
 * threshold; otherwise entries that never exceed it leak dynamic
 * variable space until dynvardrops occur.
 */
fbt:zfs::return /self->start[probefunc]/ {
self->start[probefunc] = 0;
}
tick-5sec {
printf("%s\n", div); printf("%Y\n", walltimestamp); printa(@); trunc(@);
}'
#!/bin/sh
dtrace -xdynvarsize=10M -qn '
BEGIN {
MSEC = 1000000;
MICROSEC = 1000;
threshold = 50 * MSEC;
div = "--------------------------------------------------------------------------------";
}
fbt:zfs::entry {
self->start[probefunc] = timestamp;
}
/*
 * Record an observation only when this thread saw the matching entry
 * (self->start[probefunc] != 0) and the elapsed time exceeds the
 * threshold. Without the start check, an unmatched return (entry fired
 * before tracing was enabled) computes timestamp - 0 and always
 * "exceeds" the threshold, polluting the histogram with garbage.
 */
fbt:zfs::return
/self->start[probefunc] && (timestamp - self->start[probefunc]) > threshold/ {
@["delta (ms)", probefunc] = quantize(
(timestamp - self->start[probefunc]) / MSEC);
}
/*
 * Always release the thread-local slot, even for calls under the
 * threshold; otherwise entries that never exceed it leak dynamic
 * variable space until dynvardrops occur.
 */
fbt:zfs::return /self->start[probefunc]/ {
self->start[probefunc] = 0;
}
tick-5sec {
printf("%s\n", div); printf("%Y\n", walltimestamp); printa(@); trunc(@);
}' >> /root/long-running.log
dtrace -n 'fbt::fop_read:entry /args[0]->v_op->vnop_name == "zfs"/ {
self->start = timestamp; } fbt::fop_read:return /self->start/ {
@[execname, "ns"] = quantize(timestamp - self->start); self->start = 0; }'
dtrace -qn '
/* Per-file fsync latency and call counts, keyed by program and path. */
::fop_fsync:entry /args[0]->v_path != NULL/ {
self->p = args[0]->v_path;
self->tick = timestamp;
}
/*
 * This clause must fire on :return, not :entry as originally written.
 * It consumes self->tick set by the entry clause above; probing entry a
 * second time would measure the gap between successive fsync calls in
 * the same thread instead of the duration of a single fsync.
 */
::fop_fsync:return /self->p/ {
this->delta = timestamp - self->tick;
@lat[execname, stringof(self->p)] = quantize(this->delta);
@[execname, stringof(self->p)] = count();
self->tick = 0; self->p = 0;
}
END {
printa("prog: %s path: %s count: %@d\n%@d\n", @, @lat);
}'