[Dtrace NFS snippets] Scripts and One-liners gathering data about NFS operations and performance #tags: nfs, nfsv3, nfsv4, latency, nfs operations
dtrace -qn'
BEGIN {
printf("timestamp,cmd,path,count\n");
typ["op-access-done"] = "A";
typ["op-create-done"] = "C";
typ["op-getattr-done"] = "G";
typ["op-read-done"] = "R";
typ["op-write-done"] = "W";
}
tick-5sec {
ts = walltimestamp;
}
nfsv3:::op-access-done,
nfsv3:::op-create-done,
nfsv3:::op-getattr-done,
nfsv3:::op-read-done,
nfsv3:::op-write-done {
this->path = args[1]->noi_curpath == NULL ?
"<Unknown>" : args[1]->noi_curpath;
}
nfsv3:::op-access-done,
nfsv3:::op-create-done,
nfsv3:::op-getattr-done,
nfsv3:::op-read-done,
nfsv3:::op-write-done /ts && strstr(this->path, "vmdk")!=0 / {
@[ts, typ[probename], this->path] = count();
data = 1;
}
tick-60sec /data/ {
trunc(@, 10);
printa("%Y,%s,%s,%@d\n", @);
trunc(@);
data = 0;
}'
dtrace -qn '
/*
* Measure how random or sequential NFSv3 IO is, and bucket it by
* client IP address. Print results every 5 seconds by default.
* Uncomment lines with @h aggregates to get histograms which show
* common offsets. This is a good way to know just how sequential
* sequential IO is, and in what offsets it is happening, i.e. 4K, 64K, etc.
*/
int offset[string, string];
int lastdelta[string, string];
int havedata;
BEGIN {
printf("command,ip.addr,rand.count,seq.count\n");
map["op-read-start"] = "R";
map["op-write-start"] = "W";
}
nfsv3:::op-read-start,nfsv3:::op-write-start
/!offset[map[probename], args[0]->ci_remote]/ {
offset[map[probename], args[0]->ci_remote] = args[2]->offset;
lastdelta[map[probename], args[0]->ci_remote] = 0;
}
nfsv3:::op-read-start,nfsv3:::op-write-start
/* Previous offset is greater than this offset */
/offset[map[probename], args[0]->ci_remote] > args[2]->offset/ {
this->dist = offset[map[probename], args[0]->ci_remote] - args[2]->offset;
/* @h[map[probename], args[0]->ci_remote] = quantize(this->dist); */
@seq[map[probename], args[0]->ci_remote] =
sum(this->dist == lastdelta[map[probename], args[0]->ci_remote]);
@rand[map[probename], args[0]->ci_remote] =
sum(this->dist != lastdelta[map[probename], args[0]->ci_remote]);
offset[map[probename], args[0]->ci_remote] = args[2]->offset;
lastdelta[map[probename], args[0]->ci_remote] = this->dist;
havedata = 1;
}
nfsv3:::op-read-start,nfsv3:::op-write-start
/* Previous offset is less than this offset */
/offset[map[probename], args[0]->ci_remote] < args[2]->offset/ {
this->dist = args[2]->offset - offset[map[probename], args[0]->ci_remote];
/* @h[map[probename], args[0]->ci_remote] = quantize(this->dist); */
@seq[map[probename], args[0]->ci_remote] =
sum(this->dist == lastdelta[map[probename], args[0]->ci_remote]);
@rand[map[probename], args[0]->ci_remote] =
sum(this->dist != lastdelta[map[probename], args[0]->ci_remote]);
offset[map[probename], args[0]->ci_remote] = args[2]->offset;
lastdelta[map[probename], args[0]->ci_remote] = this->dist;
havedata = 1;
}
tick-5sec /havedata/ {
printa("%s,%s,%@d,%@d\n", @rand, @seq);
trunc(@rand); trunc(@seq);
/* trunc(@h); */
havedata = 0;
}'
dtrace -qn'
#pragma D option aggsize=16m
#pragma D option bufsize=8m
#pragma D option dynvarsize=8m
string typ[string];
BEGIN
{
printf("source,cmd,path,IOs,avg.lat.us,bytes\n");
typ["op-access-done"] = "A";
typ["op-create-done"] = "C";
typ["op-getattr-done"] = "G";
typ["op-read-done"] = "R";
typ["op-write-done"] = "W";
}
/*
* Ignoring all R/W IOs less than 1K. If doing some really tiny <1K IOs,
* lets pretend these IOs did not happen.
*/
nfsv3:::op-read-start,
nfsv3:::op-write-start /args[2]->count > 1 << 10/ { self->bytes = args[2]->count ; }
nfsv3:::op-access-start, nfsv3:::op-create-start, nfsv3:::op-getattr-start,
nfsv3:::op-read-start, nfsv3:::op-write-start
{
self->src_addr = args[0]->ci_remote;
tm[self->src_addr, args[1]->noi_xid] = timestamp;
self->have_path = args[1]->noi_curpath == NULL ? 0 : 1;
/* We should make sure we handle empty value as well */
self->path = args[1]->noi_curpath != "" ?
args[1]->noi_curpath : "NOPATH";
}
/*
* Continue to ignore any R/W IOs that are less than 1K.
*/
nfsv3:::op-read-done, nfsv3:::op-write-done
/self->have_path &&
tm[self->src_addr, args[1]->noi_xid] &&
self->bytes > 1 << 10
/
{
@ios[self->src_addr,
typ[probename], self->path] = count();
@lat_us[self->src_addr,
typ[probename], self->path] =
avg((timestamp - tm[self->src_addr, args[1]->noi_xid]) / 1000);
@s[self->src_addr,
typ[probename], self->path] = sum(self->bytes);
++data;
}
nfsv3:::op-access-done, nfsv3:::op-create-done, nfsv3:::op-getattr-done /self->have_path && tm[self->src_addr, args[1]->noi_xid]/
{
@ios[self->src_addr,
typ[probename], self->path] =
count();
@lat_us[self->src_addr,
typ[probename], self->path] =
avg((timestamp - tm[self->src_addr, args[1]->noi_xid]) / 1000);
@s[self->src_addr,
typ[probename], self->path] =
sum(0);
++data;
}
nfsv3:::op-access-done,
nfsv3:::op-create-done,
nfsv3:::op-getattr-done,
nfsv3:::op-read-done,
nfsv3:::op-write-done {
/* Zero out things to make sure we avoid issues with dynvar drops */
self->bytes = 0;
self->have_path = 0;
tm[self->src_addr, args[1]->noi_xid] = 0;
self->src_addr = 0;
self->path = 0;
}
tick-1sec /data/
{
/*
* Shrink each aggregation to 100 observations first, then print those
* 100 observations, which will not mean we are limited to 100 unique
* connection addresses, or 100 unique paths. As it stands, aggregations
* right now use a tuple for their key, which is a set of three values:
* client address, command, path. This means that in theory we could have:
* (client address) x (command) x (path) number of unique keys.
*/
trunc(@ios, 100); trunc(@lat_us, 100); trunc(@s, 100);
printa("%s,%s,%s,%@d,%@d,%@d\n", @ios, @lat_us, @s);
trunc(@ios); trunc(@lat_us); trunc(@s);
data = 0;
}'
dtrace -qn '
inline const int NORM_KB = 1024;
inline const int NS_S = 1000000000;
inline const int NS_MS = 1000000;
inline const int NS_US = 1000;
int cnt, realtm, start, epoch;
self int nfs_r;
#pragma D option aggsize=16m
#pragma D option bufsize=50m
#pragma D option dynvarsize=50m
BEGIN {
printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
"io.type", "runtime.ms", "min.lat", "max.lat", "avg.lat",
"max.io.size", "iops", "throughput.KBps",
"512b.ct", "1024b.ct", "2048b.ct", "4096b.ct", "8192b.ct",
"16384b.ct", "32768b.ct", "65536b.ct", "131072b.ct",
"volatile", "fsync", "dsync", "arc.hit.ct", "arc.miss.ct",
"arc.size.av", "arc.size.min", "arc.size.max", "arc.mru.size.av",
"arc.mru.size.min", "arc.mru.size.max", "arc.mfu.size.av",
"arc.mfu.size.min", "arc.mfu.size.max", "total.ios", "avg.bytes.per.us"
);
hdr = 1;
epoch = timestamp;
start = timestamp;
/* How stable data is expected to be. */
stable_how[0] = "NOSYNC";
stable_how[1] = "DSYNC";
stable_how[2] = "FSYNC";
ts = walltimestamp;
}
tick-1sec /(timestamp - epoch) >= 0 && cnt > 0/ {
realtm = timestamp - epoch; /* Update real time passed since last sample. */
}
tick-1sec /!cnt/ {
/* Set realtime to 0 as first run, to make sure first reported line starts
* at 0 and increments from there. This should only happen once.
*/
realtm = 0;
++cnt;
}
nfsv3:::op-read-start,
nfsv3:::op-write-start,
nfsv4:::op-read-start {
/* If this is a read, increment counter by one, else leave previous value */
self->nfs_r += probename == "op-read-start" ? 1 : 0;
tm[args[0]->ci_remote, args[1]->noi_xid] = timestamp;
opsz[args[0]->ci_remote, args[1]->noi_xid] = args[2]->count;
flag[args[0]->ci_remote, args[1]->noi_xid] = 0;
}
/*
* sdt:::arc-hit and sdt:::arc-miss give info on direct users of the arc
* if arc-miss, then we do an I/O.
* Unfortunately, we cannot directly correlate a NFS Read with event in the
* ARC, at least not easily.
* We are likely to miss at least some ARC hits and misses here, but this
* should be good enough to get a sense for hits and misses in general.
*/
sdt:::arc-miss /self->nfs_r/ { @arcm["R"] = count(); }
sdt:::arc-hit /self->nfs_r/ { @arch["R"] = count(); }
::arc_read:entry {
this->cmd = self->nfs_r ? "R" : "W";
@arcszmin[this->cmd] = min(`arc_stats.arcstat_size.value.i64);
@arcszmax[this->cmd] = max(`arc_stats.arcstat_size.value.i64);
@arcszav[this->cmd] = avg(`arc_stats.arcstat_size.value.i64);
@arcmruszmin[this->cmd] = min(`arc_stats.arcstat_mru_size.value.i64);
@arcmruszmax[this->cmd] = max(`arc_stats.arcstat_mru_size.value.i64);
@arcmruszav[this->cmd] = avg(`arc_stats.arcstat_mru_size.value.i64);
@arcmfuszmin[this->cmd] = min(`arc_stats.arcstat_mfu_size.value.i64);
@arcmfuszmax[this->cmd] = max(`arc_stats.arcstat_mfu_size.value.i64);
@arcmfuszav[this->cmd] = avg(`arc_stats.arcstat_mfu_size.value.i64);
}
nfsv3:::op-write-start {
flag[args[0]->ci_remote,args[1]->noi_xid] = args[2]->stable;
}
nfsv4:::op-write-start {
flag[args[0]->ci_remote,args[1]->noi_xid] = args[2]->stable;
tm[args[0]->ci_remote,args[1]->noi_xid] = timestamp;
opsz[args[0]->ci_remote,args[1]->noi_xid] = args[2]->data_len;
}
nfsv3:::op-read-done,
nfsv3:::op-write-done,
nfsv4:::op-read-done,
nfsv4:::op-write-done
/tm[args[0]->ci_remote,args[1]->noi_xid]/ {
/* Total time from start of operation to end of same operation. */
this->delta = (timestamp - tm[args[0]->ci_remote, args[1]->noi_xid]);
/*
* Flag indicates volatility mode for this operation, i.e.
* `Unstable`, `Data Sync`, or `File Sync`.
*/
this->flag = flag[args[0]->ci_remote,args[1]->noi_xid];
this->name = probename == "op-write-done" ? "W" : "R";
/*
* This is a hack, it takes advantage of the fact that realtm value will
* always increase, and never decrease. Because we do not have an aggregate
* for storing only last value, this is the best we can do. We are
* effectively storing latest computed value.
*/
@t[this->name] = max(realtm);
this->size = opsz[args[0]->ci_remote,args[1]->noi_xid];
this->buck = 512; /* We assume this is smallest possible size */
this->buck = ( this->size > 512 ) ? 1024 : this->buck ;
this->buck = ( this->size > 1024 ) ? 2048 : this->buck ;
this->buck = ( this->size > 2048 ) ? 4096 : this->buck ;
this->buck = ( this->size > 4096 ) ? 8192 : this->buck ;
this->buck = ( this->size > 8192 ) ? 16384 : this->buck ;
this->buck = ( this->size > 16384 ) ? 32768 : this->buck ;
this->buck = ( this->size > 32768 ) ? 65536 : this->buck ;
this->buck = ( this->size > 65536 ) ? 0x20000 : this->buck ;
this->buck = ( this->size > 0x20000 ) ? 0x40000 : this->buck ;
this->buck = ( this->size > 0x40000 ) ? 0x80000 : this->buck ;
this->buck = ( this->size > 0x80000 ) ? 0x100000 : this->buck ;
this->buck = ( this->size > 0x100000 ) ? 0x200000 : this->buck ;
@lmin[this->name] = min(this->delta);
@lmax[this->name] = max(this->delta);
@a[this->name] = avg(this->delta);
@maxiosz[this->name] = max(this->size);
@i[this->name] = count();
/* Average bytes per microsecond of IO time. */
this->bpus = this->delta > 0 ? (this->size*1000 / this->delta) : this->size;
@abus[this->name] = avg(this->bpus);
/* Do not normalize (@ios) to get total I/Os per interval. */
@ios[this->name] = count();
@b[this->name] = sum(opsz[args[0]->ci_remote, args[1]->noi_xid]);
@sz512[this->name] = sum(this->buck == 512 ? 1 : 0 );
@sz1024[this->name] = sum(this->buck == 1024 ? 1 : 0 );
@sz2048[this->name] = sum(this->buck == 2048 ? 1 : 0 );
@sz4096[this->name] = sum(this->buck == 4096 ? 1 : 0 );
@sz8192[this->name] = sum(this->buck == 8192 ? 1 : 0 );
@sz16384[this->name] = sum(this->buck == 16384 ? 1 : 0 );
@sz32768[this->name] = sum(this->buck == 32768 ? 1 : 0 );
@sz65536[this->name] = sum(this->buck == 65536 ? 1 : 0 );
@sz131072[this->name] = sum(this->buck == 131072 ? 1 : 0 );
/*
* There is an aggregate for each possible flag, and we are counting
* number of occurrences of each, reporting each as a separate field.
*/
@nos[this->name] = sum(this->flag == 0 ? 1 : 0 );
@fs[this->name] = sum(this->flag == 1 ? 1 : 0 );
@ds[this->name] = sum(this->flag == 2 ? 1 : 0 );
tm[args[0]->ci_remote,args[1]->noi_xid] = 0;
opsz[args[0]->ci_remote,args[1]->noi_xid] = 0;
flag[args[0]->ci_remote,args[1]->noi_xid] = 0;
self->nfs_r -= probename == "op-read-done" ? 1 : 0;
}
tick-1sec /(timestamp - start) > 0 && cnt/ {
/*
* Normalizing by 1024 can introduce sample error when the value being
* divided is less than 1024. Since there is no floating point division,
* resulting value will be 0, which if data is routinely less than 1024,
* can create a serious discontinuity between real data and normalized
* data.
*/
normalize(@lmin, NS_US);
normalize(@lmax, NS_US);
normalize(@a, NS_US);
normalize(@b, NORM_KB);
normalize(@t, NS_MS);
printa(
"%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@t, @lmin, @lmax, @a, @maxiosz, @i, @b,
@sz512, @sz1024, @sz2048, @sz4096, @sz8192, @sz16384,
@sz32768, @sz65536, @sz131072, @nos, @fs, @ds,
@arch, @arcm, @arcszav, @arcszmin, @arcszmax,
@arcmruszav, @arcmruszmin, @arcmruszmax, @arcmfuszav, @arcmfuszmin, @arcmfuszmax, @ios, @abus);
/*
* Truncate aggregations and re-set timestamp for next sampling period.
*/
trunc(@t); trunc(@lmin); trunc(@lmax); trunc(@a); trunc(@maxiosz); trunc(@i);
trunc(@b); trunc(@sz512); trunc(@sz1024); trunc(@sz2048); trunc(@sz4096);
trunc(@sz8192); trunc(@sz16384); trunc(@sz32768); trunc(@sz65536);
trunc(@sz131072); trunc(@nos); trunc(@fs); trunc(@ds);
trunc(@ios); trunc(@abus); trunc(@arch); trunc(@arcm);
trunc(@arcszav); trunc(@arcszmin); trunc(@arcszmax);
trunc(@arcmruszav); trunc(@arcmruszmin); trunc(@arcmruszmax);
trunc(@arcmfuszav); trunc(@arcmfuszmin); trunc(@arcmfuszmax);
start = timestamp;
}'
// What is the recordsize of operations, both reads and writes?
dtrace -qn 'nfsv3:::op-commit-start,nfsv3:::op-read-start,nfsv3:::op-write-start {@[probename=="op-write-start" ? "W" : "R"] = quantize(args[2]->count);}'
dtrace -n '
BEGIN {
command["op-commit-start"] = "COMMIT";
command["op-read-start"] = "READ";
command["op-write-start"] = "WRITE";
}
nfsv3:::op-commit-start,nfsv3:::op-read-start,nfsv3:::op-write-start
{
@[command[probename], args[1]->noi_curpath] = quantize(args[2]->count);
}
tick-5sec {
trunc(@, 5); printa(@); trunc(@);
}'