szaydel
10/5/2014 - 5:52 PM

[Dtrace NFS snippets] Scripts and One-liners gathering data about NFS operations and performance #tags: nfs, nfsv3, nfsv4, latency, nfs oper

[Dtrace NFS snippets] Scripts and One-liners gathering data about NFS operations and performance #tags: nfs, nfsv3, nfsv4, latency, nfs operations

dtrace -qn'
    BEGIN {
        printf("timestamp,cmd,path,count\n");
        typ["op-access-done"] = "A";
        typ["op-create-done"] = "C";
        typ["op-getattr-done"] = "G";
        typ["op-read-done"] = "R";
        typ["op-write-done"] = "W";
        
}
    tick-5sec {
        ts = walltimestamp;
    }
    
    nfsv3:::op-access-done,
    nfsv3:::op-create-done,
    nfsv3:::op-getattr-done,
    nfsv3:::op-read-done,
    nfsv3:::op-write-done {
        this->path = args[1]->noi_curpath == NULL ? 
        "<Unknown>" : args[1]->noi_curpath;
    }
    nfsv3:::op-access-done,
    nfsv3:::op-create-done,
    nfsv3:::op-getattr-done,
    nfsv3:::op-read-done,
    nfsv3:::op-write-done /ts && strstr(this->path, "vmdk")!=0 / {
        @[ts, typ[probename], this->path] = count();
        data = 1;
    }
    tick-60sec /data/ {
        trunc(@, 10); 
        printa("%Y,%s,%s,%@d\n", @); 
        trunc(@); 
        data = 0;
    }'
dtrace -qn '
  /*
   * Measure how random or sequential NFSv3 IO is, and bucket it by
   * client IP address. Print results every 5 seconds by default.
   * Uncomment lines with @h aggregates to get histograms which show
   * common offsets. This is a good way to know just how sequential
   * sequential IO is, and in what offsets it is happening, i.e. 4K, 64K, etc.
   */
  int offset[string, string];
  int lastdelta[string, string];
  int havedata;

  BEGIN {
    printf("command,ip.addr,rand.count,seq.count\n");
    map["op-read-start"] = "R";
    map["op-write-start"] = "W";
  }
  nfsv3:::op-read-start,nfsv3:::op-write-start
  /!offset[map[probename], args[0]->ci_remote]/ {
    offset[map[probename], args[0]->ci_remote] = args[2]->offset;
    lastdelta[map[probename], args[0]->ci_remote]  = 0;
  }
  nfsv3:::op-read-start,nfsv3:::op-write-start
  /* Previous offset is greater than this offset */
  /offset[map[probename], args[0]->ci_remote] > args[2]->offset/ {
    this->dist = offset[map[probename], args[0]->ci_remote] - args[2]->offset;
    /* @h[map[probename], args[0]->ci_remote] = quantize(this->dist); */
    @seq[map[probename], args[0]->ci_remote] = 
      sum(this->dist == lastdelta[map[probename], args[0]->ci_remote]);
    @rand[map[probename], args[0]->ci_remote] = 
      sum(this->dist != lastdelta[map[probename], args[0]->ci_remote]);
    offset[map[probename], args[0]->ci_remote] = args[2]->offset;
    lastdelta[map[probename], args[0]->ci_remote] = this->dist;
    havedata = 1;
  }
  nfsv3:::op-read-start,nfsv3:::op-write-start
  /* Previous offset is less than this offset */
  /offset[map[probename], args[0]->ci_remote] < args[2]->offset/ {
    this->dist = args[2]->offset - offset[map[probename], args[0]->ci_remote];
    /* @h[map[probename], args[0]->ci_remote] = quantize(this->dist); */
    @seq[map[probename], args[0]->ci_remote] = 
      sum(this->dist == lastdelta[map[probename], args[0]->ci_remote]);
    @rand[map[probename], args[0]->ci_remote] = 
      sum(this->dist != lastdelta[map[probename], args[0]->ci_remote]);
    offset[map[probename], args[0]->ci_remote] = args[2]->offset;
    lastdelta[map[probename], args[0]->ci_remote] = this->dist;
    havedata = 1;
  }
  tick-5sec /havedata/ {
    printa("%s,%s,%@d,%@d\n", @rand, @seq);
    trunc(@rand); trunc(@seq);
    /* trunc(@h); */
    havedata = 0;
  }'
dtrace -qn'
  #pragma D option aggsize=16m
  #pragma D option bufsize=8m
  #pragma D option dynvarsize=8m

  string typ[string];
  
  BEGIN 
  {
    printf("source,cmd,path,IOs,avg.lat.us,bytes\n");
    typ["op-access-done"] = "A";
    typ["op-create-done"] = "C";
    typ["op-getattr-done"] = "G";
    typ["op-read-done"] = "R";
    typ["op-write-done"] = "W";
  }
  /*
   * Ignoring all R/W IOs less than 1K. If doing some really tiny <1K IOs,
   * lets pretend these IOs did not happen.
   */
  nfsv3:::op-read-start,
  nfsv3:::op-write-start /args[2]->count > 1 << 10/ { self->bytes = args[2]->count ; }

  nfsv3:::op-access-start, nfsv3:::op-create-start, nfsv3:::op-getattr-start,
  nfsv3:::op-read-start, nfsv3:::op-write-start
  {
    self->src_addr = args[0]->ci_remote;
    tm[self->src_addr, args[1]->noi_xid] = timestamp;

    self->have_path = args[1]->noi_curpath == NULL ? 0 : 1;
    
    /* We should make sure we handle empty value as well */
    self->path = args[1]->noi_curpath != "" ? 
              args[1]->noi_curpath  : "NOPATH";
  }

  /*
   * Continue to ignore any R/W IOs that are less than 1K.
   */
  nfsv3:::op-read-done, nfsv3:::op-write-done 
  /self->have_path &&
    tm[self->src_addr, args[1]->noi_xid] &&
    self->bytes > 1 << 10
  / 
  {
    @ios[self->src_addr,
        typ[probename], self->path] = count();
    @lat_us[self->src_addr,
        typ[probename], self->path] = 
          avg((timestamp - tm[self->src_addr, args[1]->noi_xid]) / 1000);
    @s[self->src_addr,
        typ[probename], self->path] = sum(self->bytes);
    ++data;
  }

  nfsv3:::op-access-done, nfsv3:::op-create-done, nfsv3:::op-getattr-done /self->have_path && tm[self->src_addr, args[1]->noi_xid]/
  {
    @ios[self->src_addr,
        typ[probename], self->path] =
      count();
    @lat_us[self->src_addr,
        typ[probename], self->path] =
      avg((timestamp - tm[self->src_addr, args[1]->noi_xid]) / 1000);
    @s[self->src_addr,
        typ[probename], self->path] =
      sum(0);
    ++data;
  }
  
  nfsv3:::op-access-done,
  nfsv3:::op-create-done,
  nfsv3:::op-getattr-done,
  nfsv3:::op-read-done,
  nfsv3:::op-write-done {
    /* Zero out things to make sure we avoid issues with dynvar drops */
    self->bytes = 0;
    self->have_path = 0;
    tm[self->src_addr, args[1]->noi_xid] = 0;
    self->src_addr = 0;
    self->path = 0;
  }

  tick-1sec /data/
  {
    /*
     * Shrink each aggregation to 100 observations first, then print those
     * 100 observations, which will not mean we are limited to 100 unique
     * connection addresses, or 100 unique paths. As it stands, aggregations
     * right now use a tuple for their key, which is a set of three values:
     * client address, command, path. This means that in theory we could have:
     * (client address) x (command) x (path) number of unique keys.
     */
    trunc(@ios, 100); trunc(@lat_us, 100); trunc(@s, 100);
    printa("%s,%s,%s,%@d,%@d,%@d\n", @ios, @lat_us, @s);
    
    trunc(@ios); trunc(@lat_us); trunc(@s);
    data = 0;
  }'
dtrace -qn '

inline const int NORM_KB = 1024;
inline const int NS_S = 1000000000;
inline const int NS_MS = 1000000;
inline const int NS_US = 1000;
int cnt, realtm, start, epoch;
self int nfs_r;

#pragma D option aggsize=16m
#pragma D option bufsize=50m
#pragma D option dynvarsize=50m

BEGIN {
printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
    "io.type", "runtime.ms", "min.lat", "max.lat", "avg.lat",
    "max.io.size", "iops", "throughput.KBps",
    "512b.ct", "1024b.ct", "2048b.ct", "4096b.ct", "8192b.ct", 
    "16384b.ct", "32768b.ct", "65536b.ct", "131072b.ct", 
    "volatile", "fsync", "dsync", "arc.hit.ct", "arc.miss.ct",
    "arc.size.av", "arc.size.min", "arc.size.max", "arc.mru.size.av",
    "arc.mru.size.min", "arc.mru.size.max", "arc.mfu.size.av",
    "arc.mfu.size.min", "arc.mfu.size.max", "total.ios", "avg.bytes.per.us"
);
hdr = 1;
epoch = timestamp;
start = timestamp;
  /* How stable data is expected to be. */
  stable_how[0] = "NOSYNC";
  stable_how[1] = "DSYNC";
  stable_how[2] = "FSYNC";
  ts = walltimestamp;
}

tick-1sec /(timestamp - epoch) >= 0 && cnt > 0/ {
  realtm = timestamp - epoch; /* Update real time passed since last sample. */
}

tick-1sec /!cnt/ {
  /* Set realtime to 0 as first run, to make sure first reported line starts
   * at 0 and increments from there. This should only happen once.
   */
  realtm = 0;
  ++cnt;
}

nfsv3:::op-read-start, 
nfsv3:::op-write-start,
nfsv4:::op-read-start {
  /* If this is a read, increment counter by one, else leave previous value */
  self->nfs_r += probename == "op-read-start" ? 1 : 0;
  tm[args[0]->ci_remote, args[1]->noi_xid] = timestamp;
  opsz[args[0]->ci_remote, args[1]->noi_xid] = args[2]->count;
  flag[args[0]->ci_remote, args[1]->noi_xid] = 0;
}

/*
 * sdt:::arc-hit and sdt:::arc-miss give info on direct users of the arc 
 * if arc-miss, then we do an I/O.
 * Unfortunately, we cannot directly correlate a NFS Read with event in the
 * ARC, at least not easily.
 * We are likely to miss at least some ARC hits and misses here, but this
 * should be good enough to get a sense for hits and misses in general.
 */
sdt:::arc-miss /self->nfs_r/ { @arcm["R"] = count(); }
sdt:::arc-hit /self->nfs_r/ { @arch["R"] = count(); }

::arc_read:entry {
    this->cmd = self->nfs_r ? "R" : "W";
    @arcszmin[this->cmd]     = min(`arc_stats.arcstat_size.value.i64);
    @arcszmax[this->cmd]     = max(`arc_stats.arcstat_size.value.i64);
    @arcszav[this->cmd]      = avg(`arc_stats.arcstat_size.value.i64);
    @arcmruszmin[this->cmd]  = min(`arc_stats.arcstat_mru_size.value.i64);
    @arcmruszmax[this->cmd]  = max(`arc_stats.arcstat_mru_size.value.i64);
    @arcmruszav[this->cmd]   = avg(`arc_stats.arcstat_mru_size.value.i64);
    @arcmfuszmin[this->cmd]  = min(`arc_stats.arcstat_mfu_size.value.i64);
    @arcmfuszmax[this->cmd]  = max(`arc_stats.arcstat_mfu_size.value.i64);
    @arcmfuszav[this->cmd]   = avg(`arc_stats.arcstat_mfu_size.value.i64);
}

nfsv3:::op-write-start {
    flag[args[0]->ci_remote,args[1]->noi_xid] = args[2]->stable;
}
nfsv4:::op-write-start {
    flag[args[0]->ci_remote,args[1]->noi_xid] = args[2]->stable;
    tm[args[0]->ci_remote,args[1]->noi_xid] = timestamp;
    opsz[args[0]->ci_remote,args[1]->noi_xid] = args[2]->data_len;
}

nfsv3:::op-read-done,
nfsv3:::op-write-done,
nfsv4:::op-read-done,
nfsv4:::op-write-done
/tm[args[0]->ci_remote,args[1]->noi_xid]/ {
  /* Total time from start of operation to end of same operation. */
  this->delta = (timestamp - tm[args[0]->ci_remote, args[1]->noi_xid]);
  /*
   * Flag indicates volatility mode for this operation, i.e.
   * `Unstable`, `Data Sync`, or `File Sync`.
   */
  this->flag =  flag[args[0]->ci_remote,args[1]->noi_xid];
  this->name =  probename == "op-write-done" ? "W" : "R";

  /*
  * This is a hack, it takes advantage of the fact that realtm value will
  * always increase, and never decrease. Because we do not have an aggregate
  * for storing only last value, this is the best we can do. We are
  * effectively storing latest computed value.
  */
  @t[this->name]      = max(realtm);

  this->size = opsz[args[0]->ci_remote,args[1]->noi_xid];
  this->buck = 512; /* We assume this is smallest possible size */
  this->buck = ( this->size > 512 ) ? 1024 : this->buck ;
  this->buck = ( this->size > 1024 ) ? 2048 : this->buck ;
  this->buck = ( this->size > 2048 ) ? 4096 : this->buck ;
  this->buck = ( this->size > 4096 ) ? 8192 : this->buck ;
  this->buck = ( this->size > 8192 ) ?  16384 : this->buck ;
  this->buck = ( this->size > 16384 ) ?  32768 : this->buck ;
  this->buck = ( this->size > 32768 ) ?  65536 : this->buck ;
  this->buck = ( this->size > 65536 ) ?  0x20000 : this->buck ;
  this->buck = ( this->size > 0x20000 ) ?  0x40000 : this->buck ;
  this->buck = ( this->size > 0x40000 ) ?  0x80000 : this->buck ;
  this->buck = ( this->size > 0x80000 ) ?  0x100000 : this->buck ;
  this->buck = ( this->size > 0x100000 ) ?  0x200000 : this->buck ;

  @lmin[this->name]   = min(this->delta);
  @lmax[this->name]   = max(this->delta);
  @a[this->name]      = avg(this->delta);
  @maxiosz[this->name]  = max(this->size);
  @i[this->name]      = count();
  /* Average bytes per microsecond of IO time. */
  this->bpus = this->delta > 0 ? (this->size*1000 / this->delta) : this->size;
  @abus[this->name]   = avg(this->bpus);
  /* Do not normalize (@ios) to get total I/Os per interval. */
  @ios[this->name]    = count();
  @b[this->name]      = sum(opsz[args[0]->ci_remote, args[1]->noi_xid]);
  @sz512[this->name]  = sum(this->buck == 512 ? 1 : 0 );
  @sz1024[this->name]  = sum(this->buck == 1024 ? 1 : 0 );
  @sz2048[this->name]  = sum(this->buck == 2048 ? 1 : 0 );
  @sz4096[this->name]  = sum(this->buck == 4096 ? 1 : 0 );
  @sz8192[this->name]  = sum(this->buck == 8192 ? 1 : 0 );
  @sz16384[this->name]  = sum(this->buck == 16384 ? 1 : 0 );
  @sz32768[this->name]  = sum(this->buck == 32768 ? 1 : 0 );
  @sz65536[this->name]  = sum(this->buck == 65536 ? 1 : 0 );
  @sz131072[this->name]  = sum(this->buck == 131072 ? 1 : 0 );

  /*
   * There is an aggregate for each possible flag, and we are counting
   * number of occurrences of each, reporting each as a separate field.
   */
  @nos[this->name]   = sum(this->flag == 0 ? 1 : 0 );
  @fs[this->name]    = sum(this->flag == 1 ? 1 : 0 );
  @ds[this->name]    = sum(this->flag == 2 ? 1 : 0 );
  tm[args[0]->ci_remote,args[1]->noi_xid]   = 0;
  opsz[args[0]->ci_remote,args[1]->noi_xid] = 0;
  flag[args[0]->ci_remote,args[1]->noi_xid] = 0;
  self->nfs_r -= probename == "op-read-done" ? 1 : 0;
}

tick-1sec /(timestamp - start) > 0 && cnt/ {
/*
 * Normalizing by 1024 can introduce sample error when the value being 
 * divided is less than 1024. Since there is no floating point division,
 * resulting value will be 0, which if data is routinely less than 1024,
 * can create a serious discontinuity between real data and normalized
 * data.
 */
normalize(@lmin, NS_US);
normalize(@lmax, NS_US);
normalize(@a, NS_US);
normalize(@b, NORM_KB);
normalize(@t, NS_MS);
printa(
  "%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", 
                    @t, @lmin, @lmax, @a, @maxiosz, @i, @b, 
                    @sz512, @sz1024, @sz2048, @sz4096, @sz8192, @sz16384,
                    @sz32768, @sz65536, @sz131072, @nos, @fs, @ds, 
                    @arch, @arcm, @arcszav, @arcszmin, @arcszmax, 
                    @arcmruszav, @arcmruszmin, @arcmruszmax, @arcmfuszav, @arcmfuszmin, @arcmfuszmax, @ios, @abus);
/*
 * Truncate aggregations and re-set timestamp for next sampling period.
 */
trunc(@t); trunc(@lmin); trunc(@lmax); trunc(@a); trunc(@maxiosz); trunc(@i); 
trunc(@b); trunc(@sz512); trunc(@sz1024); trunc(@sz2048); trunc(@sz4096);
trunc(@sz8192); trunc(@sz16384); trunc(@sz32768); trunc(@sz65536);
trunc(@sz131072); trunc(@nos); trunc(@fs); trunc(@ds); 
trunc(@ios); trunc(@abus); trunc(@arch); trunc(@arcm);
trunc(@arcszav); trunc(@arcszmin); trunc(@arcszmax);
trunc(@arcmruszav); trunc(@arcmruszmin); trunc(@arcmruszmax); 
trunc(@arcmfuszav); trunc(@arcmfuszmin); trunc(@arcmfuszmax);

start = timestamp;
}'
// What is the recordsize of operations, both reads and writes?
dtrace -qn 'nfsv3:::op-commit-start,nfsv3:::op-read-start,nfsv3:::op-write-start {@[probename=="op-write-start" ? "W" : "R"] = quantize(args[2]->count);}'
dtrace -n '
BEGIN {
    command["op-commit-start"] = "COMMIT";
    command["op-read-start"] = "READ";
    command["op-write-start"] = "WRITE";

}
nfsv3:::op-commit-start,nfsv3:::op-read-start,nfsv3:::op-write-start 
{
    @[command[probename], args[1]->noi_curpath] = quantize(args[2]->count); 
}
tick-5sec {
    trunc(@, 5); printa(@); trunc(@);
}'