szaydel
3/12/2015 - 2:51 PM

[Dtrace snippets for zfs at vnode (file) level] Snippets used for observing core zfs filesystem functions like zfs_read and zfs_write for de

[Dtrace snippets for zfs at vnode (file) level] Snippets used for observing core zfs filesystem functions like zfs_read and zfs_write for determining filesystem latency and for measuring IO sizes.

dtrace -qn '
/*
 * This script is intended for measuring basic characteristics of
 * the zio_{read,write} functions in ZFS. These functions are a good
 * proxy for how the underlying storage is performing under current load.
 * Resulting data is CSV-formatted for easy digestion by various command-line
 * tools and import into R and other statistical tools.
 * Totals are normalized to a per-second rate, i.e. amount of time spent
 * in function for each sample is per second of real time.
 */
  int start, interval, rmin, rmax, wmin, wmax, rdata, wdata;
  int cnt[string] ;
  int ldelta[string] ;
  signed long lastema[string] ; /* exp. moving avg. associative array */
 /* 
  * Weight for EMA calculation W = 2 ÷ (N + 1), where N is number of periods.
  * Because we cannot do math with floats in dtrace, we assume that a 20 period
  * EMA is 2 ÷ (20 + 1) ~= 0.0952 to four decimal points. Multiplying by 10000
  * results in chosen weight.
  */
  inline int WEIGHT = 952 ;
  BEGIN {
  printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
          "delta", "rlat", "rlat.ema", "rminlat", "rmaxlat", "rcnt", 
          "read.tot.tm", "read.range", "wlat", "wlat.ema", "wminlat", 
          "wmaxlat", "wcnt", "write.tot.tm", "write.range") ;
    start = timestamp ;
  }
  /*
   * To avoid losing information, due to lack of floating point arithmetic
   * support in dtrace, we avoid conversion from nanoseconds to microseconds
   * in the return clauses, where we compute delta between entry and return
   * triggers. This should be easy to accomplish later on, if necessary in the
   * data processing pipeline.
   */
  ::zio_read:entry,::zio_write:entry { self->s = timestamp ; }

  ::zio_read:return /timestamp > self->s/ {
    this->delta = timestamp - self->s ;
    rmin = rmin == 0 ? this->delta : rmin > this->delta ? this->delta : rmin ;
    rmax = rmax == 0 ? this->delta : rmax < this->delta ? this->delta : rmax ;
    
    @rlat    = avg(this->delta) ;
    @rminlat = min(this->delta) ;
    @rmaxlat = max(this->delta) ;
    @rcnt    = count() ;
    @rtottm  = sum(this->delta) ;

    rdata = 1 ; cnt[probefunc]++ ;
  }

  ::zio_write:return /timestamp > self->s/ {
    this->delta = timestamp - self->s ;
    wmin = wmin == 0 ? this->delta : wmin > this->delta ? this->delta : wmin ;
    wmax = wmax == 0 ? this->delta : wmax < this->delta ? this->delta : wmax ;

    @wlat    = avg(this->delta) ;
    @wminlat = min(this->delta) ;
    @wmaxlat = max(this->delta) ;
    @wcnt    = count() ;
    @wtottm  = sum(this->delta) ;
    
    wdata = 1 ; cnt[probefunc]++ ;
  }

  /*
   * Consider this a required first interval to seed EMA,
   * and do not use this sample for additional computations.
   */
  ::zio_read:return,::zio_write:return
  /self->s && lastema[probefunc] == 0 && this->delta/ {
  lastema[probefunc] = this->delta ;
  self->s = 0 ;
  }

  /* 1/10th of the time do EMA computation. We are purposely skipping over
   * some observations to make the data smoother.
   */
  ::zio_read:return,::zio_write:return
  /self->s && lastema[probefunc] > 0 && this->delta > 0
  && (cnt[probefunc] % 10) == 0/ {
      /* If count is > 10000, reset it to 0, start over. */
      cnt[probefunc] = cnt[probefunc] > 10000 ? 0 : cnt[probefunc] ;
    /*
     * Compute exponential moving average, and due to lack of a better means
     * store values in a avg() aggregate, which is not ideal, but it is better
     * than nothing at all.
     */
    this->ema           = ((WEIGHT * this->delta) + 
                          ((10000 - WEIGHT) * lastema[probefunc])) / 10000 ;
    lastema[probefunc]  = this->ema ;
    self->s = 0 ;
  }
  ::zio_read:return /this->ema/ {
    @rlatema            = avg(this->ema) ;
    this->ema           = 0 ;
  }
  ::zio_write:return /this->ema/ {
    @wlatema            = avg(this->ema) ;
    this->ema           = 0 ;

  }
  tick-5sec /!rdata && !wdata/ {
  /*
   * If the function calls did not happen, we print NA to make it clear
   * that values are not available, as opposed to printing 0s.
   */
   this->toffset = (timestamp - start) / 1000000000 ;
    printf("%d,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA\n",
            this->toffset) ;
    interval = this->toffset ;            
  }
  tick-5sec /rdata || wdata/ {
    /* 
     * We are only using a sum here because we need something that will store
     * results as an aggregated variable, which we can print with printa.
     */
    this->toffset = (timestamp - start) / 1000000000 ;
    @rrng = sum(rdata > 0 ? rmax - rmin : 0) ;
    @wrng = sum(wdata > 0 ? wmax - wmin : 0) ;
    normalize(@rtottm, this->toffset - interval) ; 
    normalize(@wtottm, this->toffset - interval) ;
    normalize(@rcnt, this->toffset - interval) ; 
    normalize(@wcnt, this->toffset - interval) ;

    printf("%d,", this->toffset) ;
    printa("%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
            @rlat, @rlatema, @rminlat, @rmaxlat, @rcnt, @rtottm, @rrng,
            @wlat, @wlatema, @wminlat, @wmaxlat, @wcnt, @wtottm, @wrng) ;

    trunc(@rlat) ; trunc(@rlatema) ; trunc(@rminlat) ; trunc(@rmaxlat) ;
    trunc(@rcnt) ; trunc(@rtottm) ; trunc(@rrng) ;
    trunc(@wlat) ; trunc(@wlatema) ; trunc(@wminlat) ; trunc(@wmaxlat) ;
    trunc(@wcnt) ; trunc(@wtottm) ; trunc(@wrng) ;
    rmin, rmax, rdata, wmin, wmax, wdata = 0 ; 
    interval = this->toffset ;
  }'
dtrace -qn '
#pragma D option dynvarsize=8m
int gotdata ;

  BEGIN {
    printf("pool,io.type,lt100us,lt1000us,lt10000us,lt100ms,lt1000ms,gt1000ms,blkszle1k,blksz2k,blksz4k,blksz8k,blksz16k,blksz32k,blksz64k,blksz128k,blksz256k,blkszgt256k,io.count\n") ;
    x["zfs_read"] = "R" ;
    x["zfs_write"] = "W" ;
  }

  ::zfs_read:entry,::zfs_write:entry
  /strstr(strtok(args[0]->v_path, "/storage/"), "p") != NULL/ {
    self->b = args[1]->uio_resid ; /* Number of bytes to read or write */
    self->p = strtok(args[0]->v_path, "/storage/") ; /* Should be pool name */
    self->s = timestamp ;
    gotdata = 1 ;
}

  ::zfs_read:return,::zfs_write:return /self->s/ {
    /*
     * Delta is converted from nanoseconds to microseconds, since we
     * expect few IOs to complete in a matter of a few nanoseconds,
     * operating with microseconds istead makes sense.
     */
    this->delta = (timestamp - self->s) / 1000 ;
    this->n = self->p ;
    @cnt[stringof(this->n), x[probefunc]] = count();
    /*
     * Histogram with 6 bins representing distribution of latencies
     * for issued IOs. Bins are from 1000us< to >1000ms.
     * We use the sum() function to increment by 1 each time a value 
     * matches width of a bucket, otherwise we leave unchanged.
     */
    @lt100us[stringof(this->n), x[probefunc]] =
        sum(this->delta <= 100 ? 1 : 0) ;
    @lt1000us[stringof(this->n), x[probefunc]] =
        sum(this->delta > 100 && this->delta <= 1000 ? 1 : 0) ;
    @lt10000us[stringof(this->n), x[probefunc]] =
        sum(this->delta > 1000 && this->delta <= 10000 ? 1 : 0) ;
    @lt100ms[stringof(this->n), x[probefunc]] =
        sum(this->delta > 10000 && this->delta <= 100000 ? 1 : 0) ;
    @lt1000ms[stringof(this->n), x[probefunc]] =
        sum(this->delta > 100000 && this->delta <= 1000000 ? 1 : 0) ;
    @gt1000ms[stringof(this->n), x[probefunc]] =
        sum(this->delta > 1000000 ? 1 : 0) ;
    /*
     * Histogram with 10 bins representing distibution of IO sizes 
     * for issued IOs. Bins are from 1K< to >256K.
     * We use the sum() function to increment by 1 each time a value 
     * matches width of a bucket, otherwise we leave unchanged.
     */

    @blkszle1k[stringof(this->n), x[probefunc]] = 
        sum(self->b <= 0x400 ? 1 : 0) ;

    @blksz2k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x400 && self->b <= 0x800 ? 1 : 0) ;

    @blksz4k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x800 && self->b <= 0x1000 ? 1 : 0) ;
    
    @blksz8k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x1000 && self->b <= 0x2000 ? 1 : 0) ;
    
    @blksz16k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x2000 && self->b <= 0x4000 ? 1 : 0) ;
    
    @blksz32k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x4000 && self->b <= 0x8000 ? 1 : 0) ;
    
    @blksz64k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x8000 && self->b <= 0x10000 ? 1 : 0) ;

    @blksz128k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x10000 && self->b <= 0x20000 ? 1 : 0) ;

    @blksz256k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x20000 && self->b <= 0x40000 ? 1 : 0) ;

    @blkszgt256k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x40000 ? 1 : 0) ;

    self->b  = 0 ;
    self->p = 0 ;
    self->s = 0 ;
  }

  tick-1sec /gotdata/ {
    printa("%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", 
            @lt100us, @lt1000us, @lt10000us, @lt100ms, @lt1000ms, 
            @gt1000ms, @blkszle1k, @blksz2k, @blksz4k, @blksz8k, @blksz16k,
            @blksz32k, @blksz64k, @blksz128k, @blksz256k, @blkszgt256k, @cnt
            ) ;
    trunc(@lt100us) ; trunc(@lt1000us) ; trunc(@lt10000us) ;
    trunc(@lt100ms) ; trunc(@lt1000ms) ; trunc(@gt1000ms) ;
    trunc(@blkszle1k) ; trunc(@blksz2k) ; trunc(@blksz4k) ;
    trunc(@blksz8k) ; trunc(@blksz16k) ; trunc(@blksz32k) ;
    trunc(@blksz64k) ; trunc(@blksz128k) ; trunc(@blksz256k) ;
    trunc(@blkszgt256k) ; trunc(@cnt) ;
    gotdata = 0 ;
  }'
dtrace -xdynvarsize=10M -qn '
    BEGIN {
        MSEC        = 1000000;
        MICROSEC    = 1000;
        threshold   = 50 * MSEC;
        div = "--------------------------------------------------------------------------------"
    }
    fbt:zfs::entry {
        self->start[probefunc]=timestamp
    } 
    /* Do not enter unless we are over threshold */
    fbt:zfs::return {
        this->now = timestamp;
    }
    fbt:zfs::return /this->now && (this->now - self->start[probefunc]) > threshold/ {
        @["delta (ms)", probefunc] = quantize(
            (timestamp - self->start[probefunc]) / MSEC); 
        self->start[probefunc] = 0;
        this->now = 0;
    }
    tick-5sec {
    printf("%s\n", div); printf("%Y\n", walltimestamp); printa(@); trunc(@);
}'
#!/bin/sh

dtrace -xdynvarsize=10M -qn '
    BEGIN {
        MSEC        = 1000000;
        MICROSEC    = 1000;
        threshold   = 50 * MSEC;
        div = "--------------------------------------------------------------------------------"
    }
    fbt:zfs::entry {
        self->start[probefunc]=timestamp
    }
    /* Do not enter unless we are over threshold */
    fbt:zfs::return {
        this->now = timestamp;
    }
    fbt:zfs::return /this->now && (this->now - self->start[probefunc]) > threshold/ {
        @["delta (ms)", probefunc] = quantize(
            (timestamp - self->start[probefunc]) / MSEC);
        self->start[probefunc] = 0;
        this->now = 0;
    }
    tick-5sec {
    printf("%s\n", div); printf("%Y\n", walltimestamp); printa(@); trunc(@);
}' >> /root/long-running.log
dtrace -n 'fbt::fop_read:entry /args[0]->v_op->vnop_name == "zfs"/ {
    self->start = timestamp; } fbt::fop_read:return /self->start/ {
    @[execname, "ns"] = quantize(timestamp - self->start); self->start = 0; }
    
dtrace -qn '
  ::fop_fsync:entry /args[0]->v_path != NULL/ {
    self->p = args[0]->v_path;
    self->tick = timestamp;
  }
  ::fop_fsync:entry /self->p/ {
    this->delta = timestamp - self->tick;
    @lat[execname, stringof(self->p)] = quantize(this->delta);
    @[execname, stringof(self->p)] = count();
    self->tick = 0; self->p = 0;
  }
  END {
    printa("prog: %s path: %s count: %@d\n%@d\n", @, @lat);
  }'