szaydel
2/19/2015 - 10:46 PM

[Dtrace IO snippets] One-liners and Scripts for observation at lower levels of the IO stack #tags: sd, io, io probes, stable, io::: probes

[Dtrace snippets using io::: probes] One-liners and Scripts for observation at lower levels of the IO stack #tags: sd, io, io probes, stable probes, block device, dtrace, bandwidth, throughput, io errors, io retries

dtrace -qn '
  BEGIN {
    printf("timestamp(ns),latency(us),iotype,device,filename,blk.number,blk.count");
  }
  io:::start {
    ts[args[0]->b_edev, args[0]->b_blkno] = timestamp;
  }
  io:::done /ts[args[0]->b_edev, args[0]->b_blkno] != 0/ {
    this->lat_us = (timestamp - ts[args[0]->b_edev, args[0]->b_blkno]) 
                    / 1000;
    this->iotype = args[0]->b_flags & B_WRITE ? "WRITE" : 
                    args[0]->b_flags & B_READ ? "READ"  : "NULL";
    this->device = args[1]->dev_statname;
    this->fp = args[2]->fi_pathname == "<unknown>" ? "UNKNOWN" : 
                  args[2]->fi_pathname == "<none>" ? "UNKNOWN" : 
                  args[2]->fi_pathname;
    this->blocks = args[0]->b_bcount / 512; /* Assuming a 512b formatted dev */

    printf("%d,%d,%s,%s,%s,0x%x,%d\n", walltimestamp, 
                                this->lat_us, this->iotype, this->device, 
                                this->fp, args[0]->b_blkno, this->blocks);
  }' 2> /dev/null
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=16M

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2017 Sam Zaydel / RackTop Systems.
 *
 * bw-tput-iops-actv-time-hist-csv.d
 * This script is meant to be a more informative replacement for the
 * iostat utility. It measures some features that iostat in my opinion
 * is not useful for. For example, iostat does not offer distributions
 * at all, whereas here we plot distributions of IO rates as observed
 * for each disk. Also, iostat does not offer operating ranges for
 * measurements, which here we define as max - min of some measurement.
 *
 * Currently, script is limited to scsi_vhci devices, because they are
 * ones we normally use for actual data storage. Anything else is either
 * a locally attached device, or something not used by system, i.e.
 * USB storage media, etc.
 */
unsigned long minlati[dev_t, int], maxlati[dev_t, int]; /* Latency min and max */
unsigned long minratei[dev_t], maxratei[dev_t];         /* IO Rate min and max */
unsigned long miniosz[dev_t], maxiosz[dev_t];           /* IO size min and max */
unsigned long iocnt[dev_t, int];
int pend[dev_t];
hrtime_t start[dev_t, uint64_t];
hrtime_t ival_timer[dev_t];
hrtime_t ticks;
inline const int NSEC_PER_SEC = 1000000000;
inline const int NSEC_PER_MSEC = 1000000;
inline const int NSEC_PER_USEC = 1000;
inline const int SPA_MAXBLOCKSIZE = 128 << 10;
inline const int R = 0;
inline const int W = 1;

/*
 * These are the Output Parameter definitions for collected metrics
 *
 * sdname == Name of device, i.e. sd0
 * mpxiowwn == mpxio device, like: 5000cca24500f698
 * actvtm == amount of real, busy time spent processing IO
 * rangerlat, rangewlat == latency range: max - min for Reads and Writes
 * totbytes == total number of bytes Read and Written during sample interval
 * tput,maxtput == mean and maximum or burst throughput
 * ctrd,ctwr == Count of Reads and Writes over interval set in tick-Xsec clause
 * aviosz,rangeiosz == IO size mean and range: max - min
 * iops,maxiops == normalized IOPS mean and maximum
 * avKBps,rangeKBps == normalized IO rate and IO rate range: max - min
 * // Histogram of IO Rate distribution with 4 buckets //
 * [ ratelt1MBps  ] ==> 1 second interval with < 1,000KB/s bw
 * [ rate10MBps   ] ==> 1 second interval with < 10,000KB/s bw
 * [ rate100MBps  ] ==> 1 second interval with < 100,000KB/s bw
 * [ rate1GBps    ] ==> 1 second interval with < 1,000,000KB/s bw
 * avtime,maxtime == average and maximum IO completion latency
 * // Histogram of latency distribution with 6 buckets //
 * [ timegt1000ms ] ==> >1 second (SLOW!)
 * [ time100ms    ] ==> >50ms && 100ms<=    (not acceptable)
 * [ time50ms     ] ==> >25ms && 50ms<=     (acceptable)
 * [ time25ms     ] ==> >10ms && 25ms<=     (ok)
 * [ time10ms     ] ==> >1ms && 10ms<=      (good)
 * [ timelt1ms    ] ==> 1ms<                (probably cache)
 * // Histogram of IO size distribution with 7 buckets //
 * [ iosztiny     ] ==> 4K<
 * [ iosz4k       ] ==> between 4K and 8K
 * [ iosz8k       ] ==> between 8K and 16K
 * [ iosz16k      ] ==> between 16K and 32K
 * [ iosz32k      ] ==> between 32K and 64K
 * [ iosz64k      ] ==> between 64K and 128K
 * [ ioszbig      ] ==> anything 128K or above
 * avpending    == Average measured IOs in processing per sample interval
 * maxpending   == Maximum measured IOs in processing per sample interval
 * cterr        == Counter tracking number of errors per sample interval
 * ctretry      == Counter tracking number of retries per sample interval
 */

BEGIN {
    printf("sdname,mpxiowwn,actvtm,rangerlat,rangewlat,totbytes,tput,maxtput,ctrd,ctwr,aviosz,rangeiosz,iops,maxiops,avKBps,rangeKBps,ratelt1MBps,rate10MBps,rate100MBps,rate1GBps,avtime,maxtime,timegt1000ms,time100ms,time50ms,time25ms,time10ms,timelt1ms,iosztiny,iosz4k,iosz8k,iosz16k,iosz32k,iosz64k,ioszbig,avpending,maxpending,cterr,ctretry\n");
}

/* 
 * Set interval timer initial value once for a given device.
 * Each subsequent update will happen roughly once per second,
 * or if no IO, whenever there is some IO that triggers io:::done
 * probe, at which point we determine that timer expired.
 */
io:::start
/ival_timer[args[0]->b_edev] == 0/  {
    ival_timer[args[0]->b_edev] = timestamp;
}

io:::start {
    start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
    /* Increment pending IOs by 1 */
    pend[args[0]->b_edev]++;
}

io:::done
/start[args[0]->b_edev, args[0]->b_blkno]/ {
    this->sn    = args[1]->dev_statname;
    this->p     = substr(args[1]->dev_pathname, 25, 16);
    this->p     = (strstr(this->p, "disk@") == 0 ||
                    strstr(this->p, "disk@") == "") ? this->p : "NA";
    this->x     = args[0]->b_bcount * 976562;
    this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno];
    start[args[0]->b_edev, args[0]->b_blkno] = 0;
    
    /* Decrement pending IOs by 1, set to 0 if value < 0 */
    pend[args[0]->b_edev]--;
    pend[args[0]->b_edev] = 
        pend[args[0]->b_edev] > 0 ?
        pend[args[0]->b_edev] : 0; /* avoid underflow */

    /* Total Number of bytes per device */
    @totbytes[this->sn, this->p]    = sum(args[0]->b_bcount);
    /* Total nanoseconds of active time per device */
    @actv_tm[this->sn, this->p]     = sum(this->elapsed);

    /* 
     * Instead of converting nanoseconds to seconds we multiply
     * the top by NSEC_PER_SEC and the divide by delta in nanoseconds.
     * In extreme observations, i.e. stalled IO, we may have huge
     * this->elapsed values, in which case result will be 0 KB/s, even
     * if there in fact was a non-zero value of bytes transferred.
     */
    this->b_nsec = args[0]->b_bcount * NSEC_PER_SEC;
    this->kb_per_sec = (this->b_nsec / this->elapsed) >> 10;

    /* Measure IO rate range in KB/s */
    @rangeKBps[this->sn, this->p]     =
        max(maxratei[args[0]->b_edev] - minratei[args[0]->b_edev]);

    /* IO Rate histogram base10, limited at 1000000 KB/s | 976 MB/s */
    /* 1000KB/s <= */
    @ratelt1MBps[this->sn, this->p]     = sum(this->kb_per_sec <= 0x3e8 ? 1 : 0);
    /* > 1000KB/s && 10,000KB/s <= */ 
    @rate10MBps[this->sn, this->p]      = sum(this->kb_per_sec > 0x3e8 &&
                                            this->kb_per_sec <= 0x2710 ? 1 : 0);
    /* > 10,000KB/s && 100,000KB/s <= */ 
    @rate100MBps[this->sn, this->p]     = sum(this->kb_per_sec > 0x2710 &&
                                            this->kb_per_sec <= 0x186a0 ? 1 : 0);
    /* > 100,000KB/s && 1,000,000 KB/s <= */ 
    @rate1GBps[this->sn, this->p]       = sum(this->kb_per_sec > 0x186a0 &&
                                            this->kb_per_sec <= 0xf4240 ? 1 : 0);

    /*
     * Collect minimum and maximum observed rate for later measurement
     * of range for this metric.
     */
    minratei[args[0]->b_edev] = minratei[args[0]->b_edev] == 0 ?
        this->kb_per_sec : minratei[args[0]->b_edev] < this->kb_per_sec ?
        minratei[args[0]->b_edev] : this->kb_per_sec;

    maxratei[args[0]->b_edev] = maxratei[args[0]->b_edev] == 0 ?
        this->kb_per_sec : maxratei[args[0]->b_edev] > this->kb_per_sec ?
        maxratei[args[0]->b_edev] : this->kb_per_sec;
    
    /* Actual Kbytes/sec per device */
    @avKBps[this->sn, this->p]  = avg(this->kb_per_sec);
    
    /* Average and Maximum Latency per device */
    @avtime[this->sn, this->p]  = avg(this->elapsed);
    @maxtime[this->sn, this->p] = max(this->elapsed);
    /*
     * Latency histogram with buckets:
     * >1000ms, >50 to 100ms, >25 to 50ms, >10 to 25ms, >1 to 10ms, 1ms<
     */
    @timegt1000ms[this->sn, this->p] = sum(
        this->elapsed >= 1 * NSEC_PER_SEC ? 1 : 0);
    @time100ms[this->sn, this->p] = sum(
        this->elapsed > 50 * NSEC_PER_MSEC &&
        this->elapsed <= 100 * NSEC_PER_MSEC ? 1 : 0);
    @time50ms[this->sn, this->p] = sum(
        this->elapsed > 25 * NSEC_PER_MSEC &&
        this->elapsed <= 50 * NSEC_PER_MSEC ? 1 : 0);
    @time25ms[this->sn, this->p] = sum(
        this->elapsed > 10 * NSEC_PER_MSEC &&
        this->elapsed <= 25 * NSEC_PER_MSEC ? 1 : 0);
    @time10ms[this->sn, this->p] = sum(
        this->elapsed > 1 * NSEC_PER_MSEC &&
        this->elapsed <= 10 * NSEC_PER_MSEC ? 1 : 0);
    @timelt1ms[this->sn, this->p] = sum(
        this->elapsed < 1 * NSEC_PER_MSEC ? 1 : 0);

    /*
     * Collect minimum and maximum io size for later measurement
     * of range for this metric.
     */
    miniosz[args[0]->b_edev] =
        args[0]->b_bcount < miniosz[args[0]->b_edev] ?
        args[0]->b_bcount : miniosz[args[0]->b_edev];
    maxiosz[args[0]->b_edev] =
        args[0]->b_bcount > maxiosz[args[0]->b_edev] ?
        args[0]->b_bcount : maxiosz[args[0]->b_edev];

    /* Measure IO size range in Kilobytes */
    @rangeiosz[this->sn, this->p] = 
        max((maxiosz[args[0]->b_edev] - miniosz[args[0]->b_edev]) >> 10);

    /* 
     * Convert from bytes and nanoseconds to KB/s with 976562 to obtain
     * avg. effective throughput, and maximum effective throughput.
     * Maximum throughput is likely measuring cache effects.
     */
    @tput[this->sn, this->p] = avg(this->x / this->elapsed);
    @maxtput[this->sn, this->p] = max(this->x / this->elapsed);
    iocnt[args[0]->b_edev, R] += args[0]->b_flags & B_READ ? 1 : 0;
    iocnt[args[0]->b_edev, W] += args[0]->b_flags & B_WRITE ? 1 : 0;

    /* Count number of IOs by IO-type */
    @ctrd[this->sn, this->p]        = sum(args[0]->b_flags & B_READ ? 1 : 0);
    @ctwr[this->sn, this->p]        = sum(args[0]->b_flags & B_WRITE ? 1 : 0);
    @iops[this->sn, this->p]        = 
        count(); /* Normalized to per second in tick-X probe */
    
    /* Maximum sampled IOPS per device */
    @maxiops[this->sn, this->p]     = 
        max(iocnt[args[0]->b_edev, R] + iocnt[args[0]->b_edev, W]);

    /* 
     * Collect minimum and maximum latency for later measurement
     * of range for this metric.
     */
    minlati[args[0]->b_edev, R] =
        args[0]->b_flags & B_READ &&
        this->elapsed < minlati[args[0]->b_edev, R] ? this->elapsed :
        minlati[args[0]->b_edev, R];
    maxlati[args[0]->b_edev, R] =
        args[0]->b_flags & B_READ &&
        this->elapsed > maxlati[args[0]->b_edev, R] ? this->elapsed :
        maxlati[args[0]->b_edev, R];
    
    minlati[args[0]->b_edev, W] =
        (args[0]->b_flags & B_READ) == 0 &&
        this->elapsed < minlati[args[0]->b_edev, W] ? this->elapsed :
        minlati[args[0]->b_edev, W];
    maxlati[args[0]->b_edev, W] =
        (args[0]->b_flags & B_READ) == 0 &&
        this->elapsed > maxlati[args[0]->b_edev, W] ? this->elapsed :
        maxlati[args[0]->b_edev, W];
    
    /*
     * IOsize distribution not grouped by direction, i.e. no distinction
     * is made between reads and writes. IO buckets double in size from
     * previous bucket. i.e. 4, 8, 16, 32...
     */
     this->bs = args[0]->b_bcount ;
    /* 4K< */
    @iosztiny[this->sn, this->p]    =
        sum(this->bs < 0x1000 ? 1 : 0);
    /* 4K to 8K< */
    @iosz4k[this->sn, this->p]      =
        sum(this->bs >= 0x1000 && this->bs < 0x2000 ? 1 : 0);
    /* 8K to 16K< */
    @iosz8k[this->sn, this->p]      =
        sum(this->bs >= 0x2000 && this->bs < 0x4000 ? 1 : 0);
    /* 16K to 32K< */
    @iosz16k[this->sn, this->p]     =
        sum(this->bs >= 0x4000 && this->bs < 0x8000 ? 1 : 0);
    /* 32K to 64K< */
    @iosz32k[this->sn, this->p]     =
        sum(this->bs >= 0x8000 && this->bs < 0x10000 ? 1 : 0);
    /* 64K to 128K< */
    @iosz64k[this->sn, this->p]     =
        sum(this->bs >= 0x10000 && this->bs < 0x20000 ? 1 : 0);
    /* >128K */
    @ioszbig[this->sn, this->p]     =
        sum(this->bs >= 0x20000 ? 1 : 0);

    /* Average IO size for given device */
    @aviosz[this->sn, this->p]      = avg(this->bs);

    /* 
     * Each time we observe an error at completion through B_ERROR flag,
     * increment count of errors for given device. This should always
     * be zero, assuming healthy device.
     */
    @cterr[this->sn, this->p] = sum(args[0]->b_flags & B_ERROR ? 1 : 0);
}

/* 
 * Entry controlled by timer. By design, each device will be registered
 * here about once per second when there is even a litte bit of IO.
 */
io:::done
/ival_timer[args[0]->b_edev] > 0 &&
timestamp >= ival_timer[args[0]->b_edev] + NSEC_PER_SEC/ {
    this->sn    = args[1]->dev_statname;
    this->p     = substr(args[1]->dev_pathname, 25, 16);
    this->p     = (strstr(this->p, "disk@") == 0 ||
                    strstr(this->p, "disk@") == "") ? this->p : "NA";

    /*
     * Measure operating latency range in uS for Reads and Writes,
     * storing largest observed difference.
     */
    @rangerlat[this->sn, this->p]   =
        max((maxlati[args[0]->b_edev, R] -
            minlati[args[0]->b_edev, R]) / NSEC_PER_USEC);
    @rangewlat[this->sn, this->p]   =
        max((maxlati[args[0]->b_edev, W] -
            minlati[args[0]->b_edev, W]) / NSEC_PER_USEC);

    @avpending[this->sn, this->p]   = avg(pend[args[0]->b_edev]);
    @maxpending[this->sn, this->p]  = max(pend[args[0]->b_edev]);

    /* Reset various counters for next measurement period */
    minlati[args[0]->b_edev, R] = 0;
    maxlati[args[0]->b_edev, R] = 0;
    minlati[args[0]->b_edev, W] = 0;
    maxlati[args[0]->b_edev, W] = 0;
    iocnt[args[0]->b_edev, R]   = 0;
    iocnt[args[0]->b_edev, W]   = 0;
    miniosz[args[0]->b_edev]    = 0;
    maxiosz[args[0]->b_edev]    = 0;
    minratei[args[0]->b_edev]   = 0;
    maxratei[args[0]->b_edev]   = 0;
    ival_timer[args[0]->b_edev]         = timestamp;
}

/* 
 * Count number of retries issued to a disk. These are a good
 * indicator of potentially failing, or borderline device.
 * Under normal circumstances we should not expect
 * this to be a positive value.
 */
::sd_set_retry_bp:entry
/xlate <devinfo_t *>(args[1])->dev_pathname != "<nfs>" &&
xlate <devinfo_t *>(args[1])->dev_pathname != "" / {
    this->sn    = xlate <devinfo_t *>(args[1])->dev_statname;
    this->p     = substr(xlate <devinfo_t *>(args[1])->dev_pathname, 25, 16);
    this->p     = (strstr(this->p, "disk@") == 0 ||
                    strstr(this->p, "disk@") == "") ? this->p : "NA";
    @ctretry[this->sn, this->p] = count();
}

tick-10sec {
/* First time we enter this clause, ticks will be 0, so we just assume 10 seconds */
this->elapsed = ticks > 0 ? (timestamp - ticks) / NSEC_PER_SEC : 10 ;

/* Normalize Data for correct per second reporting of rates, like IOPS */
normalize(@actv_tm, NSEC_PER_MSEC); /* from nanoseconds to milliseconds */
normalize(@iops, this->elapsed);

printa("%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
    @actv_tm, @rangerlat, @rangewlat, @totbytes, @tput, @maxtput, 
    @ctrd, @ctwr, @aviosz, @rangeiosz, @iops, @maxiops, @avKBps, @rangeKBps,
    @ratelt1MBps, @rate10MBps, @rate100MBps, @rate1GBps, @avtime, @maxtime,
    @timegt1000ms, @time100ms, @time50ms, @time25ms, @time10ms, @timelt1ms,
    @iosztiny, @iosz4k, @iosz8k, @iosz16k, @iosz32k, @iosz64k, @ioszbig,
    @avpending, @maxpending, @cterr, @ctretry);
    trunc(@actv_tm); trunc(@rangerlat); trunc(@rangewlat); trunc(@totbytes);
    trunc(@tput); trunc(@maxtput); trunc(@ctrd); trunc(@ctwr); trunc(@aviosz);
    trunc(@rangeiosz); trunc(@iops); trunc(@maxiops); trunc(@avKBps); 
    trunc(@rangeKBps); trunc(@ratelt1MBps); trunc(@rate10MBps); trunc(@rate100MBps); trunc(@rate1GBps); trunc(@avtime); trunc(@maxtime);
    trunc(@iosztiny); trunc(@iosz4k); trunc(@iosz8k); trunc(@iosz16k); 
    trunc(@iosz32k); trunc(@iosz64k); trunc(@ioszbig);
    trunc(@avpending); trunc(@maxpending); trunc(@cterr); trunc(@ctretry);
    trunc(@timegt1000ms); trunc(@time100ms); trunc(@time50ms); trunc(@time25ms);
    trunc(@time10ms); trunc(@timelt1ms);
    ticks = timestamp ;
}
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=16M

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2017 Sam Zaydel / RackTop Systems.
 *
 * bw-tput-iops-actv-time-hist-with-ts-csv.d
 * This script is meant to be a more informative replacement for the
 * iostat utility. It measures some features that iostat in my opinion
 * is not useful for. For example, iostat does not offer distributions
 * at all, whereas here we plot distributions of IO rates as observed
 * for each disk. Also, iostat does not offer operating ranges for
 * measurements, which here we define as max - min of some measurement.
 *
 * Currently, script is limited to scsi_vhci devices, because they are
 * ones we normally use for actual data storage. Anything else is either
 * a locally attached device, or something not used by system, i.e.
 * USB storage media, etc.
 */
unsigned long minlati[dev_t, int], maxlati[dev_t, int]; /* Latency min and max */
unsigned long minratei[dev_t], maxratei[dev_t];         /* IO Rate min and max */
unsigned long miniosz[dev_t], maxiosz[dev_t];           /* IO size min and max */
unsigned long iocnt[dev_t, int];
int pend[dev_t];
hrtime_t start[dev_t, uint64_t];
hrtime_t ival_timer[dev_t];
hrtime_t ticks;
inline const int NSEC_PER_SEC = 1000000000;
inline const int NSEC_PER_MSEC = 1000000;
inline const int NSEC_PER_USEC = 1000;
inline const int SPA_MAXBLOCKSIZE = 128 << 10;
inline const int R = 0;
inline const int W = 1;

/*
 * These are the Output Parameter definitions for collected metrics
 *
 * sdname == Name of device, i.e. sd0
 * mpxiowwn == mpxio device, like: 5000cca24500f698
 * actvtm == amount of real, busy time spent processing IO
 * rangerlat, rangewlat == latency range: max - min for Reads and Writes
 * totbytes == total number of bytes Read and Written during sample interval
 * tput,maxtput == mean and maximum or burst throughput
 * ctrd,ctwr == Count of Reads and Writes over interval set in tick-Xsec clause
 * aviosz,rangeiosz == IO size mean and range: max - min
 * iops,maxiops == normalized IOPS mean and maximum
 * avKBps,rangeKBps == normalized IO rate and IO rate range: max - min
 * // Histogram of IO Rate distribution with 4 buckets //
 * [ ratelt1MBps  ] ==> 1 second interval with < 1,000KB/s bw
 * [ rate10MBps   ] ==> 1 second interval with < 10,000KB/s bw
 * [ rate100MBps  ] ==> 1 second interval with < 100,000KB/s bw
 * [ rate1GBps    ] ==> 1 second interval with < 1,000,000KB/s bw
 * avtime,maxtime == average and maximum IO completion latency
 * // Histogram of latency distribution with 6 buckets //
 * [ timegt1000ms ] ==> >1 second (SLOW!)
 * [ time100ms    ] ==> >50ms && 100ms<=    (not acceptable)
 * [ time50ms     ] ==> >25ms && 50ms<=     (acceptable)
 * [ time25ms     ] ==> >10ms && 25ms<=     (ok)
 * [ time10ms     ] ==> >1ms && 10ms<=      (good)
 * [ timelt1ms    ] ==> 1ms<                (probably cache)
 * // Histogram of IO size distribution with 7 buckets //
 * [ iosztiny     ] ==> 4K<
 * [ iosz4k       ] ==> between 4K and 8K
 * [ iosz8k       ] ==> between 8K and 16K
 * [ iosz16k      ] ==> between 16K and 32K
 * [ iosz32k      ] ==> between 32K and 64K
 * [ iosz64k      ] ==> between 64K and 128K
 * [ ioszbig      ] ==> anything 128K or above
 * avpending    == Average measured IOs in processing per sample interval
 * maxpending   == Maximum measured IOs in processing per sample interval
 * cterr        == Counter tracking number of errors per sample interval
 * ctretry      == Counter tracking number of retries per sample interval
 */

BEGIN {
    printf("timestamp,sdname,mpxiowwn,actvtm,rangerlat,rangewlat,totbytes,tput,maxtput,ctrd,ctwr,aviosz,rangeiosz,iops,maxiops,avKBps,rangeKBps,ratelt1MBps,rate10MBps,rate100MBps,rate1GBps,avtime,maxtime,timegt1000ms,time100ms,time50ms,time25ms,time10ms,timelt1ms,iosztiny,iosz4k,iosz8k,iosz16k,iosz32k,iosz64k,ioszbig,avpending,maxpending,cterr,ctretry\n");
    interval = walltimestamp - (walltimestamp%1000000000);
}

/* 
 * Set interval timer initial value once for a given device.
 * Each subsequent update will happen roughly once per second,
 * or if no IO, whenever there is some IO that triggers io:::done
 * probe, at which point we determine that timer expired.
 */
io:::start
/ival_timer[args[0]->b_edev] == 0/  {
    ival_timer[args[0]->b_edev] = timestamp;
}

io:::start {
    start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
    /* Increment pending IOs by 1 */
    pend[args[0]->b_edev]++;
}

io:::done
/start[args[0]->b_edev, args[0]->b_blkno]/ {
    this->sn    = args[1]->dev_statname;
    this->p     = substr(args[1]->dev_pathname, 25, 16);
    this->p     = (strstr(this->p, "disk@") == 0 ||
                    strstr(this->p, "disk@") == "") ? this->p : "NA";
    this->x     = args[0]->b_bcount * 976562;
    this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno];
    start[args[0]->b_edev, args[0]->b_blkno] = 0;
    
    /* Decrement pending IOs by 1, set to 0 if value < 0 */
    pend[args[0]->b_edev]--;
    pend[args[0]->b_edev] = 
        pend[args[0]->b_edev] > 0 ?
        pend[args[0]->b_edev] : 0; /* avoid underflow */

    /* Total Number of bytes per device */
    @totbytes[interval, this->sn, this->p]    = sum(args[0]->b_bcount);
    /* Total nanoseconds of active time per device */
    @actv_tm[interval, this->sn, this->p]     = sum(this->elapsed);

    /* 
     * Instead of converting nanoseconds to seconds we multiply
     * the top by NSEC_PER_SEC and the divide by delta in nanoseconds.
     * In extreme observations, i.e. stalled IO, we may have huge
     * this->elapsed values, in which case result will be 0 KB/s, even
     * if there in fact was a non-zero value of bytes transferred.
     */
    this->b_nsec = args[0]->b_bcount * NSEC_PER_SEC;
    this->kb_per_sec = (this->b_nsec / this->elapsed) >> 10;

    /* Measure IO rate range in KB/s */
    @rangeKBps[interval, this->sn, this->p]     =
        max(maxratei[args[0]->b_edev] - minratei[args[0]->b_edev]);

    /* IO Rate histogram base10, limited at 1000000 KB/s | 976 MB/s */
    /* 1000KB/s <= */
    @ratelt1MBps[interval, this->sn, this->p]     = sum(this->kb_per_sec <= 0x3e8 ? 1 : 0);
    /* > 1000KB/s && 10,000KB/s <= */ 
    @rate10MBps[interval, this->sn, this->p]      = sum(this->kb_per_sec > 0x3e8 &&
                                            this->kb_per_sec <= 0x2710 ? 1 : 0);
    /* > 10,000KB/s && 100,000KB/s <= */ 
    @rate100MBps[interval, this->sn, this->p]     = sum(this->kb_per_sec > 0x2710 &&
                                            this->kb_per_sec <= 0x186a0 ? 1 : 0);
    /* > 100,000KB/s && 1,000,000 KB/s <= */ 
    @rate1GBps[interval, this->sn, this->p]       = sum(this->kb_per_sec > 0x186a0 &&
                                            this->kb_per_sec <= 0xf4240 ? 1 : 0);

    /*
     * Collect minimum and maximum observed rate for later measurement
     * of range for this metric.
     */
    minratei[args[0]->b_edev] = minratei[args[0]->b_edev] == 0 ?
        this->kb_per_sec : minratei[args[0]->b_edev] < this->kb_per_sec ?
        minratei[args[0]->b_edev] : this->kb_per_sec;

    maxratei[args[0]->b_edev] = maxratei[args[0]->b_edev] == 0 ?
        this->kb_per_sec : maxratei[args[0]->b_edev] > this->kb_per_sec ?
        maxratei[args[0]->b_edev] : this->kb_per_sec;
    
    /* Actual Kbytes/sec per device */
    @avKBps[interval, this->sn, this->p]  = avg(this->kb_per_sec);
    
    /* Average and Maximum Latency per device */
    @avtime[interval, this->sn, this->p]  = avg(this->elapsed);
    @maxtime[interval, this->sn, this->p] = max(this->elapsed);
    /*
     * Latency histogram with buckets:
     * >1000ms, >50 to 100ms, >25 to 50ms, >10 to 25ms, >1 to 10ms, 1ms<
     */
    @timegt1000ms[interval, this->sn, this->p] = sum(
        this->elapsed >= 1 * NSEC_PER_SEC ? 1 : 0);
    @time100ms[interval, this->sn, this->p] = sum(
        this->elapsed > 50 * NSEC_PER_MSEC &&
        this->elapsed <= 100 * NSEC_PER_MSEC ? 1 : 0);
    @time50ms[interval, this->sn, this->p] = sum(
        this->elapsed > 25 * NSEC_PER_MSEC &&
        this->elapsed <= 50 * NSEC_PER_MSEC ? 1 : 0);
    @time25ms[interval, this->sn, this->p] = sum(
        this->elapsed > 10 * NSEC_PER_MSEC &&
        this->elapsed <= 25 * NSEC_PER_MSEC ? 1 : 0);
    @time10ms[interval, this->sn, this->p] = sum(
        this->elapsed > 1 * NSEC_PER_MSEC &&
        this->elapsed <= 10 * NSEC_PER_MSEC ? 1 : 0);
    @timelt1ms[interval, this->sn, this->p] = sum(
        this->elapsed < 1 * NSEC_PER_MSEC ? 1 : 0);

    /*
     * Collect minimum and maximum io size for later measurement
     * of range for this metric.
     */
    miniosz[args[0]->b_edev] =
        args[0]->b_bcount < miniosz[args[0]->b_edev] ?
        args[0]->b_bcount : miniosz[args[0]->b_edev];
    maxiosz[args[0]->b_edev] =
        args[0]->b_bcount > maxiosz[args[0]->b_edev] ?
        args[0]->b_bcount : maxiosz[args[0]->b_edev];

    /* Measure IO size range in Kilobytes */
    @rangeiosz[interval, this->sn, this->p] = 
        max((maxiosz[args[0]->b_edev] - miniosz[args[0]->b_edev]) >> 10);

    /* 
     * Convert from bytes and nanoseconds to KB/s with 976562 to obtain
     * avg. effective throughput, and maximum effective throughput.
     * Maximum throughput is likely measuring cache effects.
     */
    @tput[interval, this->sn, this->p] = avg(this->x / this->elapsed);
    @maxtput[interval, this->sn, this->p] = max(this->x / this->elapsed);
    iocnt[args[0]->b_edev, R] += args[0]->b_flags & B_READ ? 1 : 0;
    iocnt[args[0]->b_edev, W] += args[0]->b_flags & B_WRITE ? 1 : 0;

    /* Count number of IOs by IO-type */
    @ctrd[interval, this->sn, this->p]        = sum(args[0]->b_flags & B_READ ? 1 : 0);
    @ctwr[interval, this->sn, this->p]        = sum(args[0]->b_flags & B_WRITE ? 1 : 0);
    @iops[interval, this->sn, this->p]        = 
        count(); /* Normalized to per second in tick-X probe */
    
    /* Maximum sampled IOPS per device */
    @maxiops[interval, this->sn, this->p]     = 
        max(iocnt[args[0]->b_edev, R] + iocnt[args[0]->b_edev, W]);

    /* 
     * Collect minimum and maximum latency for later measurement
     * of range for this metric.
     */
    minlati[args[0]->b_edev, R] =
        args[0]->b_flags & B_READ &&
        this->elapsed < minlati[args[0]->b_edev, R] ? this->elapsed :
        minlati[args[0]->b_edev, R];
    maxlati[args[0]->b_edev, R] =
        args[0]->b_flags & B_READ &&
        this->elapsed > maxlati[args[0]->b_edev, R] ? this->elapsed :
        maxlati[args[0]->b_edev, R];
    
    minlati[args[0]->b_edev, W] =
        (args[0]->b_flags & B_READ) == 0 &&
        this->elapsed < minlati[args[0]->b_edev, W] ? this->elapsed :
        minlati[args[0]->b_edev, W];
    maxlati[args[0]->b_edev, W] =
        (args[0]->b_flags & B_READ) == 0 &&
        this->elapsed > maxlati[args[0]->b_edev, W] ? this->elapsed :
        maxlati[args[0]->b_edev, W];
    
    /*
     * IOsize distribution not grouped by direction, i.e. no distinction
     * is made between reads and writes. IO buckets double in size from
     * previous bucket. i.e. 4, 8, 16, 32...
     */
     this->bs = args[0]->b_bcount ;
    /* 4K< */
    @iosztiny[interval, this->sn, this->p]    =
        sum(this->bs < 0x1000 ? 1 : 0);
    /* 4K to 8K< */
    @iosz4k[interval, this->sn, this->p]      =
        sum(this->bs >= 0x1000 && this->bs < 0x2000 ? 1 : 0);
    /* 8K to 16K< */
    @iosz8k[interval, this->sn, this->p]      =
        sum(this->bs >= 0x2000 && this->bs < 0x4000 ? 1 : 0);
    /* 16K to 32K< */
    @iosz16k[interval, this->sn, this->p]     =
        sum(this->bs >= 0x4000 && this->bs < 0x8000 ? 1 : 0);
    /* 32K to 64K< */
    @iosz32k[interval, this->sn, this->p]     =
        sum(this->bs >= 0x8000 && this->bs < 0x10000 ? 1 : 0);
    /* 64K to 128K< */
    @iosz64k[interval, this->sn, this->p]     =
        sum(this->bs >= 0x10000 && this->bs < 0x20000 ? 1 : 0);
    /* >128K */
    @ioszbig[interval, this->sn, this->p]     =
        sum(this->bs >= 0x20000 ? 1 : 0);

    /* Average IO size for given device */
    @aviosz[interval, this->sn, this->p]      = avg(this->bs);

    /* 
     * Each time we observe an error at completion through B_ERROR flag,
     * increment count of errors for given device. This should always
     * be zero, assuming healthy device.
     */
    @cterr[interval, this->sn, this->p] = sum(args[0]->b_flags & B_ERROR ? 1 : 0);
}

/* 
 * Entry controlled by timer. By design, each device will be registered
 * here about once per second when there is even a litte bit of IO.
 */
io:::done
/ival_timer[args[0]->b_edev] > 0 &&
timestamp >= ival_timer[args[0]->b_edev] + NSEC_PER_SEC/ {
    this->sn    = args[1]->dev_statname;
    this->p     = substr(args[1]->dev_pathname, 25, 16);
    this->p     = (strstr(this->p, "disk@") == 0 ||
                    strstr(this->p, "disk@") == "") ? this->p : "NA";

    /*
     * Measure operating latency range in uS for Reads and Writes,
     * storing largest observed difference.
     */
    @rangerlat[interval, this->sn, this->p]   =
        max((maxlati[args[0]->b_edev, R] -
            minlati[args[0]->b_edev, R]) / NSEC_PER_USEC);
    @rangewlat[interval, this->sn, this->p]   =
        max((maxlati[args[0]->b_edev, W] -
            minlati[args[0]->b_edev, W]) / NSEC_PER_USEC);

    @avpending[interval, this->sn, this->p]   = avg(pend[args[0]->b_edev]);
    @maxpending[interval, this->sn, this->p]  = max(pend[args[0]->b_edev]);

    /* Reset various counters for next measurement period */
    minlati[args[0]->b_edev, R] = 0;
    maxlati[args[0]->b_edev, R] = 0;
    minlati[args[0]->b_edev, W] = 0;
    maxlati[args[0]->b_edev, W] = 0;
    iocnt[args[0]->b_edev, R]   = 0;
    iocnt[args[0]->b_edev, W]   = 0;
    miniosz[args[0]->b_edev]    = 0;
    maxiosz[args[0]->b_edev]    = 0;
    minratei[args[0]->b_edev]   = 0;
    maxratei[args[0]->b_edev]   = 0;
    ival_timer[args[0]->b_edev]         = timestamp;
}

/* 
 * Count number of retries issued to a disk. These are a good
 * indicator of potentially failing, or borderline device.
 * Under normal circumstances we should not expect
 * this to be a positive value.
 */
::sd_set_retry_bp:entry
/xlate <devinfo_t *>(args[1])->dev_pathname != "<nfs>" &&
xlate <devinfo_t *>(args[1])->dev_pathname != "" / {
    this->sn    = xlate <devinfo_t *>(args[1])->dev_statname;
    this->p     = substr(xlate <devinfo_t *>(args[1])->dev_pathname, 25, 16);
    this->p     = (strstr(this->p, "disk@") == 0 ||
                    strstr(this->p, "disk@") == "") ? this->p : "NA";
    @ctretry[interval, this->sn, this->p] = count();
}

tick-10sec {
/* First time we enter this clause, ticks will be 0, so we just assume 10 seconds */
this->elapsed = ticks > 0 ? (timestamp - ticks) / NSEC_PER_SEC : 10 ;

/* Normalize Data for correct per second reporting of rates, like IOPS */
normalize(@actv_tm, NSEC_PER_MSEC); /* from nanoseconds to milliseconds */
normalize(@iops, this->elapsed);

printa("%ld,%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
    @actv_tm, @rangerlat, @rangewlat, @totbytes, @tput, @maxtput, 
    @ctrd, @ctwr, @aviosz, @rangeiosz, @iops, @maxiops, @avKBps, @rangeKBps,
    @ratelt1MBps, @rate10MBps, @rate100MBps, @rate1GBps, @avtime, @maxtime,
    @timegt1000ms, @time100ms, @time50ms, @time25ms, @time10ms, @timelt1ms,
    @iosztiny, @iosz4k, @iosz8k, @iosz16k, @iosz32k, @iosz64k, @ioszbig,
    @avpending, @maxpending, @cterr, @ctretry);
    trunc(@actv_tm); trunc(@rangerlat); trunc(@rangewlat); trunc(@totbytes);
    trunc(@tput); trunc(@maxtput); trunc(@ctrd); trunc(@ctwr); trunc(@aviosz);
    trunc(@rangeiosz); trunc(@iops); trunc(@maxiops); trunc(@avKBps); 
    trunc(@rangeKBps); trunc(@ratelt1MBps); trunc(@rate10MBps); trunc(@rate100MBps); trunc(@rate1GBps); trunc(@avtime); trunc(@maxtime);
    trunc(@iosztiny); trunc(@iosz4k); trunc(@iosz8k); trunc(@iosz16k); 
    trunc(@iosz32k); trunc(@iosz64k); trunc(@ioszbig);
    trunc(@avpending); trunc(@maxpending); trunc(@cterr); trunc(@ctretry);
    trunc(@timegt1000ms); trunc(@time100ms); trunc(@time50ms); trunc(@time25ms);
    trunc(@time10ms); trunc(@timelt1ms);
    ticks = timestamp ;
    interval = walltimestamp - (walltimestamp%1000000000);
}
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=10M
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2017 Sam Zaydel / RackTop Systems.
 *
 * bw-tput-iops-actv-time-hist.d
 * This script is meant to be a more informative replacement for the
 * iostat utility. It measures some features that iostat in my opinion
 * is not useful for. For example, iostat does not offer distributions
 * at all, whereas here we plot distributions of bandwidth as observed
 * for each disk. Also, iostat does not offer operating ranges for
 * measurements, which here we define as max - min of some measurement.
 */
unsigned long minlati[dev_t, int], maxlati[dev_t, int]; /* Latency min and max */
unsigned long miniosz[dev_t], maxiosz[dev_t]; /* IO size min and max */
unsigned long long bufcnt[dev_t], ts[dev_t];
unsigned long long iocnt[dev_t, int];
self unsigned long int tottime[dev_t];
inline const int NSEC_PER_SEC = 1000000000;
inline const int SPA_MAXBLOCKSIZE = 128 << 10;
inline const int R = 0;
inline const int W = 1;

io:::start /ts[args[0]->b_edev] == 0/  {
    ts[args[0]->b_edev] = timestamp;
    /*
     * We need to seed minimum value with something other than zero.
     * Because this value is never signed, zero is by definition smallest
     * value will result in range potentially spanning 0 => SPA_MAXBLOCKSIZE.
     * As soon as any IO SPA_MAXBLOCKSIZE< is encountered, this value will
     * update accordingly.
     */
    miniosz[args[0]->b_edev] = SPA_MAXBLOCKSIZE;
}
io:::start {
    start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
}
io:::done /start[args[0]->b_edev, args[0]->b_blkno]/ {
    this->sn    = args[1]->dev_statname; 
    this->p     = args[1]->dev_pathname;
    this->x     = args[0]->b_bcount * 976562;
    this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno];
    bufcnt[args[0]->b_edev] += args[0]->b_bcount ; /* sum of bytes */
    self->tottime[args[0]->b_edev] += this->elapsed;
    start[args[0]->b_edev, args[0]->b_blkno] = 0;
    
    /*
     * Collect minimum and maximum io size for later measurement
     * of range for this metric.
     */
    miniosz[args[0]->b_edev] =
        args[0]->b_bcount < miniosz[args[0]->b_edev] ?
        args[0]->b_bcount : miniosz[args[0]->b_edev];
    maxiosz[args[0]->b_edev] =
        args[0]->b_bcount > maxiosz[args[0]->b_edev] ?
        args[0]->b_bcount : maxiosz[args[0]->b_edev];
    /* 
     * Convert from bytes and nanoseconds to KB/s with 976562 to obtain
     * avg. effective throughput, and maximum effective throughput.
     * Maximum throughput is likely measuring cache effects.
     */
    @tput[this->sn, this->p] = avg(this->x / this->elapsed);
    @maxtput[this->sn, this->p] = max(this->x / this->elapsed);
    iocnt[args[0]->b_edev, R] += args[0]->b_flags & B_READ ? 1 : 0;
    iocnt[args[0]->b_edev, W] += args[0]->b_flags & B_READ ? 0 : 1;
    /* 
     * Collect minimum and maximum latency for later measurement
     * of range for this metric.
     */
    minlati[args[0]->b_edev, R] =
        args[0]->b_flags & B_READ &&
        this->elapsed < minlati[args[0]->b_edev, R] ? this->elapsed :
        minlati[args[0]->b_edev, R];
    maxlati[args[0]->b_edev, R] =
        args[0]->b_flags & B_READ &&
        this->elapsed > maxlati[args[0]->b_edev, R] ? this->elapsed :
        maxlati[args[0]->b_edev, R];
    
    minlati[args[0]->b_edev, W] =
        (args[0]->b_flags & B_READ) == 0 &&
        this->elapsed < minlati[args[0]->b_edev, W] ? this->elapsed :
        minlati[args[0]->b_edev, W];
    maxlati[args[0]->b_edev, W] =
        (args[0]->b_flags & B_READ) == 0 &&
        this->elapsed > maxlati[args[0]->b_edev, W] ? this->elapsed :
        maxlati[args[0]->b_edev, W];
}
io:::done
/timestamp >= ts[args[0]->b_edev] + NSEC_PER_SEC/ {
    this->sn    = args[1]->dev_statname; 
    this->p     = args[1]->dev_pathname;
    this->delta = timestamp - ts[args[0]->b_edev];
    /* 
     * Instead of converting nanoseconds to seconds we multiply
     * the top by NSEC_PER_SEC and the divide by delta in nanoseconds.
     */
    this->b_nsec = (bufcnt[args[0]->b_edev]) * NSEC_PER_SEC;
    this->kb_per_sec = (this->b_nsec / this->delta) >> 10;
    this->ioct = iocnt[args[0]->b_edev, R] + iocnt[args[0]->b_edev, W];
    this->iops = (this->ioct * NSEC_PER_SEC) / this->delta;
    /* Actual Kbytes/sec per device */
    @bw[this->sn, this->p]          = quantize(this->kb_per_sec);
    @avbw[this->sn, this->p]        = avg(this->kb_per_sec);
    @maxbw[this->sn, this->p]       = max(this->kb_per_sec);
    /* Total Number of bytes per device */
    @totbytes[this->sn, this->p]    = sum(bufcnt[args[0]->b_edev]);
    /* Total nanoseconds of active time per device */
    @actv_tm[this->sn, this->p]     = sum(self->tottime[args[0]->b_edev]);
    /* Average IOPS per device */
    @aviops[this->sn, this->p]      = avg(this->iops);
    @maxiops[this->sn, this->p]     = max(this->iops);
    /* Average IO size per device */
    @aviosz[this->sn, this->p]      = avg(args[0]->b_bcount);
    /* Count number of IOs by IO-type */
    @ctrd[this->sn, this->p]        = sum(iocnt[args[0]->b_edev, R]);
    @ctwr[this->sn, this->p]        = sum(iocnt[args[0]->b_edev, W]);

    /* Measure operating latency range in uS for Reads and Writes */
    @rangerlat[this->sn, this->p]   =
        max((maxlati[args[0]->b_edev, R] -
            minlati[args[0]->b_edev, R]) / 1000); /* largest difference stored */
    @rangewlat[this->sn, this->p]   =
        max((maxlati[args[0]->b_edev, W] -
            minlati[args[0]->b_edev, W]) / 1000); /* largest difference stored */

    @rangeiosz[this->sn, this->p]   =
        max((maxiosz[args[0]->b_edev] - miniosz[args[0]->b_edev]) >> 10);
    /* Reset various counters for next measurement period */
    minlati[args[0]->b_edev, R] = 0;
    maxlati[args[0]->b_edev, R] = 0;
    minlati[args[0]->b_edev, W] = 0;
    maxlati[args[0]->b_edev, W] = 0;
    iocnt[args[0]->b_edev, R] = 0;
    iocnt[args[0]->b_edev, W] = 0;
    bufcnt[args[0]->b_edev] = 0;

    self->tottime[args[0]->b_edev] = 0;
    ts[args[0]->b_edev] = timestamp;
}

END {
normalize(@actv_tm, 1000000); /* from nanoseconds to milliseconds */
normalize(@totbytes, 1024);   /* from bytes to kilobytes */
printa("\t   %s (%s)\n\t   Active Time: %@d(ms) | Lat. Range[max-min] Read: %@d(uS) / Write: %@d(uS)\n\t   Volume: %@d(KB) | Tput Avg: %@d(KB/s) / Max: %@d(KB/s)\n\t   IOs Read: %@d / Write: %@d | IO Size Avg: %@d(b) / Range[max-min]: %@d(KB)\n\t   IOPS Avg: %@d | Max: %@d | Bandwidth Avg: %@d(KB/s) | Max: %@d(KB/s)\n\t   Bandwidth Histogram(KB/s)%@d\n",
    @actv_tm, @rangerlat, @rangewlat, @totbytes, @tput, @maxtput, 
    @ctrd, @ctwr, @aviosz, @rangeiosz, @aviops, @maxiops, @avbw, @maxbw, @bw);
}
// Identify devices to which SYNC CACHE commands are being sent
dtrace -qn '::sd_send_scsi_SYNCHRONIZE_CACHE:entry {print(stringof(args[0]->un_sd->sd_inq->inq_vid))}'
#pragma D option quiet

BEGIN
{
        start = timestamp;
}

io:::start
{
        ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
}

io:::done
/ts[args[0]->b_edev, args[0]->b_lblkno]/
{
        this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
        this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ?
            "read " : "write ";

        @q[this->name] = quantize(this->delta);
        @a[this->name] = avg(this->delta);
        @v[this->name] = stddev(this->delta);
        @i[this->name] = count();
        @b[this->name] = sum(args[0]->b_bcount);

        ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}

END
{
        printa(@q);

        normalize(@i, (timestamp - start) / 1000000000);
        normalize(@b, (timestamp - start) / 1000000000 * 1024);

        printf("%-30s %11s %11s %11s %11s\n", "", "avg latency", "stddev",
            "iops", "throughput");
        printa("%-30s %@9uus %@9uus %@9u/s %@8uk/s\n", @a, @v, @i, @b);
}
/*
    # dtrace -s rw.d -c 'sleep 60'
    
      read
               value  ------------- Distribution ------------- count
                  32 |                                         0
                  64 |                                         23
                 128 |@                                        655
                 256 |@@@@                                     1638
                 512 |@@                                       743
                1024 |@                                        380
                2048 |@@@                                      1341
                4096 |@@@@@@@@@@@@                             5295
                8192 |@@@@@@@@@@@                              5033
               16384 |@@@                                      1297
               32768 |@@                                       684
               65536 |@                                        400
              131072 |                                         225
              262144 |                                         206
              524288 |                                         127
             1048576 |                                         19
             2097152 |                                         0        
    
      write
               value  ------------- Distribution ------------- count
                  32 |                                         0
                  64 |                                         47
                 128 |                                         469
                 256 |                                         591
                 512 |                                         327
                1024 |                                         924
                2048 |@                                        6734
                4096 |@@@@@@@                                  43416
                8192 |@@@@@@@@@@@@@@@@@                        102013
               16384 |@@@@@@@@@@                               60992
               32768 |@@@                                      20312
               65536 |@                                        6789
              131072 |                                         860
              262144 |                                         208
              524288 |                                         153
             1048576 |                                         36
             2097152 |                                         0        
    
                                   avg latency      stddev        iops  throughput
    write                              19442us     32468us      4064/s   261889k/s
    read                               23733us     88206us       301/s    13113k/s
*/
# Get a quick sense of the size(s) of IO, quantized, grouped by (device, Read/Write Ops).
dtrace -n 'io:::start {@[args[1]->dev_pathname, args[0]->b_flags & B_READ ? "READ" : "WRITE"] = quantize(args[0]->b_bcount);}'
dtrace -qn '
  int activ[string];
  BEGIN {
    start = timestamp;
  }

  io:::start {
    ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
  }

  io:::done
  /ts[args[0]->b_edev, args[0]->b_lblkno]/ {
    this->delta = 
        (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
    this->name = 
        (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ?
        "read" : "write";

    /* Convert bytes to KB, to measure microseconds per KB */
    this->kb = (args[0]->b_bcount>>10) ? (args[0]->b_bcount>>10) > 0 : 1;

    @q[this->name] = quantize(this->delta);
    /* Above graph should support this mean value. Largest count on histogram
     * is expected to hover in the three buckets closest to the mean value.
     */
    @a[this->name] = avg(this->delta);
    @v[this->name] = stddev(this->delta);
    @i[this->name] = count();
    @b[this->name] = sum(args[0]->b_bcount);
    /* Time spent per KB of IO. This is an unusual metric, but may prove
     * useful at some point.
     */
    @kbt[this->name] = quantize(this->delta / this->kb);

    ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
  }

  END {
    this->elapsed = (timestamp - start) / 1000000000;
    printa("\tOP: %s\n\ttime spent/KB (us): %@d\n", @kbt);
    printa("\tOP: %s\n\tlatency (us): %@d\n", @q);
    /* When there is very little IO of a given type, normalization can create
     * an unacceptable degree of error. We are normalizing over some period of
     * time, and if we have lots of IO for the given command over a short
     * interval but the script runs for a much longer period of time, 
     * we will normalize over that time period, which can create incorrect 
     * and perhaps confusing results.
     */
    normalize(@i, this->elapsed);
    normalize(@b, this->elapsed * 1024);

    printf("%-30s %11s %11s %11s %11s\n", "", "avg.latency", "stddev",
        "IOPS", "throughput");
    printa("%-30s %@9uus %@9uus %@9u/s %@8uKB/s\n", @a, @v, @i, @b);
  }'
// Is IO aligned on 4K boundary? This is critical for SSDs, and could be impacting
// performance to a lesser degree of mechanical devices.
dtrace -qn '
    io:::start {
        @[args[0]->b_flags & B_WRITE ? "W" : "R"] = quantize(args[0]->b_blkno % 8);
    }
    END {
        printa("OP: (%s) %@d\n", @)
    }'
dtrace -qn 'BEGIN {
x[123] = 0;
}
io:::start {
    x[123] = 1;
    start[arg0] = timestamp;
}
io:::done /this->start = start[arg0]/ {
    @delta[args[0]->b_flags & B_WRITE ? "WRITE" : "READ",  args[1]->dev_pathname] = quantize((timestamp - this->start) / 1000); self->ts = 0;
    start[arg0] = 0;
}
tick-30sec /x[123]/ {
    printa("%12s (us) Device: %s %@d\n", @delta);
    trunc(@delta);
    x[123] = 0;
}'
// Measure throughput per device and get some useful statistics
// in a form of a summary that looks like this:
// +=======================================+
// |=============== Summary ===============|
// +=======================================+
// Start Time: 2017 Jul 13 18:35:46
// Human Time:   3.972 seconds
// Active (All drives):   5.460 seconds
// Spent   0% of elapsed time Reading
// Spent 100% of elapsed time Writing
// Average Write throughput: 76993KB/sec
// Burst   Write throughput: 468351KB/sec
// =========================================
//
dtrace -qn '
  inline const int NSEC_IN_SEC       = 1000000000;
  BEGIN {
      starttime = walltimestamp;
      runtime   = timestamp;
      busytime  = 0;
      readtime  = 0;
      writetime = 0;
  }
  io:::start {
    start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
  }
  io:::done
  /start[args[0]->b_edev, args[0]->b_blkno]/ {
    this->devp = strtok(args[1]->dev_pathname, ":");
    this->elapsed = timestamp - 
            start[args[0]->b_edev, args[0]->b_blkno];
    /* 
     * Odd looking calculation below is to go from bytes and nanoseconds 
     * to kilobytes(KB) and seconds. bytes/ns => KB/sec.
     */
    @cumtp[args[0]->b_flags & B_READ ? "Read" : "Write"] =
      avg((args[0]->b_bcount * 976562) / this->elapsed);
    @cumtp_max[args[0]->b_flags & B_READ ? "Read" : "Write"] =
      max((args[0]->b_bcount * 976562) / this->elapsed);
    @tp[args[1]->dev_statname, this->devp] =
      quantize((args[0]->b_bcount * 976562) / this->elapsed);

    readtime    += args[0]->b_flags & B_READ ? this->elapsed : 0;
    writetime   += args[0]->b_flags & B_WRITE ? this->elapsed : 0;

    start[args[0]->b_edev, args[0]->b_blkno] = 0;
  }
  END {
    runtime = timestamp - runtime;
    /* 
     * Small amount of rounding-off error is possible here, 
     * so the two percentages may not add-up to 100% exactly.
     */
    pctrd   = readtime * 100 / (readtime + writetime);
    pctwr   = writetime * 100 / (readtime + writetime);
    printf("+=======================================+\n");
    printf("|=============== Summary ===============|\n");
    printf("+=======================================+\n");
    printf("Start Time: %Y\n", starttime);
    printf("Human Time: %3d.%03d seconds\n", 
            runtime / NSEC_IN_SEC, 
            (runtime / 1000) % 1000);
    printf("Active (All drives): %3d.%03d seconds\n", 
            (readtime+writetime) / NSEC_IN_SEC,
            ((readtime+writetime) / 1000) % 1000);
    printf("Spent %3d%% of elapsed time Reading\n", pctrd);
    printf("Spent %3d%% of elapsed time Writing\n", pctwr);
    printa("Average %-5s throughput: %@dKB/sec\n", @cumtp);
    printa("Burst   %-5s throughput: %@dKB/sec\n", @cumtp_max);
    printf("=========================================\n\n");
    printa("  %s (%s)\n\t   KB/s%@d\n", @tp);
  }'
#!/usr/sbin/dtrace -s
#pragma D option quiet

BEGIN
{
        start = timestamp;
}
io:::start
{
        ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
}
io:::done
/ts[args[0]->b_edev, args[0]->b_lblkno]/
{
        this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
        this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ?
            "read " : "write ";
        @q[this->name] = quantize(this->delta);
        @a[this->name] = avg(this->delta);
        @v[this->name] = stddev(this->delta);
        @i[this->name] = count();
        @b[this->name] = sum(args[0]->b_bcount);
        ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}
END
{
        printa(@q);
        normalize(@i, (timestamp - start) / 1000000000);
        normalize(@b, (timestamp - start) / 1000000000 * 1024);
        printf("%-30s %11s %11s %11s %11s\n", "", "avg latency", "stddev",
            "iops", "throughput");
        printa("%-30s %@9uus %@9uus %@9u/s %@8uk/s\n", @a, @v, @i, @b);
}
#!/usr/sbin/dtrace -s
/*
 * sdretry.d
 *
 * Example script from Chapter 4 of the book: DTrace: Dynamic Tracing in
 * Oracle Solaris, Mac OS X, and FreeBSD", by Brendan Gregg and Jim Mauro,
 * Prentice Hall, 2011. ISBN-10: 0132091518. http://dtracebook.com.
 * 
 * See the book for the script description and warnings. Many of these are
 * provided as example solutions, and will need changes to work on your OS.
 */

#pragma D option quiet

dtrace:::BEGIN
{
	printf("Tracing... output every 10 seconds.\n");
}

fbt::sd_set_retry_bp:entry
{
	@[xlate <devinfo_t *>(args[1])->dev_statname,
	    xlate <devinfo_t *>(args[1])->dev_major,
	    xlate <devinfo_t *>(args[1])->dev_minor] = count();
}

tick-10sec
{
	printf("\n%Y:\n", walltimestamp);
	printf("%28s  %-3s,%-4s  %s\n", "DEVICE", "MAJ", "MIN", "RETRIES");
	printa("%28s  %-03d,%-4d  %@d\n", @);
	trunc(@);
}