szaydel
1/19/2018 - 5:26 PM

Dtrace script for getting detailed IO metrics on BrickstorOS

Leveraging io:::start, io:::done probes for collecting lower-level IO metrics. Various measurements are made, including latency ranges, bytes processed, throughput, various rates, IO counts by IO type, etc.

#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=16M

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2017 Sam Zaydel / RackTop Systems.
 *
 * bw-tput-iops-actv-time-hist-with-ts-csv.d
 * This script is meant to be a more informative replacement for the
 * iostat utility. It measures some features that iostat in my opinion
 * is not useful for. For example, iostat does not offer distributions
 * at all, whereas here we plot distributions of IO rates as observed
 * for each disk. Also, iostat does not offer operating ranges for
 * measurements, which here we define as max - min of some measurement.
 *
 * Currently, script is limited to scsi_vhci devices, because they are
 * ones we normally use for actual data storage. Anything else is either
 * a locally attached device, or something not used by system, i.e.
 * USB storage media, etc.
 */
unsigned long minlati[dev_t, int], maxlati[dev_t, int]; /* Latency min and max */
unsigned long minratei[dev_t], maxratei[dev_t];         /* IO Rate min and max */
unsigned long miniosz[dev_t], maxiosz[dev_t];           /* IO size min and max */
unsigned long iocnt[dev_t, int];
int pend[dev_t];
hrtime_t start[dev_t, uint64_t];
hrtime_t ival_timer[dev_t];
hrtime_t ticks;
inline const int NSEC_PER_SEC = 1000000000;
inline const int NSEC_PER_MSEC = 1000000;
inline const int NSEC_PER_USEC = 1000;
inline const int SPA_MAXBLOCKSIZE = 128 << 10;
inline const int R = 0;
inline const int W = 1;

/*
 * These are the Output Parameter definitions for collected metrics
 *
 * sdname == Name of device, i.e. sd0
 * mpxiowwn == mpxio device, like: 5000cca24500f698
 * actvtm == amount of real, busy time spent processing IO
 * rangerlat, rangewlat == latency range: max - min for Reads and Writes
 * totbytes == total number of bytes Read and Written during sample interval
 * tput,maxtput == mean and maximum or burst throughput
 * ctrd,ctwr == Count of Reads and Writes over interval set in tick-Xsec clause
 * aviosz,rangeiosz == IO size mean and range: max - min
 * iops,maxiops == normalized IOPS mean and maximum
 * avKBps,rangeKBps == normalized IO rate and IO rate range: max - min
 * // Histogram of IO Rate distribution with 4 buckets //
 * [ ratelt1MBps  ] ==> 1 second interval with < 1,000KB/s bw
 * [ rate10MBps   ] ==> 1 second interval with < 10,000KB/s bw
 * [ rate100MBps  ] ==> 1 second interval with < 100,000KB/s bw
 * [ rate1GBps    ] ==> 1 second interval with < 1,000,000KB/s bw
 * avtime,maxtime == average and maximum IO completion latency
 * // Histogram of latency distribution with 6 buckets //
 * [ timegt1000ms ] ==> >1 second (SLOW!)
 * [ time100ms    ] ==> >50ms && 100ms<=    (not acceptable)
 * [ time50ms     ] ==> >25ms && 50ms<=     (acceptable)
 * [ time25ms     ] ==> >10ms && 25ms<=     (ok)
 * [ time10ms     ] ==> >1ms && 10ms<=      (good)
 * [ timelt1ms    ] ==> 1ms<                (probably cache)
 * // Histogram of IO size distribution with 7 buckets //
 * [ iosztiny     ] ==> 4K<
 * [ iosz4k       ] ==> between 4K and 8K
 * [ iosz8k       ] ==> between 8K and 16K
 * [ iosz16k      ] ==> between 16K and 32K
 * [ iosz32k      ] ==> between 32K and 64K
 * [ iosz64k      ] ==> between 64K and 128K
 * [ ioszbig      ] ==> anything 128K or above
 * avpending    == Average measured IOs in processing per sample interval
 * maxpending   == Maximum measured IOs in processing per sample interval
 * cterr        == Counter tracking number of errors per sample interval
 * ctretry      == Counter tracking number of retries per sample interval
 */

BEGIN {
    printf("timestamp,sdname,mpxiowwn,actvtm,rangerlat,rangewlat,totbytes,tput,maxtput,ctrd,ctwr,aviosz,rangeiosz,iops,maxiops,avKBps,rangeKBps,ratelt1MBps,rate10MBps,rate100MBps,rate1GBps,avtime,maxtime,timegt1000ms,time100ms,time50ms,time25ms,time10ms,timelt1ms,iosztiny,iosz4k,iosz8k,iosz16k,iosz32k,iosz64k,ioszbig,avpending,maxpending,cterr,ctretry\n");
    interval = walltimestamp - (walltimestamp%1000000000);
}

/* 
 * Set interval timer initial value once for a given device.
 * Each subsequent update will happen roughly once per second,
 * or if no IO, whenever there is some IO that triggers io:::done
 * probe, at which point we determine that timer expired.
 */
io:::start
/ival_timer[args[0]->b_edev] == 0/  {
    ival_timer[args[0]->b_edev] = timestamp;
}

io:::start {
    start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
    /* Increment pending IOs by 1 */
    pend[args[0]->b_edev]++;
}

io:::done
/start[args[0]->b_edev, args[0]->b_blkno]/ {
    this->sn    = args[1]->dev_statname;
    this->p     = substr(args[1]->dev_pathname, 25, 16);
    this->p     = (strstr(this->p, "disk@") == 0 ||
                    strstr(this->p, "disk@") == "") ? this->p : "NA";
    this->x     = args[0]->b_bcount * 976562;
    this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno];
    start[args[0]->b_edev, args[0]->b_blkno] = 0;
    
    /* Decrement pending IOs by 1, set to 0 if value < 0 */
    pend[args[0]->b_edev]--;
    pend[args[0]->b_edev] = 
        pend[args[0]->b_edev] > 0 ?
        pend[args[0]->b_edev] : 0; /* avoid underflow */

    /* Total Number of bytes per device */
    @totbytes[interval, this->sn, this->p]    = sum(args[0]->b_bcount);
    /* Total nanoseconds of active time per device */
    @actv_tm[interval, this->sn, this->p]     = sum(this->elapsed);

    /* 
     * Instead of converting nanoseconds to seconds we multiply
     * the top by NSEC_PER_SEC and the divide by delta in nanoseconds.
     * In extreme observations, i.e. stalled IO, we may have huge
     * this->elapsed values, in which case result will be 0 KB/s, even
     * if there in fact was a non-zero value of bytes transferred.
     */
    this->b_nsec = args[0]->b_bcount * NSEC_PER_SEC;
    this->kb_per_sec = (this->b_nsec / this->elapsed) >> 10;

    /* Measure IO rate range in KB/s */
    @rangeKBps[interval, this->sn, this->p]     =
        max(maxratei[args[0]->b_edev] - minratei[args[0]->b_edev]);

    /* IO Rate histogram base10, limited at 1000000 KB/s | 976 MB/s */
    /* 1000KB/s <= */
    @ratelt1MBps[interval, this->sn, this->p]     = sum(this->kb_per_sec <= 0x3e8 ? 1 : 0);
    /* > 1000KB/s && 10,000KB/s <= */ 
    @rate10MBps[interval, this->sn, this->p]      = sum(this->kb_per_sec > 0x3e8 &&
                                            this->kb_per_sec <= 0x2710 ? 1 : 0);
    /* > 10,000KB/s && 100,000KB/s <= */ 
    @rate100MBps[interval, this->sn, this->p]     = sum(this->kb_per_sec > 0x2710 &&
                                            this->kb_per_sec <= 0x186a0 ? 1 : 0);
    /* > 100,000KB/s && 1,000,000 KB/s <= */ 
    @rate1GBps[interval, this->sn, this->p]       = sum(this->kb_per_sec > 0x186a0 &&
                                            this->kb_per_sec <= 0xf4240 ? 1 : 0);

    /*
     * Collect minimum and maximum observed rate for later measurement
     * of range for this metric.
     */
    minratei[args[0]->b_edev] = minratei[args[0]->b_edev] == 0 ?
        this->kb_per_sec : minratei[args[0]->b_edev] < this->kb_per_sec ?
        minratei[args[0]->b_edev] : this->kb_per_sec;

    maxratei[args[0]->b_edev] = maxratei[args[0]->b_edev] == 0 ?
        this->kb_per_sec : maxratei[args[0]->b_edev] > this->kb_per_sec ?
        maxratei[args[0]->b_edev] : this->kb_per_sec;
    
    /* Actual Kbytes/sec per device */
    @avKBps[interval, this->sn, this->p]  = avg(this->kb_per_sec);
    
    /* Average and Maximum Latency per device */
    @avtime[interval, this->sn, this->p]  = avg(this->elapsed);
    @maxtime[interval, this->sn, this->p] = max(this->elapsed);
    /*
     * Latency histogram with buckets:
     * >1000ms, >50 to 100ms, >25 to 50ms, >10 to 25ms, >1 to 10ms, 1ms<
     */
    @timegt1000ms[interval, this->sn, this->p] = sum(
        this->elapsed >= 1 * NSEC_PER_SEC ? 1 : 0);
    @time100ms[interval, this->sn, this->p] = sum(
        this->elapsed > 50 * NSEC_PER_MSEC &&
        this->elapsed <= 100 * NSEC_PER_MSEC ? 1 : 0);
    @time50ms[interval, this->sn, this->p] = sum(
        this->elapsed > 25 * NSEC_PER_MSEC &&
        this->elapsed <= 50 * NSEC_PER_MSEC ? 1 : 0);
    @time25ms[interval, this->sn, this->p] = sum(
        this->elapsed > 10 * NSEC_PER_MSEC &&
        this->elapsed <= 25 * NSEC_PER_MSEC ? 1 : 0);
    @time10ms[interval, this->sn, this->p] = sum(
        this->elapsed > 1 * NSEC_PER_MSEC &&
        this->elapsed <= 10 * NSEC_PER_MSEC ? 1 : 0);
    @timelt1ms[interval, this->sn, this->p] = sum(
        this->elapsed < 1 * NSEC_PER_MSEC ? 1 : 0);

    /*
     * Collect minimum and maximum io size for later measurement
     * of range for this metric.
     */
    miniosz[args[0]->b_edev] =
        args[0]->b_bcount < miniosz[args[0]->b_edev] ?
        args[0]->b_bcount : miniosz[args[0]->b_edev];
    maxiosz[args[0]->b_edev] =
        args[0]->b_bcount > maxiosz[args[0]->b_edev] ?
        args[0]->b_bcount : maxiosz[args[0]->b_edev];

    /* Measure IO size range in Kilobytes */
    @rangeiosz[interval, this->sn, this->p] = 
        max((maxiosz[args[0]->b_edev] - miniosz[args[0]->b_edev]) >> 10);

    /* 
     * Convert from bytes and nanoseconds to KB/s with 976562 to obtain
     * avg. effective throughput, and maximum effective throughput.
     * Maximum throughput is likely measuring cache effects.
     */
    @tput[interval, this->sn, this->p] = avg(this->x / this->elapsed);
    @maxtput[interval, this->sn, this->p] = max(this->x / this->elapsed);
    iocnt[args[0]->b_edev, R] += args[0]->b_flags & B_READ ? 1 : 0;
    iocnt[args[0]->b_edev, W] += args[0]->b_flags & B_WRITE ? 1 : 0;

    /* Count number of IOs by IO-type */
    @ctrd[interval, this->sn, this->p]        = sum(args[0]->b_flags & B_READ ? 1 : 0);
    @ctwr[interval, this->sn, this->p]        = sum(args[0]->b_flags & B_WRITE ? 1 : 0);
    @iops[interval, this->sn, this->p]        = 
        count(); /* Normalized to per second in tick-X probe */
    
    /* Maximum sampled IOPS per device */
    @maxiops[interval, this->sn, this->p]     = 
        max(iocnt[args[0]->b_edev, R] + iocnt[args[0]->b_edev, W]);

    /* 
     * Collect minimum and maximum latency for later measurement
     * of range for this metric.
     */
    minlati[args[0]->b_edev, R] =
        args[0]->b_flags & B_READ &&
        this->elapsed < minlati[args[0]->b_edev, R] ? this->elapsed :
        minlati[args[0]->b_edev, R];
    maxlati[args[0]->b_edev, R] =
        args[0]->b_flags & B_READ &&
        this->elapsed > maxlati[args[0]->b_edev, R] ? this->elapsed :
        maxlati[args[0]->b_edev, R];
    
    minlati[args[0]->b_edev, W] =
        (args[0]->b_flags & B_READ) == 0 &&
        this->elapsed < minlati[args[0]->b_edev, W] ? this->elapsed :
        minlati[args[0]->b_edev, W];
    maxlati[args[0]->b_edev, W] =
        (args[0]->b_flags & B_READ) == 0 &&
        this->elapsed > maxlati[args[0]->b_edev, W] ? this->elapsed :
        maxlati[args[0]->b_edev, W];
    
    /*
     * IOsize distribution not grouped by direction, i.e. no distinction
     * is made between reads and writes. IO buckets double in size from
     * previous bucket. i.e. 4, 8, 16, 32...
     */
     this->bs = args[0]->b_bcount ;
    /* 4K< */
    @iosztiny[interval, this->sn, this->p]    =
        sum(this->bs < 0x1000 ? 1 : 0);
    /* 4K to 8K< */
    @iosz4k[interval, this->sn, this->p]      =
        sum(this->bs >= 0x1000 && this->bs < 0x2000 ? 1 : 0);
    /* 8K to 16K< */
    @iosz8k[interval, this->sn, this->p]      =
        sum(this->bs >= 0x2000 && this->bs < 0x4000 ? 1 : 0);
    /* 16K to 32K< */
    @iosz16k[interval, this->sn, this->p]     =
        sum(this->bs >= 0x4000 && this->bs < 0x8000 ? 1 : 0);
    /* 32K to 64K< */
    @iosz32k[interval, this->sn, this->p]     =
        sum(this->bs >= 0x8000 && this->bs < 0x10000 ? 1 : 0);
    /* 64K to 128K< */
    @iosz64k[interval, this->sn, this->p]     =
        sum(this->bs >= 0x10000 && this->bs < 0x20000 ? 1 : 0);
    /* >128K */
    @ioszbig[interval, this->sn, this->p]     =
        sum(this->bs >= 0x20000 ? 1 : 0);

    /* Average IO size for given device */
    @aviosz[interval, this->sn, this->p]      = avg(this->bs);

    /* 
     * Each time we observe an error at completion through B_ERROR flag,
     * increment count of errors for given device. This should always
     * be zero, assuming healthy device.
     */
    @cterr[interval, this->sn, this->p] = sum(args[0]->b_flags & B_ERROR ? 1 : 0);
}

/* 
 * Entry controlled by timer. By design, each device will be registered
 * here about once per second when there is even a litte bit of IO.
 */
io:::done
/ival_timer[args[0]->b_edev] > 0 &&
timestamp >= ival_timer[args[0]->b_edev] + NSEC_PER_SEC/ {
    this->sn    = args[1]->dev_statname;
    this->p     = substr(args[1]->dev_pathname, 25, 16);
    this->p     = (strstr(this->p, "disk@") == 0 ||
                    strstr(this->p, "disk@") == "") ? this->p : "NA";

    /*
     * Measure operating latency range in uS for Reads and Writes,
     * storing largest observed difference.
     */
    @rangerlat[interval, this->sn, this->p]   =
        max((maxlati[args[0]->b_edev, R] -
            minlati[args[0]->b_edev, R]) / NSEC_PER_USEC);
    @rangewlat[interval, this->sn, this->p]   =
        max((maxlati[args[0]->b_edev, W] -
            minlati[args[0]->b_edev, W]) / NSEC_PER_USEC);

    @avpending[interval, this->sn, this->p]   = avg(pend[args[0]->b_edev]);
    @maxpending[interval, this->sn, this->p]  = max(pend[args[0]->b_edev]);

    /* Reset various counters for next measurement period */
    minlati[args[0]->b_edev, R] = 0;
    maxlati[args[0]->b_edev, R] = 0;
    minlati[args[0]->b_edev, W] = 0;
    maxlati[args[0]->b_edev, W] = 0;
    iocnt[args[0]->b_edev, R]   = 0;
    iocnt[args[0]->b_edev, W]   = 0;
    miniosz[args[0]->b_edev]    = 0;
    maxiosz[args[0]->b_edev]    = 0;
    minratei[args[0]->b_edev]   = 0;
    maxratei[args[0]->b_edev]   = 0;
    ival_timer[args[0]->b_edev]         = timestamp;
}

/* 
 * Count number of retries issued to a disk. These are a good
 * indicator of potentially failing, or borderline device.
 * Under normal circumstances we should not expect
 * this to be a positive value.
 */
::sd_set_retry_bp:entry
/xlate <devinfo_t *>(args[1])->dev_pathname != "<nfs>" &&
xlate <devinfo_t *>(args[1])->dev_pathname != "" / {
    this->sn    = xlate <devinfo_t *>(args[1])->dev_statname;
    this->p     = substr(xlate <devinfo_t *>(args[1])->dev_pathname, 25, 16);
    this->p     = (strstr(this->p, "disk@") == 0 ||
                    strstr(this->p, "disk@") == "") ? this->p : "NA";
    @ctretry[interval, this->sn, this->p] = count();
}

tick-10sec {
/* First time we enter this clause, ticks will be 0, so we just assume 10 seconds */
this->elapsed = ticks > 0 ? (timestamp - ticks) / NSEC_PER_SEC : 10 ;

/* Normalize Data for correct per second reporting of rates, like IOPS */
normalize(@actv_tm, NSEC_PER_MSEC); /* from nanoseconds to milliseconds */
normalize(@iops, this->elapsed);

printa("%ld,%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
    @actv_tm, @rangerlat, @rangewlat, @totbytes, @tput, @maxtput, 
    @ctrd, @ctwr, @aviosz, @rangeiosz, @iops, @maxiops, @avKBps, @rangeKBps,
    @ratelt1MBps, @rate10MBps, @rate100MBps, @rate1GBps, @avtime, @maxtime,
    @timegt1000ms, @time100ms, @time50ms, @time25ms, @time10ms, @timelt1ms,
    @iosztiny, @iosz4k, @iosz8k, @iosz16k, @iosz32k, @iosz64k, @ioszbig,
    @avpending, @maxpending, @cterr, @ctretry);
    trunc(@actv_tm); trunc(@rangerlat); trunc(@rangewlat); trunc(@totbytes);
    trunc(@tput); trunc(@maxtput); trunc(@ctrd); trunc(@ctwr); trunc(@aviosz);
    trunc(@rangeiosz); trunc(@iops); trunc(@maxiops); trunc(@avKBps); 
    trunc(@rangeKBps); trunc(@ratelt1MBps); trunc(@rate10MBps); trunc(@rate100MBps); trunc(@rate1GBps); trunc(@avtime); trunc(@maxtime);
    trunc(@iosztiny); trunc(@iosz4k); trunc(@iosz8k); trunc(@iosz16k); 
    trunc(@iosz32k); trunc(@iosz64k); trunc(@ioszbig);
    trunc(@avpending); trunc(@maxpending); trunc(@cterr); trunc(@ctretry);
    trunc(@timegt1000ms); trunc(@time100ms); trunc(@time50ms); trunc(@time25ms);
    trunc(@time10ms); trunc(@timelt1ms);
    ticks = timestamp ;
    interval = walltimestamp - (walltimestamp%1000000000);
}