Leveraging io:::start, io:::done probes for collecting lower-level IO metrics. Various measurements are made, including latency ranges, bytes processed, throughput, various rates, IO counts by IO type, etc.
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=16M
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* Copyright (c) 2017 Sam Zaydel / RackTop Systems.
*
* bw-tput-iops-actv-time-hist-with-ts-csv.d
* This script is meant to be a more informative replacement for the
* iostat utility. It measures some features that iostat in my opinion
* is not useful for. For example, iostat does not offer distributions
* at all, whereas here we plot distributions of IO rates as observed
* for each disk. Also, iostat does not offer operating ranges for
* measurements, which here we define as max - min of some measurement.
*
* Currently, script is limited to scsi_vhci devices, because they are
* ones we normally use for actual data storage. Anything else is either
* a locally attached device, or something not used by system, i.e.
* USB storage media, etc.
*/
unsigned long minlati[dev_t, int], maxlati[dev_t, int]; /* Latency min and max */
unsigned long minratei[dev_t], maxratei[dev_t]; /* IO Rate min and max */
unsigned long miniosz[dev_t], maxiosz[dev_t]; /* IO size min and max */
unsigned long iocnt[dev_t, int];
int pend[dev_t];
hrtime_t start[dev_t, uint64_t];
hrtime_t ival_timer[dev_t];
hrtime_t ticks;
inline const int NSEC_PER_SEC = 1000000000;
inline const int NSEC_PER_MSEC = 1000000;
inline const int NSEC_PER_USEC = 1000;
inline const int SPA_MAXBLOCKSIZE = 128 << 10;
inline const int R = 0;
inline const int W = 1;
/*
* These are the Output Parameter definitions for collected metrics
*
* sdname == Name of device, i.e. sd0
* mpxiowwn == mpxio device, like: 5000cca24500f698
* actvtm == amount of real, busy time spent processing IO
* rangerlat, rangewlat == latency range: max - min for Reads and Writes
* totbytes == total number of bytes Read and Written during sample interval
* tput,maxtput == mean and maximum or burst throughput
* ctrd,ctwr == Count of Reads and Writes over interval set in tick-Xsec clause
* aviosz,rangeiosz == IO size mean and range: max - min
* iops,maxiops == normalized IOPS mean and maximum
* avKBps,rangeKBps == normalized IO rate and IO rate range: max - min
* // Histogram of IO Rate distribution with 4 buckets //
* [ ratelt1MBps ] ==> 1 second interval with < 1,000KB/s bw
* [ rate10MBps ] ==> 1 second interval with < 10,000KB/s bw
* [ rate100MBps ] ==> 1 second interval with < 100,000KB/s bw
* [ rate1GBps ] ==> 1 second interval with < 1,000,000KB/s bw
* avtime,maxtime == average and maximum IO completion latency
* // Histogram of latency distribution with 6 buckets //
* [ timegt1000ms ] ==> >1 second (SLOW!)
* [ time100ms ] ==> >50ms && 100ms<= (not acceptable)
* [ time50ms ] ==> >25ms && 50ms<= (acceptable)
* [ time25ms ] ==> >10ms && 25ms<= (ok)
* [ time10ms ] ==> >1ms && 10ms<= (good)
* [ timelt1ms ] ==> 1ms< (probably cache)
* // Histogram of IO size distribution with 7 buckets //
* [ iosztiny ] ==> 4K<
* [ iosz4k ] ==> between 4K and 8K
* [ iosz8k ] ==> between 8K and 16K
* [ iosz16k ] ==> between 16K and 32K
* [ iosz32k ] ==> between 32K and 64K
* [ iosz64k ] ==> between 64K and 128K
* [ ioszbig ] ==> anything 128K or above
* avpending == Average measured IOs in processing per sample interval
* maxpending == Maximum measured IOs in processing per sample interval
* cterr == Counter tracking number of errors per sample interval
* ctretry == Counter tracking number of retries per sample interval
*/
BEGIN {
printf("timestamp,sdname,mpxiowwn,actvtm,rangerlat,rangewlat,totbytes,tput,maxtput,ctrd,ctwr,aviosz,rangeiosz,iops,maxiops,avKBps,rangeKBps,ratelt1MBps,rate10MBps,rate100MBps,rate1GBps,avtime,maxtime,timegt1000ms,time100ms,time50ms,time25ms,time10ms,timelt1ms,iosztiny,iosz4k,iosz8k,iosz16k,iosz32k,iosz64k,ioszbig,avpending,maxpending,cterr,ctretry\n");
interval = walltimestamp - (walltimestamp%1000000000);
}
/*
* Set interval timer initial value once for a given device.
* Each subsequent update will happen roughly once per second,
* or if no IO, whenever there is some IO that triggers io:::done
* probe, at which point we determine that timer expired.
*/
io:::start
/ival_timer[args[0]->b_edev] == 0/ {
ival_timer[args[0]->b_edev] = timestamp;
}
io:::start {
start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
/* Increment pending IOs by 1 */
pend[args[0]->b_edev]++;
}
io:::done
/start[args[0]->b_edev, args[0]->b_blkno]/ {
this->sn = args[1]->dev_statname;
this->p = substr(args[1]->dev_pathname, 25, 16);
this->p = (strstr(this->p, "disk@") == 0 ||
strstr(this->p, "disk@") == "") ? this->p : "NA";
this->x = args[0]->b_bcount * 976562;
this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno];
start[args[0]->b_edev, args[0]->b_blkno] = 0;
/* Decrement pending IOs by 1, set to 0 if value < 0 */
pend[args[0]->b_edev]--;
pend[args[0]->b_edev] =
pend[args[0]->b_edev] > 0 ?
pend[args[0]->b_edev] : 0; /* avoid underflow */
/* Total Number of bytes per device */
@totbytes[interval, this->sn, this->p] = sum(args[0]->b_bcount);
/* Total nanoseconds of active time per device */
@actv_tm[interval, this->sn, this->p] = sum(this->elapsed);
/*
* Instead of converting nanoseconds to seconds we multiply
* the top by NSEC_PER_SEC and the divide by delta in nanoseconds.
* In extreme observations, i.e. stalled IO, we may have huge
* this->elapsed values, in which case result will be 0 KB/s, even
* if there in fact was a non-zero value of bytes transferred.
*/
this->b_nsec = args[0]->b_bcount * NSEC_PER_SEC;
this->kb_per_sec = (this->b_nsec / this->elapsed) >> 10;
/* Measure IO rate range in KB/s */
@rangeKBps[interval, this->sn, this->p] =
max(maxratei[args[0]->b_edev] - minratei[args[0]->b_edev]);
/* IO Rate histogram base10, limited at 1000000 KB/s | 976 MB/s */
/* 1000KB/s <= */
@ratelt1MBps[interval, this->sn, this->p] = sum(this->kb_per_sec <= 0x3e8 ? 1 : 0);
/* > 1000KB/s && 10,000KB/s <= */
@rate10MBps[interval, this->sn, this->p] = sum(this->kb_per_sec > 0x3e8 &&
this->kb_per_sec <= 0x2710 ? 1 : 0);
/* > 10,000KB/s && 100,000KB/s <= */
@rate100MBps[interval, this->sn, this->p] = sum(this->kb_per_sec > 0x2710 &&
this->kb_per_sec <= 0x186a0 ? 1 : 0);
/* > 100,000KB/s && 1,000,000 KB/s <= */
@rate1GBps[interval, this->sn, this->p] = sum(this->kb_per_sec > 0x186a0 &&
this->kb_per_sec <= 0xf4240 ? 1 : 0);
/*
* Collect minimum and maximum observed rate for later measurement
* of range for this metric.
*/
minratei[args[0]->b_edev] = minratei[args[0]->b_edev] == 0 ?
this->kb_per_sec : minratei[args[0]->b_edev] < this->kb_per_sec ?
minratei[args[0]->b_edev] : this->kb_per_sec;
maxratei[args[0]->b_edev] = maxratei[args[0]->b_edev] == 0 ?
this->kb_per_sec : maxratei[args[0]->b_edev] > this->kb_per_sec ?
maxratei[args[0]->b_edev] : this->kb_per_sec;
/* Actual Kbytes/sec per device */
@avKBps[interval, this->sn, this->p] = avg(this->kb_per_sec);
/* Average and Maximum Latency per device */
@avtime[interval, this->sn, this->p] = avg(this->elapsed);
@maxtime[interval, this->sn, this->p] = max(this->elapsed);
/*
* Latency histogram with buckets:
* >1000ms, >50 to 100ms, >25 to 50ms, >10 to 25ms, >1 to 10ms, 1ms<
*/
@timegt1000ms[interval, this->sn, this->p] = sum(
this->elapsed >= 1 * NSEC_PER_SEC ? 1 : 0);
@time100ms[interval, this->sn, this->p] = sum(
this->elapsed > 50 * NSEC_PER_MSEC &&
this->elapsed <= 100 * NSEC_PER_MSEC ? 1 : 0);
@time50ms[interval, this->sn, this->p] = sum(
this->elapsed > 25 * NSEC_PER_MSEC &&
this->elapsed <= 50 * NSEC_PER_MSEC ? 1 : 0);
@time25ms[interval, this->sn, this->p] = sum(
this->elapsed > 10 * NSEC_PER_MSEC &&
this->elapsed <= 25 * NSEC_PER_MSEC ? 1 : 0);
@time10ms[interval, this->sn, this->p] = sum(
this->elapsed > 1 * NSEC_PER_MSEC &&
this->elapsed <= 10 * NSEC_PER_MSEC ? 1 : 0);
@timelt1ms[interval, this->sn, this->p] = sum(
this->elapsed < 1 * NSEC_PER_MSEC ? 1 : 0);
/*
* Collect minimum and maximum io size for later measurement
* of range for this metric.
*/
miniosz[args[0]->b_edev] =
args[0]->b_bcount < miniosz[args[0]->b_edev] ?
args[0]->b_bcount : miniosz[args[0]->b_edev];
maxiosz[args[0]->b_edev] =
args[0]->b_bcount > maxiosz[args[0]->b_edev] ?
args[0]->b_bcount : maxiosz[args[0]->b_edev];
/* Measure IO size range in Kilobytes */
@rangeiosz[interval, this->sn, this->p] =
max((maxiosz[args[0]->b_edev] - miniosz[args[0]->b_edev]) >> 10);
/*
* Convert from bytes and nanoseconds to KB/s with 976562 to obtain
* avg. effective throughput, and maximum effective throughput.
* Maximum throughput is likely measuring cache effects.
*/
@tput[interval, this->sn, this->p] = avg(this->x / this->elapsed);
@maxtput[interval, this->sn, this->p] = max(this->x / this->elapsed);
iocnt[args[0]->b_edev, R] += args[0]->b_flags & B_READ ? 1 : 0;
iocnt[args[0]->b_edev, W] += args[0]->b_flags & B_WRITE ? 1 : 0;
/* Count number of IOs by IO-type */
@ctrd[interval, this->sn, this->p] = sum(args[0]->b_flags & B_READ ? 1 : 0);
@ctwr[interval, this->sn, this->p] = sum(args[0]->b_flags & B_WRITE ? 1 : 0);
@iops[interval, this->sn, this->p] =
count(); /* Normalized to per second in tick-X probe */
/* Maximum sampled IOPS per device */
@maxiops[interval, this->sn, this->p] =
max(iocnt[args[0]->b_edev, R] + iocnt[args[0]->b_edev, W]);
/*
* Collect minimum and maximum latency for later measurement
* of range for this metric.
*/
minlati[args[0]->b_edev, R] =
args[0]->b_flags & B_READ &&
this->elapsed < minlati[args[0]->b_edev, R] ? this->elapsed :
minlati[args[0]->b_edev, R];
maxlati[args[0]->b_edev, R] =
args[0]->b_flags & B_READ &&
this->elapsed > maxlati[args[0]->b_edev, R] ? this->elapsed :
maxlati[args[0]->b_edev, R];
minlati[args[0]->b_edev, W] =
(args[0]->b_flags & B_READ) == 0 &&
this->elapsed < minlati[args[0]->b_edev, W] ? this->elapsed :
minlati[args[0]->b_edev, W];
maxlati[args[0]->b_edev, W] =
(args[0]->b_flags & B_READ) == 0 &&
this->elapsed > maxlati[args[0]->b_edev, W] ? this->elapsed :
maxlati[args[0]->b_edev, W];
/*
* IOsize distribution not grouped by direction, i.e. no distinction
* is made between reads and writes. IO buckets double in size from
* previous bucket. i.e. 4, 8, 16, 32...
*/
this->bs = args[0]->b_bcount ;
/* 4K< */
@iosztiny[interval, this->sn, this->p] =
sum(this->bs < 0x1000 ? 1 : 0);
/* 4K to 8K< */
@iosz4k[interval, this->sn, this->p] =
sum(this->bs >= 0x1000 && this->bs < 0x2000 ? 1 : 0);
/* 8K to 16K< */
@iosz8k[interval, this->sn, this->p] =
sum(this->bs >= 0x2000 && this->bs < 0x4000 ? 1 : 0);
/* 16K to 32K< */
@iosz16k[interval, this->sn, this->p] =
sum(this->bs >= 0x4000 && this->bs < 0x8000 ? 1 : 0);
/* 32K to 64K< */
@iosz32k[interval, this->sn, this->p] =
sum(this->bs >= 0x8000 && this->bs < 0x10000 ? 1 : 0);
/* 64K to 128K< */
@iosz64k[interval, this->sn, this->p] =
sum(this->bs >= 0x10000 && this->bs < 0x20000 ? 1 : 0);
/* >128K */
@ioszbig[interval, this->sn, this->p] =
sum(this->bs >= 0x20000 ? 1 : 0);
/* Average IO size for given device */
@aviosz[interval, this->sn, this->p] = avg(this->bs);
/*
* Each time we observe an error at completion through B_ERROR flag,
* increment count of errors for given device. This should always
* be zero, assuming healthy device.
*/
@cterr[interval, this->sn, this->p] = sum(args[0]->b_flags & B_ERROR ? 1 : 0);
}
/*
* Entry controlled by timer. By design, each device will be registered
* here about once per second when there is even a litte bit of IO.
*/
io:::done
/ival_timer[args[0]->b_edev] > 0 &&
timestamp >= ival_timer[args[0]->b_edev] + NSEC_PER_SEC/ {
this->sn = args[1]->dev_statname;
this->p = substr(args[1]->dev_pathname, 25, 16);
this->p = (strstr(this->p, "disk@") == 0 ||
strstr(this->p, "disk@") == "") ? this->p : "NA";
/*
* Measure operating latency range in uS for Reads and Writes,
* storing largest observed difference.
*/
@rangerlat[interval, this->sn, this->p] =
max((maxlati[args[0]->b_edev, R] -
minlati[args[0]->b_edev, R]) / NSEC_PER_USEC);
@rangewlat[interval, this->sn, this->p] =
max((maxlati[args[0]->b_edev, W] -
minlati[args[0]->b_edev, W]) / NSEC_PER_USEC);
@avpending[interval, this->sn, this->p] = avg(pend[args[0]->b_edev]);
@maxpending[interval, this->sn, this->p] = max(pend[args[0]->b_edev]);
/* Reset various counters for next measurement period */
minlati[args[0]->b_edev, R] = 0;
maxlati[args[0]->b_edev, R] = 0;
minlati[args[0]->b_edev, W] = 0;
maxlati[args[0]->b_edev, W] = 0;
iocnt[args[0]->b_edev, R] = 0;
iocnt[args[0]->b_edev, W] = 0;
miniosz[args[0]->b_edev] = 0;
maxiosz[args[0]->b_edev] = 0;
minratei[args[0]->b_edev] = 0;
maxratei[args[0]->b_edev] = 0;
ival_timer[args[0]->b_edev] = timestamp;
}
/*
* Count number of retries issued to a disk. These are a good
* indicator of potentially failing, or borderline device.
* Under normal circumstances we should not expect
* this to be a positive value.
*/
::sd_set_retry_bp:entry
/xlate <devinfo_t *>(args[1])->dev_pathname != "<nfs>" &&
xlate <devinfo_t *>(args[1])->dev_pathname != "" / {
this->sn = xlate <devinfo_t *>(args[1])->dev_statname;
this->p = substr(xlate <devinfo_t *>(args[1])->dev_pathname, 25, 16);
this->p = (strstr(this->p, "disk@") == 0 ||
strstr(this->p, "disk@") == "") ? this->p : "NA";
@ctretry[interval, this->sn, this->p] = count();
}
tick-10sec {
/* First time we enter this clause, ticks will be 0, so we just assume 10 seconds */
this->elapsed = ticks > 0 ? (timestamp - ticks) / NSEC_PER_SEC : 10 ;
/* Normalize Data for correct per second reporting of rates, like IOPS */
normalize(@actv_tm, NSEC_PER_MSEC); /* from nanoseconds to milliseconds */
normalize(@iops, this->elapsed);
printa("%ld,%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@actv_tm, @rangerlat, @rangewlat, @totbytes, @tput, @maxtput,
@ctrd, @ctwr, @aviosz, @rangeiosz, @iops, @maxiops, @avKBps, @rangeKBps,
@ratelt1MBps, @rate10MBps, @rate100MBps, @rate1GBps, @avtime, @maxtime,
@timegt1000ms, @time100ms, @time50ms, @time25ms, @time10ms, @timelt1ms,
@iosztiny, @iosz4k, @iosz8k, @iosz16k, @iosz32k, @iosz64k, @ioszbig,
@avpending, @maxpending, @cterr, @ctretry);
trunc(@actv_tm); trunc(@rangerlat); trunc(@rangewlat); trunc(@totbytes);
trunc(@tput); trunc(@maxtput); trunc(@ctrd); trunc(@ctwr); trunc(@aviosz);
trunc(@rangeiosz); trunc(@iops); trunc(@maxiops); trunc(@avKBps);
trunc(@rangeKBps); trunc(@ratelt1MBps); trunc(@rate10MBps); trunc(@rate100MBps); trunc(@rate1GBps); trunc(@avtime); trunc(@maxtime);
trunc(@iosztiny); trunc(@iosz4k); trunc(@iosz8k); trunc(@iosz16k);
trunc(@iosz32k); trunc(@iosz64k); trunc(@ioszbig);
trunc(@avpending); trunc(@maxpending); trunc(@cterr); trunc(@ctretry);
trunc(@timegt1000ms); trunc(@time100ms); trunc(@time50ms); trunc(@time25ms);
trunc(@time10ms); trunc(@timelt1ms);
ticks = timestamp ;
interval = walltimestamp - (walltimestamp%1000000000);
}