[ZFS pool Queue and Per-Disk Level Observations] Dtrace snippets for measuring queue depths and latency at the boundary between ZFS and physical storage. #tags: per-disk, iostat, dtrace, queue, queuing, spa, vdev_queue_io_add, kstat, runq, waitq, queue-length, queue-depth, zio-class, csv
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=16M
/*
* This script is intended for observing IOs as they transition from
* Wait Queue to Run Queue and to completion. Latency is measured as
* the IOs enter and then exit the Wait Queue, and then as they leave
* the Wait Queue and enter the Run Queue.
* We collect this information by pool, so we can see differences in
* terms of time spent in these two queues.
* Also collected are counts of IOs by priority classification. See the
* zio_priority enum below, as well as zio_priority.h for more details.
* Ideally we want to see as little as possible time spent in these queues.
* And dispersion between lowest and highest times remaining tight and fairly
* consistent without many offending outliers.
*
* aggregates prefixed with `tot` are tracking each ZIO from time it gets into
* the queue to the time it is processed and is removed from queue.
* We can think of `tot*` aggregates as a proxy for latency that we would
* normally measure at the application level, roughly.
*/
/*
 * Global timestamp maps keyed by (vdev queue, zio) pointer pairs so a
 * recycled zio_t on a different vdev cannot be mistaken for a tracked IO.
 * Values are nanosecond timestamps used to compute per-stage latencies.
 */
long wqueue[vdev_queue_t *, zio_t *]; /* measure wait queue time */
long rqueue[vdev_queue_t *, zio_t *]; /* measure run queue time */
long totqueue[zio_t *]; /* measure combined queue time */
inline const int SLOW = 10000 ; /* threshold in microseconds; >10ms considered slow */
/* see usr/src/uts/common/fs/zfs/sys/zio_priority.h for details */
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
BEGIN {
/* Emit the CSV header; column order must match printa() in tick-5sec. */
printf("ts,pool,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
"rqav", "rqmin", "rqmax", "rqsd", "rqlt1ms", "rq1to10ms", "rq10to100ms",
"rq100msto1s", "rqgt1s", "rqslowct", "rqaddct", "rqrmct", "rqctav",
"rqctmin", "rqctmax", "rqctsd",
"wqav", "wqmin", "wqmax", "wqsd", "wqlt1ms", "wq1to10ms", "wq10to100ms",
"wq100msto1s", "wqgt1s", "wqslowct", "wqaddct", "wqrmct", "wqctav",
"wqctmin", "wqctmax", "wqctsd",
"totqav", "totqmin", "totqmax", "totqsd", "totqct", "syncrdct", "syncwrct", "asyncrdct", "asyncwrct", "scrubct"
);
/* Round wall clock down to a whole second; used as the `ts` CSV key. */
interval = walltimestamp - (walltimestamp%1000000000);
}
/* (z)IO is entering Wait Queue */
::vdev_queue_io_add:entry {
/* Record queue-entry timestamp so the remove probe can compute latency. */
wqueue[args[0], args[1]] = args[1]->io_queued_timestamp ;
this->spa = args[1]->io_spa ;
this->allqueued = 0 ; /* reset to avoid bogus data */
/* Total queued IOs for this pool across all queueable priority classes. */
this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_queued ;
this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_queued ;
this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_queued ;
this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_queued ;
this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_queued ;
@wqctav[interval, this->spa->spa_name] = avg(this->allqueued) ;
@wqctmin[interval, this->spa->spa_name] = min(this->allqueued) ;
@wqctmax[interval, this->spa->spa_name] = max(this->allqueued) ;
@wqctsd[interval, this->spa->spa_name] = stddev(this->allqueued) ;
@wqaddct[interval, this->spa->spa_name] = count() ; /* Count ++ to Wait Queue */
}
/*
 * Count number of IOs by priority classification. In other words
 * we bucket each IO by ZIO priority as defined in the zio_priority enum.
 * This is in effect a histogram with 5 buckets, one each for:
 * sync-read, sync-write, async-read, async-write, and scrub.
 */
::vdev_queue_io_add:entry {
this->spa = args[1]->io_spa ;
this->pri = args[1]->io_priority ;
@syncrdct[interval, this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_SYNC_READ ? 1 : 0) ;
@syncwrct[interval, this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_SYNC_WRITE ? 1 : 0) ;
@asyncrdct[interval, this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_ASYNC_READ ? 1 : 0) ;
@asyncwrct[interval, this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_ASYNC_WRITE ? 1 : 0) ;
@scrubct[interval, this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_SCRUB ? 1 : 0) ;
}
/* Unconditional removal counter (the latency clause below is predicated). */
::vdev_queue_io_remove:entry {
this->spa = args[1]->io_spa ;
@wqrmct[interval, this->spa->spa_name] = count() ; /* Count -- from Wait Queue */
}
/* (z)IO is leaving Wait Queue on its way to Run Queue */
::vdev_queue_io_remove:entry
/wqueue[args[0], args[1]] == args[1]->io_queued_timestamp/ {
/* Wait Queue residency, nanoseconds converted to microseconds. */
this->delta = (timestamp - wqueue[args[0], args[1]]) / 1000 ;
this->spa = args[1]->io_spa ;
@wqavlat[interval, this->spa->spa_name] = avg(this->delta) ;
@wqminlat[interval, this->spa->spa_name] = min(this->delta) ;
@wqmaxlat[interval, this->spa->spa_name] = max(this->delta) ;
@wqsdlat[interval, this->spa->spa_name] = stddev(this->delta) ;
/* If spending too long in the queue, identify as slow IO. */
@wqslowct[interval, this->spa->spa_name] = sum(this->delta >= SLOW ? 1 : 0) ;
/*
 * Maintain a histogram of wait queue latency distribution
 * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s, >=1s.
 */
@wqlatlt1ms[interval, this->spa->spa_name] =
sum(this->delta < 1000 ? 1 : 0) ;
@wqlat1to10ms[interval, this->spa->spa_name] =
sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
@wqlat10to100ms[interval, this->spa->spa_name] =
sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
@wqlat100msto1s[interval, this->spa->spa_name] =
sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
/*
 * BUGFIX: use >= here; with the previous strict > comparison a delta of
 * exactly 1000000us (1s) fell into neither the 100ms-1s bucket nor this one.
 */
@wqlatgt1s[interval, this->spa->spa_name] = sum(this->delta >= 1000000 ? 1 : 0) ;
/*
 * The zio_t structures are going to be recycled over and over and
 * as such, we need to make sure that we avoid chances of bogus data
 * because we are checking entry points into the clauses for a !0
 * value.
 */
wqueue[args[0], args[1]] = 0 ;
}
/* (z)IO is entering Run Queue */
::vdev_queue_pending_add:entry {
rqueue[args[0], args[1]] = timestamp ; /* time IO entered Run Queue */
this->spa = args[1]->io_spa ;
this->allactive = 0 ; /* reset to avoid bogus data */
/* Total in-flight (active) IOs for this pool across all queueable classes. */
this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_active ;
this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_active ;
this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_active ;
this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_active ;
this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_active ;
@rqctav[interval, this->spa->spa_name] = avg(this->allactive) ;
@rqctmin[interval, this->spa->spa_name] = min(this->allactive) ;
@rqctmax[interval, this->spa->spa_name] = max(this->allactive) ;
@rqctsd[interval, this->spa->spa_name] = stddev(this->allactive) ;
@rqaddct[interval, this->spa->spa_name] = count() ; /* Count ++ to Run Queue */
}
/* IO enters the queue */
::vdev_queue_io:entry {
/*
 * This should capture the total time we spend waiting and processing
 * for each zio.
 */
totqueue[args[0]] = args[0]->io_queued_timestamp ;
}
/* Unconditional removal counter (the latency clause below is predicated). */
::vdev_queue_pending_remove:entry {
this->spa = args[1]->io_spa ;
@rqrmct[interval, this->spa->spa_name] = count() ; /* Count -- from Run Queue */
}
/* (z)IO is leaving Run Queue, which implies IO completed */
::vdev_queue_pending_remove:entry /rqueue[args[0], args[1]]/ {
/* Run Queue (device service) time, nanoseconds converted to microseconds. */
this->delta = (timestamp - rqueue[args[0], args[1]]) / 1000 ;
this->spa = args[1]->io_spa ;
@rqavlat[interval, this->spa->spa_name] = avg(this->delta) ;
@rqminlat[interval, this->spa->spa_name] = min(this->delta) ;
@rqmaxlat[interval, this->spa->spa_name] = max(this->delta) ;
/* Clamp 0 to 1us so sub-microsecond samples still contribute to stddev. */
@rqsdlat[interval, this->spa->spa_name] = stddev(this->delta == 0 ?
1 : this->delta) ;
@rqslowct[interval, this->spa->spa_name] = sum(this->delta >= SLOW ? 1 : 0) ;
/*
 * Maintain a histogram of run queue latency distribution
 * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s, >=1s.
 */
@rqlatlt1ms[interval, this->spa->spa_name] =
sum(this->delta < 1000 ? 1 : 0) ;
@rqlat1to10ms[interval, this->spa->spa_name] =
sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
@rqlat10to100ms[interval, this->spa->spa_name] =
sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
@rqlat100msto1s[interval, this->spa->spa_name] =
sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
/*
 * BUGFIX: use >= here; with the previous strict > comparison a delta of
 * exactly 1000000us (1s) fell into neither the 100ms-1s bucket nor this one.
 */
@rqlatgt1s[interval, this->spa->spa_name] =
sum(this->delta >= 1000000 ? 1 : 0) ;
/*
 * The zio_t structures are going to be recycled over and over and
 * as such, we need to make sure that we avoid chances of bogus data
 * because we are checking entry points into the clauses for a !0
 * value.
 */
rqueue[args[0], args[1]] = 0 ;
}
/* IO completed */
::vdev_queue_io_done:entry
/totqueue[args[0]] == args[0]->io_queued_timestamp/ {
/*
 * This is where we measure amount of time all the way from IO entering the
 * wait queue through all the stages to completion.
 * The predicate rejects recycled zio_t structures whose current queued
 * timestamp no longer matches the value captured in vdev_queue_io.
 */
this->spa = args[0]->io_spa ;
this->delta = (timestamp - totqueue[args[0]]) / 1000 ; /* usecs */
@totqct[interval, this->spa->spa_name] = count() ;
@totqavlat[interval, this->spa->spa_name] = avg(this->delta) ;
@totqminlat[interval, this->spa->spa_name] = min(this->delta) ;
@totqmaxlat[interval, this->spa->spa_name] = max(this->delta) ;
/* Clamp 0 to 1us so sub-microsecond samples still contribute to stddev. */
@totqsdlat[interval, this->spa->spa_name] = stddev(this->delta == 0 ?
1 : this->delta) ;
totqueue[args[0]] = 0 ;
}
tick-5sec {
/*
 * If the counts are not equal between these histograms, we may have
 * interrupted the script at the very moment an IO was transitioning
 * and it did not get accounted for correctly. In general we expect
 * that these counts are all equal.
 */
printa("%ld,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@rqavlat, @rqminlat, @rqmaxlat, @rqsdlat, @rqlatlt1ms, @rqlat1to10ms,
@rqlat10to100ms, @rqlat100msto1s, @rqlatgt1s,
@rqslowct, @rqaddct, @rqrmct, @rqctav, @rqctmin, @rqctmax, @rqctsd,
@wqavlat, @wqminlat, @wqmaxlat, @wqsdlat, @wqlatlt1ms, @wqlat1to10ms,
@wqlat10to100ms, @wqlat100msto1s, @wqlatgt1s, @wqslowct, @wqaddct,
@wqrmct, @wqctav, @wqctmin, @wqctmax, @wqctsd,
@totqavlat, @totqminlat, @totqmaxlat, @totqsdlat, @totqct,
@syncrdct, @syncwrct, @asyncrdct, @asyncwrct, @scrubct
) ;
/* Reset all aggregates so each 5-second report covers only its own window. */
trunc(@rqavlat) ; trunc(@rqminlat) ; trunc(@rqmaxlat) ; trunc(@rqsdlat) ;
trunc(@rqlatlt1ms) ; trunc(@rqlat1to10ms) ; trunc(@rqlat10to100ms) ;
trunc(@rqlat100msto1s) ; trunc(@rqlatgt1s) ;
trunc(@rqslowct) ; trunc(@rqaddct) ; trunc(@rqrmct) ;
trunc(@rqctav) ; trunc(@rqctmin) ; trunc(@rqctmax) ; trunc(@rqctsd) ;
trunc(@wqavlat) ; trunc(@wqminlat) ; trunc(@wqmaxlat) ; trunc(@wqsdlat) ;
trunc(@wqlatlt1ms) ; trunc(@wqlat1to10ms) ; trunc(@wqlat10to100ms) ;
trunc(@wqlat100msto1s) ; trunc(@wqlatgt1s) ;
trunc(@wqslowct) ; trunc(@wqaddct) ; trunc(@wqrmct) ;
trunc(@wqctav) ; trunc(@wqctmin) ; trunc(@wqctmax) ; trunc(@wqctsd) ;
trunc(@totqavlat) ; trunc(@totqminlat) ; trunc(@totqmaxlat) ;
trunc(@totqsdlat) ; trunc(@totqct) ;
trunc(@syncrdct); trunc(@syncwrct); trunc(@asyncrdct);
trunc(@asyncwrct); trunc(@scrubct) ;
/* Re-align the interval key to the current one-second wall-clock boundary. */
interval = walltimestamp - (walltimestamp%1000000000);
}dtrace -qn '
/*
* This script is intended for observing IOs as they transition from
* Wait Queue to Run Queue and to completion. Latency is measured as
* the IOs enter and then exit the Wait Queue, and then as they leave
* the Wait Queue and enter the Run Queue.
* We collect this information by pool, so we can see differences in
* terms of time spent in these two queues.
* Also collected are counts of IOs by priority classification. See the
* zio_priority enum below, as well as zio_priority.h for more details.
* Ideally we want to see as little as possible time spent in these queues.
* And dispersion between lowest and highest times remaining tight and fairly
* consistent without many offending outliers.
*
* aggregates prefixed with `tot` are tracking each ZIO from time it gets into
* the queue to the time it is processed and is removed from queue.
* We can think of `tot*` aggregates as a proxy for latency that we would
* normally measure at the application level, roughly.
*/
/*
 * Per-zio timestamp maps; this lighter variant of the script keys by the
 * zio_t pointer alone rather than (vdev queue, zio) pairs.
 */
int wqueue[zio_t *]; /* zio_t pointer passed through the queuing stages */
int rqueue[zio_t *]; /* same as above used for run queue */
int totqueue[zio_t *];
inline const int SLOW = 10000 ; /* threshold in microseconds; >10ms considered slow */
/* see usr/src/uts/common/fs/zfs/sys/zio_priority.h for details */
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
BEGIN {
/* Emit the CSV header; column order must match printa() in tick-5sec. */
printf("pool,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
"rqav", "rqmin", "rqmax", "rqsd", "rqslowct", "rqct", "wqav", "wqmin",
"wqmax", "wqsd", "wqslowct", "wqct", "totqav", "totqmin", "totqmax", "totqsd",
"totqct", "syncrdct", "syncwrct", "asyncrdct", "asyncwrct", "scrubct"
);
}
/* (z)IO is entering Wait Queue */
::vdev_queue_io_add:entry {
/* Record queue-entry time (ns) so the remove probe can compute latency. */
wqueue[args[1]] = timestamp ;
}
/*
 * Count number of IOs by priority classification. In other words
 * we bucket each IO by ZIO priority as defined in the zio_priority enum.
 * This is in effect a histogram with 5 buckets, one each for:
 * sync-read, sync-write, async-read, async-write, and scrub.
 */
::vdev_queue_io_add:entry {
this->spa = args[1]->io_spa ;
this->pri = args[1]->io_priority ;
@syncrdct[this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_SYNC_READ ? 1 : 0) ;
@syncwrct[this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_SYNC_WRITE ? 1 : 0) ;
@asyncrdct[this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_ASYNC_READ ? 1 : 0) ;
@asyncwrct[this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_ASYNC_WRITE ? 1 : 0) ;
@scrubct[this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_SCRUB ? 1 : 0) ;
}
/* (z)IO is leaving Wait Queue on its way to Run Queue */
::vdev_queue_io_remove:entry /wqueue[args[1]]/ {
this->delta = (timestamp - wqueue[args[1]]) / 1000 ; /* usecs */
this->spa = args[1]->io_spa ;
@wqavlat[this->spa->spa_name] = avg(this->delta) ;
@wqminlat[this->spa->spa_name] = min(this->delta) ;
@wqmaxlat[this->spa->spa_name] = max(this->delta) ;
@wqsdlat[this->spa->spa_name] = stddev(this->delta) ;
@wqct[this->spa->spa_name] = count() ;
/* If spending too long in the queue, identify as slow IO. */
@wqslowct[this->spa->spa_name] = sum(this->delta >= SLOW ? 1 : 0) ;
/* Clear the slot; zio_t structures are recycled, see comment below. */
wqueue[args[1]] = 0 ;
}
/* (z)IO is entering Run Queue */
::vdev_queue_pending_add:entry {
rqueue[args[1]] = timestamp ; /* time IO entered Run Queue */
}
/* IO enters the queue */
::vdev_queue_io:entry {
/*
 * This should capture the total time we spend waiting and processing
 * for each zio.
 */
totqueue[args[0]] = timestamp ;
}
/* (z)IO is leaving Run Queue, which implies IO completed */
::vdev_queue_pending_remove:entry /rqueue[args[1]]/ {
this->delta = (timestamp - rqueue[args[1]]) / 1000 ; /* usecs */
this->spa = args[1]->io_spa ;
@rqavlat[this->spa->spa_name] = avg(this->delta) ;
@rqminlat[this->spa->spa_name] = min(this->delta) ;
@rqmaxlat[this->spa->spa_name] = max(this->delta) ;
@rqsdlat[this->spa->spa_name] = stddev(this->delta) ;
@rqct[this->spa->spa_name] = count() ;
@rqslowct[this->spa->spa_name] = sum(this->delta >= SLOW ? 1 : 0) ;
/*
 * The zio_t structures are going to be recycled over and over and
 * as such, we need to make sure that we avoid chances of bogus data
 * because we are checking entry points into the clauses for a !0
 * value.
 */
rqueue[args[1]] = 0 ;
}
/* IO completed */
::vdev_queue_io_done:entry /totqueue[args[0]] > 0/ {
/*
 * This is where we measure amount of time all the way from IO entering the
 * wait queue through all the stages to completion.
 */
this->spa = args[0]->io_spa ;
this->delta = (timestamp - totqueue[args[0]]) / 1000 ; /* usecs */
@totqct[this->spa->spa_name] = count() ;
@totqavlat[this->spa->spa_name] = avg(this->delta) ;
@totqminlat[this->spa->spa_name] = min(this->delta) ;
@totqmaxlat[this->spa->spa_name] = max(this->delta) ;
@totqsdlat[this->spa->spa_name] = stddev(this->delta) ;
totqueue[args[0]] = 0 ;
}
tick-5sec {
/*
 * If the counts are not equal between these histograms, we may have
 * interrupted the script at the very moment an IO was transitioning
 * and it did not get accounted for correctly. In general we expect
 * that these counts are all equal.
 */
printa("%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@rqavlat, @rqminlat, @rqmaxlat, @rqsdlat, @rqslowct, @rqct,
@wqavlat, @wqminlat, @wqmaxlat, @wqsdlat, @wqslowct, @wqct,
@totqavlat, @totqminlat, @totqmaxlat, @totqsdlat, @totqct,
@syncrdct, @syncwrct, @asyncrdct, @asyncwrct, @scrubct
) ;
/* Reset all aggregates so each 5-second report covers only its own window. */
trunc(@rqavlat) ; trunc(@rqminlat) ; trunc(@rqmaxlat) ; trunc(@rqsdlat) ;
trunc(@rqslowct) ; trunc(@rqct) ;
trunc(@wqavlat) ; trunc(@wqminlat) ; trunc(@wqmaxlat) ; trunc(@wqsdlat) ;
trunc(@wqslowct) ; trunc(@wqct) ;
trunc(@totqavlat) ; trunc(@totqminlat) ; trunc(@totqmaxlat) ;
trunc(@totqsdlat) ; trunc(@totqct) ;
trunc(@syncrdct); trunc(@syncwrct); trunc(@asyncrdct);
trunc(@asyncwrct); trunc(@scrubct) ;
}'#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=16M
/*
* This script is intended for observing IOs as they transition from
* Wait Queue to Run Queue and to completion. Latency is measured as
* the IOs enter and then exit the Wait Queue, and then as they leave
* the Wait Queue and enter the Run Queue.
* We collect this information by pool, so we can see differences in
* terms of time spent in these two queues.
* Also collected are counts of IOs by priority classification. See the
* zio_priority enum below, as well as zio_priority.h for more details.
* Ideally we want to see as little as possible time spent in these queues.
* And dispersion between lowest and highest times remaining tight and fairly
* consistent without many offending outliers.
*
* aggregates prefixed with `tot` are tracking each ZIO from time it gets into
* the queue to the time it is processed and is removed from queue.
* We can think of `tot*` aggregates as a proxy for latency that we would
* normally measure at the application level, roughly.
*/
/*
 * Global timestamp maps keyed by (vdev queue, zio) pointer pairs so a
 * recycled zio_t on a different vdev cannot be mistaken for a tracked IO.
 */
long wqueue[vdev_queue_t *, zio_t *]; /* measure wait queue time */
long rqueue[vdev_queue_t *, zio_t *]; /* measure run queue time */
long totqueue[zio_t *]; /* measure combined queue time */
inline const int SLOW = 10000 ; /* threshold in microseconds; >10ms considered slow */
/* see usr/src/uts/common/fs/zfs/sys/zio_priority.h for details */
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
BEGIN {
/* Emit the CSV header; column order must match printa() in tick-5sec. */
printf("pool,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
"rqav", "rqmin", "rqmax", "rqsd", "rqlt1ms", "rq1to10ms", "rq10to100ms",
"rq100msto1s", "rqgt1s", "rqslowct", "rqaddct", "rqrmct", "rqctav",
"rqctmin", "rqctmax", "rqctsd",
"wqav", "wqmin", "wqmax", "wqsd", "wqlt1ms", "wq1to10ms", "wq10to100ms",
"wq100msto1s", "wqgt1s", "wqslowct", "wqaddct", "wqrmct", "wqctav",
"wqctmin", "wqctmax", "wqctsd",
"totqav", "totqmin", "totqmax", "totqsd", "totqct", "syncrdct", "syncwrct", "asyncrdct", "asyncwrct", "scrubct"
);
}
/* (z)IO is entering Wait Queue */
::vdev_queue_io_add:entry {
/* Record queue-entry timestamp so the remove probe can compute latency. */
wqueue[args[0], args[1]] = args[1]->io_queued_timestamp ;
this->spa = args[1]->io_spa ;
this->allqueued = 0 ; /* reset to avoid bogus data */
/* Total queued IOs for this pool across all queueable priority classes. */
this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_queued ;
this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_queued ;
this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_queued ;
this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_queued ;
this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_queued ;
@wqctav[this->spa->spa_name] = avg(this->allqueued) ;
@wqctmin[this->spa->spa_name] = min(this->allqueued) ;
@wqctmax[this->spa->spa_name] = max(this->allqueued) ;
@wqctsd[this->spa->spa_name] = stddev(this->allqueued) ;
@wqaddct[this->spa->spa_name] = count() ; /* Count ++ to Wait Queue */
}
/*
 * Count number of IOs by priority classification. In other words
 * we bucket each IO by ZIO priority as defined in the zio_priority enum.
 * This is in effect a histogram with 5 buckets, one each for:
 * sync-read, sync-write, async-read, async-write, and scrub.
 */
::vdev_queue_io_add:entry {
this->spa = args[1]->io_spa ;
this->pri = args[1]->io_priority ;
@syncrdct[this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_SYNC_READ ? 1 : 0) ;
@syncwrct[this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_SYNC_WRITE ? 1 : 0) ;
@asyncrdct[this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_ASYNC_READ ? 1 : 0) ;
@asyncwrct[this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_ASYNC_WRITE ? 1 : 0) ;
@scrubct[this->spa->spa_name] =
sum(this->pri == ZIO_PRIORITY_SCRUB ? 1 : 0) ;
}
/* Unconditional removal counter (the latency clause below is predicated). */
::vdev_queue_io_remove:entry {
this->spa = args[1]->io_spa ;
@wqrmct[this->spa->spa_name] = count() ; /* Count -- from Wait Queue */
}
/* (z)IO is leaving Wait Queue on its way to Run Queue */
::vdev_queue_io_remove:entry
/wqueue[args[0], args[1]] == args[1]->io_queued_timestamp/ {
/* Wait Queue residency, nanoseconds converted to microseconds. */
this->delta = (timestamp - wqueue[args[0], args[1]]) / 1000 ;
this->spa = args[1]->io_spa ;
@wqavlat[this->spa->spa_name] = avg(this->delta) ;
@wqminlat[this->spa->spa_name] = min(this->delta) ;
@wqmaxlat[this->spa->spa_name] = max(this->delta) ;
@wqsdlat[this->spa->spa_name] = stddev(this->delta) ;
/* If spending too long in the queue, identify as slow IO. */
@wqslowct[this->spa->spa_name] = sum(this->delta >= SLOW ? 1 : 0) ;
/*
 * Maintain a histogram of wait queue latency distribution
 * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s, >=1s.
 */
@wqlatlt1ms[this->spa->spa_name] =
sum(this->delta < 1000 ? 1 : 0) ;
@wqlat1to10ms[this->spa->spa_name] =
sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
@wqlat10to100ms[this->spa->spa_name] =
sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
@wqlat100msto1s[this->spa->spa_name] =
sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
/*
 * BUGFIX: use >= here; with the previous strict > comparison a delta of
 * exactly 1000000us (1s) fell into neither the 100ms-1s bucket nor this one.
 */
@wqlatgt1s[this->spa->spa_name] = sum(this->delta >= 1000000 ? 1 : 0) ;
/*
 * The zio_t structures are going to be recycled over and over and
 * as such, we need to make sure that we avoid chances of bogus data
 * because we are checking entry points into the clauses for a !0
 * value.
 */
wqueue[args[0], args[1]] = 0 ;
}
/* (z)IO is entering Run Queue */
::vdev_queue_pending_add:entry {
rqueue[args[0], args[1]] = timestamp ; /* time IO entered Run Queue */
this->spa = args[1]->io_spa ;
this->allactive = 0 ; /* reset to avoid bogus data */
/* Total in-flight (active) IOs for this pool across all queueable classes. */
this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_active ;
this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_active ;
this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_active ;
this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_active ;
this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_active ;
@rqctav[this->spa->spa_name] = avg(this->allactive) ;
@rqctmin[this->spa->spa_name] = min(this->allactive) ;
@rqctmax[this->spa->spa_name] = max(this->allactive) ;
@rqctsd[this->spa->spa_name] = stddev(this->allactive) ;
@rqaddct[this->spa->spa_name] = count() ; /* Count ++ to Run Queue */
}
/* IO enters the queue */
::vdev_queue_io:entry {
/*
 * This should capture the total time we spend waiting and processing
 * for each zio.
 */
totqueue[args[0]] = args[0]->io_queued_timestamp ;
}
/* Unconditional removal counter (the latency clause below is predicated). */
::vdev_queue_pending_remove:entry {
this->spa = args[1]->io_spa ;
@rqrmct[this->spa->spa_name] = count() ; /* Count -- from Run Queue */
}
/* (z)IO is leaving Run Queue, which implies IO completed */
::vdev_queue_pending_remove:entry /rqueue[args[0], args[1]]/ {
/* Run Queue (device service) time, nanoseconds converted to microseconds. */
this->delta = (timestamp - rqueue[args[0], args[1]]) / 1000 ;
this->spa = args[1]->io_spa ;
@rqavlat[this->spa->spa_name] = avg(this->delta) ;
@rqminlat[this->spa->spa_name] = min(this->delta) ;
@rqmaxlat[this->spa->spa_name] = max(this->delta) ;
/* Clamp 0 to 1us so sub-microsecond samples still contribute to stddev. */
@rqsdlat[this->spa->spa_name] = stddev(this->delta == 0 ?
1 : this->delta) ;
@rqslowct[this->spa->spa_name] = sum(this->delta >= SLOW ? 1 : 0) ;
/*
 * Maintain a histogram of run queue latency distribution
 * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s, >=1s.
 */
@rqlatlt1ms[this->spa->spa_name] =
sum(this->delta < 1000 ? 1 : 0) ;
@rqlat1to10ms[this->spa->spa_name] =
sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
@rqlat10to100ms[this->spa->spa_name] =
sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
@rqlat100msto1s[this->spa->spa_name] =
sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
/*
 * BUGFIX: use >= here; with the previous strict > comparison a delta of
 * exactly 1000000us (1s) fell into neither the 100ms-1s bucket nor this one.
 */
@rqlatgt1s[this->spa->spa_name] =
sum(this->delta >= 1000000 ? 1 : 0) ;
/*
 * The zio_t structures are going to be recycled over and over and
 * as such, we need to make sure that we avoid chances of bogus data
 * because we are checking entry points into the clauses for a !0
 * value.
 */
rqueue[args[0], args[1]] = 0 ;
}
/* IO completed */
::vdev_queue_io_done:entry
/totqueue[args[0]] == args[0]->io_queued_timestamp/ {
/*
 * This is where we measure amount of time all the way from IO entering the
 * wait queue through all the stages to completion.
 * The predicate rejects recycled zio_t structures whose current queued
 * timestamp no longer matches the value captured in vdev_queue_io.
 */
this->spa = args[0]->io_spa ;
this->delta = (timestamp - totqueue[args[0]]) / 1000 ; /* usecs */
@totqct[this->spa->spa_name] = count() ;
@totqavlat[this->spa->spa_name] = avg(this->delta) ;
@totqminlat[this->spa->spa_name] = min(this->delta) ;
@totqmaxlat[this->spa->spa_name] = max(this->delta) ;
/* Clamp 0 to 1us so sub-microsecond samples still contribute to stddev. */
@totqsdlat[this->spa->spa_name] = stddev(this->delta == 0 ?
1 : this->delta) ;
totqueue[args[0]] = 0 ;
}
tick-5sec {
/*
 * If the counts are not equal between these histograms, we may have
 * interrupted the script at the very moment an IO was transitioning
 * and it did not get accounted for correctly. In general we expect
 * that these counts are all equal.
 */
printa("%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@rqavlat, @rqminlat, @rqmaxlat, @rqsdlat, @rqlatlt1ms, @rqlat1to10ms,
@rqlat10to100ms, @rqlat100msto1s, @rqlatgt1s,
@rqslowct, @rqaddct, @rqrmct, @rqctav, @rqctmin, @rqctmax, @rqctsd,
@wqavlat, @wqminlat, @wqmaxlat, @wqsdlat, @wqlatlt1ms, @wqlat1to10ms,
@wqlat10to100ms, @wqlat100msto1s, @wqlatgt1s, @wqslowct, @wqaddct,
@wqrmct, @wqctav, @wqctmin, @wqctmax, @wqctsd,
@totqavlat, @totqminlat, @totqmaxlat, @totqsdlat, @totqct,
@syncrdct, @syncwrct, @asyncrdct, @asyncwrct, @scrubct
) ;
/* Reset all aggregates so each 5-second report covers only its own window. */
trunc(@rqavlat) ; trunc(@rqminlat) ; trunc(@rqmaxlat) ; trunc(@rqsdlat) ;
trunc(@rqlatlt1ms) ; trunc(@rqlat1to10ms) ; trunc(@rqlat10to100ms) ;
trunc(@rqlat100msto1s) ; trunc(@rqlatgt1s) ;
trunc(@rqslowct) ; trunc(@rqaddct) ; trunc(@rqrmct) ;
trunc(@rqctav) ; trunc(@rqctmin) ; trunc(@rqctmax) ; trunc(@rqctsd) ;
trunc(@wqavlat) ; trunc(@wqminlat) ; trunc(@wqmaxlat) ; trunc(@wqsdlat) ;
trunc(@wqlatlt1ms) ; trunc(@wqlat1to10ms) ; trunc(@wqlat10to100ms) ;
trunc(@wqlat100msto1s) ; trunc(@wqlatgt1s) ;
trunc(@wqslowct) ; trunc(@wqaddct) ; trunc(@wqrmct) ;
trunc(@wqctav) ; trunc(@wqctmin) ; trunc(@wqctmax) ; trunc(@wqctsd) ;
trunc(@totqavlat) ; trunc(@totqminlat) ; trunc(@totqmaxlat) ;
trunc(@totqsdlat) ; trunc(@totqct) ;
trunc(@syncrdct); trunc(@syncwrct); trunc(@asyncrdct);
trunc(@asyncwrct); trunc(@scrubct) ;
}dtrace -qn '
/* ZIO priority classes; see zio_priority.h. Bare enum (no typedef) here. */
enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} ;
BEGIN {
start = timestamp ; /* run start time, used by END to normalize to ops/sec */
/* Map priority enum values to human-readable labels for printa(). */
pri[ZIO_PRIORITY_SYNC_READ] = "Sync-Read" ;
pri[ZIO_PRIORITY_SYNC_WRITE] = "Sync-Write" ;
pri[ZIO_PRIORITY_ASYNC_READ] = "Async-Read" ;
pri[ZIO_PRIORITY_ASYNC_WRITE] = "Async-Write" ;
pri[ZIO_PRIORITY_SCRUB] = "Scrub" ;
pri[ZIO_PRIORITY_NUM_QUEUEABLE] = "Queueable" ;
pri[ZIO_PRIORITY_NOW] = "Non-queued" ;
}
::zio_read:entry, ::zio_write:entry {
self->s = timestamp ; /* thread-local start time for latency measurement */
}
::zio_read:return, ::zio_write:return {
/* Per-priority operation counts and IO size distribution. */
@ops[pri[args[1]->io_priority]] = count() ;
@hist[pri[args[1]->io_priority]] = quantize(args[1]->io_size) ;
}
/* Capture amount of time spent in this function. */
::zio_read:return, ::zio_write:return /self->s/ {
@avlat[pri[args[1]->io_priority]] = avg(timestamp - self->s) ;
self->s = 0 ;
}
END {
/*
 * Normalize counts to operations per second.
 * BUGFIX: clamp elapsed seconds to a minimum of 1; the previous code
 * passed 0 to normalize() when the script ran for less than one second.
 */
this->secs = (timestamp - start) / 1000000000 ;
normalize(@ops, this->secs > 0 ? this->secs : 1) ;
printa("\tPRI: %s Ops/s: %@d AvLat(ns): %@d %@d\n",
@ops, @avlat, @hist) ;
}'#!/usr/sbin/dtrace -Cs
#pragma D option quiet
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* Copyright (c) 2017 Sam Zaydel / RackTop Systems.
*
* Script is intended to provide details about queueing at the vdev layer
* aggregating some information as well as reporting how many times we
* did not issue an IO, because queue was empty as opposed to how many times
* IOs were issued, because queue was not empty. Details are given through
* histograms describing size of queues and amount of time IO spends in wait
* queue before being processed.
* We still capture descriptive statistics here, like the avg(s), min(s), max(s),
* etc.
* Our goal is to capture time spent information along with queue size details
* because there is an expected relationship between size of queue and time
* IOs spend in the queue(s).
*/
/*
 * Per-zio entry timestamps (ns). NOTE(review): totqueue is declared here
 * but is not referenced in the visible portion of this snippet — confirm
 * against the full script before removing.
 */
long long ts[zio_t *], totqueue[zio_t *];
/* see usr/src/uts/common/fs/zfs/sys/zio_priority.h for details */
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
/*
 * Two-bucket distribution of issue outcomes: value 1 when
 * vdev_queue_io_to_issue() returned an IO to issue, 0 when the queue
 * yielded nothing.
 */
vdev_queue_io_to_issue:return
/arg1 != NULL/
{
@qe = quantize(1) ;
}
vdev_queue_io_to_issue:return
/arg1 == NULL/
{
@qe = quantize(0) ;
}
/* Descriptive statistics on the active (run) tree length at each call. */
vdev_queue_io_to_issue:entry {
@vdavrq = avg(args[0]->vq_active_tree.avl_numnodes);
@vdminrq = min(args[0]->vq_active_tree.avl_numnodes);
@vdmaxrq = max(args[0]->vq_active_tree.avl_numnodes);
@vdsdrq = stddev(args[0]->vq_active_tree.avl_numnodes);
@vdrqhist = quantize(args[0]->vq_active_tree.avl_numnodes);
}
/*
 * Bucket each queued IO by its ZIO priority, as defined in the
 * zio_priority enum above. lquantize(.., 0, 5) yields one bucket per
 * queueable priority -- sync-read, sync-write, async-read,
 * async-write, and scrub -- keyed by pool name.
 */
vdev_queue_io_add:entry {
@prihist[args[1]->io_spa->spa_name] =
    lquantize(args[1]->io_priority, 0, 5) ;
}
/*
 * IO is entering the wait queue: stamp it, then snapshot the pool-wide
 * wait-queue depth so depth can later be related to time-in-queue.
 */
vdev_queue_io_add:entry {
ts[args[1]] = timestamp ;
/*
 * Total wait-queue depth for this pool: the sum of spa_queued over
 * every queueable ZIO priority, i.e. all priorities p where
 * p >= 0 && p < ZIO_PRIORITY_NUM_QUEUEABLE.
 */
this->all_pri = (
args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_queued +
args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_queued +
args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_queued +
args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_queued +
args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_queued
) ;
/* Descriptive stats and histogram of wait-queue length, per pool. */
@avwq[args[1]->io_spa->spa_name] = avg(this->all_pri) ;
@sdwq[args[1]->io_spa->spa_name] = stddev(this->all_pri) ;
@minwq[args[1]->io_spa->spa_name] = min(this->all_pri) ;
@maxwq[args[1]->io_spa->spa_name] = max(this->all_pri) ;
@wqhist[args[1]->io_spa->spa_name] = quantize(this->all_pri) ;
}
/*
 * IO is entering the run (pending/active) queue: snapshot the
 * pool-wide run-queue depth at that moment.
 */
vdev_queue_pending_add:entry {
/*
 * Total run-queue depth for this pool: the sum of spa_active over
 * every queueable ZIO priority, i.e. all priorities p where
 * p >= 0 && p < ZIO_PRIORITY_NUM_QUEUEABLE.
 */
this->all_pri = (
args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_active +
args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_active +
args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_active +
args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_active +
args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_active
) ;
/* Descriptive stats and histogram of run-queue length, per pool. */
@avrq[args[1]->io_spa->spa_name] = avg(this->all_pri) ;
@sdrq[args[1]->io_spa->spa_name] = stddev(this->all_pri) ;
@minrq[args[1]->io_spa->spa_name] = min(this->all_pri) ;
@maxrq[args[1]->io_spa->spa_name] = max(this->all_pri) ;
@rqhist[args[1]->io_spa->spa_name] = quantize(this->all_pri) ;
}
/*
 * Measure delta between the time given ZIO entered and exited wait
 * queue. The predicate skips zios we never saw enter (script started
 * mid-flight). Delta is converted from nanoseconds to microseconds.
 */
vdev_queue_io_remove:entry /ts[args[1]] > 0/ {
this->delta = (timestamp - ts[args[1]]) / 1000 ;
@avwqlat[args[1]->io_spa->spa_name] = avg(this->delta) ;
@sdwqlat[args[1]->io_spa->spa_name] = stddev(this->delta) ;
@minwqlat[args[1]->io_spa->spa_name] = min(this->delta) ;
@maxwqlat[args[1]->io_spa->spa_name] = max(this->delta) ;
@wqlathist[args[1]->io_spa->spa_name] = quantize(this->delta) ;
ts[args[1]] = 0 ; /* zio structures are recycled */
}
/* IO enters the queue */
::vdev_queue_io:entry /totqueue[args[0]] == 0/ {
/*
 * This should capture the total time we spend waiting and processing
 * for each zio. The predicate ensures we only record the first entry
 * for a given zio (totqueue[] defaults to 0 for unseen keys).
 */
/* hrtime_t of entry for given IO into queue */
totqueue[args[0]] = args[0]->io_queued_timestamp ;
}
/* IO completed */
::vdev_queue_io_done:entry
/* match hrtime_t saved in the totqueue[args[0]] to value in io_queued_timestamp */
/totqueue[args[0]] == args[0]->io_queued_timestamp/ {
/*
* This is where we measure amount of time all the way from IO entering the
* wait queue through all the stages to completion.
* The point when IO was queued is stored in io_queued_timestamp field,
* and we delta between it and current time.
*/
this->delayed = args[0]->io_target_timestamp > 0 ? 1 : 0;
this->spa = args[0]->io_spa ;
this->delta = (timestamp - totqueue[args[0]]) / 1000 ; /* mS */
@totqavlat[this->spa->spa_name] = avg(this->delta) ;
@totqminlat[this->spa->spa_name] = min(this->delta) ;
@totqmaxlat[this->spa->spa_name] = max(this->delta) ;
@totqsdlat[this->spa->spa_name] = stddev(this->delta) ;
@totqhist[this->spa->spa_name] = quantize(this->delta) ;
/* Each delay will result in @delayed ++ */
@delayed[this->spa->spa_name] = sum(this->delayed) ;
totqueue[args[0]] = 0 ;
}
/*
 * Final report. All latencies are in microseconds (uS). Each printa
 * pairs per-pool descriptive statistics with the matching
 * power-of-two histogram (the trailing %@d in each format).
 */
END {
printf("************************\n") ;
printf("Empty v. Not Empty Queue\n") ;
printf("************************\n\n") ;
printa("IOs (Not Physical = 0 | Physical = 1) %@d\n", @qe) ;
printf("\n");
printf("************************\n") ;
printf("*** Queueing Details ***\n") ;
printf("************************\n\n") ;
printa("\t [%s] Wait Queue Time Avg/Min/Max/StdDev | %@d(uS)/%@d(uS)/%@d(uS)/%@d(uS)%@d\n",
@avwqlat, @minwqlat, @maxwqlat, @sdwqlat, @wqlathist) ;
printa("\t [%s] Wait Queue Length Avg/Min/Max/StdDev | %@d/%@d/%@d/%@d%@d\n",
@avwq, @minwq, @maxwq, @sdwq, @wqhist) ;
printa("\t [%s] Run Queue Length Avg/Min/Max/StdDev | %@d/%@d/%@d/%@d%@d\n",
@avrq, @minrq, @maxrq, @sdrq, @rqhist) ;
printa("\t Per Device Queue Length Avg/Min/Max/StdDev | %@d/%@d/%@d/%@d%@d\n",
@vdavrq, @vdminrq, @vdmaxrq, @vdsdrq, @vdrqhist) ;
printa("\t [%s] Wait+Run Queue Time Avg/Min/Max/StdDev | %@d(uS)/%@d(uS)/%@d(uS)/%@d(uS)\n\t Delayed IOs: %@d%@d\n",
@totqavlat, @totqminlat, @totqmaxlat, @totqsdlat, @delayed, @totqhist) ;
printa("\t [%s] IO Histogram by ZIO priority\n SyncRD => 0 | SyncWR => 1 | AsyncRD => 2 | AsyncWR => 3 | Scrub => 4 %@d\n", @prihist) ;
}dtrace -qn '
/*
 * This script is intended for observing IOs as they transition from
 * Wait Queue to Run Queue and to completion. Latency is measured as
 * the IOs enter and then exit the Wait Queue, and then as they leave
 * the Wait Queue and enter the Run Queue.
 * We collect this information by pool, so we can see differences in
 * terms of time spent in these two queues. Ideally we want to see as
 * little as possible time spent in these queues, and the dispersion
 * between lowest and highest times remaining tight and fairly consistent
 * without offending outliers that likely will be seen as load increases.
 */
/*
 * Globals keyed by zio_t pointer; each holds the hrtime at which the
 * zio entered the respective queue, and is cleared on exit because
 * zio structures are recycled.
 */
int wqueue[zio_t *]; /* zio_t pointer passed through the queuing stages */
int rqueue[zio_t *]; /* same as above used for run queue */
int wrqueue[zio_t *]; /* same as above used for total queue time */
/* (z)IO is entering Wait Queue: stamp its entry time. */
::vdev_queue_io_add:entry {
/*
 * NOTE(review): this->spa is never read within this clause, and
 * clause-local (this->) variables are only defined for the duration
 * of a single probe firing -- other probes cannot rely on it.
 */
this->spa = args[1]->io_spa ;
wqueue[args[1]] = timestamp ;
}
/* (z)IO is leaving Wait Queue on its way to Run Queue */
::vdev_queue_io_remove:entry /wqueue[args[1]] > 0/ {
this->delta = (timestamp - wqueue[args[1]]) / 1000 ;
@waitqlat[this->spa->spa_name] = quantize(this->delta) ;
@waitqct[this->spa->spa_name] = count() ;
self->waitw_t = 0 ;
wqueue[args[1]] = 0 ;
}
/* (z)IO is entering Run Queue: stamp its entry time. */
::vdev_queue_pending_add:entry {
rqueue[args[1]] = timestamp ; /* time IO entered Run Queue */
/*
 * NOTE(review): this->spa is never read within this clause; the
 * pending_remove clause below derives its own copy from args[1].
 */
this->spa = args[1]->io_spa ;
}
/* IO enters the queue */
::vdev_queue_io:entry {
/*
 * This should capture the total time we spend waiting and processing
 * for each zio -- the wall-clock start of its journey through both
 * queues.
 */
wrqueue[args[0]] = timestamp ;
}
/* IO completed */
::vdev_queue_io_done:entry /wrqueue[args[0]] > 0/ {
/*
 * This is where we measure amount of time all the way from IO entering the
 * wait queue through all the stages to completion. Delta is converted
 * from nanoseconds to microseconds.
 */
this->spa = args[0]->io_spa ;
this->delta = (timestamp - wrqueue[args[0]]) / 1000 ;
@wqtorqlat[this->spa->spa_name] = quantize(this->delta) ;
@wqtorqct[this->spa->spa_name] = count() ;
wrqueue[args[0]] = 0 ; /* zio structures are recycled */
}
/* (z)IO is leaving Run Queue, which implies IO completed */
::vdev_queue_pending_remove:entry /rqueue[args[1]]/ {
this->delta = (timestamp - rqueue[args[1]]) / 1000 ; /* uS */
this->spa = args[1]->io_spa ;
@runqlat[this->spa->spa_name] = quantize(this->delta) ;
@runqct[this->spa->spa_name] = count() ;
/*
 * The zio_t structures are going to be recycled over and over and
 * as such, we need to make sure that we avoid chances of bogus data
 * because we are checking entry points into the clauses for a !0
 * value.
 */
rqueue[args[1]] = 0 ;
}
/* Final report: per-pool latency histograms (uS) with event counts. */
END {
/*
 * If the counts are not equal between these histograms, we may have
 * interrupted the script at the very moment an IO was transitioning
 * and it did not get accounted for correctly. In general we expect
 * that these counts are all equal.
 */
printa(" Wait Queue [%s](uS)\t\tCount: %@d%@d\n", @waitqct, @waitqlat) ;
printa(" Run Queue [%s](uS)\t\tCount: %@d%@d\n", @runqct, @runqlat) ;
printa(" W+R Queue [%s](uS)\t\tCount: %@d%@d\n", @wqtorqct, @wqtorqlat) ;
}'#!/usr/sbin/dtrace -Cs
#pragma D option quiet
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* Copyright (c) 2017 Sam Zaydel / RackTop Systems.
*
* Script will report queue depth for what we can think of as
* run queue and wait queue in ZFS. The data is drawn from kstat
* each time we add an entry into the run wait queue, which happens
* through a call to `kstat_runq_enter`, or entry in the wait queue
* which happens through a call to `kstat_waitq_enter`.
*
* What we get from this is an output that looks something like this:
* pool,rq.ct,rq.av,rq.max,rq.e,rq.grew,wq.ct,wq.av,wq.max,wq.e,wq.grew
* p01,19613,30,190,105,8136,73053,52,513,40468,26471
* p01,24770,50,358,149,11269,84962,27,247,51705,27830
* p01,17107,15,58,119,6719,64745,46,460,35134,24129
*
* Column Definitions:
* rq.ct/wq.ct => Count of enter(s) into run/wait queues
* rq.av/wq.av => Average number of IOs in run/wait queues
* rq.max/wq.max => Maximum number of IOs in run/wait queues
* rq.e/wq.e => Count of enter(s) into empty run/wait queues
* rq.grew/wq.grew => Count of times run/wait queue grew from last entry
*
* This data captures count of times we entered queue, number of items in
* queue at the time we entered, maximum number of items during sample
* period, count of times queue had 0 items, and whether queue grew since
* last time we counted number of items. We do this for both Run and Wait
* queues. Results are printed at regular intervals.
*/
#include <kstat.h>
/*
 * Extract wait/run queue element counts from a kstat_io_t pointer.
 * (-> binds tighter than the cast, so the cast applies to the member
 * value, which is the intent here.)
 */
#define KSTAT_IO_WCNT(kptr)((uint_t)(kptr)->wcnt)
#define KSTAT_IO_RCNT(kptr)((uint_t)(kptr)->rcnt)
/*
 * lastcnt[] remembers the previous queue depth, keyed by PREV_RCNT or
 * PREV_WCNT, so each enter can tell whether the queue grew.
 * NOTE(review): this state is shared across all pools -- with multiple
 * active pools the grew/shrank comparison mixes their depths; confirm
 * whether per-pool keying is needed.
 */
int lastcnt[int] ; /* track count of IOs in queue for delta calculation */
inline int PREV_RCNT = 0 ;
inline int PREV_WCNT = 1 ;
/* Emit the CSV header once; column meanings are described above. */
BEGIN {
printf("pool,rq.ct,rq.av,rq.max,rq.e,rq.grew,wq.ct,wq.av,wq.max,wq.e,wq.grew\n");
}
/*
 * Tag the current thread with the zio being queued so the
 * kstat_waitq_enter/kstat_runq_enter clauses below can attribute the
 * kstat update to a pool. Relies on the kstat call happening on the
 * same thread within the same code path.
 */
/* Tag IO as entering Wait Queue */
::vdev_queue_io_add:entry {
self->pzio = args[1] ;
}
/* Tag IO as entering Run Queue */
::vdev_queue_pending_add:entry {
self->pzio = args[1] ;
}
/*
 * Wait Queue: sample the kstat wait-queue depth each time an element
 * is added, attributed to the pool of the zio tagged above.
 * NOTE(review): the predicate only checks that a zio was tagged on
 * this thread; it does not verify args[0] is this pool's io kstat --
 * confirm no other kstat_waitq_enter call can fire in between.
 */
::kstat_waitq_enter:entry /self->pzio/ {
this->spa = self->pzio->io_spa ;
this->p = self->pzio->io_priority ;
/* this->activ = this->spa->spa_queue_stats[this->p].spa_active ; */
this->cnt = KSTAT_IO_WCNT(args[0]) ;
/* Wait Queue Count */
@wqc[self->pzio->io_spa->spa_name] = count() ;
@wqav[self->pzio->io_spa->spa_name] = avg(this->cnt) ;
@wqmax[self->pzio->io_spa->spa_name] = max(this->cnt) ;
/* Count enters that found the queue empty (depth 0). */
@wqempty[self->pzio->io_spa->spa_name] = sum(this->cnt > 0 ? 0 : 1) ;
/*
 * We want to determine here if last time we entered wait queue
 * we had fewer or more items than we do now, i.e. queue grew?
 */
@wqgrew[self->pzio->io_spa->spa_name] =
sum(this->cnt > lastcnt[PREV_WCNT] ? 1 : 0) ;
lastcnt[PREV_WCNT] = this->cnt ;
self->pzio = 0 ; /* untag the thread */
}
/*
 * Run Queue: sample the kstat run-queue depth each time an element is
 * added, attributed to the pool of the zio tagged above.
 * NOTE(review): as with the wait-queue clause, the predicate does not
 * verify args[0] belongs to this pool's io kstat.
 */
::kstat_runq_enter:entry /self->pzio/ {
this->spa = self->pzio->io_spa ;
this->p = self->pzio->io_priority ;
/* this->activ = this->spa->spa_queue_stats[this->p].spa_active ; */
this->cnt = KSTAT_IO_RCNT(args[0]) ;
/* Run Queue Count */
@rqc[self->pzio->io_spa->spa_name] = count() ;
@rqav[self->pzio->io_spa->spa_name] = avg(this->cnt) ;
@rqmax[self->pzio->io_spa->spa_name] = max(this->cnt) ;
/* Count enters that found the queue empty (depth 0). */
@rqempty[self->pzio->io_spa->spa_name] = sum(this->cnt > 0 ? 0 : 1) ;
/*
 * We want to determine here if last time we entered run queue
 * we had fewer or more items than we do now, i.e. queue grew?
 */
@rqgrew[self->pzio->io_spa->spa_name] =
sum(this->cnt > lastcnt[PREV_RCNT] ? 1 : 0) ;
lastcnt[PREV_RCNT] = this->cnt ;
self->pzio = 0 ; /* untag the thread */
}
/*
 * Emit one CSV row per pool every 5 seconds, then truncate all
 * aggregations so each interval reports an independent sample.
 */
tick-5sec {
printa("%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@rqc, @rqav, @rqmax, @rqempty, @rqgrew, @wqc, @wqav, @wqmax, @wqempty, @wqgrew) ;
trunc(@rqc); trunc(@rqav); trunc(@rqmax); trunc(@rqempty); trunc(@rqgrew);
trunc(@wqc); trunc(@wqav); trunc(@wqmax); trunc(@wqempty); trunc(@wqgrew);
}#!/usr/sbin/dtrace -Cs
#pragma D option quiet
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* Copyright (c) 2017 Sam Zaydel / RackTop Systems.
*
* Script will report queue depth for what we can think of as
* run queue and wait queue in ZFS. The data is drawn from kstat
* each time we add an entry into the run wait queue, which happens
* through a call to `kstat_runq_enter`, or entry in the wait queue
* which happens through a call to `kstat_waitq_enter`.
*
* We take a count of number of elements in the queue and quantize
* results creating a distribution with peak being the mode of this
* distribution. In general resulting distributions are unimodal, though
* it is possible to see a bimodal distribution. If we see a bimodal
* distribution, we are looking at the the grouped histogram where we
* do no distinguish between different ZIO Priority queues. There are
* MIN and MAX tunables in ZIO pipeline, and the two modes are likely
* a result of switching between MIN and MAX settings, due to change in
* IO load, based on amount of work.
* Kstats do not specifically identify individual ZIO queues, instead they
* report information aggregating by the two broad states, Wait and Run.
* Some IOs will most certainly be double counted, since an IO will move
* from Wait to Run, and possibly back to Wait.
*/
#include <kstat.h>
/*
 * Extract the wait/run queue element counts from a kstat_io_t pointer.
 * The cast must bind to the pointer BEFORE the member access: the
 * original `(kstat_io_t *)(kptr)->wcnt` cast the member value to a
 * pointer type (-> binds tighter than the cast), which is not what
 * quantize() should receive.
 */
#define KSTAT_IO_WCNT(kptr)	(((kstat_io_t *)(kptr))->wcnt)
#define KSTAT_IO_RCNT(kptr)	(((kstat_io_t *)(kptr))->rcnt)
/* See uts/common/fs/zfs/sys/zio_priority.h */
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t; /* the original typedef declared no name */
BEGIN {
/*
 * Map zio_priority enum values to human-readable labels; these are
 * used as aggregation keys for the per-priority histograms below.
 */
qn[ZIO_PRIORITY_SYNC_READ] = "Read";
qn[ZIO_PRIORITY_ASYNC_READ] = "Prefetch";
qn[ZIO_PRIORITY_SYNC_WRITE] = "Write(ZIL)";
qn[ZIO_PRIORITY_ASYNC_WRITE] = "Write(SPAsync)";
/* Label typo fixed: was "Read(Scrum/Resiver)". */
qn[ZIO_PRIORITY_SCRUB] = "Read(Scrub/Resilver)";
/* Label typo fixed: was "Queuable". */
qn[ZIO_PRIORITY_NUM_QUEUEABLE] = "Queueable";
qn[ZIO_PRIORITY_NOW] = "Immediate";
/* At the end, we compute delta to normalize count by second. */
start_time = timestamp ;
}
/*
 * Tag IO as entering Wait Queue (vdev_queue_io_add is the wait-queue
 * insertion; the original comment said "Run Queue" by mistake).
 * Remember both the zio and the pool's io-kstat data pointer so the
 * kstat clauses below can match on the exact kstat being updated.
 */
::vdev_queue_io_add:entry {
self->pzio = args[1] ;
this->spa = self->pzio->io_spa ;
self->ks = this->spa->spa_iokstat->ks_data ;
}
/*
 * Tag IO as entering Run Queue (vdev_queue_pending_add is the
 * run-queue insertion; the original comment said "Wait Queue" by
 * mistake). Same tagging scheme as the wait-queue clause above.
 */
::vdev_queue_pending_add:entry {
self->pzio = args[1] ;
this->spa = self->pzio->io_spa ;
self->ks = this->spa->spa_iokstat->ks_data ;
}
/*
 * Wait Queue: fires only when the kstat being updated (args[0]) is
 * the io kstat of the pool we tagged on this thread -- a stronger
 * match than checking self->pzio alone.
 */
::kstat_waitq_enter:entry /self->ks == args[0]/ {
this->spa = self->pzio->io_spa ;
this->p = self->pzio->io_priority ;
/* Depth of this zio's own priority queue at enter time. */
this->activ = this->spa->spa_queue_stats[this->p].spa_active ;
@c[self->pzio->io_spa->spa_name, "WAITQ"] = count() ;
@q[self->pzio->io_spa->spa_name, "WAITQ"] =
quantize(KSTAT_IO_WCNT(args[0])) ;
/* Grouped by ZIO Priority all IOs in Wait Queue */
@qbyziot[self->pzio->io_spa->spa_name,
"WAITQ", qn[this->p]] = quantize(this->activ);
/* Untag the thread so unrelated kstat updates cannot match. */
self->pzio = 0 ;
self->ks = 0 ;
}
/*
 * Run Queue: same matching scheme as the wait-queue clause -- fires
 * only when args[0] is the tagged pool's io kstat.
 */
::kstat_runq_enter:entry /self->ks == args[0]/ {
this->spa = self->pzio->io_spa ;
this->p = self->pzio->io_priority ;
/* Depth of this zio's own priority queue at enter time. */
this->activ = this->spa->spa_queue_stats[this->p].spa_active ;
@c[self->pzio->io_spa->spa_name, "RUNQ"] = count() ;
@q[self->pzio->io_spa->spa_name, "RUNQ"] =
quantize(KSTAT_IO_RCNT(args[0])) ;
/* Grouped by ZIO Priority all IOs in Run Queue */
@qbyziot[self->pzio->io_spa->spa_name,
"RUNQ", qn[this->p]] = quantize(this->activ);
/* Untag the thread so unrelated kstat updates cannot match. */
self->pzio = 0 ;
self->ks = 0 ;
}
END {
/*
 * Whole seconds the script ran; used to normalize the event counts
 * to per-second rates. Guard against a sub-second runtime, which
 * would otherwise pass 0 to normalize() (divide by zero).
 */
runtime_sec = (timestamp - start_time) / 1000000000 ;
normalize(@c, runtime_sec > 0 ? runtime_sec : 1) ;
/*
 * Print histograms and final count by pool.
 * We should hope that total count in Run and Wait queues
 * equals to sum of counts over all ZIO priority queues.
 * There are several states that any ZIO could be in, and they are
 * represented by individual histograms. In other words, each state
 * will be represented by a single histogram. We should expect most
 * by zio priority histograms to be pretty boring.
 */
printa(" %s State: %s\tZIO Priority: %s%@d\n", @qbyziot);
printa(" %s [Queued in: %s] %@d\n", @q);
printf("+----------------------------------------------+\n");
printf("|---->> Pool Per-Second Normalized Count <<----|\n");
printf("+----------------------------------------------+\n");
printf("Pool Queue\tCount\n");
printa("%-8s %-5s\t%@d\n", @c);
printf("+----------------------------------------------+\n");
}