szaydel
12/5/2016 - 8:17 PM

[ZFS pool Queue and Per-Disk Level Observations] Dtrace snippets for measuring queue depths and latency at the boundary between ZFS and physical storage.

[ZFS pool Queue and Per-Disk Level Observations] Dtrace snippets for measuring queue depths and latency at the boundary between ZFS and physical storage. #tags: per-disk, iostat, dtrace, queue, queuing, spa, vdev_queue_io_add, kstat, runq, waitq, queue-length, queue-depth, zio-class, csv

#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=16M
/*
 * This script is intended for observing IOs as they transition from
 * Wait Queue to Run Queue and to completion. Latency is measured as
 * the IOs enter and then exit the Wait Queue, and then as they leave
 * the Wait Queue and enter the Run Queue.
 * We collect this information by pool, so we can see differences in
 * terms of time spent in these two queues.
 * Also collected are counts of IOs by priority classification. See the
 * zio_priority enum below, as well as zio_priority.h for more details.
 * Ideally we want to see as little time as possible spent in these queues,
 * with dispersion between the lowest and highest times remaining tight and
 * fairly consistent without many offending outliers.
 *
 * aggregates prefixed with `tot` are tracking each ZIO from time it gets into
 * the queue to the time it is processed and is removed from queue.
 * We can think of `tot*` aggregates as a proxy for latency that we would
 * normally measure at the application level, roughly.
 */
/*
 * Global associative arrays keyed by (vdev queue, zio) so the same zio
 * seen on different vdev queues cannot collide. Values hold timestamps
 * (nanoseconds) used to compute time-in-queue deltas.
 */
long wqueue[vdev_queue_t *, zio_t *];   /* measure wait queue time */
long rqueue[vdev_queue_t *, zio_t *];   /* measure run queue time */
long totqueue[zio_t *];                 /* measure combined queue time */

inline const int SLOW = 10000 ; /* latency in us; >=10ms considered slow */

/* see usr/src/uts/common/fs/zfs/sys/zio_priority.h for details */
/*
 * Local mirror of the kernel zio_priority enum. Member order must match
 * the kernel header exactly, because these values are used to index the
 * spa_queue_stats[] array in the clauses below.
 */
typedef enum zio_priority {
    ZIO_PRIORITY_SYNC_READ,
    ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
    ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
    ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
    ZIO_PRIORITY_SCRUB,		    /* asynchronous scrub/resilver reads */
    ZIO_PRIORITY_NUM_QUEUEABLE,

    ZIO_PRIORITY_NOW		    /* non-queued i/os (e.g. free) */
} zio_priority_t;

/*
 * Emit the CSV header once at start-up. Column order here must stay in
 * lock-step with the printa() format in the tick-5sec clause below.
 */
BEGIN {
    printf("ts,pool,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
        "rqav", "rqmin", "rqmax", "rqsd", "rqlt1ms", "rq1to10ms", "rq10to100ms",
        "rq100msto1s", "rqgt1s", "rqslowct", "rqaddct", "rqrmct", "rqctav",
        "rqctmin", "rqctmax", "rqctsd",
        "wqav", "wqmin", "wqmax", "wqsd", "wqlt1ms", "wq1to10ms", "wq10to100ms",
        "wq100msto1s", "wqgt1s", "wqslowct", "wqaddct", "wqrmct", "wqctav",
        "wqctmin", "wqctmax", "wqctsd",
        "totqav", "totqmin", "totqmax", "totqsd", "totqct", "syncrdct", "syncwrct", "asyncrdct", "asyncwrct", "scrubct"
    );
    /*
     * Truncate walltimestamp (ns since epoch) down to a whole-second
     * boundary; used as the leading aggregation key so that every row
     * printed can be grouped into a 1-second bucket.
     */
    interval = walltimestamp - (walltimestamp%1000000000);
}

/* (z)IO is entering Wait Queue */
::vdev_queue_io_add:entry {
    /*
     * Remember the zio's queue-insertion timestamp keyed by
     * (vdev queue, zio). The matching io_remove clause only fires while
     * the stored value still equals io_queued_timestamp, which guards
     * against recycled zio_t structures producing bogus deltas.
     */
    wqueue[args[0], args[1]]  = args[1]->io_queued_timestamp ;
    this->spa        = args[1]->io_spa ;
    this->allqueued  = 0 ; /* reset to avoid bogus data */
    /* Sum queued-IO counts across all five queueable priority classes. */
    this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_queued ;
    this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_queued ;
    this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_queued ;
    this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_queued ;
    this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_queued ;
    /* Descriptive statistics of wait-queue depth per (interval, pool). */
    @wqctav[interval, this->spa->spa_name]    = avg(this->allqueued) ;
    @wqctmin[interval, this->spa->spa_name]   = min(this->allqueued) ;
    @wqctmax[interval, this->spa->spa_name]   = max(this->allqueued) ;
    @wqctsd[interval, this->spa->spa_name]    = stddev(this->allqueued) ;
    @wqaddct[interval, this->spa->spa_name]   = count() ; /* Count ++ to Wait Queue */
}

/*
 * Classify each queued (z)IO by its ZIO priority, as defined in the
 * zio_priority enum. This behaves like a five-bucket histogram keyed by
 * (1s interval, pool): sync-read, sync-write, async-read, async-write
 * and scrub.
 */
::vdev_queue_io_add:entry {
    this->sp   = args[1]->io_spa ;
    this->prio = args[1]->io_priority ;
    @syncrdct[interval, this->sp->spa_name]  = sum(this->prio == ZIO_PRIORITY_SYNC_READ ? 1 : 0) ;
    @syncwrct[interval, this->sp->spa_name]  = sum(this->prio == ZIO_PRIORITY_SYNC_WRITE ? 1 : 0) ;
    @asyncrdct[interval, this->sp->spa_name] = sum(this->prio == ZIO_PRIORITY_ASYNC_READ ? 1 : 0) ;
    @asyncwrct[interval, this->sp->spa_name] = sum(this->prio == ZIO_PRIORITY_ASYNC_WRITE ? 1 : 0) ;
    @scrubct[interval, this->sp->spa_name]   = sum(this->prio == ZIO_PRIORITY_SCRUB ? 1 : 0) ;
}

::vdev_queue_io_remove:entry {
    /* Count -- from Wait Queue, keyed by 1s interval and pool name. */
    @wqrmct[interval, args[1]->io_spa->spa_name] = count() ;
}

/* (z)IO is leaving Wait Queue on its way to Run Queue */
::vdev_queue_io_remove:entry
/wqueue[args[0], args[1]] == args[1]->io_queued_timestamp/ {
    /* Time spent on the wait queue, converted from ns to us. */
    this->delta     = (timestamp - wqueue[args[0], args[1]]) / 1000 ;
    this->spa       = args[1]->io_spa ;
    @wqavlat[interval, this->spa->spa_name]       = avg(this->delta) ;
    @wqminlat[interval, this->spa->spa_name]      = min(this->delta) ;
    @wqmaxlat[interval, this->spa->spa_name]      = max(this->delta) ;
    @wqsdlat[interval, this->spa->spa_name]       = stddev(this->delta) ;
    /* If spending too long in the queue, identify as slow IO. */
    @wqslowct[interval, this->spa->spa_name]      = sum(this->delta >= SLOW ? 1 : 0) ;
    /* 
     * Maintain a histogram of wait queue latency distribution 
     * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s, >=1s.
     */
    @wqlatlt1ms[interval, this->spa->spa_name]    = 
        sum(this->delta < 1000 ? 1 : 0) ;
    @wqlat1to10ms[interval, this->spa->spa_name]  = 
        sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
    @wqlat10to100ms[interval, this->spa->spa_name]= 
        sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
    @wqlat100msto1s[interval, this->spa->spa_name]= 
        sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
    /*
     * Use >= here so a delta of exactly 1s is not lost between the
     * 100ms-1s bucket (which excludes 1s) and this one.
     */
    @wqlatgt1s[interval, this->spa->spa_name]     = sum(this->delta >= 1000000 ? 1 : 0) ;
    /* 
     * The zio_t structures are going to be recycled over and over and
     * as such, we need to make sure that we avoid chances of bogus data
     * because we are checking entry points into the clauses for a !0
     * value.
     */
    wqueue[args[0], args[1]] = 0 ;
}

/* (z)IO is entering Run Queue */
::vdev_queue_pending_add:entry {
    rqueue[args[0], args[1]]  = timestamp ; /* time IO entered Run Queue */
    this->spa        = args[1]->io_spa ;
    this->allactive  = 0 ; /* reset to avoid bogus data */
    /* Sum in-flight (active) IO counts across the five queueable classes. */
    this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_active ;
    this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_active ;
    this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_active ;
    this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_active ;
    this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_active ;
    /* Descriptive statistics of run-queue depth per (interval, pool). */
    @rqctav[interval, this->spa->spa_name]    = avg(this->allactive) ;
    @rqctmin[interval, this->spa->spa_name]   = min(this->allactive) ;
    @rqctmax[interval, this->spa->spa_name]   = max(this->allactive) ;
    @rqctsd[interval, this->spa->spa_name]    = stddev(this->allactive) ;
    @rqaddct[interval, this->spa->spa_name]   = count() ; /* Count ++ to Run Queue */
}

/* IO enters the queue */
::vdev_queue_io:entry {
    /* 
     * This should capture the total time we spend waiting and processing
     * for each zio.
     * Note: we store io_queued_timestamp rather than the probe firing
     * time; the vdev_queue_io_done clause compares against the same
     * field to confirm it is looking at the same (not recycled) zio.
     */
    totqueue[args[0]]    = args[0]->io_queued_timestamp ;
}

::vdev_queue_pending_remove:entry {
    /* Count -- from Run Queue, keyed by 1s interval and pool name. */
    @rqrmct[interval, args[1]->io_spa->spa_name] = count() ;
}

/* (z)IO is leaving Run Queue, which implies IO completed */
::vdev_queue_pending_remove:entry /rqueue[args[0], args[1]]/ {
    /* Time spent on the run queue, converted from ns to us. */
    this->delta     = (timestamp - rqueue[args[0], args[1]]) / 1000 ;
    this->spa       = args[1]->io_spa ;
    @rqavlat[interval, this->spa->spa_name]       = avg(this->delta) ;
    @rqminlat[interval, this->spa->spa_name]      = min(this->delta) ;
    @rqmaxlat[interval, this->spa->spa_name]      = max(this->delta) ;
    /* A zero delta is fed to stddev() as 1 to avoid degenerate zeros. */
    @rqsdlat[interval, this->spa->spa_name]       = stddev(this->delta == 0 ?
                                                1 : this->delta) ;
    @rqslowct[interval, this->spa->spa_name]      = sum(this->delta >= SLOW ? 1 : 0) ;
    /* 
     * Maintain a histogram of run queue latency distribution 
     * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s, >=1s.
     */
    @rqlatlt1ms[interval, this->spa->spa_name]    = 
        sum(this->delta < 1000 ? 1 : 0) ;
    @rqlat1to10ms[interval, this->spa->spa_name]  = 
        sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
    @rqlat10to100ms[interval, this->spa->spa_name]= 
        sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
    @rqlat100msto1s[interval, this->spa->spa_name]= 
        sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
    /*
     * Use >= here so a delta of exactly 1s is not lost between the
     * 100ms-1s bucket (which excludes 1s) and this one.
     */
    @rqlatgt1s[interval, this->spa->spa_name]     = 
        sum(this->delta >= 1000000 ? 1 : 0) ;
    /* 
     * The zio_t structures are going to be recycled over and over and
     * as such, we need to make sure that we avoid chances of bogus data
     * because we are checking entry points into the clauses for a !0
     * value.
     */
    rqueue[args[0], args[1]] = 0 ;
}

/* IO completed */
::vdev_queue_io_done:entry
/totqueue[args[0]] == args[0]->io_queued_timestamp/ {
    /*
     * This is where we measure amount of time all the way from IO entering the
     * wait queue through all the stages to completion.
     * The predicate only admits zios whose stored timestamp still matches
     * io_queued_timestamp, guarding against recycled zio_t structures.
     * Delta is in microseconds (ns / 1000).
     */
    this->spa           = args[0]->io_spa ;
    this->delta         = (timestamp - totqueue[args[0]]) / 1000 ;
    @totqct[interval, this->spa->spa_name]      = count() ;
    @totqavlat[interval, this->spa->spa_name]   = avg(this->delta) ;
    @totqminlat[interval, this->spa->spa_name]  = min(this->delta) ;
    @totqmaxlat[interval, this->spa->spa_name]  = max(this->delta) ;
    /* A zero delta is fed to stddev() as 1 to avoid degenerate zeros. */
    @totqsdlat[interval, this->spa->spa_name]   = stddev(this->delta == 0 ?
                                                1 : this->delta) ;
    totqueue[args[0]]   = 0 ;
}

/*
 * Every 5 seconds: print one CSV row per (interval, pool) — aggregation
 * order here must match the header printed in BEGIN — then clear all
 * aggregations and reset the 1-second interval key.
 */
tick-5sec {
    /*
     * If the counts are not equal between these histograms, we may have
     * interrupted the script at the very moment an IO was transitioning
     * and it did not get accounted for correctly. In general we expect
     * that these counts are all equal.
     */
    printa("%ld,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
            @rqavlat, @rqminlat, @rqmaxlat, @rqsdlat, @rqlatlt1ms, @rqlat1to10ms,
            @rqlat10to100ms, @rqlat100msto1s, @rqlatgt1s,
            @rqslowct, @rqaddct, @rqrmct, @rqctav, @rqctmin, @rqctmax, @rqctsd,
            @wqavlat, @wqminlat, @wqmaxlat, @wqsdlat, @wqlatlt1ms, @wqlat1to10ms,
            @wqlat10to100ms, @wqlat100msto1s, @wqlatgt1s, @wqslowct, @wqaddct,
            @wqrmct, @wqctav, @wqctmin, @wqctmax, @wqctsd,
            @totqavlat, @totqminlat, @totqmaxlat, @totqsdlat, @totqct,
            @syncrdct, @syncwrct, @asyncrdct, @asyncwrct, @scrubct
            ) ;
    /* trunc() with no count clears each aggregation for the next window. */
    trunc(@rqavlat) ; trunc(@rqminlat) ; trunc(@rqmaxlat) ; trunc(@rqsdlat) ;
    trunc(@rqlatlt1ms) ; trunc(@rqlat1to10ms) ; trunc(@rqlat10to100ms) ;
    trunc(@rqlat100msto1s) ; trunc(@rqlatgt1s) ;
    trunc(@rqslowct) ; trunc(@rqaddct) ; trunc(@rqrmct) ;
    trunc(@rqctav) ; trunc(@rqctmin) ; trunc(@rqctmax) ; trunc(@rqctsd) ;
    trunc(@wqavlat) ; trunc(@wqminlat) ; trunc(@wqmaxlat) ; trunc(@wqsdlat) ;
    trunc(@wqlatlt1ms) ; trunc(@wqlat1to10ms) ; trunc(@wqlat10to100ms) ;
    trunc(@wqlat100msto1s) ; trunc(@wqlatgt1s) ;
    trunc(@wqslowct) ; trunc(@wqaddct) ; trunc(@wqrmct) ;
    trunc(@wqctav) ; trunc(@wqctmin) ; trunc(@wqctmax) ; trunc(@wqctsd) ;
    trunc(@totqavlat) ; trunc(@totqminlat) ; trunc(@totqmaxlat) ; 
    trunc(@totqsdlat) ; trunc(@totqct) ;
    trunc(@syncrdct); trunc(@syncwrct); trunc(@asyncrdct); 
    trunc(@asyncwrct); trunc(@scrubct) ;
    /* Re-truncate walltimestamp to seconds for the next interval key. */
    interval = walltimestamp - (walltimestamp%1000000000);
}
dtrace -qn '
    /*
     * This script is intended for observing IOs as they transition from
     * Wait Queue to Run Queue and to completion. Latency is measured as
     * the IOs enter and then exit the Wait Queue, and then as they leave
     * the Wait Queue and enter the Run Queue.
     * We collect this information by pool, so we can see differences in
     * terms of time spent in these two queues.
     * Also collected are counts of IOs by priority classification. See the
     * zio_priority enum below, as well as zio_priority.h for more details.
     * Ideally we want to see as little time as possible spent in these queues,
     * with dispersion between lowest and highest times remaining tight and
     * fairly consistent without many offending outliers.
     *
     * aggregates prefixed with `tot` are tracking each ZIO from time it gets into
     * the queue to the time it is processed and is removed from queue.
     * We can think of `tot*` aggregates as a proxy for latency that we would
     * normally measure at the application level, roughly.
     *
     * This is a simpler variant: per-pool totals only, keyed by pool name
     * (no per-second interval key and no latency histograms), printed as a
     * CSV row every 5 seconds.
     */
    int wqueue[zio_t *]; /* zio_t pointer passed through the queuing stages */
    int rqueue[zio_t *]; /* same as above used for run queue */
    int totqueue[zio_t *];

    inline const int SLOW = 10000 ; /* latency in us; >=10ms considered slow */

    /* see usr/src/uts/common/fs/zfs/sys/zio_priority.h for details */
    typedef enum zio_priority {
        ZIO_PRIORITY_SYNC_READ,
        ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
        ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
        ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
        ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
        ZIO_PRIORITY_NUM_QUEUEABLE,

        ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */
    } zio_priority_t;

    /* CSV header; column order must match the printa() in tick-5sec. */
    BEGIN {
        printf("pool,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
            "rqav", "rqmin", "rqmax", "rqsd", "rqslowct", "rqct", "wqav", "wqmin",
            "wqmax", "wqsd", "wqslowct", "wqct", "totqav", "totqmin", "totqmax", "totqsd",
            "totqct", "syncrdct", "syncwrct", "asyncrdct", "asyncwrct", "scrubct"
        );
    }

    /* (z)IO is entering Wait Queue */
    ::vdev_queue_io_add:entry {
        /* Unlike the sibling script, this stores the probe timestamp. */
        wqueue[args[1]] = timestamp ;
    }

    /* 
     * Count number of IOs by priority classification. In other words
     * we bucket each IO by ZIO priority as defined in the zio_priority enum.
     * This is in effect a histogram with 5 buckets, one each for:
     * sync-read, sync-write, async-read, async-write, and scrub.
     */
    ::vdev_queue_io_add:entry {
        this->spa       = args[1]->io_spa ;
        this->pri       = args[1]->io_priority ;
        @syncrdct[this->spa->spa_name] =
            sum(this->pri == ZIO_PRIORITY_SYNC_READ ? 1 : 0) ;
        @syncwrct[this->spa->spa_name] =
            sum(this->pri == ZIO_PRIORITY_SYNC_WRITE ? 1 : 0) ;
        @asyncrdct[this->spa->spa_name] =
            sum(this->pri == ZIO_PRIORITY_ASYNC_READ ? 1 : 0) ;
        @asyncwrct[this->spa->spa_name] =
            sum(this->pri == ZIO_PRIORITY_ASYNC_WRITE ? 1 : 0) ;
        @scrubct[this->spa->spa_name] =
            sum(this->pri == ZIO_PRIORITY_SCRUB ? 1 : 0) ;
    }

    /* (z)IO is leaving Wait Queue on its way to Run Queue */
    ::vdev_queue_io_remove:entry /wqueue[args[1]]/ {
        /* Time spent on the wait queue, converted from ns to us. */
        this->delta     = (timestamp - wqueue[args[1]]) / 1000 ;
        this->spa       = args[1]->io_spa ;
        @wqavlat[this->spa->spa_name]       = avg(this->delta) ;
        @wqminlat[this->spa->spa_name]      = min(this->delta) ;
        @wqmaxlat[this->spa->spa_name]      = max(this->delta) ;
        @wqsdlat[this->spa->spa_name]       = stddev(this->delta) ;
        @wqct[this->spa->spa_name]          = count() ;
        /* If spending too long in the queue, identify as slow IO. */
        @wqslowct[this->spa->spa_name]      = sum(this->delta >= SLOW ? 1 : 0) ;
        wqueue[args[1]] = 0 ;
    }

    /* (z)IO is entering Run Queue */
    ::vdev_queue_pending_add:entry {
        rqueue[args[1]]  = timestamp ; /* time IO entered Run Queue */
    }

    /* IO enters the queue */
    ::vdev_queue_io:entry {
        /* 
         * This should capture the total time we spend waiting and processing
         * for each zio.
         */
        totqueue[args[0]]    = timestamp ;
    }

    /* (z)IO is leaving Run Queue, which implies IO completed */
    ::vdev_queue_pending_remove:entry /rqueue[args[1]]/ {
        /* Time spent on the run queue, converted from ns to us. */
        this->delta     = (timestamp - rqueue[args[1]]) / 1000 ;
        this->spa       = args[1]->io_spa ;
        @rqavlat[this->spa->spa_name]       = avg(this->delta) ;
        @rqminlat[this->spa->spa_name]      = min(this->delta) ;
        @rqmaxlat[this->spa->spa_name]      = max(this->delta) ;
        @rqsdlat[this->spa->spa_name]       = stddev(this->delta) ;
        @rqct[this->spa->spa_name]          = count() ;
        @rqslowct[this->spa->spa_name]      = sum(this->delta >= SLOW ? 1 : 0) ;
        /* 
         * The zio_t structures are going to be recycled over and over and
         * as such, we need to make sure that we avoid chances of bogus data
         * because we are checking entry points into the clauses for a !0
         * value.
         */
        rqueue[args[1]] = 0 ;
    }
    
    /* IO completed */
    ::vdev_queue_io_done:entry /totqueue[args[0]] > 0/ {
        /*
         * This is where we measure amount of time all the way from IO entering the
         * wait queue through all the stages to completion.
         */
        this->spa           = args[0]->io_spa ;
        this->delta         = (timestamp - totqueue[args[0]]) / 1000 ;
        @totqct[this->spa->spa_name]      = count() ;
        @totqavlat[this->spa->spa_name]   = avg(this->delta) ;
        @totqminlat[this->spa->spa_name]  = min(this->delta) ;
        @totqmaxlat[this->spa->spa_name]  = max(this->delta) ;
        @totqsdlat[this->spa->spa_name]   = stddev(this->delta) ;
        totqueue[args[0]]   = 0 ;
    }

    /* Print one CSV row per pool every 5s, then clear all aggregations. */
    tick-5sec {
        /*
         * If the counts are not equal between these histograms, we may have
         * interrupted the script at the very moment an IO was transitioning
         * and it did not get accounted for correctly. In general we expect
         * that these counts are all equal.
         */
        printa("%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
                @rqavlat, @rqminlat, @rqmaxlat, @rqsdlat, @rqslowct, @rqct,
                @wqavlat, @wqminlat, @wqmaxlat, @wqsdlat, @wqslowct, @wqct,
                @totqavlat, @totqminlat, @totqmaxlat, @totqsdlat, @totqct,
                @syncrdct, @syncwrct, @asyncrdct, @asyncwrct, @scrubct
                ) ;
        trunc(@rqavlat) ; trunc(@rqminlat) ; trunc(@rqmaxlat) ; trunc(@rqsdlat) ;
        trunc(@rqslowct) ; trunc(@rqct) ;
        trunc(@wqavlat) ; trunc(@wqminlat) ; trunc(@wqmaxlat) ; trunc(@wqsdlat) ;
        trunc(@wqslowct) ; trunc(@wqct) ;
        trunc(@totqavlat) ; trunc(@totqminlat) ; trunc(@totqmaxlat) ; 
        trunc(@totqsdlat) ; trunc(@totqct) ;
        trunc(@syncrdct); trunc(@syncwrct); trunc(@asyncrdct); 
        trunc(@asyncwrct); trunc(@scrubct) ;
    }'
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=16M
/*
 * This script is intended for observing IOs as they transition from
 * Wait Queue to Run Queue and to completion. Latency is measured as
 * the IOs enter and then exit the Wait Queue, and then as they leave
 * the Wait Queue and enter the Run Queue.
 * We collect this information by pool, so we can see differences in
 * terms of time spent in these two queues.
 * Also collected are counts of IOs by priority classification. See the
 * zio_priority enum below, as well zio_priority.h for more details.
 * Ideally we want to see as little as possible time spent in these queues.
 * And dispersion between lowest and highest times remaining tight and fairly 
 * consistent without many offending outliers.
 *
 * aggregates prefixed with `tot` are tracking each ZIO from time it gets into
 * the queue to the time it is processed and is removed from queue.
 * We can think of `tot*` aggregates as a proxy for latency that we would
 * normally measure at the application level, roughly.
 */
/*
 * Global associative arrays keyed by (vdev queue, zio); values hold
 * timestamps (ns) used to compute time-in-queue deltas.
 */
long wqueue[vdev_queue_t *, zio_t *];   /* measure wait queue time */
long rqueue[vdev_queue_t *, zio_t *];   /* measure run queue time */
long totqueue[zio_t *];                 /* measure combined queue time */

inline const int SLOW = 10000 ; /* latency in us; >=10ms considered slow */

/* see usr/src/uts/common/fs/zfs/sys/zio_priority.h for details */
/*
 * Local mirror of the kernel zio_priority enum; ordering must match the
 * kernel header since the values index spa_queue_stats[] below.
 */
typedef enum zio_priority {
    ZIO_PRIORITY_SYNC_READ,
    ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
    ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
    ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
    ZIO_PRIORITY_SCRUB,		    /* asynchronous scrub/resilver reads */
    ZIO_PRIORITY_NUM_QUEUEABLE,

    ZIO_PRIORITY_NOW		    /* non-queued i/os (e.g. free) */
} zio_priority_t;

/*
 * Emit the CSV header once; column order must stay in lock-step with the
 * printa() format in the tick-5sec clause below. This variant keys rows
 * by pool name only (no timestamp column).
 */
BEGIN {
    printf("pool,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
        "rqav", "rqmin", "rqmax", "rqsd", "rqlt1ms", "rq1to10ms", "rq10to100ms",
        "rq100msto1s", "rqgt1s", "rqslowct", "rqaddct", "rqrmct", "rqctav",
        "rqctmin", "rqctmax", "rqctsd",
        "wqav", "wqmin", "wqmax", "wqsd", "wqlt1ms", "wq1to10ms", "wq10to100ms",
        "wq100msto1s", "wqgt1s", "wqslowct", "wqaddct", "wqrmct", "wqctav",
        "wqctmin", "wqctmax", "wqctsd",
        "totqav", "totqmin", "totqmax", "totqsd", "totqct", "syncrdct", "syncwrct", "asyncrdct", "asyncwrct", "scrubct"
    );
}

/* (z)IO is entering Wait Queue */
::vdev_queue_io_add:entry {
    /*
     * Remember io_queued_timestamp keyed by (vdev queue, zio); the
     * io_remove clause only fires while the stored value still equals
     * io_queued_timestamp, guarding against recycled zio_t structures.
     */
    wqueue[args[0], args[1]]  = args[1]->io_queued_timestamp ;
    this->spa        = args[1]->io_spa ;
    this->allqueued  = 0 ; /* reset to avoid bogus data */
    /* Sum queued-IO counts across all five queueable priority classes. */
    this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_queued ;
    this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_queued ;
    this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_queued ;
    this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_queued ;
    this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_queued ;
    /* Descriptive statistics of wait-queue depth per pool. */
    @wqctav[this->spa->spa_name]    = avg(this->allqueued) ;
    @wqctmin[this->spa->spa_name]   = min(this->allqueued) ;
    @wqctmax[this->spa->spa_name]   = max(this->allqueued) ;
    @wqctsd[this->spa->spa_name]    = stddev(this->allqueued) ;
    @wqaddct[this->spa->spa_name]   = count() ; /* Count ++ to Wait Queue */
}

/* 
 * Count number of IOs by priority classification. In other words
 * we bucket each IO by ZIO priority as defined in the zio_priority enum.
 * This is in effect a histogram with 5 buckets, one each for:
 * sync-read, sync-write, async-read, async-write, and scrub.
 * Buckets are keyed by pool name only in this variant.
 */
::vdev_queue_io_add:entry {
    this->spa       = args[1]->io_spa ;
    this->pri       = args[1]->io_priority ;
    @syncrdct[this->spa->spa_name] =
        sum(this->pri == ZIO_PRIORITY_SYNC_READ ? 1 : 0) ;
    @syncwrct[this->spa->spa_name] =
        sum(this->pri == ZIO_PRIORITY_SYNC_WRITE ? 1 : 0) ;
    @asyncrdct[this->spa->spa_name] =
        sum(this->pri == ZIO_PRIORITY_ASYNC_READ ? 1 : 0) ;
    @asyncwrct[this->spa->spa_name] =
        sum(this->pri == ZIO_PRIORITY_ASYNC_WRITE ? 1 : 0) ;
    @scrubct[this->spa->spa_name] =
        sum(this->pri == ZIO_PRIORITY_SCRUB ? 1 : 0) ;
}

::vdev_queue_io_remove:entry {
    /* Count -- from Wait Queue, keyed by pool name. */
    @wqrmct[args[1]->io_spa->spa_name] = count() ;
}

/* (z)IO is leaving Wait Queue on its way to Run Queue */
::vdev_queue_io_remove:entry
/wqueue[args[0], args[1]] == args[1]->io_queued_timestamp/ {
    /* Time spent on the wait queue, converted from ns to us. */
    this->delta     = (timestamp - wqueue[args[0], args[1]]) / 1000 ;
    this->spa       = args[1]->io_spa ;
    @wqavlat[this->spa->spa_name]       = avg(this->delta) ;
    @wqminlat[this->spa->spa_name]      = min(this->delta) ;
    @wqmaxlat[this->spa->spa_name]      = max(this->delta) ;
    @wqsdlat[this->spa->spa_name]       = stddev(this->delta) ;
    /* If spending too long in the queue, identify as slow IO. */
    @wqslowct[this->spa->spa_name]      = sum(this->delta >= SLOW ? 1 : 0) ;
    /* 
     * Maintain a histogram of wait queue latency distribution 
     * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s, >=1s.
     */
    @wqlatlt1ms[this->spa->spa_name]    = 
        sum(this->delta < 1000 ? 1 : 0) ;
    @wqlat1to10ms[this->spa->spa_name]  = 
        sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
    @wqlat10to100ms[this->spa->spa_name]= 
        sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
    @wqlat100msto1s[this->spa->spa_name]= 
        sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
    /*
     * Use >= here so a delta of exactly 1s is not lost between the
     * 100ms-1s bucket (which excludes 1s) and this one.
     */
    @wqlatgt1s[this->spa->spa_name]     = sum(this->delta >= 1000000 ? 1 : 0) ;
    /* 
     * The zio_t structures are going to be recycled over and over and
     * as such, we need to make sure that we avoid chances of bogus data
     * because we are checking entry points into the clauses for a !0
     * value.
     */
    wqueue[args[0], args[1]] = 0 ;
}

/* (z)IO is entering Run Queue */
::vdev_queue_pending_add:entry {
    rqueue[args[0], args[1]]  = timestamp ; /* time IO entered Run Queue */
    this->spa        = args[1]->io_spa ;
    this->allactive  = 0 ; /* reset to avoid bogus data */
    /* Sum in-flight (active) IO counts across the five queueable classes. */
    this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_active ;
    this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_active ;
    this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_active ;
    this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_active ;
    this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_active ;
    /* Descriptive statistics of run-queue depth per pool. */
    @rqctav[this->spa->spa_name]    = avg(this->allactive) ;
    @rqctmin[this->spa->spa_name]   = min(this->allactive) ;
    @rqctmax[this->spa->spa_name]   = max(this->allactive) ;
    @rqctsd[this->spa->spa_name]    = stddev(this->allactive) ;
    @rqaddct[this->spa->spa_name]   = count() ; /* Count ++ to Run Queue */
}

/* IO enters the queue */
::vdev_queue_io:entry {
    /* 
     * This should capture the total time we spend waiting and processing
     * for each zio.
     * We store io_queued_timestamp (not the probe firing time); the
     * vdev_queue_io_done clause compares against the same field to
     * confirm it is seeing the same (not recycled) zio.
     */
    totqueue[args[0]]    = args[0]->io_queued_timestamp ;
}

::vdev_queue_pending_remove:entry {
    /* Count -- from Run Queue, keyed by pool name. */
    @rqrmct[args[1]->io_spa->spa_name] = count() ;
}

/* (z)IO is leaving Run Queue, which implies IO completed */
::vdev_queue_pending_remove:entry /rqueue[args[0], args[1]]/ {
    /* Time spent on the run queue, converted from ns to us. */
    this->delta     = (timestamp - rqueue[args[0], args[1]]) / 1000 ;
    this->spa       = args[1]->io_spa ;
    @rqavlat[this->spa->spa_name]       = avg(this->delta) ;
    @rqminlat[this->spa->spa_name]      = min(this->delta) ;
    @rqmaxlat[this->spa->spa_name]      = max(this->delta) ;
    /* A zero delta is fed to stddev() as 1 to avoid degenerate zeros. */
    @rqsdlat[this->spa->spa_name]       = stddev(this->delta == 0 ?
                                                1 : this->delta) ;
    @rqslowct[this->spa->spa_name]      = sum(this->delta >= SLOW ? 1 : 0) ;
    /* 
     * Maintain a histogram of run queue latency distribution 
     * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s, >=1s.
     */
    @rqlatlt1ms[this->spa->spa_name]    = 
        sum(this->delta < 1000 ? 1 : 0) ;
    @rqlat1to10ms[this->spa->spa_name]  = 
        sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
    @rqlat10to100ms[this->spa->spa_name]= 
        sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
    @rqlat100msto1s[this->spa->spa_name]= 
        sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
    /*
     * Use >= here so a delta of exactly 1s is not lost between the
     * 100ms-1s bucket (which excludes 1s) and this one.
     */
    @rqlatgt1s[this->spa->spa_name]     = 
        sum(this->delta >= 1000000 ? 1 : 0) ;
    /* 
     * The zio_t structures are going to be recycled over and over and
     * as such, we need to make sure that we avoid chances of bogus data
     * because we are checking entry points into the clauses for a !0
     * value.
     */
    rqueue[args[0], args[1]] = 0 ;
}

/* IO completed */
::vdev_queue_io_done:entry
/totqueue[args[0]] == args[0]->io_queued_timestamp/ {
    /*
     * This is where we measure amount of time all the way from IO entering the
     * wait queue through all the stages to completion.
     * The predicate only admits zios whose stored timestamp still matches
     * io_queued_timestamp, guarding against recycled zio_t structures.
     * Delta is in microseconds (ns / 1000).
     */
    this->spa           = args[0]->io_spa ;
    this->delta         = (timestamp - totqueue[args[0]]) / 1000 ;
    @totqct[this->spa->spa_name]      = count() ;
    @totqavlat[this->spa->spa_name]   = avg(this->delta) ;
    @totqminlat[this->spa->spa_name]  = min(this->delta) ;
    @totqmaxlat[this->spa->spa_name]  = max(this->delta) ;
    /* A zero delta is fed to stddev() as 1 to avoid degenerate zeros. */
    @totqsdlat[this->spa->spa_name]   = stddev(this->delta == 0 ?
                                                1 : this->delta) ;
    totqueue[args[0]]   = 0 ;
}

/*
 * Every 5 seconds: print one CSV row per pool — aggregation order here
 * must match the header printed in BEGIN — then clear all aggregations.
 */
tick-5sec {
    /*
     * If the counts are not equal between these histograms, we may have
     * interrupted the script at the very moment an IO was transitioning
     * and it did not get accounted for correctly. In general we expect
     * that these counts are all equal.
     */
    printa("%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
            @rqavlat, @rqminlat, @rqmaxlat, @rqsdlat, @rqlatlt1ms, @rqlat1to10ms,
            @rqlat10to100ms, @rqlat100msto1s, @rqlatgt1s,
            @rqslowct, @rqaddct, @rqrmct, @rqctav, @rqctmin, @rqctmax, @rqctsd,
            @wqavlat, @wqminlat, @wqmaxlat, @wqsdlat, @wqlatlt1ms, @wqlat1to10ms,
            @wqlat10to100ms, @wqlat100msto1s, @wqlatgt1s, @wqslowct, @wqaddct,
            @wqrmct, @wqctav, @wqctmin, @wqctmax, @wqctsd,
            @totqavlat, @totqminlat, @totqmaxlat, @totqsdlat, @totqct,
            @syncrdct, @syncwrct, @asyncrdct, @asyncwrct, @scrubct
            ) ;
    /* trunc() with no count clears each aggregation for the next window. */
    trunc(@rqavlat) ; trunc(@rqminlat) ; trunc(@rqmaxlat) ; trunc(@rqsdlat) ;
    trunc(@rqlatlt1ms) ; trunc(@rqlat1to10ms) ; trunc(@rqlat10to100ms) ;
    trunc(@rqlat100msto1s) ; trunc(@rqlatgt1s) ;
    trunc(@rqslowct) ; trunc(@rqaddct) ; trunc(@rqrmct) ;
    trunc(@rqctav) ; trunc(@rqctmin) ; trunc(@rqctmax) ; trunc(@rqctsd) ;
    trunc(@wqavlat) ; trunc(@wqminlat) ; trunc(@wqmaxlat) ; trunc(@wqsdlat) ;
    trunc(@wqlatlt1ms) ; trunc(@wqlat1to10ms) ; trunc(@wqlat10to100ms) ;
    trunc(@wqlat100msto1s) ; trunc(@wqlatgt1s) ;
    trunc(@wqslowct) ; trunc(@wqaddct) ; trunc(@wqrmct) ;
    trunc(@wqctav) ; trunc(@wqctmin) ; trunc(@wqctmax) ; trunc(@wqctsd) ;
    trunc(@totqavlat) ; trunc(@totqminlat) ; trunc(@totqmaxlat) ; 
    trunc(@totqsdlat) ; trunc(@totqct) ;
    trunc(@syncrdct); trunc(@syncwrct); trunc(@asyncrdct); 
    trunc(@asyncwrct); trunc(@scrubct) ;
}
dtrace -qn '
  /*
   * Per-priority zio_read/zio_write summary: operations per second,
   * average time spent inside the function (ns), and an io_size
   * distribution, printed once when the script is interrupted.
   */
  enum zio_priority {
	ZIO_PRIORITY_SYNC_READ,
	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
	ZIO_PRIORITY_NUM_QUEUEABLE,
	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */
  } ;

  BEGIN {
    start = timestamp ;
    /* Map priority enum values to human-readable labels. */
    pri[ZIO_PRIORITY_SYNC_READ] = "Sync-Read" ;
    pri[ZIO_PRIORITY_SYNC_WRITE] = "Sync-Write" ;
    pri[ZIO_PRIORITY_ASYNC_READ] = "Async-Read" ;
    pri[ZIO_PRIORITY_ASYNC_WRITE] = "Async-Write" ;
    pri[ZIO_PRIORITY_SCRUB] = "Scrub" ;
    pri[ZIO_PRIORITY_NUM_QUEUEABLE] = "Queueable" ;
    pri[ZIO_PRIORITY_NOW] = "Non-queued" ;
    }
  ::zio_read:entry, ::zio_write:entry {
    self->s = timestamp ;
  }
  ::zio_read:return, ::zio_write:return {
    @ops[pri[args[1]->io_priority]] = count() ;
    @hist[pri[args[1]->io_priority]] = quantize(args[1]->io_size) ;
  }
  /* Capture amount of time spent in this function. */
  ::zio_read:return, ::zio_write:return /self->s/ {
    @avlat[pri[args[1]->io_priority]] = avg(timestamp - self->s) ;
    self->s = 0 ;
  }
  END {
    /*
     * Normalize to operations per second. Guard against a sub-second
     * runtime: the truncated elapsed-seconds value would be zero and
     * normalize() would divide by zero.
     */
    this->secs = (timestamp - start) / 1000000000 ;
    normalize(@ops, this->secs > 0 ? this->secs : 1) ;
    printa("\tPRI: %s Ops/s: %@d AvLat(ns): %@d %@d\n", 
            @ops, @avlat, @hist) ; 
  }'
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2017 Sam Zaydel / RackTop Systems.
 *
 * Script is intended to provide details about queueing at the vdev layer
 * aggregating some information as well as reporting how many times we
 * did not issue an IO, because queue was empty as opposed to how many times
 * IOs were issued, because queue was not empty. Details are given through
 * histograms describing size of queues and amount of time IO spends in wait
 * queue before being processed.
 * We still capture descriptive statistics here, like the avg(s), min(s), max(s),
 * etc.
 * Our goal is to capture time spent information along with queue size details
 * because there is an expected relationship between size of queue and time
 * IOs spend in the queue(s).
 */

/*
 * Global associative arrays keyed by the zio_t pointer:
 *   ts[zio]       - hrtime when the zio entered the vdev wait queue
 *                   (set in vdev_queue_io_add, consumed in io_remove)
 *   totqueue[zio] - snapshot of io_queued_timestamp used to match a
 *                   queueing episode with its completion in io_done
 */
long long ts[zio_t *], totqueue[zio_t *];

/* see usr/src/uts/common/fs/zfs/sys/zio_priority.h for details */
typedef enum zio_priority {
	ZIO_PRIORITY_SYNC_READ,
	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
	ZIO_PRIORITY_NUM_QUEUEABLE,

	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */
} zio_priority_t;

/*
 * Record whether vdev_queue_io_to_issue() produced an I/O to issue
 * (return value non-NULL -> 1) or found nothing to issue (NULL -> 0).
 * @qe quantizes the mix of the two outcomes.
 */
vdev_queue_io_to_issue:return
{
    @qe = quantize(arg1 == NULL ? 0 : 1) ;
}

/*
 * Sample the per-vdev active (run) queue depth every time the queue
 * logic looks for the next I/O to issue. avl_numnodes of vq_active_tree
 * is the number of I/Os currently outstanding against this vdev.
 */
vdev_queue_io_to_issue:entry {
    @vdavrq         = avg(args[0]->vq_active_tree.avl_numnodes);
    @vdminrq        = min(args[0]->vq_active_tree.avl_numnodes);
    @vdmaxrq        = max(args[0]->vq_active_tree.avl_numnodes);
    @vdsdrq         = stddev(args[0]->vq_active_tree.avl_numnodes);
    @vdrqhist       = quantize(args[0]->vq_active_tree.avl_numnodes);
}

/*
 * Count number of IOs by priority classification. In other words
 * we bucket each IO by ZIO priority as defined in the zio_priority enum.
 * This is in effect a histogram with 5 buckets, one each for:
 * sync-read (0), sync-write (1), async-read (2), async-write (3) and
 * scrub (4); the lquantize range 0..5 covers exactly the queueable
 * priorities.
 */
vdev_queue_io_add:entry {
    this->spa       = args[1]->io_spa ;
    this->pri       = args[1]->io_priority ;
    @prihist[this->spa->spa_name] = lquantize(this->pri, 0, 5) ;
}

vdev_queue_io_add:entry {
    /* Remember when this zio entered the wait queue (used in io_remove). */
    ts[args[1]] = timestamp ;
    /*
     * Sum of items with every possible ZIO priority where
     * this->qstats >= 0 && this->qstats < ZIO_PRIORITY_NUM_QUEUEABLE.
     * This yields the pool-wide count of I/Os currently waiting to be
     * issued, across all queueable priority classes.
     */
    this->all_pri = (
        args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_queued +
        args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_queued +
        args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_queued +
        args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_queued +
        args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_queued
        ) ;

    /* Wait-queue length statistics, keyed by pool name. */
    @avwq[args[1]->io_spa->spa_name]        = avg(this->all_pri) ;
    @sdwq[args[1]->io_spa->spa_name]        = stddev(this->all_pri) ;
    @minwq[args[1]->io_spa->spa_name]       = min(this->all_pri) ;
    @maxwq[args[1]->io_spa->spa_name]       = max(this->all_pri) ;
    @wqhist[args[1]->io_spa->spa_name]      = quantize(this->all_pri) ;
}

vdev_queue_pending_add:entry {
    /*
     * Sum of items with every possible ZIO priority where
     * this->qstats >= 0 && this->qstats < ZIO_PRIORITY_NUM_QUEUEABLE.
     * spa_active (as opposed to spa_queued above) gives the pool-wide
     * count of I/Os already issued, i.e. the run-queue length.
     */
    this->all_pri = (
        args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_active +
        args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_active +
        args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_active +
        args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_active +
        args[1]->io_spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_active
        ) ;

    /* Run-queue length statistics, keyed by pool name. */
    @avrq[args[1]->io_spa->spa_name]        = avg(this->all_pri) ;
    @sdrq[args[1]->io_spa->spa_name]        = stddev(this->all_pri) ;
    @minrq[args[1]->io_spa->spa_name]       = min(this->all_pri) ;
    @maxrq[args[1]->io_spa->spa_name]       = max(this->all_pri) ;
    @rqhist[args[1]->io_spa->spa_name]      = quantize(this->all_pri) ;
}

/*
 * Measure delta between the time given ZIO entered and exited wait queue.
 * The delta is in microseconds (timestamp is ns, divided by 1000).
 */
vdev_queue_io_remove:entry /ts[args[1]] > 0/ {
    this->delta = (timestamp - ts[args[1]]) / 1000 ;
    @avwqlat[args[1]->io_spa->spa_name]     = avg(this->delta) ;
    @sdwqlat[args[1]->io_spa->spa_name]     = stddev(this->delta) ;
    @minwqlat[args[1]->io_spa->spa_name]    = min(this->delta) ;
    @maxwqlat[args[1]->io_spa->spa_name]    = max(this->delta) ;
    @wqlathist[args[1]->io_spa->spa_name]   = quantize(this->delta) ;
    ts[args[1]] = 0 ; /* zio structures are recycled  */
}

/* IO enters the queue */
::vdev_queue_io:entry /totqueue[args[0]] == 0/ {
    /* 
     * This should capture the total time we spend waiting and processing
     * for each zio.
     * We store io_queued_timestamp (rather than the probe timestamp) so
     * that the completion clause can verify it is matching the same
     * queueing episode, since zio_t structures are recycled.
     */

    /* hrtime_t of entry for given IO into queue */
    totqueue[args[0]]    =  args[0]->io_queued_timestamp ;
}

/* IO completed */
::vdev_queue_io_done:entry
/* match hrtime_t saved in the totqueue[args[0]] to value in io_queued_timestamp */
/totqueue[args[0]] == args[0]->io_queued_timestamp/ {
    /*
     * This is where we measure amount of time all the way from IO entering the
     * wait queue through all the stages to completion.
     * The point when IO was queued is stored in io_queued_timestamp field,
     * and we delta between it and current time.
     */
    /* io_target_timestamp > 0 is treated as the IO having been delayed
     * (throttled) -- NOTE(review): confirm against vdev_queue.c. */
    this->delayed       = args[0]->io_target_timestamp > 0 ? 1 : 0;
    this->spa           = args[0]->io_spa ;
    this->delta         = (timestamp - totqueue[args[0]]) / 1000 ; /* uS (ns / 1000), matching the (uS) labels printed in END */
    @totqavlat[this->spa->spa_name]         = avg(this->delta) ;
    @totqminlat[this->spa->spa_name]        = min(this->delta) ;
    @totqmaxlat[this->spa->spa_name]        = max(this->delta) ;
    @totqsdlat[this->spa->spa_name]         = stddev(this->delta) ;
    @totqhist[this->spa->spa_name]          = quantize(this->delta) ;
    /* Each delay will result in @delayed ++ */
    @delayed[this->spa->spa_name]           = sum(this->delayed) ;
    totqueue[args[0]]    = 0 ;
}

/*
 * Final report, printed once on exit. All latency figures are in
 * microseconds (uS); queue lengths are I/O counts.
 */
END {
    printf("************************\n") ;
    printf("Empty v. Not Empty Queue\n") ;
    printf("************************\n\n") ;
    printa("IOs (Not Physical = 0 | Physical = 1) %@d\n", @qe) ;
    printf("\n");
    printf("************************\n") ;
    printf("*** Queueing Details ***\n") ;
    printf("************************\n\n") ;
    printa("\t   [%s] Wait Queue Time Avg/Min/Max/StdDev | %@d(uS)/%@d(uS)/%@d(uS)/%@d(uS)%@d\n",
        @avwqlat, @minwqlat, @maxwqlat, @sdwqlat, @wqlathist) ;
    printa("\t   [%s] Wait Queue Length Avg/Min/Max/StdDev | %@d/%@d/%@d/%@d%@d\n",
            @avwq, @minwq, @maxwq, @sdwq, @wqhist) ;
    printa("\t   [%s] Run Queue Length Avg/Min/Max/StdDev | %@d/%@d/%@d/%@d%@d\n",
            @avrq, @minrq, @maxrq, @sdrq, @rqhist) ;
    printa("\t   Per Device Queue Length Avg/Min/Max/StdDev | %@d/%@d/%@d/%@d%@d\n",
            @vdavrq, @vdminrq, @vdmaxrq, @vdsdrq, @vdrqhist) ;
    printa("\t   [%s] Wait+Run Queue Time Avg/Min/Max/StdDev | %@d(uS)/%@d(uS)/%@d(uS)/%@d(uS)\n\t   Delayed IOs: %@d%@d\n",
            @totqavlat, @totqminlat, @totqmaxlat, @totqsdlat, @delayed, @totqhist) ;
    printa("\t   [%s] IO Histogram by ZIO priority\n           SyncRD => 0 | SyncWR => 1 | AsyncRD => 2 | AsyncWR => 3 | Scrub => 4 %@d\n", @prihist) ;
}
dtrace -qn '
    /*
     * This script is intended for observing IOs as they transition from
     * Wait Queue to Run Queue and to completion. Latency is measured as
     * the IOs enter and then exit the Wait Queue, and then as they leave
     * the Wait Queue and enter the Run Queue.
     * We collect this information by pool, so we can see differences in
     * terms of time spent in these two queues. Ideally we want to see as
     * little as possible time spent in these queues, and the dispersion
     * between lowest and highest times remaining tight and fairly consistent
     * without offending outliers that likely will be seen as load increases.
     *
     * FIXES over the previous revision:
     *  - timestamps are stored as long long; the old int arrays truncated
     *    the 64-bit hrtime value and produced bogus deltas;
     *  - pool name in the io_remove clause is derived from args[1] there,
     *    because clause-local (this->) variables set in a different probe
     *    firing are not valid across probes.
     */
    long long wqueue[zio_t *];  /* hrtime of entry into Wait Queue */
    long long rqueue[zio_t *];  /* hrtime of entry into Run Queue */
    long long wrqueue[zio_t *]; /* hrtime of entry into queueing pipeline */

    /* (z)IO is entering Wait Queue */
    ::vdev_queue_io_add:entry {
        wqueue[args[1]] = timestamp ;
    }

    /* (z)IO is leaving Wait Queue on its way to Run Queue */
    ::vdev_queue_io_remove:entry /wqueue[args[1]] > 0/ {
        this->delta     = (timestamp - wqueue[args[1]]) / 1000 ; /* uS */
        this->spa       = args[1]->io_spa ;
        @waitqlat[this->spa->spa_name]      = quantize(this->delta) ;
        @waitqct[this->spa->spa_name]       = count() ;
        wqueue[args[1]] = 0 ; /* zio_t structures are recycled */
    }

    /* (z)IO is entering Run Queue */
    ::vdev_queue_pending_add:entry {
        rqueue[args[1]] = timestamp ; /* time IO entered Run Queue */
    }

    /* IO enters the queue */
    ::vdev_queue_io:entry {
        /* 
         * This should capture the total time we spend waiting and processing
         * for each zio.
         */
        wrqueue[args[0]]    = timestamp ;
    }

    /* IO completed */
    ::vdev_queue_io_done:entry /wrqueue[args[0]] > 0/ {
        /*
         * This is where we measure amount of time all the way from IO
         * entering the wait queue through all the stages to completion,
         * in microseconds.
         */
        this->spa           = args[0]->io_spa ;
        this->delta         = (timestamp - wrqueue[args[0]]) / 1000 ;
        @wqtorqlat[this->spa->spa_name]     = quantize(this->delta) ;
        @wqtorqct[this->spa->spa_name]      = count() ;
        wrqueue[args[0]]    = 0 ;
    }

    /* (z)IO is leaving Run Queue, which implies IO completed */
    ::vdev_queue_pending_remove:entry /rqueue[args[1]] > 0/ {
        this->delta     = (timestamp - rqueue[args[1]]) / 1000 ;
        this->spa       = args[1]->io_spa ;
        @runqlat[this->spa->spa_name] = quantize(this->delta) ;
        @runqct[this->spa->spa_name]  = count() ;
        /* 
         * The zio_t structures are going to be recycled over and over and
         * as such, we need to make sure that we avoid chances of bogus data
         * because we are checking entry points into the clauses for a !0
         * value.
         */
        rqueue[args[1]] = 0 ;
    }
    
    END {
        /*
         * If the counts are not equal between these histograms, we may have
         * interrupted the script at the very moment an IO was transitioning
         * and it did not get accounted for correctly. In general we expect
         * that these counts are all equal.
         */
        printa("           Wait Queue [%s](uS)\t\tCount: %@d%@d\n", @waitqct, @waitqlat) ;
        printa("           Run Queue  [%s](uS)\t\tCount: %@d%@d\n", @runqct, @runqlat) ;
        printa("           W+R Queue  [%s](uS)\t\tCount: %@d%@d\n", @wqtorqct, @wqtorqlat) ;
    }'
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2017 Sam Zaydel / RackTop Systems.
 *
 * Script will report queue depth for what we can think of as
 * run queue and wait queue in ZFS. The data is drawn from kstat
 * each time we add an entry into the run queue, which happens
 * through a call to `kstat_runq_enter`, or into the wait queue,
 * which happens through a call to `kstat_waitq_enter`.
 *
 * What we get from this is an output that looks something like this:
 * pool,rq.ct,rq.av,rq.max,rq.e,rq.grew,wq.ct,wq.av,wq.max,wq.e,wq.grew
 * p01,19613,30,190,105,8136,73053,52,513,40468,26471
 * p01,24770,50,358,149,11269,84962,27,247,51705,27830
 * p01,17107,15,58,119,6719,64745,46,460,35134,24129
 *
 * Column Definitions:
 * rq.ct/wq.ct      => Count of enter(s) into run/wait queues
 * rq.av/wq.av      => Average number of IOs in run/wait queues
 * rq.max/wq.max    => Maximum number of IOs in run/wait queues
 * rq.e/wq.e        => Count of enter(s) into empty run/wait queues
 * rq.grew/wq.grew  => Count of times run/wait queue grew from last entry
 *
 * This data captures count of times we entered queue, number of items in
 * queue at the time we entered, maximum number of items during sample
 * period, count of times queue had 0 items, and whether queue grew since
 * last time we counted number of items. We do this for both Run and Wait
 * queues. Results are printed at regular intervals.
 */
#include <kstat.h>
/*
 * Extract the wait/run queue counts from a kstat_io_t pointer.
 * Note that "->" binds tighter than the cast, so these expand to
 * (uint_t)((kptr)->wcnt) -- the cast applies to the member value,
 * which is the intended behavior here.
 */
#define	KSTAT_IO_WCNT(kptr)((uint_t)(kptr)->wcnt)
#define	KSTAT_IO_RCNT(kptr)((uint_t)(kptr)->rcnt)


int lastcnt[int] ; /* track count of IOs in queue for delta calculation */

/* Keys into lastcnt[]: previous run-queue and wait-queue counts. */
inline int PREV_RCNT = 0 ;
inline int PREV_WCNT = 1 ;

/* CSV header; one matching data row per pool is emitted every 5 seconds. */
BEGIN {
    printf("pool,rq.ct,rq.av,rq.max,rq.e,rq.grew,wq.ct,wq.av,wq.max,wq.e,wq.grew\n");
}

/*
 * Tag the zio as it is inserted into the vdev wait queue
 * (vdev_queue_io_add) or the run queue (vdev_queue_pending_add), so
 * the kstat_*q_enter probes below can attribute the kstat update to
 * this zio and its pool.
 */
::vdev_queue_io_add:entry,
::vdev_queue_pending_add:entry {
    self->pzio = args[1] ;
}

/* Wait Queue */
::kstat_waitq_enter:entry /self->pzio/ {
    /* Number of I/Os already sitting in the wait queue for this kstat. */
    this->cnt   = KSTAT_IO_WCNT(args[0]) ;
    /* Wait Queue Count */
    @wqc[self->pzio->io_spa->spa_name]      = count() ;
    @wqav[self->pzio->io_spa->spa_name]     = avg(this->cnt) ;
    @wqmax[self->pzio->io_spa->spa_name]    = max(this->cnt) ;
    /* Count of enters that found the queue empty. */
    @wqempty[self->pzio->io_spa->spa_name]  = sum(this->cnt > 0 ? 0 : 1) ;
    /*
     * We want to determine here if last time we entered wait queue
     * we had fewer or more items than we do now, i.e. queue grew?
     */
    @wqgrew[self->pzio->io_spa->spa_name]   = 
        sum(this->cnt > lastcnt[PREV_WCNT] ? 1 : 0) ;
    lastcnt[PREV_WCNT] = this->cnt ;
    self->pzio = 0 ; /* clear the tag; zio_t structures are recycled */
}

/* Run Queue */
::kstat_runq_enter:entry /self->pzio/ {
    /* Number of I/Os already in the run queue for this kstat. */
    this->cnt   = KSTAT_IO_RCNT(args[0]) ;
    /* Run Queue Count */
    @rqc[self->pzio->io_spa->spa_name]      = count() ;
    @rqav[self->pzio->io_spa->spa_name]     = avg(this->cnt) ;
    @rqmax[self->pzio->io_spa->spa_name]    = max(this->cnt) ;
    /* Count of enters that found the queue empty. */
    @rqempty[self->pzio->io_spa->spa_name]  = sum(this->cnt > 0 ? 0 : 1) ;
    /*
     * We want to determine here if last time we entered run queue
     * we had fewer or more items than we do now, i.e. queue grew?
     */
    @rqgrew[self->pzio->io_spa->spa_name]   = 
        sum(this->cnt > lastcnt[PREV_RCNT] ? 1 : 0) ;
    lastcnt[PREV_RCNT] = this->cnt ;
    self->pzio = 0 ; /* clear the tag; zio_t structures are recycled */
}

/*
 * Emit one CSV row per pool every 5 seconds, then reset all
 * aggregations so each interval stands on its own.
 */
tick-5sec {
    printa("%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
            @rqc, @rqav, @rqmax, @rqempty, @rqgrew, @wqc, @wqav, @wqmax, @wqempty, @wqgrew) ;
    trunc(@rqc); trunc(@rqav); trunc(@rqmax); trunc(@rqempty); trunc(@rqgrew);
    trunc(@wqc); trunc(@wqav); trunc(@wqmax); trunc(@wqempty); trunc(@wqgrew);
}
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2017 Sam Zaydel / RackTop Systems.
 *
 * Script will report queue depth for what we can think of as
 * run queue and wait queue in ZFS. The data is drawn from kstat
 * each time we add an entry into the run queue, which happens
 * through a call to `kstat_runq_enter`, or into the wait queue,
 * which happens through a call to `kstat_waitq_enter`.
 *
 * We take a count of number of elements in the queue and quantize
 * results creating a distribution with peak being the mode of this
 * distribution. In general resulting distributions are unimodal, though
 * it is possible to see a bimodal distribution. If we see a bimodal
 * distribution, we are looking at the grouped histogram where we
 * do not distinguish between different ZIO Priority queues. There are
 * MIN and MAX tunables in ZIO pipeline, and the two modes are likely
 * a result of switching between MIN and MAX settings, due to change in
 * IO load, based on amount of work. 
 * Kstats do not specifically identify individual ZIO queues, instead they 
 * report information aggregating by the two broad states, Wait and Run.
 * Some IOs will most certainly be double counted, since an IO will move
 * from Wait to Run, and possibly back to Wait.
 */
#include <kstat.h>
/*
 * Extract the wait/run queue counts from a kstat_io_t pointer.
 * The cast must be parenthesized with the pointer: "->" binds tighter
 * than a cast, so the previous form (kstat_io_t *)(kptr)->wcnt cast the
 * *member* value to a pointer instead of casting the pointer and then
 * reading the member.
 */
#define	KSTAT_IO_WCNT(kptr)	(((kstat_io_t *)(kptr))->wcnt)
#define	KSTAT_IO_RCNT(kptr)	(((kstat_io_t *)(kptr))->rcnt)

/* See uts/common/fs/zfs/sys/zio_priority.h */
typedef enum zio_priority {
	ZIO_PRIORITY_SYNC_READ,
	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
	ZIO_PRIORITY_NUM_QUEUEABLE,

	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */
} zio_priority_t;	/* typedef name added; was a nameless typedef */

BEGIN {
    /*
     * Map enum values in zio_priority to the string labels used as
     * aggregation keys below.
     * Typos fixed: "Read(Scrum/Resiver)" -> "Read(Scrub/Resilver)" and
     * "Queuable" -> "Queueable".
     */
    qn[ZIO_PRIORITY_SYNC_READ] = "Read";
    qn[ZIO_PRIORITY_ASYNC_READ] = "Prefetch";
    qn[ZIO_PRIORITY_SYNC_WRITE] = "Write(ZIL)";
    qn[ZIO_PRIORITY_ASYNC_WRITE] = "Write(SPAsync)";
    qn[ZIO_PRIORITY_SCRUB] = "Read(Scrub/Resilver)";
    qn[ZIO_PRIORITY_NUM_QUEUEABLE] = "Queueable";
    qn[ZIO_PRIORITY_NOW] = "Immediate";
    /* At the end, we compute delta to normalize count by second. */
    start_time = timestamp ;
}

/*
 * Tag IO as entering Wait Queue (vdev_queue_io_add inserts into the
 * wait queue -- the previous comment had Wait/Run swapped). We also
 * remember the pool kstat data pointer so the kstat probes below can
 * match on it.
 */
::vdev_queue_io_add:entry {
    self->pzio  = args[1] ;
    this->spa   = self->pzio->io_spa ;
    self->ks    = this->spa->spa_iokstat->ks_data ;
}

/* Tag IO as entering Run Queue (vdev_queue_pending_add). */
::vdev_queue_pending_add:entry {
    self->pzio  = args[1] ;
    this->spa   = self->pzio->io_spa ;
    self->ks    = this->spa->spa_iokstat->ks_data ;
}

/* Wait Queue */
::kstat_waitq_enter:entry /self->ks == args[0]/ {
    /* Predicate ensures this is the kstat of the pool tagged above. */
    this->spa   = self->pzio->io_spa ;
    this->p     = self->pzio->io_priority ;
    /* Active I/O count for this priority class in this pool. */
    this->activ = this->spa->spa_queue_stats[this->p].spa_active ;
    @c[self->pzio->io_spa->spa_name, "WAITQ"] = count() ;
    @q[self->pzio->io_spa->spa_name, "WAITQ"] = 
        quantize(KSTAT_IO_WCNT(args[0])) ;
    /* Grouped by ZIO Priority all IOs in Wait Queue */
    @qbyziot[self->pzio->io_spa->spa_name,
            "WAITQ", qn[this->p]] = quantize(this->activ);
    /* Clear the tags; zio and kstat pointers are reused. */
    self->pzio  = 0 ;
    self->ks    = 0 ;
}

/* Run Queue */
::kstat_runq_enter:entry /self->ks == args[0]/ {
    /* Predicate ensures this is the kstat of the pool tagged above. */
    this->spa   = self->pzio->io_spa ;
    this->p     = self->pzio->io_priority ;
    /* Active I/O count for this priority class in this pool. */
    this->activ = this->spa->spa_queue_stats[this->p].spa_active ;
    @c[self->pzio->io_spa->spa_name, "RUNQ"] = count() ;
    @q[self->pzio->io_spa->spa_name, "RUNQ"] = 
        quantize(KSTAT_IO_RCNT(args[0])) ;
    /* Grouped by ZIO Priority all IOs in Run Queue */
    @qbyziot[self->pzio->io_spa->spa_name,
            "RUNQ", qn[this->p]] = quantize(this->activ);
    /* Clear the tags; zio and kstat pointers are reused. */
    self->pzio  = 0 ;
    self->ks    = 0 ;
}

END {
    /*
     * Elapsed wall-clock runtime in whole seconds. Clamp the divisor to
     * a minimum of 1 so that stopping the script in under a second does
     * not cause a divide-by-zero inside normalize().
     */
    runtime_sec = (timestamp - start_time) / 1000000000 ;
    normalize(@c, runtime_sec > 0 ? runtime_sec : 1) ;
    /*
     * Print histograms and final count by pool.
     * We should hope that total count in Run and Wait queues
     * equals to sum of counts over all ZIO priority queues.
     * There are several states that any ZIO could be in, and they are
     * represented by individual histograms. In other words, each state
     * will be represented by a single histogram. We should expect most
     * by zio priority histograms to be pretty boring.
     */
    printa("           %s State: %s\tZIO Priority: %s%@d\n", @qbyziot);
    printa("           %s [Queued in: %s] %@d\n", @q);

    printf("+----------------------------------------------+\n");
    printf("|---->> Pool Per-Second Normalized Count <<----|\n");
    printf("+----------------------------------------------+\n");
    printf("Pool     Queue\tCount\n");
    printa("%-8s %-5s\t%@d\n", @c);
    printf("+----------------------------------------------+\n");
}