szaydel
4/29/2015 - 3:23 PM

[Dtrace IO snippets] Various pieces dealing with low-level SCSI information #tags: io probes, io, dtrace, device alignment, queue depth, lat

[Dtrace IO snippets] Various pieces dealing with low-level SCSI information #tags: io probes, io, dtrace, device alignment, queue depth, latency, throughput, bandwidth, IO-size, seek size, seeking, biowait, biodone

dtrace -qn '
  ::biowait:entry 
  / execname != "sched" && execname != "fsflush" / {
    this->start = timestamp ;
    this->buf = args[0] ;
  }
  ::biowait:return
  / this->start && execname != "sched" && execname != "fsflush" / {
    this->delta = timestamp - this->start ;

    /* This semaphore count may be interesting to track also */
    this->sema = (sema_impl_t *)&this->buf->b_sem ;

    /* Categorize results by whether or not there was an EIO */
    this->err = args[1] > 0 ? "failed (EIO)" : "completed" ;
    @q[execname, this->err] = quantize(this->delta) ;
    this->start = 0 ;
  }
  END {
      printa("\t   [%s] | IO => %s%@d\n", @q);
  }'
#!/usr/sbin/dtrace -s
/*
 * seeksize.d - analyse disk head seek distance by process.
 *              Written using DTrace (Solaris 10 3/05).
 *
 * Disk I/O events caused by processes will in turn cause the disk heads
 * to seek. This program analyses those seeks, so that we can determine
 * if processes are causing the disks to seek in a "random" or "sequential"
 * manner.
 *
 * 15-Jun-2005, ver 1.00
 *
 * USAGE:	seeksize.d		# wait several seconds, then hit Ctrl-C
 *
 * FIELDS:
 *		PID	process ID
 *		CMD	command and argument list
 *		value	distance in disk blocks (sectors)
 *		count	number of I/O operations
 *
 * SEE ALSO: bitesize.d, iosnoop
 *
 * Standard Disclaimer: This is freeware, use at your own risk.
 *
 * 11-Sep-2004	Brendan Gregg	Created this.
 * 10-Oct-2004	   "      "	Rewrote to use the io provider.
 */

#pragma D option quiet

/*
 * Print header
 */
dtrace:::BEGIN
{
	printf("Sampling... Hit Ctrl-C to end.\n");
}

self int last[dev_t];

/*
 * Process io start
 */
io:::start
{
	/* fetch details */
	this->dev = args[0]->b_edev;
	this->blk = args[0]->b_blkno;
	this->size = args[0]->b_bcount;
	cmd = (string)curpsinfo->pr_psargs;
}
io:::start
/self->last[this->dev] != 0/
{
	/* calculate seek distance */
	this->dist = this->blk - self->last[this->dev] > 0 ?
	    this->blk - self->last[this->dev] :
	    self->last[this->dev] - this->blk;

	/* store details */
	@Size[pid,cmd] = quantize(this->dist);
}
io:::start
{
	/* save last position of disk head */
	self->last[this->dev] = this->blk + this->size / 512;
}

/*
 * Print final report
 */
dtrace:::END 
{
	printf("\n%8s  %s\n","PID","CMD");
	printa("%8d  %s\n%@d\n",@Size);
}
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
 * This script provides useful information about distances we have to seek,
 * as well as direction and sequential vs. random IO observations.
 * We account for which disk the seeking pertains to, so the distances
 * measured are relevant to each device, but numbers are reported
 * cumulatively for all devices. It may be useful to report an average seeking
 * distance for all IOs that we observe in a sample period.
 */

self long last[dev_t]; 

io:::start
/self->last[args[0]->b_edev] != 0/
{
    this->iotype = args[0]->b_flags & B_WRITE ? "WRITE" :
                    args[0]->b_flags & B_READ ? "READ"  : "NOT R/W";

    this->last = self->last[args[0]->b_edev];
    this->dist = (int)(args[0]->b_blkno - this->last) > 0 ?
        args[0]->b_blkno - this->last : this->last - args[0]->b_blkno;
      
    this->fwd = args[0]->b_blkno + 
                (args[0]->b_bcount / 512) > this->last ? 1 : 0;

    this->seq = this->dist == 0 ? 0 : 1;
    this->direction = args[0]->b_blkno - this->last > 0 ? "RAND FW =>" : 
                      this->last - args[0]->b_blkno > 0 ? "<= RAND REV" :
                      this->last == args[0]->b_blkno && this->fwd ? 
                      "SEQ FW =>" : "<= SEQ REV";

    @gap[this->iotype, this->direction] = quantize(this->dist);
}

io:::start
{
  self->last[args[0]->b_edev] = args[0]->b_blkno +
      args[0]->b_bcount / 512;
}

dtrace:::END
{
  printa("        IO Type: %-8s Direction: %-12s\n%@d\n", @gap);
}
# Check for entry into power management functions in sd driver.
dtrace -qn '
  tick-1sec {
    ts = walltimestamp;
  }
  :::pm_lower_power:entry,
  :::sd_pm*:entry,
  :::sd_setup_pm:entry,
  :::sdpower:entry {@[ts, probefunc] = count();
  }
  tick-10sec {
    printa("%Y,%s,%@d\n", @);
    trunc(@);
  }'
dtrace -qn '
    tick-1sec {
    ts = walltimestamp;
    x = 1;
    }
    :::sd_pm_entry:entry /x/ {
    this->inq = ((struct sd_lun *)args[0])->un_sd->sd_inq;
    this->p = stringof(args[0]->un_sd->sd_inq->inq_pid);
    printf("%Y,%s,%s\n", ts, stringof(this->inq->inq_vid), stringof(this->inq->inq_pid));
    }'
dtrace -qn '::sd_enable_descr_sense:entry {print(*(struct sd_lun *)args[0]->ssc_un);}'

// How long does it take for system to deal with a mode page change? 
// In some cases we notice this could be rather slow, i.e. seconds.
dtrace -qn '
    ::sd_enable_descr_sense:entry { self->in=timestamp; }
    ::sd_enable_descr_sense:return {
        printf("%d\n", (timestamp - self->in) / 1000);
    }'
dtrace -qn '
  inline int threshold = 500000;
  inline int nano_to_micro = 1000;
  BEGIN {
    start = timestamp;
  }
  ::sd_start_cmds:entry {
    self->a = args[0]; 
    self->s = timestamp;
    self->devi = (struct dev_info *)args[0]->un_sd->sd_dev;
    @ioc = count();
  }
  ::sd_start_cmds:return /timestamp - self->s > threshold/ {
    this->devname = (struct dev_info *)self->devi->devi_devid_str;
    @q[stringof(this->devname)] =  quantize(timestamp - self->s);
    @iolc[stringof(this->devname)] = count();
    @longest[stringof(this->devname)] = max(timestamp - self->s);
  }
  ::sd_start_cmds:return {
    @avlat = avg(timestamp - self->s);
    self->a = 0; self->s = 0; self->devi = 0;
  }
  END {
    this->runt = (timestamp - start) / 1000000000;
    normalize(@avlat, nano_to_micro);
    normalize(@longest, nano_to_micro);
    normalize(@ioc, this->runt);
    printa("Device: %s\n# Long IOs: %@d Longest IO: %@d(us)\n%@d\n",
            @iolc, @longest, @q);
    printf("Runtime: %d(sec)\n", this->runt);
    printa("Average IO/sec: %@d Average Latency: %@d(us)\n", @ioc, @avlat);
  }'
// This should give us a bit of information about a failed command returned
// to caller. In some cases we may get a path and in most cases we should have
// exec'ing command.
dtrace -qn '
  int x;
  ::sd_return_failed_command:entry {
    ++x;
    this->path = args[1]->b_file != NULL ?
    stringof(args[1]->b_file->v_path) : "nopath";
    this->proc = args[1]->b_proc != NULL ?
    stringof(args[1]->b_proc->p_exec->v_path) : "noproc";
    @fail[args[0], this->path, this->proc] = count();
  }
  tick-10sec /x/ {
    printa("ptr: %p path: %s cmd: %s = %@d\n", @fail);
    trunc(@fail); x = 0;
  }'
dtrace -qn '
  /* 
   * All Reads and Writes go through this function, so it is a convenient
   * place to take counts.
   */
  BEGIN { start = timestamp ; }
  ::bioerror:entry /args[0]->b_flags&B_READ/ {
    @r = sum(args[0]->b_flags&B_READ ? 1 : 0);
    @rsz = sum(args[0]->b_bcount >> 10);
  }
  ::bioerror:entry /args[0]->b_flags&B_WRITE/ {
    @w  = sum(args[0]->b_flags&B_WRITE ? 1 : 0);
    @wsz = sum(args[0]->b_bcount >> 10);
  }
  END {
    normalize(@r, (timestamp - start) / 1000000000);
    normalize(@w, (timestamp - start) / 1000000000);
    normalize(@rsz, (timestamp - start) / 1000000000);
    normalize(@wsz, (timestamp - start) / 1000000000);
    printa("Av_R/s: %@d Av_KBr/s: %@d Av_W/s: %@d Av_KBw/s: %@d\n",
            @r, @rsz, @w, @wsz);
  }'
dtrace -qn '
/* This value may require adjustment, depending upon amount IO a system does. */
#pragma D option dynvarsize=8m

string scsi_cmd[uchar_t];

dtrace:::BEGIN
{
	/* See /usr/include/sys/scsi/generic/commands.h for the full list. */
	scsi_cmd[0x00] = "test_unit_ready";
	scsi_cmd[0x08] = "read";
	scsi_cmd[0x0a] = "write";
	scsi_cmd[0x12] = "inquiry";
    scsi_cmd[0x15] = "mode_select";
	scsi_cmd[0x17] = "release";
	scsi_cmd[0x1a] = "mode_sense";
	scsi_cmd[0x1b] = "load/start/stop";
	scsi_cmd[0x1c] = "get_diagnostic_results";
	scsi_cmd[0x1d] = "send_diagnostic_command";
	scsi_cmd[0x25] = "read_capacity";
	scsi_cmd[0x28] = "read10";
	scsi_cmd[0x2a] = "write10";
	scsi_cmd[0x35] = "synchronize_cache";
	scsi_cmd[0x4d] = "log_sense";
	scsi_cmd[0x5e] = "persistent_reserve_in";
	scsi_cmd[0xa0] = "report_luns";
}

BEGIN {
    ts = walltimestamp;
    printf("timestamp,cmd,reason,avg.lat,stddev.lat,min.lat,max.lat,lat.1000us,lat.2000us,lat.4000us,lat.8000us,lat.16000us,lat.32000us,lat.64000us,lat.gt.64000us,count.ios\n");
}

fbt::scsi_transport:entry
{
	start[arg0] = timestamp;
}

fbt::scsi_destroy_pkt:entry
/start[arg0]/
{
	this->delta = (timestamp - start[arg0]) / 1000;
    start[arg0] = 0;
	
    this->code = *args[0]->pkt_cdbp;
	this->cmd = scsi_cmd[this->code] != NULL ?
	    scsi_cmd[this->code] : lltostr(this->code);
	this->reason = args[0]->pkt_reason == 0 ? "Success" :
	    strjoin("Fail:", lltostr(args[0]->pkt_reason));

    /* There may be a small error in terms of count measured in @cnt, when
     * compared against summing all latency buckets. This is hopefully along 
     * the lines of >1% or less. We expect that in most cases the two numbers
     * should be identical.
     */
    @cnt[ts, this->cmd, this->reason]      = count();
    @latavg[ts, this->cmd, this->reason]   = avg(this->delta);
	@latsd[ts, this->cmd, this->reason]    = stddev(this->delta);
	@latmin[ts, this->cmd, this->reason]   = min(this->delta);
    @latmax[ts, this->cmd, this->reason]   = max(this->delta);
    /* 
     * Bucket latencies into buckets from 1000 to above 64K microseconds, 
     * starting small and then increasing rapidly, meaning error in reporting
     * will be not scale linearly. 
     */
    @lat1000[ts, this->cmd, this->reason]  = sum(this->delta > 0 && 
                                                this->delta <= 1000 ? 1 : 0);
    @lat2000[ts, this->cmd, this->reason]  = sum(this->delta > 1000 &&      
                                                this->delta <= 2000 ? 1 : 0);
    @lat4000[ts, this->cmd, this->reason]  = sum(this->delta > 2000 && 
                                                this->delta <= 4000 ? 1 : 0);
    @lat8000[ts, this->cmd, this->reason]  = sum(this->delta > 4000 && 
                                                this->delta <= 8000 ? 1 : 0);
    @lat16000[ts, this->cmd, this->reason] = sum(this->delta > 8000 && 
                                                this->delta <= 16000 ? 1 : 0);
    @lat32000[ts, this->cmd, this->reason] = sum(this->delta > 16000 && 
                                                this->delta <= 32000 ? 1 : 0);
    @lat64000[ts, this->cmd, this->reason] = sum(this->delta > 32000 && 
                                                this->delta <= 64000 ? 1 : 0);
    @latgt64k[ts, this->cmd, this->reason] = sum(this->delta > 64000 ? 1 : 0);
}

tick-1sec
{
	printa("%Y,%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", 
            @latavg, @latsd, @latmin, @latmax, @lat1000, @lat2000, @lat4000, 
            @lat8000, @lat16000, @lat32000, @lat64000, @latgt64k, @cnt);
    trunc(@lat1000); trunc(@lat2000); trunc(@lat4000); 
    trunc(@lat8000); trunc(@lat16000); trunc(@lat32000); 
    trunc(@lat64000); trunc(@latgt64k);
    trunc(@cnt); trunc(@latavg); trunc(@latsd); trunc(@latmin); trunc(@latmax);
    ts = walltimestamp;
}'
dtrace -qn '
/* This value may require adjustment, depending upon amount IO a system does. */
#pragma D option dynvarsize=8m

/* This version will not work on earlier versions of bsros than
 * BrickStorOS bsr-fdb711e2 15.07 15.0724.001D i86pc i386 i86pc
*/

string scsi_cmd[uchar_t];

dtrace:::BEGIN
{
    /* See /usr/include/sys/scsi/generic/commands.h for the full list. */
    scsi_cmd[0x00] = "test_unit_ready";
    scsi_cmd[0x03] = "request_sense";
    scsi_cmd[0x08] = "read";
    scsi_cmd[0x0a] = "write";
    scsi_cmd[0x12] = "inquiry";
    scsi_cmd[0x15] = "mode_select";
    scsi_cmd[0x17] = "release";
    scsi_cmd[0x1a] = "mode_sense";
    scsi_cmd[0x1b] = "load/start/stop";
    scsi_cmd[0x1c] = "get_diagnostic_results";
    scsi_cmd[0x1d] = "send_diagnostic_command";
    scsi_cmd[0x25] = "read_capacity";
    scsi_cmd[0x28] = "read10";
    scsi_cmd[0x2a] = "write10";
    scsi_cmd[0x35] = "synchronize_cache";
    scsi_cmd[0x37] = "read_defect_data10";
    scsi_cmd[0x4d] = "log_sense";
    scsi_cmd[0x5e] = "persistent_reserve_in";
    scsi_cmd[0x88] = "read16";
    scsi_cmd[0x8a] = "write16";
    scsi_cmd[0xa0] = "report_luns";
    scsi_cmd[0xb7] = "read_defect_data12";
}

/* This is a fairly wide table, and some columns can benefit from a bit of
 * explanation. This is a description of all columns in order, all latency
 * bucket columns are lat.XXXXus, where XXXX is a unit of time.
 *
 * timestamp - time event logged
 * cmd - scsi command, one of scs_cmd[] above
 * reason - usually indicates success or failure of command
 * avg.lat - on average how long this group of command takes to complete
 * combined.lat - sum of latencies for all instances of this type of command
 * min.lat - smallest recorded latency value
 * max.lat - largest recorded latency value
 * lat.lt.100us - anything under 100 microseconds
 * lat.XXXXus - histogram of latency, each bucket is 2x previous bucket
 * lat.gt.64000us - anything above 64K microseconds (64milliseconds)
 * count.ios - number of IOs of given command
 * avg.qd.drv - average queue size in the drive
 * min.qd.drv - smallest recorded queue size in the drive
 * max.qd.drv - largest recorded queue size in the drive
 * avg.qd.xport - average number of IOs in transport
 * min.qd.xport - smallest recorded number of IOs in transport
 * max.qd.xport - largest recorded number of IOs in transport
 */

BEGIN {
    ts = walltimestamp;
    printf("timestamp,cmd,reason,avg.lat,combined.lat,min.lat,max.lat,lat.lt.100us,lat.1000us,lat.2000us,lat.4000us,lat.8000us,lat.16000us,lat.32000us,lat.64000us,lat.gt.64000us,count.ios,avg.qd.drv,min.qd.drv,max.qd.drv,avg.qd.xport,min.qd.xport,max.qd.xport\n");
}

::sd_start_cmds:entry 
{
    x[args[0]->un_sd] = args[0];
}

fbt::scsi_transport:entry /x[args[0]->pkt_address.a.a_sd] !=NULL /
{
    start[arg0] = timestamp;
    in_xport[arg0] = x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport;
    in_drv[arg0] = x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver;
    /* x[args[0]->pkt_address.a.a_sd] = 0; clear contents of array */
}

/* 
 * Depending on version of OS, this may need to be fbt::scsi_destroy_pkt:entry,
 * or fbt::vhci_scsi_destroy_pkt:entry. In the first case, args[0] is where
 * the pkt_cdbp should be coming from, and args[1] otherwise.
 */
fbt::vhci_scsi_destroy_pkt:entry
/start[arg1]/
{
    this->in_xport = in_xport[arg1];
    this->in_drv = in_drv[arg1];
    this->delta = timestamp - start[arg1];
    in_xport[arg1] = 0;
    in_drv[arg1] = 0;
    start[arg1] = 0;
    this->code = *args[1]->pkt_cdbp;
    this->cmd = scsi_cmd[this->code] != NULL ?
        scsi_cmd[this->code] : lltostr(this->code);
    this->reason = args[1]->pkt_reason == 0 ? "Success" :
        strjoin("Fail:", lltostr(args[1]->pkt_reason));

    /* There may be a small error in terms of count measured in @cnt, when
     * compared against summing all latency buckets. This is hopefully along 
     * the lines of >1% or less. We expect that in most cases the two numbers
     * should be identical.
     *
     * Do not add standard deviation calculation to this tool. Due to a bug
     * in printing aggregations, standard deviation may result in a
     * divisor != 0 assert being triggered.
     */
    @cnt[ts, this->cmd, this->reason]      = count();
    @latavg[ts, this->cmd, this->reason]   = avg(this->delta);
    @latsum[ts, this->cmd, this->reason]   = sum(this->delta);
    @latmin[ts, this->cmd, this->reason]   = min(this->delta);
    @latmax[ts, this->cmd, this->reason]   = max(this->delta);
    
    /* Track queue depth in the drive for each sd_lun and in transport. */
    @qdxpavg[ts, this->cmd, this->reason]  = avg(this->in_xport);
    @qddrvavg[ts, this->cmd, this->reason] = avg(this->in_drv);
    @qdxpmin[ts, this->cmd, this->reason]  = min(this->in_xport);
    @qddrvmin[ts, this->cmd, this->reason] = min(this->in_drv);
    @qdxpmax[ts, this->cmd, this->reason]  = max(this->in_xport);
    @qddrvmax[ts, this->cmd, this->reason] = max(this->in_drv);

    /* 
     * Bucket latencies into buckets from 1000 to above 64K microseconds, 
     * starting small and then increasing rapidly, meaning error in reporting
     * will be not scale linearly. 
     */
    @latlt100[ts, this->cmd, this->reason]  = sum(this->delta > 0 && 
                                this->delta <= 100000 ? 1 : 0);
    @lat1000[ts, this->cmd, this->reason]  = sum(this->delta > 100000 && 
                                this->delta <= 1000000 ? 1 : 0);
    @lat2000[ts, this->cmd, this->reason]  = sum(this->delta > 1000000 &&      
                                this->delta <= 2000000 ? 1 : 0);
    @lat4000[ts, this->cmd, this->reason]  = sum(this->delta > 2000000 && 
                                this->delta <= 4000000 ? 1 : 0);
    @lat8000[ts, this->cmd, this->reason]  = sum(this->delta > 4000000 && 
                                this->delta <= 8000000 ? 1 : 0);
    @lat16000[ts, this->cmd, this->reason] = sum(this->delta > 8000000 && 
                                this->delta <= 16000000 ? 1 : 0);
    @lat32000[ts, this->cmd, this->reason] = sum(this->delta > 16000000 && 
                                this->delta <= 32000000 ? 1 : 0);
    @lat64000[ts, this->cmd, this->reason] = sum(this->delta > 32000000 && 
                                this->delta <= 64000000 ? 1 : 0);
    @latgt64k[ts, this->cmd, this->reason] = sum(this->delta > 64000000 ? 1 : 0);
}

tick-1sec
{
    /* Instead of delta divided by 1000 to convert to microseconds we
     * normalize the aggregation by 1000, i.e. convert nanoseconds to 
     * microseconds.
     */
    normalize(@latavg, 1000);
    normalize(@latsum, 1000); 
    normalize(@latmin, 1000);
    normalize(@latmax, 1000);
    printa("%Y,%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", 
            @latavg, @latsum, @latmin, @latmax, @latlt100, @lat1000, @lat2000, @lat4000, 
            @lat8000, @lat16000, @lat32000, @lat64000, @latgt64k, @cnt,
            @qddrvavg, @qddrvmin, @qddrvmax, @qdxpavg, @qdxpmin, @qdxpmax
            );
    trunc(@qdxpavg); trunc(@qdxpmin); trunc(@qdxpmax);
    trunc(@qddrvavg); trunc(@qddrvmin); trunc(@qddrvmax);
    trunc(@latlt100); trunc(@lat1000); trunc(@lat2000); trunc(@lat4000); 
    trunc(@lat8000); trunc(@lat16000); trunc(@lat32000); 
    trunc(@lat64000); trunc(@latgt64k); trunc(@cnt); trunc(@latavg); 
    trunc(@latsum); trunc(@latmin); trunc(@latmax);
    ts = walltimestamp;
}'
# Correlating sd_start_cmds and scsi_transport

dtrace -qn '::sd_start_cmds:entry {x[args[0]->un_sd] = 1} ::scsi_transport:entry /x[args[0]->pkt_address.a.a_sd] > 0/ {trace("X"); x[args[0]->pkt_address.a.a_sd] = 0;}'

dtrace -qn '
  ::sd_start_cmds:entry 
  {
    x[args[0]->un_sd] = args[0]
  }
  ::scsi_transport:entry /x[args[0]->pkt_address.a.a_sd] !=NULL / 
  {
    @qavx["in-xport"] = 
        avg(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport);
    @qavsd["in-sd"] = 
        avg(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver);
    @qminx["in-xport"] = 
        min(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport);
    @qminsd["in-sd"] = 
        min(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver);
    @qmaxx["in-xport"] = 
        max(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport);
    @qmaxsd["in-sd"] = 
        max(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver);
  }
  tick-30sec {
    printa("xport avgq: %@d sd avgq: %@d xport minq: %@d sd minq: %@d xport maxq: %@d sd maxq: %@d\n", 
    @qavx, @qavsd, @qminx, @qminsd, @qmaxx, @qmaxsd);
    trunc(@qavx); trunc(@qavsd); 
    trunc(@qminx); trunc(@qminsd); 
    trunc(@qmaxx); trunc(@qmaxsd);
    }'
    
***

dtrace -qn '
  io:::start {
    start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
  }
  io:::done
  /start[args[0]->b_edev, args[0]->b_blkno]/ {
    this->op = args[0]->b_flags & B_READ ? "Read" : "Write";
    this->devp = strtok(args[1]->dev_pathname, ":");
    this->elapsed = timestamp - 
            start[args[0]->b_edev, args[0]->b_blkno];

    this->b = args[0]->b_bcount >= 1 << 20 ? 1 << 20 :
            args[0]->b_bcount >= 1 << 19 ? 1 << 19 :
            args[0]->b_bcount >= 1 << 18 ? 1 << 18 :
            args[0]->b_bcount >= 1 << 17 ? 1 << 17 :
            args[0]->b_bcount >= 1 << 16 ? 1 << 16 :
            args[0]->b_bcount >= 1 << 18 ? 1 << 18 :
            args[0]->b_bcount >= 1 << 17 ? 1 << 17 :
            args[0]->b_bcount >= 1 << 16 ? 1 << 16 :
            args[0]->b_bcount >= 1 << 15 ? 1 << 15 :
            args[0]->b_bcount >= 1 << 14 ? 1 << 14 :
            args[0]->b_bcount >= 1 << 13 ? 1 << 13 :
            args[0]->b_bcount >= 1 << 12 ? 1 << 12 :
            args[0]->b_bcount >= 1 << 11 ? 1 << 11 :
            args[0]->b_bcount >= 1 << 10 ? 1 << 10 :
            args[0]->b_bcount >= 1 << 9 ? 1 << 9 :
            args[0]->b_bcount >= 1 << 8 ? 1 << 8 :
            args[0]->b_bcount >= 1 << 7 ? 1 << 7 :
            args[0]->b_bcount >= 1 << 6 ? 1 << 6 :
            args[0]->b_bcount >= 1 << 5 ? 1 << 5 :
            args[0]->b_bcount >= 1 << 4 ? 1 << 4 :
            args[0]->b_bcount >= 1 << 3 ? 1 << 3 :
            args[0]->b_bcount >= 1 << 2 ? 1 << 2 :
            args[0]->b_bcount >= 1 << 1 ? 2 :
            args[0]->b_bcount >= 1 << 0 ? 1 : 0;

    /*
     * To get much more verbose output, which breaks this further down
     * by individual device, use this commented aggregate instead of
     * one we are currently using.
     *
     * @[this->op, this->b, args[1]->dev_statname, this->devp] =
     *     quantize(this->elapsed / 1000);
     * And, use this printa instead of one used now.
     * printa("\t   %s(b)=%d <%s> %s%@d\n", @);
     */
    @h[this->op, this->b]   = quantize(this->elapsed / 1000); /* latency in uS */
    @av[this->op, this->b]  = avg(this->elapsed / 1000);
    @tot[this->op, this->b] = sum(this->elapsed / 1000);
    @tp[this->op, this->b]  = avg((args[0]->b_bcount * 976562) / this->elapsed);
    start[args[0]->b_edev, args[0]->b_blkno] = 0;
  }
  END {
    normalize(@tot, 1000);
    printa("\t   %s = %d(b) | AvLat = %@d(uS) | Active = %@d(mS) | Tput = %@d(KB/s)%@d\n",
    @av, @tot, @tp, @h);
  }'
#!/usr/sbin/dtrace -qCs
/* 
 * Script produces histograms to give a sense of how much IO is queued
 * in the driver and in transport. Histogram seems to be a better means
 * of examining this information as opposed to averages, mins and maxes.
 */
#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
#define DEV_NAME(un) stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name)
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)

/*
 This script should be run in this manner, which gives us ability to control 
 how long the script will run before it returns.
 # dtrace -qCs  ./x -n 'tick-5sec { exit(0) }'
 
 Result will look something like this, where a serial number is given, 
 if known, and not a "" string, and instance corresponds to what you
 get with `iostat -e`.
 
           [In Driver]>     ZA204NDT            (sd2)
           value  ------------- Distribution ------------- count
               1 |                                         0
               2 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 24
               3 |                                         0
*/

::sd_start_cmds:entry {
    this->serial = stringof(args[0]->un_sd->sd_inq->inq_serial) != "" ? stringof(args[0]->un_sd->sd_inq->inq_serial) : "UNKNOWN";
    this->devname = DEV_NAME(args[0]);
    this->devinst = DEV_INST(args[0]);

    @htrans[this->serial, this->devname, this->devinst] =
        lquantize(args[0]->un_ncmds_in_transport, 0, 32, 1);
    @hdriv[this->serial, this->devname, this->devinst] =
        lquantize(args[0]->un_ncmds_in_driver, 0, 32, 1);
}

END {
    printa("\t   [In Driver]>     %-16s\t(%s%d) %@d\n", @hdriv);
    printa("\t   [In Transport]>  %-16s\t(%s%d) %@d\n", @htrans);
}
#!/usr/sbin/dtrace -qCs
/* Script tracks max and average number of commands queued.  */
#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
#define DEV_NAME(un) stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name)
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)

/*
 This script should be run in this manner, which gives us ability to control 
 how long the script will run before it returns.
 # dtrace -qCs  ./x -n 'tick-5sec { exit(0) }'
 
 Result will look something like this, where a serial number is given, 
 if known, and not a "" string, and instance corresponds to what you
 get with `iostat -e`.
 
 
 Device Serial    Instance Max.Queued Avg.Queued Max.Transp Avg.Transp
 -------------    -------- ---------- ---------- ---------- ----------
 Z3G011B2         sd12     17         6          16         5
 Z3G011G8         sd10     19         6          18         5
...trimmed...
*/

::sd_start_cmds:entry {
    this->serial = stringof(args[0]->un_sd->sd_inq->inq_serial) != "" ? stringof(args[0]->un_sd->sd_inq->inq_serial) : "UNKNOWN";
    this->devname = DEV_NAME(args[0]);
    this->devinst = DEV_INST(args[0]);
    @md[this->serial, this->devname, this->devinst] =
        max(args[0]->un_ncmds_in_driver);
    @mt[this->serial, this->devname, this->devinst] =
        max(args[0]->un_ncmds_in_transport);
    @ad[this->serial, this->devname, this->devinst] =
        avg(args[0]->un_ncmds_in_driver);
    @at[this->serial, this->devname, this->devinst] =
        avg(args[0]->un_ncmds_in_transport);
}

END {
        printf("%-16s %-8s %-10s %-10s %-10s %-10s\n", "Device Serial", "Instance", 
                "Max.Queued", "Avg.Queued", "Max.Transp", "Avg.Transp");

        printf("%-16s %-8s %-10s %-10s %-10s %-10s\n", "-------------", "--------", 
                "----------", "----------", "----------", "----------");
        printa("%-16s %s%-6d %-10@d %-10@d %-10@d %-10@d\n", @md, @ad, @mt, @at);

}
#!/usr/sbin/dtrace -Cs
#pragma D option quiet

#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
/* Not using these for now, but they could offer more information
 * about devices.
 */
#define DEV_NAME(un) \
      stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name) /* ` */
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)

self long long last_byte_offset[struct sd_lun *];

::sd_core_iostart:entry {
  this->phys_blksz = args[1]->un_phy_blocksize;
  this->blk = args[2]->_b_blkno._p._l;
  this->size = args[2]->b_bcount;
  this->dev = args[1];
}
::sd_core_iostart:entry /self->last_byte_offset[this->dev] != 0/ {
  /* 
   * Create buckets to group distances by IO size. The intent is to measure
   * whether certain patterns are evident, like all small IOs seem to cluster
   * in a particular area of a disk or are they evenly spread out, etc.
   */
  this->bucket = 
    this->size <= 4096 ? "<= 4K" :
    this->size > 4096 && this->size <= 0x8000 ?
        "> 4K to 32K" :
    this->size > 0x8000 && this->size <= 0x10000 ?
        "> 32K to 64K" : "> 64K" ;

  this->dist = this->blk - self->last_byte_offset[this->dev] > 0 ?
        this->blk - self->last_byte_offset[this->dev] :
        self->last_byte_offset[this->dev] - this->blk;
  @seeks[stringof(SD_TO_DEVINFO(this->dev)->devi_devid_str), 
        this->bucket] = quantize(this->dist);
}
::sd_core_iostart:entry {
/* Record offset on device accounting for size of last IO done. */
self->last_byte_offset[args[1]] = 
    this->blk + (this->size / this->phys_blksz);
}
END {
  printa("\t   %s | IO Size: [ %s ] %@d\n", @seeks);
}
dtrace -qn '
/* 
  Script measures and reports every 10 seconds read and write latency,
  IOPs, and throughput. This is a useful script to watch, while tuning
  ZFS write throttle mechanism. The goal is to have acceptable latency
  while also maximizing possible throughput to disk drives.
*/
BEGIN
{
    start = timestamp;
    line = "------------------------------------------------------------------------";
}

io:::start {
    ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
}

io:::done /ts[args[0]->b_edev, args[0]->b_lblkno]/ {
    this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
    this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ?
        "Read" : "Write";

    this->size = args[0]->b_bcount;
    this->buck = 512; /* We assume this is smallest possible size */
    this->buck = ( this->size > 512 ) ? 1024 : this->buck ;
    this->buck = ( this->size > 1024 ) ? 2048 : this->buck ;
    this->buck = ( this->size > 4096 ) ? 8192 : this->buck ;
    this->buck = ( this->size > 8192 ) ? 16384 : this->buck ;
    this->buck = ( this->size > 16384 ) ? 32768 : this->buck ;
    this->buck = ( this->size > 32768 ) ? 65536 : this->buck ;
    this->buck = ( this->size > 65536 ) ? 131072 : this->buck ;
    this->buck = ( this->size > 131072 ) ? 262144 : this->buck ;
    this->buck = ( this->size > 262144 ) ? 524288 : this->buck ;
    this->buck = ( this->size > 524288 ) ? 1048576 : this->buck ;
    
    @iosz[this->name] = quantize(this->buck);
    @q[this->name] = quantize(this->delta);
    @a[this->name] = avg(this->delta);
    @v[this->name] = stddev(this->delta);
    @i[this->name] = count();
    @b[this->name] = sum(args[0]->b_bcount);

    ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}

tick-10sec {
    printf("\n%80s\n", line);
    printa("%14s I/Os by size: %@d\n", @iosz);
    printa("%14s latency(uS): %@d\n", @q);
    /*
     * Adjust for the runtime of each sample, i.e. calculate total seconds
     * since last timestamp, and normalize the data, to get true per second 
     * values.
     */
    normalize(@i, (timestamp - start) / 1000000000);
    normalize(@b, (timestamp - start) / 1000000000 * 1024);
    printf("%-30s %11s %11s %11s %11s\n", "", "avg.lat(uS)", "stddev(uS)",
        "iops", "throughput");
    printa("%-30s %@9uus %@9uus %@9u/s %@8uk/s\n", @a, @v, @i, @b);
    trunc(@iosz); trunc(@q); trunc(@a); trunc(@v); trunc(@i); trunc(@b);
    start = timestamp;
}'
dtrace -qn '
int realtm, start;
/* int done[string]; */
inline const int NORM_KB = 1024;
inline const int NS_S = 1000000000;
inline const int NS_MS = 1000000;
/* 
 * Script measures and reports every 1 seconds read and write latency,
 * IOPs, and throughput. This is a useful script to watch, while tuning
 * ZFS write throttle mechanism. The goal is to have acceptable latency
 * while also maximizing possible throughput to disk drives.
 *
 * This script is also ideal for pairing with Load Dynamix standardized 
 * tests that we run, as a means of correlating what happens at the disk
 * level, vs. what happens at the application layer. We want to measure IOs
 * and latency, which gives us a sense of service time at the application
 * layer and see how that translates to IOs and latency at the disk layer.
 */
BEGIN
{
  printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
        "io.type", "runtime.ms", "min.lat", "max.lat", "avg.lat",
        "max.io.size", "iops", "throughput.KBps",
        "512b.ct", "1024b.ct", "2048b.ct", "4096b.ct", "8192b.ct",
        "16384b.ct", "32768b.ct", "65536b.ct", "131072b.ct",
        "total.ios", "active.ios", "arc.ios", "arc.io.bytes", "avg.bytes.per.us"
  );
  hdr = 1;
  start = timestamp;
}

io:::start {
  this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ? "R" : "W";
  ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
  @queued[this->name] = sum(1); /* increment count of active IOs */
}

/* 
 * There is no very direct correlation between IOs measured here
 * and IOs resulting in physical bytes being read from the disks.
 * It does help though to have some context about IOs that do not actually
 * result in physical IO.
 */
::arc_read_done:entry {
  @asz["R"]           = sum((uint64_t)args[0]->io_size) ;
  @aios["R"]          = count() ;
}

io:::done /ts[args[0]->b_edev, args[0]->b_lblkno]/ {
  this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ? "R" : "W";
}

io:::done /ts[args[0]->b_edev, args[0]->b_lblkno]/ {
  this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
  this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ? "R" : "W";
  this->size = args[0]->b_bcount; /* How many bytes in this IO? */
  this->buck = 512; /* We assume this is smallest possible IO size. */
  this->buck = ( this->size > 512 ) ? 1024 : this->buck ;
  this->buck = ( this->size > 1024 ) ? 2048 : this->buck ;
  this->buck = ( this->size > 4096 ) ? 8192 : this->buck ;
  this->buck = ( this->size > 8192 ) ? 16384 : this->buck ;
  this->buck = ( this->size > 16384 ) ? 32768 : this->buck ;
  this->buck = ( this->size > 32768 ) ? 65536 : this->buck ;
  this->buck = ( this->size > 65536 ) ? 131072 : this->buck ;
  this->buck = ( this->size > 131072 ) ? 262144 : this->buck ;
  this->buck = ( this->size > 262144 ) ? 524288 : this->buck ;
  this->buck = ( this->size > 524288 ) ? 1048576 : this->buck ;

  /*
   * The size buckets are currently limited to 128K, from 512b to 128K,
   * in base2 steps, i.e. 512, 1024, etc.
   * We may want to add larger IO sizes, if we find that they happen.
   */
  @lmin[this->name]   = min(this->delta);
  @lmax[this->name]   = max(this->delta);
  @a[this->name]      = avg(this->delta);
  /* Normalize to 1-second intervals */
  @i[this->name]      = count();
  /* Average bytes per microsecond of IO time. */
  this->bpus = this->delta > 0 ? (this->size / this->delta) : this->size;
  @abus[this->name]   = avg(this->size / this->delta);
  /* Do not normalize (@ios) to get total I/Os per interval. */
  @ios[this->name]    = count();
  @b[this->name]      = sum(args[0]->b_bcount);
  @queued[this->name] = sum(-1); /* decrement count of active IOs */
  @maxiosz[this->name]     = max(this->size);
  @sz512[this->name]  = sum(this->buck == 512 ? 1 : 0 );
  @sz1024[this->name]  = sum(this->buck == 1024 ? 1 : 0 );
  @sz2048[this->name]  = sum(this->buck == 2048 ? 1 : 0 );
  @sz4096[this->name]  = sum(this->buck == 4096 ? 1 : 0 );
  @sz8192[this->name]  = sum(this->buck == 8192 ? 1 : 0 );
  @sz16384[this->name]  = sum(this->buck == 16384 ? 1 : 0 );
  @sz32768[this->name]  = sum(this->buck == 32768 ? 1 : 0 );
  @sz65536[this->name]  = sum(this->buck == 65536 ? 1 : 0 );
  @sz131072[this->name]  = sum(this->buck == 131072 ? 1 : 0 );
  ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}

tick-5sec {
  this->delta = (timestamp - start) / NS_S;
  /*
   * This is a hack, it takes advantage of the fact that realtm value will
   * always increase, and never decrease. Because we do not have an aggregate
   * for storing only last value, this is the best we can do. We are
   * effectively storing latest computed value.
   */
  @t["R"]      = max(realtm);
  @t["W"]      = max(realtm);

  /* 
   * Normalizing by 1024 can introduce sample error when the value being 
   * divided is less than 1024. Since there is no floating point division,
   * resulting value will be 0, which if data is routinely less than 1024,
   * can create a serious discontinuity between real data and normalized
   * data.
   */
  normalize(@b, NORM_KB);
  normalize(@t, NS_MS);
  normalize(@i, 5); /* adjust based on tick-$value */

  printa(
    "%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", 
                      @t, @lmin, @lmax, @a, @maxiosz, @i, @b, 
                      @sz512, @sz1024, @sz2048, @sz4096, @sz8192, @sz16384,
                      @sz32768, @sz65536, @sz131072, @ios, @queued, 
                      @aios, @asz, @abus);
  /*
   * Truncate aggregations and re-set timestamp for next sampling period.
   */
  trunc(@lmin); trunc(@lmax); trunc(@a); trunc(@maxiosz); trunc(@i); 
  trunc(@b); trunc(@sz512); trunc(@sz1024); trunc(@sz2048); trunc(@sz4096);
  trunc(@sz8192); trunc(@sz16384); trunc(@sz32768); trunc(@sz65536);
  trunc(@sz131072); trunc(@ios); trunc(@aios); trunc(@asz); trunc(@abus);
  realtm = realtm + (timestamp - start);
  start = timestamp;
}'
// Measure how much IO is aligned vs. not-aligned to 16K
// physical device size.
dtrace -qn '
    long total_bytes, issued, notmult;
    long misaligned, misalignedR, misalignedW, ltsize, ltsizeR, ltsizeW;

    BEGIN {
        issued = 0;
        misaligned = 0;
        notmult = 0;
        ltsize = 0;
        ltsizeR = 0;
        ltsizeW = 0;
        total_bytes = 0;
        OFFSET = 32;
        SIZE=16384; /* 16K blocksize */
    }

    fbt:sd:sd_core_iostart:entry {
        issued++;
        bp = (struct buf *) arg2;
        xp = (struct sd_xbuf *)((bp)->b_private);
        misalignedR += bp->b_flags & B_READ
            ? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;
        misalignedW += bp->b_flags & B_WRITE
            ? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;    
        misaligned  += (xp->xb_blkno % OFFSET) ? 1 : 0;
        @zeroalign[(bp->b_bcount % SIZE) ? 
                    "NOT MULTIPLE" : "MULTIPLE"] = 
                    quantize(xp->xb_blkno % OFFSET);
        ltsize      += bp->b_bcount < SIZE ? 1 : 0; /* # of IOs under 16K */
        ltsizeR     += (bp->b_bcount < SIZE) 
            ? (bp->b_flags & B_READ ? 1 : 0) : 0;
        ltsizeW     += (bp->b_bcount < SIZE) 
            ? (bp->b_flags & B_WRITE ? 1 : 0) : 0;
        notmult     += (bp->b_bcount % SIZE) ? 1 : 0;
        total_bytes += bp->b_bcount;
        @align[(xp->xb_blkno % OFFSET) 
                ? "UNALIGNED(bufsize)" : "ALIGNED(bufsize)"] =
                quantize(bp->b_bcount);
    }
    END {
        pct_ltsizeR     = issued > 0 ? 100 * ltsizeR / issued : 0;
        pct_ltsizeW     = issued > 0 ? 100 * ltsizeW / issued : 0;
        pct_ltsize      = issued > 0 ? 100 * ltsize / issued : 0;
        pct_misalign    = issued > 0 ? 100 * misaligned / issued : 0;
        pct_notmult     = issued > 0 ? 100 * notmult / issued : 0;
        printf("=========================================\n");
        printf("================ Summary ================\n");
        printf("=========================================\n");
        printf("Short Reads: %u%% Short Writes: %u%%\n", 
                pct_ltsizeR, pct_ltsizeW);
        printf("Unaligned Reads: %u Unaligned Writes: %u\n",
                misalignedR, misalignedW);
        printf("Bytes Processed: %u\nTotal IOs Issued: %u\nTotal IOs Smaller than %d: %u%%\nNot multiple of %d: %d%%\nUnaligned: %d%%\n", 
        total_bytes, issued, SIZE, pct_ltsize, SIZE, pct_notmult, pct_misalign);
        printf("=========================================\n");
        printa(@align);
        printa(@zeroalign);
    }'
dtrace -qn '
    /* First argument to this should be some blocksize, like 8192 for 8K. */
    long total_bytes, issued, notmult;
    long misaligned, misalignedR, misalignedW, ltsize, ltsizeR, ltsizeW;

    BEGIN {
        issued = 0;
        misaligned = 0;
        notmult = 0;
        ltsize = 0;
        ltsizeR = 0;
        ltsizeW = 0;
        total_bytes = 0;
        OFFSET = 32;
        SIZE=$1; /* XX blocksize */
    }

    fbt:sd:sd_core_iostart:entry {
        issued++;
        bp = (struct buf *) arg2;
        xp = (struct sd_xbuf *)((bp)->b_private);
        misalignedR += bp->b_flags & B_READ
            ? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;
        misalignedW += bp->b_flags & B_WRITE
            ? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;    
        misaligned  += (xp->xb_blkno % OFFSET) ? 1 : 0;
        @zeroalign[(bp->b_bcount % SIZE) ? 
                    "NOT MULTIPLE" : "MULTIPLE"] = 
                    quantize(xp->xb_blkno % OFFSET);
        ltsize      += bp->b_bcount < SIZE ? 1 : 0; /* # of IOs under 16K */
        ltsizeR     += (bp->b_bcount < SIZE) 
            ? (bp->b_flags & B_READ ? 1 : 0) : 0;
        ltsizeW     += (bp->b_bcount < SIZE) 
            ? (bp->b_flags & B_WRITE ? 1 : 0) : 0;
        notmult     += (bp->b_bcount % SIZE) ? 1 : 0;
        total_bytes += bp->b_bcount;
        @align[(xp->xb_blkno % OFFSET) 
                ? "UNALIGNED(bufsize)" : "ALIGNED(bufsize)"] =
                quantize(bp->b_bcount);
    }
    END {
        pct_ltsizeR     = issued > 0 ? 100 * ltsizeR / issued : 0;
        pct_ltsizeW     = issued > 0 ? 100 * ltsizeW / issued : 0;
        pct_ltsize      = issued > 0 ? 100 * ltsize / issued : 0;
        pct_misalign    = issued > 0 ? 100 * misaligned / issued : 0;
        pct_notmult     = issued > 0 ? 100 * notmult / issued : 0;
        printf("=========================================\n");
        printf("================ Summary ================\n");
        printf("=========================================\n");
        printf("Short Reads: %u%% Short Writes: %u%%\n", 
                pct_ltsizeR, pct_ltsizeW);
        printf("Unaligned Reads: %u Unaligned Writes: %u\n",
                misalignedR, misalignedW);
        printf("Bytes Processed: %u\nTotal IOs Issued: %u\nTotal IOs Smaller than %d: %u%%\nNot multiple of %d: %d%%\nUnaligned: %d%%\n", 
        total_bytes, issued, SIZE, pct_ltsize, SIZE, pct_notmult, pct_misalign);
        printf("=========================================\n");
        printa(@align);
        printa(@zeroalign);
    }'