[Dtrace IO snippets] Various pieces dealing with low-level SCSI information #tags: io probes, io, dtrace, device alignment, queue depth, latency, throughput, bandwidth, IO-size, seek size, seeking, biowait, biodone
dtrace -qn '
::biowait:entry
/ execname != "sched" && execname != "fsflush" / {
this->start = timestamp ;
this->buf = args[0] ;
}
::biowait:return
/ this->start && execname != "sched" && execname != "fsflush" / {
this->delta = timestamp - this->start ;
/* This semaphore count may be interesting to track also */
this->sema = (sema_impl_t *)&this->buf->b_sem ;
/* Categorize results by whether or not there was an EIO */
this->err = args[1] > 0 ? "failed (EIO)" : "completed" ;
@q[execname, this->err] = quantize(this->delta) ;
this->start = 0 ;
}
END {
printa("\t [%s] | IO => %s%@d\n", @q);
}'
#!/usr/sbin/dtrace -s
/*
* seeksize.d - analyse disk head seek distance by process.
* Written using DTrace (Solaris 10 3/05).
*
* Disk I/O events caused by processes will in turn cause the disk heads
* to seek. This program analyses those seeks, so that we can determine
* if processes are causing the disks to seek in a "random" or "sequential"
* manner.
*
* 15-Jun-2005, ver 1.00
*
* USAGE: seeksize.d # wait several seconds, then hit Ctrl-C
*
* FIELDS:
* PID process ID
* CMD command and argument list
* value distance in disk blocks (sectors)
* count number of I/O operations
*
* SEE ALSO: bitesize.d, iosnoop
*
* Standard Disclaimer: This is freeware, use at your own risk.
*
* 11-Sep-2004 Brendan Gregg Created this.
* 10-Oct-2004 " " Rewrote to use the io provider.
*/
#pragma D option quiet
/*
* Print header
*/
dtrace:::BEGIN
{
printf("Sampling... Hit Ctrl-C to end.\n");
}
self int last[dev_t];
/*
* Process io start
*/
io:::start
{
/* fetch details */
this->dev = args[0]->b_edev;
this->blk = args[0]->b_blkno;
this->size = args[0]->b_bcount;
cmd = (string)curpsinfo->pr_psargs;
}
io:::start
/self->last[this->dev] != 0/
{
/* calculate seek distance */
this->dist = this->blk - self->last[this->dev] > 0 ?
this->blk - self->last[this->dev] :
self->last[this->dev] - this->blk;
/* store details */
@Size[pid,cmd] = quantize(this->dist);
}
io:::start
{
/* save last position of disk head */
self->last[this->dev] = this->blk + this->size / 512;
}
/*
* Print final report
*/
dtrace:::END
{
printf("\n%8s %s\n","PID","CMD");
printa("%8d %s\n%@d\n",@Size);
}
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
* This script provides useful information about distances we have to seek,
* as well as direction and sequential vs. random IO observations.
* We account for which disk the seeking pertains to, so the distances
* measured are relevant to each device, but numbers are reported
* cumulatively for all devices. It may be useful to report an average seeking
* distance for all IOs that we observe in a sample period.
*/
self long last[dev_t];
io:::start
/self->last[args[0]->b_edev] != 0/
{
this->iotype = args[0]->b_flags & B_WRITE ? "WRITE" :
args[0]->b_flags & B_READ ? "READ" : "NOT R/W";
this->last = self->last[args[0]->b_edev];
this->dist = (int)(args[0]->b_blkno - this->last) > 0 ?
args[0]->b_blkno - this->last : this->last - args[0]->b_blkno;
this->fwd = args[0]->b_blkno +
(args[0]->b_bcount / 512) > this->last ? 1 : 0;
this->seq = this->dist == 0 ? 0 : 1;
this->direction = args[0]->b_blkno - this->last > 0 ? "RAND FW =>" :
this->last - args[0]->b_blkno > 0 ? "<= RAND REV" :
this->last == args[0]->b_blkno && this->fwd ?
"SEQ FW =>" : "<= SEQ REV";
@gap[this->iotype, this->direction] = quantize(this->dist);
}
io:::start
{
self->last[args[0]->b_edev] = args[0]->b_blkno +
args[0]->b_bcount / 512;
}
dtrace:::END
{
printa(" IO Type: %-8s Direction: %-12s\n%@d\n", @gap);
}
# Check for entry into power management functions in sd driver.
dtrace -qn '
tick-1sec {
ts = walltimestamp;
}
:::pm_lower_power:entry,
:::sd_pm*:entry,
:::sd_setup_pm:entry,
:::sdpower:entry {@[ts, probefunc] = count();
}
tick-10sec {
printa("%Y,%s,%@d\n", @);
trunc(@);
}'
dtrace -qn '
tick-1sec {
ts = walltimestamp;
x = 1;
}
:::sd_pm_entry:entry /x/ {
this->inq = ((struct sd_lun *)args[0])->un_sd->sd_inq;
this->p = stringof(args[0]->un_sd->sd_inq->inq_pid);
printf("%Y,%s,%s\n", ts, stringof(this->inq->inq_vid), stringof(this->inq->inq_pid));
}'
dtrace -qn '::sd_enable_descr_sense:entry {print(*(struct sd_lun *)args[0]->ssc_un);}'
// How long does it take for system to deal with a mode page change?
// In some cases we notice this could be rather slow, i.e. seconds.
dtrace -qn '
::sd_enable_descr_sense:entry { self->in=timestamp; }
::sd_enable_descr_sense:return {
printf("%d\n", (timestamp - self->in) / 1000);
}'
dtrace -qn '
inline int threshold = 500000;
inline int nano_to_micro = 1000;
BEGIN {
start = timestamp;
}
::sd_start_cmds:entry {
self->a = args[0];
self->s = timestamp;
self->devi = (struct dev_info *)args[0]->un_sd->sd_dev;
@ioc = count();
}
::sd_start_cmds:return /timestamp - self->s > threshold/ {
this->devname = (struct dev_info *)self->devi->devi_devid_str;
@q[stringof(this->devname)] = quantize(timestamp - self->s);
@iolc[stringof(this->devname)] = count();
@longest[stringof(this->devname)] = max(timestamp - self->s);
}
::sd_start_cmds:return {
@avlat = avg(timestamp - self->s);
self->a = 0; self->s = 0; self->devi = 0;
}
END {
this->runt = (timestamp - start) / 1000000000;
normalize(@avlat, nano_to_micro);
normalize(@longest, nano_to_micro);
normalize(@ioc, this->runt);
printa("Device: %s\n# Long IOs: %@d Longest IO: %@d(us)\n%@d\n",
@iolc, @longest, @q);
printf("Runtime: %d(sec)\n", this->runt);
printa("Average IO/sec: %@d Average Latency: %@d(us)\n", @ioc, @avlat);
}'
// This should give us a bit of information about a failed command returned
// to caller. In some cases we may get a path and in most cases we should have
// exec'ing command.
dtrace -qn '
int x;
::sd_return_failed_command:entry {
++x;
this->path = args[1]->b_file != NULL ?
stringof(args[1]->b_file->v_path) : "nopath";
this->proc = args[1]->b_proc != NULL ?
stringof(args[1]->b_proc->p_exec->v_path) : "noproc";
@fail[args[0], this->path, this->proc] = count();
}
tick-10sec /x/ {
printa("ptr: %p path: %s cmd: %s = %@d\n", @fail);
trunc(@fail); x = 0;
}'
dtrace -qn '
/*
* All Reads and Writes go through this function, so it is a convenient
* place to take counts.
*/
BEGIN { start = timestamp ; }
::bioerror:entry /args[0]->b_flags&B_READ/ {
@r = sum(args[0]->b_flags&B_READ ? 1 : 0);
@rsz = sum(args[0]->b_bcount >> 10);
}
::bioerror:entry /args[0]->b_flags&B_WRITE/ {
@w = sum(args[0]->b_flags&B_WRITE ? 1 : 0);
@wsz = sum(args[0]->b_bcount >> 10);
}
END {
normalize(@r, (timestamp - start) / 1000000000);
normalize(@w, (timestamp - start) / 1000000000);
normalize(@rsz, (timestamp - start) / 1000000000);
normalize(@wsz, (timestamp - start) / 1000000000);
printa("Av_R/s: %@d Av_KBr/s: %@d Av_W/s: %@d Av_KBw/s: %@d\n",
@r, @rsz, @w, @wsz);
}'
dtrace -qn '
/* This value may require adjustment, depending upon amount IO a system does. */
#pragma D option dynvarsize=8m
string scsi_cmd[uchar_t];
dtrace:::BEGIN
{
/* See /usr/include/sys/scsi/generic/commands.h for the full list. */
scsi_cmd[0x00] = "test_unit_ready";
scsi_cmd[0x08] = "read";
scsi_cmd[0x0a] = "write";
scsi_cmd[0x12] = "inquiry";
scsi_cmd[0x15] = "mode_select";
scsi_cmd[0x17] = "release";
scsi_cmd[0x1a] = "mode_sense";
scsi_cmd[0x1b] = "load/start/stop";
scsi_cmd[0x1c] = "get_diagnostic_results";
scsi_cmd[0x1d] = "send_diagnostic_command";
scsi_cmd[0x25] = "read_capacity";
scsi_cmd[0x28] = "read10";
scsi_cmd[0x2a] = "write10";
scsi_cmd[0x35] = "synchronize_cache";
scsi_cmd[0x4d] = "log_sense";
scsi_cmd[0x5e] = "persistent_reserve_in";
scsi_cmd[0xa0] = "report_luns";
}
BEGIN {
ts = walltimestamp;
printf("timestamp,cmd,reason,avg.lat,stddev.lat,min.lat,max.lat,lat.1000us,lat.2000us,lat.4000us,lat.8000us,lat.16000us,lat.32000us,lat.64000us,lat.gt.64000us,count.ios\n");
}
fbt::scsi_transport:entry
{
start[arg0] = timestamp;
}
fbt::scsi_destroy_pkt:entry
/start[arg0]/
{
this->delta = (timestamp - start[arg0]) / 1000;
start[arg0] = 0;
this->code = *args[0]->pkt_cdbp;
this->cmd = scsi_cmd[this->code] != NULL ?
scsi_cmd[this->code] : lltostr(this->code);
this->reason = args[0]->pkt_reason == 0 ? "Success" :
strjoin("Fail:", lltostr(args[0]->pkt_reason));
/* There may be a small error in terms of count measured in @cnt, when
* compared against summing all latency buckets. This is hopefully along
* the lines of >1% or less. We expect that in most cases the two numbers
* should be identical.
*/
@cnt[ts, this->cmd, this->reason] = count();
@latavg[ts, this->cmd, this->reason] = avg(this->delta);
@latsd[ts, this->cmd, this->reason] = stddev(this->delta);
@latmin[ts, this->cmd, this->reason] = min(this->delta);
@latmax[ts, this->cmd, this->reason] = max(this->delta);
/*
* Bucket latencies into buckets from 1000 to above 64K microseconds,
* starting small and then increasing rapidly, meaning error in reporting
* will be not scale linearly.
*/
@lat1000[ts, this->cmd, this->reason] = sum(this->delta > 0 &&
this->delta <= 1000 ? 1 : 0);
@lat2000[ts, this->cmd, this->reason] = sum(this->delta > 1000 &&
this->delta <= 2000 ? 1 : 0);
@lat4000[ts, this->cmd, this->reason] = sum(this->delta > 2000 &&
this->delta <= 4000 ? 1 : 0);
@lat8000[ts, this->cmd, this->reason] = sum(this->delta > 4000 &&
this->delta <= 8000 ? 1 : 0);
@lat16000[ts, this->cmd, this->reason] = sum(this->delta > 8000 &&
this->delta <= 16000 ? 1 : 0);
@lat32000[ts, this->cmd, this->reason] = sum(this->delta > 16000 &&
this->delta <= 32000 ? 1 : 0);
@lat64000[ts, this->cmd, this->reason] = sum(this->delta > 32000 &&
this->delta <= 64000 ? 1 : 0);
@latgt64k[ts, this->cmd, this->reason] = sum(this->delta > 64000 ? 1 : 0);
}
tick-1sec
{
printa("%Y,%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@latavg, @latsd, @latmin, @latmax, @lat1000, @lat2000, @lat4000,
@lat8000, @lat16000, @lat32000, @lat64000, @latgt64k, @cnt);
trunc(@lat1000); trunc(@lat2000); trunc(@lat4000);
trunc(@lat8000); trunc(@lat16000); trunc(@lat32000);
trunc(@lat64000); trunc(@latgt64k);
trunc(@cnt); trunc(@latavg); trunc(@latsd); trunc(@latmin); trunc(@latmax);
ts = walltimestamp;
}'
dtrace -qn '
/* This value may require adjustment, depending upon amount IO a system does. */
#pragma D option dynvarsize=8m
/* This version will not work on earlier versions of bsros than
* BrickStorOS bsr-fdb711e2 15.07 15.0724.001D i86pc i386 i86pc
*/
string scsi_cmd[uchar_t];
dtrace:::BEGIN
{
/* See /usr/include/sys/scsi/generic/commands.h for the full list. */
scsi_cmd[0x00] = "test_unit_ready";
scsi_cmd[0x03] = "request_sense";
scsi_cmd[0x08] = "read";
scsi_cmd[0x0a] = "write";
scsi_cmd[0x12] = "inquiry";
scsi_cmd[0x15] = "mode_select";
scsi_cmd[0x17] = "release";
scsi_cmd[0x1a] = "mode_sense";
scsi_cmd[0x1b] = "load/start/stop";
scsi_cmd[0x1c] = "get_diagnostic_results";
scsi_cmd[0x1d] = "send_diagnostic_command";
scsi_cmd[0x25] = "read_capacity";
scsi_cmd[0x28] = "read10";
scsi_cmd[0x2a] = "write10";
scsi_cmd[0x35] = "synchronize_cache";
scsi_cmd[0x37] = "read_defect_data10";
scsi_cmd[0x4d] = "log_sense";
scsi_cmd[0x5e] = "persistent_reserve_in";
scsi_cmd[0x88] = "read16";
scsi_cmd[0x8a] = "write16";
scsi_cmd[0xa0] = "report_luns";
scsi_cmd[0xb7] = "read_defect_data12";
}
/* This is a fairly wide table, and some columns can benefit from a bit of
* explanation. This is a description of all columns in order, all latency
* bucket columns are lat.XXXXus, where XXXX is a unit of time.
*
* timestamp - time event logged
* cmd - scsi command, one of scs_cmd[] above
* reason - usually indicates success or failure of command
* avg.lat - on average how long this group of command takes to complete
* combined.lat - sum of latencies for all instances of this type of command
* min.lat - smallest recorded latency value
* max.lat - largest recorded latency value
* lat.lt.100us - anything under 100 microseconds
* lat.XXXXus - histogram of latency, each bucket is 2x previous bucket
* lat.gt.64000us - anything above 64K microseconds (64milliseconds)
* count.ios - number of IOs of given command
* avg.qd.drv - average queue size in the drive
* min.qd.drv - smallest recorded queue size in the drive
* max.qd.drv - largest recorded queue size in the drive
* avg.qd.xport - average number of IOs in transport
* min.qd.xport - smallest recorded number of IOs in transport
* max.qd.xport - largest recorded number of IOs in transport
*/
BEGIN {
ts = walltimestamp;
printf("timestamp,cmd,reason,avg.lat,combined.lat,min.lat,max.lat,lat.lt.100us,lat.1000us,lat.2000us,lat.4000us,lat.8000us,lat.16000us,lat.32000us,lat.64000us,lat.gt.64000us,count.ios,avg.qd.drv,min.qd.drv,max.qd.drv,avg.qd.xport,min.qd.xport,max.qd.xport\n");
}
::sd_start_cmds:entry
{
x[args[0]->un_sd] = args[0];
}
fbt::scsi_transport:entry /x[args[0]->pkt_address.a.a_sd] !=NULL /
{
start[arg0] = timestamp;
in_xport[arg0] = x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport;
in_drv[arg0] = x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver;
/* x[args[0]->pkt_address.a.a_sd] = 0; clear contents of array */
}
/*
* Depending on version of OS, this may need to be fbt::scsi_destroy_pkt:entry,
* or fbt::vhci_scsi_destroy_pkt:entry. In the first case, args[0] is where
* the pkt_cdbp should be coming from, and args[1] otherwise.
*/
fbt::vhci_scsi_destroy_pkt:entry
/start[arg1]/
{
this->in_xport = in_xport[arg1];
this->in_drv = in_drv[arg1];
this->delta = timestamp - start[arg1];
in_xport[arg1] = 0;
in_drv[arg1] = 0;
start[arg1] = 0;
this->code = *args[1]->pkt_cdbp;
this->cmd = scsi_cmd[this->code] != NULL ?
scsi_cmd[this->code] : lltostr(this->code);
this->reason = args[1]->pkt_reason == 0 ? "Success" :
strjoin("Fail:", lltostr(args[1]->pkt_reason));
/* There may be a small error in terms of count measured in @cnt, when
* compared against summing all latency buckets. This is hopefully along
* the lines of >1% or less. We expect that in most cases the two numbers
* should be identical.
*
* Do not add standard deviation calculation to this tool. Due to a bug
* in printing aggregations, standard deviation may result in a
* divisor != 0 assert being triggered.
*/
@cnt[ts, this->cmd, this->reason] = count();
@latavg[ts, this->cmd, this->reason] = avg(this->delta);
@latsum[ts, this->cmd, this->reason] = sum(this->delta);
@latmin[ts, this->cmd, this->reason] = min(this->delta);
@latmax[ts, this->cmd, this->reason] = max(this->delta);
/* Track queue depth in the drive for each sd_lun and in transport. */
@qdxpavg[ts, this->cmd, this->reason] = avg(this->in_xport);
@qddrvavg[ts, this->cmd, this->reason] = avg(this->in_drv);
@qdxpmin[ts, this->cmd, this->reason] = min(this->in_xport);
@qddrvmin[ts, this->cmd, this->reason] = min(this->in_drv);
@qdxpmax[ts, this->cmd, this->reason] = max(this->in_xport);
@qddrvmax[ts, this->cmd, this->reason] = max(this->in_drv);
/*
* Bucket latencies into buckets from 1000 to above 64K microseconds,
* starting small and then increasing rapidly, meaning error in reporting
* will be not scale linearly.
*/
@latlt100[ts, this->cmd, this->reason] = sum(this->delta > 0 &&
this->delta <= 100000 ? 1 : 0);
@lat1000[ts, this->cmd, this->reason] = sum(this->delta > 100000 &&
this->delta <= 1000000 ? 1 : 0);
@lat2000[ts, this->cmd, this->reason] = sum(this->delta > 1000000 &&
this->delta <= 2000000 ? 1 : 0);
@lat4000[ts, this->cmd, this->reason] = sum(this->delta > 2000000 &&
this->delta <= 4000000 ? 1 : 0);
@lat8000[ts, this->cmd, this->reason] = sum(this->delta > 4000000 &&
this->delta <= 8000000 ? 1 : 0);
@lat16000[ts, this->cmd, this->reason] = sum(this->delta > 8000000 &&
this->delta <= 16000000 ? 1 : 0);
@lat32000[ts, this->cmd, this->reason] = sum(this->delta > 16000000 &&
this->delta <= 32000000 ? 1 : 0);
@lat64000[ts, this->cmd, this->reason] = sum(this->delta > 32000000 &&
this->delta <= 64000000 ? 1 : 0);
@latgt64k[ts, this->cmd, this->reason] = sum(this->delta > 64000000 ? 1 : 0);
}
tick-1sec
{
/* Instead of delta divided by 1000 to convert to microseconds we
* normalize the aggregation by 1000, i.e. convert nanoseconds to
* microseconds.
*/
normalize(@latavg, 1000);
normalize(@latsum, 1000);
normalize(@latmin, 1000);
normalize(@latmax, 1000);
printa("%Y,%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@latavg, @latsum, @latmin, @latmax, @latlt100, @lat1000, @lat2000, @lat4000,
@lat8000, @lat16000, @lat32000, @lat64000, @latgt64k, @cnt,
@qddrvavg, @qddrvmin, @qddrvmax, @qdxpavg, @qdxpmin, @qdxpmax
);
trunc(@qdxpavg); trunc(@qdxpmin); trunc(@qdxpmax);
trunc(@qddrvavg); trunc(@qddrvmin); trunc(@qddrvmax);
trunc(@latlt100); trunc(@lat1000); trunc(@lat2000); trunc(@lat4000);
trunc(@lat8000); trunc(@lat16000); trunc(@lat32000);
trunc(@lat64000); trunc(@latgt64k); trunc(@cnt); trunc(@latavg);
trunc(@latsum); trunc(@latmin); trunc(@latmax);
ts = walltimestamp;
}'
# Correlating sd_start_cmds and scsi_transport
dtrace -qn '::sd_start_cmds:entry {x[args[0]->un_sd] = 1} ::scsi_transport:entry /x[args[0]->pkt_address.a.a_sd] > 0/ {trace("X"); x[args[0]->pkt_address.a.a_sd] = 0;}'
dtrace -qn '
::sd_start_cmds:entry
{
x[args[0]->un_sd] = args[0]
}
::scsi_transport:entry /x[args[0]->pkt_address.a.a_sd] !=NULL /
{
@qavx["in-xport"] =
avg(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport);
@qavsd["in-sd"] =
avg(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver);
@qminx["in-xport"] =
min(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport);
@qminsd["in-sd"] =
min(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver);
@qmaxx["in-xport"] =
max(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport);
@qmaxsd["in-sd"] =
max(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver);
}
tick-30sec {
printa("xport avgq: %@d sd avgq: %@d xport minq: %@d sd minq: %@d xport maxq: %@d sd maxq: %@d\n",
@qavx, @qavsd, @qminx, @qminsd, @qmaxx, @qmaxsd);
trunc(@qavx); trunc(@qavsd);
trunc(@qminx); trunc(@qminsd);
trunc(@qmaxx); trunc(@qmaxsd);
}'
***
dtrace -qn '
io:::start {
start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
}
io:::done
/start[args[0]->b_edev, args[0]->b_blkno]/ {
this->op = args[0]->b_flags & B_READ ? "Read" : "Write";
this->devp = strtok(args[1]->dev_pathname, ":");
this->elapsed = timestamp -
start[args[0]->b_edev, args[0]->b_blkno];
this->b = args[0]->b_bcount >= 1 << 20 ? 1 << 20 :
args[0]->b_bcount >= 1 << 19 ? 1 << 19 :
args[0]->b_bcount >= 1 << 18 ? 1 << 18 :
args[0]->b_bcount >= 1 << 17 ? 1 << 17 :
args[0]->b_bcount >= 1 << 16 ? 1 << 16 :
args[0]->b_bcount >= 1 << 18 ? 1 << 18 :
args[0]->b_bcount >= 1 << 17 ? 1 << 17 :
args[0]->b_bcount >= 1 << 16 ? 1 << 16 :
args[0]->b_bcount >= 1 << 15 ? 1 << 15 :
args[0]->b_bcount >= 1 << 14 ? 1 << 14 :
args[0]->b_bcount >= 1 << 13 ? 1 << 13 :
args[0]->b_bcount >= 1 << 12 ? 1 << 12 :
args[0]->b_bcount >= 1 << 11 ? 1 << 11 :
args[0]->b_bcount >= 1 << 10 ? 1 << 10 :
args[0]->b_bcount >= 1 << 9 ? 1 << 9 :
args[0]->b_bcount >= 1 << 8 ? 1 << 8 :
args[0]->b_bcount >= 1 << 7 ? 1 << 7 :
args[0]->b_bcount >= 1 << 6 ? 1 << 6 :
args[0]->b_bcount >= 1 << 5 ? 1 << 5 :
args[0]->b_bcount >= 1 << 4 ? 1 << 4 :
args[0]->b_bcount >= 1 << 3 ? 1 << 3 :
args[0]->b_bcount >= 1 << 2 ? 1 << 2 :
args[0]->b_bcount >= 1 << 1 ? 2 :
args[0]->b_bcount >= 1 << 0 ? 1 : 0;
/*
* To get much more verbose output, which breaks this further down
* by individual device, use this commented aggregate instead of
* one we are currently using.
*
* @[this->op, this->b, args[1]->dev_statname, this->devp] =
* quantize(this->elapsed / 1000);
* And, use this printa instead of one used now.
* printa("\t %s(b)=%d <%s> %s%@d\n", @);
*/
@h[this->op, this->b] = quantize(this->elapsed / 1000); /* latency in uS */
@av[this->op, this->b] = avg(this->elapsed / 1000);
@tot[this->op, this->b] = sum(this->elapsed / 1000);
@tp[this->op, this->b] = avg((args[0]->b_bcount * 976562) / this->elapsed);
start[args[0]->b_edev, args[0]->b_blkno] = 0;
}
END {
normalize(@tot, 1000);
printa("\t %s = %d(b) | AvLat = %@d(uS) | Active = %@d(mS) | Tput = %@d(KB/s)%@d\n",
@av, @tot, @tp, @h);
}'
#!/usr/sbin/dtrace -qCs
/*
* Script produces histograms to give a sense of how much IO is queued
* in the driver and in transport. Histogram seems to be a better means
* of examining this information as opposed to averages, mins and maxes.
*/
#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
#define DEV_NAME(un) stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name)
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)
/*
This script should be run in this manner, which gives us ability to control
how long the script will run before it returns.
# dtrace -qCs ./x -n 'tick-5sec { exit(0) }'
Result will look something like this, where a serial number is given,
if known, and not a "" string, and instance corresponds to what you
get with `iostat -e`.
[In Driver]> ZA204NDT (sd2)
value ------------- Distribution ------------- count
1 | 0
2 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 24
3 | 0
*/
::sd_start_cmds:entry {
this->serial = stringof(args[0]->un_sd->sd_inq->inq_serial) != "" ? stringof(args[0]->un_sd->sd_inq->inq_serial) : "UNKNOWN";
this->devname = DEV_NAME(args[0]);
this->devinst = DEV_INST(args[0]);
@htrans[this->serial, this->devname, this->devinst] =
lquantize(args[0]->un_ncmds_in_transport, 0, 32, 1);
@hdriv[this->serial, this->devname, this->devinst] =
lquantize(args[0]->un_ncmds_in_driver, 0, 32, 1);
}
END {
printa("\t [In Driver]> %-16s\t(%s%d) %@d\n", @hdriv);
printa("\t [In Transport]> %-16s\t(%s%d) %@d\n", @htrans);
}
#!/usr/sbin/dtrace -qCs
/* Script tracks max and average number of commands queued. */
#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
#define DEV_NAME(un) stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name)
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)
/*
This script should be run in this manner, which gives us ability to control
how long the script will run before it returns.
# dtrace -qCs ./x -n 'tick-5sec { exit(0) }'
Result will look something like this, where a serial number is given,
if known, and not a "" string, and instance corresponds to what you
get with `iostat -e`.
Device Serial Instance Max.Queued Avg.Queued Max.Transp Avg.Transp
------------- -------- ---------- ---------- ---------- ----------
Z3G011B2 sd12 17 6 16 5
Z3G011G8 sd10 19 6 18 5
...trimmed...
*/
::sd_start_cmds:entry {
this->serial = stringof(args[0]->un_sd->sd_inq->inq_serial) != "" ? stringof(args[0]->un_sd->sd_inq->inq_serial) : "UNKNOWN";
this->devname = DEV_NAME(args[0]);
this->devinst = DEV_INST(args[0]);
@md[this->serial, this->devname, this->devinst] =
max(args[0]->un_ncmds_in_driver);
@mt[this->serial, this->devname, this->devinst] =
max(args[0]->un_ncmds_in_transport);
@ad[this->serial, this->devname, this->devinst] =
avg(args[0]->un_ncmds_in_driver);
@at[this->serial, this->devname, this->devinst] =
avg(args[0]->un_ncmds_in_transport);
}
END {
printf("%-16s %-8s %-10s %-10s %-10s %-10s\n", "Device Serial", "Instance",
"Max.Queued", "Avg.Queued", "Max.Transp", "Avg.Transp");
printf("%-16s %-8s %-10s %-10s %-10s %-10s\n", "-------------", "--------",
"----------", "----------", "----------", "----------");
printa("%-16s %s%-6d %-10@d %-10@d %-10@d %-10@d\n", @md, @ad, @mt, @at);
}
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
/* Not using these for now, but they could offer more information
* about devices.
*/
#define DEV_NAME(un) \
stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name) /* ` */
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)
self long long last_byte_offset[struct sd_lun *];
::sd_core_iostart:entry {
this->phys_blksz = args[1]->un_phy_blocksize;
this->blk = args[2]->_b_blkno._p._l;
this->size = args[2]->b_bcount;
this->dev = args[1];
}
::sd_core_iostart:entry /self->last_byte_offset[this->dev] != 0/ {
/*
* Create buckets to group distances by IO size. The intent is to measure
* whether certain patterns are evident, like all small IOs seem to cluster
* in a particular area of a disk or are they evenly spread out, etc.
*/
this->bucket =
this->size <= 4096 ? "<= 4K" :
this->size > 4096 && this->size <= 0x8000 ?
"> 4K to 32K" :
this->size > 0x8000 && this->size <= 0x10000 ?
"> 32K to 64K" : "> 64K" ;
this->dist = this->blk - self->last_byte_offset[this->dev] > 0 ?
this->blk - self->last_byte_offset[this->dev] :
self->last_byte_offset[this->dev] - this->blk;
@seeks[stringof(SD_TO_DEVINFO(this->dev)->devi_devid_str),
this->bucket] = quantize(this->dist);
}
::sd_core_iostart:entry {
/* Record offset on device accounting for size of last IO done. */
self->last_byte_offset[args[1]] =
this->blk + (this->size / this->phys_blksz);
}
END {
printa("\t %s | IO Size: [ %s ] %@d\n", @seeks);
}
dtrace -qn '
/*
Script measures and reports every 10 seconds read and write latency,
IOPs, and throughput. This is a useful script to watch, while tuning
ZFS write throttle mechanism. The goal is to have acceptable latency
while also maximizing possible throughput to disk drives.
*/
BEGIN
{
start = timestamp;
line = "------------------------------------------------------------------------";
}
io:::start {
ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
}
io:::done /ts[args[0]->b_edev, args[0]->b_lblkno]/ {
this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ?
"Read" : "Write";
this->size = args[0]->b_bcount;
this->buck = 512; /* We assume this is smallest possible size */
this->buck = ( this->size > 512 ) ? 1024 : this->buck ;
this->buck = ( this->size > 1024 ) ? 2048 : this->buck ;
this->buck = ( this->size > 4096 ) ? 8192 : this->buck ;
this->buck = ( this->size > 8192 ) ? 16384 : this->buck ;
this->buck = ( this->size > 16384 ) ? 32768 : this->buck ;
this->buck = ( this->size > 32768 ) ? 65536 : this->buck ;
this->buck = ( this->size > 65536 ) ? 131072 : this->buck ;
this->buck = ( this->size > 131072 ) ? 262144 : this->buck ;
this->buck = ( this->size > 262144 ) ? 524288 : this->buck ;
this->buck = ( this->size > 524288 ) ? 1048576 : this->buck ;
@iosz[this->name] = quantize(this->buck);
@q[this->name] = quantize(this->delta);
@a[this->name] = avg(this->delta);
@v[this->name] = stddev(this->delta);
@i[this->name] = count();
@b[this->name] = sum(args[0]->b_bcount);
ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}
tick-10sec {
printf("\n%80s\n", line);
printa("%14s I/Os by size: %@d\n", @iosz);
printa("%14s latency(uS): %@d\n", @q);
/*
* Adjust for the runtime of each sample, i.e. calculate total seconds
* since last timestamp, and normalize the data, to get true per second
* values.
*/
normalize(@i, (timestamp - start) / 1000000000);
normalize(@b, (timestamp - start) / 1000000000 * 1024);
printf("%-30s %11s %11s %11s %11s\n", "", "avg.lat(uS)", "stddev(uS)",
"iops", "throughput");
printa("%-30s %@9uus %@9uus %@9u/s %@8uk/s\n", @a, @v, @i, @b);
trunc(@iosz); trunc(@q); trunc(@a); trunc(@v); trunc(@i); trunc(@b);
start = timestamp;
}'
dtrace -qn '
int realtm, start;
/* int done[string]; */
inline const int NORM_KB = 1024;
inline const int NS_S = 1000000000;
inline const int NS_MS = 1000000;
/*
* Script measures and reports every 1 seconds read and write latency,
* IOPs, and throughput. This is a useful script to watch, while tuning
* ZFS write throttle mechanism. The goal is to have acceptable latency
* while also maximizing possible throughput to disk drives.
*
* This script is also ideal for pairing with Load Dynamix standardized
* tests that we run, as a means of correlating what happens at the disk
* level, vs. what happens at the application layer. We want to measure IOs
* and latency, which gives us a sense of service time at the application
* layer and see how that translates to IOs and latency at the disk layer.
*/
BEGIN
{
printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
"io.type", "runtime.ms", "min.lat", "max.lat", "avg.lat",
"max.io.size", "iops", "throughput.KBps",
"512b.ct", "1024b.ct", "2048b.ct", "4096b.ct", "8192b.ct",
"16384b.ct", "32768b.ct", "65536b.ct", "131072b.ct",
"total.ios", "active.ios", "arc.ios", "arc.io.bytes", "avg.bytes.per.us"
);
hdr = 1;
start = timestamp;
}
io:::start {
this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ? "R" : "W";
ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
@queued[this->name] = sum(1); /* increment count of active IOs */
}
/*
* There is no very direct correlation between IOs measured here
* and IOs resulting in physical bytes being read from the disks.
* It does help though to have some context about IOs that do not actually
* result in physical IO.
*/
::arc_read_done:entry {
@asz["R"] = sum((uint64_t)args[0]->io_size) ;
@aios["R"] = count() ;
}
io:::done /ts[args[0]->b_edev, args[0]->b_lblkno]/ {
this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ? "R" : "W";
}
io:::done /ts[args[0]->b_edev, args[0]->b_lblkno]/ {
this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ? "R" : "W";
this->size = args[0]->b_bcount; /* How many bytes in this IO? */
this->buck = 512; /* We assume this is smallest possible IO size. */
this->buck = ( this->size > 512 ) ? 1024 : this->buck ;
this->buck = ( this->size > 1024 ) ? 2048 : this->buck ;
this->buck = ( this->size > 4096 ) ? 8192 : this->buck ;
this->buck = ( this->size > 8192 ) ? 16384 : this->buck ;
this->buck = ( this->size > 16384 ) ? 32768 : this->buck ;
this->buck = ( this->size > 32768 ) ? 65536 : this->buck ;
this->buck = ( this->size > 65536 ) ? 131072 : this->buck ;
this->buck = ( this->size > 131072 ) ? 262144 : this->buck ;
this->buck = ( this->size > 262144 ) ? 524288 : this->buck ;
this->buck = ( this->size > 524288 ) ? 1048576 : this->buck ;
/*
* The size buckets are currently limited to 128K, from 512b to 128K,
* in base2 steps, i.e. 512, 1024, etc.
* We may want to add larger IO sizes, if we find that they happen.
*/
@lmin[this->name] = min(this->delta);
@lmax[this->name] = max(this->delta);
@a[this->name] = avg(this->delta);
/* Normalize to 1-second intervals */
@i[this->name] = count();
/* Average bytes per microsecond of IO time. */
this->bpus = this->delta > 0 ? (this->size / this->delta) : this->size;
@abus[this->name] = avg(this->size / this->delta);
/* Do not normalize (@ios) to get total I/Os per interval. */
@ios[this->name] = count();
@b[this->name] = sum(args[0]->b_bcount);
@queued[this->name] = sum(-1); /* decrement count of active IOs */
@maxiosz[this->name] = max(this->size);
@sz512[this->name] = sum(this->buck == 512 ? 1 : 0 );
@sz1024[this->name] = sum(this->buck == 1024 ? 1 : 0 );
@sz2048[this->name] = sum(this->buck == 2048 ? 1 : 0 );
@sz4096[this->name] = sum(this->buck == 4096 ? 1 : 0 );
@sz8192[this->name] = sum(this->buck == 8192 ? 1 : 0 );
@sz16384[this->name] = sum(this->buck == 16384 ? 1 : 0 );
@sz32768[this->name] = sum(this->buck == 32768 ? 1 : 0 );
@sz65536[this->name] = sum(this->buck == 65536 ? 1 : 0 );
@sz131072[this->name] = sum(this->buck == 131072 ? 1 : 0 );
ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}
tick-5sec {
this->delta = (timestamp - start) / NS_S;
/*
* This is a hack, it takes advantage of the fact that realtm value will
* always increase, and never decrease. Because we do not have an aggregate
* for storing only last value, this is the best we can do. We are
* effectively storing latest computed value.
*/
@t["R"] = max(realtm);
@t["W"] = max(realtm);
/*
* Normalizing by 1024 can introduce sample error when the value being
* divided is less than 1024. Since there is no floating point division,
* resulting value will be 0, which if data is routinely less than 1024,
* can create a serious discontinuity between real data and normalized
* data.
*/
normalize(@b, NORM_KB);
normalize(@t, NS_MS);
normalize(@i, 5); /* adjust based on tick-$value */
printa(
"%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@t, @lmin, @lmax, @a, @maxiosz, @i, @b,
@sz512, @sz1024, @sz2048, @sz4096, @sz8192, @sz16384,
@sz32768, @sz65536, @sz131072, @ios, @queued,
@aios, @asz, @abus);
/*
* Truncate aggregations and re-set timestamp for next sampling period.
*/
trunc(@lmin); trunc(@lmax); trunc(@a); trunc(@maxiosz); trunc(@i);
trunc(@b); trunc(@sz512); trunc(@sz1024); trunc(@sz2048); trunc(@sz4096);
trunc(@sz8192); trunc(@sz16384); trunc(@sz32768); trunc(@sz65536);
trunc(@sz131072); trunc(@ios); trunc(@aios); trunc(@asz); trunc(@abus);
realtm = realtm + (timestamp - start);
start = timestamp;
}'
// Measure how much IO is aligned vs. not-aligned to 16K
// physical device size.
dtrace -qn '
long total_bytes, issued, notmult;
long misaligned, misalignedR, misalignedW, ltsize, ltsizeR, ltsizeW;
BEGIN {
issued = 0;
misaligned = 0;
notmult = 0;
ltsize = 0;
ltsizeR = 0;
ltsizeW = 0;
total_bytes = 0;
OFFSET = 32;
SIZE=16384; /* 16K blocksize */
}
fbt:sd:sd_core_iostart:entry {
issued++;
bp = (struct buf *) arg2;
xp = (struct sd_xbuf *)((bp)->b_private);
misalignedR += bp->b_flags & B_READ
? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;
misalignedW += bp->b_flags & B_WRITE
? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;
misaligned += (xp->xb_blkno % OFFSET) ? 1 : 0;
@zeroalign[(bp->b_bcount % SIZE) ?
"NOT MULTIPLE" : "MULTIPLE"] =
quantize(xp->xb_blkno % OFFSET);
ltsize += bp->b_bcount < SIZE ? 1 : 0; /* # of IOs under 16K */
ltsizeR += (bp->b_bcount < SIZE)
? (bp->b_flags & B_READ ? 1 : 0) : 0;
ltsizeW += (bp->b_bcount < SIZE)
? (bp->b_flags & B_WRITE ? 1 : 0) : 0;
notmult += (bp->b_bcount % SIZE) ? 1 : 0;
total_bytes += bp->b_bcount;
@align[(xp->xb_blkno % OFFSET)
? "UNALIGNED(bufsize)" : "ALIGNED(bufsize)"] =
quantize(bp->b_bcount);
}
END {
pct_ltsizeR = issued > 0 ? 100 * ltsizeR / issued : 0;
pct_ltsizeW = issued > 0 ? 100 * ltsizeW / issued : 0;
pct_ltsize = issued > 0 ? 100 * ltsize / issued : 0;
pct_misalign = issued > 0 ? 100 * misaligned / issued : 0;
pct_notmult = issued > 0 ? 100 * notmult / issued : 0;
printf("=========================================\n");
printf("================ Summary ================\n");
printf("=========================================\n");
printf("Short Reads: %u%% Short Writes: %u%%\n",
pct_ltsizeR, pct_ltsizeW);
printf("Unaligned Reads: %u Unaligned Writes: %u\n",
misalignedR, misalignedW);
printf("Bytes Processed: %u\nTotal IOs Issued: %u\nTotal IOs Smaller than %d: %u%%\nNot multiple of %d: %d%%\nUnaligned: %d%%\n",
total_bytes, issued, SIZE, pct_ltsize, SIZE, pct_notmult, pct_misalign);
printf("=========================================\n");
printa(@align);
printa(@zeroalign);
}'
dtrace -qn '
/* First argument to this should be some blocksize, like 8192 for 8K. */
long total_bytes, issued, notmult;
long misaligned, misalignedR, misalignedW, ltsize, ltsizeR, ltsizeW;
BEGIN {
issued = 0;
misaligned = 0;
notmult = 0;
ltsize = 0;
ltsizeR = 0;
ltsizeW = 0;
total_bytes = 0;
OFFSET = 32;
SIZE=$1; /* XX blocksize */
}
fbt:sd:sd_core_iostart:entry {
issued++;
bp = (struct buf *) arg2;
xp = (struct sd_xbuf *)((bp)->b_private);
misalignedR += bp->b_flags & B_READ
? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;
misalignedW += bp->b_flags & B_WRITE
? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;
misaligned += (xp->xb_blkno % OFFSET) ? 1 : 0;
@zeroalign[(bp->b_bcount % SIZE) ?
"NOT MULTIPLE" : "MULTIPLE"] =
quantize(xp->xb_blkno % OFFSET);
ltsize += bp->b_bcount < SIZE ? 1 : 0; /* # of IOs under 16K */
ltsizeR += (bp->b_bcount < SIZE)
? (bp->b_flags & B_READ ? 1 : 0) : 0;
ltsizeW += (bp->b_bcount < SIZE)
? (bp->b_flags & B_WRITE ? 1 : 0) : 0;
notmult += (bp->b_bcount % SIZE) ? 1 : 0;
total_bytes += bp->b_bcount;
@align[(xp->xb_blkno % OFFSET)
? "UNALIGNED(bufsize)" : "ALIGNED(bufsize)"] =
quantize(bp->b_bcount);
}
END {
pct_ltsizeR = issued > 0 ? 100 * ltsizeR / issued : 0;
pct_ltsizeW = issued > 0 ? 100 * ltsizeW / issued : 0;
pct_ltsize = issued > 0 ? 100 * ltsize / issued : 0;
pct_misalign = issued > 0 ? 100 * misaligned / issued : 0;
pct_notmult = issued > 0 ? 100 * notmult / issued : 0;
printf("=========================================\n");
printf("================ Summary ================\n");
printf("=========================================\n");
printf("Short Reads: %u%% Short Writes: %u%%\n",
pct_ltsizeR, pct_ltsizeW);
printf("Unaligned Reads: %u Unaligned Writes: %u\n",
misalignedR, misalignedW);
printf("Bytes Processed: %u\nTotal IOs Issued: %u\nTotal IOs Smaller than %d: %u%%\nNot multiple of %d: %d%%\nUnaligned: %d%%\n",
total_bytes, issued, SIZE, pct_ltsize, SIZE, pct_notmult, pct_misalign);
printf("=========================================\n");
printa(@align);
printa(@zeroalign);
}'