szaydel
3/19/2018 - 4:40 PM

DTrace script for monitoring the sd driver's sd_ready_and_valid function

In some instances a drive fails and, while still seemingly online, reports that it is not ready. sd_ready_and_valid is the check that callers make to ensure the device is usable. A non-zero result means there is a problem with the given device.

#!/usr/sbin/dtrace -Cs
#pragma D option quiet
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2018 Sam Zaydel / RackTop Systems.
 *
 * sd-ready-valid-csv.d
 *
 * Description:
 * Script tracks return code from sd_ready_and_valid function, which tells
 * the caller (sdopen or sdioctl) whether a given drive is usable. 
 * When a device fails this test a non-zero value is returned and depending 
 * on state of drive we should see messages in the kernel log along the lines
 * of `drive offline`.
 */

/*
 * Helper macros (expanded by cpp, hence the -C flag on the interpreter line).
 * "un" is a pointer to the sd driver's per-unit soft state.
 *
 * SD_TO_DEVINFO(un)  follows un->un_sd->sd_dev and casts it to
 *                    struct dev_info *.
 * DEV_NAME(un)       indexes the kernel devnamesp[] table by the node's
 *                    devi_major and returns dn_name as a D string.
 *                    Not referenced by the clauses in this script.
 * DEV_INST(un)       yields the node's devi_instance.
 */
#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
#define DEV_NAME(un) \
      stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name) /* ` */
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)

/*
 * On entry, remember the soft-state pointer reachable from the first
 * argument (args[0]->ssc_un) in a thread-local so the matching return
 * probe can identify the device.
 */
::sd_ready_and_valid:entry {
    self->un = args[0]->ssc_un;
}

/*
 * On return, count the call keyed by (devid string, instance, return value).
 * args[1] is the function's return value.
 *
 * devi_devid_str may be NULL for a node that has no devid registered;
 * stringof() on a NULL pointer raises a DTrace fault that aborts the clause,
 * which would silently drop the sample. Substitute a placeholder instead so
 * such devices are still counted (they remain distinguishable by instance).
 */
::sd_ready_and_valid:return /self->un/ {
    this->devinfo = SD_TO_DEVINFO(self->un);
    this->devid = this->devinfo->devi_devid_str != NULL ?
        stringof(this->devinfo->devi_devid_str) : "<no-devid>";

    @[this->devid, DEV_INST(self->un), args[1]] = count();

    /* Assigning zero releases the thread-local storage. */
    self->un = NULL;
}

/*
 * At script exit, dump the aggregation as CSV: a header row followed by one
 * row per (device, instance, retcode) key with its call count.
 *
 * NOTE(review): devid strings commonly look like "id1,sd@n...", i.e. they
 * contain a comma themselves, which would yield more fields per row than
 * the header declares -- confirm how consumers parse the first column.
 */
dtrace:::END
{
    printf("device,instance,retcode,count\n");
    printa("%s,sd%d,%d,%@d\n", @);
}
#!/usr/sbin/dtrace -Cs
#pragma D option quiet

/*
 * Helper macros; they require the C preprocessor (-C on the interpreter
 * line above). Quiet mode keeps dtrace's default per-probe output from
 * being interleaved with the CSV rows this script prints.
 *
 * "un" is a pointer to the sd driver's per-unit soft state, "bp" a pointer
 * to a struct buf.
 *
 * SD_TO_DEVINFO(un)  follows un->un_sd->sd_dev and casts it to
 *                    struct dev_info *.
 * DEV_NAME(un)       indexes the kernel devnamesp[] table by the node's
 *                    devi_major and returns dn_name as a D string.
 *                    Not referenced by the clauses in this script.
 * DEV_INST(un)       yields the node's devi_instance.
 * SD_GET_XBUF(bp)    casts bp->b_private to struct sd_xbuf *.
 */
#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
#define DEV_NAME(un) \
      stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name) /* ` */
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)
#define    SD_GET_XBUF(bp)        ((struct sd_xbuf *)((bp)->b_private))
/*
 * Print the CSV header once at startup; the column order matches the rows
 * written by the sd_retry_command:entry clause.
 */
dtrace:::BEGIN
{
    printf("instance,ncmds,ncmds_transport,errno,nretries,failed_cmd,reset\n");
}
/*
 * Emit one CSV row every time sd_retry_command() is entered.
 *
 * Columns: instance, commands in driver, commands in transport, failure
 * code (args[5]), the xbuf's current retry count, whether that count has
 * reached un_busy_retry_count ("failed_cmd"), and whether it equals the
 * reset retry count ("reset").
 */
::sd_retry_command:entry
{
    this->lun = args[0];
    this->xp = SD_GET_XBUF(args[1]);
    this->failure = args[5];    /* Probably always EIO */

    this->in_driver = this->lun->un_ncmds_in_driver;
    this->in_transport = this->lun->un_ncmds_in_transport;
    this->retries = this->xp->xb_retry_count;

    /* Non-zero once the retry count is at or past the busy retry limit. */
    this->limit_hit = this->retries >= this->lun->un_busy_retry_count;

    /* The reset threshold is un_reset_retry_count, but never below 2. */
    this->reset_at = (this->lun->un_reset_retry_count >= 2) ?
        this->lun->un_reset_retry_count : 2;

    /*
     * Per the original author's note, a hit here should also show up in
     * the system log.
     */
    this->did_reset = this->retries == this->reset_at;

    printf("sd%d,%d,%d,%d,%d,%s,%s\n",
        DEV_INST(this->lun), this->in_driver, this->in_transport,
        this->failure, this->retries,
        this->limit_hit != 0 ? "y" : "n",
        this->did_reset != 0 ? "y" : "n");
}
#!/usr/sbin/dtrace -Cs
#pragma D option quiet

/*
 * Helper macros (expanded by cpp, hence the -C flag on the interpreter line).
 * "un" is a pointer to the sd driver's per-unit soft state, "bp" a pointer
 * to a struct buf.
 *
 * SD_TO_DEVINFO(un)  follows un->un_sd->sd_dev and casts it to
 *                    struct dev_info *.
 * DEV_NAME(un)       indexes the kernel devnamesp[] table by the node's
 *                    devi_major and returns dn_name as a D string.
 *                    Not referenced by the clauses in this script.
 * DEV_INST(un)       yields the node's devi_instance.
 * SD_GET_XBUF(bp)    casts bp->b_private to struct sd_xbuf *.
 * YN(val)            "y" when val is non-zero, otherwise "n". The argument
 *                    and the whole expansion are parenthesized so the macro
 *                    is safe inside larger expressions and with compound
 *                    arguments.
 */
#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
#define DEV_NAME(un) \
      stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name) /* ` */
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)
#define    SD_GET_XBUF(bp)        ((struct sd_xbuf *)((bp)->b_private))
#define YN(val) (((val) != 0) ? "y" : "n")

/*
 * For sd_return_failed_command_no_restart(), stash args[2] in the
 * clause-local this->errno; the later clause on the same probe prints it.
 */
::sd_return_failed_command_no_restart:entry
{
    this->errno = args[2];
}
/*
 * For sd_retry_command(), stash args[5] in the clause-local this->errno;
 * the later clause on the same probe prints it.
 */
::sd_retry_command:entry
{
    this->errno = args[5];
}
/*
 * Shared reporting clause for both probes: print one timestamped key=value
 * line per call.
 *
 * this->errno is not assigned here; it is set by the per-probe clauses that
 * precede this one, because the failure code sits at a different argument
 * index in each function (probably always EIO).
 */
::sd_return_failed_command_no_restart:entry,
::sd_retry_command:entry
{
    this->lun = args[0];
    this->xp = SD_GET_XBUF(args[1]);

    this->in_driver = this->lun->un_ncmds_in_driver;
    this->in_transport = this->lun->un_ncmds_in_transport;
    this->retries = this->xp->xb_retry_count;

    /* Non-zero once the retry count is at or past the busy retry limit. */
    this->limit_hit = this->retries >= this->lun->un_busy_retry_count;

    /* The reset threshold is un_reset_retry_count, but never below 2. */
    this->reset_at = (this->lun->un_reset_retry_count >= 2) ?
        this->lun->un_reset_retry_count : 2;

    /*
     * Per the original author's note, a hit here should also show up in
     * the system log; the timestamp printed below helps to correlate.
     */
    this->did_reset = this->retries == this->reset_at;

    /*
     * The removable/eject flags let the reader filter out sd instances that
     * are not disk drives (CD-ROM and the like); anything with either flag
     * set is something other than a disk. The same soft state can be
     * inspected on a live system with mdb:
     * > *sd_state::walk softstate|::print struct sd_lun un_f_has_removable_media un_f_eject_media_supported
     */
    this->is_removable = this->lun->un_f_has_removable_media;
    this->can_eject = this->lun->un_f_eject_media_supported;

    printf("%Y instance=sd%d ejectable=%s removable=%s ncmds=%d ncmds_transport=%d errno=%d retries=%d retries_limit_reached=%s reset_triggered=%s probe=%s\n", walltimestamp,
        DEV_INST(this->lun), YN(this->can_eject), YN(this->is_removable),
        this->in_driver, this->in_transport, this->errno, this->retries,
        YN(this->limit_hit), YN(this->did_reset), probefunc);
}