szaydel
1/18/2018 - 6:03 PM

Dtrace IO Error Counting Script

#!/usr/sbin/dtrace -Cs
#pragma D option quiet

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2017 Sam Zaydel / RackTop Systems.
 *
 * io-retry-and-err-count-csv.d
 *
 * Description:
 * Script collects a count of IOs that resulted in an error or retry.
 * The rate of error is multiplied by `multiplier` and reported with an
 * e-6 suffix, without actually doing floating-point arithmetic, which
 * DTrace does not support.
 * The expectation is that if a device experiences an IO error on every
 * IO it issues, we should see 1000000e-6 in the output, meaning the
 * rate of error is 1.0.
 */
/* Scale factor applied to the error ratio so it can be kept as an integer. */
inline const int multiplier = 1000000;

/*
 * Global associative arrays, each keyed by device number (dev_t):
 *   ioct  - IO completions counted since the last rate sample
 *   errct - IO completions with B_ERROR set since the last rate sample
 *   timer - timestamp of the first IO start seen, then of the last sample
 */
unsigned long ioct[dev_t];
unsigned long errct[dev_t];
unsigned long timer[dev_t];

/* Print the CSV column header once, when tracing starts. */
BEGIN
{
    printf("sdname,mpxiowwn,ctretry,cterr,cteio,noxfer,rateerr\n");
}

/*
 * Count entries into sd_set_retry_bp per device. args[1] is translated
 * to devinfo_t to obtain the device names; entries whose path is "<nfs>"
 * or empty are skipped.
 */
::sd_set_retry_bp:entry
/ xlate <devinfo_t *>(args[1])->dev_pathname != "<nfs>" &&
  xlate <devinfo_t *>(args[1])->dev_pathname != "" /
{
    this->statname = xlate <devinfo_t *>(args[1])->dev_statname;
    this->devpath  = xlate <devinfo_t *>(args[1])->dev_pathname;

    /*
     * Take 16 characters starting at offset 25 of the device path.
     * NOTE(review): presumably this is where the WWN sits in an MPxIO
     * path -- confirm against real device paths. If the slice contains
     * "disk@" it is reported as "NA", otherwise it is upper-cased.
     */
    this->seg = substr(this->devpath, 25, 16);
    this->wwn = (strstr(this->seg, "disk@") != 0 &&
                 strstr(this->seg, "disk@") != "") ? "NA" : toupper(this->seg);

    @ctretry[this->statname, this->wwn] = count();
}

/*
 * Record the timestamp of the first IO start seen for a device; this
 * seeds the per-device timer used for periodic rate sampling. Devices
 * whose path is "<nfs>" or empty are ignored.
 */
io:::start
/ args[1]->dev_pathname != "<nfs>" &&
  args[1]->dev_pathname != "" &&
  !timer[args[0]->b_edev] /
{
    timer[args[0]->b_edev] = timestamp;
}

/*
 * Per-device bookkeeping on every IO completion: bump the total IO
 * counter, and bump the error counter when the buffer has B_ERROR set.
 */
io:::done
/ args[1]->dev_pathname != "<nfs>" &&
  args[1]->dev_pathname != "" /
{
    ioct[args[0]->b_edev]  += 1;
    errct[args[0]->b_edev] += (args[0]->b_flags & B_ERROR) ? 1 : 0;
}

/*
 * Rate sample: fires on an IO completion once more than 10000000000 ns
 * (10 seconds) have passed since the device timer was last set. Computes
 * the scaled error ratio for the window, keeps the maximum seen per
 * device, and then starts a new window.
 */
io:::done
/ args[1]->dev_pathname != "<nfs>" &&
  args[1]->dev_pathname != "" &&
  timer[args[0]->b_edev] != 0 &&
  timestamp - 10000000000 > timer[args[0]->b_edev] /
{
    this->statname = args[1]->dev_statname;

    /* Same 16-character path slice and "disk@" test as the other clauses. */
    this->seg = substr(args[1]->dev_pathname, 25, 16);
    this->wwn = (strstr(this->seg, "disk@") != 0 &&
                 strstr(this->seg, "disk@") != "") ? "NA" : toupper(this->seg);

    /*
     * Integer stand-in for a floating-point ratio: errors per IO, scaled
     * by `multiplier`. The counting io:::done clause earlier in the file
     * increments ioct for the same completion, so the divisor should be
     * non-zero here.
     */
    this->err_rate = (multiplier * errct[args[0]->b_edev]) / ioct[args[0]->b_edev];
    @maxrateerr[this->statname, this->wwn] = max(this->err_rate);

    /* Begin the next sampling window. */
    timer[args[0]->b_edev] = timestamp;
    ioct[args[0]->b_edev]  = 0;
    errct[args[0]->b_edev] = 0;
}

/*
 * Aggregate details of failed IOs: runs only for completions that have
 * B_ERROR set on a device whose path is neither "<nfs>" nor empty.
 */
io:::done
/ args[1]->dev_pathname != "<nfs>" &&
  args[1]->dev_pathname != "" &&
  args[0]->b_flags & B_ERROR /
{
    this->statname = args[1]->dev_statname;
    this->seg = substr(args[1]->dev_pathname, 25, 16);
    this->wwn = (strstr(this->seg, "disk@") != 0 &&
                 strstr(this->seg, "disk@") != "") ? "NA" : toupper(this->seg);

    /*
     * cterr counts every errored IO (the predicate already guarantees
     * B_ERROR is set), cteio counts only those whose b_error is EIO, so
     * any gap between the two means some errors were not EIO. noxfer
     * accumulates b_resid from the errored buffers.
     */
    @cterr[this->statname, this->wwn]  = count();
    @cteio[this->statname, this->wwn]  = sum((args[0]->b_error == EIO) ? 1 : 0);
    @noxfer[this->statname, this->wwn] = sum(args[0]->b_resid);
}

/*
 * Once a minute, print one CSV row per aggregation key covering all five
 * aggregations, then truncate them so the next interval starts empty.
 */
tick-1min
{
    printa("%s,%s,%@d,%@d,%@d,%@d,%@de-6\n",
        @ctretry, @cterr, @cteio, @noxfer, @maxrateerr);

    trunc(@ctretry);
    trunc(@cterr);
    trunc(@cteio);
    trunc(@noxfer);
    trunc(@maxrateerr);
}