szaydel
7/8/2015 - 12:30 PM

Dtrace snippets for observing behaviour of ZIL

Dtrace snippets for observing behaviour of ZIL

#!/usr/sbin/dtrace -Cs
#include <sys/fs/zfs.h>
#pragma D option quiet
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2016, RackTop Systems.
 */

 string logbias[zfs_logbias_op_t];
 int x;
 inline int BLKSZ_ABOVE_128K = -0xffff; /* Just a magic value */

/* This is a good way to play with this a little bit. Just copy and paste in
 * a separate shell and let it roll. Make sure /bp/test exists.
  while :; do dd if=/dev/urandom of=/bp/test/out.bin bs=4096 count=$RANDOM ; 
  dd if=/dev/urandom of=/bp/test/out.bin bs=0x2000 count=$RANDOM ; 
  dd if=/dev/urandom of=/bp/test/out.bin bs=0x4000 count=$RANDOM ; 
  sleep 1; done
*/

BEGIN {
  printf("%s,%s,%s,%s,%s,%s,%s,%s,%s\n", "poolname", "bias",
    "4K", "8K", "16K", "32K", "64K", "128K", "gt.128K"
  );
  logbias[ZFS_LOGBIAS_LATENCY] = "latency";
  logbias[ZFS_LOGBIAS_THROUGHPUT] = "throughput";
}

::zil_lwb_write_start:entry {
  this->pooln = stringof(args[0]->zl_dmu_pool->dp_spa->spa_name);
  this->size = args[1]->lwb_sz;
  this->lb = args[0]->zl_logbias;
  /* We need to fit into one of these buckewith anything larger than
   * 128K tagged with BLKSZ_ABOVE_128K, and ultimately resulting in
   * being counted by @bgt128k aggregate.
   */
  this->buck = 0x1000; /* We assume this is smallest possible IO size. */
  this->buck = ( this->size > 0x1000 ) ? 8192 : this->buck ;
  this->buck = ( this->size > 0x2000 ) ? 16384 : this->buck ;
  this->buck = ( this->size >= 0x4000 ) ? 32768 : this->buck ;
  this->buck = ( this->size >= 0x8000 ) ? 65536 : this->buck ;
  this->buck = ( this->size > 0x10000 ) ? 131072 : this->buck ;
  this->buck = ( this->size > 0x20000 ) ? BLKSZ_ABOVE_128K : this->buck ;

  @b0x1000[this->pooln, logbias[this->lb]]   =
        sum(this->buck == 0x1000 ? 1 : 0);
  @b0x2000[this->pooln, logbias[this->lb]]   =
        sum(this->buck == 0x2000 ? 1 : 0);
  @b0x4000[this->pooln, logbias[this->lb]]   =
        sum(this->buck == 0x4000 ? 1 : 0);
  @b0x8000[this->pooln, logbias[this->lb]]   =
        sum(this->buck == 0x8000 ? 1 : 0);
  @b0x10000[this->pooln, logbias[this->lb]]  =
        sum(this->buck == 0x10000 ? 1 : 0);
  @b0x20000[this->pooln, logbias[this->lb]]  =
        sum(this->buck == 0x20000 ? 1 : 0);
  @bgt128k[this->pooln, logbias[this->lb]] =
        sum(this->buck == BLKSZ_ABOVE_128K ? 1 : 0);
  ++x;
}
tick-1sec /x/ {
  printa("%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
      @b0x1000, @b0x2000, @b0x4000, @b0x8000, @b0x10000, @b0x20000, @bgt128k
  );
  trunc(@b0x1000); trunc(@b0x2000); 
  trunc(@b0x4000); trunc(@b0x8000); trunc(@b0x10000); 
  trunc(@b0x20000); trunc(@bgt128k);
  x = 0;
}
dtrace -qn '
BEGIN {totalbytes=0; total = 0;} 
	::zfs_log_write:entry { totalbytes+=args[5]; total++; @ = quantize(args[5]); } 
END { 
    printf("\tTotal IOs: %d Total KBytes: %d\n", total, totalbytes >> 10); printa("\tIO size (bytes): %@d\n", @); }'
    
// IO size through the zil by process ID and execname. Includes both the size of IO,
// and amount of work done per microsecond of time spent in the zfs_log_write function.
dtrace -qn '
  self int64_t bytes;
  self int64_t delta;
  ::zfs_log_write:entry {
    @[pid, execname] = quantize(args[5]);
    self->ts = timestamp;
    self->bytes += args[5];
  }
  ::zfs_log_write:return /(timestamp - self->ts) > 0/ {
    self->delta += (timestamp - self->ts); /* measure delta */
  }
  ::zfs_log_write:return /self->delta > 100000/ {
    /* How many bytes per microsecond? */
    this->delta_us = self->delta / 1000;
    @bytes_us[pid, execname] = quantize(self->bytes/this->delta_us);
    self->ts = 0; self->bytes = 0; self->delta = 0;
  }
  tick-10sec {
    printa("IO Size\npid: %d, name: %s %@d\n", @); 
    printa("Bytes/us\npid: %d, name: %s %@d\n", @bytes_us); 
    trunc(@); trunc(@bytes_us); 
  }'
dtrace -qn '
  /*
   * Measure IO sizes and latency through the zil code. This script hides
   * reality through effectively quantizing data to multiples of 4K 
   * numbers, where actual sizes may not be on those exact boundaries. 
   * We can get a sense for how much we are lied to by looking at the total
   * count and comparing against the count given by base2 column, which counts
   * each IO that is actually sized on the bucket boundaries.
  */
  BEGIN {
    printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
            "timestamp","sz.1k","sz.4k","sz.8k","sz.16k","sz.32k","sz.64k",
            "sz.128k","sz.256k","sz.512k", "sz.1m","unk",
            "l1k","l4k","l8k","l16k","l32k","l64k","l128k","l256k","l512k", "l1m","lunk", "mult.4k","total.ios", "throughput");
  }
  ::zfs_log_write:entry {
    /* 
     * this->wt reduces precision down to 1 second, to make result
     * presentation a little bit easier.
     */
    this->wt = (walltimestamp / 1000000000) * 1000000000;
    self->start = timestamp;
    this->sz = args[5];
    this->m4k = this->sz%4096 == 0 ? 1 : 0;

    this->l = this->sz > 0 && this->sz <= 0x400  ? "1k" : 
        this->sz > 0x400 && this->sz <= 0x1000 ? "4k" :
        this->sz > 0x1000 && this->sz <= 0x2000 ? "8k" :
        this->sz > 0x2000 && this->sz <= 0x4000 ? "16k" :
        this->sz > 0x4000 && this->sz <= 0x8000 ? "32k" : 
        this->sz > 0x8000 && this->sz <= 0x10000 ? "64k" :
        this->sz > 0x10000 && this->sz <= 0x20000 ? "128k" :
        this->sz > 0x20000 && this->sz <= 0x40000 ? "256k" :
        this->sz > 0x40000 && this->sz <= 0x80000 ? "512k" :
        this->sz > 0x80000 && this->sz <= 0x100000 ? "1m" :
         "unknown";
    @sz1k[this->wt]   = sum(this->l == "1k" ? 1 : 0);
    @sz4k[this->wt]   = sum(this->l == "4k" ? 1 : 0);
    @sz8k[this->wt]   = sum(this->l == "8k" ? 1 : 0);
    @sz16k[this->wt]  = sum(this->l == "16k" ? 1 : 0);
    @sz32k[this->wt]  = sum(this->l == "32k" ? 1 : 0);
    @sz64k[this->wt]  = sum(this->l == "64k" ? 1 : 0);
    @sz128k[this->wt]  = sum(this->l == "128k" ? 1 : 0);
    @sz256k[this->wt]  = sum(this->l == "256k" ? 1 : 0);
    @sz512k[this->wt]  = sum(this->l == "512k" ? 1 : 0);
    @sz1m[this->wt]  = sum(this->l == "1m" ? 1 : 0);
    @szu[this->wt]    = sum(this->l == "unknown" ? 1 : 0);
    @bucksize[this->wt]     = sum(this->m4k);
    /* IOPS, assuming regular interval sampling */
    @tot[this->wt]    = count();
    /* Throughput assuming regular interval sampling */
    @tput[this->wt]   = sum(this->sz);
  }

  ::zfs_log_write:return /this->sz != NULL/ {
    this->delta = (timestamp - self->start); /* note, this is nanoseconds */
    @l1k[this->wt]    = avg(this->l == "1k" ? this->delta : 0);
    @l4k[this->wt]    = avg(this->l == "4k" ? this->delta : 0);
    @l8k[this->wt]    = avg(this->l == "8k" ? this->delta : 0);
    @l16k[this->wt]   = avg(this->l == "16k" ? this->delta : 0);
    @l32k[this->wt]   = avg(this->l == "32k" ? this->delta : 0);
    @l64k[this->wt]   = avg(this->l == "64k" ? this->delta : 0);
    @l128k[this->wt]   = avg(this->l == "128k" ? this->delta : 0);
    @l256k[this->wt]   = avg(this->l == "256k" ? this->delta : 0);
    @l512k[this->wt]   = avg(this->l == "512k" ? this->delta : 0);
    @l1m[this->wt]   = avg(this->l == "1m" ? this->delta : 0);
    @lu[this->wt]     = sum(this->l == "unknown" ? this->delta : 0);
  }
  
  tick-1sec {
    printa("%Y,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", 
            @sz1k, @sz4k, @sz8k, @sz16k, @sz32k, @sz64k, @sz128k, @sz256k, 
            @sz512k, @sz1m, @szu, @l1k, @l4k, @l8k, @l16k, @l32k,
            @l64k, @l128k, @l256k, @l512k, @l1m, @lu, @bucksize, @tot, 
            @tput);
    trunc(@sz1k); trunc(@sz4k); trunc(@sz8k);
    trunc(@sz16k); trunc(@sz32k); trunc(@sz64k); trunc(@sz128k); trunc(@sz256k);
    trunc(@sz512k); trunc(@sz1m); trunc(@szu);
    trunc(@l1k); trunc(@l4k); trunc(@l8k);
    trunc(@l16k); trunc(@l32k); trunc(@l64k); trunc(@l128k); trunc(@l256k);
    trunc(@l512k); trunc(@l1m); trunc(@lu); 
    trunc(@bucksize); trunc(@tot); trunc(@tput);
  }'
/* Observe size buckets of IOs through the zfs_log_write function and measure zil_commit latency. */
dtrace -qn '
    ::zil_commit:entry {
        self->zilog = args[0] ;
        self->spa   = args[0]->zl_spa ;
        self->t     = timestamp ;
    }
    ::zfs_log_write:entry {
        this->spa   = args[0]->zl_spa ;
        this->sync  = args[0]->zl_sync == 0 ? "Async" : "Sync" ;
        @x4[this->spa->spa_name, this->sync] =
            sum(args[5] <= 0x1000 ? 1 : 0) ;
        @x8[this->spa->spa_name, this->sync] =
            sum((args[5] <= 0x2000 && args[5] > 0x1000) ? 1 : 0) ;
        @x16[this->spa->spa_name, this->sync] =
            sum((args[5] <= 0x4000 && args[5] > 0x2000) ? 1 : 0) ;
        @x32[this->spa->spa_name, this->sync] =
            sum((args[5] <= 0x8000 && args[5] > 0x4000) ? 1 : 0) ;
        @x64[this->spa->spa_name, this->sync] =
            sum((args[5] <= 0x10000 && args[5] > 0x8000) ? 1 : 0) ;
        @x128[this->spa->spa_name, this->sync] =
            sum((args[5] <= 0x20000 && args[5] > 0x10000) ? 1 : 0) ;
        @xulim[this->spa->spa_name, this->sync] =
            sum(args[5] > 0x20000 ? 1 : 0) ;
    }
    ::zil_commit:return /self->t/ {
        this->sync = self->zilog == 0 ? "Async" : "Sync" ;
        @lathist[self->spa->spa_name, this->sync] =
            quantize((timestamp - self->t) / 1000);
        self->t = 0 ; self->spa = 0 ; self->zilog = 0 ;
    }
    END {
        printa("\n           Pool [%s] zil_commit %s IO record latency%@d\n", 
            @lathist);
        printf("IO Count by Size:\n") ;
        printf("pool     type     %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n",
                "4kb","8kb","16kb","32kb","64kb","128kb",">128kb") ;
        printa("%-8s %-8s %-8@d %-8@d %-8@d %-8@d %-8@d %-8@d %-8@d\n",
                @x4, @x8, @x16, @x32, @x64, @x128, @xulim) ;
    }'

/* Sizes of IOs through the zfs_log_write  */
dtrace -qn '::zfs_log_write:entry {
  this->ts = (walltimestamp / 1000000000) * 1000000000;
  @sz[this->ts, args[5]] = count();
  }
  END {
    printf("timestamp,io.size,count\n");
    printa("%Y,%d,%@d\n", @sz);
  }'
dtrace -qn '
  /* Add a number greater than 0 after the script on the command line, to
   * tell the script how often to print results to screen. If no value is
   * given, script will fail immediately. Value must be positive and is in
   * seconds. Results will be printed "this many" seconds apart, but will
   * be normalized to a per-second result.
   */
  int ts;

  BEGIN { ts = timestamp; }
  ::zfs_log_write:entry {
    this->file_path = stringof(args[3]->z_vnode->v_path);
    @[this->file_path] = quantize(arg5);
    @cnt = count();
    @bytes = sum(arg5);
    bypass_log = `zfs_slog_bypass_ok == 0 ? "will not" : "will";
  }
  tick-$$1sec {
    this->now = timestamp;
    this->delta = (this->now - ts) / 1000000000;
    /* Adjust data to reflect per second values */
    normalize(@, this->delta);
    normalize(@cnt, this->delta);
    normalize(@bytes, this->delta);
    
    printf("----------------------------------------\n");
    printf("Summary:\n");
    printf("IOs >%d(kb) %s bypass log\n", 
          `zfs_immediate_write_sz >> 10, bypass_log);
    printa("IOs through ZIL/sec: %@d\nBytes through ZIL/sec: %@d\n", 
          @cnt, @bytes);
    printf("----------------------------------------\n");
    
    printa("          Filepath: %s\n          IOsize\t\t\t\tIOs/sec %@d\n", @);
    trunc(@); trunc(@cnt); trunc(@bytes);
    ts = this->now;
  }'