[ARC adjustment observation] Dtrace snippets and scripts for observing ARC behavior. #tags: arc, zfs-arc, arc-latency, arc-eviction, latency, changes, mutexes, shrinking, evictions, arc-throughput, cache, zfs-cache
dtrace -qn '
/*
* Mesure the amount of time it takes to adjust the size of ARC
* by given amount of bytes. We expect 1-second granularity and
* data being in KBytes, and duration being in microseconds (us).
*/
BEGIN {
printf("interval,ev.KBps,avg.lat,longest,tottime\n");
start_time = timestamp;
interval = 0; data = 0;
}
::arc_adjust:entry {
self->ts = timestamp;
}
::arc_adjust:return / self->ts > 0/ {
this->delta = (timestamp-self->ts) / 1000;
this->kb = args[1] / 1024;
self->ts = 0;
data = 1;
@avlat[interval] = avg(this->delta);
@tottime[interval] = sum(this->delta);
@longest[interval] = max(this->delta);
@sz[interval] = sum(this->kb);
}
/*
* We want to make sure to print NAs instead of 0's or nothing
* in order to maintain regular time-series, more or less.
*/
tick-1sec /!data/ {
printf("%d,NA,NA,NA,NA\n", interval) ;
interval = (timestamp - start_time) / 1000000000 ;
}
tick-1sec /data/ {
data = 0 ;
interval = (timestamp - start_time) / 1000000000 ;
printa("%d,%@d,%@d,%@d,%@d\n", @sz, @avlat, @longest, @tottime);
trunc(@tottime); trunc(@sz);
trunc(@avlat); trunc(@longest);
}'
#!/usr/sbin/dtrace -Cs
/* Dtrace script checking for whether arc_adjust was called from from arc_shrink
or from arc_reclaim_thread.
*/
#pragma D option quiet
#define AR "arc_reclaim_thread"
#define AS "arc_shrink"
BEGIN {
x["arc_reclaim_thread"] = 0;
x["arc_shrink"] = 0;
in_shrink = 0;
in_reclaim = 0;
new_data = 0;
}
::arc_reclaim_thread:entry {
++in_reclaim;
}
::arc_shrink:entry {
++in_shrink;
}
::arc_adjust:return /in_reclaim/ {
x[AR] = args[1];
}
::arc_adjust:return /in_shrink/ {
x[AS] = args[1];
}
::arc_adjust:return /x[AR] && x[AS]/ {
@[AR] = sum(x[AR]);
@[AS] = sum(x[AS]);
x[AR] = 0;
x[AS] = 0;
in_reclaim = --in_reclaim ? in_reclaim > 0 : 0;
in_shrink = --in_shrink ? in_shrink > 0 : 0;
++new_data;
}
tick-1sec /new_data/ {
printa(@);
trunc(@);
new_data = 0;
}
dtrace -qn '
BEGIN {
printf("timestamp,shrink,duration\n");
in_shrink = 0;
}
::arc_adjust:entry,::arc_kmem_reap_now:entry {
self->x[curpsinfo->pr_psargs] = timestamp;
}
/* If the ARC is being resized we need to know about this. */
::arc_shrink:entry
{
in_shrink = 1;
}
::arc_kmem_reap_now:return,
::arc_adjust:return
/self->x[curpsinfo->pr_psargs] && ((timestamp - self->x[curpsinfo->pr_psargs]) / 1000000 > 0)/
{
printf("%Y,%d,%d\n", walltimestamp, in_shrink,
(timestamp - self->x[curpsinfo->pr_psargs]) / 1000000);
self->x[curpsinfo->pr_psargs] = 0;
in_shrink = 0;
}'
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* Copyright (c) 2015, RackTop Systems.
*
* Description:
* This script is meant to obtain information about ARC internal state, from
* values that are not necessarily exposed via kstat, and these values are
* potentially modified inside of arc_shrink. As such, we are trying to observe
* values at the beginning and at the end of the function call.
*/
BEGIN {
printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
"timestamp", "duration", "avail.rmem", "free.mem", "lotsfree.mem",
"c.adjust", "p.adjust", "size.adjust",
"bytes.to.free", "arc_c_min", "arc_c_max",
"c.top", "p.top", "c.size.top",
"c.bottom", "p.bottom", "c.size.bottom",
"c.gt.arc_c_min.top", "c.gt.arc_p.top",
"c.gt.arc_c_min.bottom", "c.gt.arc_p.bottom"
);
}
::arc_shrink:entry {
self->availrmem = (pgcnt_t)`availrmem << 12;
self->freemem = (pgcnt_t)`freemem << 12;
self->lotsfree = (pgcnt_t)`lotsfree << 12;
self->start = timestamp;
self->tofree = args[0];
self->c_min = `arc_stats.arcstat_c_min.value.ull;
self->c_max = `arc_stats.arcstat_c_max.value.ull;
self->c_top = `arc_stats.arcstat_c.value.ull;
self->p_top = `arc_stats.arcstat_p.value.ull;
self->size_top = `arc_stats.arcstat_size.value.ull;
self->c_gt_arc_c_min_top = self->c_top > self->c_min ? 1 : 0;
self->c_gt_arc_p_top = self->c_top > self->p_top ? 1 : 0;
}
::arc_shrink:return /self->start/ {
this->delta = (timestamp - self->start) / 1000; /* Duration of call */
self->size_btm = `arc_stats.arcstat_size.value.ull;
self->c_btm = `arc_stats.arcstat_c.value.ull;
self->p_btm = `arc_stats.arcstat_p.value.ull;
self->c_gt_arc_c_min_btm = self->c_btm > self->c_min ? 1 : 0;
self->c_gt_arc_p_btm = self->c_btm > self->p_btm ? 1 : 0;
this->c_adjust = self->c_top - self->c_btm;
this->p_adjust = self->p_top - self->p_btm;
this->size_adjust = self->size_top - self->size_btm;
printf("%Y,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
walltimestamp, this->delta, self->availrmem, self->freemem, self->lotsfree,
this->c_adjust, this->p_adjust, this->size_adjust,
self->tofree, self->c_min, self->c_max,
self->c_top, self->p_top, self->size_top,
self->c_btm, self->p_btm, self->size_btm,
self->c_gt_arc_c_min_top, self->c_gt_arc_p_top,
self->c_gt_arc_c_min_btm, self->c_gt_arc_p_btm
);
}
dtrace -qn '
BEGIN {
ts = walltimestamp;
printf("timestamp,arc_c_min,arc_c_max,bytes.to.free\n");
}
::arc_shrink:entry /ts/ {
this->tofree = args[0];
this->min = `arc_stats.arcstat_c_min.value.ull;
this->max = `arc_stats.arcstat_c_max.value.ull;
@[ts, this->min, this->max] = sum(this->tofree);
}
tick-1sec {
printa("%Y,%d,%d,%@d\n", @);
trunc(@);
ts = walltimestamp;
}'
// More detailed version of the above snippet.
dtrace -qn '
BEGIN {
ts = walltimestamp;
printf("timestamp,bytes.to.free,arc_c_min,arc_c_max,c,p,c.size\n");
}
::arc_shrink:entry /ts/ {
self->tofree = args[0];
self->min = `arc_stats.arcstat_c_min.value.ull;
self->max = `arc_stats.arcstat_c_max.value.ull;
self->c = `arc_stats.arcstat_c.value.ull;
self->p = `arc_stats.arcstat_p.value.ull;
self->size = `arc_stats.arcstat_size.value.ull;
}
::arc_shrink:return /ts/ {
printf("%Y,%d,%d,%d,%d,%d,%d\n", walltimestamp,
self->tofree, self->min, self->max, self->c, self->p, self->size);
}'
dtrace -qn '
#pragma option dynvarsize=1M
BEGIN {
printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
"timestamp", "call.count", "b512.count", "b1024.count",
"b2048.count", "b4096.count", "b16384.count", "b32768.count",
"b65536.count", "b131072.count", "call.dur.us");
have_data = 0;
ts = walltimestamp;
}
::arc_release:entry /ts/ {
this->bhdr = args[0]->b_hdr;
self->sz = this->bhdr->b_size;
self->start = timestamp;
}
::arc_release:return /ts/ {
this->delta = timestamp - self->start;
@cnt[ts] = count();
@dur[ts] = sum(this->delta/1000);
/* If size matches bucket, add 1 to sum for that bucket, build buckets
* for common buffer sizes, such as 512, 4K, 16K, 32K, etc.
*/
@cnt512[ts] = sum(self->sz == 512 ? 1 :0);
@cnt1024[ts] = sum(self->sz == 1024 ? 1 :0);
@cnt2048[ts] = sum(self->sz == 2048 ? 1 :0);
@cnt4096[ts] = sum(self->sz == 4096 ? 1 :0);
@cnt16384[ts] = sum(self->sz == 16384 ? 1 :0);
@cnt32768[ts] = sum(self->sz == 32768 ? 1 :0);
@cnt65536[ts] = sum(self->sz == 65536 ? 1 :0);
@cnt131072[ts] = sum(self->sz == 131072 ? 1 :0);
++have_data;
}
/*
* Zero values to avoid gaps in time-series, so as to make it easier
* to plot this information.
*/
tick-1sec /!have_data/ {
ts = walltimestamp;
printf("%Y,0,0,0,0,0,0,0,0,0,0\n", ts);
}
tick-1sec /have_data/ {
printa("%Y,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@cnt, @cnt512, @cnt1024, @cnt2048,
@cnt4096, @cnt16384, @cnt32768,
@cnt65536, @cnt131072, @dur);
trunc(@cnt512); trunc(@cnt1024);
trunc(@cnt2048); trunc(@cnt4096);
trunc(@cnt16384); trunc(@cnt32768);
trunc(@cnt65536); trunc(@cnt131072);
trunc(@cnt); trunc(@dur);
have_data = 0;
ts = walltimestamp;
}
END {
exit(0);
}'
dtrace -qn '
/*
* Measure amount of time spent between start and stop of arc_release,
* as well as time spent in the mutex locks while inside of arc_release.
*/
BEGIN {
ts = walltimestamp;
printf("timestamp,count.ar,count.mutex,dur.ar.us,dur.mutex.us\n");
}
int self x[kthread_t *];
::arc_release:entry {
++self->x[curthread];
this->start = timestamp;
}
lockstat::mutex_enter:adaptive-acquire /self->x[curthread] > 0/ {
/* arg0 is a pointer to kmutex_t struct representing the lock */
self->start[arg0] = timestamp;
@cnt_lck[ts] = count();
}
lockstat::mutex_exit:adaptive-release /self->start[arg0]/ {
this->delta = timestamp - self->start[arg0];
@t_lck[ts] = sum(this->delta / 1000);
self->start[arg0] = 0;
}
::arc_release:return /self->x[curthread]/ {
this->delta = timestamp - this->start;
self->x[curthread] = 0;
@t_ar[ts] = sum(this->delta / 1000);
@cnt_ar[ts] = count();
}
tick-1sec {
printa("%Y,%@d,%@d,%@d,%@d\n",
@cnt_ar, @cnt_lck, @t_ar, @t_lck);
trunc(@cnt_lck); trunc(@t_lck); trunc(@cnt_ar); trunc(@t_ar);
ts = walltimestamp;
}
'
dtrace -qn '
int data, start;
BEGIN {
printf("timedelta,call.count,arc.sz,evict.sz.kb,call.dur.us\n");
start = timestamp;
}
::arc_evict_state_impl:entry {
self->start = timestamp;
}
::arc_evict_state_impl:return /timestamp - self->start > 0/ {
this->delta = timestamp - self->start ;
this->offset = (timestamp - start) / 1000000000 ;
@cnt[this->offset] = count();
@dur[this->offset] = sum(this->delta/1000);
/* total number of kilobytes evicted */
@evsz[this->offset] = sum(args[1] >> 10);
@arcsz[this->offset] = avg(`arc_stats.arcstat_size.value.ui64 >> 10);
data = 1 ;
}
/*
* NA values to avoid gaps in time-series, so as to make it easier
* to plot this information.
*/
tick-1sec /!data/ {
printf("%d,NA,NA,NA,NA\n", (timestamp - start) / 1000000000);
}
tick-1sec /data/ {
printa("%d,%@d,%@d,%@d,%@d\n", @cnt, @arcsz, @evsz, @dur);
trunc(@cnt); trunc(@arcsz); trunc(@evsz); trunc(@dur);
data = 0;
}
END {
exit(0);
}'
dtrace -qn '
int data, start;
BEGIN {
printf("%s,%s,%s,%s,%s,%s,%s,%s\n",
"timedelta", "call.count", "arc.sz.kb", "arc.c.kb", "evict.kb",
"av.evict.kb", "call.len.us", "longest.call.len.us"
);
start = timestamp;
}
::arc_evict_state_impl:entry {
self->start = timestamp;
}
::arc_evict_state_impl:return /timestamp - self->start > 0/ {
this->delta = timestamp - self->start ;
data = 1 ;
@cnt = count();
@dur = sum(this->delta/1000);
@longest = max(this->delta/1000);
/* total number of kilobytes evicted */
@evsz = sum(args[1] >> 10);
@arcszkb = avg(`arc_stats.arcstat_size.value.ui64 >> 10);
@arcckb = avg(`arc_stats.arcstat_c.value.ui64 >> 10);
@avevkb = avg(args[1] >> 10); /* average KB per eviction */
}
/*
* NA values to avoid gaps in time-series, so as to make it easier
* to plot this information.
*/
tick-1sec /!data/ {
printf("%d,NA,NA,NA,NA,NA,NA,NA\n",
(timestamp - start) / 1000000000);
}
tick-1sec /data/ {
this->seconds = (timestamp - start) / 1000000000 ;
normalize(@cnt, this->seconds) ;
normalize(@evsz, this->seconds) ;
normalize(@dur, this->seconds) ;
printf("%d,", this->seconds);
printa("%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@cnt, @arcszkb, @arcckb, @evsz, @avevkb, @dur, @longest);
trunc(@cnt); trunc(@arcszkb); trunc(@arcckb);
trunc(@evsz); trunc(@dur); trunc(@avevkb); trunc(@longest);
data = 0;
}
END {
exit(0);
}'
dtrace -qn '
/*
* Mesure the amount of time it takes to adjust the size of ARC
* by given amount of bytes. We expect 1-second granularity and
* data being in KBytes, and duration being in microseconds (us).
*/
inline const int SLOW = 1000; /* (1ms) limit triggering logging of data */
inline const string HEADER =
"timestamp,evicted.kb,slow,longest.call_t.us,free.mem.kb\n";
BEGIN {
printf("%s", HEADER);
cnt = 0;
}
tick-1sec {
ts = walltimestamp;
}
::arc_adjust:entry /ts > 0/ {
self->ts = timestamp;
/* Unreserved memory in kilobytes */
this->freemem = (`freemem << 12) >> 10;
}
::arc_adjust:return /self->ts/ {
this->delta = (timestamp-self->ts) / 1000;
this->l = this->delta > SLOW ? 1 : 0;
this->kb = args[1] / 1024;
@delay_max[ts, this->kb, this->l] = max(this->delta);
@freemem[ts, this->kb, this->l] = min(this->freemem);
}
::arc_adjust:return /this->l > 0/ {
/* Duration and size are not necessarily linked. One should not assume
* that the more time is spent the more bytes evicted. This function will
* call other functions and will spend time in them. This script measures
* total amount of time spent in those other functions. They do not always
* remove same amount of data, and depending on how mcuh work they do,
* they may be slower or faster and amount of bytes evicted does not
* actually directly relate to amount of time it takes to evict given
* number of bytes.
*/
@delay_max[ts, this->kb, this->l] = max(this->delta);
@freemem[ts, this->kb, this->l] = min(this->freemem);
printa("%Y,%d,%d,%@d,%@d\n", @delay_max, @freemem);
trunc(@delay_max); trunc(@freemem);
self->ts = 0;
}'
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* Copyright (c) 2015, RackTop Systems.
*
* Description:
* This script is meant to measure ARC's adjustment mechanism performance.
* It has been observed that it takes sometimes a long period of time to
* adjust the lists in the ARC, but what is not clear is the overall impact.
* We can measure here how much time is spent doing adjustments and what the
* size of adjustments is. This tool lets us see the size of the ARC when
* adjustment happens, the amount of bytes processed and the amount of time
* it takes to compete adjustments.
*/
/*
* Mesure the amount of time it takes to adjust the size of ARC
* by given amount of bytes. We expect 1-second granularity and
* data being in KBytes, and duration being in microseconds (us).
*/
inline const int SLOW = 1000; /* (1ms) limit triggering logging of data */
inline const string HEADER =
"timestamp,arc.sz.kb,evict.kb,slow,longest.call_t.us\n";
BEGIN {
printf("%s", HEADER);
cnt = 0;
}
tick-1sec {
ts = walltimestamp;
}
::arc_adjust:entry /ts > 0/ {
self->ts = timestamp;
}
::arc_adjust:return /self->ts/ {
this->delta = (timestamp-self->ts) / 1000;
this->l = this->delta > SLOW ? 1 : 0;
this->evicted_kb = args[1] / 1024;
this->arc_sz_kb = `arc_stats.arcstat_size.value.ui64 >> 10;
@delay_max[ts, this->arc_sz_kb,
this->evicted_kb, this->l] = max(this->delta);
}
::arc_adjust:return /self->ts/ {
/* Duration and size are not necessarily linked. One should not assume
* that the more time is spent the more bytes evicted. This function will
* call other functions and will spend time in them. This script measures
* total amount of time spent in those other functions. They do not always
* remove same amount of data, and depending on how much work they do,
* they may be slower or faster and amount of bytes evicted does not
* actually directly relate to amount of time it takes to evict given
* number of bytes.
*/
@delay_max[ts, this->arc_sz_kb,
this->evicted_kb, this->l] = max(this->delta);
printa("%Y,%d,%d,%d,%@d\n", @delay_max);
trunc(@delay_max);
self->ts = 0;
}
dtrace -qn '
/*
* Mesure the amount of time it takes to adjust the size of ARC
* by given amount of bytes. We expect 1-second granularity and
* data being in KBytes, and duration being in microseconds (us).
*/
inline const int SLOW = 1000; /* (1ms) limit triggering logging of data */
inline const string HEADER =
"timestamp,arc.sz.kb,evict.kb,slow,longest.call_t.us\n";
BEGIN {
printf("%s", HEADER);
cnt = 0;
}
tick-1sec {
ts = walltimestamp;
}
::arc_adjust:entry /ts > 0/ {
self->ts = timestamp;
}
::arc_adjust:return /self->ts/ {
this->delta = (timestamp-self->ts) / 1000;
this->l = this->delta > SLOW ? 1 : 0;
this->evicted_kb = args[1] / 1024;
this->arc_sz_kb = `arc_stats.arcstat_size.value.ui64 >> 10;
@delay_max[ts, this->arc_sz_kb,
this->evicted_kb, this->l] = max(this->delta);
}
::arc_adjust:return /self->ts/ {
/* Duration and size are not necessarily linked. One should not assume
* that the more time is spent the more bytes evicted. This function will
* call other functions and will spend time in them. This script measures
* total amount of time spent in those other functions. They do not always
* remove same amount of data, and depending on how much work they do,
* they may be slower or faster and amount of bytes evicted does not
* actually directly relate to amount of time it takes to evict given
* number of bytes.
*/
@delay_max[ts, this->arc_sz_kb,
this->evicted_kb, this->l] = max(this->delta);
printa("%Y,%d,%d,%d,%@d\n", @delay_max);
trunc(@delay_max);
self->ts = 0;
}'
dtrace -qn '
/*
* Collect how many bytes ARC is being asked to evict and how many bytes ARC
* actually evicted, as well as what type these buffers being evicted are,
* i.e. metadata, data, etc.
*/
enum arc_buf_contents {
ARC_BUFC_DATA, /* buffer contains data */
ARC_BUFC_METADATA, /* buffer contains metadata */
ARC_BUFC_NUMTYPES
};
BEGIN {
ts = walltimestamp;
printf("timestamp,buf.type,kbytes.needed,kbytes.evicted\n");
}
::arc_evict_state:entry /ts/ {
self->typ = args[3];
self->bytes = args[2] >> 10;
}
::arc_evict_state:return /ts/ {
self->evicted = args[1] >> 10;
@bytes[ts, self->typ == ARC_BUFC_DATA ? "D" :
self->typ == ARC_BUFC_METADATA ?
"MD" : "OTHER" ] = sum(self->bytes);
@evicted[ts, self->typ == ARC_BUFC_DATA ? "D" :
self->typ == ARC_BUFC_METADATA ?
"MD" : "U" ] = sum(self->evicted);
}
tick-1sec {
printa("%Y,%s,%@d,%@d\n", @bytes, @evicted);
trunc(@bytes);
trunc(@evicted);
ts = walltimestamp;
}'
dtrace -qn '
/* Measure length of time to complete state and header evictions */
inline int LONG_CALL = 20000 ; /* nanoseconds */
::arc_evict_state:entry,::arc_evict_hdr*:entry {
self->s = timestamp ;
}
::arc_evict_state:return,::arc_evict_hdr*:return /self->s/ {
this->delta = timestamp - self->s ;
/*
* If we are above the LONG_CALL threshold, aggregate value
* under the key `probefunc`-long.
*/
this->lab = this->delta >= LONG_CALL ?
strjoin(probefunc, "-long") : probefunc ;
@[this->lab] = quantize(this->delta) ;
self->s = 0 ;
}'
dtrace -qn '
/* Measure number of times we missed acquiring a mutex lock in
* arc_evict_hdr and in arc_evict_state_impl.
*/
BEGIN {
printf("timestamp,function,mutex.misses,call.duration.us\n");
changed = 0;
f["arc_evict_hdr"] = "evict_hdr";
f["arc_evict_state_impl"] = "evict_state";
}
tick-1sec {
ts = walltimestamp;
}
::arc_evict_hdr:entry,
::arc_evict_state_impl:entry {
self->start = timestamp;
self->x[probefunc] = `arc_stats.arcstat_mutex_miss.value.ui64;
}
::arc_evict_hdr:return,
::arc_evict_state_impl:return /ts/ {
this->delta_t = (timestamp - self->start) / 1000;
this->delta_miss =
`arc_stats.arcstat_mutex_miss.value.ui64 - self->x[probefunc];
@[ts, f[probefunc]] = sum(this->delta_miss);
@max_t[ts, f[probefunc]] = max(this->delta_t);
changed += this->delta_miss > 0;
}
tick-1sec {
printa("%Y,%s,%@d,%@d\n", @, @max_t);
trunc(@);
trunc(@max_t);
changed = 0;
}
END {
exit(0);
}'