#!/bin/sh
#
# plugin for munin to monitor usage of unbound servers.
# To install copy this to /usr/local/share/munin/plugins/unbound_munin_
# and use munin-node-configure (--suggest, --shell).
#
# (C) 2008 W.C.A. Wijngaards. BSD Licensed.
#
# To install; enable statistics and unbound-control in unbound.conf
#   server:          extended-statistics: yes
#                    statistics-cumulative: no
#                    statistics-interval: 0
#   remote-control:  control-enable: yes
# Run the command unbound-control-setup to generate the key files.
#
# Environment variables for this script
#   router          - host that runs unbound; statistics are fetched from it
#                     through an SSH tunnel (root@$router)
#   unbound_conf    - where the unbound.conf file is located.
#   unbound_control - where to find unbound-control executable.
#   spoof_warn      - what level to warn about spoofing
#   spoof_crit      - what level to crit about spoofing
# Note: the code in this file does not reference unbound_conf or
# unbound_control; the unbound-control invocation uses fixed paths.
#
# You can set them in your munin/plugin-conf.d/plugins.conf file
# with:
# [unbound*]
# user root
# env.router 10.1.0.1
# env.unbound_conf /usr/local/etc/unbound/unbound.conf
# env.unbound_control /usr/local/sbin/unbound-control
# env.spoof_warn 1000
# env.spoof_crit 100000
#
# This plugin can create different graphs depending on what name
# you link it as (with ln -s) into the plugins directory
# You can link it multiple times.
# If you are only a casual user, the _hits and _by_type are most interesting,
# possibly followed by _by_rcode.
#
#   unbound_munin_hits      - base volume, cache hits, unwanted traffic
#   unbound_munin_queue     - to monitor the internal requestlist
#   unbound_munin_memory    - memory usage
#   unbound_munin_by_type   - incoming queries by type
#   unbound_munin_by_class  - incoming queries by class
#   unbound_munin_by_opcode - incoming queries by opcode
#   unbound_munin_by_rcode  - answers by rcode, validation status
#   unbound_munin_by_flags  - incoming queries by flags
#   unbound_munin_histogram - histogram of query resolving times
#
# Magic markers - optional - used by installation scripts and
# munin-config: (originally contrib family but munin-node-configure ignores it)
#
#%# family=auto
#%# capabilities=autoconf suggest

# POD documentation
# The text is fed to the no-op ':' command through a here-document, so the
# shell skips it while POD tools can still extract it.  The delimiter is
# quoted so that nothing inside the documentation is ever expanded.
: <<'=cut'
=head1 NAME

unbound_munin_ - Munin plugin to monitor the Unbound DNS resolver.

=head1 APPLICABLE SYSTEMS

System with unbound daemon.

=head1 CONFIGURATION

 [unbound*]
 user root
 env.router 10.1.0.1
 env.spoof_warn 1000
 env.spoof_crit 100000

Use the .env settings to override the defaults.
env.router is the host whose unbound statistics are fetched; the plugin
connects to it as root over SSH.

=head1 USAGE

Can be used to present different graphs. Use ln -s for that name in
the plugins directory to enable the graph.

 unbound_munin_hits      - base volume, cache hits, unwanted traffic
 unbound_munin_queue     - to monitor the internal requestlist
 unbound_munin_memory    - memory usage
 unbound_munin_by_type   - incoming queries by type
 unbound_munin_by_class  - incoming queries by class
 unbound_munin_by_opcode - incoming queries by opcode
 unbound_munin_by_rcode  - answers by rcode, validation status
 unbound_munin_by_flags  - incoming queries by flags
 unbound_munin_histogram - histogram of query resolving times

=head1 AUTHOR

Copyright 2008 W.C.A. Wijngaards

=head1 LICENSE

BSD

=cut

# Derive the name shared by all links of one plugin instance from the path
# the script was invoked as: ".../unbound_munin_hits" -> "unbound",
# ".../unbound2_munin_hits" -> "unbound2".  A path that does not match is
# returned unchanged.
#   $1 - invocation path
# The optional "2" is written as \{0,1\} because \? is a GNU extension and
# is not understood by every sed.
plugin_state_name() {
  echo "$1" | sed -e 's/^.*unbound\(2\{0,1\}\)_munin_.*$/unbound\1/'
}

my_state_name=$(plugin_state_name "$0")
# cached output of "unbound-control stats", shared by all links
state="${MUNIN_PLUGSTATE}/$my_state_name.state"
# list of every num.* counter name seen so far
seentags="${MUNIN_PLUGSTATE}/$my_state_name-seentags.state"
# thresholds for the unwanted-replies field of the hits graph
warn=${spoof_warn:-1000}
crit=${spoof_crit:-100000}
lock=$state.lock

# number of seconds between polling attempts.
# makes the statefile hang around for at least this many seconds,
# so that multiple links of this script can share the results.
lee=55

# to keep things within 19 characters
# (a list of sed expressions; it is expanded unquoted on purpose so that
# every -e becomes a separate argument)
ABBREV="-e s/total/t/ -e s/thread/t/ -e s/num/n/ -e s/query/q/ -e s/answer/a/ -e s/unwanted/u/ -e s/requestlist/ql/ -e s/type/t/ -e s/class/c/ -e s/opcode/o/ -e s/rcode/r/ -e s/edns/e/ -e s/mem/m/ -e s/cache/c/ -e s/mod/m/"

# get value from $1 into return variable $value
#   $1 - statistic name as it appears left of '=' in the state file
#        (it is used as a regular expression, so '.' matches any character)
# Reads the file named by $state; sets $value to the text right of '=',
# or to "0" when the name is not present.
get_value() {
  value=$(grep "^$1=" "$state" | sed -e 's/^.*=//')
  if test -z "$value"; then
    value="0"
  fi
}

# Update list of seen query types etc to seentags file. This is run while
# holding the lock, after the state file is updated.
# Reads $state and $seentags, rewrites $seentags.  Names are only ever
# added, so a counter that was seen once stays in the list.
update_seentags() {
  # The previous list is read into a variable first, because the output
  # redirection below truncates the very same file.  The four names that
  # follow are always part of the list.
  tmplist="$(cat "${seentags}" 2> /dev/null)
num.query.type.A
num.query.class.IN
num.query.opcode.QUERY
num.answer.rcode.NOERROR
"
  (echo "${tmplist}"; grep '^num' "${state}" | sed -e 's/=.*//') | sort -u > "${seentags}"
}

# download the state from the unbound server.
# Uses the globals $lock, $state, $lee and $router and the helpers
# get_value and update_seentags.  On return $state holds statistics that
# are at most $lee seconds old and the lock has been released.  Exits the
# script with status 1 when the lock cannot be obtained or the download
# fails.
get_state() {
  # obtain lock for fetching the state
  # because there is a race condition in fetching and writing to file

  # see if the lock is stale, if so, take it
  if test -f "$lock"; then
    pid=$(cat "$lock" 2>&1)
    if ! kill -0 "$pid" >/dev/null 2>&1 && test "$pid" != "$$"; then
      echo $$ > "$lock"
    fi
  fi

  i=0
  while test ! -f "$lock" || test "$(cat "$lock" 2>&1)" != "$$"; do
    while test -f "$lock"; do
      # wait: spin for the first 1000 rounds, then sleep a second per round
      i=$((i + 1))
      if test "$i" -gt 1000; then
        sleep 1
      fi
      if test "$i" -gt 1500; then
        echo "error locking $lock" "=" "$(cat "$lock")"
        rm -f "$lock"
        exit 1
      fi
    done
    # try to get it
    if echo $$ > "$lock"; then :; else break; fi
  done

  # do not refetch if the file exists and only LEE seconds old
  if test -f "$state"; then
    now=$(date +%s)
    get_value "time.now"
    value=$(echo "$value" | sed -e 's/\..*$//')
    # anything that is not a plain number counts as "very old"
    case "$value" in
    '' | *[!0-9]*) value=0 ;;
    esac
    if test "$now" -lt "$((value + lee))"; then
      rm -f "$lock"
      return 0
    fi
  fi

  # NOTE(review): the socket names in /tmp are predictable; consider a
  # private directory created with mktemp -d.
  ctl_sock="/tmp/unbound-ssh.$$.sock"
  fwd_sock="/tmp/unbound-control.$$.sock"
  # The new statistics go to a temporary file that is renamed over $state
  # only on success, so a failed or half-finished download never replaces
  # the previous state.
  new_state="$state.tmp.$$"

  # open an SSH tunnel
  ssh -o ControlPath="$ctl_sock" -M -o ExitOnForwardFailure=yes -p 2222 -fnN -L "$fwd_sock:/tmp/unbound-control.sock" "root@$router" >/dev/null
  # run unbound-control over the SSH tunnel socket
  unbound-control -c /etc/unbound/unbound-control.conf -s "$fwd_sock" stats > "$new_state"
  fetch_rc=$?
  # close the SSH tunnel, whether or not the download worked
  ssh -o ControlPath="$ctl_sock" -O exit "root@$router" >/dev/null
  # but it doesn't remove the socket, so do that
  rm -f "$ctl_sock" "$fwd_sock"

  if test "$fetch_rc" -ne 0; then
    echo "error retrieving data from unbound server"
    rm -f "$new_state" "$lock"
    exit 1
  fi
  mv -f "$new_state" "$state"

  update_seentags
  rm -f "$lock"
}

# Answer munin-node-configure's "autoconf" question on stdout: "yes", or
# "no (reason)".
# Uses the globals $conf and $state.  $conf is not assigned anywhere in the
# part of the script visible here, so the configuration-file check only
# applies when it is set; the directory of the state file must exist.
autoconf_report() {
  if test -n "${conf:-}" && test ! -f "$conf"; then
    echo no "($conf does not exist)"
    return 0
  fi
  state_dir=$(dirname "$state")
  if test ! -d "$state_dir"; then
    echo no "($state_dir directory does not exist)"
    return 0
  fi
  echo yes
}

if test "${1:-}" = "autoconf"; then
  autoconf_report
  exit 0
fi

# Print the graph names this plugin can be linked as, one per line, for
# munin-node-configure --suggest.
suggest_graphs() {
  printf '%s\n' \
    "hits" \
    "queue" \
    "memory" \
    "by_type" \
    "by_class" \
    "by_opcode" \
    "by_rcode" \
    "by_flags" \
    "histogram"
}

if test "${1:-}" = "suggest"; then
  suggest_graphs
  exit 0
fi

# determine my type, by name
# Prints the graph id encoded in the invocation path $1, i.e. whatever
# follows "unbound_munin_" or "unbound2_munin_".  A path that does not
# match is printed unchanged.
# The optional "2" is written as \{0,1\} because \? is a GNU extension.
plugin_graph_id() {
  graph_id=$(echo "$1" | sed -e 's/^.*unbound2\{0,1\}_munin_//')
  if test -z "$graph_id"; then
    # some default to keep people sane.
    graph_id="hits"
  fi
  echo "$graph_id"
}

id=$(plugin_graph_id "$0")

# if $1 exists in statefile, config is echoed with label $2
#   $1 - statistic name in unbound format (used as a regular expression)
#   $2 - label for the field
# The field name is $1 shortened with the $ABBREV rules and with dots
# turned into underscores; the field is always of type ABSOLUTE.
exist_config() {
  # $ABBREV is left unquoted on purpose: it is a list of sed arguments
  mn=$(echo "$1" | sed $ABBREV | tr . _)
  if grep "^$1=" "$state" >/dev/null 2>&1; then
    echo "$mn.label $2"
    echo "$mn.min 0"
    echo "$mn.type ABSOLUTE"
  fi
}

# print label and min 0 for a name $1 in unbound format
#   $1 - statistic name in unbound format
#   $2 - label for the field
#   $3 - munin field type (the callers in this file pass ABSOLUTE or GAUGE)
# The field name is $1 shortened with the $ABBREV rules and with dots
# turned into underscores.
p_config() {
  # $ABBREV is left unquoted on purpose: it is a list of sed arguments
  mn=$(echo "$1" | sed $ABBREV | tr . _)
  echo "$mn.label $2"
  echo "$mn.min 0"
  echo "$mn.type $3"
}

# Print the attributes shared by every graph that counts in base 1000.
#   $1 - graph title
#   $2 - label of the vertical axis
graph_head() {
  echo "graph_title $1"
  echo "graph_args --base 1000 -l 0"
  echo "graph_vlabel $2"
  echo "graph_scale no"
  echo "graph_category dns"
  echo "graph_period minute"
}

# Configure one ABSOLUTE field for every name in $seentags that starts
# with $1; the label is the name with that prefix (and the character
# following it) removed.
seentag_config() {
  for nm in $(grep "^$1" "$seentags"); do
    tp=$(echo "$nm" | sed -e "s/$1.//")
    p_config "$nm" "$tp" "ABSOLUTE"
  done
}

# Configure one field of the reply-time histogram.
#   $1 - field name, $2 - label, $3 - draw mode, $4 - colour
hist_config() {
  echo "$1.label $2"
  echo "$1.min 0"
  echo "$1.type ABSOLUTE"
  echo "$1.draw $3"
  echo "$1.colour $4"
}

# Print the munin configuration of graph $1 (hits, queue, memory, by_type,
# by_class, by_opcode, by_rcode, by_flags or histogram).  Any other name
# prints nothing.  Uses $state, $seentags, $warn, $crit and the helpers
# p_config and exist_config.
# The '${graph_period}' in the labels is meant literally, it is expanded
# by munin and not by the shell.
config_graph() {
  case "$1" in
  hits)
    # shellcheck disable=SC2016
    graph_head "Unbound DNS traffic and cache hits" 'queries / ${graph_period}'
    for x in $(grep "^thread[0-9][0-9]*\.num\.queries=" "$state" |
      sed -e 's/=.*//'); do
      exist_config "$x" "queries handled by $(basename "$x" .num.queries)"
    done
    p_config "total.num.queries" "total queries from clients" "ABSOLUTE"
    p_config "total.num.cachehits" "cache hits" "ABSOLUTE"
    p_config "total.num.prefetch" "cache prefetch" "ABSOLUTE"
    p_config "num.query.tcp" "TCP queries" "ABSOLUTE"
    p_config "num.query.tcpout" "TCP out queries" "ABSOLUTE"
    p_config "num.query.udpout" "UDP out queries" "ABSOLUTE"
    p_config "num.query.tls" "TLS queries" "ABSOLUTE"
    p_config "num.query.tls.resume" "TLS resumes" "ABSOLUTE"
    p_config "num.query.ipv6" "IPv6 queries" "ABSOLUTE"
    p_config "unwanted.queries" "queries that failed acl" "ABSOLUTE"
    p_config "unwanted.replies" "unwanted or unsolicited replies" "ABSOLUTE"
    echo "u_replies.warning $warn"
    echo "u_replies.critical $crit"
    echo "graph_info DNS queries to the recursive resolver. The unwanted replies could be innocent duplicate packets, late replies, or spoof threats."
    ;;
  queue)
    graph_head "Unbound requestlist size" "number of queries"
    p_config "total.requestlist.avg" "Average size of queue on insert" "GAUGE"
    p_config "total.requestlist.max" "Max size of queue (in 5 min)" "GAUGE"
    p_config "total.requestlist.overwritten" "Number of queries replaced by new ones" "GAUGE"
    p_config "total.requestlist.exceeded" "Number of queries dropped due to lack of space" "GAUGE"
    echo "graph_info The queries that did not hit the cache and need recursion service take up space in the requestlist. If there are too many queries, first queries get overwritten, and at last resort dropped."
    ;;
  memory)
    # base 1024 and no graph_scale line, unlike the other graphs
    echo "graph_title Unbound memory usage"
    echo "graph_args --base 1024 -l 0"
    echo "graph_vlabel memory used in bytes"
    echo "graph_category dns"
    echo "graph_period minute"
    p_config "mem.cache.rrset" "RRset cache memory" "GAUGE"
    p_config "mem.cache.message" "Message cache memory" "GAUGE"
    p_config "mem.mod.iterator" "Iterator module memory" "GAUGE"
    p_config "mem.mod.validator" "Validator module and key cache memory" "GAUGE"
    p_config "msg.cache.count" "msg cache count" "GAUGE"
    p_config "rrset.cache.count" "rrset cache count" "GAUGE"
    p_config "infra.cache.count" "infra cache count" "GAUGE"
    p_config "key.cache.count" "key cache count" "GAUGE"
    echo "graph_info The memory used by unbound."
    ;;
  by_type)
    # shellcheck disable=SC2016
    graph_head "Unbound DNS queries by type" 'queries / ${graph_period}'
    seentag_config "num.query.type"
    echo "graph_info queries by DNS RR type queried for"
    ;;
  by_class)
    # shellcheck disable=SC2016
    graph_head "Unbound DNS queries by class" 'queries / ${graph_period}'
    seentag_config "num.query.class"
    echo "graph_info queries by DNS RR class queried for."
    ;;
  by_opcode)
    # shellcheck disable=SC2016
    graph_head "Unbound DNS queries by opcode" 'queries / ${graph_period}'
    seentag_config "num.query.opcode"
    echo "graph_info queries by opcode in the query packet."
    ;;
  by_rcode)
    # shellcheck disable=SC2016
    graph_head "Unbound DNS answers by return code" 'answer packets / ${graph_period}'
    seentag_config "num.answer.rcode"
    p_config "num.answer.secure" "answer secure" "ABSOLUTE"
    p_config "num.answer.bogus" "answer bogus" "ABSOLUTE"
    p_config "num.rrset.bogus" "num rrsets marked bogus" "ABSOLUTE"
    echo "graph_info answers sorted by return value. rrsets bogus is the number of rrsets marked bogus per \${graph_period} by the validator"
    ;;
  by_flags)
    # shellcheck disable=SC2016
    graph_head "Unbound DNS incoming queries by flags" 'queries / ${graph_period}'
    p_config "num.query.flags.QR" "QR (query reply) flag" "ABSOLUTE"
    p_config "num.query.flags.AA" "AA (auth answer) flag" "ABSOLUTE"
    p_config "num.query.flags.TC" "TC (truncated) flag" "ABSOLUTE"
    p_config "num.query.flags.RD" "RD (recursion desired) flag" "ABSOLUTE"
    p_config "num.query.flags.RA" "RA (rec avail) flag" "ABSOLUTE"
    p_config "num.query.flags.Z" "Z (zero) flag" "ABSOLUTE"
    p_config "num.query.flags.AD" "AD (auth data) flag" "ABSOLUTE"
    p_config "num.query.flags.CD" "CD (check disabled) flag" "ABSOLUTE"
    p_config "num.query.edns.present" "EDNS OPT present" "ABSOLUTE"
    p_config "num.query.edns.DO" "DO (DNSSEC OK) flag" "ABSOLUTE"
    echo "graph_info This graphs plots the flags inside incoming queries. For example, if QR, AA, TC, RA, Z flags are set, the query can be rejected. RD, AD, CD and DO are legitimately set by some software."
    ;;
  histogram)
    # shellcheck disable=SC2016
    graph_head "Unbound DNS histogram of reply time" 'queries / ${graph_period}'
    # cache hits as the base area, the time buckets stacked on top of it
    hist_config hcache "cache hits" AREA 999999
    hist_config h64ms "0 msec - 66 msec" STACK 0000FF
    hist_config h128ms "66 msec - 131 msec" STACK 1F00DF
    hist_config h256ms "131 msec - 262 msec" STACK 3F00BF
    hist_config h512ms "262 msec - 524 msec" STACK 5F009F
    hist_config h1s "524 msec - 1 sec" STACK 7F007F
    hist_config h2s "1 sec - 2 sec" STACK 9F005F
    hist_config h4s "2 sec - 4 sec" STACK BF003F
    hist_config h8s "4 sec - 8 sec" STACK DF001F
    hist_config h16s "8 sec - ..." STACK FF0000
    echo "graph_info Histogram of the reply times for queries."
    ;;
  esac
}

if test "${1:-}" = "config"; then
  # the thread list of the hits graph is read from the state file
  if test ! -f "$state"; then
    get_state
  fi
  echo "host_name $router"
  config_graph "$id"

  exit 0
fi

# do the stats itself
get_state

# get the time elapsed
# get_value yields "0" when the statistic is missing, so this also catches
# a state file without usable data.
get_value "time.elapsed"
if test "$value" = 0 || test "$value" = "0.000000"; then
  echo "error: time elapsed 0 or could not retrieve data"
  exit 1
fi
elapsed="$value"

# print value for $1
#   $1 - statistic name in unbound format
# Prints "<field>.value <value>", where the field name is $1 shortened
# with the $ABBREV rules and with dots turned into underscores, and the
# value is whatever get_value returns for $1.
print_value() {
  # $ABBREV is left unquoted on purpose: it is a list of sed arguments
  mn=$(echo "$1" | sed $ABBREV | tr . _)
  get_value "$1"
  echo "$mn.value $value"
}

# print value if line already found in $2
#   $1 - statistic name in unbound format
#   $2 - complete "name=value" line
# Prints "<field>.value <value>" with the value taken from $2, without
# looking at the state file.
print_value_line() {
  # $ABBREV is left unquoted on purpose: it is a list of sed arguments
  mn=$(echo "$1" | sed $ABBREV | tr . _)
  value=$(printf '%s\n' "$2" | sed -e 's/^.*=//')
  echo "$mn.value $value"
}


# Print the value of every name in $seentags that starts with $1.
seentag_values() {
  for nm in $(grep "^$1" "$seentags"); do
    print_value "$nm"
  done
}

# Print "$1.value <sum>", the sum being taken over the statistics named
# by the remaining arguments.
hist_sum() {
  fld=$1
  shift
  r=0
  for x in "$@"; do
    get_value "$x"
    r=$((r + value))
  done
  echo "$fld.value $r"
}

# Print the current values of graph $1 (hits, queue, memory, by_type,
# by_class, by_opcode, by_rcode, by_flags or histogram).  Any other name
# prints nothing.  Uses $state, $seentags and the helpers get_value and
# print_value.
print_values() {
  case "$1" in
  hits)
    # per-thread counters first, then the totals; only names that are
    # present in the state file are printed
    for x in $(grep "^thread[0-9][0-9]*\.num\.queries=" "$state" |
      sed -e 's/=.*//') total.num.queries \
      total.num.cachehits total.num.prefetch num.query.tcp \
      num.query.tcpout num.query.udpout num.query.tls num.query.tls.resume \
      num.query.ipv6 unwanted.queries \
      unwanted.replies; do
      if grep "^$x=" "$state" >/dev/null 2>&1; then
        print_value "$x"
      fi
    done
    ;;
  queue)
    for x in total.requestlist.avg total.requestlist.max \
      total.requestlist.overwritten total.requestlist.exceeded; do
      print_value "$x"
    done
    ;;
  memory)
    for x in mem.cache.rrset mem.cache.message mem.mod.iterator \
      mem.mod.validator msg.cache.count rrset.cache.count \
      infra.cache.count key.cache.count; do
      print_value "$x"
    done
    ;;
  by_type)
    seentag_values "num.query.type"
    ;;
  by_class)
    seentag_values "num.query.class"
    ;;
  by_opcode)
    seentag_values "num.query.opcode"
    ;;
  by_rcode)
    seentag_values "num.answer.rcode"
    print_value "num.answer.secure"
    print_value "num.answer.bogus"
    print_value "num.rrset.bogus"
    ;;
  by_flags)
    for x in num.query.flags.QR num.query.flags.AA num.query.flags.TC \
      num.query.flags.RD num.query.flags.RA num.query.flags.Z \
      num.query.flags.AD num.query.flags.CD num.query.edns.present \
      num.query.edns.DO; do
      print_value "$x"
    done
    ;;
  histogram)
    get_value total.num.cachehits
    echo "hcache.value $value"
    # every bucket below 0.065536 seconds is folded into one field
    hist_sum h64ms \
      histogram.000000.000000.to.000000.000001 \
      histogram.000000.000001.to.000000.000002 \
      histogram.000000.000002.to.000000.000004 \
      histogram.000000.000004.to.000000.000008 \
      histogram.000000.000008.to.000000.000016 \
      histogram.000000.000016.to.000000.000032 \
      histogram.000000.000032.to.000000.000064 \
      histogram.000000.000064.to.000000.000128 \
      histogram.000000.000128.to.000000.000256 \
      histogram.000000.000256.to.000000.000512 \
      histogram.000000.000512.to.000000.001024 \
      histogram.000000.001024.to.000000.002048 \
      histogram.000000.002048.to.000000.004096 \
      histogram.000000.004096.to.000000.008192 \
      histogram.000000.008192.to.000000.016384 \
      histogram.000000.016384.to.000000.032768 \
      histogram.000000.032768.to.000000.065536
    # the buckets in between map one to one onto a field ("field:name")
    for pair in \
      h128ms:histogram.000000.065536.to.000000.131072 \
      h256ms:histogram.000000.131072.to.000000.262144 \
      h512ms:histogram.000000.262144.to.000000.524288 \
      h1s:histogram.000000.524288.to.000001.000000 \
      h2s:histogram.000001.000000.to.000002.000000 \
      h4s:histogram.000002.000000.to.000004.000000 \
      h8s:histogram.000004.000000.to.000008.000000; do
      get_value "${pair#*:}"
      echo "${pair%%:*}.value $value"
    done
    # every bucket from 8 seconds upwards is folded into one field
    hist_sum h16s \
      histogram.000008.000000.to.000016.000000 \
      histogram.000016.000000.to.000032.000000 \
      histogram.000032.000000.to.000064.000000 \
      histogram.000064.000000.to.000128.000000 \
      histogram.000128.000000.to.000256.000000 \
      histogram.000256.000000.to.000512.000000 \
      histogram.000512.000000.to.001024.000000 \
      histogram.001024.000000.to.002048.000000 \
      histogram.002048.000000.to.004096.000000 \
      histogram.004096.000000.to.008192.000000 \
      histogram.008192.000000.to.016384.000000 \
      histogram.016384.000000.to.032768.000000 \
      histogram.032768.000000.to.065536.000000 \
      histogram.065536.000000.to.131072.000000 \
      histogram.131072.000000.to.262144.000000 \
      histogram.262144.000000.to.524288.000000
    ;;
  esac
}

print_values "$id"