]> jfr.im git - munin-plugins.git/blame - nvme
support for a 2nd unbound plugin
[munin-plugins.git] / nvme
CommitLineData
0d8dd5f2
JR
1#! /usr/bin/perl
2# -*- mode: perl; perl-indent-level: 4 -*-
3
4=head1 NAME
5
6nvme - Munin plugin to monitor the use of NVMe devices
7
8=head1 APPLICABLE SYSTEMS
9
10Linux systems with NVMe (Non-Volatile Memory storage attached via PCIe
11bus).
12
13=head1 CONFIGURATION
14
15The plugin uses nvme(1) from the nvme-cli project to read status from
16the NVMe devices. This requires root access.
17
18 [nvme]
19 user root
20
21When setting alert levels per device, use graph and basename of device
22name, e.g., 'nvme0n1', to make environment variable:
23
24 env.nvme_usage_nvme0n1_warning 5:
25 env.nvme_usage_warning 8:
26
27If your device names change on reboot, you can also use the labels
28(based on serial numbers) to set the warning and critical levels:
29
30 env.nvme_usage_SN_1234567_warning 8:101
31 env.nvme_usage_SN_1234567_critical 5:101
32
33=head1 INTERPRETATION
34
35This is a multigraph plugin which makes four graphs.
36
37=head2 nvme_usage
38
39This reports how much of capacity is allocated in each NVMe
40"namespace". The report is in percent. This number may not have much
41relation to actual use, e.g., if deleted data areas have not been
42trimmed/discarded.
43
44Default warning and critical: '95', '98'
45
46=head2 nvme_bytes
47
48This reports read and write activity on each NVMe device, in bytes per
49second. Ideally there should be much more read than write. If they
50are symmetrical, you are using your NVMe as a very expensive FIFO, and
51if you write more than you read, you should probably look for archival
52storage instead.
53
54It is a good idea to compare these numbers to I/O counters from
55diskstats. If they are much higher, look into whether the write
56amplification can be due to suboptimal I/O request sizes.
57
58This graph does not support alerting.
59
60=head2 nvme_writecycles
61
62This graph is intended to give an indication of how much life there
63is left in your NVMe. It calculates the number of bytes written
64during each device's lifetime against the capacity of the device,
65thereby getting the average number of write cycles each cell has
66experienced.
67
68A prosumer NVMe will handle a few thousand writes to each cell before
69the error rate gets out of hand.
70
71No default values for warning and critical.
72
73=head2 nvme_spare
74
75Every NVMe device has reserve space set aside to remap media errors.
76This graphs how much is left in percent, taken directly from smart-log
77output.
78
79Default warning and critical: '10:', '3:'
80
81=head1 MAGIC MARKERS
82
83 #%# family=auto
84 #%# capabilities=autoconf
85
86=head1 BUGS
87
88None known.
89
90=head1 VERSION
91
92 1.1
93
94=head1 AUTHOR
95
96Kjetil Torgrim Homme <kjetil.homme@redpill-linpro.com>
97
98=head1 LICENSE
99
100GPLv2
101
102=cut
103
104use strict;
105use Munin::Plugin;
106use IPC::Cmd qw(can_run);
107use File::Basename;
108
109# Check that multigraph is supported
110need_multigraph();
111
# Explain why the plugin cannot run.  Returns nothing (undef in scalar
# context) when the nvme(1) executable is found, otherwise a short
# human-readable reason suitable for the autoconf "no (...)" answer.
sub autoconf_problem {
    unless (can_run('nvme')) {
        if (open(my $modules, '<', '/proc/modules')) {
            while (my $entry = <$modules>) {
                # The kernel driver is loaded, so only the CLI tool is absent.
                return "missing nvme(1)" if $entry =~ /^nvme[^a-z]/;
            }
            close($modules);
        }
        return "missing nvme"; # vague message for non-Linux
    }
    return;
}
123
# Run "nvme @args" without going through a shell and return its output
# lines.  The list is empty when nvme(1) is not installed or the pipe
# cannot be opened.  If the command exits non-zero while we are not
# running as root, a hint about the likely cause is written to STDERR.
sub run_nvme {
    my @args = @_;
    my @output;

    return @output unless can_run('nvme');
    open(my $pipe, '-|', 'nvme', @args) or return @output;

    @output = <$pipe>;
    close($pipe);    # sets $? to the exit status of the child
    if ($? and $> != 0) {
        warn "nvme: probably needs to run as user root\n";
    }
    return @output;
}
134
# Convert a size such as "695.50 GB" or "512   B" into a whole number
# of bytes.
#
# Parameters:
#   $str - text containing a decimal number followed by one of the SI
#          units B, kB, MB, GB, TB or PB (whitespace between number and
#          unit is optional).
#
# Returns the size in bytes, rounded to the nearest integer, or undef
# when no size can be recognised in $str.
sub human_to_bytes {
    my ($str) = @_;
    my %units = (
        B  => 1,
        kB => 1000,
        MB => 1000_000,
        GB => 1000_000_000,
        TB => 1000_000_000_000,
        PB => 1000_000_000_000_000, # I wish I had need for this
        );

    # An explicit undef (not a bare return) is deliberate: callers use
    # the result inside a hash constructor, where an empty list would
    # shift the following key/value pairs.
    return undef unless defined $str;

    # Capture into lexicals straight from the match so that a failed
    # match can never pick up stale $1/$3 from an earlier regex.
    my ($number, $unit) = $str =~ /(\d+(?:\.\d+)?)\s*([kMGTP]?B)\b/
        or return undef;

    # Round instead of truncating: e.g. 0.29 * 1000 may be represented
    # just below 290 in floating point.
    return int($number * $units{$unit} + 0.5);
}
147
# Parse the table printed by "nvme list" into a hash reference keyed by
# "SN_<serial number>".  Each entry holds the device node, serial
# number, model, namespace id, and the usage and capacity in bytes.
#
# Expected layout (columns separated by runs of spaces):
#
#   Node          SN               Model                    Namespace Usage                   Format       FW Rev
#   ------------- ---------------- ------------------------ --------- ----------------------- ------------ --------
#   /dev/nvme1n1  S464NB0K601188N  Samsung SSD 970 EVO 2TB  1         695.50 GB /   2.00 TB   512 B + 0 B  1B2QEXE7
#
# A warning is written to STDERR when there was output but it did not
# look like the table above.
sub nvme_list {
    my %found;
    my $header_seen;
    my $count = 0;

    for my $line (run_nvme('list')) {
        $count++;

        if ($line =~ m:^Node\s+SN\s+Model\s+Namespace Usage:) {
            $header_seen++;
            next;
        }

        if (my ($node, $serial, $model, $ns, $used, $total) =
                $line =~ m:^(/\S+)\s+(\S+)\s+(\S.*\S)\s{3,}(\d+)\s+(\S+\s+.B)\s+/\s+(\S+\s+.B):) {
            $found{'SN_'.$serial} = {
                device    => $node,
                sn        => $serial,
                model     => $model,
                namespace => $ns,
                usage     => human_to_bytes($used),
                capacity  => human_to_bytes($total),
            };
            next;
        }

        # The first two lines are the header and its ruler; any later
        # line that is not a device row means the format is unknown.
        $header_seen = 0 if $count > 2;
    }

    warn "Could not recognise output from 'nvme list', please report\n"
        if $count && !$header_seen;

    return \%found;
}
179
# Read "nvme smart-log $dev" and return a hash reference of its
# "name : value" lines.
#
# Parameters:
#   $dev - device node handed to nvme(1), e.g. the "device" entry
#          produced by nvme_list().
#
# Keys are the lower-cased field names with whitespace replaced by
# underscores (e.g. "data_units_written").  Integer values have their
# thousands separators removed; a trailing parenthesised or bracketed
# annotation after such an integer (e.g. "5,271,735 (2.70 TB)") is
# dropped so that the value can be used directly as a number.  All
# other values are stored as printed.
sub smart_log {
    my ($dev) = @_;
    my %info;
    for my $line (run_nvme('smart-log', $dev)) {
        next if $line =~ /^Smart Log/;
        next unless $line =~ /(.*?)\s+:\s+(.*)/;

        my ($var, $value) = ($1, $2);
        $var =~ s/\s/_/g;

        # Plain or comma-grouped integer, optionally followed by an
        # annotation.  Values such as "35 C (308 Kelvin)" do not match
        # because the annotation must directly follow the number.
        if ($value =~ /^(\d+(?:,\d\d\d)*)(?:\s+[\[(].*[\])])?$/) {
            ($value = $1) =~ tr/,//d;
        }
        $info{lc $var} = $value;
    }
    return \%info;
}
196
# Print the ".warning" and ".critical" config lines for one data series.
#
# Parameters:
#   $label        - munin field name (the "SN_<serial>" key)
#   $graph        - graph name, used as prefix of the environment variables
#   $device       - device node; only its basename is used
#   $warn_default - warning level used when nothing is configured (optional)
#   $crit_default - critical level used when nothing is configured (optional)
#
# Two lookups are chained through get_thresholds() from Munin::Plugin:
# the result for the serial-number based variables is passed as the
# default of the lookup for the device-name based variables.  Lines are
# printed only for levels that end up defined.
sub my_print_thresholds {
    my ($label, $graph, $device, $warn_default, $crit_default) = @_;

    my $by_label  = join('_', $graph, $label);
    my $by_device = join('_', $graph, basename($device));

    my ($label_warn, $label_crit) = get_thresholds(
        $graph, $by_label . '_warning', $by_label . '_critical',
        $warn_default, $crit_default);
    my ($warn, $crit) = get_thresholds(
        $graph, $by_device . '_warning', $by_device . '_critical',
        $label_warn, $label_crit);

    if (defined $warn) {
        print "${label}.warning $warn\n";
    }
    print "${label}.critical $crit\n" if defined $crit;
}
207
208use Data::Dumper;
209
# Main program.  The first command line argument selects the munin run
# mode: "autoconf", "config", or anything else / nothing to print the
# current values.
my $mode = ($ARGV[0] or "print");

my $problem = autoconf_problem();
my $list = nvme_list();

# Return $value unchanged when it is a plain non-negative decimal
# number, otherwise "U", which munin treats as "value unknown".
sub _value_or_unknown {
    my ($value) = @_;
    return 'U' unless defined $value && $value =~ /^\d+(?:\.\d+)?$/;
    return $value;
}

if ($mode eq 'autoconf') {
    if (keys %{$list}) {
        print "yes\n";
    } else {
        printf("no (%s)\n", $problem || "no devices to monitor");
    }
    exit 0;
}

my @sn = sort keys %{$list};

if ($mode eq 'config') {
    my $sn_list = join(' ', @sn);
    # The bytes graph has two fields per device, so it needs its own order.
    my $rw_list = join(' ', map { ("${_}_r", "${_}_w") } @sn);

    # The heredocs below must interpolate so that graph_order receives
    # the actual field names.
    print <<"EOF";
multigraph nvme_usage
graph_title NVME Namespace Usage
graph_order $sn_list
graph_vlabel Percent used
graph_scale no
graph_category disk
graph_info How much space is used
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device used
$sn.type GAUGE
$sn.max 100
$sn.min 0
EOF
        my_print_thresholds($sn, 'nvme_usage', $device, '95', '98');
    }

    # \${graph_period} is escaped: it is expanded by munin, not by Perl.
    print <<"EOF";
multigraph nvme_bytes
graph_title NVME Bytes Read / Written
graph_order $rw_list
graph_vlabel bytes read (-) / written (+) per \${graph_period}
graph_category disk
graph_info How much data is read and written
graph_period second
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        # The SMART counters are in units of 1000 blocks of 512 bytes;
        # the cdef scales them to bytes at graph time so that the
        # stored counter values stay as they were.
        print <<"EOF";
${sn}_r.label $device
${sn}_r.type DERIVE
${sn}_r.min 0
${sn}_r.graph no
${sn}_r.cdef ${sn}_r,512000,*
${sn}_w.label $device
${sn}_w.type DERIVE
${sn}_w.min 0
${sn}_w.negative ${sn}_r
${sn}_w.cdef ${sn}_w,512000,*
EOF
    }

    print <<"EOF";
multigraph nvme_writecycles
graph_title NVME Write Cycles
graph_order $sn_list
graph_vlabel Cycles
graph_args --logarithmic
graph_category disk
graph_info How much data has been written in lifetime divided by capacity
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device write cycles
$sn.type GAUGE
$sn.min 0
EOF
        my_print_thresholds($sn, 'nvme_writecycles', $device);
    }

    print <<"EOF";
multigraph nvme_spare
graph_title Available spare blocks
graph_order $sn_list
graph_vlabel Percent
graph_category disk
graph_info Spare capacity for replacing bad blocks
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device spare capacity
$sn.type GAUGE
$sn.min 0
$sn.max 100
EOF
        my_print_thresholds($sn, 'nvme_spare', $device, '10:', '3:');
    }
} else {
    # Fetch the SMART log once per device; all graphs below read from it.
    for my $sn (@sn) {
        $list->{$sn}->{smart} = smart_log($list->{$sn}->{device});
    }

    print "multigraph nvme_usage\n";
    for my $sn (@sn) {
        my $info = $list->{$sn};
        # Guard against a zero or unparsed capacity (division by zero).
        my $used = ($info->{capacity} && defined $info->{usage})
            ? 100 * $info->{usage} / $info->{capacity}
            : 'U';
        print "$sn.value $used\n";
    }

    print "multigraph nvme_bytes\n";
    for my $sn (@sn) {
        my $smart = $list->{$sn}->{smart};
        my $rbytes = _value_or_unknown($smart->{data_units_read});
        my $wbytes = _value_or_unknown($smart->{data_units_written});
        print "${sn}_r.value $rbytes\n";
        print "${sn}_w.value $wbytes\n";
    }

    print "multigraph nvme_writecycles\n";
    for my $sn (@sn) {
        my $info = $list->{$sn};
        my $written = _value_or_unknown($info->{smart}->{data_units_written});

        # The unit size reported is 1000 blocks.
        my $cycles = ($written ne 'U' && $info->{capacity})
            ? $written * 512_000 / $info->{capacity}
            : 'U';
        print "$sn.value $cycles\n";
    }

    print "multigraph nvme_spare\n";
    for my $sn (@sn) {
        # available_spare is printed as a percentage, e.g. "100%".
        my $spare = $list->{$sn}->{smart}->{available_spare};
        $spare =~ s/%// if defined $spare;
        $spare = _value_or_unknown($spare);
        print "$sn.value $spare\n";
    }
}