]> jfr.im git - munin-plugins.git/blob - nvme
support for a 2nd unbound plugin
[munin-plugins.git] / nvme
1 #! /usr/bin/perl
2 # -*- mode: perl; perl-indent-level: 4 -*-
3
4 =head1 NAME
5
6 nvme - Munin plugin to monitor the use of NVMe devices
7
8 =head1 APPLICABLE SYSTEMS
9
10 Linux systems with NVMe (Non-Volatile Memory storage attached via PCIe
11 bus).
12
13 =head1 CONFIGURATION
14
15 The plugin uses nvme(1) from the nvme-cli project to read status from
16 the NVMe devices. This requires root access.
17
18 [nvme]
19 user root
20
21 When setting alert levels per device, combine the graph name and the
22 basename of the device name, e.g., 'nvme0n1', to form the environment variable:
23
24 env.nvme_usage_nvme0n1_warning 5:
25 env.nvme_usage_warning 8:
26
27 If your device names change on reboot you can also use the labels
28 (based on serial numbers) to set the warning and critical levels:
29
30 env.nvme_usage_SN_1234567_warning 8:101
31 env.nvme_usage_SN_1234567_critical 5:101
32
33 =head1 INTERPRETATION
34
35 This is a multigraph plugin which makes four graphs.
36
37 =head2 nvme_usage
38
39 This reports how much of the capacity is allocated in each NVMe
40 "namespace". The report is in percent. This number may not have much
41 relation to actual use, e.g., if deleted data areas have not been
42 trimmed/discarded.
43
44 Default warning and critical: '95', '98'
45
46 =head2 nvme_bytes
47
48 This reports read and write activity on each NVMe device, in bytes per
49 second. Ideally there should be much more read than write. If they
50 are symmetrical, you are using your NVMe as a very expensive FIFO, and
51 if you write more than you read, you should probably look for archival
52 storage instead.
53
54 It is a good idea to compare these numbers to I/O counters from
55 diskstats. If they are much higher, look into whether the write
56 amplification can be due to suboptimal I/O request sizes.
57
58 This graph does not support alerting.
59
60 =head2 nvme_writecycles
61
62 This graph is intended to give an indication of how much life there
63 is left in your NVMe. It calculates the number of bytes written
64 during each device's lifetime against the capacity of the device,
65 thereby getting the average number of write cycles each cell has
66 experienced.
67
68 A prosumer NVMe will handle a few thousand writes to each cell before
69 the error rate gets out of hand.
70
71 No default values for warning and critical.
72
73 =head2 nvme_spare
74
75 All NVMe devices set aside reserve space to remap media errors. This
76 graphs how much is left in percent, taken directly from smart-log
77 output.
78
79 Default warning and critical: '10:', '3:'
80
81 =head1 MAGIC MARKERS
82
83 #%# family=auto
84 #%# capabilities=autoconf
85
86 =head1 BUGS
87
88 None known.
89
90 =head1 VERSION
91
92 1.1
93
94 =head1 AUTHOR
95
96 Kjetil Torgrim Homme <kjetil.homme@redpill-linpro.com>
97
98 =head1 LICENSE
99
100 GPLv2
101
102 =cut
103
104 use strict;
105 use Munin::Plugin;
106 use IPC::Cmd qw(can_run);
107 use File::Basename;
108
109 # Check that multigraph is supported
110 need_multigraph();
111
# Explain why the plugin cannot be autoconfigured.
#
# Returns nothing (undef in scalar context) when the nvme executable is
# found on the PATH.  Otherwise returns a short explanation string:
#   "missing nvme(1)" - /proc/modules lists a module whose name starts with
#                       "nvme", so only the command line tool is absent;
#   "missing nvme"    - /proc/modules is unreadable or lists no such module.
sub autoconf_problem {
    return if can_run('nvme');

    # Three-argument open keeps the mode separate from the file name.
    if (open(my $mods, '<', '/proc/modules')) {
        my $module_listed = 0;
        while (my $line = <$mods>) {
            if ($line =~ /^nvme[^a-z]/) {
                $module_listed = 1;
                last;
            }
        }
        # Close on every path, including the one where a module was found.
        close($mods);
        return "missing nvme(1)" if $module_listed;
    }
    return "missing nvme"; # vague message for non-Linux
}
123
# Run "nvme" with the given arguments and return its output lines.
# The result is empty when the executable is not on the PATH or the pipe
# cannot be opened.  A non-zero exit status while not running as uid 0
# triggers a hint that root access is probably required.
sub run_nvme {
    my @args = @_;
    my @output;

    return @output unless can_run('nvme');

    # List-form pipe open: the arguments never pass through a shell.
    open(my $pipe, '-|', 'nvme', @args) or return @output;
    @output = <$pipe>;
    close($pipe);

    if ($? && $> != 0) {
        warn "nvme: probably needs to run as user root\n";
    }
    return @output;
}
134
# Convert a human-readable size such as "695.50  GB" into a byte count.
#
# The unit table uses decimal multiples (kB = 1000 bytes, and so on); a bare
# "B" means bytes.  The first "<number> <unit>" pair found in the string is
# used.  Returns 0 when no such pair is found.
#
# Always returns exactly one scalar, so the call is safe inside a hash
# constructor, where an empty list would shift the following key/value pairs.
sub human_to_bytes {
    my ($str) = @_;
    my %units = (
        B  => 1,
        kB => 1000,
        MB => 1000_000,
        GB => 1000_000_000,
        TB => 1000_000_000_000,
        PB => 1000_000_000_000_000, # I wish I had need for this
    );

    # Only use the captures when the match succeeded; otherwise $1 would be
    # undefined or left over from an earlier match.
    return 0 unless defined $str
        && $str =~ /(\d+(?:\.\d+)?)\s+([kMGTP]?B)/;

    my ($number, $unit) = ($1, $2);
    return int($number * $units{$unit});
}
147
# Parse the output of "nvme list" into a hash reference keyed by "SN_<serial>".
# Each entry holds device, sn, model, namespace, usage and capacity; the last
# two are converted to bytes with human_to_bytes().
#
# Output being parsed looks like this:
# Node SN Model Namespace Usage Format FW Rev
# ---------------- -------------------- ---------------------------------------- --------- -------------------------- ---------------- --------
# /dev/nvme1n1 S464NB0K601188N Samsung SSD 970 EVO 2TB 1 695.50 GB / 2.00 TB 512 B + 0 B 1B2QEXE7
#
# A warning is issued when there was output but either the header line was
# never seen or a line after the first two could not be parsed.
sub nvme_list {
    my %found;
    my $format_ok;
    my $seen = 0;

    for my $row (run_nvme('list')) {
        $seen++;

        if ($row =~ m:^Node\s+SN\s+Model\s+Namespace Usage:) {
            $format_ok++;
            next;
        }

        if ($row =~ m:^(/\S+)\s+(\S+)\s+(\S.*\S)\s{3,}(\d+)\s+(\S+\s+.B)\s+/\s+(\S+\s+.B):) {
            my ($node, $serial, $model, $ns, $used, $total) = ($1, $2, $3, $4, $5, $6);
            $found{'SN_' . $serial} = {
                device    => $node,
                sn        => $serial,
                model     => $model,
                namespace => $ns,
                usage     => human_to_bytes($used),
                capacity  => human_to_bytes($total),
            };
            next;
        }

        # Lines 1 and 2 are the header and the dashed separator; anything
        # unparseable after that means the device information was not understood.
        $format_ok = 0 if $seen > 2;
    }

    warn "Could not recognise output from 'nvme list', please report\n"
        if $seen && !$format_ok;

    return \%found;
}
179
# Read "nvme smart-log <dev>" and return a hash reference of its fields.
#
# Every "name : value" line becomes one entry.  Keys are the lower-cased field
# name with each whitespace character replaced by an underscore.  The
# "Smart Log" heading line is skipped.
#
# Values are stored as printed, except that trailing whitespace is removed and
# counters are normalised to plain digits: thousands separators are stripped
# ("1,234,567" becomes "1234567") and a trailing parenthesised annotation after
# the number is dropped.
# NOTE(review): some nvme-cli releases appear to print counters as
# "1,234,567 (632.09 GB)" - confirm against the installed version.
sub smart_log {
    my ($dev) = @_;
    my %info;
    for my $line (run_nvme('smart-log', $dev)) {
        next if $line =~ /^Smart Log/;
        next unless $line =~ /(.*?)\s+:\s+(.*)/;

        my ($var, $value) = ($1, $2);
        $var =~ s/\s/_/g;

        # Trailing blanks would defeat the numeric check below.
        $value =~ s/\s+$//;

        # Keep only the number of a counter, without separators or annotation.
        if ($value =~ /^(\d+(?:,\d\d\d)*)(?:\s+\(.*\))?$/) {
            $value = $1;
            $value =~ s/,//g;
        }
        $info{lc $var} = $value;
    }
    return \%info;
}
196
# Print "<label>.warning" and "<label>.critical" lines for one field.
#
# Thresholds are looked up twice with get_thresholds(): first under the
# label-based names "<graph>_<label>_warning" / "<graph>_<label>_critical"
# with the supplied defaults, then under the device-based names built from
# the basename of $device, with the first result as the fallback.
# A line is printed only when the final value is defined.
sub my_print_thresholds {
    my ($label, $graph, $device, $warn_default, $crit_default) = @_;
    my $short_name = basename($device);

    my ($label_warn, $label_crit) = get_thresholds(
        $graph,
        "${graph}_${label}_warning",
        "${graph}_${label}_critical",
        $warn_default,
        $crit_default,
    );
    my ($final_warn, $final_crit) = get_thresholds(
        $graph,
        "${graph}_${short_name}_warning",
        "${graph}_${short_name}_critical",
        $label_warn,
        $label_crit,
    );

    for my $pair (['warning', $final_warn], ['critical', $final_crit]) {
        my ($level, $limit) = @{$pair};
        next unless defined $limit;
        print "${label}.$level $limit\n";
    }
}
207
208 use Data::Dumper;
209
# The first command line argument selects the mode; without one, values are
# printed.
my $mode = $ARGV[0] || "print";

# Both are collected up front: the device list drives every mode, and the
# problem description is only reported by autoconf.
my $problem = autoconf_problem();
my $list = nvme_list();

if ($mode eq 'autoconf') {
    # "yes" when at least one device was listed, otherwise "no" with a reason.
    if (!keys %{$list}) {
        my $reason = $problem || "no devices to monitor";
        print "no (" . $reason . ")\n";
    } else {
        print "yes\n";
    }
    exit 0;
}
223
# Field names are the "SN_<serial>" keys of the device list, in sorted order.
my @sn = sort keys %{$list};

if ($mode eq 'config') {
    my $sn_list = join(' ', @sn);

    # nvme_bytes has two fields per device, <sn>_r and <sn>_w, so its
    # graph_order has to name those rather than the bare <sn>.
    my $rw_list = join(' ', map { ("${_}_r", "${_}_w") } @sn);

    # The graph headers use interpolating heredocs so that the order lists are
    # expanded; a non-interpolating heredoc would emit the variable name.
    print <<"EOF";
multigraph nvme_usage
graph_title NVME Namespace Usage
graph_order $sn_list
graph_vlabel Percent used
graph_scale no
graph_category disk
graph_info How much space is used
EOF
    for my $id (@sn) {
        my $device = $list->{$id}->{device};
        print <<"EOF";
$id.label $device used
$id.type GAUGE
$id.max 100
$id.min 0
EOF
        my_print_thresholds($id, 'nvme_usage', $device, '95', '98');
    }

    # \${graph_period} is escaped so that it is printed literally.
    print <<"EOF";
multigraph nvme_bytes
graph_title NVME Bytes Read / Written
graph_order $rw_list
graph_vlabel bytes read (-) / written (+) per \${graph_period}
graph_category disk
graph_info How much data is read and written
graph_period second
EOF
    for my $id (@sn) {
        my $device = $list->{$id}->{device};

        # The values sent for these fields are SMART data units, not bytes.
        # The cdef scales them by 512000, the same factor the write cycle
        # calculation uses, so that the graph matches its "bytes" label.
        print <<"EOF";
${id}_r.label $device
${id}_r.type DERIVE
${id}_r.min 0
${id}_r.graph no
${id}_r.cdef ${id}_r,512000,*
${id}_w.label $device
${id}_w.type DERIVE
${id}_w.min 0
${id}_w.negative ${id}_r
${id}_w.cdef ${id}_w,512000,*
EOF
    }

    print <<"EOF";
multigraph nvme_writecycles
graph_title NVME Write Cycles
graph_order $sn_list
graph_vlabel Cycles
graph_args --logarithmic
graph_category disk
graph_info How much data has been written in lifetime divided by capacity
EOF
    for my $id (@sn) {
        my $device = $list->{$id}->{device};
        print <<"EOF";
$id.label $device write cycles
$id.type GAUGE
$id.min 0
EOF
        my_print_thresholds($id, 'nvme_writecycles', $device);
    }

    print <<"EOF";
multigraph nvme_spare
graph_title Available spare blocks
graph_order $sn_list
graph_vlabel Percent
graph_category disk
graph_info Spare capacity for replacing bad blocks
EOF
    for my $id (@sn) {
        my $device = $list->{$id}->{device};
        print <<"EOF";
$id.label $device spare capacity
$id.type GAUGE
$id.min 0
$id.max 100
EOF
        my_print_thresholds($id, 'nvme_spare', $device, '10:', '3:');
    }
} else {
    # Return the argument when it is a plain non-negative number, else undef.
    # SMART values are missing when smart-log could not be read.
    my $as_number = sub {
        my ($raw) = @_;
        return (defined $raw && $raw =~ /^\d+(?:\.\d+)?$/) ? $raw : undef;
    };

    for my $id (@sn) {
        $list->{$id}->{smart} = smart_log($list->{$id}->{device});
    }

    # Wherever a value cannot be determined, 'U' (unknown) is printed instead
    # of an empty or non-numeric string.
    print "multigraph nvme_usage\n";
    for my $id (@sn) {
        my $info = $list->{$id};

        # A capacity of zero would otherwise cause a division by zero.
        my $used = $info->{capacity}
            ? 100 * $info->{usage} / $info->{capacity}
            : 'U';
        print "$id.value $used\n";
    }

    print "multigraph nvme_bytes\n";
    for my $id (@sn) {
        my $smart = $list->{$id}->{smart};
        my $units_read    = $as_number->($smart->{data_units_read});
        my $units_written = $as_number->($smart->{data_units_written});
        $units_read    = 'U' unless defined $units_read;
        $units_written = 'U' unless defined $units_written;
        print "${id}_r.value $units_read\n";
        print "${id}_w.value $units_written\n";
    }

    print "multigraph nvme_writecycles\n";
    for my $id (@sn) {
        my $info = $list->{$id};
        my $units_written = $as_number->($info->{smart}->{data_units_written});

        # The unit size reported is 1000 blocks.
        my $cycles = (defined $units_written && $info->{capacity})
            ? $units_written * 512_000 / $info->{capacity}
            : 'U';
        print "$id.value $cycles\n";
    }

    print "multigraph nvme_spare\n";
    for my $id (@sn) {
        my $spare = $list->{$id}->{smart}->{available_spare};

        # smart-log prints this as a percentage; drop the sign.
        $spare =~ s/%// if defined $spare;
        $spare = $as_number->($spare);
        $spare = 'U' unless defined $spare;
        print "$id.value $spare\n";
    }
}