]>
jfr.im git - munin-plugins.git/blob - nvme
2 # -*- mode: perl; perl-indent-level: 4 -*-
6 nvme - Munin plugin to monitor the use of NVMe devices
8 =head1 APPLICABLE SYSTEMS
10 Linux systems with NVMe (Non-Volatile Memory storage attached via PCIe
15 The plugin uses nvme(1) from the nvme-cli project to read status from
16 the NVMe devices. This requires root access.
21 When setting alert levels per device, combine the graph name and the
22 basename of the device name, e.g., 'nvme0n1', to form the environment variable:
24 env.nvme_usage_nvme0n1_warning 5:
25 env.nvme_usage_warning 8:
27 If your device names change on reboot you can also use the labels
28 (based on serial numbers) to set the warning and critical levels
30 env.nvme_usage_SN_1234567_warning 8:101
31 env.nvme_usage_SN_1234567_critical 5:101
35 This is a multigraph plugin which makes four graphs.
39 This reports how much of the capacity is allocated in each NVMe
40 "namespace". The report is in percent. This number may not have much
41 relation to actual use, e.g., if deleted data areas have not been
44 Default warning and critical: '95', '98'
48 This reports read and write activity on each NVMe device, in bytes per
49 second. Ideally there should be much more read than write. If they
50 are symmetrical, you are using your NVMe as a very expensive FIFO, and
51 if you write more than you read, you should probably look for archival
54 It is a good idea to compare these numbers to I/O counters from
55 diskstats. If they are much higher, look into whether the write
56 amplification can be due to suboptimal I/O request sizes.
58 This graph does not support alerting.
60 =head2 nvme_writecycles
62 This graph is intended to give an indication of how much life there
63 is left in your NVMe. It calculates the number of bytes written
64 during each device's lifetime against the capacity of the device,
65 thereby getting an average number of write cycles each cell has
68 A prosumer NVMe will handle a few thousand writes to each cell before
69 the error rate gets out of hand.
71 No default values for warning and critical.
75 All NVMe devices have set aside reserve space to remap media errors. This
76 graphs how much is left in percent, taken directly from smart-log
79 Default warning and critical: '10:', '3:'
84 #%# capabilities=autoconf
96 Kjetil Torgrim Homme <kjetil.homme@redpill-linpro.com>
106 use IPC
::Cmd
qw(can_run);
109 # Check that multigraph is supported
112 # Return undef if no problem, otherwise explanation
113 sub autoconf_problem
{
114 return if can_run
('nvme');
115 if (open(my $mods, '/proc/modules')) {
117 return "missing nvme(1)" if /^nvme[^a-z]/;
121 return "missing nvme"; # vague message for non-Linux
127 if (can_run
('nvme') && open(my $nvme, '-|', 'nvme', @cmd)) {
130 warn "nvme: probably needs to run as user root\n" if $? && $> != 0;
141 TB
=> 1000_000_000_000,
142 PB
=> 1000_000_000_000_000, # I wish I had need for this
144 $str =~ /(\d+(\.\d+)?)\s+(.B)/;
145 int($1 * $units{$3});
149 # Node SN Model Namespace Usage Format FW Rev
150 # ---------------- -------------------- ---------------------------------------- --------- -------------------------- ---------------- --------
151 # /dev/nvme1n1 S464NB0K601188N Samsung SSD 970 EVO 2TB 1 695.50 GB / 2.00 TB 512 B + 0 B 1B2QEXE7
154 my $recognised_output;
156 for (run_nvme
('list')) {
158 if (m
:^Node\s
+SN\s
+Model\s
+Namespace Usage
:) {
159 ++$recognised_output;
160 } elsif (m
:^(/\S+)\s+(\S+)\s+(\S.*\S)\s{3,}(\d+)\s+(\S+\s+.B)\s+/\s
+(\S
+\s
+.B
):) {
161 $devices{'SN_'.$2} = {
166 usage
=> human_to_bytes
($5),
167 capacity
=> human_to_bytes
($6),
169 } elsif ($lineno > 2) {
170 # could not parse device information
171 $recognised_output = 0;
174 if ($lineno && !$recognised_output) {
175 warn "Could not recognise output from 'nvme list', please report\n";
183 for (run_nvme
('smart-log', $dev)) {
184 next if /^Smart Log/;
185 if (/(.*?)\s+:\s+(.*)/) {
186 my ($var, $value) = ($1, $2);
188 if ($value =~ /^\d+(,\d\d\d)+$/) {
191 $info{lc $var} = $value;
197 sub my_print_thresholds
{
198 my ($label, $graph, $device, $warn_default, $crit_default) = @_;
199 my $dev = basename
($device);
200 my ($warn_label, $crit_label) = get_thresholds
($graph, "${graph}_${label}_warning", "${graph}_${label}_critical",
201 $warn_default, $crit_default);
202 my ($warn, $crit) = get_thresholds
($graph, "${graph}_${dev}_warning", "${graph}_${dev}_critical",
203 $warn_label, $crit_label);
204 print "${label}.warning $warn\n" if defined $warn;
205 print "${label}.critical $crit\n" if defined $crit;
210 my $mode = ($ARGV[0] or "print");
212 my $problem = autoconf_problem
();
213 my $list = nvme_list
();
215 if ($mode eq 'autoconf') {
219 printf("no (%s)\n", $problem || "no devices to monitor");
224 my @sn = sort keys %{$list};
226 if ($mode eq 'config') {
227 my $sn_list = join(' ', @sn);
230 multigraph nvme_usage
231 graph_title NVME Namespace Usage
233 graph_vlabel Percent used
236 graph_info How much space is used
239 my $device = $list->{$_}->{device
};
241 $_.label $device used
246 my_print_thresholds
($_, 'nvme_usage', $device, '95', '98');
249 multigraph nvme_bytes
250 graph_title NVME Bytes Read / Written
252 graph_vlabel bytes read (-) / written (+) per ${graph_period}'
254 graph_info How much data is read and written
259 ${_}_r.label $list->{$_}->{device}
263 ${_}_w.label $list->{$_}->{device}
266 ${_}_w.negative ${_}_r
270 multigraph nvme_writecycles
271 graph_title NVME Write Cycles
274 graph_args --logarithmic
276 graph_info How much data has been written in lifetime divided by capacity
279 my $device = $list->{$_}->{device
};
281 $_.label $device write cycles
285 my_print_thresholds
($_, 'nvme_writecycles', $device);
288 multigraph nvme_spare
289 graph_title Available spare blocks
293 graph_info Spare capacity for replacing bad blocks
296 my $device = $list->{$_}->{device
};
298 $_.label $device spare capacity
303 my_print_thresholds
($_, 'nvme_spare', $device, '10:', '3:');
307 $list->{$_}->{smart
} = smart_log
($list->{$_}->{device
});
309 print "multigraph nvme_usage\n";
311 my $info = $list->{$_};
312 my $used = 100 * $info->{usage
} / $info->{capacity
};
313 print "$_.value $used\n";
315 print "multigraph nvme_bytes\n";
317 my $info = $list->{$_};
318 my $rbytes = $info->{smart
}->{data_units_read
};
319 my $wbytes = $info->{smart
}->{data_units_written
};
320 print "${_}_r.value $rbytes\n";
321 print "${_}_w.value $wbytes\n";
323 print "multigraph nvme_writecycles\n";
325 my $info = $list->{$_};
327 # The unit size reported is 1000 blocks.
328 my $cycles = $info->{smart
}->{data_units_written
} * 512_000 / $info->{capacity
};
329 print "$_.value $cycles\n";
331 print "multigraph nvme_spare\n";
333 my $info = $list->{$_};
335 # Available spare is taken directly from smart-log as a percentage.
336 my $spare = $info->{smart
}->{available_spare
};
338 print "$_.value $spare\n";