From c10122e0b15d28c16a6e34db28a9788bae53e144 Mon Sep 17 00:00:00 2001
From: Ben Firshman
Date: Thu, 1 Dec 2011 12:52:45 +0000
Subject: [PATCH 1/2] Added check_linux_software_raid.pl to check Linux RAID status

---
Review notes (text in this area is not part of the commit message):

 * The script parses /proc/mdstat (or the file given with --file) and
   prints one line: "status ok|err " followed by a comma-separated
   summary per array ("<dev> [<flags>] has <up> of <total> devices
   active").
 * For every "mdN :" header line exactly one further line is read from
   the same handle and matched for "[total/up] [flags]".  The match
   result is not checked, so a header at end of file or a summary line
   in another format leaves $1..$3 holding stale values.
 * $2 is compared with eq even when the optional "(F)"/"(S)" group did
   not match; this is silent only because warnings are not enabled.
 * open uses the two-argument form with a bareword handle on a
   user-supplied path (--file).
 * %ERRORS is declared but never referenced.
 * --help calls usage(), which exits with status 1.
 * Author e-mail addresses appear to have been stripped from this copy
   of the series and are not reconstructed here.

 check_linux_software_raid.pl | 124 +++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100755 check_linux_software_raid.pl

diff --git a/check_linux_software_raid.pl b/check_linux_software_raid.pl
new file mode 100755
index 0000000..0a3c4f8
--- /dev/null
+++ b/check_linux_software_raid.pl
@@ -0,0 +1,124 @@
+#!/usr/bin/env perl
+
+# Get status of Linux software RAID for Cloudkick
+# Author: Michal Ludvig
+#         http://www.logix.cz/michal/devel/nagios
+# Adapted to Cloudkick by Ben Firshman
+#
+# Simple parser for /proc/mdstat that outputs status of all
+# or some RAID devices. Possible results are OK and CRITICAL.
+# It could eventually be extended to output WARNING result in
+# case the array is being rebuilt or if there are still some
+# spares remaining, but for now leave it as it is.
+#
+# To run the script remotely via SNMP daemon (net-snmp) add the
+# following line to /etc/snmpd.conf:
+#
+#   extend raid-md0 /root/parse-mdstat.pl --device=md0
+#
+# The script result will be available e.g. with command:
+#
+#   snmpwalk -v2c -c public localhost .1.3.6.1.4.1.8072.1.3.2
+
+use strict;
+use Getopt::Long;
+
+# Sample /proc/mdstat output:
+#
+# Personalities : [raid1] [raid5]
+# md0 : active (read-only) raid1 sdc1[1]
+#       2096384 blocks [2/1] [_U]
+#
+# md1 : active raid5 sdb3[2] sdb4[3] sdb2[4](F) sdb1[0] sdb5[5](S)
+#       995712 blocks level 5, 64k chunk, algorithm 2 [3/2] [U_U]
+#       [=================>...] recovery = 86.0% (429796/497856) finish=0.0min speed=23877K/sec
+#
+# unused devices: <none>
+
+my $file = "/proc/mdstat";
+my $device = "all";
+
+# Get command line options.
+GetOptions ('file=s' => \$file,
+            'device=s' => \$device,
+            'help' => sub { &usage() } );
+
+## Strip leading "/dev/" from --device in case it has been given
+$device =~ s/^\/dev\///;
+
+## Return codes for Nagios
+my %ERRORS=('OK'=>0,'WARNING'=>1,'CRITICAL'=>2,'UNKNOWN'=>3,'DEPENDENT'=>4);
+
+## This is a global return value - set to the worst result we get overall
+my $retval = 0;
+
+my (%active_devs, %failed_devs, %spare_devs);
+
+my $status = "ok";
+my @status_string = ();
+
+open FILE, "< $file" or die "Can't open $file : $!";
+while (<FILE>) {
+    next if ! /^(md\d+)+\s*:/;
+    next if $device ne "all" and $device ne $1;
+    my $dev = $1;
+
+    my @array = split(/ /);
+    for $_ (@array) {
+        next if ! /(\w+)\[\d+\](\(.\))*/;
+        if ($2 eq "(F)") {
+            $failed_devs{$dev} .= "$1,";
+        }
+        elsif ($2 eq "(S)") {
+            $spare_devs{$dev} .= "$1,";
+        }
+        else {
+            $active_devs{$dev} .= "$1,";
+        }
+    }
+    if (! defined($active_devs{$dev})) { $active_devs{$dev} = "none"; }
+    else { $active_devs{$dev} =~ s/,$//; }
+    if (! defined($spare_devs{$dev})) { $spare_devs{$dev} = "none"; }
+    else { $spare_devs{$dev} =~ s/,$//; }
+    if (! defined($failed_devs{$dev})) { $failed_devs{$dev} = "none"; }
+    else { $failed_devs{$dev} =~ s/,$//; }
+
+    $_ = <FILE>;
+    /\[(\d+)\/(\d+)\]\s+\[(.*)\]$/;
+    my $devs_total = $1;
+    my $devs_up = $2;
+    my $stat = $3;
+    if ($devs_total > $devs_up or $failed_devs{$dev} ne "none") {
+        $status = "err";
+        $retval = 1;
+    }
+
+    push(@status_string, "$dev [$stat] has $devs_up of $devs_total devices active");
+}
+
+print "status $status " . join(", ", @status_string);
+print "\n";
+
+close FILE;
+exit $retval;
+
+# =====
+sub usage()
+{
+    printf("
+Check status of Linux SW RAID
+
+Author: Michal Ludvig (c) 2006
+        http://www.logix.cz/michal/devel/nagios
+
+Usage: check_linux_software_raid.pl [options]
+
+  --file=<filename>   Name of file to parse. Default is /proc/mdstat
+  --device=<device>   Name of MD device, e.g. md0. Default is \"all\"
+
+");
+    exit(1);
+}
+
+
+

From 72afa3218ce2cc39e0ec984ae1ad79099bacbaa1 Mon Sep 17 00:00:00 2001
From: Ben Firshman
Date: Thu, 1 Dec 2011 13:26:44 +0000
Subject: [PATCH 2/2] Return values confuse cloudkick

---
Review notes (text in this area is not part of the commit message):

 * With "exit $retval" removed, the normal path no longer sets an exit
   status explicitly; a degraded array is reported only through the
   "status err" text on stdout.
 * The die on a failed open and the exit(1) in usage() are unchanged.

 check_linux_software_raid.pl | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/check_linux_software_raid.pl b/check_linux_software_raid.pl
index 0a3c4f8..777059c 100755
--- a/check_linux_software_raid.pl
+++ b/check_linux_software_raid.pl
@@ -49,9 +49,6 @@
 ## Return codes for Nagios
 my %ERRORS=('OK'=>0,'WARNING'=>1,'CRITICAL'=>2,'UNKNOWN'=>3,'DEPENDENT'=>4);
 
-## This is a global return value - set to the worst result we get overall
-my $retval = 0;
-
 my (%active_devs, %failed_devs, %spare_devs);
 
 my $status = "ok";
@@ -90,7 +87,6 @@
     my $stat = $3;
     if ($devs_total > $devs_up or $failed_devs{$dev} ne "none") {
         $status = "err";
-        $retval = 1;
     }
 
     push(@status_string, "$dev [$stat] has $devs_up of $devs_total devices active");
@@ -100,7 +96,6 @@
 print "\n";
 
 close FILE;
-exit $retval;
 
 # =====
 sub usage()