#!/bin/ksh
#
# check_disks - check metadevices for errors and alert the user.
#
# USAGE: check_disks [ -m [ <mail_recipient> ] ]
#
# Author: J.D. Baldwin, baldwin+scripts @ panix.com
# Date: 5 Feb 1998
#
# This program is Copyright (C) 1998-1999 by Distributed Computing
# Consultants, Inc. (visit DCCI at www.distcom.com).  It may be freely
# redistributed as long as the copyright notice is not removed and as
# long as it is not modified in any way prior to redistribution.
#
# IMPORTANT NOTICE: check_disks is not warranted in any way whatsoever.
# The user assumes all risk of use.  It should not be run under a
# privileged account unless necessary (see the Usage Notes for a
# description of such circumstances).  You should never run this, or
# any other script, on a system without examining its contents first.
#
# Usage notes:
#
# OPTIONS: [ -m [ <mail_recipient> ] ] to mail to <mail_recipient>
#          (root by default).  Otherwise, check_disks prints to stdout.
#          (Null output means everything is OK.)
#
# EXIT STATUS:
#          0    no problems detected
#          1    usage error, or egrep could not be found
#          2    vxprint is present but could not be run
#          3    neither VxVM nor DiskSuite was found
#          255  a problem was detected (whether or not an alert was
#               printed or mailed on this particular run)
#
# NOTES:   First off, you should change the TZ variable to reflect
#          your time zone.  This will dictate how NOTIFY_HOUR (see
#          below) is treated and how paging will be handled.
#
#          The output of check_disks is intended to be short and sweet,
#          with a minimum of detail.  For example, it gives no indication
#          of which disk(s) is/are in error.  It is intended as an alert
#          message only, suitable perhaps for use with an alphanumeric
#          pager.  (Something for which vxnotify is not suitable.)  Once
#          the alert is received, the admin may investigate the problem
#          directly.
#
#          check_disks examines the output of DiskSuite and/or VxVM
#          commands for error indications.
#
#          This script is intended to be run regularly from
#          crontab(1).  Run it hourly, daily, whatever is suitable
#          for the availability requirements of the machine in
#          question.  (The author runs it every five minutes on
#          production servers.)  If VxVM is not installed, it should
#          probably be run from an unprivileged account.  If VxVM is
#          present, it must be run as root so the vxprint command
#          will work.
#
#          When run with the -m option, check_disks sends an alert
#          via mail immediately when an error is detected.  It then
#          remains quiet (N.B.: EVEN IF NEW ERRORS SHOULD CROP UP)
#          until 8 a.m. local time on every subsequent day, when it
#          sends another alert.  (The 8 a.m. figure is adjustable by
#          changing the value of NOTIFY_HOUR in the script.)
#
#          Please send bug reports, suggestions and other comments
#          to J.D. Baldwin at the address shown above.
#
# REVISED: 6-Feb-98 by JDB - added check for capital letters in
#                            metadb output, which indicates a problem.
#
# REVISED: 18-Feb-1998 by MPD - added pageadmin alias to default MAIL_RECIP.
#
# REVISED: 24-Mar-1998 by JDB - changed default MAIL_RECIP to root
#
# REVISED: 14-Aug-1998 by JDB - added check of vxprint output
#
# REVISED: 27-May-1999 by JDB - changed to ksh and fixed check of whether
#                               DiskSuite is present on the system; also
#                               cleaned up output message generation
#
# REVISED: 10-Sep-1999 by JDB - used getopts to determine options
#                               (extensible); added section to keep
#                               track of whether messages have been
#                               sent on current problem; added
#                               NOTIFY_HOUR trickery
#
# REVISED: 17-Dec-1999 by JDB - Revised copyright notice for latest public
#                               version, disclaimer, etc.  Revised
#                               usage notes.
#
# REVISED: 06-Nov-2000 by JDB - Added a check of the PATH variable
#                               (including /usr/sbin, if not already there)
#                               for metastat and metadb.  This accommodates
#                               a suggestion by Thomas Insel to take into
#                               account the Solaris 8/9 file locations.
#
# REVISED: 02-Aug-2002 by JDB - Added OUTPUT_CODE so that a problem returns
#                               a non-zero exit code when metadevice
#                               problems are detected (making this script
#                               suitable for use in other scripts).

##### CONFIGURATION #####

# Exported so that the date(1) calls below actually see it; a plain
# assignment is only visible to child processes when TZ happened to be
# in the environment already.
TZ=EST5EDT
export TZ

HOST=$(uname -n)
SUBJECT="DISK ERRORS ON ${HOST}"

GREP_CMD="/bin/egrep"
GREP_ARG="-s"

METASTAT_CMD=/usr/opt/SUNWmd/sbin/metastat
METADB_CMD=/usr/opt/SUNWmd/sbin/metadb
DISKSUITE_PRESENT=1

VXPRINT_CMD="/usr/sbin/vxprint"
VXPRINT_ARG="-a"
VM_PRESENT=1

MAIL_OUTPUT=''
MAIL_RECIP=''
DEFAULT_MAIL_RECIP='root'

OUTPUT_MSG=""
RESULT=''

NOTIFY_HOUR='8'
NOTIFY_SENTINEL_DIR='/var/tmp'
NOTIFY_SENTINEL_FILE="${NOTIFY_SENTINEL_DIR}/.check_disks"

# Exit status when a problem is detected.  The script used to exit with
# -1, which the caller sees as 255; the value is now spelled out.
PROBLEM_EXIT_CODE=255
OUTPUT_CODE=0

##### FUNCTION DEFINITIONS #####

# check_for_previous_notification: takes no arguments, puts "YES" into
# variable RESULT if no (further) notification should be sent right now,
# "NO" otherwise.
function check_for_previous_notification {

    # Check for existence of the sentinel file; if it doesn't exist,
    # the answer is "NO".  If it does exist, check its mod date; if
    # that matches today, then the user has been notified today, and
    # the answer is "YES".  If it is from another day, the answer is
    # "NO" ONLY if the current time is equal to or later than
    # NOTIFY_HOUR (so we don't wake anyone up for a daily reminder about
    # something already paged over); before NOTIFY_HOUR it stays "YES".
    # This function never modifies the sentinel file; the caller
    # touches it after sending a notification.

    typeset last_mod last_mod_month last_mod_date
    typeset today_month today_date now_hour

    RESULT='NO'             # Default when the file is not there

    if [ ! -e "$NOTIFY_SENTINEL_FILE" ]
    then
        return
    fi

    RESULT='YES'            # Default when the file is there

    # Fields 6 and 7 of "ls -l" are the month name and day of month of
    # the last modification; fetch both with a single ls call.
    last_mod=$(/bin/ls -l "$NOTIFY_SENTINEL_FILE" | awk '{print $6, $7}')
    last_mod_month=${last_mod% *}
    last_mod_date=${last_mod#* }

    today_month=$(/bin/date '+%b')
    today_date=$(/bin/date '+%e')

    # $(( )) normalizes the space-padded day of month printed by %e.
    if [[ "$today_month" != "$last_mod_month" || \
          $((today_date)) -ne $((last_mod_date)) ]]
    then
        # The last notification was on another day; notify again only
        # if it is no earlier than NOTIFY_HOUR.
        now_hour=$(/bin/date '+%H')
        # Strip the leading zero: "08" and "09" are rejected as invalid
        # octal numbers by shells that honor octal in arithmetic.
        now_hour=${now_hour#0}

        if [[ $now_hour -ge $NOTIFY_HOUR ]]
        then
            RESULT='NO'
        else
            RESULT='YES'    # too early to page
        fi
    fi

    return
}

# error_out: takes two arguments, a message to be printed and an exit
# code (in that order); prints the message on stdout and exits with that
# code.
function error_out {
    typeset error_msg=$1
    typeset return_code=$2

    print - "$error_msg"
    exit "$return_code"
}

# locate_cmd: takes two arguments, the preferred absolute path of a
# command and its bare name.  Prints the path to use on stdout and
# returns 0: the preferred path if it is executable, otherwise whatever
# a PATH search finds.  Prints nothing and returns 1 if neither exists.
function locate_cmd {
    typeset preferred=$1
    typeset name=$2
    typeset found

    if [[ -x $preferred ]]
    then
        print -r - "$preferred"
        return 0
    fi

    # whence -p is the shell's own PATH search; unlike which(1) it
    # prints nothing at all when the command is not found.
    found=$(whence -p "$name")
    if [[ -n $found && -x $found ]]
    then
        print -r - "$found"
        return 0
    fi

    return 1
}

# output_matches: takes an egrep pattern followed by a command and its
# arguments; runs the command and returns 0 if its output contains a
# line matching the pattern, non-zero otherwise.  Nothing is printed on
# stdout.
function output_matches {
    typeset pattern=$1
    shift

    # The explicit redirect is needed because "-s" only means "silent"
    # for /bin/egrep; for /usr/xpg4/bin/egrep it merely suppresses
    # file error messages and matched lines would leak into our output.
    # GREP_ARG is deliberately unquoted so it may hold several options.
    "$@" | "$GREP_CMD" $GREP_ARG "$pattern" >/dev/null
}

##### MAIN PROGRAM #####

# Determine and act on options

while getopts :m: c
do
    case $c in
        m )
            MAIL_OUTPUT="YES"       # any non-null string will do
            MAIL_RECIP=${OPTARG:-$DEFAULT_MAIL_RECIP}
            ;;
        : )
            # -m was given without an address (it is the only option
            # that takes an argument): mail to the default recipient.
            MAIL_OUTPUT="YES"
            MAIL_RECIP=$DEFAULT_MAIL_RECIP
            ;;
        * )
            error_out "Usage: check_disks [ -m [ mail_addr ] ]" 1
            ;;
    esac
done

# First establish the location of the grep command, exit if error:

if [ ! -x "$GREP_CMD" ]
then
    GREP_CMD="/usr/xpg4/bin/egrep"
fi

# Try again for egrep:

if [ ! -x "$GREP_CMD" ]
then
    print "ERROR: egrep not executable or not found"
    exit 1
fi

# Now establish whether DiskSuite is installed on the machine.
# First, add /usr/sbin to the PATH just to make sure:

PATH=$PATH:/usr/sbin

# DiskSuite counts as present only if BOTH metastat and metadb can be
# found; a command that is not found keeps its default path so that
# later messages still name it.

if CMD_PATH=$(locate_cmd "$METASTAT_CMD" metastat)
then
    METASTAT_CMD=$CMD_PATH
else
    DISKSUITE_PRESENT=0
fi

if CMD_PATH=$(locate_cmd "$METADB_CMD" metadb)
then
    METADB_CMD=$CMD_PATH
else
    DISKSUITE_PRESENT=0
fi

# And the same for VxVM:

if [ ! -x "$VXPRINT_CMD" ]
then
    VM_PRESENT=0
fi

if [[ $VM_PRESENT -eq 0 && $DISKSUITE_PRESENT -eq 0 ]]
then
    error_out "Neither VxVM nor DiskSuite has been found." 3
fi

##### TEST SECTION #####

# This allows the user to test the script functionality, assuming
# DiskSuite is present, by simply touching /tmp/testcheckdisk

if [[ -f /tmp/testcheckdisk ]]
then
    if output_matches 'State:' "$METASTAT_CMD"
    then
        OUTPUT_MSG="/tmp/testcheckdisk exists and $METASTAT_CMD\nis working on ${HOST}\n"
    else
        OUTPUT_MSG="/tmp/testcheckdisk exists and there is a problem\nwith $METASTAT_CMD on ${HOST}\n"
    fi
fi

##### DISKSUITE SECTION #####

if [ $DISKSUITE_PRESENT -ne 0 ]
then
    # Check for disks requiring "Maintenance":
    if output_matches aint "$METASTAT_CMD"
    then
        OUTPUT_MSG="${OUTPUT_MSG}Disk requires maintenance on ${HOST}\n"
    fi

    # Now check for hot spares "In use":
    if output_matches "In use" "$METASTAT_CMD"
    then
        OUTPUT_MSG="${OUTPUT_MSG}Disk hot spared on ${HOST}\n"
    fi

    # Now check for metadb problems (capital letters in the metadb
    # output indicate a problem):
    if output_matches "[A-Z]" "$METADB_CMD"
    then
        OUTPUT_MSG="${OUTPUT_MSG}Metadb problems on ${HOST}\n"
    fi
fi

##### VXVM (VOLUME MANAGER) SECTION #####

if [ $VM_PRESENT -ne 0 ]
then
    # First check that VXPRINT_CMD runs (it won't run from an unprivileged
    # account); if it is present and fails to run, we should terminate
    # with an error message.
    "$VXPRINT_CMD" >/dev/null 2>&1 ||
        error_out "$VXPRINT_CMD present but not runnable - try running as root" 2

    # VXPRINT_ARG is deliberately unquoted so it may hold several options.
    if output_matches "failing=on" "$VXPRINT_CMD" $VXPRINT_ARG
    then
        OUTPUT_MSG="${OUTPUT_MSG}VM disk failing on ${HOST}\n"
    fi
fi

# If there is no output, then get rid of the sentinel file (because
# everything is clean now); there is nothing to report.

if [[ -z $OUTPUT_MSG ]]
then
    /bin/rm -f "$NOTIFY_SENTINEL_FILE"
    exit 0
fi

##### OUTPUT SECTION #####

# From here on a problem has been detected, so the exit status is
# non-zero even when the mail below is suppressed because the admin has
# already been notified.
OUTPUT_CODE=$PROBLEM_EXIT_CODE

# Now, decide what to do based on options:

if [[ -n $MAIL_OUTPUT ]]                    # are we to mail the output?
then
    check_for_previous_notification         # has notification been mailed?
    if [[ $RESULT != "YES" && -n $MAIL_RECIP ]]
    then
        # MAIL_RECIP is deliberately unquoted so that a space-separated
        # list of addresses still works.  The sentinel is only touched
        # when mailx succeeds, so a failed send is retried on the next
        # run instead of being silently recorded as delivered.
        if print - "$OUTPUT_MSG" | mailx -s "$SUBJECT" $MAIL_RECIP
        then
            /bin/touch "$NOTIFY_SENTINEL_FILE"
        fi
    fi
else
    print - "$OUTPUT_MSG"                   # or else just print on stdout
fi

exit $OUTPUT_CODE