#!/bin/ksh
#
# check_disks - check metadevices for errors and alert the user.
#
# USAGE:    check_disks [ -m [ <address> ] ]
#
# Author:  J.D. Baldwin, baldwin+scripts @ panix.com 
# Date: 5 Feb 1998
#
# This program is Copyright (C) 1998-1999 by Distributed Computing
# (visit DCCI at www.distcom.com) Consultants, Inc.  It may be freely
# redistributed as long as the copyright notice is not removed and as
# long as it is not modified in any way prior to redistribution.
#
# IMPORTANT NOTICE:  check_disks is not warranted in any way whatsoever.
# The user assumes all risk of use.  It should not be run under a
# privileged account unless necessary (see the Usage Notes for a 
# description of such circumstances).  You should never run this, or
# any other script, on a system without examining its contents first.
#
# Usage notes:
#
# OPTIONS:  [ -m [ <address> ] ] to mail to <address> (root by default)
#           Otherwise, check_disks prints to stdout.  (Null output
#           means everything is OK.)
#
# NOTES:    First off, you should change the TZ variable to reflect
#           your time zone.  This will dictate how NOTIFY_HOUR (see
#           below) is treated and how paging will be handled.
#
#           The output of check_disks is intended to be short and sweet,
#           with a minimum of detail.  For example, it gives no indication
#           of which disk(s) is/are in error.  It is intended as an alert
#           message only, suitable perhaps for use with an alphanumeric
#           pager.  (Something for which vxnotify is not suitable.)  Once
#           the alert is received, the admin may investigate the problem
#           directly.
#
#           check_disks examines the output of DiskSuite and/or VxVM
#           commands for error indications.
#
#           This script is intended to be run regularly from
#           crontab(1).  Run it hourly, daily, whatever is suitable
#           for the availability requirements of the machine in
#           question.  (The author runs it every five minutes on
#           production servers.)  If VxVM is not installed, it should
#           probably be run from an unprivileged account.  If VxVM is
#           present, it must be run as root so the vxprint command
#           will work.
#
#           When run with the -m option, check_disks sends an alert
#           via mail immediately when an error is detected.  It then
#           remains quiet (N.B.: EVEN IF NEW ERRORS SHOULD CROP UP)
#           until 8 a.m. local time on every subsequent day, when it
#           sends another alert.  (The 8 a.m. figure is adjustable by
#           changing the value of NOTIFY_HOUR in the script.)
#
#           Please send bug reports, suggestions and other comments
#           to J.D. Baldwin at the address shown above.
#
# REVISED: 6-Feb-98 by JDB - added check for capital letters in 
#          metadb output, which indicates a problem.
#
# REVISED: 18-Feb-1998 by MPD - added pageadmin alias to default MAIL_RECIP.  
#
# REVISED: 24-Mar-1998 by JDB - changed default MAIL_RECIP to root
#
# REVISED: 14-Aug-1998 by JDB - added check of vxprint output
#
# REVISED: 27-May-1999 by JDB - changed to ksh and fixed check of whether
#                               DiskSuite is present on the system; also
#                               cleaned up output message generation
#
# REVISED: 10-Sep-1999 by JDB - used getopts to determine options
#                               (extensible); added section to keep
#                               track of whether messages have been
#                               sent on current problem; added
#                               NOTIFY_HOUR trickery
#
# REVISED: 17-Dec-1999 by JDB - Revised copyright notice for latest public
#                               version, disclaimer, etc.  Revised 
#                               usage notes.
#
# REVISED: 06-Nov-2000 by JDB - Added a check of the PATH variable 
#                               (including /usr/sbin, if not already there)
#                               for metastat and metadb.  This accommodates
#                               a suggestion by Thomas Insel to take into
#                               account the Solaris 8/9 file locations.
#
# REVISED: 02-Aug-2002 by JDB - Added OUTPUT_CODE so that a problem returns
#                               a non-zero exit code when metadevice 
#                               problems are detected (making this script
#                               suitable for use in other scripts).


##### CONFIGURATION AND INITIAL STATE #####

# Time zone used when the current hour is compared with NOTIFY_HOUR.
# It is exported explicitly: a plain assignment reaches the date(1)
# child processes only if TZ already happened to be in the inherited
# environment.
TZ=EST5EDT
export TZ

# Subject line of the alert mail.
SUBJECT="DISK ERRORS ON $(uname -n)"

# egrep binary and the option passed to it on every invocation.
GREP_CMD="/bin/egrep"
GREP_ARG="-s"

# DiskSuite commands (default install location) and presence flag
# (1 = assumed present until the detection code says otherwise).
METASTAT_CMD=/usr/opt/SUNWmd/sbin/metastat
METADB_CMD=/usr/opt/SUNWmd/sbin/metadb
DISKSUITE_PRESENT=1

# VxVM command, its argument, and presence flag.
VXPRINT_CMD="/usr/sbin/vxprint"
VXPRINT_ARG="-a"
VM_PRESENT=1

# Mail settings: MAIL_OUTPUT is non-null when -m was given, MAIL_RECIP
# is the address to use, DEFAULT_MAIL_RECIP the fallback address.
MAIL_OUTPUT=''
MAIL_RECIP=''
DEFAULT_MAIL_RECIP='root'

# Accumulated alert text; empty means no problem was found.
OUTPUT_MSG=""

# "YES"/"NO" answer left behind by check_for_previous_notification.
RESULT=''

# Hour of the day that is compared with the output of date '+%H' when
# deciding whether a repeat alert may be sent.
NOTIFY_HOUR='8'

# Sentinel file whose modification date records the last alert.
NOTIFY_SENTINEL_DIR='/var/tmp'
NOTIFY_SENTINEL_FILE="${NOTIFY_SENTINEL_DIR}/.check_disks"

# Exit status of the script; 0 until a problem is reported.
OUTPUT_CODE=0

##### FUNCTION DEFINITIONS #####

# check_for_previous_notification:  takes no arguments; leaves "YES" in
#           the global RESULT when a new alert should be held back and
#           "NO" when an alert may be sent.
#
#           RESULT is "NO" when the sentinel file does not exist, or
#           when its modification date (month and day of month, as
#           listed by ls -l) is not today's AND the current hour is
#           NOTIFY_HOUR or later.  In every other case RESULT is "YES":
#           either the file was last modified today, or it is older
#           but it is still earlier than NOTIFY_HOUR.
#
#           Reads NOTIFY_SENTINEL_FILE and NOTIFY_HOUR.  The function
#           never creates or touches the sentinel file itself.
#
#           NOTE: only month and day are compared, not the year.

function check_for_previous_notification
{
    typeset mod_stamp mod_month mod_day today_month today_day
    typeset now_hour notify_hour

    RESULT='NO'  # Default when the sentinel file is not there

    if [ ! -e "$NOTIFY_SENTINEL_FILE" ]
    then
        return
    fi

    RESULT='YES'  # Default when the sentinel file is there

    # Fields 6 and 7 of the long listing are taken as the month name
    # and the day of the month; one ls call supplies both.
    mod_stamp=$(/bin/ls -l "$NOTIFY_SENTINEL_FILE" | awk '{print $6 " " $7}')
    mod_month=${mod_stamp% *}
    mod_day=${mod_stamp#* }

    today_month=$(/bin/date '+%b')
    today_day=$(/bin/date '+%e')
    today_day=${today_day# }     # %e pads single digits with a blank

    # Drop a leading zero so that "05" and "5" compare equal.
    case $mod_day in
        0?) mod_day=${mod_day#0} ;;
    esac
    case $today_day in
        0?) today_day=${today_day#0} ;;
    esac

    # Sentinel was modified today: the alert has already gone out.
    if [[ $today_month = "$mod_month" && $today_day = "$mod_day" ]]
    then
        return
    fi

    # The sentinel is from another day.  A repeat alert is allowed only
    # from NOTIFY_HOUR onwards.  date prints zero-padded hours
    # ("08", "09"); a leading zero can be read as an octal prefix in an
    # arithmetic comparison, which makes 08 and 09 invalid numbers, so
    # the padding is removed before comparing.
    now_hour=$(/bin/date '+%H')
    notify_hour=$NOTIFY_HOUR
    case $now_hour in
        0?) now_hour=${now_hour#0} ;;
    esac
    case $notify_hour in
        0?) notify_hour=${notify_hour#0} ;;
    esac

    if [[ $now_hour -ge $notify_hour ]]
    then
        RESULT='NO'
    fi

    return
}
    

# error_out: takes two arguments, in this order: a message to be
#            printed and an exit code.  Prints the message on stdout
#            and exits with that code.  If the code is omitted the
#            exit status is 1.

function error_out {
    typeset error_msg=$1
    typeset return_code=${2:-1}

    # Quoted so the message is printed verbatim: no word splitting
    # (which would collapse runs of blanks) and no filename expansion.
    printf '%s\n' "$error_msg"
    exit "$return_code"
}

##### MAIN PROGRAM #####

# Determine and act on options

# parse_options: takes the command-line arguments of the script.
#
#            -m <address>   mail the report to <address>
#            -m             mail the report to DEFAULT_MAIL_RECIP
#
#            Sets MAIL_OUTPUT to a non-null string and MAIL_RECIP to
#            the chosen address.  Any other option prints the usage
#            line through error_out and exits with status 1.

function parse_options
{
    typeset opt

    OPTIND=1
    while getopts :m: opt
    do
        case $opt in
            m ) MAIL_OUTPUT="YES"   # any non-null string will do
                MAIL_RECIP=$OPTARG
                if [[ -z $MAIL_RECIP ]]
                then
                    MAIL_RECIP=$DEFAULT_MAIL_RECIP
                fi
                ;;
            : ) # With a leading ":" in the option string, getopts
                # reports a missing option argument as ":".  -m is the
                # only option that takes one, so this is "-m" with no
                # address: mail to the default recipient.
                MAIL_OUTPUT="YES"
                MAIL_RECIP=$DEFAULT_MAIL_RECIP
                ;;
            * ) error_out "Usage:  check_disks [ -m [ mail_addr ] ]" 1
                ;;
        esac
    done
}

parse_options "$@"

# Settle on an egrep binary: keep the configured one when it can be
# executed, otherwise switch to the copy under /usr/xpg4/bin.

[ -x $GREP_CMD ] || GREP_CMD="/usr/xpg4/bin/egrep"

# If the fallback cannot be executed either, report it and exit with
# status 1.

[ -x $GREP_CMD ] || {
    print "ERROR:  egrep not executable or not found"
    exit 1
}

# Now establish whether DiskSuite is installed on the machine.

# First, add /usr/sbin to the PATH just to make sure:

PATH=$PATH:/usr/sbin

# find_disksuite_cmd: takes two arguments, a preferred full path and a
#            command name.  Prints the preferred path when it is
#            executable; otherwise prints the first executable regular
#            file of that name found in the PATH directories.  Returns
#            0 when a path was printed, 1 (printing nothing) when the
#            command could not be found.

function find_disksuite_cmd
{
    typeset preferred=$1 name=$2 dir old_ifs

    if [ -n "$preferred" ] && [ -x "$preferred" ]
    then
        printf '%s\n' "$preferred"
        return 0
    fi

    # Split PATH on ":" into this function's own positional parameters.
    old_ifs=$IFS
    IFS=:
    set -- $PATH
    IFS=$old_ifs

    for dir in "$@"
    do
        dir=${dir:-.}   # an empty PATH element means the current directory
        if [ -f "$dir/$name" ] && [ -x "$dir/$name" ]
        then
            printf '%s\n' "$dir/$name"
            return 0
        fi
    done

    return 1
}

# detect_disksuite: takes no arguments.  Points METASTAT_CMD and
#            METADB_CMD at usable binaries and sets DISKSUITE_PRESENT
#            to 1 only when BOTH commands were found, otherwise to 0.
#            A command variable is left unchanged when its command
#            cannot be found.

function detect_disksuite
{
    typeset found

    DISKSUITE_PRESENT=1

    if found=$(find_disksuite_cmd "$METASTAT_CMD" metastat)
    then
        METASTAT_CMD=$found
    else
        DISKSUITE_PRESENT=0
    fi

    if found=$(find_disksuite_cmd "$METADB_CMD" metadb)
    then
        METADB_CMD=$found
    else
        DISKSUITE_PRESENT=0
    fi
}

detect_disksuite

# And the same for VxVM:  it is treated as installed only when the
# vxprint command can be executed.

[ -x $VXPRINT_CMD ] || VM_PRESENT=0

# With neither volume manager available there is nothing to check:
# print a message and exit with status 3.

if (( VM_PRESENT == 0 && DISKSUITE_PRESENT == 0 ))
then
    error_out "Neither VxVM nor DiskSuite has been found." 3
fi

##### TEST SECTION #####

# Self-test hook: touching /tmp/testcheckdisk makes the script seed the
# alert text with a line saying whether $METASTAT_CMD produced any
# 'State:' output.  DiskSuite is assumed to be present here;
# DISKSUITE_PRESENT is not consulted.

if [[ -f /tmp/testcheckdisk ]]
then
    if $METASTAT_CMD | $GREP_CMD $GREP_ARG 'State:'
    then
        OUTPUT_MSG="/tmp/testcheckdisk exists and $METASTAT_CMD\nis working on $(uname -n)\n"
    else
        OUTPUT_MSG="/tmp/testcheckdisk exists and there is a problem\nwith $METASTAT_CMD on $(uname -n)\n"
    fi
fi

##### DISKSUITE SECTION #####

# check_disksuite: takes no arguments.  Appends one line to OUTPUT_MSG
#            for each kind of DiskSuite trouble found:
#              - metastat output containing "aint" (a disk requiring
#                "Maintenance"),
#              - metastat output containing "In use" (a hot spare that
#                has been taken into use),
#              - metadb output containing any capital letter.
#            Every line ends with a literal backslash-n sequence, which
#            is expanded only when the message is finally printed.
#            Uses METASTAT_CMD, METADB_CMD, GREP_CMD and GREP_ARG.

function check_disksuite
{
    typeset host metastat_out

    host=$(uname -n)

    # Run metastat once so both checks look at the same snapshot.
    metastat_out=$($METASTAT_CMD)

    # Check for disks requiring "Maintenance":

    if printf '%s\n' "$metastat_out" | $GREP_CMD $GREP_ARG aint
    then
        OUTPUT_MSG="${OUTPUT_MSG}Disk requires maintenance on ${host}\n"
    fi

    # Now check for hot spares "In use":

    if printf '%s\n' "$metastat_out" | $GREP_CMD $GREP_ARG "In use"
    then
        # The trailing \n keeps the next message on a line of its own.
        OUTPUT_MSG="${OUTPUT_MSG}Disk hot spared on ${host}\n"
    fi

    # Now check for metadb problems

    if $METADB_CMD | $GREP_CMD $GREP_ARG "[A-Z]"
    then
        OUTPUT_MSG="${OUTPUT_MSG}Metadb problems on ${host}\n"
    fi
}

if [ "${DISKSUITE_PRESENT:-0}" -ne 0 ]
then
    check_disksuite
fi

##### VXVM (VOLUME MANAGER) SECTION #####

if [ $VM_PRESENT -ne 0 ]
then

    # Probe run: vxprint is present, but it will not run from an
    # unprivileged account.  If the probe fails, stop with a message
    # and exit status 2.

    $VXPRINT_CMD >/dev/null 2>&1 ||
        error_out "$VXPRINT_CMD present but not runnable - try running as root" 2

    # Any "failing=on" in the vxprint output adds a line to the alert.

    if $VXPRINT_CMD $VXPRINT_ARG | $GREP_CMD $GREP_ARG "failing=on"
    then
        OUTPUT_MSG="${OUTPUT_MSG}VM disk failing on $(uname -n)\n"
    fi
fi

##### OUTPUT SECTION #####

# report_results: takes no arguments.  Acts on the collected OUTPUT_MSG:
#
#   - Empty message: everything is clean, so the sentinel file is
#     removed (the next problem will alert immediately) and OUTPUT_CODE
#     is 0.
#   - Non-empty message: OUTPUT_CODE is set to 255 (the status a shell
#     reports for "-1"), whether or not an alert is actually delivered,
#     so that callers can rely on the exit status alone.
#       * Without MAIL_OUTPUT the message is printed on stdout.
#       * With MAIL_OUTPUT it is mailed to MAIL_RECIP, unless
#         check_for_previous_notification answers "YES" in RESULT.
#         The sentinel file is touched only when mailx reports success,
#         so a failed send does not silence later alerts.
#
#   The message text carries literal backslash-n sequences; printf's %b
#   expands them on output.

function report_results
{
    OUTPUT_CODE=0

    if [[ -z $OUTPUT_MSG ]]
    then
        /bin/rm -f "$NOTIFY_SENTINEL_FILE"
        return 0
    fi

    OUTPUT_CODE=255

    if [[ -z $MAIL_OUTPUT ]]     # not mailing: just print on stdout
    then
        printf '%b\n' "$OUTPUT_MSG"
        return 0
    fi

    check_for_previous_notification   # has notification been mailed previously?

    if [[ $RESULT != "YES" && -n $MAIL_RECIP ]]
    then
        # MAIL_RECIP is left unquoted on purpose so that a blank-separated
        # list of addresses is passed to mailx as separate recipients.
        if printf '%b\n' "$OUTPUT_MSG" | mailx -s "$SUBJECT" $MAIL_RECIP
        then
            /bin/touch "$NOTIFY_SENTINEL_FILE"
        fi
    fi

    return 0
}

report_results

# End of script: hand OUTPUT_CODE back to the caller as the exit status.
exit $OUTPUT_CODE
