###############################################################################
#
# Synopsis:	db_check.sh [-b] [sid ...]
# Purpose:	to check the health of database instances
#
# Author:	Steve Adams (based on various similar scripts)
#
# Description:  This script checks the named instances, or all instances
#		listed in the oratab file with a Y in the third field.
#		The -b option should be used during the backup window.
#
#		It may be used in 3 ways:
#		* interactively
#		* from cron
#		* from other scripts
#
#		The following checks are performed:
#		* a listener must be running
#		* each instance must be available
#		* instances must not be in resticted session mode
#		* auto-archiving must be enabled in archivelog mode
#		* archiver processes must not be stuck
#		* tablespaces must not be in hot backup mode
#		During the backup window, instances may be down, or in
#		restricted session mode, and tablespaces may be in hot
#		backup mode.
#		
#		If any checks fail, they are reported:
#		* to the terminal, if running interactively
#		* to the system log via logger (syslog)
#		* with a non-zero exit status
#
###############################################################################

# set variables
#
export PROGRAM=${0##*/}			# program name
export ORATAB				# path to the oratab file
export INTERACTIVE=:			# whether to print messages to stdout
export DEBUG=				# whether to actually log messages
# export DEBUG=print			# (uncomment this line while debugging)
export FACILITY=oracle			# change to "user" if logger complains
export BACKUP=false			# are we in the backup window
export ORACLE_SID=			# the instance being checked
export SPOOL=/tmp/$$.spool		# the spool file for output
export READY=/tmp/$$.ready		# file to flag that output is ready
export STATUS=0				# exit status


# find the oratab file
# if none, assume no Oracle and quit
#
{ ORATAB=/etc/opt/oracle/oratab && [[ -r $ORATAB ]] ; } ||
{ ORATAB=/var/opt/oracle/oratab && [[ -r $ORATAB ]] ; } ||
{ ORATAB=/etc/oratab            && [[ -r $ORATAB ]] ; } ||
exit 0


# are we running interactively?
# if not, we must have logger
#
tty -s && INTERACTIVE=print || whence logger > /dev/null || exit 1


# check that a listener is running
#
if ps -fu oracle | grep -v grep | grep -c tnslsnr >/dev/null 
then
    $INTERACTIVE "Listener is running"
else
    msg="$PROGRAM: Listener not running - no network access to Oracle"
    $DEBUG logger -p $FACILITY.err "$msg"
    STATUS=1
    $INTERACTIVE $msg
fi


# check for oraenv
#
whence oraenv > /dev/null ||
{
    msg="$PROGRAM: Cannot check Oracle - oraenv not in PATH"
    $DEBUG logger -p $FACILITY.warning "$msg"
    $INTERACTIVE $msg
    exit 1
}


# are we in the backup window
#
[[ $1 = -b ]] && { BACKUP=true; shift; }


# check specified instances or all auto-started instances
#
for ORACLE_SID in ${*-$(awk -F: '!/^#/ && $3=="Y" {print $1}' $ORATAB)}
do                           
    # set environment and check for sqlplus
    #
    ORAENV_ASK=NO . oraenv                   
    whence sqlplus > /dev/null ||
    {
	msg="$PROGRAM: Cannot check Oracle - sqlplus not in PATH"
	$DEBUG logger -p $FACILITY.warning "$msg"
	STATUS=1
	$INTERACTIVE $msg
	continue
    }


    # check connectivity for an ordinary user
    # (we expect an ORA-01017 error; most others mean something)
    #
    rm -f $READY
    print "
	connect nobody/really
	host touch $READY
	exit " |
    sqlplus /nolog > $SPOOL &

    # wait for up to 59 seconds
    #
    ((timeout = 60))
    while ((timeout -= 1)) && [[ ! -r $READY ]]
    do
	sleep 1
    done

    # check for hang
    #
    [[ -r $READY ]] ||
    {
	kill $!
	msg="$PROGRAM: Oracle instance $ORACLE_SID is not responding"
	$DEBUG logger -p $FACILITY.err "$msg"
	STATUS=1
	$INTERACTIVE $msg
	continue
    }

    # check for other problems
    #
    ERROR=$(sed -n 's/.*ORA-\([0-9]*\):.*/\1/p' $SPOOL|head -1)
    case $ERROR in
	    #
	    # this is what we expect
	    #
    01017)  # invalid username/password
	    ;;
	    #
	    # this group are not expected, but OK anyway
	    #
    "")     # no error (connected OK)
	    ;;
    01035)  # instance in restricted session mode
	    ;;
    01040)  # invalid password
	    ;;
    01045)  # no create session priv
	    ;;
	    #
	    # any other error is a problem
	    #
    00257)  # archiver stuck
	    #
	    msg="$PROGRAM: Oracle archiver for $ORACLE_SID is stuck"
	    $DEBUG logger -p $FACILITY.alert "$msg"
	    STATUS=1
	    $INTERACTIVE $msg
	    ;;
    01034)  # Oracle not available
	    #
	    msg="$PROGRAM: Oracle instance $ORACLE_SID is not up"
	    $BACKUP || $DEBUG logger -p $FACILITY.err "$msg"
	    $BACKUP || STATUS=1
	    $BACKUP || $INTERACTIVE $msg
	    continue
	    ;;
    *)      # something else
	    #
	    msg="$PROGRAM: Got Oracle error ORA-$ERROR from $ORACLE_SID"
	    $DEBUG logger -p $FACILITY.err "$msg"
	    STATUS=1
	    $INTERACTIVE $msg
	    continue
	    ;;
    esac
    $INTERACTIVE "Oracle instance $ORACLE_SID is up"

    # now connect internal and check everything else
    #
    { SVRMGRL=svrmgrl		&& whence $SVRMGRL > /dev/null ; } ||
    { SVRMGRL=sqldba		&& whence $SVRMGRL > /dev/null ; } ||
    {
	msg="$PROGRAM: Cannot check Oracle - svrmgrl not in PATH"
	$DEBUG logger -p $FACILITY.warning "$msg"
	STATUS=1
	$INTERACTIVE $msg
	continue
    }
    rm -f $READY
    print "
	connect internal
	select '/actives=' || count(*) from v\$backup where status = 'ACTIVE';
	select '/restrict=' || value from v\$instance where key like 'RESTR%';
	select '/restrict=' || logins from v\$instance;
	archive log list;
	host touch $READY
	exit " |
    $SVRMGRL > $SPOOL &

    # wait for up to 89 seconds
    #
    ((timeout = 90))
    while ((timeout -= 1)) && [[ ! -r $READY ]]
    do
	sleep 1
    done

    # check for hang (most unlikely at this point)
    #
    [[ -r $READY ]] ||
    {
	kill $!
	msg="$PROGRAM: Oracle instance $ORACLE_SID is not responding"
	$DEBUG logger -p $FACILITY.err "$msg"
	STATUS=1
	$INTERACTIVE $msg
	continue
    }

    # check for active backups, archiver and restricted session
    #
    eval $(sed -n 's:^/\(.*=.*\):\1:p' $SPOOL)
    [[ $actives -eq 0 ]] ||
    {
	msg="$PROGRAM: Database $ORACLE_SID has files in hot backup mode"
	$BACKUP || $DEBUG logger -p $FACILITY.warning "$msg"
	$BACKUP || STATUS=1
	$BACKUP || $INTERACTIVE $msg
    }
    eval $(awk '/Database log mode/ {print "mode=" $(NF-2)}
		/Automatic archival/ {print "auto=" $NF}' $SPOOL)
    [[ $auto = Enabled || $mode = No ]] ||
    {
	msg="$PROGRAM: Oracle archiver for $ORACLE_SID is not running"
	$DEBUG logger -p $FACILITY.err "$msg"
	STATUS=1
	$INTERACTIVE $msg
    }
    [[ $restrict = 0 || $restrict = ALLOWED ]] ||
    {
	msg="$PROGRAM: Oracle instance $ORACLE_SID is in restricted mode"
	$BACKUP || $DEBUG logger -p $FACILITY.warning "$msg"
	$BACKUP || STATUS=1
	$BACKUP || $INTERACTIVE $msg
    }
done

# cleanup
#
$DEBUG rm -f $SPOOL
$DEBUG rm -f $READY

exit $STATUS

