#!/bin/ksh
#
# Bacula monitor for Nagios
# Written by Allan Black
# Last Modified: 2011-01-28
#
# Usage: bacula_monitor [args] JobName
#
# Description:
#
# This script examines the output of the Bacula console "st dir"
# command and looks in the "Running Jobs" section for the job named
# as argument. It then monitors the status of the job and tracks the
# progress of the job using Nagios commands to update a passive
# service, whose name is based on the job name.
#
# Exit status: 0 if the job ends up in the "Terminated Jobs" list with
# status OK, 1 otherwise (including usage errors).
#

# Polling intervals and thresholds (compared against itime() values).
STARTINT=30     # sleep between polls while waiting for the job to appear
MONINT=600      # sleep between polls while the job is running
WARNDELAY=300   # WARNING once the job has not appeared for this long
CRITDELAY=900   # CRITICAL once the job has not appeared for this long
WARNTIME=900    # WARNING once the job has been in a non-normal state this long
CRITTIME=1800   # CRITICAL once the same non-normal state has lasted this long

BACULADIR=/usr/local/bacula
BACULASBIN=${BACULADIR}/sbin
BACULAETC=${BACULADIR}/etc
BCCONFIG=${BACULAETC}/bconsole.conf
BCONSOLE="${BACULASBIN}/bconsole -c ${BCCONFIG}"
STATCMD="st dir"

# Status codes passed to Nagios in the service check result.
STATE_OK=0
STATE_WARN=1
STATE_CRIT=2
STATE_UNKNOWN=3

PATH=/usr/sbin:/usr/bin
export PATH

#
# If necessary, set to pfexec (Solaris), sudo (Linux) etc.
#
PFE=pfexec

#
# usage: print the command line summary on stdout (callers redirect it).
#
usage() {
    print "usage: $0 [args]"
    print "args:"
    print " -W|--waiting (allow waiting on other jobs)"
    print " -m|--monint (monitoring interval)"
    print " -w|-wt|--warntime (warning time threshold)"
    print " -c|-ct|--crittime (critical time threshold)"
    print " -s|--startint (startup monitoring interval)"
    print " -wd|--warndelay (warning startup time threshold)"
    print " -cd|--critdelay (critical startup time threshold)"
    print " -J|--job JobName (Bacula job to monitor)"
    print " JobName (Bacula job to monitor)"
}

#
# need_value: called with the remaining command line ("$@") while $1 is
# an option that takes a value. Exits with a usage error when no value
# follows, instead of silently storing an empty value and then failing
# on the second shift.
#
need_value() {
    if [[ $# -lt 2 ]]; then
        print -u2 "$0: option $1 requires a value"
        usage >&2
        exit 1
    fi
}

#
# Command line parsing. A bare word is taken as the job name; a second
# job name (bare or via -J) after a bare one is a usage error.
#
while [[ $# -gt 0 ]]; do
    case "$1" in
    -H | --help)
        usage >&2
        exit 0
        ;;
    -W | --waiting)
        WAITING="yes"
        ;;
    -m | --monint)
        need_value "$@"
        shift
        MONINT="$1"
        ;;
    -w | -wt | --warntime)
        need_value "$@"
        shift
        WARNTIME="$1"
        ;;
    -c | -ct | --crittime | --critime)
        need_value "$@"
        shift
        CRITTIME="$1"
        ;;
    -s | --startint)
        need_value "$@"
        shift
        STARTINT="$1"
        ;;
    -wd | --warndelay)
        need_value "$@"
        shift
        WARNDELAY="$1"
        ;;
    -cd | --critdelay)
        need_value "$@"
        shift
        CRITDELAY="$1"
        ;;
    -J | --job)
        need_value "$@"
        shift
        JOB="$1"
        ;;
    *)
        if [[ -n "$JOB" ]]; then
            usage >&2
            exit 1
        fi
        JOB="$1"
        ;;
    esac
    shift
done

if [[ -z "$JOB" ]]; then
    usage >&2
    exit 1
fi

#
# director_status: feed the status command to bconsole and emit its
# output on stdout. PFE and BCONSOLE are deliberately unquoted so that
# they split into command + arguments.
#
director_status() {
    print -R "$STATCMD" | ${PFE} ${BCONSOLE}
}

#
# itime: print the current time as an integer number of seconds.
# On SunOS the value is scraped from the time() call shown by truss;
# elsewhere date '+%s' is used.
#
itime() {
    case $(uname -s) in
    SunOS)
        truss date 2>&1 | awk '$1 == "time()" {print $NF}'
        ;;
    *)
        date '+%s'
        ;;
    esac
}

#
# nagios: submit a passive service check result for "Backup: $JOB".
#   $1    - status code (defaults to STATE_UNKNOWN)
#   $2... - message text (defaults to "Job status unknown")
# The result line is written to the Nagios command file.
#
nagios() {
    if [[ $# -gt 0 ]]; then
        NSTAT="$1"
        shift
    else
        NSTAT=${STATE_UNKNOWN}
    fi
    if [[ $# -gt 0 ]]; then
        NMSG="$*"
    else
        NMSG="Job status unknown"
    fi
    nargs="$(uname -n);Backup: ${JOB};${NSTAT};${NMSG}"
    print -R "[$(itime)] PROCESS_SERVICE_CHECK_RESULT;${nargs}" \
        > /var/nagios/rw/nagios.cmd
}

#
# Wait for the job to start and get the Job ID. We start with the
# sleep to give the job time to start up.
#
# The awk program looks at the lines between the "====" separator that
# follows "Running Jobs:" and the next "====" separator, and keeps the
# first field of the last line whose third field starts with "$JOB.".
# Note that $JOB is interpolated into the awk regular expression, so
# regex metacharacters in the job name are interpreted as such.
#
STARTTIME=$(itime)
while [[ -z "$jobid" ]]; do
    timenow=$(itime)
    delay=$((timenow - STARTTIME))
    if [[ "$delay" -gt "$CRITDELAY" ]]; then
        nagios ${STATE_CRIT} "Job ${JOB} not running"
    elif [[ "$delay" -gt "$WARNDELAY" ]]; then
        nagios ${STATE_WARN} "Job ${JOB} not running"
    fi
    sleep ${STARTINT}
    jobid=$(director_status | awk '
        BEGIN { running = 0; rjlist = 0; jobid = ""; }
        NF == 2 && $1 == "Running" && $2 == "Jobs:" { running = 1; }
        NF == 1 && $1 ~ /^=/ {
            if(rjlist) exit;
            if(running) rjlist = 1;
            next;
        }
        rjlist != 0 && $3 ~ /^'"$JOB"'\./ { jobid = $1; }
        END { print jobid; }')
done

#
# Monitor the job for as long as it stays in the Running Jobs: list.
#   lgtime - last time the job was seen in a normal ("good") state
#   lctime - last time the job status text changed
#
nagstate=${STATE_UNKNOWN}
STARTTIME=$(itime)
lgtime=${STARTTIME}
lctime=${STARTTIME}

while :; do
    #
    # Collect every line of the running job list, i.e. everything
    # between the separator after "Running Jobs:" and the next
    # separator. All lines are needed: the monitored job may not be
    # the first one, and the --waiting logic counts the other jobs.
    #
    runstat=$(director_status | awk '
        BEGIN { running = 0; rjlist = 0; }
        NF == 2 && $1 == "Running" && $2 == "Jobs:" { running = 1; }
        NF == 3 && $1 == "No" && $2 == "Jobs" && $3 == "running." { exit; }
        NF == 1 && $1 ~ /^=/ {
            if(rjlist) exit;
            if(running) rjlist = 1;
            next;
        }
        rjlist != 0 { print; }')

    jobstat=$(print -R "$runstat" | awk '
        $1 == "'"$jobid"'" { print; }')

    if [[ -z "$jobstat" ]]; then
        #
        # Job has disappeared from the Running Jobs: list
        #
        break
    fi

    #
    # Get the current status: strip the first three fields and any
    # trailing blanks, leaving only the status text.
    #
    timenow=$(itime)
    currstat=$(print -R "$jobstat" | sed \
        -e 's/^ *[^ ][^ ]* *[^ ][^ ]* *[^ ][^ ]* *//' \
        -e 's/ *$//')

    #
    # If we are allowed to accept a job waiting execution for a
    # while (e.g. because of job priority), we ignore this state
    # as long as there is at least one other job in the list
    #
    if [[ "$currstat" = "is waiting execution" && -n "$WAITING" ]]; then
        njobs=$(print -R "$runstat" | wc -l)
        if [[ "$njobs" -gt 1 ]]; then
            lgtime=${timenow}
            lctime=${timenow}
            nagios ${STATE_OK} "$jobstat"
            sleep ${MONINT}
            continue
        fi
    fi

    #
    # Anything other than a normal running state is warning/critical
    # if it stays like that for too long
    #
    case "$currstat" in
    "is running" | "Dir inserting Attributes" | "has terminated")
        nagios ${STATE_OK} "$jobstat"
        lgtime=${timenow}
        ;;
    *)
        # ngtime: time since the last normal state
        # nctime: time the current status text has been unchanged
        ngtime=$((timenow - lgtime))
        if [[ "$currstat" != "$laststat" ]]; then
            lctime=${timenow}
        fi
        nctime=$((timenow - lctime))
        if [[ "$nctime" -gt "$CRITTIME" ]]; then
            nagios ${STATE_CRIT} "$jobstat"
        elif [[ "$ngtime" -gt "$WARNTIME" ]]; then
            nagios ${STATE_WARN} "$jobstat"
        fi
        ;;
    esac

    laststat=${currstat}
    sleep ${MONINT}
done

#
# The job has left the running list: look it up by Job ID in the
# "Terminated Jobs:" list and report its final status. The sixth field
# of the matching line is taken as the termination status.
#
jobstat=$(director_status | awk '
    BEGIN { terminated = 0; tjlist = 0; }
    NF == 2 && $1 == "Terminated" && $2 == "Jobs:" { terminated = 1; }
    NF == 1 && $1 ~ /^=/ {
        if(tjlist) exit;
        if(terminated) tjlist = 1;
        next;
    }
    tjlist != 0 && $1 == "'"$jobid"'" { print; exit; }')

termstat=$(print -R "$jobstat" | awk '{print $6}')

if [[ "$termstat" != "OK" ]]; then
    if [[ -z "$jobstat" ]]; then
        jobstat="${JOB} failed"
    fi
    nagios ${STATE_CRIT} "Backup Error: $jobstat"
    exit 1
fi

nagios ${STATE_OK} "Backup OK: $jobstat"
exit 0