#!/bin/ksh
#
# Bacula monitor for Nagios
# Written by Allan Black
# Last Modified: 2011-01-28
#
# Usage: bacula_monitor [args] <jobname>
#
# Description:
#
# This script examines the output of the Bacula console "st dir"
# command and looks in the "Running Jobs" section for the job named
# as its argument. It then monitors the status of that job and tracks
# its progress, using Nagios external commands to update a passive
# service whose name is based on the job name.
#

STARTINT=30
MONINT=600
WARNDELAY=300
CRITDELAY=900
WARNTIME=900
CRITTIME=1800

BACULADIR=/usr/local/bacula
BACULASBIN=${BACULADIR}/sbin
BACULAETC=${BACULADIR}/etc
BCCONFIG=${BACULAETC}/bconsole.conf
BCONSOLE="${BACULASBIN}/bconsole -c ${BCCONFIG}"
STATCMD="st dir"

STATE_OK=0
STATE_WARN=1
STATE_CRIT=2
STATE_UNKNOWN=3

PATH=/usr/sbin:/usr/bin
export PATH

#
# If necessary, set to pfexec (Solaris), sudo (Linux) etc.
#
PFE=pfexec

usage() {
	print "usage: $0 [args]"
	print "args:"
	print "        -W|--waiting         (allow waiting on other jobs)"
	print "        -m|--monint          (monitoring interval)"
	print "        -w|-wt|--warntime    (warning time threshold)"
	print "        -c|-ct|--crittime    (critical time threshold)"
	print "        -s|--startint        (startup monitoring interval)"
	print "        -wd|--warndelay      (warning startup time threshold)"
	print "        -cd|--critdelay      (critical startup time threshold)"
	print "        -J|--job JobName     (Bacula job to monitor)"
	print "        JobName              (Bacula job to monitor)"
}

while [[ $# -gt 0 ]]; do
	case "$1" in
	-H | --help)
		usage >&2
		exit 0
		;;

	-W | --waiting)
		WAITING="yes"
		;;

	-m | --monint)
		shift
		MONINT="$1"
		;;

	-w | -wt | --warntime)
		shift
		WARNTIME="$1"
		;;

	-c | -ct | --crittime | --critime)
		shift
		CRITTIME="$1"
		;;

	-s | --startint)
		shift
		STARTINT="$1"
		;;

	-wd | --warndelay)
		shift
		WARNDELAY="$1"
		;;

	-cd | --critdelay)
		shift
		CRITDELAY="$1"
		;;

	-J | --job)
		shift
		JOB="$1"
		;;

	*)
		if [[ -n "$JOB" ]]; then
			usage >&2
			exit 1
		fi
		JOB="$1"
		;;
	esac

	shift
done

if [[ -z "$JOB" ]]; then
	usage >&2
	exit 1
fi

director_status()
{
	print -R "$STATCMD" | ${PFE} ${BCONSOLE}
}

itime() {
	case $(uname -s) in
	SunOS)
		truss date 2>&1 | awk '$1 == "time()" {print $NF}';;
	*)
		date '+%s';;
	esac
}

nagios() {
	if [[ $# -gt 0 ]]; then
		NSTAT="$1"
		shift
	else
		NSTAT=${STATE_UNKNOWN}
	fi

	if [[ $# -gt 0 ]]; then
		NMSG="$*"
	else
		NMSG="Job status unknown"
	fi

	nargs="$(uname -n);Backup: ${JOB};${NSTAT};${NMSG}"

	print -R "[$(itime)] PROCESS_SERVICE_CHECK_RESULT;${nargs}" \
					> /var/nagios/rw/nagios.cmd
}

#
# Wait for the job to start and get the Job ID. We start with the
# sleep to give the job time to start up.
#
STARTTIME=$(itime)

while [[ -z "$jobid" ]]; do
	delay=$(("$(itime)" - "$STARTTIME"))
	if [[ "$delay" -gt "$CRITDELAY" ]]; then
		nagios ${STATE_CRIT} "Job ${JOB} not running"
	elif [[ "$delay" -gt "$WARNDELAY" ]]; then
		nagios ${STATE_WARN} "Job ${JOB} not running"
	fi
	sleep ${STARTINT}

	jobid=$(director_status | awk '
				BEGIN {
					running = 0;
					rjlist = 0;
					jobid = "";
				}
				NF == 2 && $1 == "Running" && $2 == "Jobs:" {
					running = 1;
				}
				NF == 1 && $1 ~ /^=/ {
					if(rjlist) exit;
					if(running) rjlist = 1;
					continue;
				}
				rjlist != 0 && $3 ~ /^'"$JOB"'\./ {
					jobid = $1;
				}
				END {
					print jobid;
				}')
done

nagstate=${STATE_UNKNOWN}

STARTTIME=$(itime)
lgtime=${STARTTIME}
lctime=${STARTTIME}

while :; do
	runstat=$(director_status | awk '
				BEGIN {
					running = 0;
					rjlist = 0;
				}
				NF == 2 && $1 == "Running" && $2 == "Jobs:" {
					running = 1;
				}
				NF == 3 && $1 == "No" && $2 == "Jobs" \
							&& $3 == "running." {
					exit;
				}
				NF == 1 && $1 ~ /^=/ {
					if(rjlist) exit;
					if(running) rjlist = 1;
					continue;
				}
				rjlist != 0 {
					print;
					exit;
				}')

	jobstat=$(print -R "$runstat" | awk '
				$1 == "'"$jobid"'" {
					print;
				}')

	if [[ -z "$jobstat" ]]; then
		#
		# Job has disappeared from the Running Jobs: list
		#
		break
	fi

	#
	# Get the current status
	#
	timenow=$(itime)
	currstat=$(print -R "$jobstat" | sed \
			-e 's/^  *[^ ][^ ]*  *[^ ][^ ]*  *[^ ][^ ]*  *//' \
			-e 's/ *$//')

	#
	# If we are allowed to accept a job waiting execution for a
	# while (e.g. because of job priority), we ignore this state
	# as long as there is at least one other job in the list
	#
	if [[ "$currstat" = "is waiting execution"  && -n "$WAITING" ]]; then
		njobs=$(print -R "$runstat" | wc -l)
		if [[ "$njobs" -gt 1 ]]; then
			lgtime=${timenow}
			lctime=${timenow}
			nagios ${STATE_OK} "$jobstat"
			sleep ${MONINT}
			continue
		fi
	fi

	#
	# Anything other than a normal running state is warning/critical
	# if it stays like that for too long
	#
	case "$currstat" in

	"is running" \
	| "Dir inserting Attributes" \
	| "has terminated")
		nagios ${STATE_OK} "$jobstat"
		lgtime=${timenow}
		;;

	*)
		ngtime=$(("$timenow" - "$lgtime"))

		if [[ "$currstat" != "$laststat" ]]; then
			lctime=${timenow}
		fi
		nctime=$(("$timenow" - "$lctime"))

		if [[ "$nctime" -gt "$CRITTIME" ]]; then
			nagios ${STATE_CRIT} "$jobstat"
		elif [[ "$ngtime" -gt "$WARNTIME" ]]; then
			nagios ${STATE_WARN} "$jobstat"
		fi
	esac

	laststat=${currstat}
	sleep ${MONINT}
done

jobstat=$(director_status | awk '
			BEGIN {
				terminated = 0;
				tjlist = 0;
			}
			NF == 2 && $1 == "Terminated" && $2 == "Jobs:" {
				terminated = 1;
			}
			NF == 1 && $1 ~ /^=/ {
				if(tjlist) exit;
				if(terminated) tjlist = 1;
				continue;
			}
			tjlist != 0 && $1 == "'"$jobid"'" {
				print;
				exit;
			}')

termstat=$(print -R "$jobstat" | awk '{print $6}')

if [[ "$termstat" != "OK" ]]; then
	if [[ -z "$jobstat" ]]; then
		jobstat="${JOB} failed"
	fi
	nagios ${STATE_CRIT} "Backup Error: $jobstat"
	exit 1
fi

nagios ${STATE_OK} "Backup OK: $jobstat"
exit 0