eid-ops/global/overlay/usr/local/bin/scriptherder

1382 lines
49 KiB
Python
Executable file

#!/usr/bin/env python3
#
# Copyright 2014, 2015, 2017, 2018 SUNET. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are
# permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
# of conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY SUNET ``AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SUNET OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are those of the
# authors and should not be interpreted as representing official policies, either expressed
# or implied, of SUNET.
#
# Author : Fredrik Thulin <fredrik@thulin.net>
#
"""
The basic idea with Scriptherder is to run e.g. cronjobs and save metadata about the
execution, better than sending e-mails to root that never get read.
While we're at it, we save more than just the output (time, exit status, ...) which
it is then possible to use to monitor that jobs are working.
Scriptherder can be run in one of the following modes:
wrap -- Stores output, exit status etc. about a script invocation
ls -- Lists the logged script invocations
check -- Check if script execution results match given criteria,
output Nagios compatible result
lastlog -- Show last execution output of a job (or all jobs)
lastfaillog -- Show last failed execution output of a job (or all jobs)
The 'check' mode compares job status against criteria in INI-files (in checkdir, default
/etc/scriptherder/check) and produces Nagios compatible output.
Example check file contents for a job that is OK if it exited 0 and was last run less
than eight hours ago, WARNING if less than 24 hours ago, and after that CRITICAL:
[check]
ok = exit_status=0, max_age=8h
warning = exit_status=0, max_age=24h
All criteria:
exit_status=0 Must exit(0)
max_age=8h Must have executed less than 8h ago
not_running Job is not running
output_contains=OK Output contains the text OK
output_matches=.*OK.* Output matches the regexp
OR_file_exists=FILE Check if a file exists, such as a disable-file for a job
OR_running True if a job is running - useful for jobs that run @reboot etc.
"""
import argparse
import json
import logging
import logging.handlers
import os
import re
import shutil
import subprocess
import sys
import time
from datetime import datetime
from typing import Any, AnyStr, Dict, List, Mapping, NewType, Optional, Tuple, Union, cast
from configparser import ConfigParser
# Distinct type alias for parsed command line arguments (an argparse.Namespace)
Arguments = NewType("Arguments", argparse.Namespace)
# Default arguments
_defaults = {
    "debug": False,
    "syslog": False,
    "mode": "ls",
    "datadir": "/var/cache/scriptherder",
    "checkdir": "/etc/scriptherder/check",
    "umask": "033",
}
# Default check criteria, used when a check INI file does not specify its own
_check_defaults = {
    "ok": "exit_status=0,max_age=8h",
    "warning": "exit_status=0,max_age=24h",
}
# Nagios plugin exit codes, keyed by status name
exit_status = {
    "OK": 0,
    "WARNING": 1,
    "CRITICAL": 2,
    "UNKNOWN": 3,
}
class ScriptHerderError(Exception):
    """
    Base exception class for scriptherder.

    @param reason: Human readable explanation of the failure
    @param filename: File that was being processed when the error occurred
    """

    def __init__(self, reason: str, filename: str):
        # Pass the reason to the Exception base class so that str(exc) and
        # tracebacks show the message instead of rendering empty.
        super().__init__(reason)
        self.reason = reason
        self.filename = filename
class JobLoadError(ScriptHerderError):
    """Error raised when a stored job file cannot be parsed or loaded."""
class CheckLoadError(ScriptHerderError):
    """Error raised when a check definition (INI) file cannot be loaded."""
class Job:
    """
    Representation of an execution of a job.

    Wraps both freshly-executed jobs (mode 'wrap') and job records loaded back
    from disk (from_file/save_to_file). All persistent state lives in the
    self._data dict, which is what gets serialized to JSON on disk.
    """

    def __init__(self, name: str, cmd: Optional[List[str]] = None, data: Optional[Dict[str, Any]] = None):
        if cmd is None:
            cmd = []
        for x in cmd:
            assert isinstance(x, str)
        if data is None:
            # Fresh (not loaded) job: start a new version 2 record
            data = {
                "version": 2,
                "name": name,
                "cmd": cmd,
            }
        if data.get("name") is None:
            if cmd:
                # Default the job name to the basename of the wrapped command
                data["name"] = os.path.basename(cmd[0])
        if data.get("version") not in [1, 2]:
            raise JobLoadError("Unknown version: {!r}".format(data.get("version")), filename=data["filename"])
        # Output of command is saved outside self._data between execution and save
        self._output: Optional[bytes] = None
        self._data = data

    def __repr__(self) -> str:
        return "<{} instance at {:#x}: {}>".format(
            self.__class__.__name__,
            id(self),
            str(self),
        )

    def __str__(self) -> str:
        # NOTE(review): is_running is True once both start and end time are
        # recorded, i.e. the job *has executed* - see the is_running property.
        if not self.is_running:
            return "{!r} not_running".format(self.name)
        start = time.strftime("%Y-%m-%d %X", time.localtime(self.start_time))
        status = ""
        if self.check_status:
            status = ", status={}".format(self.check_status)
        return "{name} start={start} ({age} ago), duration={duration}, exit={exit}{status}".format(
            name=self.name,
            start=start,
            age=self.age,
            duration=self.duration_str,
            exit=self.exit_status,
            status=status,
        )

    @property
    def age(self) -> str:
        """Return how long ago this job executed, as a human readable string."""
        if self.start_time is None:
            return "N/A"
        return _time_to_str(time.time() - self.start_time)

    def status_summary(self) -> str:
        """
        Return short string with status of job.

        E.g. 'name[exit=0,age=19h]'
        """
        if not self.is_running:
            return "{name}[not_running]".format(name=self.name)
        assert self.start_time is not None
        age = _time_to_str(time.time() - self.start_time)
        return "{name}[exit={exit_status},age={age}]".format(
            name=self.name,
            exit_status=self.exit_status,
            age=age,
        )

    @property
    def name(self) -> str:
        """
        The name of the job.
        """
        if self._data.get("name") is None:
            # Fall back to the wrapped command name if no name was recorded
            return self.cmd
        assert isinstance(self._data["name"], str)
        return self._data["name"]

    @property
    def cmd(self) -> str:
        """
        The wrapped scripts name.
        """
        assert isinstance(self._data["cmd"], list)
        assert isinstance(self._data["cmd"][0], str)
        return self._data["cmd"][0]

    @property
    def args(self) -> List[str]:
        """
        The wrapped scripts arguments.
        """
        cmd: List[str] = self._data.get("cmd", [])
        assert len(cmd)
        for x in cmd:
            assert isinstance(x, str)
        return cmd[1:]

    @property
    def start_time(self) -> Optional[float]:
        """
        The start time of the script invocation (Unix timestamp), or None.
        """
        if "start_time" not in self._data:
            return None
        return float(self._data["start_time"])

    @property
    def end_time(self) -> Optional[float]:
        """
        The end time of the script invocation (Unix timestamp), or None.
        """
        if "end_time" not in self._data:
            return None
        return float(self._data["end_time"])

    @property
    def duration_str(self) -> str:
        """
        Time spent executing job, as a human readable string.
        """
        if self.end_time is None or self.start_time is None:
            return "NaN"
        duration = self.end_time - self.start_time
        return _time_to_str(duration)

    @property
    def exit_status(self) -> Optional[int]:
        """
        The exit status of the script invocation.
        """
        return self._data.get("exit_status")

    @property
    def pid(self) -> Optional[int]:
        """
        The process ID of the script invocation.
        """
        pid = self._data.get("pid")
        assert isinstance(pid, int) or pid is None
        return pid

    @property
    def filename(self) -> Optional[str]:
        """
        The filename this job is stored in.
        """
        return self._data.get("filename")

    @property
    def output(self) -> Optional[bytes]:
        """
        The output (STDOUT and STDERR) of the script invocation.
        """
        if self._output is not None:
            return self._output
        if not self._data.get("output") and self.output_filename:
            # NOTE(review): the output file is written as bytes in
            # save_to_file but read back in text mode here, so loaded output
            # is a str even though the annotation says bytes - callers appear
            # to pass it through _to_bytes() before use. Confirm before changing.
            f = open(self.output_filename, "r")
            self._data["output"] = f.read()
            f.close()
        return self._data.get("output")

    @property
    def output_filename(self) -> Optional[str]:
        """
        The name of the file holding the output (STDOUT and STDERR) of the script invocation.
        """
        return self._data.get("output_filename")

    @property
    def check_status(self) -> Optional[str]:
        """
        The check verdict for this job, if checked ('OK', 'WARNING', ...)
        """
        return self._data.get("check_status", None)

    @check_status.setter
    def check_status(self, value: str) -> None:
        # Only the four Nagios status names (keys of exit_status) are valid
        if value not in exit_status:
            raise ValueError("Unknown check_status {!r}".format(value))
        self._data["check_status"] = value

    @property
    def check_reason(self) -> Optional[str]:
        """
        Text reason for check verdict for this job, if checked.
        """
        return self._data.get("check_reason")

    @check_reason.setter
    def check_reason(self, value: str) -> None:
        self._data["check_reason"] = value

    def run(self) -> None:
        """
        Run script, storing various aspects of the results.

        STDERR is merged into STDOUT; the child runs with cwd '/' and all
        inherited file descriptors closed. Blocks until the child exits.
        """
        self._data["start_time"] = time.time()
        proc = subprocess.Popen(
            self._data["cmd"],
            cwd="/",
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            close_fds=True,
        )
        (stdout, _stderr) = proc.communicate()
        self._data["end_time"] = time.time()
        self._data["exit_status"] = proc.returncode
        self._data["pid"] = proc.pid
        self._output = stdout
        return None

    def save_to_file(self, datadir: str, logger: logging.Logger, filename: Optional[str] = None) -> None:
        """
        Create a record with the details of a script invocation.

        Writes metadata as JSON (atomically, via a .tmp rename) and, if there
        is captured output, the raw output bytes to a companion file.

        @param datadir: Directory to keep records in
        @param logger: logging logger
        @param filename: Filename to use - default is reasonably constructed
        """
        if filename is None:
            # Sanitize the job name: any non-alphanumeric becomes '_'
            fn = ""
            for x in self.name:
                if x.isalnum():
                    fn += x
                else:
                    fn += "_"
            assert self.start_time is not None
            _ts = datetime.fromtimestamp(self.start_time)
            # NOTE(review): {:03} formats the full microsecond value with a
            # minimum width of 3 - it is not truncated to milliseconds.
            _time_str = "{!s}.{:03}".format(
                datetime.fromtimestamp(self.start_time).strftime("%Y%m%dT%H%M%S"), _ts.microsecond
            )
            filename = "{}__ts-{}_pid-{}".format(fn, _time_str, self.pid)
        fn = str(os.path.join(datadir, filename))
        # NOTE(review): reads the module-global 'args' (set at script startup),
        # not a parameter - confirm before reusing this class outside the script.
        _umask = int(f"0o{args.umask}", 8)
        logger.debug(f"Setting umask to 0o{_umask:03o}")
        old_umask = os.umask(_umask)
        logger.debug("Saving job metadata to file '{!s}.tmp'".format(fn))
        output_fn = fn + "_output"
        f = open(fn + ".tmp", "w")
        if self._output is not None:
            # Record where the output will be stored before serializing
            self._data["output_filename"] = output_fn + ".data"
            self._data["output_size"] = len(self._output)
        f.write(json.dumps(self._data, indent=4, sort_keys=True))
        f.write("\n")
        f.close()
        # Atomic publish of the metadata file
        os.rename(fn + ".tmp", fn + ".json")
        self._data["filename"] = fn
        os.umask(old_umask)
        if self._output is not None:
            assert self.output_filename is not None
            output_fn = self.output_filename
            logger.debug("Saving job output to file {!r}".format(output_fn))
            with open(output_fn + ".tmp", "wb") as fd:
                fd.write(self._output)
            os.rename(output_fn + ".tmp", output_fn)
            # Output is now on disk; drop the in-memory copy
            self._output = None

    def check(self, check: "Check", logger: logging.Logger) -> None:
        """
        Figure out status of this job, based on it's check criteria.

        Stores the verdict in check_status/check_reason: 'OK' if the OK
        criteria pass, otherwise 'WARNING' or 'CRITICAL' depending on the
        warning criteria.

        :type check: Check
        :type logger: logging.logger
        :return: None
        """
        status, msg = check.job_is_ok(self)
        logger.debug("OK check result: {} {}".format(status, msg))
        if status is True:
            self.check_status = "OK"
            self.check_reason = ", ".join(msg)
        else:
            status, warn_msg = check.job_is_warning(self)
            logger.debug("Warning check result: {} {}".format(status, warn_msg))
            # Merge in warning messages not already present from the OK check
            msg += [x for x in warn_msg if x not in msg]
            self.check_status = "WARNING" if status is True else "CRITICAL"
            self.check_reason = ", ".join(msg)
        logger.debug("Stored check status {}, {}".format(self.check_status, self.check_reason))

    def is_ok(self) -> bool:
        # True if the last check() verdict was 'OK'
        return self.check_status == "OK"

    def is_warning(self) -> bool:
        # True if the last check() verdict was 'WARNING'
        return self.check_status == "WARNING"

    @property
    def is_running(self) -> bool:
        """
        Check if job has executed or not.

        NOTE(review): despite the name, this returns True when both start and
        end times are recorded (i.e. the job has run to completion), and False
        for placeholder jobs that never executed.
        """
        return self.start_time is not None and self.end_time is not None

    @classmethod
    def from_file(cls, filename: str) -> "Job":
        """
        Initialize this Job instance with data loaded from a file (previously created with
        `save_to_file()'.

        @param filename: Filename to load data from
        @raises JobLoadError: if the file is not valid JSON or cannot be read
        """
        with open(filename, "rt") as f:
            try:
                # Cap the read at 100 MiB to avoid loading runaway job files
                data = json.loads(f.read(100 * 1024 * 1024))
            except ValueError:
                raise JobLoadError("JSON parsing failed", filename=filename)
            except Exception as exc:
                raise JobLoadError("Error ({}) loading job output".format(repr(exc)), filename=filename)
        data["filename"] = filename
        return cls("", data=data)
class JobsList:
    """
    Load all jobs matching any specified name on the command line.

    @param args: Parsed command line arguments
    @param logger: logging logger
    @param jobs: List of jobs
    """

    def __init__(
        self, args: Arguments, logger: logging.Logger, jobs: Optional[List[Job]] = None, load_not_running: bool = True
    ):
        self.jobs: List[Job] = []
        # Lazily-built cache for the by_name property
        self._by_name: Dict[str, List[Job]] = {}
        self._args = args
        self._logger = logger
        if jobs is None:
            # No explicit job list given: load every *.json record in datadir
            jobs = []
            files = [f for f in os.listdir(args.datadir) if os.path.isfile(os.path.join(args.datadir, f))]
            for this in files:
                if not this.endswith(".json"):
                    continue
                filename = os.path.join(args.datadir, this)
                try:
                    job = Job.from_file(filename)
                except JobLoadError as exc:
                    logger.warning("Failed loading job file {!r} ({!s})".format(exc.filename, exc.reason))
                    continue
                # Respect a name filter on the command line ('ALL' disables it)
                if args.names and args.names != ["ALL"]:
                    if job.name not in args.names:
                        logger.debug(
                            "Skipping {!r} not matching {!r} (file {!s})".format(job.name, args.names, filename)
                        )
                        continue
                jobs.append(job)
        # Sort jobs, oldest first
        self.jobs = sorted(jobs, key=lambda x: x.start_time if x.start_time is not None else 0)
        if load_not_running:
            self._load_not_running()

    def _load_not_running(self) -> None:
        """
        Look for jobs that have not executed at all.

        To figure out which jobs _should_ be executed, we make an inventory of all the check files in
        args.checkdir. For some jobs, not_running is an OK/WARNING status, so call the check.not_running()
        to figure that out.
        """
        files = [f for f in os.listdir(self._args.checkdir) if os.path.isfile(os.path.join(self._args.checkdir, f))]
        for this in files:
            if not this.endswith(".ini"):
                continue
            name = this[:-4]  # remove the '.ini' suffix
            if self._args.names and self._args.names != ["ALL"]:
                if name not in self._args.names:
                    self._logger.debug(
                        "Skipping not-running {!r} not matching {!r} (file {!s})".format(name, self._args.names, this)
                    )
                    continue
            if name not in self.by_name:
                # A check exists but no job record: create a placeholder Job
                # with no start/end time so checks can report not_running.
                filename = os.path.join(self._args.checkdir, this)
                self._logger.debug("Check {!r} (filename {!r}) not found in jobs".format(name, filename))
                job = Job(name)
                self.jobs.append(job)
                # Keep the by_name cache coherent with the job just appended
                # (accessing self.by_name above may already have built it).
                if job not in self.by_name.get(name, []):
                    assert self._by_name is not None
                    self._by_name[name] = [job]

    @property
    def by_name(self) -> Dict[str, List[Job]]:
        """
        Group jobs by name into a dict - in chronological order.
        """
        if not self._by_name:
            jobs_by_name: Dict[str, List[Job]] = {}
            for job in self.jobs:
                # Jobs in self.jobs are sorted by start_time, oldest first
                if job.name not in jobs_by_name:
                    jobs_by_name[job.name] = []
                jobs_by_name[job.name].append(job)
            self._by_name = jobs_by_name
        return self._by_name

    @property
    def last_of_each(self) -> List[Job]:
        """
        Get a list of just the last job of each
        """
        res: List[Job] = []
        for jobs in self.by_name.values():
            # by_name lists are chronological, so [-1] is the most recent run
            res.append(jobs[-1])
        self._logger.debug("Last of each: {}".format(res))
        return res
# A single parsed check criterion: (what, value, negate).
# E.g. ("max_age", "8h", False) or ("OR_running", None, True).
TCriteria = NewType("TCriteria", Tuple[str, Optional[str], bool])
class Check:
    """
    Conditions for the 'check' command. Loaded from file (one file per job name),
    and used to check if a Job instance is OK or WARNING or ...
    """

    def __init__(self, ok_str: str, warning_str: str, filename: str, logger: logging.Logger, runtime_mode: bool):
        """
        Check criteria typically loaded from a file (using Check.from_file).

        See top-level comment in this script for syntax.

        @param ok_str: Comma-separated OK criteria, e.g. 'exit_status=0,max_age=8h'
        @param warning_str: Comma-separated WARNING criteria
        @param filename: File the criteria came from (for error messages)
        @param runtime_mode: True when evaluating at wrap time, False for stored results
        """
        self._logger = logger
        self.filename = filename
        try:
            self._ok_criteria = self._parse_criteria(ok_str, runtime_mode)
            self._warning_criteria = self._parse_criteria(warning_str, runtime_mode)
        except CheckLoadError:
            raise
        except Exception:
            logger.exception("Failed parsing criteria")
            raise CheckLoadError("Failed loading file", filename)
        if not runtime_mode:
            # When checking stored results, additionally require that the
            # verdict recorded at execution time was OK.
            self._ok_criteria += [cast(TCriteria, ("stored_status", "OK", False))]

    def _parse_criteria(self, data_str: str, runtime_mode: bool) -> List[TCriteria]:
        """
        Parse a full set of criteria, such as 'exit_status=0, max_age=25h'

        :param data_str: Criteria
        :return: [(what, value, negate)]
        """
        res: List[TCriteria] = []
        self._logger.debug("Parsing criteria: {!r}".format(data_str))
        for this in data_str.split(","):
            this = this.strip()
            if not this:
                continue
            #
            # Backwards-compat for renamed criteria
            #
            replace = {
                "not_running": "!OR_running",
                "output_not_contains": "!output_contains",
            }
            for old, new in replace.items():
                if this == old or this.startswith(old + "="):
                    self._logger.warning(
                        "Criteria {!r} in file {} is obsoleted by {!r}".format(old, self.filename, new)
                    )
                    this = new + this[len(old) :]
            # A leading '!' negates the criterion
            negate = False
            if this.startswith("!"):
                negate = True
                this = this[1:]
            if "=" not in this:
                # check for allowed single-value criteria
                if this not in ["OR_running"]:
                    self._logger.debug("Unrecognized token: {!r}".format(this))
                    raise CheckLoadError("Bad criteria: {!r}".format(this), self.filename)
                res += [cast(TCriteria, (this, None, negate))]
                continue
            # parse regular what=value criteria
            (what, value) = this.split("=")
            what = what.strip()
            value = value.strip()
            # max_age / OR_file_exists only make sense when checking stored
            # results; every other criterion is also meaningful at wrap time.
            is_runtime_check = what not in ["max_age", "OR_file_exists"]
            if runtime_mode != is_runtime_check:
                self._logger.debug("Skipping criteria {} for runtime_mode={}".format(this, runtime_mode))
                continue
            res += [cast(TCriteria, (what, value, negate))]
        return res

    def job_is_ok(self, job: Job) -> Tuple[bool, List[str]]:
        """
        Evaluate a Job against the OK criteria for this check.
        """
        return self._evaluate("OK", self._ok_criteria, job)

    def job_is_warning(self, job: Job) -> Tuple[bool, List[str]]:
        """
        Evaluate a Job against the WARNING criteria for this check.
        """
        return self._evaluate("warning", self._warning_criteria, job)

    def _evaluate(self, name: str, criteria: List[TCriteria], job: Job) -> Tuple[bool, List[str]]:
        """
        The actual evaluation engine.

        For each criteria `foo', look for a corresponding check_foo function and call it.

        @param name: Name of criteria, used for logging only
        @param criteria: List of criteria to test ([('max_age', '8h', False)] for example)
        @param job: The job
        @returns: True or False, and a list of strings describing success/failure
        """
        ok_msgs: List[str] = []
        fail_msgs: List[str] = []

        def separate_or(criteria: List[TCriteria]) -> Tuple[List[TCriteria], List[TCriteria]]:
            """Separate OR_ criteria from the other"""
            _or: List[TCriteria] = []
            _and: List[TCriteria] = []
            for this in criteria:
                what, _value, _negate = this
                if what.startswith("OR_"):
                    _or += [this]
                else:
                    _and += [this]
            return _or, _and

        or_criteria, and_criteria = separate_or(criteria)
        # First, evaluate the OR criteria. If any of them return True, we are done with this check.
        for this in or_criteria:
            self._logger.debug("Evaluating {!r} condition OR {!s}".format(name, _criteria_to_str(this)))
            status, msg = self._call_check(this, job)
            if status:
                self._logger.debug("{!r} OR criteria {} fulfilled: {}".format(name, this, msg))
                return True, [msg]
            else:
                fail_msgs += [msg]
        # With no AND criteria, failing every OR criterion means failure
        if not and_criteria:
            return False, fail_msgs
        # All AND criteria must pass for the check to pass
        res = True
        for this in and_criteria:
            self._logger.debug("Evaluating {!r} condition AND {!s}".format(name, _criteria_to_str(this)))
            status, msg = self._call_check(this, job)
            if not status:
                self._logger.debug(
                    "Job {!r} failed {!r} AND criteria {!r} with status {!r}".format(job, name, this, status)
                )
                res = False
                fail_msgs += [msg]
            else:
                ok_msgs += [msg]
        self._logger.debug("Check {!r} result: {!r}, messages: {!r} / {!r}".format(name, res, ok_msgs, fail_msgs))
        if res:
            return True, ok_msgs
        return False, fail_msgs

    def _call_check(self, criteria: TCriteria, job: Job) -> Tuple[bool, str]:
        # Dispatch a criterion to the matching check_<what> method below.
        what, value, negate = criteria
        func = getattr(self, "check_" + what)
        if not func:
            return False, "{}=unknown_criteria".format(what)
        status, msg = func(job, value, negate)
        self._logger.debug("Function check_{}({!r}) returned: {} {}".format(what, value, status, msg))
        if msg == "":
            # default message is the criteria as a string
            neg_str = "!" if negate else ""
            msg = "{}{}={}".format(neg_str, what, value)
        return status, msg

    # Functions named check_ are the actual criteria that can be entered in the INI files.
    # These functions should return True, False and a string describing why they succeeded or failed.
    #
    # Negating isn't done in _call_check because some checks formulate their message differently
    # when they are negated.
    def check_exit_status(self, job: Job, value: str, negate: bool) -> Tuple[bool, str]:
        """Check if job exit status matches 'value'"""
        res = job.exit_status == int(value)
        if negate:
            res = not res
        if res:
            # short message for happy-case
            return True, "exit={}".format(value)
        if negate:
            return False, "exit={}=={}".format(job.exit_status, value)
        return False, "exit={}!={}".format(job.exit_status, value)

    def check_max_age(self, job: Job, value: str, negate: bool) -> Tuple[bool, str]:
        """Check that the job finished no longer than 'value' (e.g. '8h') ago."""
        _value = _parse_time_value(value)
        assert _value is not None
        now = int(time.time())
        if job.end_time is None:
            # Job never finished (or never ran) - treat as too old
            res = False
        else:
            res = job.end_time > (now - _value)
        if negate:
            res = not res
        if res:
            # No message for happy-case
            return True, ""
        if negate:
            return False, "age={}<={}".format(job.age, _time_to_str(_value))
        return False, "age={}>{}".format(job.age, _time_to_str(_value))

    def check_output_contains(self, job: Job, value: str, negate: bool) -> Tuple[bool, str]:
        """Check if the job output contains the substring 'value'."""
        _output_bytes = b"" if job.output is None else _to_bytes(job.output)
        res = _to_bytes(value) in _output_bytes
        if negate:
            res = not res  # invert result
        neg_str = "!" if negate else ""
        return res, "{}output_contains={}=={}".format(neg_str, value, res)

    def check_output_matches(self, job: Job, value: str, negate: bool) -> Tuple[bool, str]:
        """Check if the job output matches the regexp 'value' (from the start)."""
        # NOTE(review): unlike check_output_contains, job.output is passed to
        # _to_bytes() without a None guard - presumably _to_bytes handles
        # None; confirm against its definition.
        res = re.match(_to_bytes(value), _to_bytes(job.output)) is not None
        if negate:
            res = not res  # invert result
        neg_str = "!" if negate else ""
        return res, "{}output_matches={}=={}".format(neg_str, value, res)

    def check_OR_running(self, job: Job, value: str, negate: bool) -> Tuple[bool, str]:
        """OR-criterion: true if the job has executed (see Job.is_running)."""
        res = job.is_running
        msg = "is_running" if res else "not_running"
        if negate:
            res = not res
        return res, msg

    def check_OR_file_exists(self, job: Job, value: str, negate: bool) -> Tuple[bool, str]:
        """OR-criterion: true if the file 'value' exists (e.g. a disable-file)."""
        res = os.path.isfile(value)
        msg = "file_exists=" if res else "file_does_not_exist="
        msg += value
        if negate:
            res = not res
        return res, msg

    def check_stored_status(self, job: Job, value: str, negate: bool) -> Tuple[bool, str]:
        """Check the verdict recorded at execution time against 'value'."""
        res = job.check_status == value
        if negate:
            res = not res  # invert result
        neg_str = "!" if negate else ""
        return res, "{}stored_status={}=={}".format(neg_str, value, res)

    @classmethod
    def from_file(cls, filename: str, logger: logging.Logger, runtime_mode: bool = False) -> "Check":
        """Load check criteria from an INI file (section [check], keys ok/warning)."""
        # _check_defaults supplies ok/warning when the file omits them
        config = ConfigParser(_check_defaults)
        if not config.read([filename]):
            raise CheckLoadError("Failed reading file", filename)
        _section = "check"
        try:
            _ok_criteria = config.get(_section, "ok")
            _warning_criteria = config.get(_section, "warning")
        except Exception as exc:
            logger.exception(exc)
            raise CheckLoadError("Failed loading file", filename)
        return cls(_ok_criteria, _warning_criteria, filename, logger, runtime_mode)
class CheckStatus:
    """
    Aggregated status of job invocations for --mode check.

    Attributes:
      checks_ok: List of checks in OK state ([Job()]).
      checks_warning: List of checks in WARNING state ([Job()]).
      checks_critical: List of checks in CRITICAL state ([Job()]).
    """

    def __init__(
        self,
        args: Arguments,
        logger: logging.Logger,
        runtime_mode: bool = False,
        jobs: Optional[JobsList] = None,
        checks: Optional[Dict[str, Check]] = None,
    ):
        """
        @param args: Parsed command line arguments
        @param logger: logging logger
        @param runtime_mode: Execute runtime-checks (not age) or the other way around
        """
        self.checks_ok: List[Job] = []
        self.checks_warning: List[Job] = []
        self.checks_unknown: List[Job] = []
        self.checks_critical: List[Job] = []
        # Cache of loaded Check instances, keyed by job name
        self._checks: Dict[str, Check] = {} if checks is None else checks
        self._args = args
        self._logger = logger
        self._runtime_mode = runtime_mode
        self._last_num_checked = 0
        if jobs is not None:
            self.check_jobs(jobs)

    def check_jobs(self, jobs: JobsList) -> None:
        """
        Run checks on a number of jobs.

        Look for job execution entries (parsed into Job() instances), group them
        per check name and determine the status. For each group, append status
        to one of the three aggregate status lists of this object (checks_ok,
        checks_warning or checks_critical).
        """
        # Reset aggregates so check_jobs can be called more than once
        self.checks_ok = []
        self.checks_warning = []
        self.checks_unknown = []
        self.checks_critical = []
        # determine total check status based on all logged invocations of this job
        for (name, these_jobs) in jobs.by_name.items():
            self._logger.debug("")
            try:
                check = self.get_check(name)
            except CheckLoadError as exc:
                # Can't evaluate this job at all: record it as UNKNOWN
                self._logger.error("Failed loading check for {}: {}".format(name, exc.reason))
                this_job = these_jobs[-1]
                this_job.check_status = "UNKNOWN"
                this_job.check_reason = "Failed to load check"
                self.checks_unknown.append(this_job)
                continue
            # Check most recent job first since it is pretty probable one
            # will be OK or WARNING. More efficient than wading through tens or
            # hundreds of jobs to find that the last one is OK.
            these_jobs.reverse()
            matched = False
            for job in these_jobs:
                self._logger.debug("Checking {!r}: {!r}".format(name, job))
                job.check(check, self._logger)
                self._logger.debug("Checking for OK status")
                if job.is_ok():
                    self._logger.debug("Job status is OK")
                    self.checks_ok.append(job)
                    matched = True
                    break
                else:
                    self._logger.debug("Checking for WARNING status")
                    if job.is_warning():
                        self._logger.debug("Job status is WARNING")
                        self.checks_warning.append(job)
                        matched = True
                        break
            if not matched:
                # No invocation was OK or WARNING: the most recent one
                # (these_jobs[0] after the reverse) is reported CRITICAL.
                self._logger.debug("Concluding CRITICAL status")
                self.checks_critical.append(these_jobs[0])
        self._last_num_checked = len(jobs.by_name)

    def get_check(self, name: str) -> Check:
        """
        Load and cache the evaluation criteria for this job.

        :param name: Name of job
        :return: The check
        :raises CheckLoadError: if the check INI file cannot be loaded
        """
        if name not in self._checks:
            check_filename = os.path.join(self._args.checkdir, name + ".ini")
            self._logger.debug("Loading check definition from {!r}".format(check_filename))
            try:
                self._checks[name] = Check.from_file(check_filename, self._logger, runtime_mode=self._runtime_mode)
            except ScriptHerderError:
                raise CheckLoadError("Failed loading check", filename=check_filename)
        return self._checks[name]

    @property
    def num_jobs(self) -> int:
        """
        Return number of jobs processed. This is number of different jobs running + not running.
        """
        return self._last_num_checked

    def aggregate_status(self) -> Tuple[str, Optional[str]]:
        """
        Return the aggregate status of all jobs checked.

        The level returned is 'OK', 'WARNING', 'CRITICAL' or 'UNKNOWN'.

        :return: Level and message
        """
        if self.num_jobs == 1:
            # Single job check requested, output detailed information
            if self.checks_ok:
                return "OK", self.checks_ok[-1].check_reason
            if self.checks_warning:
                return "WARNING", self.checks_warning[-1].check_reason
            if self.checks_critical:
                return "CRITICAL", self.checks_critical[-1].check_reason
            if self.checks_unknown:
                return "UNKNOWN", self.checks_unknown[-1].check_reason
            return "FAIL", "No jobs found for {!r}?".format(self._args.names)
        # When looking at multiple jobs at once, logic gets a bit reversed - if ANY
        # job invocation is CRITICAL/WARNING, the aggregate message given to
        # Nagios will have to be a failure.
        if self.checks_critical:
            return "CRITICAL", _status_summary(self.num_jobs, self.checks_critical)
        if self.checks_warning:
            return "WARNING", _status_summary(self.num_jobs, self.checks_warning)
        if self.checks_unknown:
            return "UNKNOWN", _status_summary(self.num_jobs, self.checks_unknown)
        if self.checks_ok:
            return "OK", _status_summary(self.num_jobs, self.checks_ok)
        return "UNKNOWN", "No jobs found?"
def parse_args(defaults: Mapping[str, Any]) -> Arguments:
    """
    Parse the command line arguments.

    @param defaults: Argument defaults
    @return: Parsed arguments (argparse.Namespace cast to Arguments)
    """
    parser = argparse.ArgumentParser(
        description="Script herder script",
        add_help=True,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--debug", dest="debug", action="store_true", default=defaults["debug"], help="Enable debug operation"
    )
    parser.add_argument(
        "-d", "--datadir", dest="datadir", default=defaults["datadir"], help="Data directory", metavar="PATH"
    )
    parser.add_argument(
        "--checkdir", dest="checkdir", default=defaults["checkdir"], help="Check definitions directory", metavar="PATH"
    )
    subparsers = parser.add_subparsers(
        help="Mode of operation",
        dest="mode",
    )
    parser_wrap = subparsers.add_parser("wrap", help="Wrap a command and store metadata about it")
    parser_ls = subparsers.add_parser("ls", help="List jobs (jobs are created with 'wrap')")
    parser_check = subparsers.add_parser("check", help="Return status of jobs in a Nagios compatible way")
    parser_lastlog = subparsers.add_parser("lastlog", help="Show last entry for a job")
    parser_lastfaillog = subparsers.add_parser("lastfaillog", help="Show last failure entry for a job")
    parser_wrap.add_argument("-N", "--name", dest="name", help="Job name", metavar="NAME", required=True)
    parser_wrap.add_argument(
        "--umask",
        dest="umask",
        help=f"Job output file umask (default: {defaults['umask']})",
        metavar="OCTAL",
        default=defaults["umask"],
    )
    parser_wrap.add_argument("cmd", nargs="+", default=[], help="Script command", metavar="CMD")
    parser_wrap.add_argument(
        "--syslog", dest="syslog", action="store_true", default=defaults["syslog"], help="Enable syslog output"
    )
    parser_ls.add_argument("names", nargs="*", default=[], help="Names of jobs to include", metavar="NAME")
    parser_check.add_argument("names", nargs="*", default=[], help="Names of jobs to include", metavar="NAME")
    parser_lastlog.add_argument("names", nargs="*", default=[], help="Names of jobs to include", metavar="NAME")
    parser_lastfaillog.add_argument("names", nargs="*", default=[], help="Names of jobs to include", metavar="NAME")
    _args = sys.argv[1:]
    if _args and _args[0] == "--mode":
        # Old style invocation. Need to remove the "--mode" argument to have the subparser execute.
        _args = _args[1:]
    if not _args:
        # If we set subparsers.default to "ls", the parser_ls won't execute and there won't be an args.name
        # which causes issues later on. So we need to add a dummy argument to make the parser execute.
        _args = ["ls"]
    args = parser.parse_args(_args)
    if args.mode == "wrap":
        # The umask is later parsed as octal (int(..., 8)); reject anything
        # that is not exactly three octal digits here, instead of crashing
        # with a ValueError deep inside Job.save_to_file.
        if len(args.umask) != 3 or any(c not in "01234567" for c in args.umask):
            parser.error(f"Umask must be 3 octal digits (e.g. the default '{defaults['umask']}')")
    return cast(Arguments, args)
class ColumnMeta:
    """
    Column metadata for tabular output: the column name, the widest value
    seen so far, and the alignment ('right' right-aligns, anything else
    left-aligns).
    """

    def __init__(self, name: str, width: int = 0, align: str = ""):
        self.name = name
        self.width = width
        self.align = align
        # A column can never be narrower than its own header
        self.update_width(len(name))

    def update_width(self, value: int) -> None:
        """Widen the column if value is wider than anything seen so far."""
        self.width = max(self.width, value)

    def to_string(self, element: Tuple[str, int]) -> str:
        """Pad element's value to the column width, honouring alignment."""
        value, print_width = element
        padding = " " * (self.width - print_width)
        return padding + value if self.align == "right" else value + padding
class DataTable:
    """Format data in fixed-width columns.

    Values are push():ed one at a time into the current row, new_line()
    finishes a row and str() renders the header plus all rows. Column widths
    are tracked in the ColumnMeta instances; ANSI escape sequences are
    ignored when measuring a value's printable width.
    """

    def __init__(self, meta: List["ColumnMeta"]) -> None:
        self.rows: List[List[Tuple[str, int]]] = []
        self._curr: List[Tuple[str, int]] = []
        self._meta = meta
        # Bound sub() that strips ANSI escape sequences, so coloured values
        # are measured by their visible width only.
        self.without_ANSI = re.compile(
            r"""
            \x1b     # literal ESC
            \[       # literal [
            [;\d]*   # zero or more digits or semicolons
            [A-Za-z] # a letter
            """,
            re.VERBOSE,
        ).sub

    def push(self, value: str) -> None:
        """Add a value to the current row"""
        _print_width = len(self.without_ANSI("", value))  # get the actual print width for this value
        self._curr.append((value, _print_width))
        if len(self._meta) >= len(self._curr):
            # Track the widest value seen per column (reuse _print_width
            # instead of stripping the ANSI codes a second time).
            self._meta[len(self._curr) - 1].update_width(_print_width)

    def new_line(self) -> None:
        """Finish the current row and start a new one."""
        self.rows.append(self._curr)
        self._curr = []

    def __str__(self) -> str:
        """Return the formatted table"""
        res: List[str] = []
        # Output field names
        _this = ""
        for header in self._meta:
            _element = (header.name, len(header.name))
            _this += f"{header.to_string(_element)} "
        res.append(_this.rstrip())
        # Output data rows
        for row in self.rows:
            _this = ""
            for idx in range(len(row)):
                # BUG FIX: idx is zero-based, so the guard must be strict
                # ('idx < len'). The previous '>=' comparison raised an
                # IndexError whenever a row held more values than columns.
                if idx < len(self._meta):
                    _this += self._meta[idx].to_string(row[idx])
                else:
                    _this += str(row[idx])
                _this += " "
            res.append(_this.rstrip())
        return "\n".join(res)
def mode_wrap(args: Arguments, logger: logging.Logger) -> bool:
    """
    Execute a job and save result state in a file.

    @param args: Parsed command line arguments
    @param logger: logging logger
    """
    job = Job(args.name, cmd=args.cmd)
    logger.debug("Invoking '{!s}'".format("".join(args.cmd)))
    job.run()
    logger.debug("Finished, exit status {!r}".format(job.exit_status))
    logger.debug("Job output:\n{!s}".format(job.output))
    # Record what the jobs status evaluates to at the time of execution
    checker = CheckStatus(args, logger, runtime_mode=True)
    try:
        this_check = checker.get_check(job.name)
    except CheckLoadError:
        this_check = None
    if this_check:
        job.check(this_check, logger)
        # Failing checks are logged at WARNING so they stand out in syslog.
        level = logging.INFO if job.is_ok() else logging.WARNING
        logger.log(level, "Job {!r} check status is {} ({})".format(job.name, job.check_status, job.check_reason))
    job.save_to_file(args.datadir, logger)
    return True
def mode_ls(args: Arguments, logger: logging.Logger) -> bool:
    """
    List all the saved states for jobs.

    Prints one table row per job execution. For the most recent execution of
    each job the full check-mode status is evaluated; older executions only
    get a simple exit-status based indicator. Always returns True.

    @param args: Parsed command line arguments
    @param logger: logging logger
    """
    jobs = JobsList(args, logger)
    last_of_each = jobs.last_of_each
    if not args.names:
        # Short-mode, just show the last execution for all jobs
        print("\n=== Showing the last execution of each job, use 'ls ALL' to see all executions\n")
        chosen_jobs = last_of_each
    else:
        chosen_jobs = jobs.jobs
    checkstatus = CheckStatus(args, logger)
    _fields = [
        ColumnMeta("Start time", align="right"),
        ColumnMeta("Duration"),
        ColumnMeta("Age"),
        ColumnMeta("Status"),
        ColumnMeta("Criteria"),
        ColumnMeta("Name"),
        ColumnMeta("Filename"),
    ]
    data = DataTable(meta=_fields)
    for this in chosen_jobs:
        # "***" is the placeholder shown when a job has no recorded start time
        start = "***"
        if this.start_time:
            start = time.strftime("%Y-%m-%d %X", time.localtime(this.start_time))
        data.push(start)
        data.push(this.duration_str)
        data.push(this.age + " ago")
        if this in last_of_each:
            # For the last instance of each job, evaluate full check-mode status
            temp_jobs = JobsList(args, logger, jobs=[this], load_not_running=False)
            checkstatus.check_jobs(temp_jobs)
            (level, msg) = checkstatus.aggregate_status()
        else:
            # Older executions: just flag a non-zero exit status
            level = "-"
            if this.exit_status != 0:
                level = "Non-zero"
            msg = "exit={}, age={}".format(this.exit_status, this.age)
        # Colourize anything that is not OK, but only when stdout is a terminal
        color1 = ""
        color2 = ""
        reset = ""
        if level not in ["OK", "-"] and sys.stdout.isatty():
            color1 = "\033[;1m"  # bold
            color2 = "\033[;1m"  # bold
            reset = "\033[0;0m"
            if level == "CRITICAL":
                color1 = "\033[1;31m"  # red
        data.push(color1 + level + reset)
        data.push(color2 + (msg or "") + reset)
        data.push(this.name)
        data.push(this.filename or "")
        data.new_line()
    print(data)
    return True
def mode_check(args: Arguments, logger: logging.Logger) -> int:
    """
    Evaluate the stored states for either a specific job, or all jobs.

    Return Nagios compatible output ("scriptherder check" is intended to
    run using Nagios NRPE or similar).

    @param args: Parsed command line arguments
    @param logger: logging logger
    """
    try:
        all_jobs = JobsList(args, logger)
        status = CheckStatus(args, logger, jobs=all_jobs)
    except CheckLoadError as exc:
        # Unreadable check file maps to Nagios UNKNOWN, not CRITICAL.
        print("UNKNOWN: Failed loading check from file '{!s}' ({!s})".format(exc.filename, exc.reason))
        return exit_status["UNKNOWN"]
    level, msg = status.aggregate_status()
    print("{!s}: {!s}".format(level, msg))
    return exit_status[level]
def mode_lastlog(args: Arguments, logger: logging.Logger, fail_status: bool = False) -> Optional[int]:
    """
    View script output for the last execution for either a specific
    job, or all jobs.

    @param args: Parsed command line arguments
    @param logger: logging logger
    @param fail_status: Show last failed log if True
    """
    all_jobs = JobsList(args, logger)
    if not all_jobs.jobs:
        print("No jobs found")
        return None
    # Keep the most recent execution of each job that has saved output on
    # disk; with fail_status, only executions that exited non-zero qualify.
    chosen = [
        job
        for job in all_jobs.last_of_each
        if job.output_filename
        and os.path.isfile(job.output_filename)
        and (not fail_status or job.exit_status != 0)
    ]
    if not chosen:
        print(
            "No script output found for {!s} with fail_status={!s}".format(", ".join(all_jobs.by_name.keys()), fail_status)
        )
        return False
    for job in chosen:
        with open(job.output_filename, "r") as f:
            print("=== Script output of {!r}".format(job))
            shutil.copyfileobj(f, sys.stdout)
            print("=== End of script output\n")
    return True
def _status_summary(num_jobs: int, failed: List[Job]) -> str:
    """
    String format routine used in output of checks status.

    Example result: "2/3 jobs in this state: foo, bar".
    """
    # Pluralise on the total number of jobs, not the number of failed ones.
    suffix = "" if num_jobs == 1 else "s"
    details = ", ".join(sorted(str(x.status_summary()) for x in failed))
    return "{jobs}/{num_jobs} job{plural} in this state: {summary}".format(
        jobs=len(failed),
        num_jobs=num_jobs,
        summary=details,
        plural=suffix,
    )
def _parse_time_value(value: str) -> Optional[int]:
"""
Parse time period strings such as 1d. A lone number is considered number of seconds.
A composit string like 1d1h will be split in to two parts and evaluated recursivly.
Return parsed value as number of seconds.
@param value: Value to parse
"""
match = re.match(r"^(\d+)([hmsd]*)$", value)
if match:
num = int(match.group(1))
what = match.group(2)
if what == "m":
return num * 60
if what == "h":
return num * 3600
if what == "d":
return num * 86400
return num
else:
alpha = list(filter(None, re.split('[0-9]', value)))
numeric = list(filter(None, re.split('[mhd]', value)))
return _parse_time_value(str(numeric[0]) + alpha[0]) + _parse_time_value(str(numeric[1] + alpha[1]))
def _time_to_str(value: Union[float, int]) -> str:
"""
Format number of seconds to short readable string.
"""
if value < 1:
# milliseconds
return "{!s}ms".format(int(value * 1000))
if value < 60:
return "{!s}s".format(int(value))
if value < 3600:
return "{!s}m".format(int(value / 60))
if value < 86400:
return "{!s}h".format(int(value / 3600))
days = int(value / 86400)
return "{!s}d{!s}h".format(days, int((value % 86400) / 3600))
def _to_bytes(data: Optional[AnyStr]) -> bytes:
if not data:
return b""
if isinstance(data, bytes):
return data
return data.encode("utf-8")
def _criteria_to_str(criteria: TCriteria) -> str:
    """Render a (name, value, negate) criteria tuple as 'name==value' or 'name!=value'."""
    name, value, negate = criteria
    op = "==" if not negate else "!="
    return "{}{}{}".format(name, op, value)
def main(myname: str, args: Arguments, logger: Optional[logging.Logger] = None) -> Optional[Union[int, bool]]:
    """
    Main entry point for either wrapping a script, or checking the status of it.

    Dispatches on args.mode. Returns an int exit code (check/lastlog modes),
    a bool success flag (wrap/ls modes), or False for an unknown mode.

    @param myname: String, used for logging
    @param args: Command line arguments
    @param logger: logging logger (created here if not provided)
    """
    # initialize various components
    if not logger:
        level = logging.INFO
        if args.debug:
            level = logging.DEBUG
        logging.basicConfig(
            level=level, stream=sys.stderr, format="%(asctime)s: %(threadName)s %(levelname)s %(message)s"
        )
        logger = logging.getLogger(myname)
    # If stderr is not a TTY, change the log level of the StreamHandler (stream = sys.stderr above) to ERROR
    if not sys.stderr.isatty() and not args.debug:
        for this_h in logging.getLogger("").handlers:
            this_h.setLevel(logging.ERROR)
    if args.debug:
        logger.setLevel(logging.DEBUG)
    if args.mode == "wrap" and args.syslog:
        # Wrapped (cron) jobs additionally log to syslog at INFO and above
        syslog_h = logging.handlers.SysLogHandler("/dev/log")
        formatter = logging.Formatter("%(name)s: %(levelname)s %(message)s")
        syslog_h.setFormatter(formatter)
        syslog_h.setLevel(logging.INFO)
        logger.addHandler(syslog_h)
    if args.mode == "wrap":
        return mode_wrap(args, logger)
    elif args.mode == "ls":
        return mode_ls(args, logger)
    elif args.mode == "check":
        return mode_check(args, logger)
    elif args.mode == "lastlog":
        return mode_lastlog(args, logger)
    elif args.mode == "lastfaillog":
        return mode_lastlog(args, logger, fail_status=True)
    logger.error("Invalid mode {!r}".format(args.mode))
    return False
if __name__ == "__main__":
    try:
        progname = os.path.basename(sys.argv[0])
        args = parse_args(_defaults)
        res = main(progname, args=args)
        # BUGFIX: bool must be tested before int. bool is a subclass of int,
        # so the old `isinstance(res, int)` check caught a True (success)
        # result and called sys.exit(True), i.e. exited with status 1.
        if isinstance(res, bool):
            sys.exit(0 if res else 1)
        if isinstance(res, int):
            sys.exit(res)
        sys.exit(0 if res else 1)
    except KeyboardInterrupt:
        sys.exit(0)