matrix-ops/global/overlay/usr/local/bin/sunet-machine-healthy
Patrik Lundin 7baf9affb1
Add fleetlock support to run-cosmos
Makes run-cosmos request a fleetlock lock before running cosmos "update"
and "apply" steps. This is helpful for making sure only one (or several)
machine out of some set of machines runs cosmos changes at a time. This
way if cosmos (or puppet) decides that a service needs to be restarted
this will only happen on a subset of machines at a time. When the cosmos
"apply" is done a fleetlock unlock request will be performed so the
other machines can progress.

The unlock code in run-cosmos will also run the new tool
sunet-machine-healthy to decide things are good before unlocking. This
way if a restarted service breaks this will stop the unlock attempt
and in turn make it so the others should not break their service as
well, giving an operator time to figure out what is wrong.
2023-06-17 08:10:00 +02:00

104 lines
2.8 KiB
Python
Executable file

#!/usr/bin/env python3
# pylint: disable=invalid-name
# pylint: enable=invalid-name
""" Run any check tools in a directory to decide if the machine is considered
healthy, called by run-cosmos if fleetlock locking is configured """
import pathlib
import os
import os.path
import subprocess
import sys
import signal
import argparse
from typing import List, Optional
from types import FrameType
class TimeoutException(Exception):
"""Exception returned when checks takes too long"""
def timeout_handler(signum: int, frame: Optional[FrameType]) -> None:
"""This is called if the tool takes too long to run"""
raise TimeoutException(f"{os.path.basename(sys.argv[0])} hit --timeout limit")
def find_checks(check_dir: str) -> List[pathlib.Path]:
"""Find all executable .check files in the given directory"""
check_files = []
dirobj = pathlib.Path(check_dir)
# iterdir() will raise error if the directory does not exist, and in this
# case we will just return an empty list
try:
for entry in dirobj.iterdir():
if entry.is_file():
if str(entry).endswith(".check") and os.access(entry, os.X_OK):
check_files.append(entry)
# run checks in alphabetical order
check_files = sorted(check_files)
except FileNotFoundError:
pass
return check_files
def run_checks(check_files: List[pathlib.Path]) -> bool:
"""Run all checks"""
for check_file in check_files:
try:
subprocess.run([str(check_file)], check=True)
except subprocess.CalledProcessError as exc:
print(f"error: {exc}")
return False
return True
def main() -> None:
"""Starting point of the program"""
default_timeout = 60
default_health_check_dir = "/etc/sunet-machine-healthy/health-checks.d"
parser = argparse.ArgumentParser(
description="Determine if machine is considered healthy."
)
parser.add_argument("--verbose", help="print more information", action="store_true")
parser.add_argument(
"--health-check-dir",
help=f"directory to run checks from (default: {default_health_check_dir}",
default=default_health_check_dir,
)
parser.add_argument(
"--timeout",
type=int,
help=f"seconds before giving up and exiting tool (default: {default_timeout}s)",
default=default_timeout,
)
args = parser.parse_args()
checks_ok = False
# Give up if checks has been running for more than --timeout seconds:
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(args.timeout)
check_files = find_checks(args.health_check_dir)
checks_ok = run_checks(check_files)
if checks_ok:
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()