Patrik Lundin
7baf9affb1
Makes run-cosmos request a fleetlock lock before running cosmos "update" and "apply" steps. This is helpful for making sure only one (or several) machine out of some set of machines runs cosmos changes at a time. This way if cosmos (or puppet) decides that a service needs to be restarted this will only happen on a subset of machines at a time. When the cosmos "apply" is done a fleetlock unlock request will be performed so the other machines can progress. The unlock code in run-cosmos will also run the new tool sunet-machine-healthy to decide things are good before unlocking. This way if a restarted service breaks this will stop the unlock attempt and in turn make it so the others should not break their service as well, giving an operator time to figure out what is wrong.
104 lines
2.8 KiB
Python
Executable file
104 lines
2.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# pylint: disable=invalid-name
|
|
# pylint: enable=invalid-name
|
|
|
|
""" Run any check tools in a directory to decide if the machine is considered
|
|
healthy, called by run-cosmos if fleetlock locking is configured """
|
|
|
|
import pathlib
|
|
import os
|
|
import os.path
|
|
import subprocess
|
|
import sys
|
|
import signal
|
|
import argparse
|
|
|
|
from typing import List, Optional
|
|
from types import FrameType
|
|
|
|
|
|
class TimeoutException(Exception):
    """Raised when the health checks exceed the allowed run time."""
|
|
|
|
|
|
def timeout_handler(signum: int, frame: Optional[FrameType]) -> None:
    """Abort the program once the time limit has been reached.

    Both arguments are accepted but not used. The handler always raises
    TimeoutException with a message naming the running program.
    """
    program_name = os.path.basename(sys.argv[0])
    raise TimeoutException(f"{program_name} hit --timeout limit")
|
|
|
|
|
|
def find_checks(check_dir: str) -> List[pathlib.Path]:
    """Return the executable files ending in ".check" found in check_dir.

    The result is sorted so checks run in alphabetical order. A directory
    that does not exist yields an empty list.
    """
    try:
        runnable = [
            candidate
            for candidate in pathlib.Path(check_dir).iterdir()
            if candidate.is_file()
            and str(candidate).endswith(".check")
            and os.access(candidate, os.X_OK)
        ]
    except FileNotFoundError:
        # A missing directory simply means there is nothing to run
        return []

    # run checks in alphabetical order
    runnable.sort()
    return runnable
|
|
|
|
|
|
def run_checks(check_files: List[pathlib.Path]) -> bool:
    """Run each check in order and report whether all of them succeeded.

    Execution stops at the first failing check. A check fails when it exits
    with a non-zero status or when it cannot be started at all (for example
    a missing interpreter or an invalid executable format).

    Returns True if every check passed (or the list is empty), otherwise
    False. The reason for a failure is printed to stdout.
    """
    for check_file in check_files:
        try:
            subprocess.run([str(check_file)], check=True)
        except (subprocess.CalledProcessError, OSError) as exc:
            # OSError covers checks that could not be executed; treat that
            # as a failed check instead of crashing with a traceback.
            print(f"error: {exc}")
            return False

    return True
|
|
|
|
|
|
def main() -> None:
    """Parse arguments, run the health checks and exit with the result.

    Exits with status 0 when every check passed and 1 otherwise, including
    when the checks did not finish within --timeout seconds.
    """

    default_timeout = 60
    default_health_check_dir = "/etc/sunet-machine-healthy/health-checks.d"

    parser = argparse.ArgumentParser(
        description="Determine if machine is considered healthy."
    )
    parser.add_argument("--verbose", help="print more information", action="store_true")
    parser.add_argument(
        "--health-check-dir",
        help=f"directory to run checks from (default: {default_health_check_dir})",
        default=default_health_check_dir,
    )
    parser.add_argument(
        "--timeout",
        type=int,
        help=f"seconds before giving up and exiting tool (default: {default_timeout}s)",
        default=default_timeout,
    )
    args = parser.parse_args()

    # Give up if checks has been running for more than --timeout seconds:
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(args.timeout)

    try:
        check_files = find_checks(args.health_check_dir)

        if args.verbose:
            print(f"running {len(check_files)} check(s) from {args.health_check_dir}")

        checks_ok = run_checks(check_files)
    except TimeoutException as exc:
        # Report the timeout as an ordinary failure rather than a traceback
        print(f"error: {exc}")
        checks_ok = False
    finally:
        # Cancel any pending alarm so it cannot fire during shutdown
        signal.alarm(0)

    sys.exit(0 if checks_ok else 1)
|
|
|
|
|
|
# Only run the tool when executed as a script, not when imported
if __name__ == "__main__":
    main()
|