Merge pull request #40 from SUNET/patlu-fleetlock
Add fleetlock support to run-cosmos
This commit is contained in:
commit
9bfac2520b
|
@ -6,6 +6,11 @@
|
||||||
readonly PROGNAME=$(basename "$0")
|
readonly PROGNAME=$(basename "$0")
|
||||||
readonly LOCKFILE_DIR=/tmp
|
readonly LOCKFILE_DIR=/tmp
|
||||||
readonly LOCK_FD=200
|
readonly LOCK_FD=200
|
||||||
|
readonly FLEETLOCK_CONFIG=/etc/run-cosmos-fleetlock-conf
|
||||||
|
readonly FLEETLOCK_DISABLE_FILE=/etc/run-cosmos-fleetlock-disable
|
||||||
|
readonly FLEETLOCK_TOOL=/usr/local/bin/sunet-fleetlock
|
||||||
|
readonly HEALTHCHECK_TOOL=/usr/local/bin/sunet-machine-healthy
|
||||||
|
readonly HEALTHCHECK_DISABLE_FILE=/etc/run-cosmos-healthcheck-disable
|
||||||
|
|
||||||
lock() {
|
lock() {
|
||||||
local prefix=$1
|
local prefix=$1
|
||||||
|
@ -28,10 +33,51 @@ eexit() {
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Take the fleetlock lock before cosmos is run.
#
# Locking is only attempted when the disable file is absent, the fleetlock
# config file exists and the fleetlock tool is executable; otherwise this is a
# no-op that succeeds. Returns 1 if the config cannot be sourced, if it does
# not set fleetlock_group, or if the tool fails to take the lock.
fleetlock_lock() {
    # All path expansions are quoted so they are never word-split or globbed
    if [ ! -f "$FLEETLOCK_DISABLE_FILE" ] && [ -f "$FLEETLOCK_CONFIG" ] && [ -x "$FLEETLOCK_TOOL" ]; then
        # The sourced config is expected to assign fleetlock_group
        local fleetlock_group=""
        # shellcheck source=/dev/null
        . "$FLEETLOCK_CONFIG" || return 1
        if [ -z "$fleetlock_group" ]; then
            echo "Unable to set fleetlock_group"
            return 1
        fi
        echo "Getting fleetlock lock"
        "$FLEETLOCK_TOOL" --lock-group "$fleetlock_group" --lock || return 1
    fi
    return 0
}
|
||||||
|
|
||||||
|
# Release the fleetlock lock after cosmos has run.
#
# Unlocking is only attempted when the disable file is absent, the fleetlock
# config file exists and the fleetlock tool is executable; otherwise this is a
# no-op that succeeds. The lock is kept (return 1) if the config cannot be
# sourced, if it does not set fleetlock_group, if the machine health check
# fails, or if the tool fails to release the lock.
fleetlock_unlock() {
    # All path expansions are quoted so they are never word-split or globbed
    if [ ! -f "$FLEETLOCK_DISABLE_FILE" ] && [ -f "$FLEETLOCK_CONFIG" ] && [ -x "$FLEETLOCK_TOOL" ]; then
        # The sourced config is expected to assign fleetlock_group
        local fleetlock_group=""
        # shellcheck source=/dev/null
        . "$FLEETLOCK_CONFIG" || return 1
        if [ -z "$fleetlock_group" ]; then
            echo "Unable to set fleetlock_group"
            return 1
        fi
        # Only give the lock back once the machine is considered healthy
        machine_is_healthy || return 1
        echo "Releasing fleetlock lock"
        "$FLEETLOCK_TOOL" --lock-group "$fleetlock_group" --unlock || return 1
    fi
    return 0
}
|
||||||
|
|
||||||
|
# Run the machine health check tool, if it is enabled.
#
# The tool is only run when the healthcheck disable file is absent and the
# tool is executable; otherwise the machine is treated as healthy. Returns 1
# if the tool exits with a non-zero status, 0 otherwise.
machine_is_healthy() {
    # All path expansions are quoted so they are never word-split or globbed
    if [ ! -f "$HEALTHCHECK_DISABLE_FILE" ] && [ -x "$HEALTHCHECK_TOOL" ]; then
        echo "Running any health checks"
        "$HEALTHCHECK_TOOL" || return 1
    fi
    return 0
}
|
||||||
|
|
||||||
main () {
|
main () {
|
||||||
lock "$PROGNAME" || eexit "Only one instance of $PROGNAME can run at one time."
|
lock "$PROGNAME" || eexit "Only one instance of $PROGNAME can run at one time."
|
||||||
|
fleetlock_lock || eexit "Unable to acquire fleetlock lock."
|
||||||
cosmos "$@" update
|
cosmos "$@" update
|
||||||
cosmos "$@" apply
|
cosmos "$@" apply
|
||||||
|
fleetlock_unlock || eexit "Unable to release fleetlock lock."
|
||||||
|
|
||||||
touch /var/run/last-cosmos-ok.stamp
|
touch /var/run/last-cosmos-ok.stamp
|
||||||
|
|
||||||
|
|
240
global/overlay/usr/local/bin/sunet-fleetlock
Executable file
240
global/overlay/usr/local/bin/sunet-fleetlock
Executable file
|
@ -0,0 +1,240 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# pylint: disable=invalid-name
|
||||||
|
# pylint: enable=invalid-name
|
||||||
|
""" Tool for taking and releasing fleetlock locks, used by run-cosmos if fleetlock is configured """
|
||||||
|
|
||||||
|
#
|
||||||
|
# You need a config file in "configparser" format with a section for the
|
||||||
|
# lock group you are using, so if the file describes two lock groups where one
|
||||||
|
# is called "fl-test1" and the other "fl-test2" then example contents would
|
||||||
|
# look like this:
|
||||||
|
# ===
|
||||||
|
# [fl-test1]
|
||||||
|
# server = https://fleetlock-server1.example.com
|
||||||
|
# password = mysecret1
|
||||||
|
#
|
||||||
|
# [fl-test2]
|
||||||
|
# server = https://fleetlock-server2.example.com
|
||||||
|
# password = mysecret2
|
||||||
|
# ===
|
||||||
|
#
|
||||||
|
# The password needs to match an acl configured for the lock group in the
|
||||||
|
# knubbis-fleetlock service.
|
||||||
|
#
|
||||||
|
# When modifying this code please make sure it is passed through the following
|
||||||
|
# tools:
|
||||||
|
# ===
|
||||||
|
# black
|
||||||
|
# pylint
|
||||||
|
# mypy --strict
|
||||||
|
# ===
|
||||||
|
|
||||||
|
import platform
|
||||||
|
import sys
|
||||||
|
import signal
|
||||||
|
import time
|
||||||
|
import argparse
|
||||||
|
import configparser
|
||||||
|
import os.path
|
||||||
|
from typing import Optional, Union
|
||||||
|
from types import FrameType
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
class TimeoutException(Exception):
    """Signals that the tool ran into its overall --timeout limit"""
|
||||||
|
|
||||||
|
|
||||||
|
def timeout_handler(signum: int, frame: Optional[FrameType]) -> None:
    """Signal handler that aborts the run by raising TimeoutException.

    Both arguments are supplied by the signal machinery and are not used.
    """
    tool_name = os.path.basename(sys.argv[0])
    raise TimeoutException(f"{tool_name} hit --timeout limit")
|
||||||
|
|
||||||
|
|
||||||
|
def do_fleetlock_request(
    config: configparser.ConfigParser, args: argparse.Namespace, operation: str
) -> bool:
    """Send a fleetlock lock or unlock request, retrying until it is settled.

    Args:
        config: parsed configuration; the section named ``args.lock_group``
            is read for its ``server`` and ``password`` settings.
        args: parsed command line arguments; ``lock_group``, ``lock_id``,
            ``verbose`` and ``request_timeout`` are used (``timeout`` is the
            fallback when ``request_timeout`` is not set).
        operation: ``"lock"`` (POST to ``/v1/pre-reboot``) or ``"unlock"``
            (POST to ``/v1/steady-state``).

    Returns:
        True when the server answers 200 OK, False when it answers
        401 Unauthorized. Any other outcome is retried after one second.

    Raises:
        ValueError: if ``operation`` is neither ``"lock"`` nor ``"unlock"``.
    """
    fleetlock_paths = {
        "lock": "/v1/pre-reboot",
        "unlock": "/v1/steady-state",
    }
    if operation not in fleetlock_paths:
        raise ValueError(f"unsupported operation: {operation}")

    url = config[args.lock_group]["server"] + fleetlock_paths[operation]

    fleetlock_data = {
        "client_params": {
            "group": args.lock_group,
            "id": args.lock_id,
        },
    }

    fleetlock_headers = {
        "fleet-lock-protocol": "true",
    }

    # Each HTTP request gets the short per-request timeout, not the overall
    # tool timeout: otherwise a single hanging request could use up the whole
    # --timeout budget without ever being retried.
    request_timeout = getattr(args, "request_timeout", None)
    if request_timeout is None:
        request_timeout = args.timeout

    # Log the request-id header from responses so we can track requests in
    # the knubbis-fleetlock logs more easily
    request_id_key = "request-id"

    # Loop forever: we depend on the SIGALRM timeout to raise an error if it
    # takes too long
    while True:
        if args.verbose:
            print(f"{operation} POST at url {url}")

        try:
            resp = requests.post(
                url,
                headers=fleetlock_headers,
                json=fleetlock_data,
                timeout=request_timeout,
                auth=("", config[args.lock_group]["password"]),
            )
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
        ) as exc:
            # Network level failures are treated like any other failed
            # attempt: report and retry until the overall timeout hits
            print(f"{operation} request failed: {exc}")
            time.sleep(1)
            continue

        # Looked up per response so an ID from an earlier attempt is never
        # reported for a response that did not carry one
        request_id = resp.headers.get(request_id_key)

        if resp.status_code == requests.codes.ok:  # pylint: disable=no-member
            if args.verbose:
                print(
                    f"successful {operation} request for lock ID '{args.lock_id}'",
                    f"in lock group '{args.lock_group}' ({request_id_key}: {request_id})",
                )

            return True

        # If the request is unauthorized this means we probably either try to
        # use a lock group that does not exist, or we are using the wrong
        # credentials and in either case we can give up immediately
        if resp.status_code == requests.codes.unauthorized:  # pylint: disable=no-member
            print(
                f"{operation} request unauthorized: incorrect lock group name '{args.lock_group}'",
                f"or wrong credentials? ({request_id_key}: {request_id})",
            )
            return False

        # If the request failed in some other way we expect a JSON formatted
        # response message; undecodable bytes are replaced rather than
        # allowed to crash the retry loop
        response_text = resp.content.decode("utf-8", errors="replace").rstrip()
        print(
            f"{operation} request failed: {response_text} "
            f"({request_id_key}: {request_id})"
        )

        time.sleep(1)
|
||||||
|
|
||||||
|
|
||||||
|
def read_config(args: argparse.Namespace) -> Union[configparser.ConfigParser, None]:
    """Read lock group specific settings from config file

    Args:
        args: parsed command line arguments; ``config`` is the path of the
            file to read and ``lock_group`` names the required section.

    Returns:
        The parsed configuration, or None (after printing the reason) if the
        file cannot be read or parsed, the lock group section is missing, or
        the section lacks the ``server`` or ``password`` setting.
    """
    config = configparser.ConfigParser()

    # An unreadable or malformed file is reported the same way as any other
    # configuration problem instead of ending in a traceback
    try:
        with open(args.config, encoding="utf-8") as config_fileobj:
            config.read_file(config_fileobj)
    except (OSError, UnicodeDecodeError, configparser.Error) as exc:
        print(f"unable to read config file '{args.config}': {exc}")
        return None

    if args.lock_group not in config:
        print(f"missing required config section for lock group '{args.lock_group}'")
        return None

    # A tuple keeps the order of any error messages stable between runs
    required_settings = ("server", "password")

    missing_settings = [
        setting
        for setting in required_settings
        if setting not in config[args.lock_group]
    ]

    # Report every missing setting, not just the first one
    for setting in missing_settings:
        print(f"missing required setting '{setting}' in lock group '{args.lock_group}'")

    if missing_settings:
        return None

    return config
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Parse the command line, then take or release the requested lock.

    Exits with status 0 if the lock/unlock request succeeded and with status
    1 otherwise (unusable configuration, unauthorized request or the overall
    --timeout being reached).
    """

    # How long to wait per HTTP request to fleetlock service
    default_request_timeout = 5

    # How long before giving up and exiting the tool with a failure
    default_timeout = 60

    default_config_file = "/etc/sunet-fleetlock/sunet-fleetlock.conf"
    default_lock_id = platform.node()

    parser = argparse.ArgumentParser(description="Take and release fleetlock lock.")
    parser.add_argument("--verbose", help="print more information", action="store_true")
    parser.add_argument(
        "--config",
        help=f"the conf file to read (default: {default_config_file})",
        default=default_config_file,
    )
    parser.add_argument(
        "--lock-group", required=True, help="the group to take a lock in"
    )
    parser.add_argument(
        "--lock-id",
        help=f"the lock ID to use in the group (default: {default_lock_id})",
        default=default_lock_id,
    )
    parser.add_argument(
        "--timeout",
        type=int,
        help=f"how many seconds before giving up and exiting tool (default: {default_timeout}s)",
        default=default_timeout,
    )
    parser.add_argument(
        "--request_timeout",
        type=int,
        help=f"individual fleetlock HTTP request timeout (default: {default_request_timeout}s)",
        default=default_request_timeout,
    )
    action_group = parser.add_mutually_exclusive_group(required=True)
    action_group.add_argument("--lock", action="store_true", help="lock a reboot slot")
    action_group.add_argument(
        "--unlock", action="store_true", help="unlock a reboot slot"
    )
    args = parser.parse_args()

    config = read_config(args)

    if config is None:
        sys.exit(1)

    # Give up if tool has been running for more than --timeout seconds:
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(args.timeout)

    # Exactly one of --lock/--unlock is set (required, mutually exclusive)
    operation = "lock" if args.lock else "unlock"

    succeeded = False
    try:
        succeeded = do_fleetlock_request(config, args, operation)
    except TimeoutException as exc:
        print(exc)
    finally:
        # The request is settled, so the pending alarm is no longer wanted
        signal.alarm(0)

    sys.exit(0 if succeeded else 1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
103
global/overlay/usr/local/bin/sunet-machine-healthy
Executable file
103
global/overlay/usr/local/bin/sunet-machine-healthy
Executable file
|
@ -0,0 +1,103 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# pylint: disable=invalid-name
|
||||||
|
# pylint: enable=invalid-name
|
||||||
|
|
||||||
|
""" Run any check tools in a directory to decide if the machine is considered
|
||||||
|
healthy, called by run-cosmos if fleetlock locking is configured """
|
||||||
|
|
||||||
|
import pathlib
|
||||||
|
import os
|
||||||
|
import os.path
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import signal
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from typing import List, Optional
|
||||||
|
from types import FrameType
|
||||||
|
|
||||||
|
|
||||||
|
class TimeoutException(Exception):
    """Signals that the health checks ran into the --timeout limit"""
|
||||||
|
|
||||||
|
|
||||||
|
def timeout_handler(signum: int, frame: Optional[FrameType]) -> None:
    """Signal handler that aborts the run by raising TimeoutException.

    Both arguments are supplied by the signal machinery and are not used.
    """
    tool_name = os.path.basename(sys.argv[0])
    raise TimeoutException(f"{tool_name} hit --timeout limit")
|
||||||
|
|
||||||
|
|
||||||
|
def find_checks(check_dir: str) -> List[pathlib.Path]:
    """Return the executable ``.check`` files found directly in check_dir.

    The result is sorted so the checks can be run in alphabetical order.
    A directory that does not exist yields an empty list.
    """
    try:
        # Listing a missing directory raises FileNotFoundError
        candidates = sorted(
            entry
            for entry in pathlib.Path(check_dir).iterdir()
            if entry.is_file()
            and str(entry).endswith(".check")
            and os.access(entry, os.X_OK)
        )
    except FileNotFoundError:
        return []

    return candidates
|
||||||
|
|
||||||
|
|
||||||
|
def run_checks(check_files: List[pathlib.Path]) -> bool:
    """Execute the checks in order, stopping at the first one that fails.

    Returns True if every check exited with status 0 (or there were no
    checks), False as soon as one check exits with a non-zero status.
    """
    for path in check_files:
        outcome = subprocess.run([str(path)], check=False)
        try:
            outcome.check_returncode()
        except subprocess.CalledProcessError as failure:
            print(f"error: {failure}")
            return False

    return True
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Run the health checks and report the result through the exit status.

    Exits with status 0 if all checks passed (or none were found) and with
    status 1 if a check failed or the --timeout limit was reached.
    """

    default_timeout = 60
    default_health_check_dir = "/etc/sunet-machine-healthy/health-checks.d"

    parser = argparse.ArgumentParser(
        description="Determine if machine is considered healthy."
    )
    parser.add_argument("--verbose", help="print more information", action="store_true")
    parser.add_argument(
        "--health-check-dir",
        help=f"directory to run checks from (default: {default_health_check_dir})",
        default=default_health_check_dir,
    )
    parser.add_argument(
        "--timeout",
        type=int,
        help=f"seconds before giving up and exiting tool (default: {default_timeout}s)",
        default=default_timeout,
    )
    args = parser.parse_args()

    # Give up if checks has been running for more than --timeout seconds:
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(args.timeout)

    checks_ok = False
    try:
        checks_ok = run_checks(find_checks(args.health_check_dir))
    except TimeoutException as exc:
        # Report the timeout as a plain message rather than a traceback
        print(exc)
    finally:
        # The checks are done, so the pending alarm is no longer wanted
        signal.alarm(0)

    sys.exit(0 if checks_ok else 1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
Loading…
Reference in a new issue