Add fleetlock support to run-cosmos

Makes run-cosmos request a fleetlock lock before running cosmos "update"
and "apply" steps. This is helpful for making sure only one (or several)
machine out of some set of machines runs cosmos changes at a time. This
way if cosmos (or puppet) decides that a service needs to be restarted
this will only happen on a subset of machines at a time. When the cosmos
"apply" is done a fleetlock unlock request will be performed so the
other machines can progress.

The unlock code in run-cosmos will also run the new tool
sunet-machine-healthy to decide things are good before unlocking. This
way if a restarted service breaks this will stop the unlock attempt
and in turn make it so the others should not break their service as
well, giving an operator time to figure out what is wrong.
This commit is contained in:
Patrik Lundin 2023-06-16 11:30:54 +02:00
parent 397f0595f7
commit 7baf9affb1
Signed by untrusted user: patlu
GPG key ID: A0A812BA2249F294
3 changed files with 389 additions and 0 deletions

View file

@ -6,6 +6,11 @@
readonly PROGNAME=$(basename "$0")
readonly LOCKFILE_DIR=/tmp
readonly LOCK_FD=200
readonly FLEETLOCK_CONFIG=/etc/run-cosmos-fleetlock-conf
readonly FLEETLOCK_DISABLE_FILE=/etc/run-cosmos-fleetlock-disable
readonly FLEETLOCK_TOOL=/usr/local/bin/sunet-fleetlock
readonly HEALTHCHECK_TOOL=/usr/local/bin/sunet-machine-healthy
readonly HEALTHCHECK_DISABLE_FILE=/etc/run-cosmos-healthcheck-disable
lock() {
local prefix=$1
@ -28,10 +33,51 @@ eexit() {
exit 1
}
fleetlock_lock() {
if [ ! -f $FLEETLOCK_DISABLE_FILE ] && [ -f $FLEETLOCK_CONFIG ] && [ -x $FLEETLOCK_TOOL ]; then
local fleetlock_group=""
# shellcheck source=/dev/null
. $FLEETLOCK_CONFIG || return 1
if [ -z "$fleetlock_group" ]; then
echo "Unable to set fleetlock_group"
return 1
fi
echo "Getting fleetlock lock"
$FLEETLOCK_TOOL --lock-group "$fleetlock_group" --lock || return 1
fi
return 0
}
fleetlock_unlock() {
if [ ! -f $FLEETLOCK_DISABLE_FILE ] && [ -f $FLEETLOCK_CONFIG ] && [ -x $FLEETLOCK_TOOL ]; then
local fleetlock_group=""
# shellcheck source=/dev/null
. $FLEETLOCK_CONFIG || return 1
if [ -z "$fleetlock_group" ]; then
echo "Unable to set fleetlock_group"
return 1
fi
machine_is_healthy || return 1
echo "Releasing fleetlock lock"
$FLEETLOCK_TOOL --lock-group "$fleetlock_group" --unlock || return 1
fi
return 0
}
machine_is_healthy() {
if [ ! -f $HEALTHCHECK_DISABLE_FILE ] && [ -x $HEALTHCHECK_TOOL ]; then
echo "Running any health checks"
$HEALTHCHECK_TOOL || return 1
fi
return 0
}
main () {
lock "$PROGNAME" || eexit "Only one instance of $PROGNAME can run at one time."
fleetlock_lock || eexit "Unable to acquire fleetlock lock."
cosmos "$@" update
cosmos "$@" apply
fleetlock_unlock || eexit "Unable to release fleetlock lock."
touch /var/run/last-cosmos-ok.stamp

View file

@ -0,0 +1,240 @@
#!/usr/bin/env python3
# pylint: disable=invalid-name
# pylint: enable=invalid-name
""" Tool for taking and releasing fleetlock locks, used by run-cosmos if fleetlock is configured """
#
# You need a config file in "configparser" format with a section for the
# lock group you are using, so if the file describes two lock groups where one
# is called "fl-test1" and the other "fl-test2" then example contents would
# look like this:
# ===
# [fl-test1]
# server = https://fleetlock-server1.example.com
# password = mysecret1
#
# [fl-test2]
# server = https://fleetlock-server2.example.com
# password = mysecret2
# ===
#
# The password needs to match an acl configured for the lock group in the
# knubbis-fleetlock service.
#
# When modifying this code please make sure it is passed through the following
# tools:
# ===
# black
# pylint
# mypy --strict
# ===
import platform
import sys
import signal
import time
import argparse
import configparser
import os.path
from typing import Optional, Union
from types import FrameType
import requests
class TimeoutException(Exception):
"""Exception raised when we hit tool timeout"""
def timeout_handler(signum: int, frame: Optional[FrameType]) -> None:
"""This is called if the tool takes too long to run"""
raise TimeoutException(f"{os.path.basename(sys.argv[0])} hit --timeout limit")
def do_fleetlock_request(
config: configparser.ConfigParser, args: argparse.Namespace, operation: str
) -> bool:
"""Perform fleetlock request based on given operation and return true if it succeeded"""
fleetlock_data = {
"client_params": {
"group": args.lock_group,
"id": args.lock_id,
},
}
fleetlock_headers = {
"fleet-lock-protocol": "true",
}
if operation == "lock":
fleetlock_path = "/v1/pre-reboot"
url = config[args.lock_group]["server"] + fleetlock_path
elif operation == "unlock":
fleetlock_path = "/v1/steady-state"
url = config[args.lock_group]["server"] + fleetlock_path
else:
raise ValueError(f"unsupported operation: {operation}")
# Log the request-id header from responses so we can track requests in
# the knubbis-fleetlock logs more easily
request_id_key = "request-id"
request_id = None
# Loop forever: we depend on the SIGALRM timout to raise an error if it
# takes too long
while True:
if args.verbose:
print(f"{operation} POST at url {url}")
resp = requests.post(
url,
headers=fleetlock_headers,
json=fleetlock_data,
timeout=args.timeout,
auth=("", config[args.lock_group]["password"]),
)
if request_id_key in resp.headers:
request_id = resp.headers[request_id_key]
if resp.status_code == requests.codes.ok: # pylint: disable=no-member
if args.verbose:
print(
f"successful {operation} request for lock ID '{args.lock_id}'",
f"in lock group '{args.lock_group}' ({request_id_key}: {request_id})",
)
return True
# If the request is unauthorized this means we probably either try to
# use a lock group that does not exist, or we are using the wrong
# credentials and in either case we can give up immediately
if resp.status_code == requests.codes.unauthorized: # pylint: disable=no-member
print(
f"{operation} request unauthorized: incorrect lock group name '{args.lock_group}'",
f"or wrong credentials? ({request_id_key}: {request_id})",
)
return False
# If the request failed in some other way we expect a JSON formatted
# response message:
print(
f"{operation} request failed:"
+ " "
+ resp.content.decode("utf-8").rstrip()
+ " "
+ f"({request_id_key}: {request_id})"
)
time.sleep(1)
def read_config(args: argparse.Namespace) -> Union[configparser.ConfigParser, None]:
"""Read lock group specific settings from config file"""
config = configparser.ConfigParser()
with open(args.config, encoding="utf-8") as config_fileobj:
config.read_file(config_fileobj)
if args.lock_group not in config:
print(f"missing required config section for lock group '{args.lock_group}'")
return None
required_settings = {
"server",
"password",
}
have_required_settings = True
for setting in required_settings:
if setting not in config[args.lock_group]:
print(
f"missing required setting '{setting}' in lock group '{args.lock_group}'"
)
have_required_settings = False
if not have_required_settings:
return None
return config
def main() -> None:
"""Starting point of the program"""
# How long to wait per HTTP request to fleetlock service
default_request_timeout = 5
# How to long before giving up and exiting the tool with a failure
default_timeout = 60
default_config_file = "/etc/sunet-fleetlock/sunet-fleetlock.conf"
parser = argparse.ArgumentParser(description="Take and release fleetlock lock.")
parser.add_argument("--verbose", help="print more information", action="store_true")
parser.add_argument(
"--config",
help=f"the conf file to read (default: {default_config_file})",
default=default_config_file,
)
parser.add_argument(
"--lock-group", required=True, help="the group to take a lock in"
)
parser.add_argument(
"--lock-id",
help=f"the lock ID to use in the group (default: {platform.node()})",
default=platform.node(),
)
parser.add_argument(
"--timeout",
type=int,
help=f"how many seconds before giving up and exiting tool (default: {default_timeout}s)",
default=default_timeout,
)
parser.add_argument(
"--request_timeout",
type=int,
help=f"individal fleetlock HTTP request timeout (default: {default_request_timeout}s)",
default=default_request_timeout,
)
action_group = parser.add_mutually_exclusive_group(required=True)
action_group.add_argument("--lock", action="store_true", help="lock a reboot slot")
action_group.add_argument(
"--unlock", action="store_true", help="unlock a reboot slot"
)
args = parser.parse_args()
config = read_config(args)
if config is None:
sys.exit(1)
# Give up if tool has been running for more than --timeout seconds:
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(args.timeout)
if args.lock:
locked = False
try:
locked = do_fleetlock_request(config, args, "lock")
except TimeoutException as exc:
print(exc)
if locked:
sys.exit(0)
if args.unlock:
unlocked = False
try:
unlocked = do_fleetlock_request(config, args, "unlock")
except TimeoutException as exc:
print(exc)
if unlocked:
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,103 @@
#!/usr/bin/env python3
# pylint: disable=invalid-name
# pylint: enable=invalid-name
""" Run any check tools in a directory to decide if the machine is considered
healthy, called by run-cosmos if fleetlock locking is configured """
import pathlib
import os
import os.path
import subprocess
import sys
import signal
import argparse
from typing import List, Optional
from types import FrameType
class TimeoutException(Exception):
"""Exception returned when checks takes too long"""
def timeout_handler(signum: int, frame: Optional[FrameType]) -> None:
"""This is called if the tool takes too long to run"""
raise TimeoutException(f"{os.path.basename(sys.argv[0])} hit --timeout limit")
def find_checks(check_dir: str) -> List[pathlib.Path]:
"""Find all executable .check files in the given directory"""
check_files = []
dirobj = pathlib.Path(check_dir)
# iterdir() will raise error if the directory does not exist, and in this
# case we will just return an empty list
try:
for entry in dirobj.iterdir():
if entry.is_file():
if str(entry).endswith(".check") and os.access(entry, os.X_OK):
check_files.append(entry)
# run checks in alphabetical order
check_files = sorted(check_files)
except FileNotFoundError:
pass
return check_files
def run_checks(check_files: List[pathlib.Path]) -> bool:
"""Run all checks"""
for check_file in check_files:
try:
subprocess.run([str(check_file)], check=True)
except subprocess.CalledProcessError as exc:
print(f"error: {exc}")
return False
return True
def main() -> None:
"""Starting point of the program"""
default_timeout = 60
default_health_check_dir = "/etc/sunet-machine-healthy/health-checks.d"
parser = argparse.ArgumentParser(
description="Determine if machine is considered healthy."
)
parser.add_argument("--verbose", help="print more information", action="store_true")
parser.add_argument(
"--health-check-dir",
help=f"directory to run checks from (default: {default_health_check_dir}",
default=default_health_check_dir,
)
parser.add_argument(
"--timeout",
type=int,
help=f"seconds before giving up and exiting tool (default: {default_timeout}s)",
default=default_timeout,
)
args = parser.parse_args()
checks_ok = False
# Give up if checks has been running for more than --timeout seconds:
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(args.timeout)
check_files = find_checks(args.health_check_dir)
checks_ok = run_checks(check_files)
if checks_ok:
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()