drive-ops/global/overlay/usr/local/bin/run-cosmos
Patrik Lundin 01768129f0
fleetlock: configurable lock/unlock timeout
While we already support setting a healthcheck timeout it probably
makes sense to be able to control how long we wait for a
fleetlock_lock() or fleetlock_unlock() call. This becomes important if
only running cosmos once a night or something like that. In that case we
you probably want to give a physical machine more than than 1 minute to
complete a reboot etc.

This can now be controlled by setting fleetlock_lock_timeout and
fleetlock_unlock_timeout in /etc/run-cosmos-fleetlock-conf. Keep in mind
that while it can make sense to increase the time for taking a lock,
releasing a lock should always be fast (either you have it and release
it, or you dont have it and it is a no-op) so setting a long unlock
timeout should probably never be done.

Since we also potentially wait the unlock timeout at boot (if the
fleetlock server is broken etc) that is another reason to keep it
short. The default 1m is probably OK for most uses.
2024-07-03 13:27:52 +02:00

153 lines
4.8 KiB
Bash
Executable file

#!/bin/bash
#
# Simplify running cosmos, with serialization if flock is available.
#
readonly PROGNAME=$(basename "$0")
readonly LOCKFILE_DIR=/tmp
readonly LOCK_FD=200
readonly FLEETLOCK_CONFIG=/etc/run-cosmos-fleetlock-conf
readonly FLEETLOCK_DISABLE_FILE=/etc/run-cosmos-fleetlock-disable
readonly FLEETLOCK_TOOL=/usr/local/bin/sunet-fleetlock
readonly FLEETLOCK_UNLOCK_SERVICE=run-cosmos-fleetlock-unlocker.service
readonly HEALTHCHECK_TOOL=/usr/local/bin/sunet-machine-healthy
readonly HEALTHCHECK_DISABLE_FILE=/etc/run-cosmos-healthcheck-disable
lock() {
local prefix=$1
local fd=${2:-$LOCK_FD}
local lock_file=$LOCKFILE_DIR/$prefix.lock
# create lock file
eval "exec $fd>$lock_file"
# acquier the lock
flock -n "$fd" \
&& return 0 \
|| return 1
}
eexit() {
local error_str="$*"
echo "$error_str"
exit 1
}
oexit() {
local info_str="$*"
echo "$info_str"
exit 0
}
fleetlock_enable_unlock_service() {
# In case e.g. the unit file has been removed "FragmentPath" will still
# return the old filename until daemon-reload is called, so do that here
# before we try checking for the FragmentPath.
need_reload=$(systemctl show --property NeedDaemonReload $FLEETLOCK_UNLOCK_SERVICE | awk -F= '{print $2}')
if [ "$need_reload" = "yes" ]; then
systemctl daemon-reload
fi
unit_file=$(systemctl show --property FragmentPath $FLEETLOCK_UNLOCK_SERVICE | awk -F= '{print $2}')
if [ -z "$unit_file" ]; then
# No unit file matching the service name, do nothing
return 0
fi
# Enable the service if needed
systemctl is-enabled --quiet $FLEETLOCK_UNLOCK_SERVICE || systemctl enable --quiet $FLEETLOCK_UNLOCK_SERVICE
}
fleetlock_lock() {
if [ ! -f $FLEETLOCK_DISABLE_FILE ] && [ -f $FLEETLOCK_CONFIG ] && [ -x $FLEETLOCK_TOOL ]; then
# Make sure the unlock service is enabled before we take a lock if
# cosmos ends up rebooting the machine before fleetlock_unlock() is
# called.
fleetlock_enable_unlock_service || return 1
local fleetlock_group=""
local optional_args=()
# shellcheck source=/dev/null
. $FLEETLOCK_CONFIG || return 1
if [ -z "$fleetlock_group" ]; then
echo "Unable to set fleetlock_group"
return 1
fi
if [ -n "$fleetlock_lock_timeout" ]; then
optional_args+=("--timeout")
optional_args+=("$fleetlock_lock_timeout")
fi
echo "Getting fleetlock lock"
$FLEETLOCK_TOOL --lock-group "$fleetlock_group" --lock "${optional_args[@]}" || return 1
fi
return 0
}
fleetlock_unlock() {
if [ ! -f $FLEETLOCK_DISABLE_FILE ] && [ -f $FLEETLOCK_CONFIG ] && [ -x $FLEETLOCK_TOOL ]; then
local fleetlock_group=""
local optional_args=()
# shellcheck source=/dev/null
. $FLEETLOCK_CONFIG || return 1
if [ -z "$fleetlock_group" ]; then
echo "Unable to set fleetlock_group"
return 1
fi
if [ -n "$fleetlock_unlock_timeout" ]; then
optional_args+=("--timeout")
optional_args+=("$fleetlock_unlock_timeout")
fi
machine_is_healthy || return 1
echo "Releasing fleetlock lock"
$FLEETLOCK_TOOL --lock-group "$fleetlock_group" --unlock "${optional_args[@]}" || return 1
fi
return 0
}
machine_is_healthy() {
if [ ! -f $HEALTHCHECK_DISABLE_FILE ] && [ -x $HEALTHCHECK_TOOL ]; then
local fleetlock_healthcheck_timeout=""
local optional_args=()
# shellcheck source=/dev/null
. $FLEETLOCK_CONFIG || return 1
if [ -n "$fleetlock_healthcheck_timeout" ]; then
optional_args+=("--timeout")
optional_args+=("$fleetlock_healthcheck_timeout")
fi
echo "Running any health checks"
$HEALTHCHECK_TOOL "${optional_args[@]}" || return 1
fi
return 0
}
main () {
lock "$PROGNAME" || eexit "Only one instance of $PROGNAME can run at one time."
fleetlock_lock || eexit "Unable to acquire fleetlock lock."
cosmos "$@" update
cosmos "$@" apply
fleetlock_unlock || eexit "Unable to release fleetlock lock."
touch /var/run/last-cosmos-ok.stamp
find /var/lib/puppet/reports/ -type f -mtime +10 -print0 | xargs -0 rm -f
if [ -f /cosmos-reboot ]; then
rm -f /cosmos-reboot
reboot
fi
}
# Most of the time we just pass on any arguments to the underlying cosmos
# tools, if adding special cases here make sure to not shadow any arguments
# (like "-v") which users expect to be passed on to cosmos.
case "$1" in
"fleetlock-unlock")
lock "$PROGNAME" || oexit "$PROGNAME appears locked by a running run-cosmos, let it handle unlocking instead."
fleetlock_unlock || eexit "Unable to release fleetlock lock."
;;
*)
main "$@"
;;
esac