Patrik Lundin
01768129f0
While we already support setting a healthcheck timeout, it probably makes sense to be able to control how long we wait for a fleetlock_lock() or fleetlock_unlock() call. This becomes important if only running cosmos once a night or something like that. In that case you probably want to give a physical machine more than 1 minute to complete a reboot etc. This can now be controlled by setting fleetlock_lock_timeout and fleetlock_unlock_timeout in /etc/run-cosmos-fleetlock-conf. Keep in mind that while it can make sense to increase the time for taking a lock, releasing a lock should always be fast (either you have it and release it, or you don't have it and it is a no-op), so setting a long unlock timeout should probably never be done. Since we also potentially wait the unlock timeout at boot (if the fleetlock server is broken etc.), that is another reason to keep it short. The default 1m is probably OK for most uses.
153 lines
4.8 KiB
Bash
Executable file
153 lines
4.8 KiB
Bash
Executable file
#!/bin/bash
#
# Simplify running cosmos, with serialization if flock is available.
#

# Name of this script; used as the prefix of the local lock file.
# Assigned before being marked readonly so that a failing basename is not
# masked by the exit status of "readonly" itself.
PROGNAME=$(basename "$0")
readonly PROGNAME

# Local (per machine) serialization of run-cosmos instances.
readonly LOCKFILE_DIR=/tmp
readonly LOCK_FD=200

# Fleetlock: configuration file, opt-out marker, client tool and the systemd
# unit that is enabled before a fleetlock lock is taken.
readonly FLEETLOCK_CONFIG=/etc/run-cosmos-fleetlock-conf
readonly FLEETLOCK_DISABLE_FILE=/etc/run-cosmos-fleetlock-disable
readonly FLEETLOCK_TOOL=/usr/local/bin/sunet-fleetlock
readonly FLEETLOCK_UNLOCK_SERVICE=run-cosmos-fleetlock-unlocker.service

# Health check tool and its opt-out marker.
readonly HEALTHCHECK_TOOL=/usr/local/bin/sunet-machine-healthy
readonly HEALTHCHECK_DISABLE_FILE=/etc/run-cosmos-healthcheck-disable
|
|
|
|
# Take an exclusive, non-blocking flock(1) lock on $LOCKFILE_DIR/<prefix>.lock.
#
# Arguments:
#   $1 - lock file prefix (normally $PROGNAME)
#   $2 - file descriptor number to hold the lock on (default: $LOCK_FD)
# Returns:
#   0 if the lock was acquired, 1 otherwise (invalid fd, lock file could not
#   be opened, or flock reported failure, e.g. the lock is already held).
#
# The descriptor is opened in the current shell and intentionally left open:
# the lock is held for as long as the descriptor stays open.
lock() {
    local prefix=$1
    local fd=${2:-$LOCK_FD}
    local lock_file=$LOCKFILE_DIR/$prefix.lock

    # The fd is spliced into an eval below, so only accept plain digits.
    case "$fd" in
        '' | *[!0-9]*)
            return 1
            ;;
    esac

    # Create the lock file and open it on the requested descriptor. eval is
    # needed because the fd number is held in a variable; the file name is
    # expanded by the eval'd command itself (escaped "$") so that whitespace
    # or shell metacharacters in the path are never re-parsed.
    eval "exec $fd>\"\$lock_file\"" || return 1

    # Acquire the lock without waiting for a current holder.
    if flock -n "$fd"; then
        return 0
    fi
    return 1
}
|
|
|
|
# Print the given message (all arguments joined by spaces) on stdout and
# terminate the script with exit status 1.
eexit() {
    local error_str="$*"

    # printf instead of echo so that a message such as "-n" or "-e" is
    # printed verbatim rather than being interpreted as an echo option.
    printf '%s\n' "$error_str"
    exit 1
}
|
|
|
|
# Print the given message (all arguments joined by spaces) on stdout and
# terminate the script with exit status 0.
oexit() {
    local info_str="$*"

    # printf instead of echo so that a message such as "-n" or "-e" is
    # printed verbatim rather than being interpreted as an echo option.
    printf '%s\n' "$info_str"
    exit 0
}
|
|
|
|
# Make sure the systemd unit $FLEETLOCK_UNLOCK_SERVICE is enabled, if a unit
# file for it exists.
#
# Returns:
#   0 if there is no unit file for the service, or if the service is already
#   enabled; otherwise the exit status of "systemctl enable".
fleetlock_enable_unlock_service() {
    local need_reload
    local unit_file

    # In case e.g. the unit file has been removed "FragmentPath" will still
    # return the old filename until daemon-reload is called, so do that here
    # before we try checking for the FragmentPath.
    need_reload=$(systemctl show --property NeedDaemonReload "$FLEETLOCK_UNLOCK_SERVICE")
    # Output has the form "Property=value"; keep everything after the first "=".
    need_reload=${need_reload#*=}
    if [ "$need_reload" = "yes" ]; then
        systemctl daemon-reload
    fi

    unit_file=$(systemctl show --property FragmentPath "$FLEETLOCK_UNLOCK_SERVICE")
    unit_file=${unit_file#*=}
    if [ -z "$unit_file" ]; then
        # No unit file matching the service name, do nothing
        return 0
    fi

    # Enable the service if needed
    systemctl is-enabled --quiet "$FLEETLOCK_UNLOCK_SERVICE" \
        || systemctl enable --quiet "$FLEETLOCK_UNLOCK_SERVICE"
}
|
|
|
|
# Take the fleetlock lock for this machine, if fleetlock is in use.
#
# Fleetlock is considered in use when $FLEETLOCK_DISABLE_FILE does not exist,
# $FLEETLOCK_CONFIG exists and $FLEETLOCK_TOOL is executable; otherwise this
# is a no-op that returns 0.
#
# Settings read from $FLEETLOCK_CONFIG (sourced as shell):
#   fleetlock_group        - required, passed as --lock-group
#   fleetlock_lock_timeout - optional, passed as --timeout
#
# Returns:
#   0 on success or when fleetlock is not in use, 1 on any failure.
fleetlock_lock() {
    if [ ! -f "$FLEETLOCK_DISABLE_FILE" ] && [ -f "$FLEETLOCK_CONFIG" ] && [ -x "$FLEETLOCK_TOOL" ]; then
        # Make sure the unlock service is enabled before we take a lock if
        # cosmos ends up rebooting the machine before fleetlock_unlock() is
        # called.
        fleetlock_enable_unlock_service || return 1

        # Declared local (and empty) so that only values set by the config
        # file are used, and so that they do not leak into the caller.
        local fleetlock_group=""
        local fleetlock_lock_timeout=""
        local optional_args=()

        # shellcheck source=/dev/null
        . "$FLEETLOCK_CONFIG" || return 1

        if [ -z "$fleetlock_group" ]; then
            echo "Unable to set fleetlock_group"
            return 1
        fi

        if [ -n "$fleetlock_lock_timeout" ]; then
            optional_args+=("--timeout")
            optional_args+=("$fleetlock_lock_timeout")
        fi

        echo "Getting fleetlock lock"
        "$FLEETLOCK_TOOL" --lock-group "$fleetlock_group" --lock "${optional_args[@]}" || return 1
    fi

    return 0
}
|
|
|
|
# Release the fleetlock lock for this machine, if fleetlock is in use and the
# machine passes its health checks.
#
# Fleetlock is considered in use when $FLEETLOCK_DISABLE_FILE does not exist,
# $FLEETLOCK_CONFIG exists and $FLEETLOCK_TOOL is executable; otherwise this
# is a no-op that returns 0.
#
# Settings read from $FLEETLOCK_CONFIG (sourced as shell):
#   fleetlock_group          - required, passed as --lock-group
#   fleetlock_unlock_timeout - optional, passed as --timeout
#
# Returns:
#   0 on success or when fleetlock is not in use, 1 on any failure
#   (including machine_is_healthy reporting failure, in which case the lock
#   is not released).
fleetlock_unlock() {
    if [ ! -f "$FLEETLOCK_DISABLE_FILE" ] && [ -f "$FLEETLOCK_CONFIG" ] && [ -x "$FLEETLOCK_TOOL" ]; then
        # Declared local (and empty) so that only values set by the config
        # file are used, and so that they do not leak into the caller.
        local fleetlock_group=""
        local fleetlock_unlock_timeout=""
        local optional_args=()

        # shellcheck source=/dev/null
        . "$FLEETLOCK_CONFIG" || return 1

        if [ -z "$fleetlock_group" ]; then
            echo "Unable to set fleetlock_group"
            return 1
        fi

        if [ -n "$fleetlock_unlock_timeout" ]; then
            optional_args+=("--timeout")
            optional_args+=("$fleetlock_unlock_timeout")
        fi

        # Only give up the lock once the machine is known to be healthy.
        machine_is_healthy || return 1

        echo "Releasing fleetlock lock"
        "$FLEETLOCK_TOOL" --lock-group "$fleetlock_group" --unlock "${optional_args[@]}" || return 1
    fi

    return 0
}
|
|
|
|
# Run the machine health check tool, if health checking is in use.
#
# Health checking is in use when $HEALTHCHECK_DISABLE_FILE does not exist and
# $HEALTHCHECK_TOOL is executable; otherwise this is a no-op that returns 0.
#
# Settings read from $FLEETLOCK_CONFIG (sourced as shell, when the file exists):
#   fleetlock_healthcheck_timeout - optional, passed as --timeout
#
# Returns:
#   0 if the tool succeeded or health checking is not in use, 1 if the
#   config could not be sourced or the tool reported failure.
machine_is_healthy() {
    if [ ! -f "$HEALTHCHECK_DISABLE_FILE" ] && [ -x "$HEALTHCHECK_TOOL" ]; then
        local fleetlock_healthcheck_timeout=""
        local optional_args=()

        # The config only supplies an optional timeout here, so a machine
        # without a fleetlock config still gets its health checks run
        # instead of being reported unhealthy because of the missing file.
        if [ -f "$FLEETLOCK_CONFIG" ]; then
            # shellcheck source=/dev/null
            . "$FLEETLOCK_CONFIG" || return 1
        fi

        if [ -n "$fleetlock_healthcheck_timeout" ]; then
            optional_args+=("--timeout")
            optional_args+=("$fleetlock_healthcheck_timeout")
        fi

        echo "Running any health checks"
        "$HEALTHCHECK_TOOL" "${optional_args[@]}" || return 1
    fi

    return 0
}
|
|
|
|
# Default entry point: run "cosmos update" followed by "cosmos apply" under
# the local lock and (when in use) the fleetlock lock.
#
# Arguments:
#   All arguments are passed on unchanged to both cosmos invocations, placed
#   before the "update" / "apply" sub-command.
#
# Exits (via eexit, status 1) when the local lock or the fleetlock lock
# cannot be taken, or when the fleetlock lock cannot be released.
main() {
    lock "$PROGNAME" || eexit "Only one instance of $PROGNAME can run at one time."
    fleetlock_lock || eexit "Unable to acquire fleetlock lock."

    # NOTE(review): the exit status of the cosmos commands is not checked, so
    # the steps below (including the "ok" stamp) run regardless of it --
    # confirm that this is intended.
    cosmos "$@" update
    cosmos "$@" apply

    fleetlock_unlock || eexit "Unable to release fleetlock lock."

    touch /var/run/last-cosmos-ok.stamp

    # Remove puppet report files last modified more than 10 days ago.
    find /var/lib/puppet/reports/ -type f -mtime +10 -print0 | xargs -0 rm -f

    # Reboot if the /cosmos-reboot marker file exists; the marker is removed
    # first.
    if [ -f /cosmos-reboot ]; then
        rm -f /cosmos-reboot
        reboot
    fi
}
|
|
|
|
# Most of the time we just pass on any arguments to the underlying cosmos
# tools, if adding special cases here make sure to not shadow any arguments
# (like "-v") which users expect to be passed on to cosmos.
case "$1" in
    "fleetlock-unlock")
        # Only release the fleetlock lock; if the local lock is already held
        # a run-cosmos is in progress and will release it itself.
        lock "$PROGNAME" || oexit "$PROGNAME appears locked by a running run-cosmos, let it handle unlocking instead."
        fleetlock_unlock || eexit "Unable to release fleetlock lock."
        ;;
    *)
        main "$@"
        ;;
esac
|