rut-test-ops/global/overlay/usr/local/bin/run-cosmos
Patrik Lundin 7baf9affb1
Add fleetlock support to run-cosmos
Makes run-cosmos request a fleetlock lock before running cosmos "update"
and "apply" steps. This is helpful for making sure only one (or several)
machine out of some set of machines runs cosmos changes at a time. This
way if cosmos (or puppet) decides that a service needs to be restarted
this will only happen on a subset of machines at a time. When the cosmos
"apply" is done a fleetlock unlock request will be performed so the
other machines can progress.

The unlock code in run-cosmos will also run the new tool
sunet-machine-healthy to decide things are good before unlocking. This
way if a restarted service breaks this will stop the unlock attempt
and in turn make it so the others should not break their service as
well, giving an operator time to figure out what is wrong.
2023-06-17 08:10:00 +02:00

93 lines
2.4 KiB
Bash
Executable file

#!/bin/bash
#
# Simplify running cosmos, with serialization if flock is available.
#
readonly PROGNAME=$(basename "$0")
readonly LOCKFILE_DIR=/tmp
readonly LOCK_FD=200
readonly FLEETLOCK_CONFIG=/etc/run-cosmos-fleetlock-conf
readonly FLEETLOCK_DISABLE_FILE=/etc/run-cosmos-fleetlock-disable
readonly FLEETLOCK_TOOL=/usr/local/bin/sunet-fleetlock
readonly HEALTHCHECK_TOOL=/usr/local/bin/sunet-machine-healthy
readonly HEALTHCHECK_DISABLE_FILE=/etc/run-cosmos-healthcheck-disable
lock() {
local prefix=$1
local fd=${2:-$LOCK_FD}
local lock_file=$LOCKFILE_DIR/$prefix.lock
# create lock file
eval "exec $fd>$lock_file"
# acquier the lock
flock -n "$fd" \
&& return 0 \
|| return 1
}
eexit() {
local error_str="$*"
echo "$error_str"
exit 1
}
fleetlock_lock() {
if [ ! -f $FLEETLOCK_DISABLE_FILE ] && [ -f $FLEETLOCK_CONFIG ] && [ -x $FLEETLOCK_TOOL ]; then
local fleetlock_group=""
# shellcheck source=/dev/null
. $FLEETLOCK_CONFIG || return 1
if [ -z "$fleetlock_group" ]; then
echo "Unable to set fleetlock_group"
return 1
fi
echo "Getting fleetlock lock"
$FLEETLOCK_TOOL --lock-group "$fleetlock_group" --lock || return 1
fi
return 0
}
fleetlock_unlock() {
if [ ! -f $FLEETLOCK_DISABLE_FILE ] && [ -f $FLEETLOCK_CONFIG ] && [ -x $FLEETLOCK_TOOL ]; then
local fleetlock_group=""
# shellcheck source=/dev/null
. $FLEETLOCK_CONFIG || return 1
if [ -z "$fleetlock_group" ]; then
echo "Unable to set fleetlock_group"
return 1
fi
machine_is_healthy || return 1
echo "Releasing fleetlock lock"
$FLEETLOCK_TOOL --lock-group "$fleetlock_group" --unlock || return 1
fi
return 0
}
machine_is_healthy() {
if [ ! -f $HEALTHCHECK_DISABLE_FILE ] && [ -x $HEALTHCHECK_TOOL ]; then
echo "Running any health checks"
$HEALTHCHECK_TOOL || return 1
fi
return 0
}
main () {
lock "$PROGNAME" || eexit "Only one instance of $PROGNAME can run at one time."
fleetlock_lock || eexit "Unable to acquire fleetlock lock."
cosmos "$@" update
cosmos "$@" apply
fleetlock_unlock || eexit "Unable to release fleetlock lock."
touch /var/run/last-cosmos-ok.stamp
find /var/lib/puppet/reports/ -type f -mtime +10 -print0 | xargs -0 rm -f
}
main "$@"
if [ -f /cosmos-reboot ]; then
rm -f /cosmos-reboot
reboot
fi