sunetdrive/templates/script/restart-db-cluster.erb

130 lines
4.3 KiB
Plaintext
Raw Normal View History

2023-11-01 14:20:23 +00:00
#!/usr/bin/env python3
2023-02-13 09:44:56 +00:00
# vim: set filetype=python:
2023-09-02 10:50:08 +00:00
import subprocess
2023-02-13 09:44:56 +00:00
import sys
import time
import urllib.parse
import requests
from drive_utils import (build_fqdn, get_ips_for_hostname, run_remote_command,
smoketest_db_node)
def add_downtime(fqdn: str,
                 apikey: str,
                 monitor_host: str = 'monitor.drive.test.sunet.se',
                 timeout: float = 30.0) -> None:
    """Ask the monitoring host to schedule a 10 minute downtime for *fqdn*.

    Posts the ``schedule_host_svc_downtime`` command to the Thruk REST
    endpoint on *monitor_host*. This is best effort: any failure is
    reported on stdout and never raised, so a broken or unreachable
    monitor cannot abort the caller.

    Args:
        fqdn: Host name as known to the monitoring system.
        apikey: Value sent in the ``X-Thruk-Auth-Key`` header. When empty,
            the function returns immediately without contacting the monitor.
        monitor_host: Host name of the monitoring server to contact.
        timeout: Seconds to wait for the monitor before giving up, so an
            unresponsive monitor cannot hang the script forever.
    """
    if not apikey:
        return
    print("\tAdding downtime for: {}".format(fqdn))
    action = 'schedule_host_svc_downtime'
    start_time = int(time.time())
    end_time = start_time + (10 * 60)  # 10 minutes
    data = {
        'comment_data': 'Reboot from script',
        'start_time': start_time,
        'end_time': end_time
    }
    post_url = 'https://{}/thruk/r/hosts/{}/cmd/{}'.format(
        monitor_host, fqdn, action)
    headers = {'X-Thruk-Auth-Key': apikey}
    try:
        response = requests.post(post_url,
                                 data=data,
                                 headers=headers,
                                 timeout=timeout)
        # Treat HTTP error statuses (bad key, unknown host, ...) as failures
        # instead of silently assuming the downtime was scheduled.
        response.raise_for_status()
    except Exception:
        # Deliberately broad: scheduling downtime must never stop a restart.
        print("Failed to add downtime for {}".format(fqdn))
2023-02-13 09:44:56 +00:00
def remove_downtime(fqdn: str,
                    apikey: str,
                    monitor_host: str = 'monitor.drive.test.sunet.se',
                    timeout: float = 30.0) -> None:
    """Ask the monitoring host to delete active service downtimes for *fqdn*.

    Fetches the service list for the host from the Thruk REST endpoint on
    *monitor_host* and posts ``del_active_service_downtimes`` for every
    service. This is best effort: failures are reported on stdout and
    never raised.

    Args:
        fqdn: Host name as known to the monitoring system.
        apikey: Value sent in the ``X-Thruk-Auth-Key`` header. When empty,
            the function returns immediately without contacting the monitor.
        monitor_host: Host name of the monitoring server to contact.
        timeout: Seconds to wait for each request before giving up, so an
            unresponsive monitor cannot hang the script forever.
    """
    if not apikey:
        return
    print("\tRemoving downtime for: {}".format(fqdn))
    get_url = 'https://{}/thruk/r/hosts?name={}&columns=services'.format(
        monitor_host, fqdn)
    headers = {'X-Thruk-Auth-Key': apikey}
    action = 'del_active_service_downtimes'
    try:
        req = requests.get(get_url, headers=headers, timeout=timeout)
        # Do not try to parse an error page as a host listing.
        req.raise_for_status()
        rejected = []
        # An empty result list raises IndexError here and is reported below.
        for service in req.json()[0]['services']:
            post_url = 'https://{}/thruk/r/services/{}/{}/cmd/{}'.format(
                monitor_host, fqdn, urllib.parse.quote(service), action)
            response = requests.post(post_url,
                                     headers=headers,
                                     timeout=timeout)
            # Keep going for the remaining services even if one is rejected.
            if not response.ok:
                rejected.append(service)
        if rejected:
            print("Failed to remove downtime for {} (services: {})".format(
                fqdn, ", ".join(str(name) for name in rejected)))
    except Exception:
        # Deliberately broad: removing downtime must never stop the script.
        print("Failed to remove downtime for {}".format(fqdn))
2023-02-13 09:44:56 +00:00
def main() -> int:
    """Back up and then reboot the database nodes, one node at a time.

    For every customer a backup command is run on the backup host, after
    which nodes 3, 2 and 1 are rebooted in that order. Each reboot is
    wrapped in monitoring downtime and followed by repeated smoketests.

    Returns:
        0 when every node passed its smoketest, 5 as soon as one node
        fails (remaining nodes and customers are then left untouched).
    """
    # The "<%= ... %>" values are filled in when the template is rendered.
    customers = ["<%= @customer %>"]
    environment = "<%= @environment %>"
    apikey_test = "<%= @apikey_test %>"
    apikey_prod = "<%= @apikey_prod %>"
    user = "script"

    default_backup_command = ['sudo /home/script/bin/backup_db.sh']
    multinode_backup_command = [
        'sudo /home/script/bin/backup_multinode_db.sh'
    ]
    reboot_command = ['sudo /usr/local/bin/safer_reboot']

    if customers[0] == "common":
        customers = ["gss", "lookup", "multinode"]

    for customer in customers:
        # Chosen afresh for every customer so that the multinode override
        # can never leak into a customer processed after it.
        backup_type = "backup"
        backup_command = default_backup_command
        if customer == "gss":
            backup_type = "gssbackup"
        elif customer == "lookup":
            backup_type = "lookupbackup"
        elif customer == "multinode":
            backup_command = multinode_backup_command
            backup_type = "multinode-db"

        backup = build_fqdn(customer, environment, 1, backup_type)
        print("\tRunning backup command at {}".format(backup))
        run_remote_command(backup,
                           backup_command,
                           user=user,
                           output=subprocess.DEVNULL)

        # Nodes are handled highest number first: 3, 2, 1.
        for number in reversed(range(1, 4)):
            fqdn = build_fqdn(customer, environment, number)
            ipv4, _ = get_ips_for_hostname(fqdn)
            ip = ipv4[0]
            print("Upgrading: {} with ip: {}".format(fqdn, ip))
            add_downtime(fqdn, apikey_test)
            add_downtime(fqdn,
                         apikey_prod,
                         monitor_host="monitor.drive.sunet.se")

            run_remote_command(fqdn, reboot_command, user=user)

            success = False
            # Waits of 31, 29, ..., 1 seconds: at most 16 smoketest
            # attempts and 256 seconds of sleeping in total.
            for testnumber in reversed(range(1, 32, 2)):
                print(
                    "\tSleeping for {} seconds before smoketest on {}".format(
                        testnumber, fqdn))
                time.sleep(testnumber)
                if smoketest_db_node(fqdn, user=user):
                    success = True
                    break

            # Downtime is lifted whether or not the smoketest passed.
            remove_downtime(fqdn, apikey_test)
            remove_downtime(fqdn,
                            apikey_prod,
                            monitor_host="monitor.drive.sunet.se")

            if success:
                print("Upgrade cycle succeeded on {} ".format(fqdn))
            else:
                print("Smoketest failed on {} after server reboot command".
                      format(fqdn))
                # Stop immediately rather than rebooting further nodes.
                return 5

        print("All {}-servers successfully upgraded for {}".format(
            environment, customer))

    return 0
if __name__ == "__main__":
    # Hand main()'s return value to the shell as the process exit status.
    exit_status = main()
    sys.exit(exit_status)