Compare commits

..

No commits in common. "main" and "cosmos-ops-2025-04-01-v02" have entirely different histories.

5 changed files with 8 additions and 95 deletions

View file

@ -21,7 +21,6 @@
'^internal-sto4-prod-k8sc-[0-9].rut.sunet.se$': '^internal-sto4-prod-k8sc-[0-9].rut.sunet.se$':
rut::infra_ca_rp: rut::infra_ca_rp:
rut::controller_nrpe:
sunet::microk8s::node: sunet::microk8s::node:
channel: 1.31/stable channel: 1.31/stable
drain_reboot_cron: true drain_reboot_cron: true
@ -47,7 +46,7 @@
'^internal-sto4-prod-monitor-[0-9].rut.sunet.se$': '^internal-sto4-prod-monitor-[0-9].rut.sunet.se$':
sunet::dockerhost2: sunet::dockerhost2:
rut::rut_mon: rut::controller_checks:
sunet::naemon_monitor: sunet::naemon_monitor:
domain: monitor-prod.rut.sunet.se domain: monitor-prod.rut.sunet.se
naemon_tag: latest naemon_tag: latest

View file

@ -0,0 +1,7 @@
# Nagios service definition for the rut microk8s cluster health check.
#
# Declares the 'check_rut_pods' service, which is run through NRPE
# (check_command 'check_nrpe!check_rut_pods').
class rut::controller_checks {
  # NOTE(review): the value passed as hostgroup_name looks like a single host
  # FQDN rather than a hostgroup name - confirm that such a hostgroup exists,
  # or whether host_name was intended.
  nagioscfg::service { 'check_rut_pods':
    hostgroup_name => ['internal-sto4-prod-k8sc-0.rut.sunet.se'],
    check_command  => 'check_nrpe!check_rut_pods',
    description    => 'Microk8s cluster health',
  }
}

View file

@ -1,17 +0,0 @@
# Sets up the 'check_rut_pods' NRPE check on a node: registers the NRPE
# command, installs the plugin script from a template and adds the nagios
# user to the microk8s group.
class rut::controller_nrpe {
  # NRPE command name -> plugin script installed by the file resource below.
  sunet::nagios::nrpe_command { 'check_rut_pods':
    command_line => '/usr/lib/nagios/plugins/check_rut_pods.sh',
  }

  # The plugin itself, rendered from the module's ERB template and made
  # executable.
  file { '/usr/lib/nagios/plugins/check_rut_pods.sh':
    ensure  => 'file',
    mode    => '0755',
    content => template('rut/check_rut_pods.sh.erb'),
  }

  # Presumably needed so the plugin can talk to microk8s as the nagios user -
  # TODO confirm. 'minimum' membership adds the listed group without removing
  # the user from any other groups.
  user { 'nagios':
    ensure     => present,
    membership => minimum,
    groups     => ['microk8s'],
  }
}

View file

@ -1,7 +0,0 @@
# Nagios service definition for the 'check_rut_pods' NRPE check, attached
# explicitly to each of the three k8sc hosts listed below.
class rut::rut_mon {
  nagioscfg::service { 'check_rut_pods':
    description   => 'Microk8s cluster health',
    check_command => 'check_nrpe!check_rut_pods',
    host_name     => [
      'internal-sto4-prod-k8sc-0.rut.sunet.se',
      'internal-sto4-prod-k8sc-1.rut.sunet.se',
      'internal-sto4-prod-k8sc-2.rut.sunet.se',
    ],
  }
}

View file

@ -1,69 +0,0 @@
#!/bin/bash
# This file is managed by puppet.
#
# Nagios/NRPE plugin: fetches all Kubernetes events and counts them by type.
# The counts computed here drive the exit status chosen later in the script.

# Fetch every event in every namespace once; all jq queries reuse this JSON.
STATUS=$(/snap/bin/kubectl get events --all-namespaces -o json)
kubectl_rc=$?

# Without this guard a failing kubectl leaves STATUS empty, every count below
# becomes an empty string, and the numeric tests treat that as 0 - so an
# unreachable cluster would be reported as OK. Report UNKNOWN (3) instead.
if [[ $kubectl_rc -ne 0 || -z "$STATUS" ]]; then
  echo "UNKNOWN - kubectl get events failed (exit code $kubectl_rc)"
  exit 3
fi

# Number of warnings at which the status is escalated to critical. Below this
# threshold any warning yields WARNING; any critical event yields CRITICAL.
critical_warning_num_threshold=3

num_warnings=$(echo "$STATUS" | jq '[.items[] | select(.type == "Warning")] | length')
num_critical=$(echo "$STATUS" | jq '[.items[] | select(.type == "Critical")] | length')

# jq prints nothing usable if the input was not the expected JSON document;
# treat that as UNKNOWN rather than letting empty counts pass as zero.
if ! [[ "$num_warnings" =~ ^[0-9]+$ && "$num_critical" =~ ^[0-9]+$ ]]; then
  echo "UNKNOWN - could not parse kubectl output as a JSON event list"
  exit 3
fi
# Print the plugin output: a one-line summary ("<msg> - " plus the non-zero
# counters), then a detail section per event type that has events, then a hint
# on how to inspect the events manually.
# Reads the globals msg, num_critical, num_warnings, STATUS and HOSTNAME.
function print_info {
  local event_type event_count

  # Summary line; the ", " separator is only inserted when both counters are
  # shown.
  output="$msg - "
  [[ $num_critical -gt 0 ]] && output+="Criticals: $num_critical "
  if [[ $num_warnings -gt 0 ]]; then
    if [[ $num_critical -gt 0 ]]; then
      output+=", "
    fi
    output+="Warnings: $num_warnings "
  fi
  echo "$output"

  # Detail sections, criticals first, each skipped when its counter is zero.
  for event_type in Critical Warning; do
    if [[ $event_type == Critical ]]; then
      event_count=$num_critical
    else
      event_count=$num_warnings
    fi
    [[ "$event_count" -gt 0 ]] || continue

    echo "----------------------------------------"
    echo "$STATUS" | jq -r --arg event_type "$event_type" '
      .items[] | select(.type == $event_type) |
      "Host: " + .source.host +
      "\nType: " + .type +
      "\nPod: " + .involvedObject.name +
      "\nMessage: " + .message +
      "\n----------------------------------------"
    '
  done

  echo "run \"kubectl get events --all-namespaces\" on $HOSTNAME to get more info"
}
# Choose the status label and exit code from the event counts:
#   CRITICAL (2) - any critical event, or warnings at/above the threshold
#   WARNING  (1) - at least one warning
#   OK       (0) - otherwise
if [[ "$num_critical" -gt 0 || "$num_warnings" -ge "$critical_warning_num_threshold" ]]; then
  msg="CRITICAL"
  exit_code=2
elif [[ "$num_warnings" -gt 0 ]]; then
  msg="WARNING"
  exit_code=1
else
  msg="OK"
  exit_code=0
fi

# print_info reads $msg, so it must run after the label has been chosen.
print_info
exit "$exit_code"