Compare commits

...

10 commits

7 changed files with 99 additions and 4 deletions
global/overlay/etc/puppet
internal-sto4-prod-k8sc-0.rut.sunet.se/overlay/etc/nftables/conf.d
internal-sto4-prod-k8sc-1.rut.sunet.se/overlay/etc/nftables/conf.d
internal-sto4-prod-k8sc-2.rut.sunet.se/overlay/etc/nftables/conf.d

View file

@ -21,6 +21,7 @@
'^internal-sto4-prod-k8sc-[0-9].rut.sunet.se$':
rut::infra_ca_rp:
rut::controller_nrpe:
sunet::microk8s::node:
channel: 1.31/stable
drain_reboot_cron: true
@ -30,7 +31,7 @@
frontends:
- sthb-lb-1.sunet.se
- tug-lb-1.sunet.se
port: '30443'
port: '443'
sunet::otel::alloy:
otel_receiver: monitor-prod.rut.sunet.se
sunet::fleetlock_client:
@ -46,6 +47,7 @@
'^internal-sto4-prod-monitor-[0-9].rut.sunet.se$':
sunet::dockerhost2:
rut::rut_mon:
sunet::naemon_monitor:
domain: monitor-prod.rut.sunet.se
naemon_tag: latest

View file

@ -0,0 +1,17 @@
# Installs the check_rut_pods NRPE check on a controller node:
#   * registers the 'check_rut_pods' NRPE command,
#   * deploys the check script from the rut/check_rut_pods.sh.erb template,
#   * adds the 'nagios' user to the 'microk8s' group.
class rut::controller_nrpe {
  # NRPE command that runs the deployed check script.
  sunet::nagios::nrpe_command { 'check_rut_pods':
    command_line => '/usr/lib/nagios/plugins/check_rut_pods.sh',
  }

  # The check script itself, rendered from the module template and made executable.
  file { '/usr/lib/nagios/plugins/check_rut_pods.sh':
    ensure  => 'file',
    mode    => '0755',
    content => template('rut/check_rut_pods.sh.erb'),
  }

  # NOTE(review): presumably group membership is what lets the check script
  # run kubectl as the nagios user -- confirm. 'minimum' membership leaves any
  # other groups the user already belongs to untouched.
  user { 'nagios':
    ensure     => 'present',
    membership => 'minimum',
    groups     => ['microk8s'],
  }
}

View file

@ -0,0 +1,7 @@
# Declares the 'Microk8s cluster health' monitoring service, which runs the
# check_rut_pods NRPE command against the three k8sc controller hosts.
class rut::rut_mon {
  nagioscfg::service { 'check_rut_pods':
    description   => 'Microk8s cluster health',
    check_command => 'check_nrpe!check_rut_pods',
    host_name     => [
      'internal-sto4-prod-k8sc-0.rut.sunet.se',
      'internal-sto4-prod-k8sc-1.rut.sunet.se',
      'internal-sto4-prod-k8sc-2.rut.sunet.se',
    ],
  }
}

View file

@ -0,0 +1,69 @@
#!/bin/bash
# This file is managed by puppet.
#
# NRPE check that summarises Kubernetes events from all namespaces.
#
# Exit codes (Nagios plugin convention):
#   0 OK       - no Warning or Critical events
#   1 WARNING  - at least one Warning event, but fewer than the threshold
#   2 CRITICAL - any Critical event, or Warning events >= threshold
#   3 UNKNOWN  - kubectl failed or its output could not be parsed

# Number of warnings required to make critical status
# (any warning makes warning and any critical makes critical).
critical_warning_num_threshold=3

# Fetch the events once. If kubectl itself fails we cannot judge the cluster
# state, so report UNKNOWN instead of falling through to "OK" with empty counts.
STATUS=$(/snap/bin/kubectl get events --all-namespaces -o json)
kubectl_rc=$?
if [[ "$kubectl_rc" -ne 0 ]]; then
    echo "UNKNOWN - kubectl get events failed (exit code $kubectl_rc)"
    exit 3
fi

# Print the number of events whose .type equals $1.
function count_events {
    jq --arg type "$1" '[.items[] | select(.type == $type)] | length' <<< "$STATUS"
}

# Print a separator followed by a detail record for every event of type $1.
function print_events {
    echo "----------------------------------------"
    jq -r --arg type "$1" '
        .items[] | select(.type == $type) |
        "Host: " + .source.host +
        "\nType: " + .type +
        "\nPod: " + .involvedObject.name +
        "\nMessage: " + .message +
        "\n----------------------------------------"
    ' <<< "$STATUS"
}

num_warnings=$(count_events "Warning")
# NOTE(review): the Kubernetes Event API documents only "Normal" and "Warning"
# as event types, so this count is presumably always 0 -- confirm.
num_critical=$(count_events "Critical")

# Empty or non-numeric counts mean jq could not read the output as an event
# list; without this guard every numeric test below is false and the check
# would report OK.
if ! [[ "$num_warnings" =~ ^[0-9]+$ && "$num_critical" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - could not parse kubectl output as a list of events"
    exit 3
fi

# Print the status line (prefixed with the global $msg), then the details of
# every Critical and Warning event, then a hint on how to investigate.
function print_info {
    output="$msg - "
    if [[ "$num_critical" -gt 0 ]]; then
        output+="Criticals: $num_critical "
    fi
    if [[ "$num_warnings" -gt 0 ]]; then
        [[ "$num_critical" -gt 0 ]] && output+=", " # Add a comma if both exist
        output+="Warnings: $num_warnings "
    fi
    echo "$output"

    if [[ "$num_critical" -gt 0 ]]; then
        print_events "Critical"
    fi
    if [[ "$num_warnings" -gt 0 ]]; then
        print_events "Warning"
    fi
    echo "run \"kubectl get events --all-namespaces\" on $HOSTNAME to get more info"
}

if [[ "$num_critical" -gt 0 || "$num_warnings" -ge "$critical_warning_num_threshold" ]]; then
    msg="CRITICAL"
    print_info
    exit 2
fi

if [[ "$num_warnings" -gt 0 ]]; then
    msg="WARNING"
    print_info
    exit 1
fi

msg="OK"
print_info
exit 0

View file

@ -1 +1 @@
add rule inet filter input tcp dport { 80, 443 , 30080, 30443} counter accept comment "nft_public" }
add rule inet filter input tcp dport { 80, 443 , 30080, 30443 } counter accept comment "nft_public"

View file

@ -1 +1 @@
add rule inet filter input tcp dport { 80, 443 , 30080, 30443} counter accept comment "nft_public" }
add rule inet filter input tcp dport { 80, 443 , 30080, 30443 } counter accept comment "nft_public"

View file

@ -1 +1 @@
add rule inet filter input tcp dport { 80, 443 , 30080, 30443} counter accept comment "nft_public" }
add rule inet filter input tcp dport { 80, 443 , 30080, 30443 } counter accept comment "nft_public"