From f5404ec114f2c31891eb35e9f05c237a82822260 Mon Sep 17 00:00:00 2001
From: Rasmus Thorslund
Date: Mon, 7 Apr 2025 09:38:56 +0200
Subject: [PATCH] added nrpe check on controller node

---
 global/overlay/etc/puppet/cosmos-rules.yaml   |  3 +-
 .../modules/rut/manifests/controller_nrpe.pp  | 20 ++++++
 .../{controller_checks.pp => rut_mon.pp}      |  3 +-
 .../rut/templates/check_rut_pods.sh.erb       | 81 +++++++++++++++++++
 4 files changed, 104 insertions(+), 3 deletions(-)
 create mode 100644 global/overlay/etc/puppet/modules/rut/manifests/controller_nrpe.pp
 rename global/overlay/etc/puppet/modules/rut/manifests/{controller_checks.pp => rut_mon.pp} (79%)
 create mode 100644 global/overlay/etc/puppet/modules/rut/templates/check_rut_pods.sh.erb

diff --git a/global/overlay/etc/puppet/cosmos-rules.yaml b/global/overlay/etc/puppet/cosmos-rules.yaml
index a053fa5..f398a5e 100644
--- a/global/overlay/etc/puppet/cosmos-rules.yaml
+++ b/global/overlay/etc/puppet/cosmos-rules.yaml
@@ -21,6 +21,7 @@
 
 '^internal-sto4-prod-k8sc-[0-9].rut.sunet.se$':
   rut::infra_ca_rp:
+  rut::controller_nrpe:
   sunet::microk8s::node:
     channel: 1.31/stable
     drain_reboot_cron: true
@@ -46,7 +47,7 @@
 
 '^internal-sto4-prod-monitor-[0-9].rut.sunet.se$':
   sunet::dockerhost2:
-  rut::controller_checks:
+  rut::rut_mon:
   sunet::naemon_monitor:
     domain: monitor-prod.rut.sunet.se
     naemon_tag: latest
diff --git a/global/overlay/etc/puppet/modules/rut/manifests/controller_nrpe.pp b/global/overlay/etc/puppet/modules/rut/manifests/controller_nrpe.pp
new file mode 100644
--- /dev/null
+++ b/global/overlay/etc/puppet/modules/rut/manifests/controller_nrpe.pp
@@ -0,0 +1,20 @@
+# Registers the check_rut_pods NRPE command, installs its plugin script and
+# adds the nagios user to the microk8s group (presumably so the plugin may call kubectl - confirm).
+class rut::controller_nrpe {
+  sunet::nagios::nrpe_command {'check_rut_pods':
+    command_line => '/usr/lib/nagios/plugins/check_rut_pods.sh'
+  }
+
+  file { '/usr/lib/nagios/plugins/check_rut_pods.sh':
+    ensure  => 'file',
+    # NRPE runs the plugin directly by path, so it needs the execute bit.
+    mode    => '0755',
+    content => template('rut/check_rut_pods.sh.erb')
+  }
+
+  user { 'nagios':
+    ensure     => present,
+    groups     => ['microk8s'],
+    membership => minimum,
+  }
+}
diff --git a/global/overlay/etc/puppet/modules/rut/manifests/controller_checks.pp b/global/overlay/etc/puppet/modules/rut/manifests/rut_mon.pp
similarity index 79%
rename from global/overlay/etc/puppet/modules/rut/manifests/controller_checks.pp
rename to global/overlay/etc/puppet/modules/rut/manifests/rut_mon.pp
index 718c22d..62bc054 100644
--- a/global/overlay/etc/puppet/modules/rut/manifests/controller_checks.pp
+++ b/global/overlay/etc/puppet/modules/rut/manifests/rut_mon.pp
@@ -1,5 +1,4 @@
-# Everything is awesome!
-class rut::controller_checks {
+class rut::rut_mon {
   nagioscfg::service {'check_rut_pods':
     host_name => ['internal-sto4-prod-k8sc-0.rut.sunet.se'],
     check_command => 'check_nrpe!check_rut_pods',
diff --git a/global/overlay/etc/puppet/modules/rut/templates/check_rut_pods.sh.erb b/global/overlay/etc/puppet/modules/rut/templates/check_rut_pods.sh.erb
new file mode 100644
--- /dev/null
+++ b/global/overlay/etc/puppet/modules/rut/templates/check_rut_pods.sh.erb
@@ -0,0 +1,81 @@
+#!/bin/bash
+# This file is managed by puppet.
+#
+# NRPE plugin that summarises Kubernetes events from all namespaces.
+# Exit codes: 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN.
+
+# Fetch the events once; every count and listing below reads this JSON.
+if ! STATUS=$(/snap/bin/kubectl get events --all-namespaces -o json); then
+  # A failed kubectl must not be reported as OK.
+  echo "UNKNOWN - could not fetch events with kubectl"
+  exit 3
+fi
+
+# number of warnings required to make critical status (any warning makes warning and any critical makes critical)
+critical_warning_num_threshold=3
+
+num_warnings=$(echo "$STATUS" | jq '[.items[] | select(.type == "Warning")] | length')
+# NOTE(review): the Kubernetes Event API documents only the types Normal and
+# Warning, so this count is expected to stay at 0 - confirm it is still wanted.
+num_critical=$(echo "$STATUS" | jq '[.items[] | select(.type == "Critical")] | length')
+
+# Empty or non-numeric counts mean jq is missing or the JSON was unusable.
+if ! [[ "$num_warnings" =~ ^[0-9]+$ && "$num_critical" =~ ^[0-9]+$ ]]; then
+  echo "UNKNOWN - could not parse kubectl output with jq"
+  exit 3
+fi
+
+# Print host, type, object name and message for every event whose type equals $1.
+function print_events {
+  echo "----------------------------------------"
+  echo "$STATUS" | jq -r --arg type "$1" '
+    .items[] | select(.type == $type) |
+    "Host: " + .source.host +
+    "\nType: " + .type +
+    "\nPod: " + .involvedObject.name +
+    "\nMessage: " + .message +
+    "\n----------------------------------------"
+  '
+}
+
+# Print the status line built from $msg and the counts, then the event details.
+function print_info {
+  output="$msg - "
+
+  if [[ $num_critical -gt 0 ]]; then
+    output+="Criticals: $num_critical "
+  fi
+
+  if [[ $num_warnings -gt 0 ]]; then
+    [[ $num_critical -gt 0 ]] && output+=", " # Add a comma if both exist
+    output+="Warnings: $num_warnings "
+  fi
+
+  echo "$output"
+
+  if [[ "$num_critical" -gt 0 ]]; then
+    print_events "Critical"
+  fi
+
+  if [[ "$num_warnings" -gt 0 ]]; then
+    print_events "Warning"
+  fi
+
+  echo "run \"kubectl get events --all-namespaces\" on $HOSTNAME to get more info"
+}
+
+if [[ "$num_critical" -gt 0 || "$num_warnings" -ge "$critical_warning_num_threshold" ]]; then
+  msg="CRITICAL"
+  print_info
+  exit 2
+fi
+
+if [[ "$num_warnings" -gt 0 ]]; then
+  msg="WARNING"
+  print_info
+  exit 1
+fi
+
+msg="OK"
+print_info
+exit 0