Copy class sunet::naemon_monitor to soc as we use acme-d for certs.

This commit is contained in:
Johan Björklund 2024-11-27 15:56:01 +01:00
parent 66b4b1783d
commit f01b8efa63
Signed by untrusted user: bjorklund
GPG key ID: 5E8401339C7F5037
24 changed files with 25857 additions and 2 deletions

View file

@@ -46,8 +46,7 @@ intelmq-dev.cert.sunet.se:
monitor-dev.cert.sunet.se:
sunet::dockerhost2:
sunet::certbot::acmed:
sunet::naemon_monitor:
soc::naemon_monitor:
domain: monitor-dev.cert.sunet.se
thruk_admins:
- bjorklund@sunet.se
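For reference, the hiera stanza above is equivalent to a resource-like declaration of the new class. A minimal sketch, with values taken from this diff and all other parameters left at their defaults:

class { 'soc::naemon_monitor':
  domain       => 'monitor-dev.cert.sunet.se',
  thruk_admins => ['bjorklund@sunet.se'],
}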

View file

@@ -0,0 +1,2 @@
do '/usr/share/thruk/menu.conf';
insert_item('General', { 'href' => '/grafana', 'name' => 'Grafana', target => '_self' });
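Thruk evaluates menu_local.conf as Perl after loading its stock menu: the do-line pulls in the default menu definition, and insert_item() appends a 'Grafana' entry to the 'General' section, pointing at the /grafana proxy location defined in apache-thruk.conf below.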

View file

@@ -0,0 +1,496 @@
# @summary Run naemon with Thruk.
# @param receive_otel Feature flag to enable the LGTM stack
# @param otel_retention Number of hours to keep logs, metrics and traces; defaults to 2232h (93 days, roughly three months)
#
class soc::naemon_monitor (
String $domain,
String $influx_password = lookup('influx_password', String, undef, ''),
String $naemon_tag = 'latest',
Array $naemon_extra_volumes = [],
Array $thruk_extra_volumes = [],
Array $resolvers = [],
String $thruk_tag = 'latest',
Array $thruk_admins = ['placeholder'],
Array $thruk_users = [],
String $influxdb_tag = '1.8',
String $histou_tag = 'latest',
String $nagflux_tag = 'latest',
String $grafana_tag = '11.1.4',
String $grafana_default_role = 'Viewer',
String $loki_tag = '3.1.1',
String $mimir_tag = '2.13.0',
String $tempo_tag = '2.6.0',
String $alloy_tag = 'v1.3.0',
Hash $manual_hosts = {},
Hash $additional_entities = {},
String $nrpe_group = 'nrpe',
String $interface = 'ens3',
Array $exclude_hosts = [],
Optional[String] $default_host_group = undef,
Array[Optional[String]] $optout_checks = [],
Optional[Boolean] $receive_otel = false,
String $otel_retention = '2232h',
String $acme_provider = 'acme-d',
) {
include sunet::systemd_reload
$naemon_container = $::facts['dockerhost2'] ? {
'yes' => 'naemon_monitor-naemon-1',
default => 'naemon_monitor_naemon_1',
}
if $::facts['sunet_nftables_enabled'] == 'yes' {
sunet::nftables::docker_expose { 'allow_http' :
iif => $interface,
allow_clients => 'any',
port => 80,
}
sunet::nftables::docker_expose { 'allow_https' :
iif => $interface,
allow_clients => 'any',
port => 443,
}
if $receive_otel {
sunet::nftables::docker_expose { 'allow_otel_grpc' :
iif => $interface,
allow_clients => 'any',
port => 4317,
}
sunet::nftables::docker_expose { 'allow_otel_http' :
iif => $interface,
allow_clients => 'any',
port => 4318,
}
}
} else {
sunet::misc::ufw_allow { 'allow-http':
from => 'any',
port => '80',
}
sunet::misc::ufw_allow { 'allow-https':
from => 'any',
port => '443',
}
if $receive_otel {
sunet::misc::ufw_allow { 'allow-otel-grpc':
from => 'any',
port => '4317',
}
sunet::misc::ufw_allow { 'allow-otel-http':
from => 'any',
port => '4318',
}
}
}
if $acme_provider == 'dehydrated' {
class { 'sunet::dehydrated::client': domain => $domain, ssl_links => true }
} elsif $acme_provider == 'acme-d' {
class { 'sunet::certbot::acmed': }
file { '/opt/naemon_monitor/apache-thruk.conf':
ensure => file,
content => template('soc/naemon_monitor/apache-thruk.conf.erb'),
mode => '0444',
}
}
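Any other acme_provider value falls through the branch above and configures no certificate client at all. A defensive sketch (not part of this commit) would abort catalog compilation instead:

else {
  fail("soc::naemon_monitor: unsupported acme_provider '${acme_provider}'")
}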
if lookup('shib_key', undef, undef, undef) != undef {
sunet::snippets::secret_file { '/opt/naemon_monitor/shib-certs/sp-key.pem': hiera_key => 'shib_key' }
# assume cert is in cosmos repo (overlay)
}
$thruk_admins_string = inline_template('ADMIN_USERS=<%- @thruk_admins.each do |user| -%><%= user %>,<%- end -%>')
$thruk_users_string = inline_template('READONLY_USERS=<%- @thruk_users.each do |user| -%><%= user %>,<%- end -%>')
$thruk_env = [$thruk_admins_string, $thruk_users_string]
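With the defaults above (thruk_admins = ['placeholder'], thruk_users = []), the two templates render the strings below; note the trailing comma the ERB loop leaves after each listed user:

ADMIN_USERS=placeholder,
READONLY_USERS=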
if $influx_password == '' {
err('ERROR: influx password not set')
}
$influx_env = ['INFLUXDB_ADMIN_USER=admin',"INFLUXDB_ADMIN_PASSWORD=${influx_password}", 'INFLUXDB_DB=nagflux']
$nagflux_env = ["INFLUXDB_ADMIN_PASSWORD=${influx_password}"]
file { '/etc/systemd/system/sunet-naemon_monitor.service.d/':
ensure => directory,
recurse => true,
}
file { '/opt/naemon_monitor/menu_local.conf':
ensure => file,
content => file('soc/naemon_monitor/menu_local.conf'),
}
file { '/etc/systemd/system/sunet-naemon_monitor.service.d/override.conf':
ensure => file,
content => template('soc/naemon_monitor/service-override.conf.erb'),
require => File['/etc/systemd/system/sunet-naemon_monitor.service.d/'],
notify => Class['sunet::systemd_reload'],
}
sunet::docker_compose { 'naemon_monitor':
content => template('soc/naemon_monitor/docker-compose.yml.erb'),
service_name => 'naemon_monitor',
compose_dir => '/opt/',
compose_filename => 'docker-compose.yml',
description => 'Naemon monitoring (with Thruk)',
require => File['/etc/systemd/system/sunet-naemon_monitor.service.d/override.conf'],
}
# This section can be removed when the class is run on all machines
file { '/opt/naemon_monitor/stop-monitor.sh':
ensure => absent,
}
#
file { '/etc/logrotate.d/naemon_monitor':
ensure => file,
content => template('soc/naemon_monitor/logrotate.erb'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/grafana.ini':
ensure => file,
content => template('soc/naemon_monitor/grafana.ini'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/histou.js':
ensure => file,
content => template('soc/naemon_monitor/histou.js'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/grafana-provisioning':
ensure => directory,
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/grafana-provisioning/datasources':
ensure => directory,
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/grafana-provisioning/dashboards':
ensure => directory,
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/grafana-provisioning/datasources/influxdb.yaml':
ensure => file,
content => template('soc/naemon_monitor/grafana-provisioning/datasources/influxdb.yaml'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/data':
ensure => directory,
owner => 'www-data',
mode => '0644',
group => 'root',
}
if $receive_otel {
# Grafana can only use one group via the Apache proxy auth module, so we cheat and make everyone an editor;
# admins can be assigned manually via the GUI.
$allowed_users_string = join($thruk_admins + $thruk_users,' ')
file { '/opt/naemon_monitor/groups.txt':
ensure => file,
content => inline_template('editors:<%= @allowed_users_string-%>'),
mode => '0644',
group => 'root',
owner => 'root',
}
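With thruk_admins = ['a@sunet.se'] and thruk_users = ['b@sunet.se'] (hypothetical addresses), the rendered groups.txt is the single line 'editors:a@sunet.se b@sunet.se', in the AuthGroupFile format that the 'Require group editors' stanza in apache-thruk.conf consumes.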
file { '/opt/naemon_monitor/grafana-provisioning/datasources/loki.yaml':
ensure => file,
content => template('soc/naemon_monitor/grafana-provisioning/datasources/loki.yaml'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/grafana-provisioning/datasources/mimir.yaml':
ensure => file,
content => template('soc/naemon_monitor/grafana-provisioning/datasources/mimir.yaml'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/grafana-provisioning/datasources/tempo.yaml':
ensure => file,
content => template('soc/naemon_monitor/grafana-provisioning/datasources/tempo.yaml'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/grafana-provisioning/dashboards/default.yaml':
ensure => file,
content => template('soc/naemon_monitor/grafana-provisioning/dashboards/default.yaml'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/grafana-provisioning/dashboards/overview.json':
ensure => file,
content => template('soc/naemon_monitor/grafana-provisioning/dashboards/overview.json'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/grafana-provisioning/dashboards/node-export-full.json':
ensure => file,
content => template('soc/naemon_monitor/grafana-provisioning/dashboards/node-export-full.json'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/loki-server.yaml':
ensure => file,
content => template('soc/naemon_monitor/loki-server.yaml'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/loki':
ensure => directory,
owner => '10001',
mode => '0644',
group => '10001',
}
file { '/opt/naemon_monitor/mimir':
ensure => directory,
owner => 'root',
mode => '0644',
group => 'root',
}
file { '/opt/naemon_monitor/mimir-server.yaml':
ensure => file,
content => template('soc/naemon_monitor/mimir-server.yaml'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/tempo':
ensure => directory,
owner => '10001',
mode => '0644',
group => '10001',
}
file { '/opt/naemon_monitor/tempo-server.yaml':
ensure => file,
content => template('soc/naemon_monitor/tempo-server.yaml'),
mode => '0644',
group => 'root',
owner => 'root',
}
file { '/opt/naemon_monitor/alloy-server.alloy':
ensure => file,
content => template('soc/naemon_monitor/alloy-server.alloy'),
mode => '0644',
group => 'root',
owner => 'root',
}
}
file { '/opt/naemon_monitor/grafana':
ensure => directory,
owner => 'www-data',
mode => '0664',
group => 'root',
}
file { '/usr/lib/nagios/plugins/cosmos':
ensure => directory,
recurse => true,
mode => '0644',
group => 'root',
owner => 'root',
}
$nagioscfg_dirs = ['/etc/', '/etc/naemon/', '/etc/naemon/conf.d/', '/etc/naemon/conf.d/nagioscfg/', '/etc/naemon/conf.d/cosmos/']
$nagioscfg_dirs.each |$dir| {
ensure_resource('file',$dir, {
ensure => directory,
mode => '0644',
group => 'root',
owner => 'root',
})
}
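Each iteration expands to a plain file resource; ensure_resource only guards against duplicate declarations. Written out for the first directory, it is:

file { '/etc/':
  ensure => directory,
  mode   => '0644',
  group  => 'root',
  owner  => 'root',
}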
nagioscfg::contactgroup { 'alerts': }
unless 'load' in $optout_checks {
nagioscfg::service { 'check_load':
use => 'naemon-service',
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_load',
description => 'System Load',
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
unless 'users' in $optout_checks {
nagioscfg::service { 'check_users':
use => 'naemon-service',
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_users',
description => 'Active Users',
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
unless 'zombie_procs' in $optout_checks {
nagioscfg::service { 'check_zombie_procs':
use => 'naemon-service',
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_zombie_procs',
description => 'Zombie Processes',
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
unless 'total_procs' in $optout_checks {
nagioscfg::service { 'check_total_procs':
use => 'naemon-service',
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_total_procs_lax',
description => 'Total Processes',
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
unless 'dynamic_disk' in $optout_checks {
nagioscfg::service { 'check_dynamic_disk':
use => 'naemon-service',
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_dynamic_disk',
description => 'Disk',
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
unless 'uptime' in $optout_checks {
nagioscfg::service { 'check_uptime':
use => 'naemon-service',
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_uptime',
description => 'Uptime',
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
unless 'reboot' in $optout_checks {
nagioscfg::service { 'check_reboot':
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_reboot',
description => 'Reboot Needed',
contact_groups => ['alerts'],
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
unless 'memory' in $optout_checks {
nagioscfg::service { 'check_memory':
use => 'naemon-service',
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_memory',
description => 'System Memory',
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
unless 'entropy' in $optout_checks {
nagioscfg::service { 'check_entropy':
use => 'naemon-service',
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_entropy',
description => 'System Entropy',
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
unless 'ntp_time' in $optout_checks {
nagioscfg::service { 'check_ntp_time':
use => 'naemon-service',
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_ntp_time',
description => 'System NTP Time',
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
unless 'scriptherder' in $optout_checks {
nagioscfg::service { 'check_scriptherder':
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_scriptherder',
description => 'Scriptherder Status',
contact_groups => ['naemon-admins'],
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
unless 'apt' in $optout_checks {
nagioscfg::service { 'check_apt':
use => 'naemon-service',
hostgroup_name => [$nrpe_group],
check_command => 'check_nrpe!check_apt',
description => 'Packages available for upgrade',
require => File['/etc/naemon/conf.d/nagioscfg/'],
}
}
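The unless-blocks above all share one shape. An equivalent data-driven sketch, covering only the checks that use the naemon-service template and default contacts (check_reboot and check_scriptherder keep their own blocks):

$simple_checks = {
  'load'         => ['check_load', 'System Load'],
  'users'        => ['check_users', 'Active Users'],
  'zombie_procs' => ['check_zombie_procs', 'Zombie Processes'],
  'total_procs'  => ['check_total_procs_lax', 'Total Processes'],
  # ...plus dynamic_disk, uptime, memory, entropy, ntp_time and apt
}
$simple_checks.each |String $key, Array $check| {
  unless $key in $optout_checks {
    nagioscfg::service { "check_${key}":
      use            => 'naemon-service',
      hostgroup_name => [$nrpe_group],
      check_command  => "check_nrpe!${check[0]}",
      description    => $check[1],
      require        => File['/etc/naemon/conf.d/nagioscfg/'],
    }
  }
}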
require sunet::nagios::nrpe_check_cosmos_keys
nagioscfg::service {'check_cosmos_keys':
hostgroup_name => ['sunet::naemon_monitor'],
check_command => 'check_nrpe!check_cosmos_keys',
description => 'Cosmos GPG keys',
}
file { '/etc/naemon/conf.d/cosmos/naemon-hostgroups.cfg':
ensure => file,
mode => '0644',
group => 'root',
owner => 'root',
content => template('soc/naemon_monitor/naemon-hostgroups.cfg.erb'),
require => File['/etc/naemon/conf.d/cosmos/'],
}
file { '/etc/naemon/conf.d/cosmos/naemon-host.cfg':
ensure => file,
mode => '0644',
group => 'root',
owner => 'root',
content => template('soc/naemon_monitor/naemon-host.cfg.erb'),
require => File['/etc/naemon/conf.d/cosmos/'],
}
file { '/etc/naemon/conf.d/cosmos/naemon-service.cfg':
ensure => file,
mode => '0644',
group => 'root',
owner => 'root',
content => template('soc/naemon_monitor/naemon-service.cfg.erb'),
require => File['/etc/naemon/conf.d/cosmos/'],
}
file { '/etc/naemon/conf.d/cosmos/naemon-contactgroups.cfg':
ensure => file,
mode => '0644',
group => 'root',
owner => 'root',
content => template('soc/naemon_monitor/naemon-contactgroups.cfg.erb'),
require => File['/etc/naemon/conf.d/cosmos/'],
}
sunet::scriptherder::cronjob { 'thrukmaintenance':
cmd => '/usr/bin/docker exec --user www-data naemon_monitor-thruk-1 /usr/bin/thruk maintenance',
minute => '50',
ok_criteria => ['exit_status=0'],
warn_criteria => ['exit_status=1', 'max_age=24h'],
}
class { 'nagioscfg':
additional_entities => $additional_entities,
config => 'naemon_monitor',
default_host_group => $default_host_group,
manage_package => false,
manage_service => false,
cfgdir => '/etc/naemon/conf.d/nagioscfg',
host_template => 'naemon-host',
service => 'sunet-naemon_monitor',
single_ip => true,
require => File['/etc/naemon/conf.d/nagioscfg/'],
exclude_hosts => $exclude_hosts,
}
}

View file

@@ -0,0 +1,61 @@
otelcol.receiver.otlp "example" {
grpc {
endpoint = "[::]:4317"
tls {
cert_file = "/etc/dehydrated/fullchain.pem"
key_file = "/etc/dehydrated/privkey.pem"
}
}
http {
endpoint = "[::]:4318"
tls {
cert_file = "/etc/dehydrated/fullchain.pem"
key_file = "/etc/dehydrated/privkey.pem"
}
}
output {
metrics = [otelcol.processor.batch.example.input]
logs = [otelcol.processor.batch.example.input]
traces = [otelcol.processor.batch.example.input]
}
}
otelcol.processor.batch "example" {
output {
metrics = [otelcol.exporter.prometheus.monitor_mimir.input]
logs = [otelcol.exporter.loki.monitor_loki.input]
traces = [otelcol.exporter.otlphttp.monitor_tempo.input]
}
}
otelcol.exporter.otlphttp "monitor_tempo" {
client {
endpoint = "http://tempo:4318"
}
}
otelcol.exporter.prometheus "monitor_mimir" {
forward_to = [prometheus.remote_write.monitor_mimir.receiver]
}
prometheus.remote_write "monitor_mimir" {
endpoint {
url = "http://mimir:9009/api/v1/push"
}
}
otelcol.exporter.loki "monitor_loki" {
forward_to = [loki.write.monitor_loki.receiver]
}
loki.write "monitor_loki" {
endpoint {
url = "http://loki:3100/loki/api/v1/push"
}
}
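In short, the pipeline this file builds: one OTLP receiver (gRPC on 4317, HTTP on 4318, TLS from the mounted dehydrated certificates) feeds a single batch processor, which fans out metrics to Mimir via Prometheus remote write, logs to Loki, and traces to Tempo over OTLP/HTTP.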

View file

@@ -0,0 +1,60 @@
<VirtualHost _default_:443>
SSLEngine on
SSLCertificateFile /etc/letsencrypt/live/<%= @domain %>/cert.pem
SSLCertificateKeyFile /etc/letsencrypt/live/<%= @domain %>/privkey.pem
SSLCertificateChainFile /etc/letsencrypt/live/<%= @domain %>/chain.pem
Header always set Strict-Transport-Security "max-age=63072000"
SSLProtocol all -SSLv3 -TLSv1 -TLSv1.1
SSLCipherSuite ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384
SSLHonorCipherOrder off
SSLSessionTickets off
SSLUseStapling On
SSLCompression off
SSLOptions +StrictRequire
# Add vhost name to log entries:
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"" vhost_combined
LogFormat "%v %h %l %u %t \"%r\" %>s %b" vhost_common
BrowserMatch "MSIE [2-6]" \
nokeepalive ssl-unclean-shutdown \
downgrade-1.0 force-response-1.0
BrowserMatch "MSIE [17-9]" ssl-unclean-shutdown
RedirectMatch ^/$ /thruk/
Header set Content-Security-Policy "frame-src 'self' *.sunet.se *.swamid.se;"
<Location />
AuthType shibboleth
ShibRequestSetting requireSession 1
ShibRequestSetting entityIDSelf https://$hostname
Require valid-user
</Location>
<Location /grafana>
<IfFile '/etc/apache2/groups.txt'>
AuthGroupFile /etc/apache2/groups.txt
Require group editors
</IfFile>
ProxyPass http://grafana:3000 retry=0 disablereuse=On
ProxyPassReverse http://127.0.0.1:3000/grafana
RewriteEngine On
RewriteRule .* - [E=PROXY_USER:%{LA-U:REMOTE_USER},NS]
RequestHeader set X-WEBAUTH-USER "%{PROXY_USER}e"
RequestHeader set X-WEBAUTH-NAME "%{PROXY_USER}e"
# Websocket upgrade, needed for Grafana live view (Apache does not allow trailing comments):
RewriteCond %{HTTP:Upgrade} websocket [NC]
RewriteCond %{HTTP:Connection} upgrade [NC]
RewriteRule ^/?(.*) "ws://127.0.0.1:3000/$1" [P,L]
</Location>
<Location /histou>
ProxyPass http://histou:80/histou retry=0 disablereuse=On
ProxyPassReverse http://histou:80/histou
</Location>
</VirtualHost>
SSLStaplingCache "shmcb:logs/ssl_stapling(32768)"

View file

@@ -0,0 +1,139 @@
version: '3.2'
services:
always-https:
image: docker.sunet.se/always-https
<% unless @resolvers.empty? -%>
dns:
<% @resolvers.each do |resolver| -%>
- <%= resolver %>
<% end -%>
<% end -%>
ports:
- '80:80'
environment:
- 'ACME_URL=http://acme-c.sunet.se/'
naemon:
init: true
image: docker.sunet.se/naemon:<%= @naemon_tag %>
<% unless @resolvers.empty? -%>
dns:
<% @resolvers.each do |resolver| -%>
- <%= resolver %>
<% end -%>
<% end -%>
ports:
- '127.0.0.1:6666:6666'
volumes:
- '/etc/naemon/conf.d/nagioscfg:/etc/naemon/conf.d/nagioscfg:ro'
- '/etc/naemon/conf.d/cosmos:/etc/naemon/conf.d/cosmos:ro'
- '/usr/lib/nagios/plugins/check_inodes:/usr/lib/nagios/plugins/check_inodes:ro'
- '/usr/lib/nagios/plugins/cosmos:/usr/lib/nagios/plugins/cosmos:ro'
- '/var/lib/naemon/:/var/lib/naemon/'
- '/var/log/naemon/:/var/log/naemon/'
- '/var/nagflux/:/var/nagflux/'
<%- @naemon_extra_volumes.each do |extra_volume| -%>
- "<%= extra_volume %>"
<%- end -%>
thruk:
image: docker.sunet.se/thruk:<%= @thruk_tag %>
<% unless @resolvers.empty? -%>
dns:
<% @resolvers.each do |resolver| -%>
- <%= resolver %>
<% end -%>
<% end -%>
ports:
- '443:443'
volumes:
<%- if @acme_provider == 'dehydrated' -%>
- "/etc/dehydrated/certs/<%= @domain %>:/etc/dehydrated:ro"
<% end -%>
<%- if @acme_provider == 'acme-d' -%>
- "/etc/letsencrypt:/etc/letsencrypt:ro"
- "/opt/naemon_monitor/apache-thruk.cfg:/etc/apache2/sites-enabled/thruk.conf:ro'
<% end -%>
- '/opt/naemon_monitor/shib-certs:/etc/shibboleth/certs'
- '/opt/naemon_monitor/data:/var/lib/thruk'
- '/opt/naemon_monitor/menu_local.conf:/etc/thruk/menu_local.conf'
<%- @thruk_extra_volumes.each do |extra_volume| -%>
- "<%= extra_volume %>"
<%- end -%>
environment:
<%- @thruk_env.each do |environ| -%>
- "<%= environ %>"
<%- end -%>
influxdb:
image: influxdb:<%= @influxdb_tag %>
volumes:
- '/var/lib/influxdb:/var/lib/influxdb'
environment:
<%- @influx_env.each do |environ| -%>
- "<%= environ %>"
<%- end -%>
histou:
image: docker.sunet.se/histou:<%= @histou_tag %>
nagflux:
image: docker.sunet.se/nagflux:<%= @nagflux_tag %>
volumes:
- '/var/nagflux/:/var/nagflux/'
environment:
<%- @nagflux_env.each do |environ| -%>
- "<%= environ %>"
<%- end -%>
grafana:
image: grafana/grafana:<%= @grafana_tag %>
volumes:
- '/opt/naemon_monitor/grafana.ini:/etc/grafana/grafana.ini'
- '/opt/naemon_monitor/grafana-provisioning:/etc/grafana/provisioning:ro'
- '/opt/naemon_monitor/histou.js:/usr/share/grafana/public/dashboards/histou.js:ro'
- '/opt/naemon_monitor/grafana:/var/lib/grafana:rw'
<% if @receive_otel -%>
loki:
image: grafana/loki:<%= @loki_tag %>
ports:
- "3100:3100"
volumes:
- '/opt/naemon_monitor/loki:/loki:rw'
- '/opt/naemon_monitor/loki-server.yaml:/etc/loki/local-config.yaml:ro'
command: -config.file=/etc/loki/local-config.yaml
tempo:
image: grafana/tempo:<%= @tempo_tag %>
ports:
- "14268:14268" # jaeger ingest
- "3200:3200" # tempo
- "9095:9095" # tempo grpc
- "9411:9411" # zipkin
expose: # only reachable by other containers
- "4317" # otlp grpc
- "4318" # otlp http
command: [ "-config.file=/etc/tempo.yaml" ]
volumes:
- "/opt/naemon_monitor/tempo-server.yaml:/etc/tempo.yaml"
- "/opt/naemon_monitor/tempo:/var/tempo:rw"
mimir:
image: grafana/mimir:<%= @mimir_tag %>
command: ["-ingester.native-histograms-ingestion-enabled=true", "-config.file=/etc/mimir.yaml"]
ports:
- "9009:9009"
volumes:
- "/opt/naemon_monitor/mimir-server.yaml:/etc/mimir.yaml:ro"
- "/opt/naemon_monitor/mimir:/data:rw"
alloy: # router for OTel
image: grafana/alloy:<%= @alloy_tag %>
command: ["run", "/etc/alloy/config.alloy"]
ports:
- "4317-4318:4317-4318"
volumes:
- "/opt/naemon_monitor/alloy-server.alloy:/etc/alloy/config.alloy:ro"
- "/etc/dehydrated/certs/<%= @domain %>:/etc/dehydrated:ro"
<% end -%>

View file

@@ -0,0 +1,12 @@
apiVersion: 1
providers:
- name: "dashboards"
orgId: 1
type: file
disableDeletion: false
updateIntervalSeconds: 60
allowUiUpdates: false
options:
path: /etc/grafana/provisioning
foldersFromFilesStructure: true

View file

@@ -0,0 +1,282 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
},
{
"name": "DS_LOKI",
"label": "Loki",
"description": "",
"type": "datasource",
"pluginId": "loki",
"pluginName": "Loki"
}
],
"__elements": {},
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "11.1.4"
},
{
"type": "panel",
"id": "logs",
"name": "Logs",
"version": ""
},
{
"type": "datasource",
"id": "loki",
"name": "Loki",
"version": "1.0.0"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"panels": [],
"repeat": "instance",
"repeatDirection": "h",
"title": "$instance",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "mimir"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"__systemRef": "hideSeriesFrom",
"matcher": {
"id": "byNames",
"options": {
"mode": "exclude",
"names": [
"{__name__=\"up\",job=\"node\"}"
],
"prefix": "All except:",
"readOnly": true
}
},
"properties": []
}
]
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 1
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "11.1.4",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "mimir"
},
"editorMode": "code",
"expr": "rate(node_cpu_seconds_total{job=\"node\", instance=~\"$instance\", mode=\"user\"}[$__rate_interval])",
"instant": false,
"interval": "1m",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "CPU",
"type": "timeseries"
},
{
"datasource": {
"type": "loki",
"uid": "loki"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 9
},
"id": 3,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"pluginVersion": "11.1.4",
"targets": [
{
"datasource": {
"type": "loki",
"uid": "loki"
},
"editorMode": "code",
"expr": "{job=\"loki.source.journal.read\",hostname=~\"$instance\"}",
"queryType": "range",
"refId": "A"
}
],
"title": "Panel Title",
"type": "logs"
}
],
"schemaVersion": 39,
"tags": [],
"templating": {
"list": [
{
"current": {},
"datasource": {
"type": "prometheus",
"uid": "mimir"
},
"definition": "label_values(up,instance)",
"hide": 2,
"includeAll": true,
"multi": true,
"name": "instance",
"options": [],
"query": {
"qryType": 1,
"query": "label_values(up,instance)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Overview",
"uid": "sunet-overview",
"version": 8,
"weekStart": ""
}

View file

@@ -0,0 +1,14 @@
apiVersion: 1
deleteDatasources:
- name: nagflux
datasources:
- name: nagflux
type: influxdb
url: http://influxdb:8086
access: proxy
database: nagflux
isDefault: true
version: 1
editable: true

View file

@@ -0,0 +1,12 @@
apiVersion: 1
datasources:
- name: Loki
type: loki
uid: loki
access: proxy
orgId: 1
url: http://loki:3100
basicAuth: false
isDefault: false
version: 1
editable: false

View file

@@ -0,0 +1,16 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
# Access mode - proxy (server in the UI) or direct (browser in the UI).
url: "http://mimir:9009/prometheus"
uid: mimir
jsonData:
httpMethod: POST
manageAlerts: true
prometheusType: Mimir
cacheLevel: 'High'
disableRecordingRules: false
incrementalQueryOverlapWindow: 10m

View file

@@ -0,0 +1,48 @@
apiVersion: 1
datasources:
- name: Tempo
type: tempo
uid: tempo
url: http://tempo:3200
access: proxy
basicAuth: false
jsonData:
tracesToLogsV2:
# Field with an internal link pointing to a logs data source in Grafana.
# datasourceUid value must match the uid value of the logs data source.
datasourceUid: 'loki'
spanStartTimeShift: '-1h'
spanEndTimeShift: '1h'
tags: ['job', 'instance', 'pod', 'namespace']
filterByTraceID: false
filterBySpanID: false
customQuery: true
query: 'method="$${__span.tags.method}"'
tracesToMetrics:
datasourceUid: 'mimir'
spanStartTimeShift: '1h'
spanEndTimeShift: '-1h'
tags: [{ key: 'service.name', value: 'service' }, { key: 'job' }]
queries:
- name: 'Sample query'
query: 'sum(rate(traces_spanmetrics_latency_bucket{$$__tags}[5m]))'
tracesToProfiles:
datasourceUid: 'grafana-pyroscope-datasource'
tags: ['job', 'instance', 'pod', 'namespace']
profileTypeId: 'process_cpu:cpu:nanoseconds:cpu:nanoseconds'
customQuery: true
query: 'method="$${__span.tags.method}"'
serviceMap:
datasourceUid: 'mimir'
nodeGraph:
enabled: true
search:
hide: false
traceQuery:
timeShiftEnabled: true
spanStartTimeShift: '1h'
spanEndTimeShift: '-1h'
spanBar:
type: 'Tag'
tag: 'http.path'

View file

@@ -0,0 +1,30 @@
[log]
mode = console
level = debug
[server]
root_url = https://<%= @domain %>/grafana/
[users]
default_theme = light
allow_sign_up = false
auto_assign_org_role = <%= @grafana_default_role %>
[auth]
disable_signout_menu = true
[auth.proxy]
enabled = true
header_name = X-WEBAUTH-USER
header_property = username
auto_sign_up = true
sync_ttl = 60
[alerting]
enabled = false
[unified_alerting]
enabled = true
[security]
allow_embedding = true

View file

@@ -0,0 +1,223 @@
/* global _ */
// accessible variables in this scope
var window, document, ARGS, $, jQuery, moment, kbn;
//parse arguments
parseArgs()
return function (callback) {
if (window.location.href.search('/dashboard-solo/') != -1) {
document.documentElement.style.background = '#FFF';
}
var url = location.protocol+'//'+window.location.hostname+'/histou/';
var configUrl = url+'index.php?host='+host+'&service='+service+'&height='+height+'&legend='+legend+debug+disablePanelTitle+disablePerfdataLookup+specificTemplate+'&annotations='+annotations;
var flotAddons = url + 'flotAddons.js';
$.getScript(flotAddons, function (){});
if (!_.isUndefined(ARGS.customCSSFile)) {
$('head').append('<link rel="stylesheet" href="' + ARGS.customCSSFile + '" type="text/css" />');
}
cssLoaded = false;
jQuery('body').on('DOMNodeInserted', 'DIV.drop-popover', function (e) {
var cssUrl = url+'lightbox/css/light.css'
if (!cssLoaded) {
$('head').append('<link rel="stylesheet" href="'+url+'lightbox/css/light.css" type="text/css" />');
$.getScript(url+'lightbox/js/light.js', function(){});
cssLoaded = true;
}
var box = $( e.currentTarget ).find( "DIV.sakuli-popup" );
if (box.length > 0 ){
$(box[0]).attr('class', 'sakuli-image');
var sakuliUrl = site[1] + box[0].innerHTML;
var svcoutput;
var imagename;
jQuery.when(
// fetch Sakuli serviceoutput file
$.get( sakuliUrl + "output.txt").always(function(data ,state) {
if (state != "success" ) {
data = "Could not find Sakuli service outputfile at " + sakuliUrl + "output.txt !"
}
console.log(data);
svcoutput = $("<div>").text(data).html().replace(/['"]+/g, '');
console.log("Sakuli service output: " + svcoutput);
}) &&
// fetch Sakuli screenshot (jpg/png)
$.get( sakuliUrl ).always(function(imgdata ,state) {
if (state != "success" ) {
imgdata = "Could not access screenshot list page at " + sakuliUrl + "!"
}
// the 3rd href on the apache index page contains the img name
imagename = $(imgdata).find('a')[2].text.trim();
console.log("Sakuli screenshot image name: " + imagename);
})
).then ( function() {
box[0].innerHTML = '<a href="' + sakuliUrl + imagename + '" data-lightbox="sakuli" data-title="'+ svcoutput +'"><img src="'+ sakuliUrl + imagename +'" alt="Sakuli error image" width=250px /></a>';
});
}
});
$.ajax(
{
method: 'GET',
url: configUrl,
dataType: "jsonp",
}
).done(
function (result) {
console.log(result);
callback(result);
}
).fail(
function (result) {
console.log(result);
console.log(configUrl);
if (result.status == 200) {
callback(createErrorDashboard('# HTTP code: '+result.status+'\n# Message: '+result.statusText+'\n# Url: '+configUrl+'\n# Probably the output is not valid json, because the returncode is 200!'));
} else {
callback(createErrorDashboard('# HTTP code: '+result.status+'\n# Message: '+result.statusText+'\n# Url: '+configUrl));
}
}
);
}
function createErrorDashboard(message)
{
return {
rows : [{
title: 'Chart',
height: '300px',
panels : [{
title: 'Error Message below',
type: 'text',
span: 12,
fill: 1,
content: message,
}]
}],
services : {},
title : 'JS Error / HTTP Error'
};
}
function parseArgs()
{
if (!_.isUndefined(ARGS.reduce)) {
$('head').append('<style>.panel-fullscreen {top:0}</style>');
//change ui to our needs
clearUi();
}
if (!_.isUndefined(ARGS.dynUnit)) {
dynUnit = true;
} else {
dynUnit = false;
}
if (!_.isUndefined(ARGS.host)) {
host = ARGS.host;
} else {
host = "";
}
if (!_.isUndefined(ARGS.service)) {
service = ARGS.service;
} else {
service = "";
}
if (!_.isUndefined(ARGS.command)) {
command = ARGS.command;
} else {
command = "";
}
if (!_.isUndefined(ARGS.perf)) {
perf = ARGS.perf;
} else {
perf = "";
}
if (!_.isUndefined(ARGS.height)) {
height = ARGS.height;
} else {
height = "";
}
if (_.isUndefined(ARGS.debug)) {
debug = '';
} else {
debug = "&debug";
}
if (!_.isUndefined(ARGS.legend)) {
legend = ARGS.legend;
} else {
legend = true;
}
if (!_.isUndefined(ARGS.annotations)) {
annotations = ARGS.annotations;
} else {
annotations = false;
}
if(_.isUndefined(ARGS.disablePanelTitle)) {
disablePanelTitle = '';
}else{
disablePanelTitle = "&disablePanelTitle";
}
if(_.isUndefined(ARGS.disablePerfdataLookup)) {
disablePerfdataLookup = '';
}else{
disablePerfdataLookup = "&disablePerfdataLookup";
}
if(_.isUndefined(ARGS.specificTemplate)) {
specificTemplate = '';
}else{
specificTemplate = "&specificTemplate="+ARGS.specificTemplate;
}
}
function clearUi()
{
//removes white space
var checkExist = setInterval(
function () {
if ($('.panel-content').length) {
clearInterval(checkExist);
document.getElementsByClassName("panel-content")[0].style.paddingBottom = '0px';
}
},
100
);
/*
.panel-header removes the headline of the graphs
.navbar-static-top removes the menubar on the top
.row-control-inner removes the row controll button on the left
.span12 removes the add new row button on the bottom
*/
divs = ['.panel-header','.navbar-static-top','.row-control-inner','.span12']
for (index = 0; index < divs.length; index++) {
waitForDivAndDeleteIt(divs[index]);
}
function waitForDivAndDeleteIt(div)
{
var checkExist = setInterval(
function () {
if ($(div).length) {
clearInterval(checkExist);
$(div).remove();
}
},
100
);
}
}

View file

@@ -0,0 +1,13 @@
/var/log/naemon/naemon.log {
daily
rotate 3650
nocompress
olddir archives
dateext
dateformat -%Y%m%d
missingok
notifempty
postrotate
/usr/bin/docker exec naemon_monitor-naemon-1 pkill --signal USR1 -f '/usr/bin/naemon --allow-root /etc/naemon/naemon.cfg'
endscript
}

View file

@@ -0,0 +1,65 @@
auth_enabled: false
server:
http_listen_port: 3100
common:
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory
compactor:
working_directory: /loki/retention
compaction_interval: 10m
retention_enabled: true
retention_delete_delay: 2h
retention_delete_worker_count: 150
delete_request_store: filesystem
schema_config:
configs:
- from: 2020-10-24
store: boltdb-shipper
object_store: filesystem
schema: v11
index:
prefix: index_
period: 24h
- from: "2023-01-05" # <---- A date in the future
index:
period: 24h
prefix: index_
object_store: filesystem
schema: v13
store: tsdb
storage_config:
tsdb_shipper:
active_index_directory: /loki/tsdb-index
cache_location: /loki/tsdb-cache
ruler:
alertmanager_url: http://localhost:9093
limits_config:
retention_period: <%= @otel_retention %>
retention_stream:
- selector: '{namespace="debug"}'
priority: 1
period: 48h
# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration
# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/
#
# Statistics help us better understand how Loki is used, and they show us performance
# levels for most users. This helps us prioritize features and documentation.
# For more information on what's sent, look at
# https://github.com/grafana/loki/blob/main/pkg/usagestats/stats.go
# Refer to the buildReport method to see what goes into a report.
#
# If you would like to disable reporting, uncomment the following lines:
#analytics:
# reporting_enabled: false

View file

@@ -0,0 +1,45 @@
multitenancy_enabled: false
blocks_storage:
backend: filesystem
bucket_store:
sync_dir: /data/mimir/tsdb-sync
filesystem:
dir: /data/mimir/data/tsdb
tsdb:
dir: /data/mimir/tsdb
compactor:
data_dir: /data/mimir/compactor
sharding_ring:
kvstore:
store: memberlist
distributor:
ring:
instance_addr: 127.0.0.1
kvstore:
store: memberlist
ingester:
ring:
instance_addr: 127.0.0.1
kvstore:
store: memberlist
replication_factor: 1
ruler_storage:
backend: filesystem
filesystem:
dir: /data/mimir/rules
server:
http_listen_port: 9009
log_level: error
store_gateway:
sharding_ring:
replication_factor: 1
limits:
# Delete from storage metrics data older than x.
compactor_blocks_retention_period: <%= @otel_retention %>

View file

@@ -0,0 +1,4 @@
define contactgroup {
contactgroup_name naemon-admins
alias Naemon Administrators
}

View file

@@ -0,0 +1,19 @@
# Generic host definition template - This is NOT a real host, just a template!
define host{
name naemon-host ; The name of this host template
action_url /grafana/dashboard/script/histou.js?host=$HOSTNAME$&theme=light&annotations=true
notifications_enabled 1 ; Host notifications are enabled
event_handler_enabled 1 ; Host event handler is enabled
flap_detection_enabled 1 ; Flap detection is enabled
process_perf_data 1 ; Process performance data
retain_status_information 1 ; Retain status information across program restarts
retain_nonstatus_information 1 ; Retain non-status information across program restarts
check_command check-host-alive
max_check_attempts 10
notification_interval 0
notification_period 24x7
notification_options d,u,r
contact_groups admins
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL HOST, JUST A TEMPLATE!
}

View file

@@ -0,0 +1,12 @@
# A simple wildcard hostgroup
define hostgroup {
hostgroup_name all
alias All Servers
members *
}
# Predefine empty group that we can use in puppet-nagioscfg in order to migrate
# away from the 'nrpe' group created from the cosmos-db.
define hostgroup {
hostgroup_name from_puppet-nagioscfg
alias from_puppet-nagioscfg
}

View file

@@ -0,0 +1,16 @@
define service {
name naemon-service ; The 'name' of this service template
action_url /grafana/dashboard/script/histou.js?host=$HOSTNAME$&service=$SERVICEDISPLAYNAME$&theme=light&annotations=true
contact_groups naemon-admins ; Notifications get sent out to everyone in the 'naemon-admins' group
event_handler_enabled 1 ; Service event handler is enabled
flap_detection_enabled 1 ; Flap detection is enabled
max_check_attempts 3 ; Re-check the service up to 3 times in order to determine its final (hard) state
notification_interval 0 ; 0 = do not re-notify about service problems
notification_options u,w,c,r ; Send notifications about warning, unknown, critical, and recovery events
notification_period 24x7 ; Notifications can be sent out at any time
obsess_over_service 1 ; We should obsess over this service (if necessary)
process_perf_data 1 ; Process performance data
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
retain_nonstatus_information 1 ; Retain non-status information across program restarts
retain_status_information 1 ; Retain status information across program restarts
}

View file

@@ -0,0 +1,4 @@
[Service]
# livestatus.so can't handle HUP with a TCP listener
# https://github.com/naemon/naemon-livestatus/issues/117
ExecReload=/usr/bin/docker restart <%= @naemon_container %>

View file

@@ -0,0 +1,56 @@
stream_over_http_enabled: true
server:
http_listen_port: 3200
log_level: info
query_frontend:
search:
duration_slo: 5s
throughput_bytes_slo: 1.073741824e+09
trace_by_id:
duration_slo: 5s
distributor:
receivers: # this configuration will listen on all ports and protocols that tempo is capable of.
jaeger: # the receivers all come from the OpenTelemetry collector; more configuration information can
protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
thrift_http: #
grpc: # for a production deployment you should only enable the receivers you need!
thrift_binary:
thrift_compact:
zipkin:
otlp:
protocols:
http:
grpc:
opencensus:
compactor:
compaction:
block_retention: <%= @otel_retention %>
metrics_generator:
registry:
external_labels:
source: tempo
cluster: docker-compose
storage:
path: /var/tempo/generator/wal
remote_write:
- url: http://mimir:9009/api/v1/push # this stack runs Mimir, not Prometheus, on 9009
send_exemplars: true
traces_storage:
path: /var/tempo/generator/traces
storage:
trace:
backend: local # backend configuration to use
wal:
path: /var/tempo/wal # where to store the wal locally
local:
path: /var/tempo/blocks
overrides:
defaults:
metrics_generator:
processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator