diff options
Diffstat (limited to 'host/Spacebar-monitoring/configuration.nix')
-rwxr-xr-x | host/Spacebar-monitoring/configuration.nix | 267 |
1 files changed, 0 insertions, 267 deletions
diff --git a/host/Spacebar-monitoring/configuration.nix b/host/Spacebar-monitoring/configuration.nix deleted file mode 100755 index 436dc47..0000000 --- a/host/Spacebar-monitoring/configuration.nix +++ /dev/null @@ -1,267 +0,0 @@ -{ config, pkgs, lib, secrets, ... }: - -{ - imports = - [ - ../../modules/base.nix - ]; - - networking = { - hostName = "Spacebar-monitoring"; - interfaces.ens18.ipv4.addresses = [ { - address = "192.168.1.99"; - prefixLength = 24; - } ]; - interfaces.ens19.ipv4.addresses = [ { - address = "10.10.11.99"; - prefixLength = 16; - } ]; - }; - - services = { - prometheus = { - enable = true; - stateDir = "prometheus"; - retentionTime = "1y"; - extraFlags = [ - ]; - # alertmanagerURL = [ "http://localhost:9093" ]; - # rules = [ - # '' - # ALERT node_down - # IF up == 0 - # FOR 5m - # LABELS { - # severity="page" - # } - # ANNOTATIONS { - # summary = "{{$labels.alias}}: Node is down.", - # description = "{{$labels.alias}} has been down for more than 5 minutes." - # } - # ALERT node_systemd_service_failed - # IF node_systemd_unit_state{state="failed"} == 1 - # FOR 4m - # LABELS { - # severity="page" - # } - # ANNOTATIONS { - # summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.", - # description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}." - # } - # ALERT node_filesystem_full_90percent - # IF sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024^3 - # FOR 5m - # LABELS { - # severity="page" - # } - # ANNOTATIONS { - # summary = "{{$labels.alias}}: Filesystem is running out of space soon.", - # description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem." - # } - # ALERT node_filesystem_full_in_4h - # IF predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4*3600) <= 0 - # FOR 5m - # LABELS { - # severity="page" - # } - # ANNOTATIONS { - # summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours.", - # description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space of in approx. 4 hours" - # } - # ALERT node_filedescriptors_full_in_3h - # IF predict_linear(node_filefd_allocated[1h], 3*3600) >= node_filefd_maximum - # FOR 20m - # LABELS { - # severity="page" - # } - # ANNOTATIONS { - # summary = "{{$labels.alias}} is running out of available file descriptors in 3 hours.", - # description = "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours" - # } - # ALERT node_load1_90percent - # IF node_load1 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0.9 - # FOR 1h - # LABELS { - # severity="page" - # } - # ANNOTATIONS { - # summary = "{{$labels.alias}}: Running on high load.", - # description = "{{$labels.alias}} is running with > 90% total load for at least 1h." - # } - # ALERT node_cpu_util_90percent - # IF 100 - (avg by (alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90 - # FOR 1h - # LABELS { - # severity="page" - # } - # ANNOTATIONS { - # summary = "{{$labels.alias}}: High CPU utilization.", - # description = "{{$labels.alias}} has total CPU utilization over 90% for at least 1h." - # } - # ALERT node_ram_using_90percent - # IF node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1 - # FOR 30m - # LABELS { - # severity="page" - # } - # ANNOTATIONS { - # summary="{{$labels.alias}}: Using lots of RAM.", - # description="{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.", - # } - # ALERT node_swap_using_80percent - # IF node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8 - # FOR 10m - # LABELS { - # severity="page" - # } - # ANNOTATIONS { - # summary="{{$labels.alias}}: Running out of swap soon.", - # description="{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now." - # } - # '' - # ]; - scrapeConfigs = [ - { - job_name = "node"; - scrape_interval = "5s"; - static_configs = [ - { - targets = [ - "localhost:9100" - ]; - labels = { - alias = "monitoring"; - }; - } - { - targets = [ - "192.168.1.2:9100" - ]; - labels = { - alias = "nginx"; - }; - } - { - targets = [ - "192.168.1.3:9100" - ]; - labels = { - alias = "email"; - }; - } - { - targets = [ - "192.168.1.4:9100" - ]; - labels = { - alias = "postgres"; - }; - } - # { - # targets = [ - # "192.168.1.5:9100" - # ]; - # labels = { - # alias = "synapse"; - # }; - # } - ]; - } - ]; - alertmanager = { - enable = false; - listenAddress = "0.0.0.0"; - configuration = { - "global" = { - "smtp_smarthost" = "smtp.example.com:587"; - "smtp_from" = "alertmanager@example.com"; - }; - "route" = { - "group_by" = [ "alertname" "alias" ]; - "group_wait" = "30s"; - "group_interval" = "2m"; - "repeat_interval" = "4h"; - "receiver" = "team-admins"; - }; - "receivers" = [ - { - "name" = "team-admins"; - "email_configs" = [ - { - "to" = "devnull@example.com"; - "send_resolved" = true; - } - ]; - "webhook_configs" = [ - { - "url" = "https://example.com/prometheus-alerts"; - "send_resolved" = true; - } - ]; - } - ]; - }; - }; - }; - grafana = { - enable = true; - settings = { - server = { - root_url = "https://grafana.spacebar.chat/"; - enable_gzip = true; - http_addr = "0.0.0.0"; - domain = "grafana.spacebar.chat"; - }; - analytics.reporting_enabled = false; - security.secret_key = secrets.secret_keys.grafana; - }; - provision = { - datasources.settings = { - datasources = [ - { - name = "Prometheus"; - type = "prometheus"; - url = "http://localhost:9090"; - access = "proxy"; - isDefault = true; - } - ]; - }; - dashboards.settings = { - apiVersion = 1; - providers = [ - { - name = "Prometheus 2.0 Overview"; - revision = "1"; - folder = "prometheus"; - uid = "prometheus2"; - type = "file"; - options = { - path = pkgs.fetchurl { - url = "https://grafana.com/api/dashboards/3662/revisions/2/download"; - hash = "sha256:111krihyc4ydwcb9r9a6xrn10841nprvb7sz36az2djqyzrj4yzs"; - }; - }; - } - { - name = "Node Exporter Full"; - revision = "31"; - folder = "prometheus"; - uid = "node-exporter-full"; - type = "file"; - options = { - path = pkgs.fetchurl { - url = "https://grafana.com/api/dashboards/1860/revisions/31/download"; - hash = "sha256:120wyg0d1ycn8wkyng9ngms4v2hri8b7x37dfd318qdjfsr4gi22"; - }; - }; - } - ]; - }; - }; - }; - }; - - system.stateVersion = "22.11"; # DO NOT EDIT! -} - |