author     TheArcaneBrony <myrainbowdash949@gmail.com>  2023-04-04 17:28:55 +0200
committer  Rory& <root@rory.gay>                        2024-06-05 17:25:16 +0200
commit     fde1a8dc87adbbc24d80cfb06973b4c01bf31121 (patch)
tree       963c539bf9a89e9bdb5884250345cc0440d243c2
parent     Nginx body size (diff)
download   Spacebar-Open-Infrastructure-fde1a8dc87adbbc24d80cfb06973b4c01bf31121.tar.xz
Add monitoring with prometheus
-rwxr-xr-x  host/Spacebar-monitoring/configuration.nix          | 223
-rwxr-xr-x  host/Spacebar-nginx/hosts/spacebar.chat/grafana.nix |  14
2 files changed, 237 insertions, 0 deletions
diff --git a/host/Spacebar-monitoring/configuration.nix b/host/Spacebar-monitoring/configuration.nix
new file mode 100755
index 0000000..c73b94f
--- /dev/null
+++ b/host/Spacebar-monitoring/configuration.nix
@@ -0,0 +1,223 @@
+{ config, pkgs, lib, ... }:
+
+{
+	imports = [
+		../../modules/base.nix
+	];
+
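+	# Static IPv4 addresses for the two VM interfaces.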
+	networking = {
+		hostName = "Spacebar-monitoring";
+		interfaces.ens18.ipv4.addresses = [ { 
+			address = "192.168.1.99";
+			prefixLength = 24;
+		} ];
+		interfaces.ens19.ipv4.addresses = [ {
+			address = "10.10.11.99";
+			prefixLength = 16;
+		} ];
+	};
+
+	services = {
+		prometheus = {
+			enable = true;
+			stateDir = "prometheus";
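+			# One year of retention plus local-storage tuning. Note that these are
+			# Prometheus 1.x-style storage flags.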
+			extraFlags = [
+				"-storage.local.retention 8760h"
+				"-storage.local.series-file-shrink-ratio 0.3"
+				"-storage.local.memory-chunks 2097152"
+				"-storage.local.max-chunks-to-persist 1048576"
+				"-storage.local.index-cache-size.fingerprint-to-metric 2097152"
+				"-storage.local.index-cache-size.fingerprint-to-timerange 1048576"
+				"-storage.local.index-cache-size.label-name-to-label-values 2097152"
+				"-storage.local.index-cache-size.label-pair-to-fingerprints 41943040"
+			];
+			# alertmanagerURL = [ "http://localhost:9093" ];
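+			# Alerting rules in the Prometheus 1.x rule syntax: node down, failed
+			# systemd units, disk and file-descriptor exhaustion, and sustained
+			# load/CPU/RAM/swap pressure.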
+			rules = [
+				''
+				ALERT node_down
+				IF up == 0
+				FOR 5m
+				LABELS {
+					severity="page"
+				}
+				ANNOTATIONS {
+					summary = "{{$labels.alias}}: Node is down.",
+					description = "{{$labels.alias}} has been down for more than 5 minutes."
+				}
+				ALERT node_systemd_service_failed
+				IF node_systemd_unit_state{state="failed"} == 1
+				FOR 4m
+				LABELS {
+					severity="page"
+				}
+				ANNOTATIONS {
+					summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.",
+					description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}."
+				}
+				ALERT node_filesystem_full_90percent
+				IF sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024^3
+				FOR 5m
+				LABELS {
+					severity="page"
+				}
+				ANNOTATIONS {
+					summary = "{{$labels.alias}}: Filesystem is running out of space soon.",
+					description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} has less than 10% space left on its filesystem."
+				}
+				ALERT node_filesystem_full_in_4h
+				IF predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4*3600) <= 0
+				FOR 5m
+				LABELS {
+					severity="page"
+				}
+				ANNOTATIONS {
+					summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours.",
+					description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space in approx. 4 hours."
+				}
+				ALERT node_filedescriptors_full_in_3h
+				IF predict_linear(node_filefd_allocated[1h], 3*3600) >= node_filefd_maximum
+				FOR 20m
+				LABELS {
+					severity="page"
+				}
+				ANNOTATIONS {
+					summary = "{{$labels.alias}} is running out of available file descriptors in 3 hours.",
+					description = "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours"
+				}
+				ALERT node_load1_90percent
+				IF node_load1 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0.9
+				FOR 1h
+				LABELS {
+					severity="page"
+				}
+				ANNOTATIONS {
+					summary = "{{$labels.alias}}: Running on high load.",
+					description = "{{$labels.alias}} is running with > 90% total load for at least 1h."
+				}
+				ALERT node_cpu_util_90percent
+				IF 100 - (avg by (alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90
+				FOR 1h
+				LABELS {
+					severity="page"
+				}
+				ANNOTATIONS {
+					summary = "{{$labels.alias}}: High CPU utilization.",
+					description = "{{$labels.alias}} has total CPU utilization over 90% for at least 1h."
+				}
+				ALERT node_ram_using_90percent
+				IF node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1
+				FOR 30m
+				LABELS {
+					severity="page"
+				}
+				ANNOTATIONS {
+					summary = "{{$labels.alias}}: Using lots of RAM.",
+					description = "{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now."
+				}
+				ALERT node_swap_using_80percent
+				IF node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8
+				FOR 10m
+				LABELS {
+					severity="page"
+				}
+				ANNOTATIONS {
+					summary = "{{$labels.alias}}: Running out of swap soon.",
+					description = "{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now."
+				}
+				''
+			];
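+			# Scrape node_exporter (port 9100) on this host and on the nginx,
+			# email and postgres VMs every 5 seconds.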
+			scrapeConfigs = [
+				{
+					job_name = "node";
+					scrape_interval = "5s";
+					static_configs = [
+						{
+							targets = [
+								"localhost:9100"
+							];
+							labels = {
+								alias = "monitoring";
+							};
+						}
+						{
+							targets = [
+								"192.168.1.2:9100"
+							];
+							labels = {
+								alias = "nginx";
+							};
+						}
+						{
+							targets = [
+								"192.168.1.3:9100"
+							];
+							labels = {
+								alias = "email";
+							};
+						}
+						{
+							targets = [
+								"192.168.1.4:9100"
+							];
+							labels = {
+								alias = "postgres";
+							};
+						}
+						# {
+						# 	targets = [
+						# 		"192.168.1.5:9100"
+						# 	];
+						# 	labels = {
+						# 		alias = "synapse";
+						# 	};
+						# }
+					];
+				}
+			];
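+			# Alertmanager is present but disabled; the SMTP and webhook targets
+			# below are example.com placeholders.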
+			alertmanager = {
+				enable = false;
+				listenAddress = "0.0.0.0";
+				configuration = {
+					"global" = {
+						"smtp_smarthost" = "smtp.example.com:587";
+						"smtp_from" = "alertmanager@example.com";
+					};
+					"route" = {
+						"group_by" = [ "alertname" "alias" ];
+						"group_wait" = "30s";
+						"group_interval" = "2m";
+						"repeat_interval" = "4h";
+						"receiver" = "team-admins";
+					};
+					"receivers" = [
+						{
+						"name" = "team-admins";
+						"email_configs" = [
+							{
+							"to" = "devnull@example.com";
+							"send_resolved" = true;
+							}
+						];
+						"webhook_configs" = [
+							{
+							"url" = "https://example.com/prometheus-alerts";
+							"send_resolved" = true;
+							}
+						];
+						}
+					];
+				};
+			};
+		};
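+		# Grafana on its default port (3000), listening on all interfaces and
+		# fronted by the grafana.spacebar.chat vhost on the nginx host.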
+		grafana = {
+			enable = true;
+			addr = "0.0.0.0";
+			domain = "grafana.spacebar.chat";
+			rootUrl = "https://grafana.spacebar.chat/";
+		};
+	};
+
+	system.stateVersion = "22.11"; # DO NOT EDIT!
+}
+
diff --git a/host/Spacebar-nginx/hosts/spacebar.chat/grafana.nix b/host/Spacebar-nginx/hosts/spacebar.chat/grafana.nix
new file mode 100755
index 0000000..9aad5f5
--- /dev/null
+++ b/host/Spacebar-nginx/hosts/spacebar.chat/grafana.nix
@@ -0,0 +1,14 @@
+{
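+  # nginx vhost for grafana.spacebar.chat: ACME/TLS, reverse-proxied (with
+  # websocket support) to Grafana on the monitoring host.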
+  enableACME = true;
+  forceSSL = true;
+  locations = {
+    "/" = {
+      proxyPass = "http://192.168.1.99:3000";
+      proxyWebsockets = true;
+      extraConfig =
+        "proxy_ssl_server_name on;" +
+        "proxy_pass_header Authorization;"
+        ;
+    };
+  };
+}