summary refs log tree commit diff
path: root/host/Spacebar-monitoring/configuration.nix
diff options
context:
space:
mode:
Diffstat (limited to 'host/Spacebar-monitoring/configuration.nix')
-rwxr-xr-x host/Spacebar-monitoring/configuration.nix 188
1 files changed, 94 insertions, 94 deletions
diff --git a/host/Spacebar-monitoring/configuration.nix b/host/Spacebar-monitoring/configuration.nix
index c73b94f..e5a464b 100755
--- a/host/Spacebar-monitoring/configuration.nix
+++ b/host/Spacebar-monitoring/configuration.nix
@@ -33,100 +33,100 @@
 				"-storage.local.index-cache-size.label-pair-to-fingerprints 41943040"
 			];
 			# alertmanagerURL = [ "http://localhost:9093" ];
-			rules = [
-				''
-				ALERT node_down
-				IF up == 0
-				FOR 5m
-				LABELS {
-					severity="page"
-				}
-				ANNOTATIONS {
-					summary = "{{$labels.alias}}: Node is down.",
-					description = "{{$labels.alias}} has been down for more than 5 minutes."
-				}
-				ALERT node_systemd_service_failed
-				IF node_systemd_unit_state{state="failed"} == 1
-				FOR 4m
-				LABELS {
-					severity="page"
-				}
-				ANNOTATIONS {
-					summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.",
-					description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}."
-				}
-				ALERT node_filesystem_full_90percent
-				IF sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024^3
-				FOR 5m
-				LABELS {
-					severity="page"
-				}
-				ANNOTATIONS {
-					summary = "{{$labels.alias}}: Filesystem is running out of space soon.",
-					description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem."
-				}
-				ALERT node_filesystem_full_in_4h
-				IF predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4*3600) <= 0
-				FOR 5m
-				LABELS {
-					severity="page"
-				}
-				ANNOTATIONS {
-					summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours.",
-					description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space of in approx. 4 hours"
-				}
-				ALERT node_filedescriptors_full_in_3h
-				IF predict_linear(node_filefd_allocated[1h], 3*3600) >= node_filefd_maximum
-				FOR 20m
-				LABELS {
-					severity="page"
-				}
-				ANNOTATIONS {
-					summary = "{{$labels.alias}} is running out of available file descriptors in 3 hours.",
-					description = "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours"
-				}
-				ALERT node_load1_90percent
-				IF node_load1 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0.9
-				FOR 1h
-				LABELS {
-					severity="page"
-				}
-				ANNOTATIONS {
-					summary = "{{$labels.alias}}: Running on high load.",
-					description = "{{$labels.alias}} is running with > 90% total load for at least 1h."
-				}
-				ALERT node_cpu_util_90percent
-				IF 100 - (avg by (alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90
-				FOR 1h
-				LABELS {
-					severity="page"
-				}
-				ANNOTATIONS {
-					summary = "{{$labels.alias}}: High CPU utilization.",
-					description = "{{$labels.alias}} has total CPU utilization over 90% for at least 1h."
-				}
-				ALERT node_ram_using_90percent
-				IF node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1
-				FOR 30m
-				LABELS {
-					severity="page"
-				}
-				ANNOTATIONS {
-					summary="{{$labels.alias}}: Using lots of RAM.",
-					description="{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.",
-				}
-				ALERT node_swap_using_80percent
-				IF node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8
-				FOR 10m
-				LABELS {
-					severity="page"
-				}
-				ANNOTATIONS {
-					summary="{{$labels.alias}}: Running out of swap soon.",
-					description="{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now."
-				}
-				''
-			];
+			# rules = [
+			# 	''
+			# 	ALERT node_down
+			# 	IF up == 0
+			# 	FOR 5m
+			# 	LABELS {
+			# 		severity="page"
+			# 	}
+			# 	ANNOTATIONS {
+			# 		summary = "{{$labels.alias}}: Node is down.",
+			# 		description = "{{$labels.alias}} has been down for more than 5 minutes."
+			# 	}
+			# 	ALERT node_systemd_service_failed
+			# 	IF node_systemd_unit_state{state="failed"} == 1
+			# 	FOR 4m
+			# 	LABELS {
+			# 		severity="page"
+			# 	}
+			# 	ANNOTATIONS {
+			# 		summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.",
+			# 		description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}."
+			# 	}
+			# 	ALERT node_filesystem_full_90percent
+			# 	IF sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024^3
+			# 	FOR 5m
+			# 	LABELS {
+			# 		severity="page"
+			# 	}
+			# 	ANNOTATIONS {
+			# 		summary = "{{$labels.alias}}: Filesystem is running out of space soon.",
+			# 		description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem."
+			# 	}
+			# 	ALERT node_filesystem_full_in_4h
+			# 	IF predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4*3600) <= 0
+			# 	FOR 5m
+			# 	LABELS {
+			# 		severity="page"
+			# 	}
+			# 	ANNOTATIONS {
+			# 		summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours.",
+			# 		description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space of in approx. 4 hours"
+			# 	}
+			# 	ALERT node_filedescriptors_full_in_3h
+			# 	IF predict_linear(node_filefd_allocated[1h], 3*3600) >= node_filefd_maximum
+			# 	FOR 20m
+			# 	LABELS {
+			# 		severity="page"
+			# 	}
+			# 	ANNOTATIONS {
+			# 		summary = "{{$labels.alias}} is running out of available file descriptors in 3 hours.",
+			# 		description = "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours"
+			# 	}
+			# 	ALERT node_load1_90percent
+			# 	IF node_load1 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0.9
+			# 	FOR 1h
+			# 	LABELS {
+			# 		severity="page"
+			# 	}
+			# 	ANNOTATIONS {
+			# 		summary = "{{$labels.alias}}: Running on high load.",
+			# 		description = "{{$labels.alias}} is running with > 90% total load for at least 1h."
+			# 	}
+			# 	ALERT node_cpu_util_90percent
+			# 	IF 100 - (avg by (alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90
+			# 	FOR 1h
+			# 	LABELS {
+			# 		severity="page"
+			# 	}
+			# 	ANNOTATIONS {
+			# 		summary = "{{$labels.alias}}: High CPU utilization.",
+			# 		description = "{{$labels.alias}} has total CPU utilization over 90% for at least 1h."
+			# 	}
+			# 	ALERT node_ram_using_90percent
+			# 	IF node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1
+			# 	FOR 30m
+			# 	LABELS {
+			# 		severity="page"
+			# 	}
+			# 	ANNOTATIONS {
+			# 		summary="{{$labels.alias}}: Using lots of RAM.",
+			# 		description="{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.",
+			# 	}
+			# 	ALERT node_swap_using_80percent
+			# 	IF node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8
+			# 	FOR 10m
+			# 	LABELS {
+			# 		severity="page"
+			# 	}
+			# 	ANNOTATIONS {
+			# 		summary="{{$labels.alias}}: Running out of swap soon.",
+			# 		description="{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now."
+			# 	}
+			# 	''
+			# ];
 			scrapeConfigs = [
 				{
 					job_name = "node";