feat: monitoring fixes
Some checks failed
/ lint (push) Failing after 2m5s

This commit is contained in:
Tine 2024-07-27 11:31:15 +02:00
parent 0f179e4c0e
commit 52f5d3e307
Signed by: mentos1386
SSH key fingerprint: SHA256:MNtTsLbihYaWF8j1fkOHfkKNlnN1JQfxEU/rBU8nCGw
2 changed files with 119 additions and 67 deletions

View file

@ -12,10 +12,36 @@ resource "kubernetes_manifest" "prometheus-service-monitors" {
manifest = yamldecode(file("${path.module}/manifests/crd-servicemonitors.yaml")) manifest = yamldecode(file("${path.module}/manifests/crd-servicemonitors.yaml"))
} }
# Installs the kube-state-metrics Helm chart into the monitoring namespace.
resource "helm_release" "kube-state-metrics" {
  # Wait for the PodMonitor / ServiceMonitor CRD manifests before installing.
  # NOTE(review): presumably required because `prometheus.monitor.enabled`
  # below makes the chart render a ServiceMonitor object - confirm.
  depends_on = [
    kubernetes_manifest.prometheus-pod-monitors,
    kubernetes_manifest.prometheus-service-monitors,
  ]

  # Chart coordinates; the version is pinned.
  name       = "kube-state-metrics"
  chart      = "kube-state-metrics"
  repository = "https://prometheus-community.github.io/helm-charts"
  version    = "5.24.0"
  namespace  = kubernetes_namespace.monitoring-system.metadata[0].name

  # Failure-handling flags passed to the Helm provider.
  atomic          = true
  cleanup_on_fail = true

  # Chart values: schedule onto control-plane nodes (selector plus a
  # toleration for the control-plane NoSchedule taint) and enable the
  # chart's Prometheus monitor with honorLabels turned on.
  values = [<<-EOF
    nodeSelector:
      node-role.kubernetes.io/control-plane: ""
    tolerations:
      - key: "node-role.kubernetes.io/control-plane"
        effect: "NoSchedule"
    prometheus:
      monitor:
        enabled: true
        http:
          honorLabels: true
    EOF
  ]
}
resource "helm_release" "grafana-alloy" { resource "helm_release" "grafana-alloy" {
depends_on = [kubernetes_manifest.prometheus-pod-monitors, kubernetes_manifest.prometheus-service-monitors] depends_on = [kubernetes_manifest.prometheus-pod-monitors, kubernetes_manifest.prometheus-service-monitors]
name = "grafana-alloy-deamonset" name = "grafana-alloy"
chart = "alloy" chart = "alloy"
repository = "https://grafana.github.io/helm-charts" repository = "https://grafana.github.io/helm-charts"
version = "0.5.1" version = "0.5.1"
@ -24,6 +50,15 @@ resource "helm_release" "grafana-alloy" {
cleanup_on_fail = true cleanup_on_fail = true
values = [<<-EOF values = [<<-EOF
serviceMonitor:
enabled: true
controller:
type: "deployment"
nodeSelector:
node-role.kubernetes.io/control-plane: ""
tolerations:
- key: "node-role.kubernetes.io/control-plane"
effect: "NoSchedule"
alloy: alloy:
extraEnv: extraEnv:
- name: "CLUSTER_NAME" - name: "CLUSTER_NAME"
@ -39,20 +74,17 @@ resource "helm_release" "grafana-alloy" {
format = "logfmt" format = "logfmt"
} }
// --
// Discovery
// --
discovery.kubernetes "pods" { discovery.kubernetes "pods" {
role = "pod" role = "pod"
selectors {
role = "pod"
field = "spec.nodeName=" + coalesce(env("HOSTNAME"), constants.hostname)
} }
// Discover Kubernetes Service objects as scrape/log targets.
// The role argument takes the singular resource kind ("service");
// "services" is not an accepted value and the component fails to load.
discovery.kubernetes "services" {
role = "service"
} }
discovery.relabel "all" {
// -- targets = concat(discovery.kubernetes.pods.targets, discovery.kubernetes.services.targets)
// Metrics
// --
prometheus.exporter.unix "self" {}
discovery.relabel "pod_metrics" {
targets = concat(discovery.kubernetes.pods.targets, prometheus.exporter.unix.self.targets)
// allow override of http scheme with `prometheus.io/scheme` // allow override of http scheme with `prometheus.io/scheme`
rule { rule {
@ -64,7 +96,6 @@ resource "helm_release" "grafana-alloy" {
] ]
target_label = "__scheme__" target_label = "__scheme__"
} }
// allow override of default /metrics path with `prometheus.io/path` // allow override of default /metrics path with `prometheus.io/path`
rule { rule {
action = "replace" action = "replace"
@ -75,7 +106,6 @@ resource "helm_release" "grafana-alloy" {
] ]
target_label = "__metrics_path__" target_label = "__metrics_path__"
} }
// allow override of default port with `prometheus.io/port` // allow override of default port with `prometheus.io/port`
rule { rule {
action = "replace" action = "replace"
@ -88,40 +118,64 @@ resource "helm_release" "grafana-alloy" {
] ]
target_label = "__address__" target_label = "__address__"
} }
// Add Namespace
rule {
action = "replace"
source_labels = ["__meta_kubernetes_namespace"]
target_label = "kubernetes_namespace"
}
// Add Pod Name
rule {
action = "replace"
source_labels = ["__meta_kubernetes_pod_name"]
target_label = "kubernetes_pod"
}
// Add Service Name
rule {
action = "replace"
source_labels = ["__meta_kubernetes_service_name"]
target_label = "kubernetes_service"
} }
// Add all pod labels // --
rule { // Metrics
action = "labelmap" // --
regex = "__meta_kubernetes_pod_label_(.+)" prometheus.scrape "all" {
honor_labels = true
targets = discovery.relabel.all.output
forward_to = [prometheus.relabel.all.receiver]
} }
// Add all service labels prometheus.operator.podmonitors "all" {
rule { forward_to = [prometheus.relabel.all.receiver]
action = "labelmap"
regex = "__meta_kubernetes_service_label_(.+)"
} }
prometheus.operator.servicemonitors "all" {
forward_to = [prometheus.relabel.all.receiver]
} }
prometheus.scrape "containers" { prometheus.relabel "all" {
targets = discovery.relabel.pod_metrics.output
forward_to = [prometheus.remote_write.prometheus_monitor_tjo_space.receiver] forward_to = [prometheus.remote_write.prometheus_monitor_tjo_space.receiver]
rule {
source_labels = ["__meta_kubernetes_namespace"]
action = "replace"
target_label = "namespace"
}
rule {
source_labels = ["__meta_kubernetes_pod_name"]
action = "replace"
target_label = "pod"
}
rule {
source_labels = ["__meta_kubernetes_pod_container_name"]
action = "replace"
target_label = "container"
}
rule {
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
action = "replace"
target_label = "app"
}
rule {
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_version"]
action = "replace"
target_label = "version"
}
rule {
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"]
action = "replace"
target_label = "job"
separator = "/"
replacement = "$1"
}
rule {
source_labels = ["__meta_kubernetes_pod_container_id"]
action = "replace"
target_label = "container_runtime"
regex = "^(\\S+):\\/\\/.+$"
replacement = "$1"
}
} }
prometheus.remote_write "prometheus_monitor_tjo_space" { prometheus.remote_write "prometheus_monitor_tjo_space" {
external_labels = { external_labels = {
@ -145,22 +199,15 @@ resource "helm_release" "grafana-alloy" {
// -- // --
// Logs // Logs
// -- // --
local.file_match "node_logs" { loki.source.kubernetes "pods" {
path_targets = [{ targets = discovery.relabel.all.output
// Monitor syslog to scrape node-logs forward_to = [loki.relabel.all.receiver]
__path__ = "/var/log/syslog",
job = "node/syslog",
node_name = env("HOSTNAME"),
}]
} }
loki.source.file "node_logs" { loki.source.kubernetes_events "all" {
targets = local.file_match.node_logs.targets forward_to = [loki.relabel.all.receiver]
}
loki.relabel "all" {
forward_to = [loki.write.loki_monitor_tjo_space.receiver] forward_to = [loki.write.loki_monitor_tjo_space.receiver]
}
discovery.relabel "pod_logs" {
targets = discovery.kubernetes.pod.targets
rule { rule {
source_labels = ["__meta_kubernetes_namespace"] source_labels = ["__meta_kubernetes_namespace"]
@ -182,6 +229,11 @@ resource "helm_release" "grafana-alloy" {
action = "replace" action = "replace"
target_label = "app" target_label = "app"
} }
rule {
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_version"]
action = "replace"
target_label = "version"
}
rule { rule {
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"] source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"]
action = "replace" action = "replace"
@ -204,10 +256,6 @@ resource "helm_release" "grafana-alloy" {
replacement = "$1" replacement = "$1"
} }
} }
loki.source.kubernetes "pod_logs" {
targets = discovery.relabel.pod_logs.output
forward_to = [loki.write.loki_monitor_tjo_space.receiver]
}
loki.write "loki_monitor_tjo_space" { loki.write "loki_monitor_tjo_space" {
external_labels = { external_labels = {
cluster = env("CLUSTER_NAME"), cluster = env("CLUSTER_NAME"),

View file

@ -282,6 +282,10 @@ data "talos_cluster_kubeconfig" "this" {
resource "local_file" "kubeconfig" { resource "local_file" "kubeconfig" {
content = data.talos_cluster_kubeconfig.this.kubeconfig_raw content = data.talos_cluster_kubeconfig.this.kubeconfig_raw
filename = "${path.root}/admin.kubeconfig" filename = "${path.root}/admin.kubeconfig"
lifecycle {
ignore_changes = [content]
}
} }
data "talos_client_configuration" "this" { data "talos_client_configuration" "this" {