From 2552624dd4e927f08ab1ff26d6bce56f13f48f7f Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Mon, 1 Jun 2026 15:22:58 +0000 Subject: [PATCH 1/3] feat(alerts): make nova alerts region- and value-aware Inline alert rules into each bundle's templates/alerts.yaml so they can be gated on Helm values. Nova: severity of CortexNovaSchedulingDown depends on kvm.enabled, CortexNovaDoesntFindValidKVMHosts only renders when KVM is enabled, memory and reconcile-duration thresholds are configurable via .Values.alerts.thresholds. Other bundles: structural relocation only with Style-B escaping of Prometheus directives. Ironcore: empty rules removed. --- .../committed-resource-reservations.md | 2 +- .../cortex-cinder/alerts/cinder.alerts.yaml | 260 -------- .../cortex-cinder/templates/alerts.yaml | 268 +++++++- .../alerts/ironcore.alerts.yaml | 3 - .../cortex-ironcore/templates/alerts.yaml | 17 - .../cortex-manila/alerts/manila.alerts.yaml | 235 ------- .../cortex-manila/templates/alerts.yaml | 243 ++++++- .../cortex-nova/alerts/nova.alerts.yaml | 609 ----------------- .../bundles/cortex-nova/templates/alerts.yaml | 619 +++++++++++++++++- helm/bundles/cortex-nova/values.yaml | 5 + .../alerts/placement-shim.alerts.yaml | 179 ----- .../templates/alerts.yaml | 182 ++++- 12 files changed, 1302 insertions(+), 1320 deletions(-) delete mode 100644 helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml delete mode 100644 helm/bundles/cortex-ironcore/alerts/ironcore.alerts.yaml delete mode 100644 helm/bundles/cortex-ironcore/templates/alerts.yaml delete mode 100644 helm/bundles/cortex-manila/alerts/manila.alerts.yaml delete mode 100644 helm/bundles/cortex-nova/alerts/nova.alerts.yaml delete mode 100644 helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml diff --git a/docs/reservations/committed-resource-reservations.md b/docs/reservations/committed-resource-reservations.md index 4d96d43a6..7d80064b0 100644 --- a/docs/reservations/committed-resource-reservations.md +++ 
b/docs/reservations/committed-resource-reservations.md @@ -35,7 +35,7 @@ The CR reservation implementation is located in `internal/scheduling/reservation - Scheduling pipeline selection per flavor group - Per-flavor-group resource flags (`handlesCommitments`, `hasCapacity`, `hasQuota`) controlling which resource types are active for each group -**Metrics and Alerts**: Defined in `helm/bundles/cortex-nova/alerts/nova.alerts.yaml` with prefixes: +**Metrics and Alerts**: Defined in `helm/bundles/cortex-nova/templates/alerts.yaml` with prefixes: - `cortex_committed_resource_change_api_*` - `cortex_committed_resource_usage_api_*` - `cortex_committed_resource_capacity_api_*` diff --git a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml b/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml deleted file mode 100644 index 6684e3392..000000000 --- a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml +++ /dev/null @@ -1,260 +0,0 @@ -groups: -- name: cortex-cinder-alerts - rules: - - alert: CortexCinderSchedulingDown - expr: | - up{pod=~"cortex-cinder-scheduling-.*"} != 1 or - absent(up{pod=~"cortex-cinder-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Scheduling for Cinder is down" - description: > - The Cortex scheduling service is down. Scheduling requests from Cinder will - not be served. This is no immediate problem, since Cinder will continue - placing new VMs. However, the placement will be less desirable. 
- - - alert: CortexCinderKnowledgeDown - expr: | - up{pod=~"cortex-cinder-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-cinder-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Knowledge for Cinder is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. - - - alert: CortexCinderHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Cinder Scheduler HTTP request 400 errors too high" - description: > - Cinder Scheduler is responding to placement requests with HTTP 4xx - errors. This is expected when the scheduling request cannot be served - by Cortex. However, it could also indicate that the request format has - changed and Cortex is unable to parse it. - - - alert: CortexCinderSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Cinder Scheduler HTTP request 500 errors too high" - description: > - Cinder Scheduler is responding to placement requests with HTTP 5xx errors. - This is not expected and indicates that Cortex is having some internal problem. 
- Cinder will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexCinderHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-cinder-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. - - - alert: CortexCinderHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-cinder-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. - - - alert: CortexCinderTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-cinder-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. 
- - - alert: CortexCinderSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-cinder-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-cinder-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. - - - alert: CortexCinderSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-cinder-metrics"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. 
- - - alert: CortexCinderDatasourceUnready - expr: cortex_datasource_state{domain="cinder",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. - - - alert: CortexCinderKnowledgeUnready - expr: cortex_knowledge_state{domain="cinder",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. - - - alert: CortexCinderDecisionsWithErrors - expr: cortex_decision_state{domain="cinder",state="error"} > 0 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Some decisions are in error state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline generated decisions that are in error state. - This may indicate issues with the decision logic or the underlying infrastructure. - It is recommended to investigate the decision logs and the state of the - VMs being processed. 
- - - alert: CortexCinderTooManyDecisionsWaiting - expr: cortex_decision_state{domain="cinder",state="waiting"} > 10 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline has a high number of decisions for which - no target host has been assigned yet. - - This may indicate a backlog in processing or issues with the decision logic. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexCinderKPIUnready - expr: | - cortex_kpi_state{domain="cinder",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. - - - alert: CortexCinderPipelineUnready - expr: cortex_pipeline_state{domain="cinder",state!="ready"} != 0 - for: 5m - labels: - context: pipelines - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. 
diff --git a/helm/bundles/cortex-cinder/templates/alerts.yaml b/helm/bundles/cortex-cinder/templates/alerts.yaml index 59496c33d..4beea8b53 100644 --- a/helm/bundles/cortex-cinder/templates/alerts.yaml +++ b/helm/bundles/cortex-cinder/templates/alerts.yaml @@ -1,6 +1,10 @@ # Copyright SAP SE # SPDX-License-Identifier: Apache-2.0 +# NOTE: This file is rendered by Helm, and Helm evaluates template actions even +# inside YAML comments like this one. Prometheus templating directives must be +# escaped using Style B, e.g. {{ "{{" }} $labels.foo {{ "}}" }} (each brace pair is emitted from a string literal). + {{- if .Values.alerts.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -10,8 +14,264 @@ metadata: type: alerting-rules prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} + groups: + - name: cortex-cinder-alerts + rules: + - alert: CortexCinderSchedulingDown + expr: | + up{pod=~"cortex-cinder-scheduling-.*"} != 1 or + absent(up{pod=~"cortex-cinder-scheduling-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Scheduling for Cinder is down" + description: > + The Cortex scheduling service is down. Scheduling requests from Cinder will + not be served. This is no immediate problem, since Cinder will continue + placing new VMs. However, the placement will be less desirable. 
+ + - alert: CortexCinderKnowledgeDown + expr: | + up{pod=~"cortex-cinder-knowledge-.*"} != 1 or + absent(up{pod=~"cortex-cinder-knowledge-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Knowledge for Cinder is down" + description: > + The Cortex Knowledge service is down. This is no immediate problem, + since cortex is still able to process requests, + but the quality of the responses may be affected. + + - alert: CortexCinderHttpRequest400sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"4.+"}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Cinder Scheduler HTTP request 400 errors too high" + description: > + Cinder Scheduler is responding to placement requests with HTTP 4xx + errors. This is expected when the scheduling request cannot be served + by Cortex. However, it could also indicate that the request format has + changed and Cortex is unable to parse it. + + - alert: CortexCinderSchedulingHttpRequest500sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"5.+" }[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Cinder Scheduler HTTP request 500 errors too high" + description: > + Cinder Scheduler is responding to placement requests with HTTP 5xx errors. + This is not expected and indicates that Cortex is having some internal problem. 
+ Cinder will continue to place new VMs, but the placement will be less desirable. + Thus, no immediate action is needed. + + - alert: CortexCinderHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-cinder-metrics"} > 6000 * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much memory" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 6000 MiB of memory. Usually it + should use much less, so there may be a memory leak or other changes + that are causing the memory usage to increase significantly. + + - alert: CortexCinderHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-cinder-metrics"}[1m]) > 0.5 + for: 5m + labels: + context: cpu + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much CPU" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 50% of a single CPU core. Usually + it should use much less, so there may be a CPU leak or other changes + that are causing the CPU usage to increase significantly. + + - alert: CortexCinderTooManyDBConnectionAttempts + expr: rate(cortex_db_connection_attempts_total{service="cortex-cinder-metrics"}[5m]) > 0.1 + for: 5m + labels: + context: db + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often. 
This may happen + when the database is down or the connection parameters are misconfigured. + + - alert: CortexCinderSyncNotSuccessful + expr: cortex_sync_request_processed_total{service="cortex-cinder-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-cinder-metrics"} > 0 + for: 5m + labels: + context: syncstatus + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` Sync not successful" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` experienced an issue syncing data from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may + happen when the datasource (OpenStack, Prometheus, etc.) is down or + the sync module is misconfigured. No immediate action is needed, since + the sync module will retry the sync operation and the currently synced + data will be kept. However, when this problem persists for a longer + time the service will have a less recent view of the datacenter. + + - alert: CortexCinderSyncObjectsDroppedToZero + expr: cortex_sync_objects{service="cortex-cinder-metrics"} == 0 + for: 60m + labels: + context: syncobjects + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any new data from `{{ "{{" }} $labels.datasource {{ "}}" }}`" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any objects from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may happen + when the datasource (OpenStack, Prometheus, etc.) is down or the sync + module is misconfigured. No immediate action is needed, since the sync + module will retry the sync operation and the currently synced data will + be kept. 
However, when this problem persists for a longer time the + service will have a less recent view of the datacenter. + + - alert: CortexCinderDatasourceUnready + expr: cortex_datasource_state{domain="cinder",state!="ready"} != 0 + for: 60m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the datasource + connectivity or configuration. It is recommended to investigate the + datasource status and logs for more details. + + - alert: CortexCinderKnowledgeUnready + expr: cortex_knowledge_state{domain="cinder",state!="ready"} != 0 + for: 60m + labels: + context: knowledge + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Knowledge `{{ "{{" }} $labels.knowledge {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the knowledge + configuration. It is recommended to investigate the + knowledge status and logs for more details. + + - alert: CortexCinderDecisionsWithErrors + expr: cortex_decision_state{domain="cinder",state="error"} > 0 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Some decisions are in error state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline generated decisions that are in error state. + This may indicate issues with the decision logic or the underlying infrastructure. + It is recommended to investigate the decision logs and the state of the + VMs being processed. 
+ + - alert: CortexCinderTooManyDecisionsWaiting + expr: cortex_decision_state{domain="cinder",state="waiting"} > 10 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Too many decisions are in waiting state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline has a high number of decisions for which + no target host has been assigned yet. + + This may indicate a backlog in processing or issues with the decision logic. + It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexCinderKPIUnready + expr: | + cortex_kpi_state{domain="cinder",state!="ready"} != 0 + for: 60m + labels: + context: kpis + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "KPI `{{ "{{" }} $labels.kpi {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the KPI + configuration. It is recommended to investigate the + KPI status and logs for more details. + + - alert: CortexCinderPipelineUnready + expr: cortex_pipeline_state{domain="cinder",state!="ready"} != 0 + for: 5m + labels: + context: pipelines + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Pipeline `{{ "{{" }} $labels.pipeline {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the pipeline + configuration. It is recommended to investigate the + pipeline status and logs for more details. 
{{- end }} diff --git a/helm/bundles/cortex-ironcore/alerts/ironcore.alerts.yaml b/helm/bundles/cortex-ironcore/alerts/ironcore.alerts.yaml deleted file mode 100644 index 0c72d9a92..000000000 --- a/helm/bundles/cortex-ironcore/alerts/ironcore.alerts.yaml +++ /dev/null @@ -1,3 +0,0 @@ -groups: -- name: cortex-ironcore-alerts - rules: [] diff --git a/helm/bundles/cortex-ironcore/templates/alerts.yaml b/helm/bundles/cortex-ironcore/templates/alerts.yaml deleted file mode 100644 index ca27396a5..000000000 --- a/helm/bundles/cortex-ironcore/templates/alerts.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright SAP SE -# SPDX-License-Identifier: Apache-2.0 - -{{- if .Values.alerts.enabled }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: cortex-ironcore-alerts - labels: - type: alerting-rules - prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} -spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} -{{- end }} diff --git a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml b/helm/bundles/cortex-manila/alerts/manila.alerts.yaml deleted file mode 100644 index 2211d44fe..000000000 --- a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml +++ /dev/null @@ -1,235 +0,0 @@ -groups: -- name: cortex-manila-alerts - rules: - - alert: CortexManilaSchedulingDown - expr: | - up{pod=~"cortex-manila-scheduling-.*"} != 1 or - absent(up{pod=~"cortex-manila-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Scheduling for Manila is down" - description: > - The Cortex scheduling service is down. Scheduling requests from Manila will - not be served. 
This is no immediate problem, since Manila will continue - placing new VMs. However, the placement will be less desirable. - - - alert: CortexManilaKnowledgeDown - expr: | - up{pod=~"cortex-manila-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-manila-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Knowledge for Manila is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. - - - alert: CortexManilaHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/apierrors - annotations: - summary: "Manila Scheduler HTTP request 400 errors too high" - description: > - Manila Scheduler is responding to placement requests with HTTP 4xx - errors. This is expected when the scheduling request cannot be served - by Cortex. However, it could also indicate that the request format has - changed and Cortex is unable to parse it. 
- - - alert: CortexManilaSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/apierrors - annotations: - summary: "Manila Scheduler HTTP request 500 errors too high" - description: > - Manila Scheduler is responding to placement requests with HTTP 5xx errors. - This is not expected and indicates that Cortex is having some internal problem. - Manila will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexManilaHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-manila-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. - - - alert: CortexManilaHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-manila-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. 
Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. - - - alert: CortexManilaTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-manila-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/database - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. - - - alert: CortexManilaSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-manila-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-manila-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. 
- - - alert: CortexManilaSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-manila-metrics"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. - - - alert: CortexManilaDatasourceUnready - expr: cortex_datasource_state{domain="manila",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. 
- - - alert: CortexManilaKnowledgeUnready - expr: cortex_knowledge_state{domain="manila",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. - - - alert: CortexManilaKPIUnready - expr: | - cortex_kpi_state{domain="manila",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. - - - alert: CortexManilaPipelineUnready - expr: cortex_pipeline_state{domain="manila",state!="ready"} != 0 - for: 5m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. 
diff --git a/helm/bundles/cortex-manila/templates/alerts.yaml b/helm/bundles/cortex-manila/templates/alerts.yaml index 1f25b0354..ef36fe983 100644 --- a/helm/bundles/cortex-manila/templates/alerts.yaml +++ b/helm/bundles/cortex-manila/templates/alerts.yaml @@ -1,6 +1,10 @@ # Copyright SAP SE # SPDX-License-Identifier: Apache-2.0 +# NOTE: This file is rendered by Helm. Prometheus templating directives +# (e.g. {{ "{{" }} $labels.foo {{ "}}" }}) must be escaped using Style B: +# replace the outer `{{` and `}}` with `{{ "{{" }}` and `{{ "}}" }}`. + {{- if .Values.alerts.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -10,8 +14,239 @@ metadata: type: alerting-rules prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} + groups: + - name: cortex-manila-alerts + rules: + - alert: CortexManilaSchedulingDown + expr: | + up{pod=~"cortex-manila-scheduling-.*"} != 1 or + absent(up{pod=~"cortex-manila-scheduling-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Scheduling for Manila is down" + description: > + The Cortex scheduling service is down. Scheduling requests from Manila will + not be served. This is no immediate problem, since Manila will continue + placing new shares. However, the placement will be less desirable.
+ + - alert: CortexManilaKnowledgeDown + expr: | + up{pod=~"cortex-manila-knowledge-.*"} != 1 or + absent(up{pod=~"cortex-manila-knowledge-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Knowledge for Manila is down" + description: > + The Cortex Knowledge service is down. This is no immediate problem, + since cortex is still able to process requests, + but the quality of the responses may be affected. + + - alert: CortexManilaHttpRequest400sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"4.+"}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/apierrors + annotations: + summary: "Manila Scheduler HTTP request 400 errors too high" + description: > + Manila Scheduler is responding to placement requests with HTTP 4xx + errors. This is expected when the scheduling request cannot be served + by Cortex. However, it could also indicate that the request format has + changed and Cortex is unable to parse it. + + - alert: CortexManilaSchedulingHttpRequest500sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"5.+" }[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/apierrors + annotations: + summary: "Manila Scheduler HTTP request 500 errors too high" + description: > + Manila Scheduler is responding to placement requests with HTTP 5xx errors.
+ This is not expected and indicates that Cortex is having some internal problem. + Manila will continue to place new shares, but the placement will be less desirable. + Thus, no immediate action is needed. + + - alert: CortexManilaHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-manila-metrics"} > 6000 * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much memory" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 6000 MiB of memory. Usually it + should use much less, so there may be a memory leak or other changes + that are causing the memory usage to increase significantly. + + - alert: CortexManilaHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-manila-metrics"}[1m]) > 0.5 + for: 5m + labels: + context: cpu + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much CPU" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 50% of a single CPU core. Usually + it should use much less, so there may be a CPU leak or other changes + that are causing the CPU usage to increase significantly.
+ + - alert: CortexManilaTooManyDBConnectionAttempts + expr: rate(cortex_db_connection_attempts_total{service="cortex-manila-metrics"}[5m]) > 0.1 + for: 5m + labels: + context: db + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/database + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often. This may happen + when the database is down or the connection parameters are misconfigured. + + - alert: CortexManilaSyncNotSuccessful + expr: cortex_sync_request_processed_total{service="cortex-manila-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-manila-metrics"} > 0 + for: 5m + labels: + context: syncstatus + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` Sync not successful" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` experienced an issue syncing data from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may + happen when the datasource (OpenStack, Prometheus, etc.) is down or + the sync module is misconfigured. No immediate action is needed, since + the sync module will retry the sync operation and the currently synced + data will be kept. However, when this problem persists for a longer + time the service will have a less recent view of the datacenter. 
+ + - alert: CortexManilaSyncObjectsDroppedToZero + expr: cortex_sync_objects{service="cortex-manila-metrics"} == 0 + for: 60m + labels: + context: syncobjects + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any new data from `{{ "{{" }} $labels.datasource {{ "}}" }}`" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any objects from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may happen + when the datasource (OpenStack, Prometheus, etc.) is down or the sync + module is misconfigured. No immediate action is needed, since the sync + module will retry the sync operation and the currently synced data will + be kept. However, when this problem persists for a longer time the + service will have a less recent view of the datacenter. + + - alert: CortexManilaDatasourceUnready + expr: cortex_datasource_state{domain="manila",state!="ready"} != 0 + for: 60m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the datasource + connectivity or configuration. It is recommended to investigate the + datasource status and logs for more details. 
+ + - alert: CortexManilaKnowledgeUnready + expr: cortex_knowledge_state{domain="manila",state!="ready"} != 0 + for: 60m + labels: + context: knowledge + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Knowledge `{{ "{{" }} $labels.knowledge {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the knowledge + configuration. It is recommended to investigate the + knowledge status and logs for more details. + + - alert: CortexManilaKPIUnready + expr: | + cortex_kpi_state{domain="manila",state!="ready"} != 0 + for: 60m + labels: + context: kpis + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "KPI `{{ "{{" }} $labels.kpi {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the KPI + configuration. It is recommended to investigate the + KPI status and logs for more details. + + - alert: CortexManilaPipelineUnready + expr: cortex_pipeline_state{domain="manila",state!="ready"} != 0 + for: 5m + labels: + context: pipelines + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Pipeline `{{ "{{" }} $labels.pipeline {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the pipeline + configuration. It is recommended to investigate the + pipeline status and logs for more details.
{{- end }} diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml deleted file mode 100644 index 46e93ef05..000000000 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ /dev/null @@ -1,609 +0,0 @@ -groups: -- name: cortex-nova-alerts - rules: - - alert: CortexNovaSchedulingDown - expr: | - up{pod=~"cortex-nova-scheduling-.*"} != 1 or - absent(up{pod=~"cortex-nova-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: critical - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Scheduling for Nova is down" - description: > - The Cortex scheduling service is down. Scheduling requests from Nova will - not be served. This is non-critical for vmware virtual machines, but - blocks kvm virtual machines from being scheduled. Thus, it is - recommended to immediately investigate and resolve the issue. - - - alert: CortexNovaKnowledgeDown - expr: | - up{pod=~"cortex-nova-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-nova-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Knowledge for Nova is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. 
- - - alert: CortexNovaDeschedulerPipelineErroring - expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 - for: 5m - labels: - context: descheduler - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Descheduler pipeline is erroring." - description: > - The Cortex descheduler pipeline is encountering errors during its execution. - This may indicate issues with the descheduling logic or the underlying infrastructure. - It is recommended to investigate the descheduler logs and the state of the VMs being processed. - - - alert: CortexNovaHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/api-errors - annotations: - summary: "Nova Scheduler HTTP request 400 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 4xx - errors. This is expected when the scheduling request cannot be served - by Cortex. However, it could also indicate that the request format has - changed and Cortex is unable to parse it. 
- - - alert: CortexNovaSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/api-errors - annotations: - summary: "Nova Scheduler HTTP request 500 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 5xx errors. - This is not expected and indicates that Cortex is having some internal problem. - Nova will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexNovaHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. - - - alert: CortexNovaHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. 
Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. - - - alert: CortexNovaTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/database - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. - - - alert: CortexNovaSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. 
- - - alert: CortexNovaSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. - - - alert: CortexNovaDatasourceUnready - expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. 
- - - alert: CortexNovaKnowledgeUnready - expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. - - - alert: CortexNovaDecisionsWithErrors - expr: cortex_decision_state{domain="nova",state="error"} > 0 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Some decisions are in error state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline generated decisions that are in error state. - This may indicate issues with the decision logic or the underlying infrastructure. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexNovaTooManyDecisionsWaiting - expr: cortex_decision_state{domain="nova",state="waiting"} > 10 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline has a high number of decisions for which - no target host has been assigned yet. - - This may indicate a backlog in processing or issues with the decision logic. - It is recommended to investigate the decision logs and the state of the - VMs being processed. 
- - - alert: CortexNovaKPIUnready - expr: | - cortex_kpi_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. - - - alert: CortexNovaPipelineUnready - expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 - for: 5m - labels: - context: pipelines - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. - - - alert: CortexNovaDoesntFindValidKVMHosts - expr: sum by (az, hvtype) (increase(cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*",faultmsg!~".*No such host.*"}[5m])) > 0 - for: 5m - labels: - context: scheduling - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/scheduling - annotations: - summary: "Nova scheduling cannot find valid KVM hosts" - description: > - Cortex is seeing new faulty vms in `{{$labels.az}}` where Nova scheduling - failed to find a valid `{{$labels.hvtype}}` host. This may indicate - capacity issues, misconfigured filters, or resource constraints in the - datacenter. Investigate the affected VMs and hypervisor availability. 
- - - alert: CortexNovaNewDatasourcesNotReconciling - expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "New datasource `{{$labels.datasource}}` has not reconciled" - description: > - A new datasource `{{$labels.datasource}}` has been added but has not - completed its first reconciliation yet. This may indicate issues with - the datasource controller's workqueue overprioritizing other datasources. - - - alert: CortexNovaExistingDatasourcesLackingBehind - expr: | - sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600 - and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1 - for: 10m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "Existing datasource `{{$labels.datasource}}` is lacking behind" - description: > - An existing datasource `{{$labels.datasource}}` has been queued for - reconciliation for more than 10 minutes. This may indicate issues with - the datasource controller's workqueue or that this or another datasource - is taking an unusually long time to reconcile. 
- - - alert: CortexNovaReconcileErrorsHigh - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-errors - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/reconciles - annotations: - summary: "Controller reconcile error rate >10%" - description: > - More than 10% of controller reconciles are resulting in errors. This may - indicate issues with the controller logic, connectivity problems, or - external factors causing failures. Check the controller logs for error - details and investigate the affected resources. - - - alert: CortexNovaReconcileDurationHigher10Min - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600 - for: 15m - labels: - context: controller-duration - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/reconciles - annotations: - summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})" - description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}" - - - alert: CortexNovaWorkqueueNotDrained - expr: | - sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 - for: 60m - labels: - context: controller-workqueue - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: 
- summary: "Controller {{ $labels.name }}'s backlog is not being drained." - description: > - The workqueue for controller {{ $labels.name }} has a backlog that is - not being drained. This may indicate that the controller is overwhelmed - with work or is stuck on certain resources. Check the controller logs - and the state of the resources it manages for more details. - - - alert: CortexNovaWebhookLatencyHigh - expr: | - histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 - for: 15m - labels: - context: controller-webhook - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} latency is high" - description: > - The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms). - This may indicate performance issues with the webhook server or the logic it executes. - Check the webhook server logs and monitor its resource usage for more insights. - - - alert: CortexNovaWebhookErrorsHigh - expr: | - (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) - / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-webhook - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} is experiencing errors" - description: > - The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes. - This may indicate issues with the webhook logic, connectivity problems, or - external factors causing failures. Check the webhook server logs for error - details and investigate the affected resources. 
- - # Committed Resource Info API - - alert: CortexNovaCommittedResourceInfoUnavailable - expr: | - rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code="503"}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource info API is unavailable" - description: > - The committed resource info API (Limes LIQUID integration) has been returning - 503 Service Unavailable for more than 5 minutes. This typically means the - flavor group knowledge CRD is not ready or missing. Limes cannot discover - available committed resources until the issue is resolved. - - # Committed Resource Change API - - alert: CortexNovaCommittedResourceChangeErrors - expr: | - rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource change API HTTP 5xx errors" - description: > - The committed resource change API (Limes LIQUID integration) is returning - HTTP 5xx errors. This is not expected and indicates an internal problem - processing commitment changes. Limes will retry, but new commitments may - not be fulfilled until the issue is resolved. 
- - - alert: CortexNovaCommittedResourceRejectionRateTooHigh - expr: | - ( - sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected", dry_run="false"}[15m])) - / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) - ) > 0.3 - and on() sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) > 0 - for: 15m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-performance - annotations: - summary: "Committed Resource rejection rate too high ({{ $value | humanizePercentage }})" - description: > - More than 30% of commitment changes have been rejected over the last 15 minutes. - This may indicate insufficient capacity to fulfill new commitments. Rejected - commitments are rolled back. - - - alert: CortexNovaCommittedResourceTimeoutsTooHigh - expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics", dry_run="false"}[10m]) > 0 - for: 1m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-performance - annotations: - summary: "Committed Resource change API timeout detected" - description: > - A commitment change request timed out after the configured deadline. - Timeouts indicate the scheduling pipeline could not place reservations in time. - Affected changes are rolled back. Investigate scheduler performance or reservation backlog. 
- - - alert: CortexNovaCommittedResourceChangeLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics", dry_run="false"}[5m])) by (le)) >= 10 - and on() sum(rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", dry_run="false"}[5m])) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-performance - annotations: - summary: "Committed Resource change API p95 latency >= 10s" - description: > - The committed resource change API p95 latency has reached or exceeded 10 seconds, - approaching the configured watch timeout. Requests close to the timeout are at risk - of being rolled back. Investigate scheduler performance or reservation backlog. - - # Committed Resource Capacity API - - alert: CortexNovaCommittedResourceCapacityErrors - expr: | - rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity - annotations: - summary: "Committed Resource capacity API HTTP 5xx errors" - description: > - The committed resource capacity API (Limes LIQUID integration) is returning - HTTP 5xx errors. This indicates internal problems calculating cluster capacity. - Limes may receive stale or incomplete capacity data. 
- - - alert: CortexNovaCommittedResourceCapacityDroppedToZero - expr: | - (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} == 0) - and on(resource, az) (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} offset 30m > 0) - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity - annotations: - summary: "Committed Resource capacity for {{ $labels.resource }} in {{ $labels.az }} dropped to zero" - description: > - The reported capacity for committed resource {{ $labels.resource }} in - availability zone {{ $labels.az }} has dropped from a positive value to zero. - This may mean hypervisors in that AZ are fully utilized for the corresponding - flavor group and no further committed resources can be placed there. - - # Committed Resource Usage API - - alert: CortexNovaCommittedResourceUsageErrors - expr: | - rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource usage API HTTP 5xx errors" - description: > - The committed resource usage API (Limes LIQUID integration) is returning - HTTP 5xx errors. This indicates internal problems fetching reservation or - Nova server data. Limes may receive stale or incomplete usage data. 
- - # Committed Resource Quota API - - alert: CortexNovaCommittedResourceQuotaErrors - expr: | - rate(cortex_committed_resource_quota_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource quota API HTTP 5xx errors" - description: > - The committed resource quota API (Limes LIQUID integration) is returning - HTTP 5xx errors. This indicates internal problems computing or applying - quota. Limes may not be able to enforce committed resource quotas. diff --git a/helm/bundles/cortex-nova/templates/alerts.yaml b/helm/bundles/cortex-nova/templates/alerts.yaml index d2964e864..6f3fabef2 100644 --- a/helm/bundles/cortex-nova/templates/alerts.yaml +++ b/helm/bundles/cortex-nova/templates/alerts.yaml @@ -1,6 +1,10 @@ # Copyright SAP SE # SPDX-License-Identifier: Apache-2.0 +# NOTE: This file is rendered by Helm. Prometheus templating directives +# (e.g. {{ "{{" }} $labels.foo {{ "}}" }}) must be escaped using Style B: +# replace the outer `{{` and `}}` with `{{ "{{" }}` and `{{ "}}" }}`. 
+ {{- if .Values.alerts.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -10,8 +14,615 @@ metadata: type: alerting-rules prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} + groups: + - name: cortex-nova-alerts + rules: + - alert: CortexNovaSchedulingDown + expr: | + up{pod=~"cortex-nova-scheduling-.*"} != 1 or + absent(up{pod=~"cortex-nova-scheduling-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: {{ if .Values.kvm.enabled }}critical{{ else }}warning{{ end }} + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Scheduling for Nova is down" + description: > + The Cortex scheduling service is down. Scheduling requests from Nova will + not be served. This is non-critical for vmware virtual machines, but + blocks kvm virtual machines from being scheduled. Thus, it is + recommended to immediately investigate and resolve the issue. + + - alert: CortexNovaKnowledgeDown + expr: | + up{pod=~"cortex-nova-knowledge-.*"} != 1 or + absent(up{pod=~"cortex-nova-knowledge-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Knowledge for Nova is down" + description: > + The Cortex Knowledge service is down. This is no immediate problem, + since cortex is still able to process requests, + but the quality of the responses may be affected. 
+ + - alert: CortexNovaDeschedulerPipelineErroring  # uses increase(): the _count series is a counter, so resets must be compensated + expr: increase(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 + for: 5m + labels: + context: descheduler + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Descheduler pipeline is erroring." + description: > + The Cortex descheduler pipeline is encountering errors during its execution. + This may indicate issues with the descheduling logic or the underlying infrastructure. + It is recommended to investigate the descheduler logs and the state of the VMs being processed. + + - alert: CortexNovaHttpRequest400sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/api-errors + annotations: + summary: "Nova Scheduler HTTP request 400 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 4xx + errors. This is expected when the scheduling request cannot be served + by Cortex. However, it could also indicate that the request format has + changed and Cortex is unable to parse it.
+ + - alert: CortexNovaSchedulingHttpRequest500sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/api-errors + annotations: + summary: "Nova Scheduler HTTP request 500 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 5xx errors. + This is not expected and indicates that Cortex is having some internal problem. + Nova will continue to place new VMs, but the placement will be less desirable. + Thus, no immediate action is needed. + + - alert: CortexNovaHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > {{ .Values.alerts.thresholds.highMemoryMiB }} * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much memory" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than {{ .Values.alerts.thresholds.highMemoryMiB }} MiB of memory. Usually it + should use much less, so there may be a memory leak or other changes + that are causing the memory usage to increase significantly. 
+ + - alert: CortexNovaHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 + for: 5m + labels: + context: cpu + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much CPU" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 50% of a single CPU core. Usually + it should use much less, so there may be a CPU leak or other changes + that are causing the CPU usage to increase significantly. + + - alert: CortexNovaTooManyDBConnectionAttempts + expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 + for: 5m + labels: + context: db + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/database + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often. This may happen + when the database is down or the connection parameters are misconfigured. 
+ + - alert: CortexNovaSyncNotSuccessful + expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 + for: 5m + labels: + context: syncstatus + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` Sync not successful" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` experienced an issue syncing data from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may + happen when the datasource (OpenStack, Prometheus, etc.) is down or + the sync module is misconfigured. No immediate action is needed, since + the sync module will retry the sync operation and the currently synced + data will be kept. However, when this problem persists for a longer + time the service will have a less recent view of the datacenter. + + - alert: CortexNovaSyncObjectsDroppedToZero + expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 + for: 60m + labels: + context: syncobjects + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any new data from `{{ "{{" }} $labels.datasource {{ "}}" }}`" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any objects from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may happen + when the datasource (OpenStack, Prometheus, etc.) is down or the sync + module is misconfigured. No immediate action is needed, since the sync + module will retry the sync operation and the currently synced data will + be kept. 
However, when this problem persists for a longer time the + service will have a less recent view of the datacenter. + + - alert: CortexNovaDatasourceUnready + expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the datasource + connectivity or configuration. It is recommended to investigate the + datasource status and logs for more details. + + - alert: CortexNovaKnowledgeUnready + expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: knowledge + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Knowledge `{{ "{{" }} $labels.knowledge {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the knowledge + configuration. It is recommended to investigate the + knowledge status and logs for more details. + + - alert: CortexNovaDecisionsWithErrors + expr: cortex_decision_state{domain="nova",state="error"} > 0 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Some decisions are in error state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline generated decisions that are in error state. + This may indicate issues with the decision logic or the underlying infrastructure. 
+ It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexNovaTooManyDecisionsWaiting + expr: cortex_decision_state{domain="nova",state="waiting"} > 10 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Too many decisions are in waiting state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline has a high number of decisions for which + no target host has been assigned yet. + + This may indicate a backlog in processing or issues with the decision logic. + It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexNovaKPIUnready + expr: | + cortex_kpi_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: kpis + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "KPI `{{ "{{" }} $labels.kpi {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the KPI + configuration. It is recommended to investigate the + KPI status and logs for more details. + + - alert: CortexNovaPipelineUnready + expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 + for: 5m + labels: + context: pipelines + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Pipeline `{{ "{{" }} $labels.pipeline {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the pipeline + configuration. 
It is recommended to investigate the + pipeline status and logs for more details. + + {{- if .Values.kvm.enabled }} + - alert: CortexNovaDoesntFindValidKVMHosts + expr: sum by (az, hvtype) (increase(cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*",faultmsg!~".*No such host.*"}[5m])) > 0 + for: 5m + labels: + context: scheduling + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/scheduling + annotations: + summary: "Nova scheduling cannot find valid KVM hosts" + description: > + Cortex is seeing new faulty vms in `{{ "{{" }} $labels.az {{ "}}" }}` where Nova scheduling + failed to find a valid `{{ "{{" }} $labels.hvtype {{ "}}" }}` host. This may indicate + capacity issues, misconfigured filters, or resource constraints in the + datacenter. Investigate the affected VMs and hypervisor availability. + {{- end }} + + - alert: CortexNovaNewDatasourcesNotReconciling + expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0 + for: 60m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "New datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` has not reconciled" + description: > + A new datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` has been added but has not + completed its first reconciliation yet. This may indicate issues with + the datasource controller's workqueue overprioritizing other datasources. 
+ + - alert: CortexNovaExistingDatasourcesLackingBehind  # alert name kept unchanged for compatibility; only the annotation wording is corrected + expr: | + sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600 + and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1 + for: 10m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "Existing datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is lagging behind" + description: > + An existing datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` has been queued for + reconciliation for more than 10 minutes. This may indicate issues with + the datasource controller's workqueue or that this or another datasource + is taking an unusually long time to reconcile. + + - alert: CortexNovaReconcileErrorsHigh + expr: | + (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m]))) + / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1 + for: 15m + labels: + context: controller-errors + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/reconciles + annotations: + summary: "Controller reconcile error rate >10%" + description: > + More than 10% of controller reconciles are resulting in errors. This may + indicate issues with the controller logic, connectivity problems, or + external factors causing failures. Check the controller logs for error + details and investigate the affected resources.
+ + - alert: CortexNovaReconcileDurationHigher10Min  # name kept for compatibility; the threshold is .Values.alerts.thresholds.reconcileDurationSeconds + expr: | + (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) + / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > {{ .Values.alerts.thresholds.reconcileDurationSeconds }} + for: 15m + labels: + context: controller-duration + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/reconciles + annotations: + summary: "Controller reconciliation takes longer than ({{ "{{" }} $value | humanizeDuration {{ "}}" }})" + description: "Reconcile duration higher than {{ .Values.alerts.thresholds.reconcileDurationSeconds }}s while reconciling {{ "{{" }} $labels.controller {{ "}}" }}" + + - alert: CortexNovaWorkqueueNotDrained + expr: | + sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 + for: 60m + labels: + context: controller-workqueue + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "Controller {{ "{{" }} $labels.name {{ "}}" }}'s backlog is not being drained." + description: > + The workqueue for controller {{ "{{" }} $labels.name {{ "}}" }} has a backlog that is + not being drained. This may indicate that the controller is overwhelmed + with work or is stuck on certain resources. Check the controller logs + and the state of the resources it manages for more details.
+ + - alert: CortexNovaWebhookLatencyHigh + expr: | + histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 + for: 15m + labels: + context: controller-webhook + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ "{{" }} $labels.webhook {{ "}}" }} latency is high" + description: > + The latency for webhook {{ "{{" }} $labels.webhook {{ "}}" }} is higher than expected (p90 > 200ms). + This may indicate performance issues with the webhook server or the logic it executes. + Check the webhook server logs and monitor its resource usage for more insights. + + - alert: CortexNovaWebhookErrorsHigh + expr: | + (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) + / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 + for: 15m + labels: + context: controller-webhook + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ "{{" }} $labels.webhook {{ "}}" }} is experiencing errors" + description: > + The webhook {{ "{{" }} $labels.webhook {{ "}}" }} has experienced errors in the last 5 minutes. + This may indicate issues with the webhook logic, connectivity problems, or + external factors causing failures. Check the webhook server logs for error + details and investigate the affected resources. 
+ + # Committed Resource Info API + - alert: CortexNovaCommittedResourceInfoUnavailable + expr: | + rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code="503"}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors + annotations: + summary: "Committed Resource info API is unavailable" + description: > + The committed resource info API (Limes LIQUID integration) has been returning + 503 Service Unavailable for more than 5 minutes. This typically means the + flavor group knowledge CRD is not ready or missing. Limes cannot discover + available committed resources until the issue is resolved. + + # Committed Resource Change API + - alert: CortexNovaCommittedResourceChangeErrors + expr: | + rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors + annotations: + summary: "Committed Resource change API HTTP 5xx errors" + description: > + The committed resource change API (Limes LIQUID integration) is returning + HTTP 5xx errors. This is not expected and indicates an internal problem + processing commitment changes. Limes will retry, but new commitments may + not be fulfilled until the issue is resolved. 
+ + - alert: CortexNovaCommittedResourceRejectionRateTooHigh + expr: | + ( + sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected", dry_run="false"}[15m])) + / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) + ) > 0.3 + and on() sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) > 0 + for: 15m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-performance + annotations: + summary: "Committed Resource rejection rate too high ({{ "{{" }} $value | humanizePercentage {{ "}}" }})" + description: > + More than 30% of commitment changes have been rejected over the last 15 minutes. + This may indicate insufficient capacity to fulfill new commitments. Rejected + commitments are rolled back. + + - alert: CortexNovaCommittedResourceTimeoutsTooHigh + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics", dry_run="false"}[10m]) > 0 + for: 1m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-performance + annotations: + summary: "Committed Resource change API timeout detected" + description: > + A commitment change request timed out after the configured deadline. + Timeouts indicate the scheduling pipeline could not place reservations in time. + Affected changes are rolled back. Investigate scheduler performance or reservation backlog. 
+ + - alert: CortexNovaCommittedResourceChangeLatencyTooHigh + expr: | + histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics", dry_run="false"}[5m])) by (le)) >= 10 + and on() sum(rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", dry_run="false"}[5m])) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-performance + annotations: + summary: "Committed Resource change API p95 latency >= 10s" + description: > + The committed resource change API p95 latency has reached or exceeded 10 seconds, + approaching the configured watch timeout. Requests close to the timeout are at risk + of being rolled back. Investigate scheduler performance or reservation backlog. + + # Committed Resource Capacity API + - alert: CortexNovaCommittedResourceCapacityErrors + expr: | + rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity + annotations: + summary: "Committed Resource capacity API HTTP 5xx errors" + description: > + The committed resource capacity API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems calculating cluster capacity. + Limes may receive stale or incomplete capacity data. 
+ + - alert: CortexNovaCommittedResourceCapacityDroppedToZero + expr: | + (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} == 0) + and on(resource, az) (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} offset 30m > 0) + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity + annotations: + summary: "Committed Resource capacity for {{ "{{" }} $labels.resource {{ "}}" }} in {{ "{{" }} $labels.az {{ "}}" }} dropped to zero" + description: > + The reported capacity for committed resource {{ "{{" }} $labels.resource {{ "}}" }} in + availability zone {{ "{{" }} $labels.az {{ "}}" }} has dropped from a positive value to zero. + This may mean hypervisors in that AZ are fully utilized for the corresponding + flavor group and no further committed resources can be placed there. + + # Committed Resource Usage API + - alert: CortexNovaCommittedResourceUsageErrors + expr: | + rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors + annotations: + summary: "Committed Resource usage API HTTP 5xx errors" + description: > + The committed resource usage API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems fetching reservation or + Nova server data. Limes may receive stale or incomplete usage data. 
+ + # Committed Resource Quota API + - alert: CortexNovaCommittedResourceQuotaErrors + expr: | + rate(cortex_committed_resource_quota_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors + annotations: + summary: "Committed Resource quota API HTTP 5xx errors" + description: > + The committed resource quota API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems computing or applying + quota. Limes may not be able to enforce committed resource quotas. {{- end }} diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 4a194ae50..0ac9f49d5 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -17,6 +17,11 @@ owner-info: alerts: enabled: true prometheus: openstack + thresholds: + # Memory threshold for CortexNovaHighMemoryUsage in MiB. + highMemoryMiB: 6000 + # Reconcile-duration threshold for CortexNovaReconcileDurationHigher10Min in seconds. 
+ reconcileDurationSeconds: 600 serviceMonitor: extraLabels: {} diff --git a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml deleted file mode 100644 index e65b944d6..000000000 --- a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml +++ /dev/null @@ -1,179 +0,0 @@ -groups: -- name: cortex-placement-shim-alerts - rules: - # Liveness - - alert: CortexPlacementShimDown - expr: | - up{pod=~"cortex-placement-shim-.*"} != 1 or - absent(up{pod=~"cortex-placement-shim-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-down - annotations: - summary: "Cortex Placement Shim is down" - description: > - The Cortex Placement Shim is down. Placement API requests that are - routed through the shim will not be served. OpenStack services relying - on the shim for resource provider lookups and allocation candidates - will degrade. - - # Downstream HTTP errors (client -> shim) - - alert: CortexPlacementShimDownstreamHttp400sTooHigh - expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim downstream HTTP 4xx errors too high" - description: > - The Placement Shim is responding to client requests with HTTP 4xx - errors at a sustained rate. 
This may indicate that the request format - from OpenStack services has changed, authentication tokens are invalid, - or the shim is rejecting malformed requests. Investigate the shim logs - for details on which endpoints and request patterns are affected. - - - alert: CortexPlacementShimDownstreamHttp500sTooHigh - expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim downstream HTTP 5xx errors too high" - description: > - The Placement Shim is responding to client requests with HTTP 5xx - errors. This indicates internal problems within the shim such as - handler panics or misconfiguration. OpenStack services may experience - degraded placement functionality until the issue is resolved. - - # Upstream HTTP errors (shim -> Placement API) - - alert: CortexPlacementShimUpstreamHttp5xxTooHigh - expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: upstream - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim upstream HTTP 5xx errors too high" - description: > - The upstream Placement API is returning 5xx errors to the shim. - This indicates the OpenStack Placement service itself is having - problems. The shim forwards these errors to its clients. Investigate - the Placement API service health and logs. 
- - - alert: CortexPlacementShimUpstreamUnreachable - expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode="502"}[5m]) > 0.1 - for: 5m - labels: - context: upstream - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim cannot reach the upstream Placement API" - description: > - The Placement Shim is unable to reach the upstream OpenStack Placement - API and is returning 502 Bad Gateway errors. This means all forwarded - requests are failing. Check network connectivity, the Placement API - service endpoint configuration, and whether the upstream service is - running. - - # Latency alerts - - alert: CortexPlacementShimDownstreamLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_placement_shim_downstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10 - and on() sum(rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0 - for: 5m - labels: - context: api - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim downstream latency too high" - description: > - The Placement Shim downstream request latency (p95) exceeds 10 - seconds. This affects all OpenStack services making placement - requests through the shim. The cause may be slow upstream responses, - shim processing overhead, or resource contention. Investigate both - shim and upstream Placement API performance. 
- - - alert: CortexPlacementShimUpstreamLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_placement_shim_upstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10 - and on() sum(rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0 - for: 5m - labels: - context: upstream - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim upstream latency too high" - description: > - The upstream Placement API response latency (p95) as seen by the - shim exceeds 10 seconds. This directly impacts the end-to-end - latency of placement requests. Investigate the upstream Placement - API performance and network conditions. - - # Resource usage - - alert: CortexPlacementShimHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-placement-shim-metrics-service"} > 1500 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-resource-usage - annotations: - summary: "Placement Shim uses too much memory" - description: > - The Placement Shim is using more than 1500 MiB of resident memory - against a limit of 2048 MiB. This may indicate a memory leak, a - large number of cached hypervisors, or unexpected request patterns. - If the usage continues to grow, the pod will be OOM-killed. 
-
-  - alert: CortexPlacementShimHighCPUUsage
-    expr: rate(process_cpu_seconds_total{service="cortex-placement-shim-metrics-service"}[1m]) > 0.4
-    for: 5m
-    labels:
-      context: cpu
-      dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
-      service: cortex
-      severity: warning
-      support_group: workload-management
-      playbook: docs/support/playbook/cortex/alerts/shim-resource-usage
-    annotations:
-      summary: "Placement Shim uses too much CPU"
-      description: >
-        The Placement Shim is consuming more than 40% of a single CPU core
-        against a limit of 500m. Under normal operation the shim should use
-        much less since it primarily proxies requests. This may indicate a
-        hot loop, excessive logging, or an unusual traffic spike.
-
diff --git a/helm/bundles/cortex-placement-shim/templates/alerts.yaml b/helm/bundles/cortex-placement-shim/templates/alerts.yaml
index 7db3b96e6..c570ccd91 100644
--- a/helm/bundles/cortex-placement-shim/templates/alerts.yaml
+++ b/helm/bundles/cortex-placement-shim/templates/alerts.yaml
@@ -10,8 +10,182 @@ metadata:
     type: alerting-rules
     prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }}
 spec:
-  {{- $files := .Files.Glob "alerts/*.alerts.yaml" }}
-  {{- range $path, $file := $files }}
-  {{ $file | toString | nindent 2 }}
-  {{- end }}
+  groups:
+  - name: cortex-placement-shim-alerts
+    rules:
+    # Liveness: fires when "up" is not 1 for a shim pod, or when no "up" series for the shim exists at all.
+    - alert: CortexPlacementShimDown
+      expr: |
+        up{pod=~"cortex-placement-shim-.*"} != 1 or
+        absent(up{pod=~"cortex-placement-shim-.*"})
+      for: 5m
+      labels:
+        context: liveness
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-down
+      annotations:
+        summary: "Cortex Placement Shim is down"
+        description: >
+          The Cortex Placement Shim is down. Placement API requests that are
+          routed through the shim will not be served. OpenStack services relying
+          on the shim for resource provider lookups and allocation candidates
+          will degrade.
+
+    # Downstream HTTP errors (client -> shim): rate() is per second, so 0.1 means one matching response every 10s, averaged over 5m.
+    - alert: CortexPlacementShimDownstreamHttp400sTooHigh  # despite "400s" in the name, the 4.. selector matches every 4xx status
+      expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"4.."}[5m]) > 0.1
+      for: 5m
+      labels:
+        context: api
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim downstream HTTP 4xx errors too high"
+        description: >
+          The Placement Shim is responding to client requests with HTTP 4xx
+          errors at a sustained rate. This may indicate that the request format
+          from OpenStack services has changed, authentication tokens are invalid,
+          or the shim is rejecting malformed requests. Investigate the shim logs
+          for details on which endpoints and request patterns are affected.
+
+    - alert: CortexPlacementShimDownstreamHttp500sTooHigh
+      expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1
+      for: 5m
+      labels:
+        context: api
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim downstream HTTP 5xx errors too high"
+        description: >
+          The Placement Shim is responding to client requests with HTTP 5xx
+          errors. This indicates internal problems within the shim such as
+          handler panics or misconfiguration. OpenStack services may experience
+          degraded placement functionality until the issue is resolved.
+
+    # Upstream HTTP errors (shim -> Placement API): same 0.1 per-second threshold, measured on the upstream request metric.
+    - alert: CortexPlacementShimUpstreamHttp5xxTooHigh
+      expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1
+      for: 5m
+      labels:
+        context: upstream
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim upstream HTTP 5xx errors too high"
+        description: >
+          The upstream Placement API is returning 5xx errors to the shim.
+          This indicates the OpenStack Placement service itself is having
+          problems. The shim forwards these errors to its clients. Investigate
+          the Placement API service health and logs.
+
+    - alert: CortexPlacementShimUpstreamUnreachable  # 502 also matches the 5.. selector of the upstream 5xx alert, so both can fire together
+      expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode="502"}[5m]) > 0.1
+      for: 5m
+      labels:
+        context: upstream
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim cannot reach the upstream Placement API"
+        description: >
+          The Placement Shim is unable to reach the upstream OpenStack Placement
+          API and is returning 502 Bad Gateway errors. This means all forwarded
+          requests are failing. Check network connectivity, the Placement API
+          service endpoint configuration, and whether the upstream service is
+          running.
+
+    # Latency alerts: p95 over 5m above 10 (seconds, per the metric name); the "and on()" clause requires a non-zero request rate.
+    - alert: CortexPlacementShimDownstreamLatencyTooHigh
+      expr: |
+        histogram_quantile(0.95, sum(rate(cortex_placement_shim_downstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10
+        and on() sum(rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0
+      for: 5m
+      labels:
+        context: api
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim downstream latency too high"
+        description: >
+          The Placement Shim downstream request latency (p95) exceeds 10
+          seconds. This affects all OpenStack services making placement
+          requests through the shim. The cause may be slow upstream responses,
+          shim processing overhead, or resource contention. Investigate both
+          shim and upstream Placement API performance.
+
+    - alert: CortexPlacementShimUpstreamLatencyTooHigh
+      expr: |
+        histogram_quantile(0.95, sum(rate(cortex_placement_shim_upstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10
+        and on() sum(rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0
+      for: 5m
+      labels:
+        context: upstream
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim upstream latency too high"
+        description: >
+          The upstream Placement API response latency (p95) as seen by the
+          shim exceeds 10 seconds. This directly impacts the end-to-end
+          latency of placement requests. Investigate the upstream Placement
+          API performance and network conditions.
+
+    # Resource usage: 1500 * 1024 * 1024 bytes is 1500 MiB of resident memory; a CPU rate of 0.4 is 40% of one core.
+    - alert: CortexPlacementShimHighMemoryUsage
+      expr: process_resident_memory_bytes{service="cortex-placement-shim-metrics-service"} > 1500 * 1024 * 1024
+      for: 5m
+      labels:
+        context: memory
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-resource-usage
+      annotations:
+        summary: "Placement Shim uses too much memory"
+        description: >
+          The Placement Shim is using more than 1500 MiB of resident memory
+          against a limit of 2048 MiB. This may indicate a memory leak, a
+          large number of cached hypervisors, or unexpected request patterns.
+          If the usage continues to grow, the pod will be OOM-killed.
+
+    - alert: CortexPlacementShimHighCPUUsage
+      expr: rate(process_cpu_seconds_total{service="cortex-placement-shim-metrics-service"}[1m]) > 0.4
+      for: 5m
+      labels:
+        context: cpu
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-resource-usage
+      annotations:
+        summary: "Placement Shim uses too much CPU"
+        description: >
+          The Placement Shim is consuming more than 40% of a single CPU core
+          against a limit of 500m. Under normal operation the shim should use
+          much less since it primarily proxies requests. This may indicate a
+          hot loop, excessive logging, or an unusual traffic spike.
{{- end }} From 954ed1d3c8a818d955a01ecc7aa27a7a2a00e7ca Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Tue, 2 Jun 2026 11:57:32 +0200 Subject: [PATCH 2/3] fix linting --- .github/workflows/check-alerts.yaml | 49 ++++++++++++++++++----------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/.github/workflows/check-alerts.yaml b/.github/workflows/check-alerts.yaml index abd44d40b..e9479f5da 100644 --- a/.github/workflows/check-alerts.yaml +++ b/.github/workflows/check-alerts.yaml @@ -2,32 +2,45 @@ name: Check Alerts using Promtool on: pull_request: paths: - - '**/*.rules.yaml' - - '**/*.alerts.yaml' + - 'helm/bundles/*/templates/alerts.yaml' + - 'helm/bundles/*/values.yaml' + - 'helm/bundles/*/Chart.yaml' + - 'helm/library/**' + - '.github/workflows/check-alerts.yaml' jobs: lint: - runs-on: ubuntu-latest + # Pinned to ubuntu-24.04 so the pre-installed helm and yq versions are + # stable. helm and yq come from the base runner image (no install step + # needed); promtool is installed by the peimanja action below. 
+    runs-on: ubuntu-24.04
     steps:
-      - name: Checkout PR
-        uses: actions/checkout@v6
+      - uses: actions/checkout@v6
 
-      - name: Get changed rule and alert files
-        id: changed
-        uses: tj-actions/changed-files@v47
-        with:
-          files: |
-            **/*.rules.yaml
-            **/*.alerts.yaml
+      - name: Render bundles to rule files
+        run: |
+          set -euo pipefail
+          mkdir -p rendered
+
+          helm dep update helm/bundles/cortex-cinder
+          helm dep update helm/bundles/cortex-manila
+          helm dep update helm/bundles/cortex-nova
+          helm dep update helm/bundles/cortex-placement-shim
+
+          helm template cortex-cinder helm/bundles/cortex-cinder | yq 'select(.kind == "PrometheusRule") | .spec' > rendered/cortex-cinder.yaml
+          helm template cortex-manila helm/bundles/cortex-manila | yq 'select(.kind == "PrometheusRule") | .spec' > rendered/cortex-manila.yaml
+          helm template cortex-placement-shim helm/bundles/cortex-placement-shim | yq 'select(.kind == "PrometheusRule") | .spec' > rendered/cortex-placement-shim.yaml
+
+          # nova has KVM-gated rules; set kvm.enabled explicitly for both flavours so coverage does not depend on the chart default.
+          helm template cortex-nova helm/bundles/cortex-nova --set kvm.enabled=false | yq 'select(.kind == "PrometheusRule") | .spec' > rendered/cortex-nova-no-kvm.yaml
+          helm template cortex-nova helm/bundles/cortex-nova --set kvm.enabled=true | yq 'select(.kind == "PrometheusRule") | .spec' > rendered/cortex-nova-kvm.yaml
 
-      - name: Install Helm
-        uses: azure/setup-helm@v5
+          ls -la rendered/
 
-      - name: Check changed rule and alert files via promtool
-        if: steps.changed.outputs.any_changed == 'true'
+      - name: Check rules with promtool  # NOTE(review): promtool_actions_version below is still 'latest'; consider pinning it for reproducible lint results
         uses: peimanja/promtool-github-actions@v0.0.2
         with:
           promtool_actions_subcommand: 'rules'
-          promtool_actions_files: ${{ steps.changed.outputs.all_changed_files }}
+          promtool_actions_files: 'rendered/*.yaml'
           promtool_actions_version: 'latest'
-          promtool_actions_comment: 'false'
\ No newline at end of file
+          promtool_actions_comment: 'false'

From 54ed5a8e21f38b0769c6f99036cc388851346846 Mon Sep 17 00:00:00 2001
From: Malte Viering
Date: Tue, 2 Jun 2026 10:07:13 +0000
Subject: [PATCH 3/3] ci(check-alerts): pin promtool action to commit SHA

Replace mutable tag reference with immutable commit SHA so the action
cannot be changed by retagging. Verified via GitHub refs API that the SHA
matches the v0.0.2 tag.
---
 .github/workflows/check-alerts.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/check-alerts.yaml b/.github/workflows/check-alerts.yaml
index e9479f5da..33f8407ff 100644
--- a/.github/workflows/check-alerts.yaml
+++ b/.github/workflows/check-alerts.yaml
@@ -38,7 +38,7 @@ jobs:
           ls -la rendered/
 
       - name: Check rules with promtool  # NOTE(review): promtool_actions_version below is still 'latest'; consider pinning it for reproducible lint results
-        uses: peimanja/promtool-github-actions@v0.0.2
+        uses: peimanja/promtool-github-actions@741be6fd6b8ee6a1d777ea020076b70c6233b3a1 # v0.0.2
         with:
           promtool_actions_subcommand: 'rules'
           promtool_actions_files: 'rendered/*.yaml'