diff --git a/.github/workflows/check-alerts.yaml b/.github/workflows/check-alerts.yaml
index abd44d40b..33f8407ff 100644
--- a/.github/workflows/check-alerts.yaml
+++ b/.github/workflows/check-alerts.yaml
@@ -2,32 +2,61 @@ name: Check Alerts using Promtool
 on:
   pull_request:
     paths:
-      - '**/*.rules.yaml'
-      - '**/*.alerts.yaml'
+      - 'helm/bundles/*/templates/alerts.yaml'
+      - 'helm/bundles/*/values.yaml'
+      - 'helm/bundles/*/Chart.yaml'
+      - 'helm/library/**'
+      - '.github/workflows/check-alerts.yaml'
 
 jobs:
   lint:
-    runs-on: ubuntu-latest
+    # Pinned to ubuntu-24.04 so the pre-installed helm and yq versions are
+    # stable. helm and yq come from the base runner image (no install step
+    # needed); promtool is installed by the peimanja action below.
+    runs-on: ubuntu-24.04
     steps:
-      - name: Checkout PR
-        uses: actions/checkout@v6
+      - uses: actions/checkout@v6
 
-      - name: Get changed rule and alert files
-        id: changed
-        uses: tj-actions/changed-files@v47
-        with:
-          files: |
-            **/*.rules.yaml
-            **/*.alerts.yaml
+      - name: Render bundles to rule files
+        run: |
+          set -euo pipefail
+          mkdir -p rendered
+
+          # render BUNDLE OUTPUT [HELM_ARGS...]
+          # Renders helm/bundles/BUNDLE, keeps only the spec of its
+          # PrometheusRule objects and writes it to rendered/OUTPUT.yaml.
+          render() {
+            local bundle="$1" out="rendered/$2.yaml"
+            shift 2
+            helm template "$bundle" "helm/bundles/$bundle" "$@" \
+              | yq 'select(.kind == "PrometheusRule") | .spec' > "$out"
+            # Guard against a vacuous check: if nothing was rendered (for
+            # example because alerts are disabled in the default values),
+            # fail here instead of handing promtool a file without rules.
+            if ! grep -q '^groups:' "$out"; then
+              echo "::error::no PrometheusRule groups rendered for $bundle ($out)"
+              exit 1
+            fi
+          }
+
+          for chart in cortex-cinder cortex-manila cortex-nova cortex-placement-shim; do
+            helm dep update "helm/bundles/$chart"
+          done
+
+          render cortex-cinder cortex-cinder
+          render cortex-manila cortex-manila
+          render cortex-placement-shim cortex-placement-shim
+
+          # nova has KVM-gated rules; render both flavours.
+          render cortex-nova cortex-nova-default
+          render cortex-nova cortex-nova-kvm --set kvm.enabled=true
 
-      - name: Install Helm
-        uses: azure/setup-helm@v5
+          ls -la rendered/
 
-      - name: Check changed rule and alert files via promtool
-        if: steps.changed.outputs.any_changed == 'true'
-        uses: peimanja/promtool-github-actions@v0.0.2
+      - name: Check rules with promtool
+        uses: peimanja/promtool-github-actions@741be6fd6b8ee6a1d777ea020076b70c6233b3a1 # v0.0.2
         with:
           promtool_actions_subcommand: 'rules'
-          promtool_actions_files: ${{ steps.changed.outputs.all_changed_files }}
+          promtool_actions_files: 'rendered/*.yaml'
           promtool_actions_version: 'latest'
-          promtool_actions_comment: 'false'
\ No newline at end of file
+          promtool_actions_comment: 'false'
diff --git a/docs/reservations/committed-resource-reservations.md b/docs/reservations/committed-resource-reservations.md
index 4d96d43a6..7d80064b0 100644
--- a/docs/reservations/committed-resource-reservations.md
+++ b/docs/reservations/committed-resource-reservations.md
@@ -35,7 +35,7 @@ The CR reservation implementation is located in `internal/scheduling/reservation
 - Scheduling pipeline selection per flavor group
 - Per-flavor-group resource flags (`handlesCommitments`, `hasCapacity`, `hasQuota`) controlling which resource types are active for each group
 
-**Metrics and Alerts**: Defined in `helm/bundles/cortex-nova/alerts/nova.alerts.yaml` with prefixes:
+**Metrics and Alerts**: Defined in `helm/bundles/cortex-nova/templates/alerts.yaml` with prefixes:
 - `cortex_committed_resource_change_api_*`
 - `cortex_committed_resource_usage_api_*`
 - `cortex_committed_resource_capacity_api_*`
diff --git a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml b/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml
deleted file mode 100644
index 
6684e3392..000000000 --- a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml +++ /dev/null @@ -1,260 +0,0 @@ -groups: -- name: cortex-cinder-alerts - rules: - - alert: CortexCinderSchedulingDown - expr: | - up{pod=~"cortex-cinder-scheduling-.*"} != 1 or - absent(up{pod=~"cortex-cinder-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Scheduling for Cinder is down" - description: > - The Cortex scheduling service is down. Scheduling requests from Cinder will - not be served. This is no immediate problem, since Cinder will continue - placing new VMs. However, the placement will be less desirable. - - - alert: CortexCinderKnowledgeDown - expr: | - up{pod=~"cortex-cinder-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-cinder-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Knowledge for Cinder is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. - - - alert: CortexCinderHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Cinder Scheduler HTTP request 400 errors too high" - description: > - Cinder Scheduler is responding to placement requests with HTTP 4xx - errors. 
This is expected when the scheduling request cannot be served - by Cortex. However, it could also indicate that the request format has - changed and Cortex is unable to parse it. - - - alert: CortexCinderSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Cinder Scheduler HTTP request 500 errors too high" - description: > - Cinder Scheduler is responding to placement requests with HTTP 5xx errors. - This is not expected and indicates that Cortex is having some internal problem. - Cinder will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexCinderHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-cinder-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. - - - alert: CortexCinderHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-cinder-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. 
Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. - - - alert: CortexCinderTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-cinder-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. - - - alert: CortexCinderSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-cinder-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-cinder-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. 
- - - alert: CortexCinderSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-cinder-metrics"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. - - - alert: CortexCinderDatasourceUnready - expr: cortex_datasource_state{domain="cinder",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. - - - alert: CortexCinderKnowledgeUnready - expr: cortex_knowledge_state{domain="cinder",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. 
- - - alert: CortexCinderDecisionsWithErrors - expr: cortex_decision_state{domain="cinder",state="error"} > 0 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Some decisions are in error state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline generated decisions that are in error state. - This may indicate issues with the decision logic or the underlying infrastructure. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexCinderTooManyDecisionsWaiting - expr: cortex_decision_state{domain="cinder",state="waiting"} > 10 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline has a high number of decisions for which - no target host has been assigned yet. - - This may indicate a backlog in processing or issues with the decision logic. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexCinderKPIUnready - expr: | - cortex_kpi_state{domain="cinder",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. 
- - alert: CortexCinderPipelineUnready - expr: cortex_pipeline_state{domain="cinder",state!="ready"} != 0 - for: 5m - labels: - context: pipelines - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. diff --git a/helm/bundles/cortex-cinder/templates/alerts.yaml b/helm/bundles/cortex-cinder/templates/alerts.yaml index 59496c33d..4beea8b53 100644 --- a/helm/bundles/cortex-cinder/templates/alerts.yaml +++ b/helm/bundles/cortex-cinder/templates/alerts.yaml @@ -1,6 +1,10 @@ # Copyright SAP SE # SPDX-License-Identifier: Apache-2.0 +# NOTE: Helm renders this file and evaluates template actions even inside +# comments, so Prometheus directives must be escaped: write the opening braces +# as {{ "{{" }} and the closing braces as {{ "}}" }}, e.g. {{ "{{" }} $labels.foo {{ "}}" }}. 
+ {{- if .Values.alerts.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -10,8 +14,264 @@ metadata: type: alerting-rules prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} + groups: + - name: cortex-cinder-alerts + rules: + - alert: CortexCinderSchedulingDown + expr: | + up{pod=~"cortex-cinder-scheduling-.*"} != 1 or + absent(up{pod=~"cortex-cinder-scheduling-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Scheduling for Cinder is down" + description: > + The Cortex scheduling service is down. Scheduling requests from Cinder will + not be served. This is no immediate problem, since Cinder will continue + placing new VMs. However, the placement will be less desirable. + + - alert: CortexCinderKnowledgeDown + expr: | + up{pod=~"cortex-cinder-knowledge-.*"} != 1 or + absent(up{pod=~"cortex-cinder-knowledge-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Knowledge for Cinder is down" + description: > + The Cortex Knowledge service is down. This is no immediate problem, + since cortex is still able to process requests, + but the quality of the responses may be affected. 
+ + - alert: CortexCinderHttpRequest400sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"4.+"}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Cinder Scheduler HTTP request 400 errors too high" + description: > + Cinder Scheduler is responding to placement requests with HTTP 4xx + errors. This is expected when the scheduling request cannot be served + by Cortex. However, it could also indicate that the request format has + changed and Cortex is unable to parse it. + + - alert: CortexCinderSchedulingHttpRequest500sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"5.+" }[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Cinder Scheduler HTTP request 500 errors too high" + description: > + Cinder Scheduler is responding to placement requests with HTTP 5xx errors. + This is not expected and indicates that Cortex is having some internal problem. + Cinder will continue to place new VMs, but the placement will be less desirable. + Thus, no immediate action is needed. + + - alert: CortexCinderHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-cinder-metrics"} > 6000 * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much memory" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 6000 MiB of memory. 
Usually it + should use much less, so there may be a memory leak or other changes + that are causing the memory usage to increase significantly. + + - alert: CortexCinderHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-cinder-metrics"}[1m]) > 0.5 + for: 5m + labels: + context: cpu + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much CPU" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 50% of a single CPU core. Usually + it should use much less, so there may be a CPU leak or other changes + that are causing the CPU usage to increase significantly. + + - alert: CortexCinderTooManyDBConnectionAttempts + expr: rate(cortex_db_connection_attempts_total{service="cortex-cinder-metrics"}[5m]) > 0.1 + for: 5m + labels: + context: db + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often. This may happen + when the database is down or the connection parameters are misconfigured. 
+ + - alert: CortexCinderSyncNotSuccessful + expr: cortex_sync_request_processed_total{service="cortex-cinder-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-cinder-metrics"} > 0 + for: 5m + labels: + context: syncstatus + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` Sync not successful" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` experienced an issue syncing data from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may + happen when the datasource (OpenStack, Prometheus, etc.) is down or + the sync module is misconfigured. No immediate action is needed, since + the sync module will retry the sync operation and the currently synced + data will be kept. However, when this problem persists for a longer + time the service will have a less recent view of the datacenter. + + - alert: CortexCinderSyncObjectsDroppedToZero + expr: cortex_sync_objects{service="cortex-cinder-metrics"} == 0 + for: 60m + labels: + context: syncobjects + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any new data from `{{ "{{" }} $labels.datasource {{ "}}" }}`" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any objects from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may happen + when the datasource (OpenStack, Prometheus, etc.) is down or the sync + module is misconfigured. No immediate action is needed, since the sync + module will retry the sync operation and the currently synced data will + be kept. However, when this problem persists for a longer time the + service will have a less recent view of the datacenter. 
+ + - alert: CortexCinderDatasourceUnready + expr: cortex_datasource_state{domain="cinder",state!="ready"} != 0 + for: 60m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the datasource + connectivity or configuration. It is recommended to investigate the + datasource status and logs for more details. + + - alert: CortexCinderKnowledgeUnready + expr: cortex_knowledge_state{domain="cinder",state!="ready"} != 0 + for: 60m + labels: + context: knowledge + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Knowledge `{{ "{{" }} $labels.knowledge {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the knowledge + configuration. It is recommended to investigate the + knowledge status and logs for more details. + + - alert: CortexCinderDecisionsWithErrors + expr: cortex_decision_state{domain="cinder",state="error"} > 0 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Some decisions are in error state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline generated decisions that are in error state. + This may indicate issues with the decision logic or the underlying infrastructure. + It is recommended to investigate the decision logs and the state of the + VMs being processed. 
+ + - alert: CortexCinderTooManyDecisionsWaiting + expr: cortex_decision_state{domain="cinder",state="waiting"} > 10 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Too many decisions are in waiting state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline has a high number of decisions for which + no target host has been assigned yet. + + This may indicate a backlog in processing or issues with the decision logic. + It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexCinderKPIUnready + expr: | + cortex_kpi_state{domain="cinder",state!="ready"} != 0 + for: 60m + labels: + context: kpis + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "KPI `{{ "{{" }} $labels.kpi {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the KPI + configuration. It is recommended to investigate the + KPI status and logs for more details. + + - alert: CortexCinderPipelineUnready + expr: cortex_pipeline_state{domain="cinder",state!="ready"} != 0 + for: 5m + labels: + context: pipelines + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Pipeline `{{ "{{" }} $labels.pipeline {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the pipeline + configuration. It is recommended to investigate the + pipeline status and logs for more details. 
{{- end }} diff --git a/helm/bundles/cortex-ironcore/alerts/ironcore.alerts.yaml b/helm/bundles/cortex-ironcore/alerts/ironcore.alerts.yaml deleted file mode 100644 index 0c72d9a92..000000000 --- a/helm/bundles/cortex-ironcore/alerts/ironcore.alerts.yaml +++ /dev/null @@ -1,3 +0,0 @@ -groups: -- name: cortex-ironcore-alerts - rules: [] diff --git a/helm/bundles/cortex-ironcore/templates/alerts.yaml b/helm/bundles/cortex-ironcore/templates/alerts.yaml deleted file mode 100644 index ca27396a5..000000000 --- a/helm/bundles/cortex-ironcore/templates/alerts.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright SAP SE -# SPDX-License-Identifier: Apache-2.0 - -{{- if .Values.alerts.enabled }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: cortex-ironcore-alerts - labels: - type: alerting-rules - prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} -spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} -{{- end }} diff --git a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml b/helm/bundles/cortex-manila/alerts/manila.alerts.yaml deleted file mode 100644 index 2211d44fe..000000000 --- a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml +++ /dev/null @@ -1,235 +0,0 @@ -groups: -- name: cortex-manila-alerts - rules: - - alert: CortexManilaSchedulingDown - expr: | - up{pod=~"cortex-manila-scheduling-.*"} != 1 or - absent(up{pod=~"cortex-manila-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Scheduling for Manila is down" - description: > - The Cortex scheduling service is down. Scheduling requests from Manila will - not be served. 
This is no immediate problem, since Manila will continue - placing new VMs. However, the placement will be less desirable. - - - alert: CortexManilaKnowledgeDown - expr: | - up{pod=~"cortex-manila-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-manila-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Knowledge for Manila is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. - - - alert: CortexManilaHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/apierrors - annotations: - summary: "Manila Scheduler HTTP request 400 errors too high" - description: > - Manila Scheduler is responding to placement requests with HTTP 4xx - errors. This is expected when the scheduling request cannot be served - by Cortex. However, it could also indicate that the request format has - changed and Cortex is unable to parse it. 
- - - alert: CortexManilaSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/apierrors - annotations: - summary: "Manila Scheduler HTTP request 500 errors too high" - description: > - Manila Scheduler is responding to placement requests with HTTP 5xx errors. - This is not expected and indicates that Cortex is having some internal problem. - Manila will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexManilaHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-manila-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. - - - alert: CortexManilaHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-manila-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. 
Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. - - - alert: CortexManilaTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-manila-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/database - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. - - - alert: CortexManilaSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-manila-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-manila-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. 
- - - alert: CortexManilaSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-manila-metrics"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. - - - alert: CortexManilaDatasourceUnready - expr: cortex_datasource_state{domain="manila",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. 
- - - alert: CortexManilaKnowledgeUnready - expr: cortex_knowledge_state{domain="manila",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. - - - alert: CortexManilaKPIUnready - expr: | - cortex_kpi_state{domain="manila",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. - - - alert: CortexManilaPipelineUnready - expr: cortex_pipeline_state{domain="manila",state!="ready"} != 0 - for: 5m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. 
diff --git a/helm/bundles/cortex-manila/templates/alerts.yaml b/helm/bundles/cortex-manila/templates/alerts.yaml
index 1f25b0354..ef36fe983 100644
--- a/helm/bundles/cortex-manila/templates/alerts.yaml
+++ b/helm/bundles/cortex-manila/templates/alerts.yaml
@@ -1,6 +1,10 @@
 # Copyright SAP SE
 # SPDX-License-Identifier: Apache-2.0
 
+# NOTE: Helm renders this file before Prometheus sees it, so Prometheus
+# template actions in annotations must be escaped by emitting each delimiter
+# as a quoted string, e.g. {{ "{{" }} $labels.foo {{ "}}" }}.
+
 {{- if .Values.alerts.enabled }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
@@ -10,8 +14,239 @@ metadata:
     type: alerting-rules
     prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }}
 spec:
-  {{- $files := .Files.Glob "alerts/*.alerts.yaml" }}
-  {{- range $path, $file := $files }}
-  {{ $file | toString | nindent 2 }}
-  {{- end }}
+  groups:
+  - name: cortex-manila-alerts
+    rules:
+    - alert: CortexManilaSchedulingDown  # a scheduling pod reports up != 1, or no up series exists
+      expr: |
+        up{pod=~"cortex-manila-scheduling-.*"} != 1 or
+        absent(up{pod=~"cortex-manila-scheduling-.*"})
+      for: 5m  # condition must hold for 5 minutes before firing
+      labels:
+        context: liveness
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/down
+      annotations:
+        summary: "Cortex Scheduling for Manila is down"
+        description: >
+          The Cortex scheduling service is down. Scheduling requests from Manila will
+          not be served. This is no immediate problem, since Manila will continue
+          placing new shares. However, the placement will be less desirable.
+
+    - alert: CortexManilaKnowledgeDown  # a knowledge pod reports up != 1, or no up series exists
+      expr: |
+        up{pod=~"cortex-manila-knowledge-.*"} != 1 or
+        absent(up{pod=~"cortex-manila-knowledge-.*"})
+      for: 5m
+      labels:
+        context: liveness
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/down
+      annotations:
+        summary: "Cortex Knowledge for Manila is down"
+        description: >
+          The Cortex Knowledge service is down. This is no immediate problem,
+          since cortex is still able to process requests,
+          but the quality of the responses may be affected.
+
+    - alert: CortexManilaHttpRequest400sTooHigh  # 5m rate of 4xx scheduler API responses above 0.1/s
+      expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"4.+"}[5m]) > 0.1
+      for: 5m
+      labels:
+        context: api
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/apierrors
+      annotations:
+        summary: "Manila Scheduler HTTP request 400 errors too high"
+        description: >
+          Manila Scheduler is responding to placement requests with HTTP 4xx
+          errors. This is expected when the scheduling request cannot be served
+          by Cortex. However, it could also indicate that the request format has
+          changed and Cortex is unable to parse it.
+
+    - alert: CortexManilaSchedulingHttpRequest500sTooHigh  # 5m rate of 5xx scheduler API responses above 0.1/s
+      expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"5.+"}[5m]) > 0.1
+      for: 5m
+      labels:
+        context: api
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/apierrors
+      annotations:
+        summary: "Manila Scheduler HTTP request 500 errors too high"
+        description: >
+          Manila Scheduler is responding to placement requests with HTTP 5xx errors.
+          This is not expected and indicates that Cortex is having some internal problem.
+          Manila will continue to place new shares, but the placement will be less desirable.
+          Thus, no immediate action is needed.
+
+    - alert: CortexManilaHighMemoryUsage  # resident memory above 6000 * 1024 * 1024 bytes
+      expr: process_resident_memory_bytes{service="cortex-manila-metrics"} > 6000 * 1024 * 1024
+      for: 5m
+      labels:
+        context: memory
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/deployment
+      annotations:
+        summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much memory"
+        description: >
+          `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 6000 MiB of memory. Usually it
+          should use much less, so there may be a memory leak or other changes
+          that are causing the memory usage to increase significantly.
+
+    - alert: CortexManilaHighCPUUsage  # 1m rate of CPU seconds above 0.5, i.e. half a core
+      expr: rate(process_cpu_seconds_total{service="cortex-manila-metrics"}[1m]) > 0.5
+      for: 5m
+      labels:
+        context: cpu
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/deployment
+      annotations:
+        summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much CPU"
+        description: >
+          `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 50% of a single CPU core. Usually
+          it should use much less, so there may be a CPU leak or other changes
+          that are causing the CPU usage to increase significantly.
+
+    - alert: CortexManilaTooManyDBConnectionAttempts  # 5m rate of DB connection attempts above 0.1/s
+      expr: rate(cortex_db_connection_attempts_total{service="cortex-manila-metrics"}[5m]) > 0.1
+      for: 5m  # condition must hold for 5 minutes before firing
+      labels:
+        context: db
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/database
+      annotations:
+        summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often"
+        description: >
+          `{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often. This may happen
+          when the database is down or the connection parameters are misconfigured.
+
+    - alert: CortexManilaSyncNotSuccessful  # processed_total exceeds duration_seconds_count; NOTE(review): presumably the gap is unfinished syncs - confirm
+      expr: cortex_sync_request_processed_total{service="cortex-manila-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-manila-metrics"} > 0
+      for: 5m
+      labels:
+        context: syncstatus
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/datasources
+      annotations:
+        summary: "`{{ "{{" }} $labels.component {{ "}}" }}` Sync not successful"
+        description: >
+          `{{ "{{" }} $labels.component {{ "}}" }}` experienced an issue syncing data from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may
+          happen when the datasource (OpenStack, Prometheus, etc.) is down or
+          the sync module is misconfigured. No immediate action is needed, since
+          the sync module will retry the sync operation and the currently synced
+          data will be kept. However, when this problem persists for a longer
+          time the service will have a less recent view of the datacenter.
+
+    - alert: CortexManilaSyncObjectsDroppedToZero  # a cortex_sync_objects series for this service equals 0
+      expr: cortex_sync_objects{service="cortex-manila-metrics"} == 0
+      for: 60m  # condition must hold for a full hour before firing
+      labels:
+        context: syncobjects
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/datasources
+      annotations:
+        summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any new data from `{{ "{{" }} $labels.datasource {{ "}}" }}`"
+        description: >
+          `{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any objects from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may happen
+          when the datasource (OpenStack, Prometheus, etc.) is down or the sync
+          module is misconfigured. No immediate action is needed, since the sync
+          module will retry the sync operation and the currently synced data will
+          be kept. However, when this problem persists for a longer time the
+          service will have a less recent view of the datacenter.
+
+    - alert: CortexManilaDatasourceUnready  # a manila datasource state series other than "ready" is non-zero
+      expr: cortex_datasource_state{domain="manila",state!="ready"} != 0
+      for: 60m
+      labels:
+        context: datasources
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/unready
+      annotations:
+        summary: "Datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state"
+        description: >
+          This may indicate issues with the datasource
+          connectivity or configuration. It is recommended to investigate the
+          datasource status and logs for more details.
+
+    - alert: CortexManilaKnowledgeUnready  # a manila knowledge state series other than "ready" is non-zero
+      expr: cortex_knowledge_state{domain="manila",state!="ready"} != 0
+      for: 60m
+      labels:
+        context: knowledge
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/unready
+      annotations:
+        summary: "Knowledge `{{ "{{" }} $labels.knowledge {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state"
+        description: >
+          This may indicate issues with the knowledge
+          configuration. It is recommended to investigate the
+          knowledge status and logs for more details.
+
+    - alert: CortexManilaKPIUnready  # a manila KPI state series other than "ready" is non-zero
+      expr: |
+        cortex_kpi_state{domain="manila",state!="ready"} != 0
+      for: 60m
+      labels:
+        context: kpis
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/unready
+      annotations:
+        summary: "KPI `{{ "{{" }} $labels.kpi {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state"
+        description: >
+          This may indicate issues with the KPI
+          configuration. It is recommended to investigate the
+          KPI status and logs for more details.
+
+    - alert: CortexManilaPipelineUnready  # a manila pipeline state series other than "ready" is non-zero
+      expr: cortex_pipeline_state{domain="manila",state!="ready"} != 0
+      for: 5m  # shorter hold than the 60m used by the knowledge and KPI rules above
+      labels:
+        context: pipelines
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/unready
+      annotations:
+        summary: "Pipeline `{{ "{{" }} $labels.pipeline {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state"
+        description: >
+          This may indicate issues with the pipeline
+          configuration. It is recommended to investigate the
+          pipeline status and logs for more details.
{{- end }} diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml deleted file mode 100644 index 46e93ef05..000000000 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ /dev/null @@ -1,609 +0,0 @@ -groups: -- name: cortex-nova-alerts - rules: - - alert: CortexNovaSchedulingDown - expr: | - up{pod=~"cortex-nova-scheduling-.*"} != 1 or - absent(up{pod=~"cortex-nova-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: critical - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Scheduling for Nova is down" - description: > - The Cortex scheduling service is down. Scheduling requests from Nova will - not be served. This is non-critical for vmware virtual machines, but - blocks kvm virtual machines from being scheduled. Thus, it is - recommended to immediately investigate and resolve the issue. - - - alert: CortexNovaKnowledgeDown - expr: | - up{pod=~"cortex-nova-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-nova-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Knowledge for Nova is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. 
- - - alert: CortexNovaDeschedulerPipelineErroring - expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 - for: 5m - labels: - context: descheduler - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Descheduler pipeline is erroring." - description: > - The Cortex descheduler pipeline is encountering errors during its execution. - This may indicate issues with the descheduling logic or the underlying infrastructure. - It is recommended to investigate the descheduler logs and the state of the VMs being processed. - - - alert: CortexNovaHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/api-errors - annotations: - summary: "Nova Scheduler HTTP request 400 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 4xx - errors. This is expected when the scheduling request cannot be served - by Cortex. However, it could also indicate that the request format has - changed and Cortex is unable to parse it. 
- - - alert: CortexNovaSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/api-errors - annotations: - summary: "Nova Scheduler HTTP request 500 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 5xx errors. - This is not expected and indicates that Cortex is having some internal problem. - Nova will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexNovaHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. - - - alert: CortexNovaHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. 
Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. - - - alert: CortexNovaTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/database - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. - - - alert: CortexNovaSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. 
- - - alert: CortexNovaSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. - - - alert: CortexNovaDatasourceUnready - expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. 
- - - alert: CortexNovaKnowledgeUnready - expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. - - - alert: CortexNovaDecisionsWithErrors - expr: cortex_decision_state{domain="nova",state="error"} > 0 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Some decisions are in error state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline generated decisions that are in error state. - This may indicate issues with the decision logic or the underlying infrastructure. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexNovaTooManyDecisionsWaiting - expr: cortex_decision_state{domain="nova",state="waiting"} > 10 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline has a high number of decisions for which - no target host has been assigned yet. - - This may indicate a backlog in processing or issues with the decision logic. - It is recommended to investigate the decision logs and the state of the - VMs being processed. 
- - - alert: CortexNovaKPIUnready - expr: | - cortex_kpi_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. - - - alert: CortexNovaPipelineUnready - expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 - for: 5m - labels: - context: pipelines - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. - - - alert: CortexNovaDoesntFindValidKVMHosts - expr: sum by (az, hvtype) (increase(cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*",faultmsg!~".*No such host.*"}[5m])) > 0 - for: 5m - labels: - context: scheduling - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/scheduling - annotations: - summary: "Nova scheduling cannot find valid KVM hosts" - description: > - Cortex is seeing new faulty vms in `{{$labels.az}}` where Nova scheduling - failed to find a valid `{{$labels.hvtype}}` host. This may indicate - capacity issues, misconfigured filters, or resource constraints in the - datacenter. Investigate the affected VMs and hypervisor availability. 
- - - alert: CortexNovaNewDatasourcesNotReconciling - expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "New datasource `{{$labels.datasource}}` has not reconciled" - description: > - A new datasource `{{$labels.datasource}}` has been added but has not - completed its first reconciliation yet. This may indicate issues with - the datasource controller's workqueue overprioritizing other datasources. - - - alert: CortexNovaExistingDatasourcesLackingBehind - expr: | - sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600 - and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1 - for: 10m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "Existing datasource `{{$labels.datasource}}` is lacking behind" - description: > - An existing datasource `{{$labels.datasource}}` has been queued for - reconciliation for more than 10 minutes. This may indicate issues with - the datasource controller's workqueue or that this or another datasource - is taking an unusually long time to reconcile. 
- - - alert: CortexNovaReconcileErrorsHigh - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-errors - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/reconciles - annotations: - summary: "Controller reconcile error rate >10%" - description: > - More than 10% of controller reconciles are resulting in errors. This may - indicate issues with the controller logic, connectivity problems, or - external factors causing failures. Check the controller logs for error - details and investigate the affected resources. - - - alert: CortexNovaReconcileDurationHigher10Min - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600 - for: 15m - labels: - context: controller-duration - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/reconciles - annotations: - summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})" - description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}" - - - alert: CortexNovaWorkqueueNotDrained - expr: | - sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 - for: 60m - labels: - context: controller-workqueue - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: 
- summary: "Controller {{ $labels.name }}'s backlog is not being drained." - description: > - The workqueue for controller {{ $labels.name }} has a backlog that is - not being drained. This may indicate that the controller is overwhelmed - with work or is stuck on certain resources. Check the controller logs - and the state of the resources it manages for more details. - - - alert: CortexNovaWebhookLatencyHigh - expr: | - histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 - for: 15m - labels: - context: controller-webhook - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} latency is high" - description: > - The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms). - This may indicate performance issues with the webhook server or the logic it executes. - Check the webhook server logs and monitor its resource usage for more insights. - - - alert: CortexNovaWebhookErrorsHigh - expr: | - (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) - / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-webhook - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} is experiencing errors" - description: > - The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes. - This may indicate issues with the webhook logic, connectivity problems, or - external factors causing failures. Check the webhook server logs for error - details and investigate the affected resources. 
- - # Committed Resource Info API - - alert: CortexNovaCommittedResourceInfoUnavailable - expr: | - rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code="503"}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource info API is unavailable" - description: > - The committed resource info API (Limes LIQUID integration) has been returning - 503 Service Unavailable for more than 5 minutes. This typically means the - flavor group knowledge CRD is not ready or missing. Limes cannot discover - available committed resources until the issue is resolved. - - # Committed Resource Change API - - alert: CortexNovaCommittedResourceChangeErrors - expr: | - rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource change API HTTP 5xx errors" - description: > - The committed resource change API (Limes LIQUID integration) is returning - HTTP 5xx errors. This is not expected and indicates an internal problem - processing commitment changes. Limes will retry, but new commitments may - not be fulfilled until the issue is resolved. 
- - - alert: CortexNovaCommittedResourceRejectionRateTooHigh - expr: | - ( - sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected", dry_run="false"}[15m])) - / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) - ) > 0.3 - and on() sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) > 0 - for: 15m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-performance - annotations: - summary: "Committed Resource rejection rate too high ({{ $value | humanizePercentage }})" - description: > - More than 30% of commitment changes have been rejected over the last 15 minutes. - This may indicate insufficient capacity to fulfill new commitments. Rejected - commitments are rolled back. - - - alert: CortexNovaCommittedResourceTimeoutsTooHigh - expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics", dry_run="false"}[10m]) > 0 - for: 1m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-performance - annotations: - summary: "Committed Resource change API timeout detected" - description: > - A commitment change request timed out after the configured deadline. - Timeouts indicate the scheduling pipeline could not place reservations in time. - Affected changes are rolled back. Investigate scheduler performance or reservation backlog. 
- - - alert: CortexNovaCommittedResourceChangeLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics", dry_run="false"}[5m])) by (le)) >= 10 - and on() sum(rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", dry_run="false"}[5m])) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-performance - annotations: - summary: "Committed Resource change API p95 latency >= 10s" - description: > - The committed resource change API p95 latency has reached or exceeded 10 seconds, - approaching the configured watch timeout. Requests close to the timeout are at risk - of being rolled back. Investigate scheduler performance or reservation backlog. - - # Committed Resource Capacity API - - alert: CortexNovaCommittedResourceCapacityErrors - expr: | - rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity - annotations: - summary: "Committed Resource capacity API HTTP 5xx errors" - description: > - The committed resource capacity API (Limes LIQUID integration) is returning - HTTP 5xx errors. This indicates internal problems calculating cluster capacity. - Limes may receive stale or incomplete capacity data. 
- - - alert: CortexNovaCommittedResourceCapacityDroppedToZero - expr: | - (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} == 0) - and on(resource, az) (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} offset 30m > 0) - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity - annotations: - summary: "Committed Resource capacity for {{ $labels.resource }} in {{ $labels.az }} dropped to zero" - description: > - The reported capacity for committed resource {{ $labels.resource }} in - availability zone {{ $labels.az }} has dropped from a positive value to zero. - This may mean hypervisors in that AZ are fully utilized for the corresponding - flavor group and no further committed resources can be placed there. - - # Committed Resource Usage API - - alert: CortexNovaCommittedResourceUsageErrors - expr: | - rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource usage API HTTP 5xx errors" - description: > - The committed resource usage API (Limes LIQUID integration) is returning - HTTP 5xx errors. This indicates internal problems fetching reservation or - Nova server data. Limes may receive stale or incomplete usage data. 
- - # Committed Resource Quota API - - alert: CortexNovaCommittedResourceQuotaErrors - expr: | - rate(cortex_committed_resource_quota_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource quota API HTTP 5xx errors" - description: > - The committed resource quota API (Limes LIQUID integration) is returning - HTTP 5xx errors. This indicates internal problems computing or applying - quota. Limes may not be able to enforce committed resource quotas. diff --git a/helm/bundles/cortex-nova/templates/alerts.yaml b/helm/bundles/cortex-nova/templates/alerts.yaml index d2964e864..6f3fabef2 100644 --- a/helm/bundles/cortex-nova/templates/alerts.yaml +++ b/helm/bundles/cortex-nova/templates/alerts.yaml @@ -1,6 +1,10 @@ # Copyright SAP SE # SPDX-License-Identifier: Apache-2.0 +# NOTE: This file is rendered by Helm, which also evaluates template actions +# inside YAML comments. Escape Prometheus directives by emitting each brace pair +# as a string literal, e.g. {{ "{{" }} $labels.foo {{ "}}" }}, never as bare double braces.
+ {{- if .Values.alerts.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -10,8 +14,615 @@ metadata: type: alerting-rules prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} + groups: + - name: cortex-nova-alerts + rules: + - alert: CortexNovaSchedulingDown + expr: | + up{pod=~"cortex-nova-scheduling-.*"} != 1 or + absent(up{pod=~"cortex-nova-scheduling-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: {{ if .Values.kvm.enabled }}critical{{ else }}warning{{ end }} + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Scheduling for Nova is down" + description: > + The Cortex scheduling service is down. Scheduling requests from Nova will + not be served. This is non-critical for vmware virtual machines, but + blocks kvm virtual machines from being scheduled. Thus, it is + recommended to immediately investigate and resolve the issue. + + - alert: CortexNovaKnowledgeDown + expr: | + up{pod=~"cortex-nova-knowledge-.*"} != 1 or + absent(up{pod=~"cortex-nova-knowledge-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Knowledge for Nova is down" + description: > + The Cortex Knowledge service is down. This is no immediate problem, + since cortex is still able to process requests, + but the quality of the responses may be affected. 
+ + - alert: CortexNovaDeschedulerPipelineErroring + expr: increase(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0  # increase(), not delta(): the _count series is presumably a counter (TODO confirm) + for: 5m + labels: + context: descheduler + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Descheduler pipeline is erroring." + description: > + The Cortex descheduler pipeline is encountering errors during its execution. + This may indicate issues with the descheduling logic or the underlying infrastructure. + It is recommended to investigate the descheduler logs and the state of the VMs being processed. + + - alert: CortexNovaHttpRequest400sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/api-errors + annotations: + summary: "Nova Scheduler HTTP request 400 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 4xx + errors. This is expected when the scheduling request cannot be served + by Cortex. However, it could also indicate that the request format has + changed and Cortex is unable to parse it.
+ + - alert: CortexNovaSchedulingHttpRequest500sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/api-errors + annotations: + summary: "Nova Scheduler HTTP request 500 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 5xx errors. + This is not expected and indicates that Cortex is having some internal problem. + Nova will continue to place new VMs, but the placement will be less desirable. + Thus, no immediate action is needed. + + - alert: CortexNovaHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > {{ .Values.alerts.thresholds.highMemoryMiB }} * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much memory" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than {{ .Values.alerts.thresholds.highMemoryMiB }} MiB of memory. Usually it + should use much less, so there may be a memory leak or other changes + that are causing the memory usage to increase significantly. 
+ + - alert: CortexNovaHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 + for: 5m + labels: + context: cpu + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much CPU" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 50% of a single CPU core. Usually + it should use much less, so there may be a CPU leak or other changes + that are causing the CPU usage to increase significantly. + + - alert: CortexNovaTooManyDBConnectionAttempts + expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 + for: 5m + labels: + context: db + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/database + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often. This may happen + when the database is down or the connection parameters are misconfigured. 
+ + - alert: CortexNovaSyncNotSuccessful + expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 + for: 5m + labels: + context: syncstatus + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` Sync not successful" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` experienced an issue syncing data from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may + happen when the datasource (OpenStack, Prometheus, etc.) is down or + the sync module is misconfigured. No immediate action is needed, since + the sync module will retry the sync operation and the currently synced + data will be kept. However, when this problem persists for a longer + time the service will have a less recent view of the datacenter. + + - alert: CortexNovaSyncObjectsDroppedToZero + expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 + for: 60m + labels: + context: syncobjects + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any new data from `{{ "{{" }} $labels.datasource {{ "}}" }}`" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any objects from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may happen + when the datasource (OpenStack, Prometheus, etc.) is down or the sync + module is misconfigured. No immediate action is needed, since the sync + module will retry the sync operation and the currently synced data will + be kept. 
However, when this problem persists for a longer time the + service will have a less recent view of the datacenter. + + - alert: CortexNovaDatasourceUnready + expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the datasource + connectivity or configuration. It is recommended to investigate the + datasource status and logs for more details. + + - alert: CortexNovaKnowledgeUnready + expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: knowledge + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Knowledge `{{ "{{" }} $labels.knowledge {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the knowledge + configuration. It is recommended to investigate the + knowledge status and logs for more details. + + - alert: CortexNovaDecisionsWithErrors + expr: cortex_decision_state{domain="nova",state="error"} > 0 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Some decisions are in error state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline generated decisions that are in error state. + This may indicate issues with the decision logic or the underlying infrastructure. 
+ It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexNovaTooManyDecisionsWaiting + expr: cortex_decision_state{domain="nova",state="waiting"} > 10 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Too many decisions are in waiting state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline has a high number of decisions for which + no target host has been assigned yet. + + This may indicate a backlog in processing or issues with the decision logic. + It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexNovaKPIUnready + expr: | + cortex_kpi_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: kpis + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "KPI `{{ "{{" }} $labels.kpi {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the KPI + configuration. It is recommended to investigate the + KPI status and logs for more details. + + - alert: CortexNovaPipelineUnready + expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 + for: 5m + labels: + context: pipelines + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Pipeline `{{ "{{" }} $labels.pipeline {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the pipeline + configuration. 
It is recommended to investigate the + pipeline status and logs for more details. + + {{- if .Values.kvm.enabled }} + - alert: CortexNovaDoesntFindValidKVMHosts + expr: sum by (az, hvtype) (increase(cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*",faultmsg!~".*No such host.*"}[5m])) > 0 + for: 5m + labels: + context: scheduling + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/scheduling + annotations: + summary: "Nova scheduling cannot find valid KVM hosts" + description: > + Cortex is seeing new faulty vms in `{{ "{{" }} $labels.az {{ "}}" }}` where Nova scheduling + failed to find a valid `{{ "{{" }} $labels.hvtype {{ "}}" }}` host. This may indicate + capacity issues, misconfigured filters, or resource constraints in the + datacenter. Investigate the affected VMs and hypervisor availability. + {{- end }} + + - alert: CortexNovaNewDatasourcesNotReconciling + expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0 + for: 60m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "New datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` has not reconciled" + description: > + A new datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` has been added but has not + completed its first reconciliation yet. This may indicate issues with + the datasource controller's workqueue overprioritizing other datasources. 
+ + - alert: CortexNovaExistingDatasourcesLackingBehind  # alert name left unchanged; only the summary wording is corrected + expr: | + sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600 + and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1 + for: 10m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "Existing datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is lagging behind" + description: > + An existing datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` has been queued for + reconciliation for more than 10 minutes. This may indicate issues with + the datasource controller's workqueue or that this or another datasource + is taking an unusually long time to reconcile. + + - alert: CortexNovaReconcileErrorsHigh + expr: | + (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m]))) + / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1 + for: 15m + labels: + context: controller-errors + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/reconciles + annotations: + summary: "Controller reconcile error rate >10%" + description: > + More than 10% of controller reconciles are resulting in errors. This may + indicate issues with the controller logic, connectivity problems, or + external factors causing failures. Check the controller logs for error + details and investigate the affected resources.
+ + - alert: CortexNovaReconcileDurationHigher10Min + expr: | + (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) + / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > {{ .Values.alerts.thresholds.reconcileDurationSeconds }} + for: 15m + labels: + context: controller-duration + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/reconciles + annotations: + summary: "Controller reconciliation takes longer than ({{ "{{" }} $value | humanizeDuration {{ "}}" }})" + description: "Reconcile duration higher than {{ .Values.alerts.thresholds.reconcileDurationSeconds }}s while reconciling {{ "{{" }} $labels.controller {{ "}}" }}"  # threshold text is rendered from the same value used in expr + + - alert: CortexNovaWorkqueueNotDrained + expr: | + sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 + for: 60m + labels: + context: controller-workqueue + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "Controller {{ "{{" }} $labels.name {{ "}}" }}'s backlog is not being drained." + description: > + The workqueue for controller {{ "{{" }} $labels.name {{ "}}" }} has a backlog that is + not being drained. This may indicate that the controller is overwhelmed + with work or is stuck on certain resources. Check the controller logs + and the state of the resources it manages for more details.
+ + - alert: CortexNovaWebhookLatencyHigh + expr: | + histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 + for: 15m + labels: + context: controller-webhook + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ "{{" }} $labels.webhook {{ "}}" }} latency is high" + description: > + The latency for webhook {{ "{{" }} $labels.webhook {{ "}}" }} is higher than expected (p90 > 200ms). + This may indicate performance issues with the webhook server or the logic it executes. + Check the webhook server logs and monitor its resource usage for more insights. + + - alert: CortexNovaWebhookErrorsHigh + expr: | + (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) + / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 + for: 15m + labels: + context: controller-webhook + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ "{{" }} $labels.webhook {{ "}}" }} is experiencing errors" + description: > + The webhook {{ "{{" }} $labels.webhook {{ "}}" }} has experienced errors in the last 5 minutes. + This may indicate issues with the webhook logic, connectivity problems, or + external factors causing failures. Check the webhook server logs for error + details and investigate the affected resources. 
+ + # Committed Resource Info API + - alert: CortexNovaCommittedResourceInfoUnavailable + expr: | + rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code="503"}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors + annotations: + summary: "Committed Resource info API is unavailable" + description: > + The committed resource info API (Limes LIQUID integration) has been returning + 503 Service Unavailable for more than 5 minutes. This typically means the + flavor group knowledge CRD is not ready or missing. Limes cannot discover + available committed resources until the issue is resolved. + + # Committed Resource Change API + - alert: CortexNovaCommittedResourceChangeErrors + expr: | + rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors + annotations: + summary: "Committed Resource change API HTTP 5xx errors" + description: > + The committed resource change API (Limes LIQUID integration) is returning + HTTP 5xx errors. This is not expected and indicates an internal problem + processing commitment changes. Limes will retry, but new commitments may + not be fulfilled until the issue is resolved. 
+ + - alert: CortexNovaCommittedResourceRejectionRateTooHigh + expr: | + ( + sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected", dry_run="false"}[15m])) + / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) + ) > 0.3 + and on() sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) > 0 + for: 15m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-performance + annotations: + summary: "Committed Resource rejection rate too high ({{ "{{" }} $value | humanizePercentage {{ "}}" }})" + description: > + More than 30% of commitment changes have been rejected over the last 15 minutes. + This may indicate insufficient capacity to fulfill new commitments. Rejected + commitments are rolled back. + + - alert: CortexNovaCommittedResourceTimeoutsTooHigh + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics", dry_run="false"}[10m]) > 0 + for: 1m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-performance + annotations: + summary: "Committed Resource change API timeout detected" + description: > + A commitment change request timed out after the configured deadline. + Timeouts indicate the scheduling pipeline could not place reservations in time. + Affected changes are rolled back. Investigate scheduler performance or reservation backlog. 
+ + - alert: CortexNovaCommittedResourceChangeLatencyTooHigh + expr: | + histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics", dry_run="false"}[5m])) by (le)) >= 10 + and on() sum(rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", dry_run="false"}[5m])) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-performance + annotations: + summary: "Committed Resource change API p95 latency >= 10s" + description: > + The committed resource change API p95 latency has reached or exceeded 10 seconds, + approaching the configured watch timeout. Requests close to the timeout are at risk + of being rolled back. Investigate scheduler performance or reservation backlog. + + # Committed Resource Capacity API + - alert: CortexNovaCommittedResourceCapacityErrors + expr: | + rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity + annotations: + summary: "Committed Resource capacity API HTTP 5xx errors" + description: > + The committed resource capacity API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems calculating cluster capacity. + Limes may receive stale or incomplete capacity data. 
+ + - alert: CortexNovaCommittedResourceCapacityDroppedToZero + expr: | + (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} == 0) + and on(resource, az) (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} offset 30m > 0) + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity + annotations: + summary: "Committed Resource capacity for {{ "{{" }} $labels.resource {{ "}}" }} in {{ "{{" }} $labels.az {{ "}}" }} dropped to zero" + description: > + The reported capacity for committed resource {{ "{{" }} $labels.resource {{ "}}" }} in + availability zone {{ "{{" }} $labels.az {{ "}}" }} has dropped from a positive value to zero. + This may mean hypervisors in that AZ are fully utilized for the corresponding + flavor group and no further committed resources can be placed there. + + # Committed Resource Usage API + - alert: CortexNovaCommittedResourceUsageErrors + expr: | + rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors + annotations: + summary: "Committed Resource usage API HTTP 5xx errors" + description: > + The committed resource usage API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems fetching reservation or + Nova server data. Limes may receive stale or incomplete usage data. 
+
+    # Committed Resource Quota API: 5xx responses from the quota endpoint
+    - alert: CortexNovaCommittedResourceQuotaErrors
+      expr: |  # matches each series whose 5xx request rate over the last 5m is above zero
+        rate(cortex_committed_resource_quota_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0
+      for: 5m
+      labels:
+        context: committed-resource-api
+        dashboard: cortex-status-dashboard/cortex-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors
+      annotations:
+        summary: "Committed Resource quota API HTTP 5xx errors"
+        description: >
+          The committed resource quota API (Limes LIQUID integration) is returning
+          HTTP 5xx errors. This indicates internal problems computing or applying
+          quota. Limes may not be able to enforce committed resource quotas.
 {{- end }}
diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml
index 4a194ae50..0ac9f49d5 100644
--- a/helm/bundles/cortex-nova/values.yaml
+++ b/helm/bundles/cortex-nova/values.yaml
@@ -17,6 +17,11 @@ owner-info:
 alerts:
   enabled: true
   prometheus: openstack
+  thresholds:  # NOTE(review): presumably read by templates/alerts.yaml when rendering the named alerts - confirm
+    # Memory threshold for CortexNovaHighMemoryUsage in MiB.
+    highMemoryMiB: 6000
+    # Reconcile-duration threshold for CortexNovaReconcileDurationHigher10Min in seconds (default 600 s = the 10 min in the alert name).
+ reconcileDurationSeconds: 600 serviceMonitor: extraLabels: {} diff --git a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml deleted file mode 100644 index e65b944d6..000000000 --- a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml +++ /dev/null @@ -1,179 +0,0 @@ -groups: -- name: cortex-placement-shim-alerts - rules: - # Liveness - - alert: CortexPlacementShimDown - expr: | - up{pod=~"cortex-placement-shim-.*"} != 1 or - absent(up{pod=~"cortex-placement-shim-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-down - annotations: - summary: "Cortex Placement Shim is down" - description: > - The Cortex Placement Shim is down. Placement API requests that are - routed through the shim will not be served. OpenStack services relying - on the shim for resource provider lookups and allocation candidates - will degrade. - - # Downstream HTTP errors (client -> shim) - - alert: CortexPlacementShimDownstreamHttp400sTooHigh - expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim downstream HTTP 4xx errors too high" - description: > - The Placement Shim is responding to client requests with HTTP 4xx - errors at a sustained rate. 
This may indicate that the request format - from OpenStack services has changed, authentication tokens are invalid, - or the shim is rejecting malformed requests. Investigate the shim logs - for details on which endpoints and request patterns are affected. - - - alert: CortexPlacementShimDownstreamHttp500sTooHigh - expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim downstream HTTP 5xx errors too high" - description: > - The Placement Shim is responding to client requests with HTTP 5xx - errors. This indicates internal problems within the shim such as - handler panics or misconfiguration. OpenStack services may experience - degraded placement functionality until the issue is resolved. - - # Upstream HTTP errors (shim -> Placement API) - - alert: CortexPlacementShimUpstreamHttp5xxTooHigh - expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: upstream - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim upstream HTTP 5xx errors too high" - description: > - The upstream Placement API is returning 5xx errors to the shim. - This indicates the OpenStack Placement service itself is having - problems. The shim forwards these errors to its clients. Investigate - the Placement API service health and logs. 
- - - alert: CortexPlacementShimUpstreamUnreachable - expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode="502"}[5m]) > 0.1 - for: 5m - labels: - context: upstream - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim cannot reach the upstream Placement API" - description: > - The Placement Shim is unable to reach the upstream OpenStack Placement - API and is returning 502 Bad Gateway errors. This means all forwarded - requests are failing. Check network connectivity, the Placement API - service endpoint configuration, and whether the upstream service is - running. - - # Latency alerts - - alert: CortexPlacementShimDownstreamLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_placement_shim_downstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10 - and on() sum(rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0 - for: 5m - labels: - context: api - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim downstream latency too high" - description: > - The Placement Shim downstream request latency (p95) exceeds 10 - seconds. This affects all OpenStack services making placement - requests through the shim. The cause may be slow upstream responses, - shim processing overhead, or resource contention. Investigate both - shim and upstream Placement API performance. 
- - - alert: CortexPlacementShimUpstreamLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_placement_shim_upstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10 - and on() sum(rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0 - for: 5m - labels: - context: upstream - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim upstream latency too high" - description: > - The upstream Placement API response latency (p95) as seen by the - shim exceeds 10 seconds. This directly impacts the end-to-end - latency of placement requests. Investigate the upstream Placement - API performance and network conditions. - - # Resource usage - - alert: CortexPlacementShimHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-placement-shim-metrics-service"} > 1500 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-resource-usage - annotations: - summary: "Placement Shim uses too much memory" - description: > - The Placement Shim is using more than 1500 MiB of resident memory - against a limit of 2048 MiB. This may indicate a memory leak, a - large number of cached hypervisors, or unexpected request patterns. - If the usage continues to grow, the pod will be OOM-killed. 
-
-  - alert: CortexPlacementShimHighCPUUsage
-    expr: rate(process_cpu_seconds_total{service="cortex-placement-shim-metrics-service"}[1m]) > 0.4
-    for: 5m
-    labels:
-      context: cpu
-      dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
-      service: cortex
-      severity: warning
-      support_group: workload-management
-      playbook: docs/support/playbook/cortex/alerts/shim-resource-usage
-    annotations:
-      summary: "Placement Shim uses too much CPU"
-      description: >
-        The Placement Shim is consuming more than 40% of a single CPU core
-        against a limit of 500m. Under normal operation the shim should use
-        much less since it primarily proxies requests. This may indicate a
-        hot loop, excessive logging, or an unusual traffic spike.
-
diff --git a/helm/bundles/cortex-placement-shim/templates/alerts.yaml b/helm/bundles/cortex-placement-shim/templates/alerts.yaml
index 7db3b96e6..c570ccd91 100644
--- a/helm/bundles/cortex-placement-shim/templates/alerts.yaml
+++ b/helm/bundles/cortex-placement-shim/templates/alerts.yaml
@@ -10,8 +10,182 @@ metadata:
     type: alerting-rules
     prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }}
 spec:
-  {{- $files := .Files.Glob "alerts/*.alerts.yaml" }}
-  {{- range $path, $file := $files }}
-  {{ $file | toString | nindent 2 }}
-  {{- end }}
+  groups:
+  - name: cortex-placement-shim-alerts
+    rules:
+    # Liveness: a matching `up` series is not 1, or no matching `up` series exists at all
+    - alert: CortexPlacementShimDown
+      expr: |
+        up{pod=~"cortex-placement-shim-.*"} != 1 or
+        absent(up{pod=~"cortex-placement-shim-.*"})
+      for: 5m  # the expression must stay true for 5 minutes before the alert fires
+      labels:
+        context: liveness
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-down
+      annotations:
+        summary: "Cortex Placement Shim is down"
+        description: >
+          The Cortex Placement Shim is down. Placement API requests that are
+          routed through the shim will not be served. OpenStack services relying
+          on the shim for resource provider lookups and allocation candidates
+          will degrade.
+
+    # Downstream HTTP errors (client -> shim): per-series 4xx / 5xx response rate above 0.1 per second, held for 5m
+    - alert: CortexPlacementShimDownstreamHttp400sTooHigh
+      expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"4.."}[5m]) > 0.1
+      for: 5m
+      labels:
+        context: api
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim downstream HTTP 4xx errors too high"
+        description: >
+          The Placement Shim is responding to client requests with HTTP 4xx
+          errors at a sustained rate. This may indicate that the request format
+          from OpenStack services has changed, authentication tokens are invalid,
+          or the shim is rejecting malformed requests. Investigate the shim logs
+          for details on which endpoints and request patterns are affected.
+
+    - alert: CortexPlacementShimDownstreamHttp500sTooHigh
+      expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1
+      for: 5m
+      labels:
+        context: api
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim downstream HTTP 5xx errors too high"
+        description: >
+          The Placement Shim is responding to client requests with HTTP 5xx
+          errors. This indicates internal problems within the shim such as
+          handler panics or misconfiguration. OpenStack services may experience
+          degraded placement functionality until the issue is resolved.
+
+    # Upstream HTTP errors (shim -> Placement API); the "5.." matcher also covers 502, so both alerts below can fire together
+    - alert: CortexPlacementShimUpstreamHttp5xxTooHigh
+      expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1
+      for: 5m
+      labels:
+        context: upstream
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim upstream HTTP 5xx errors too high"
+        description: >
+          The upstream Placement API is returning 5xx errors to the shim.
+          This indicates the OpenStack Placement service itself is having
+          problems. The shim forwards these errors to its clients. Investigate
+          the Placement API service health and logs.
+
+    - alert: CortexPlacementShimUpstreamUnreachable
+      expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode="502"}[5m]) > 0.1
+      for: 5m
+      labels:
+        context: upstream
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim cannot reach the upstream Placement API"
+        description: >
+          The Placement Shim is unable to reach the upstream OpenStack Placement
+          API and is returning 502 Bad Gateway errors. This means all forwarded
+          requests are failing. Check network connectivity, the Placement API
+          service endpoint configuration, and whether the upstream service is
+          running.
+
+    # Latency alerts: p95 over a 5m rate window above 10 seconds; `and on()` limits them to windows that saw requests
+    - alert: CortexPlacementShimDownstreamLatencyTooHigh
+      expr: |
+        histogram_quantile(0.95, sum(rate(cortex_placement_shim_downstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10
+        and on() sum(rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0
+      for: 5m
+      labels:
+        context: api
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim downstream latency too high"
+        description: >
+          The Placement Shim downstream request latency (p95) exceeds 10
+          seconds. This affects all OpenStack services making placement
+          requests through the shim. The cause may be slow upstream responses,
+          shim processing overhead, or resource contention. Investigate both
+          shim and upstream Placement API performance.
+
+    - alert: CortexPlacementShimUpstreamLatencyTooHigh
+      expr: |
+        histogram_quantile(0.95, sum(rate(cortex_placement_shim_upstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10
+        and on() sum(rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0
+      for: 5m
+      labels:
+        context: upstream
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-api-errors
+      annotations:
+        summary: "Placement Shim upstream latency too high"
+        description: >
+          The upstream Placement API response latency (p95) as seen by the
+          shim exceeds 10 seconds. This directly impacts the end-to-end
+          latency of placement requests. Investigate the upstream Placement
+          API performance and network conditions.
+
+    # Resource usage: resident memory above 1500 * 1024 * 1024 bytes (1500 MiB), or CPU above 0.4 CPU-seconds per second
+    - alert: CortexPlacementShimHighMemoryUsage
+      expr: process_resident_memory_bytes{service="cortex-placement-shim-metrics-service"} > 1500 * 1024 * 1024
+      for: 5m
+      labels:
+        context: memory
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-resource-usage
+      annotations:  # NOTE(review): the 2048 MiB limit quoted below is not defined in this file - confirm against the pod resources
+        summary: "Placement Shim uses too much memory"
+        description: >
+          The Placement Shim is using more than 1500 MiB of resident memory
+          against a limit of 2048 MiB. This may indicate a memory leak, a
+          large number of cached hypervisors, or unexpected request patterns.
+          If the usage continues to grow, the pod will be OOM-killed.
+
+    - alert: CortexPlacementShimHighCPUUsage
+      expr: rate(process_cpu_seconds_total{service="cortex-placement-shim-metrics-service"}[1m]) > 0.4
+      for: 5m
+      labels:
+        context: cpu
+        dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard
+        service: cortex
+        severity: warning
+        support_group: workload-management
+        playbook: docs/support/playbook/cortex/alerts/shim-resource-usage
+      annotations:  # NOTE(review): the 500m limit quoted below is not defined in this file - confirm against the pod resources
+        summary: "Placement Shim uses too much CPU"
+        description: >
+          The Placement Shim is consuming more than 40% of a single CPU core
+          against a limit of 500m. Under normal operation the shim should use
+          much less since it primarily proxies requests. This may indicate a
+          hot loop, excessive logging, or an unusual traffic spike.
 {{- end }}