Skip to content

Commit 55af340

Browse files
authored
scripts: fixed kurtosis alerts script (#4458)
* scripts: fixed kurtosis alerts script
1 parent af1e3cd commit 55af340

2 files changed

Lines changed: 99 additions & 24 deletions

File tree

.github/workflows/kurtosis-smoke-test.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,14 +130,16 @@ jobs:
130130
KURTOSIS_INSTANCE_TYPE: ${{ inputs.instance_type || 'c6a.4xlarge' }}
131131
EXPECTED_CLUSTERS: ${{ steps.deploy.outputs.cluster_count }}
132132
run: |
133-
ALERT_FROM=$((DEPLOY_START + 20 * 60))
133+
WARMUP_SECONDS=$((20 * 60))
134+
ALERT_FROM=$((DEPLOY_START + WARMUP_SECONDS))
134135
ALERT_TO=$((DEPLOY_START + (LIFETIME_MINUTES - 5) * 60))
135136
136137
echo "Alert window: $(date -u -d @${ALERT_FROM} +%Y-%m-%dT%H:%M:%SZ) to $(date -u -d @${ALERT_TO} +%Y-%m-%dT%H:%M:%SZ)"
137138
138139
set +e
139140
python scripts/debug/kurtosis_alerts.py \
140141
--from "$ALERT_FROM" --to "$ALERT_TO" \
142+
--lookback "$WARMUP_SECONDS" \
141143
--expected-clusters "$EXPECTED_CLUSTERS" | tee report.txt
142144
EXIT_CODE=${PIPESTATUS[0]}
143145
set -e

scripts/debug/kurtosis_alerts.py

Lines changed: 96 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
"""
33
Fetches alert history for Charon Kurtosis alerts from Grafana.
44
Requires OBOL_GRAFANA_API_TOKEN environment variable.
5-
Usage: python kurtosis_alerts.py --from <start> --to <end> [--expected-clusters N]
5+
Usage: python kurtosis_alerts.py --from <start> --to <end> [--lookback <seconds>] [--expected-clusters N]
66
--from: Start time (ISO 8601 e.g. 2024-03-01T00:00:00Z, or epoch seconds)
7-
--to: End time (ISO 8601 e.g. 2024-03-02T00:00:00Z, or epoch seconds)
7+
--to: End time (ISO 8601 or epoch seconds)
8+
--lookback: Seconds before --from to also fetch annotations from (default: 0)
89
--expected-clusters: Expected number of clusters (queries Prometheus to verify coverage)
910
1011
Outputs a structured report to stdout.
@@ -224,31 +225,90 @@ def query_metrics_clusters(
224225

225226

226227
_FIRING_STATES = {"alerting", "firing"}
228+
_NORMAL_STATE_PREFIXES = ("normal", "ok", "inactive")
227229

228230

229-
def is_firing(entry: dict) -> bool:
230-
"""Check if an alert entry indicates the alert was firing during the window.
231+
def _is_normal_state(state: str) -> bool:
    """Return True if *state* is a Grafana normal/resolved state.

    The match is a case-insensitive prefix test against
    ``_NORMAL_STATE_PREFIXES``, so decorated variants such as
    'Normal (MissingSeries)' are recognised whether or not the caller has
    already lowercased the value.
    """
    # Lowercase here rather than relying on every caller to do it: the
    # prefixes are stored lowercase, while Grafana reports capitalised states.
    return state.lower().startswith(_NORMAL_STATE_PREFIXES)
231234

232-
An alert was firing if it transitioned INTO a firing state (newState is firing)
233-
or transitioned OUT of a firing state (prevState is firing, meaning it was
234-
firing before it resolved).
235+
236+
def _cluster_key(entry: dict) -> tuple[str, str, str]:
    """Return the identity key used to track one alert on one cluster.

    The key is ``(alert_name, cluster_name, cluster_hash)``. A missing or
    ``None`` "labels" mapping, or a missing label, yields an empty string
    for that component instead of raising.
    """
    # `or {}` covers both an absent "labels" key and an explicit None value.
    labels = entry.get("labels") or {}
    return (
        entry["alert_name"],
        labels.get("cluster_name", ""),
        labels.get("cluster_hash", ""),
    )
242+
243+
244+
def compute_firing(entries: list[dict], from_ms: int) -> list[dict]:
    """Compute which alerts were firing during the window starting at from_ms.

    Entries whose "timestamp" is before from_ms are lookback data, used only
    to reconstruct each alert's state at window start. An alert counts as
    firing during the window if:
      - it has an entry with a firing state inside the window, OR
      - it was already firing at window start (its last lookback entry is in
        a firing state and has no recorded end before from_ms) and no
        normal-state entry for it appears inside the window.

    Args:
        entries: Alert entries sorted by ascending "timestamp". Each entry
            needs "alert_name", "state", "timestamp" and "labels"; "time_end"
            is optional. Timestamps are compared directly against from_ms,
            so they must use the same unit.
        from_ms: Start of the reporting window.

    Returns:
        The in-window firing entries (in input order), followed by one
        synthetic "alerting" entry per carried-over alert, stamped at from_ms
        so the report can display it.
    """
    pre_window = [e for e in entries if e["timestamp"] < from_ms]
    in_window = [e for e in entries if e["timestamp"] >= from_ms]

    # Reconstruct the state at window start. Entries are sorted, so the last
    # lookback entry seen for a key wins.
    pre_window_state: dict[tuple, str] = {}
    for e in pre_window:
        # `or 0` guards against a present-but-null time_end, which would
        # otherwise raise on comparison.
        time_end = e.get("time_end") or 0
        ts = e["timestamp"]
        # time_end == ts means Grafana set timeEnd to the evaluation time
        # (still firing). Only a time_end strictly after ts and before the
        # window start is an actual recorded resolution.
        resolved_before_window = ts < time_end < from_ms
        state = "normal" if resolved_before_window else e["state"].lower()
        pre_window_state[_cluster_key(e)] = state

    # Classify in-window entries: firing ones are reported as-is, normal
    # ones mark their key as having recovered during the window.
    resolved_in_window: set[tuple] = set()
    fired_in_window: list[dict] = []
    for e in in_window:
        state = e["state"].lower()
        if state in _FIRING_STATES:
            fired_in_window.append(e)
        elif _is_normal_state(state):
            resolved_in_window.add(_cluster_key(e))

    # Alerts that were firing before the window and never resolved during it.
    carried_entries = [
        {
            "alert_name": alert_name,
            "state": "alerting",
            "previous_state": "",
            "timestamp": from_ms,
            "time_end": from_ms,
            "labels": {"cluster_name": cluster_name, "cluster_hash": cluster_hash},
        }
        for (alert_name, cluster_name, cluster_hash), state in pre_window_state.items()
        if state in _FIRING_STATES
        and (alert_name, cluster_name, cluster_hash) not in resolved_in_window
    ]

    return fired_in_window + carried_entries
239300

240301

241302
def print_report(
242303
entries: list[dict],
243304
from_ms: int,
244305
to_ms: int,
245306
rules: list[dict],
307+
firing: list[dict],
246308
metrics_clusters: list[tuple[str, str]] | None = None,
247309
expected_clusters: int = 0,
248310
):
249311
"""Print a structured human-readable report."""
250-
firing = [e for e in entries if is_firing(e)]
251-
252312
# === Section A: Input Parameters ===
253313
print("=== Input Parameters ===")
254314
print(f"From: {ms_to_human(from_ms)}")
@@ -263,7 +323,8 @@ def print_report(
263323
print()
264324

265325
# === Section C: Clusters Observed ===
266-
clusters = {} # (cluster_name, cluster_hash) -> set
326+
# Include clusters from all entries (including lookback) that appeared in any annotation
327+
clusters: dict[tuple, bool] = {}
267328
for e in entries:
268329
name = e["labels"].get("cluster_name", "")
269330
h = e["labels"].get("cluster_hash", "")
@@ -307,7 +368,7 @@ def print_report(
307368
return
308369

309370
# Group by alert name
310-
by_alert = defaultdict(list)
371+
by_alert: dict[str, list] = defaultdict(list)
311372
for e in firing:
312373
by_alert[e["alert_name"]].append(e)
313374

@@ -316,7 +377,7 @@ def print_report(
316377
print(f"--- {alert_name} ({len(alert_entries)} occurrences) ---")
317378

318379
# Group by cluster within this alert
319-
by_cluster = defaultdict(int)
380+
by_cluster: dict[tuple, int] = defaultdict(int)
320381
for e in alert_entries:
321382
name = e["labels"].get("cluster_name", "(unknown)")
322383
h = e["labels"].get("cluster_hash", "(unknown)")
@@ -347,6 +408,10 @@ def main():
347408
"--to", dest="time_to", required=True,
348409
help="End time (ISO 8601 or epoch seconds)",
349410
)
411+
parser.add_argument(
412+
"--lookback", type=int, default=0,
413+
help="Seconds before --from to fetch annotations from, to catch alerts that started firing before the window (default: 0)",
414+
)
350415
parser.add_argument(
351416
"--expected-clusters", type=int, default=0,
352417
help="Expected number of clusters (0 to skip metrics check)",
@@ -355,6 +420,7 @@ def main():
355420

356421
from_ms = parse_timestamp(args.time_from)
357422
to_ms = parse_timestamp(args.time_to)
423+
fetch_from_ms = from_ms - args.lookback * 1000
358424

359425
headers = get_auth_header()
360426
if not headers:
@@ -375,10 +441,10 @@ def main():
375441
print("=== No data available ===")
376442
sys.exit(0)
377443

378-
# Fetch annotations and filter by rule titles
444+
# Fetch annotations (including lookback period) and filter by rule titles
379445
rule_titles = {r["title"] for r in rules}
380446

381-
annotations = fetch_annotations(headers, from_ms, to_ms)
447+
annotations = fetch_annotations(headers, fetch_from_ms, to_ms)
382448
matching = [
383449
a for a in annotations
384450
if a.get("alertName") in rule_titles
@@ -389,9 +455,17 @@ def main():
389455
entries = [format_alert_entry(a) for a in matching]
390456
entries.sort(key=lambda e: e["timestamp"])
391457

392-
firing = [e for e in entries if is_firing(e)]
393-
print(f"=== Alert Summary ===")
394-
print(f" {len(annotations)} alert annotations found, {len(matching)} matched monitored rules.")
458+
firing = compute_firing(entries, from_ms)
459+
460+
# Count only in-window annotations for the summary line
461+
in_window_annotations = [a for a in annotations if a.get("time", 0) >= from_ms]
462+
in_window_matching = [
463+
a for a in in_window_annotations
464+
if a.get("alertName") in rule_titles
465+
and _has_folder_tag(a, TARGET_FOLDER)
466+
]
467+
print("=== Alert Summary ===")
468+
print(f" {len(in_window_annotations)} alert annotations found, {len(in_window_matching)} matched monitored rules.")
395469
print(f" {len(firing)} alerts were firing during this time window.")
396470
print()
397471

@@ -406,17 +480,16 @@ def main():
406480

407481
# Print report
408482
print_report(
409-
entries, from_ms, to_ms, rules,
483+
entries, from_ms, to_ms, rules, firing,
410484
metrics_clusters, args.expected_clusters,
411485
)
412486

413487
# Exit code based on firing alerts and metrics coverage
414-
has_firing = any(is_firing(e) for e in entries)
415488
metrics_incomplete = (
416489
args.expected_clusters > 0
417490
and (metrics_clusters is None or len(metrics_clusters) < args.expected_clusters)
418491
)
419-
sys.exit(1 if has_firing or metrics_incomplete else 0)
492+
sys.exit(1 if firing or metrics_incomplete else 0)
420493

421494

422495
if __name__ == "__main__":

0 commit comments

Comments
 (0)