22"""
33Fetches alert history for Charon Kurtosis alerts from Grafana.
44Requires OBOL_GRAFANA_API_TOKEN environment variable.
5- Usage: python kurtosis_alerts.py --from <start> --to <end> [--expected-clusters N]
5+ Usage: python kurtosis_alerts.py --from <start> --to <end> [--lookback <seconds>] [--expected-clusters N]
66 --from: Start time (ISO 8601 e.g. 2024-03-01T00:00:00Z, or epoch seconds)
7- --to: End time (ISO 8601 e.g. 2024-03-02T00:00:00Z, or epoch seconds)
7+ --to: End time (ISO 8601 or epoch seconds)
8+ --lookback: Seconds before --from to also fetch annotations from (default: 0)
89 --expected-clusters: Expected number of clusters (queries Prometheus to verify coverage)
910
1011Outputs a structured report to stdout.
@@ -224,31 +225,90 @@ def query_metrics_clusters(
224225
225226
_FIRING_STATES = {"alerting", "firing"}
# Prefix form of _FIRING_STATES so that reason-suffixed variants such as
# "Alerting (NoData)" or "Alerting (Error)" are recognised as firing, mirroring
# how normal states are matched below.
_FIRING_STATE_PREFIXES = tuple(sorted(_FIRING_STATES))
_NORMAL_STATE_PREFIXES = ("normal", "ok", "inactive")


def _is_normal_state(state: str) -> bool:
    """Return True for any normal/resolved state, including variants like 'Normal (MissingSeries)'.

    Matching is case-insensitive and prefix-based.
    """
    return state.lower().startswith(_NORMAL_STATE_PREFIXES)


def _is_firing_state(state: str) -> bool:
    """Return True for any firing state, including variants like 'Alerting (NoData)'.

    Matching is case-insensitive and prefix-based.
    """
    return state.lower().startswith(_FIRING_STATE_PREFIXES)


def _cluster_key(entry: dict) -> tuple[str, str, str]:
    """Return the (alert_name, cluster_name, cluster_hash) identity of an entry.

    Missing cluster labels default to the empty string.
    """
    labels = entry["labels"]
    return (
        entry["alert_name"],
        labels.get("cluster_name", ""),
        labels.get("cluster_hash", ""),
    )


def compute_firing(entries: list[dict], from_ms: int) -> list[dict]:
    """Compute which alerts were firing during the window [from_ms, ...].

    Entries with a timestamp before from_ms are lookback data, used only to
    reconstruct each alert's state at window start. An alert counts as firing
    during the window if:
      - It transitioned into a firing state within the window, OR
      - It was already firing at window start (its latest lookback entry is
        firing and has no time_end before from_ms) and no in-window entry
        shows it returning to a normal state.

    Args:
        entries: Alert entries as dicts with "alert_name", "timestamp",
            "labels" and, optionally, "state" and "time_end".
        from_ms: Window start; compared directly against entry timestamps.

    Returns:
        The in-window firing entries (in input order), followed by one
        synthetic "alerting" entry, stamped at from_ms, per carried-over alert.
    """
    # The reconstruction below is last-writer-wins per key, so process lookback
    # entries chronologically even if the caller did not pre-sort them.
    pre_window = sorted(
        (e for e in entries if e["timestamp"] < from_ms),
        key=lambda e: e["timestamp"],
    )
    in_window = [e for e in entries if e["timestamp"] >= from_ms]

    # Reconstruct state at window start from lookback annotations.
    pre_window_state: dict[tuple[str, str, str], str] = {}
    for e in pre_window:
        ts = e["timestamp"]
        # A missing or None time_end is treated like 0 (no end recorded).
        time_end = e.get("time_end") or 0
        # time_end == ts is treated as "still firing"; only an end strictly
        # after the start and before the window counts as resolved.
        resolved_before_window = ts < time_end < from_ms
        state = "normal" if resolved_before_window else (e.get("state") or "").lower()
        pre_window_state[_cluster_key(e)] = state

    # Classify in-window entries: collect firings and note which keys resolved.
    resolved_in_window: set[tuple[str, str, str]] = set()
    fired_in_window: list[dict] = []
    for e in in_window:
        state = (e.get("state") or "").lower()
        if _is_firing_state(state):
            fired_in_window.append(e)
        elif _is_normal_state(state):
            resolved_in_window.add(_cluster_key(e))

    # Synthetic entries for alerts that were firing before the window and never
    # resolved during it, so the report can display them.
    carried_entries = [
        {
            "alert_name": alert_name,
            "state": "alerting",
            "previous_state": "",
            "timestamp": from_ms,
            "time_end": from_ms,
            "labels": {"cluster_name": cluster_name, "cluster_hash": cluster_hash},
        }
        for (alert_name, cluster_name, cluster_hash), state in pre_window_state.items()
        if _is_firing_state(state)
        and (alert_name, cluster_name, cluster_hash) not in resolved_in_window
    ]

    return fired_in_window + carried_entries
239300
240301
241302def print_report (
242303 entries : list [dict ],
243304 from_ms : int ,
244305 to_ms : int ,
245306 rules : list [dict ],
307+ firing : list [dict ],
246308 metrics_clusters : list [tuple [str , str ]] | None = None ,
247309 expected_clusters : int = 0 ,
248310):
249311 """Print a structured human-readable report."""
250- firing = [e for e in entries if is_firing (e )]
251-
252312 # === Section A: Input Parameters ===
253313 print ("=== Input Parameters ===" )
254314 print (f"From: { ms_to_human (from_ms )} " )
@@ -263,7 +323,8 @@ def print_report(
263323 print ()
264324
265325 # === Section C: Clusters Observed ===
266- clusters = {} # (cluster_name, cluster_hash) -> set
326+ # Include clusters from all entries (including lookback) that appeared in any annotation
327+ clusters : dict [tuple , bool ] = {}
267328 for e in entries :
268329 name = e ["labels" ].get ("cluster_name" , "" )
269330 h = e ["labels" ].get ("cluster_hash" , "" )
@@ -307,7 +368,7 @@ def print_report(
307368 return
308369
309370 # Group by alert name
310- by_alert = defaultdict (list )
371+ by_alert : dict [ str , list ] = defaultdict (list )
311372 for e in firing :
312373 by_alert [e ["alert_name" ]].append (e )
313374
@@ -316,7 +377,7 @@ def print_report(
316377 print (f"--- { alert_name } ({ len (alert_entries )} occurrences) ---" )
317378
318379 # Group by cluster within this alert
319- by_cluster = defaultdict (int )
380+ by_cluster : dict [ tuple , int ] = defaultdict (int )
320381 for e in alert_entries :
321382 name = e ["labels" ].get ("cluster_name" , "(unknown)" )
322383 h = e ["labels" ].get ("cluster_hash" , "(unknown)" )
@@ -347,6 +408,10 @@ def main():
347408 "--to" , dest = "time_to" , required = True ,
348409 help = "End time (ISO 8601 or epoch seconds)" ,
349410 )
411+ parser .add_argument (
412+ "--lookback" , type = int , default = 0 ,
413+ help = "Seconds before --from to fetch annotations from, to catch alerts that started firing before the window (default: 0)" ,
414+ )
350415 parser .add_argument (
351416 "--expected-clusters" , type = int , default = 0 ,
352417 help = "Expected number of clusters (0 to skip metrics check)" ,
@@ -355,6 +420,7 @@ def main():
355420
356421 from_ms = parse_timestamp (args .time_from )
357422 to_ms = parse_timestamp (args .time_to )
423+ fetch_from_ms = from_ms - args .lookback * 1000
358424
359425 headers = get_auth_header ()
360426 if not headers :
@@ -375,10 +441,10 @@ def main():
375441 print ("=== No data available ===" )
376442 sys .exit (0 )
377443
378- # Fetch annotations and filter by rule titles
444+ # Fetch annotations (including lookback period) and filter by rule titles
379445 rule_titles = {r ["title" ] for r in rules }
380446
381- annotations = fetch_annotations (headers , from_ms , to_ms )
447+ annotations = fetch_annotations (headers , fetch_from_ms , to_ms )
382448 matching = [
383449 a for a in annotations
384450 if a .get ("alertName" ) in rule_titles
@@ -389,9 +455,17 @@ def main():
389455 entries = [format_alert_entry (a ) for a in matching ]
390456 entries .sort (key = lambda e : e ["timestamp" ])
391457
392- firing = [e for e in entries if is_firing (e )]
393- print (f"=== Alert Summary ===" )
394- print (f" { len (annotations )} alert annotations found, { len (matching )} matched monitored rules." )
458+ firing = compute_firing (entries , from_ms )
459+
460+ # Count only in-window annotations for the summary line
461+ in_window_annotations = [a for a in annotations if a .get ("time" , 0 ) >= from_ms ]
462+ in_window_matching = [
463+ a for a in in_window_annotations
464+ if a .get ("alertName" ) in rule_titles
465+ and _has_folder_tag (a , TARGET_FOLDER )
466+ ]
467+ print ("=== Alert Summary ===" )
468+ print (f" { len (in_window_annotations )} alert annotations found, { len (in_window_matching )} matched monitored rules." )
395469 print (f" { len (firing )} alerts were firing during this time window." )
396470 print ()
397471
@@ -406,17 +480,16 @@ def main():
406480
407481 # Print report
408482 print_report (
409- entries , from_ms , to_ms , rules ,
483+ entries , from_ms , to_ms , rules , firing ,
410484 metrics_clusters , args .expected_clusters ,
411485 )
412486
413487 # Exit code based on firing alerts and metrics coverage
414- has_firing = any (is_firing (e ) for e in entries )
415488 metrics_incomplete = (
416489 args .expected_clusters > 0
417490 and (metrics_clusters is None or len (metrics_clusters ) < args .expected_clusters )
418491 )
419- sys .exit (1 if has_firing or metrics_incomplete else 0 )
492+ sys .exit (1 if firing or metrics_incomplete else 0 )
420493
421494
422495if __name__ == "__main__" :
0 commit comments