diff --git a/.cursor/skills/analyze-bot-failures/SKILL.md b/.cursor/skills/analyze-bot-failures/SKILL.md new file mode 100644 index 00000000..5aad91dc --- /dev/null +++ b/.cursor/skills/analyze-bot-failures/SKILL.md @@ -0,0 +1,8 @@ +--- +name: analyze-bot-failures +description: Analyze failures from the Metaculus bot forecasting GitHub Actions workflow. Pulls failed job logs, aggregates failure reasons, and investigates whether real bugs need fixing. Use when the user asks why bot runs/workflows are failing, wants a summary of forecasting errors, or mentions analyzing bot failure logs from GitHub Actions. +--- + +# Analyze Bot Forecasting Failures + +The full skill lives with its script in the repo. Read `scripts/skills/analyze_bot_failures/SKILL.md` and follow its instructions. diff --git a/code_tests/unit_tests/test_analyze_bot_run_failures.py b/code_tests/unit_tests/test_analyze_bot_run_failures.py new file mode 100644 index 00000000..3c03402e --- /dev/null +++ b/code_tests/unit_tests/test_analyze_bot_run_failures.py @@ -0,0 +1,89 @@ +from scripts.skills.analyze_bot_failures.analyze_bot_run_failures import ( + group_failures, + normalize_message_to_signature, + parse_failures_from_log, +) + +GH_PREFIX = "2026-06-11T17:13:45.1234567Z " + +TRACEBACK_LOG = ( + f"{GH_PREFIX}2026-06-11 17:13:40,100 - root - INFO - Running bot METAC_GPT_5_5_HIGH with 3 questions\n" + f"{GH_PREFIX}2026-06-11 17:13:45,123 - forecasting_tools.forecast_bots.forecast_bot - ERROR - Exception occurred during forecasting:\n" + f"{GH_PREFIX}Traceback (most recent call last):\n" + f'{GH_PREFIX} File "/home/runner/work/forecasting-tools/forecasting-tools/forecasting_tools/forecast_bots/forecast_bot.py", line 341, in _run_individual_question_with_error_propagation\n' + f"{GH_PREFIX} return await self._run_individual_question(question)\n" + f'{GH_PREFIX} File "/home/runner/work/forecasting-tools/forecasting-tools/.venv/lib/python3.11/site-packages/litellm/main.py", line 99, in completion\n' + f"{GH_PREFIX} raise error\n" + f"{GH_PREFIX}RuntimeError: Error while processing question url: 'https://www.metaculus.com/questions/12345/': Rate limit hit for model gpt-5.5\n" + f"{GH_PREFIX}2026-06-11 17:14:00,500 - root - INFO - done\n" +) + +SHORT_SUMMARY_ONLY_LOG = f"{GH_PREFIX}2026-06-11 17:13:45,123 - root - INFO - ❌ Exception: ValueError | Message: Error while processing question url: 'https://www.metaculus.com/questions/678/': bad probability\n" + +UNPARSABLE_LOG = ( + f"{GH_PREFIX}2026-06-11 17:13:45,123 - root - INFO - starting up\n" + f"{GH_PREFIX}The operation was canceled.\n" +) + + +def test_parses_traceback_failure_with_question_url_and_repo_frame() -> None: + events = parse_failures_from_log( + TRACEBACK_LOG, + "METAC_GPT_5_5_HIGH", + 111, + "bot_gpt_5_5_high / run_bot", + "http://job-url", + ) + assert len(events) == 1 + event = events[0] + assert event.exception_type == "RuntimeError" + assert "Rate limit hit" in event.message + assert event.question_url == "https://www.metaculus.com/questions/12345/" + assert event.deepest_repo_frame is not None + assert ( + event.deepest_repo_frame.file_path + == "forecasting_tools/forecast_bots/forecast_bot.py" + ) + assert event.deepest_repo_frame.line_number == 341 + assert "Traceback (most recent call last):" in (event.traceback_text or "") + + +def test_falls_back_to_short_summary_lines() -> None: + events = parse_failures_from_log( + SHORT_SUMMARY_ONLY_LOG, "BOT_A", 222, "bot_a / run_bot", "http://job-url" + ) + assert len(events) == 1 + assert events[0].exception_type == "ValueError" + assert events[0].question_url == "https://www.metaculus.com/questions/678/" + + +def test_falls_back_to_log_tail_when_nothing_parseable() -> None: + events = parse_failures_from_log( + UNPARSABLE_LOG, "BOT_B", 333, "bot_b / run_bot", "http://job-url" + ) + assert len(events) == 1 + assert events[0].exception_type == "UnparsedFailure" + assert "The operation was canceled." in (events[0].traceback_text or "") + + +def test_grouping_normalizes_urls_and_numbers() -> None: + signature_one = normalize_message_to_signature( + "RuntimeError", + "Error while processing question url: 'https://www.metaculus.com/questions/12345/': Rate limit hit after 30 seconds", + ) + signature_two = normalize_message_to_signature( + "RuntimeError", + "Error while processing question url: 'https://www.metaculus.com/questions/99999/': Rate limit hit after 61 seconds", + ) + assert signature_one == signature_two + + events_one = parse_failures_from_log( + TRACEBACK_LOG, "BOT_A", 111, "bot_a / run_bot", "http://job-a" + ) + events_two = parse_failures_from_log( + TRACEBACK_LOG, "BOT_B", 112, "bot_b / run_bot", "http://job-b" + ) + groups = group_failures(events_one + events_two) + assert len(groups) == 1 + assert len(groups[0].events) == 2 + assert {event.bot_name for event in groups[0].events} == {"BOT_A", "BOT_B"} diff --git a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_summer.py b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_summer.py index 4e44432c..2e4eb662 100644 --- a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_summer.py +++ b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_summer.py @@ -228,11 +228,18 @@ async def _binary_prompt_to_forecast( ) -> ReasonedPrediction[float]: reasoning = await self.get_llm("default", "llm").invoke(prompt) logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") + parsing_instructions = clean_indents( + f""" + The text given to you is trying to give a probability forecast for a binary question. + {self._create_resolved_question_parsing_message()} + """ + ) binary_prediction: BinaryPrediction = await structure_output( reasoning, BinaryPrediction, model=self.get_llm("parser", "llm"), num_validation_samples=self._structure_output_validation_samples, + additional_instructions=parsing_instructions, ) decimal_pred = max(0.01, min(0.99, binary_prediction.prediction_in_decimal)) @@ -298,6 +305,7 @@ async def _multiple_choice_prompt_to_forecast( The text you are parsing may prepend these options with some variation of "Option" which you should remove if not part of the option names I just gave you. Additionally, you may sometimes need to parse a 0% probability. Please do not skip options with 0% but rather make it an entry in your final list with 0% probability. + {self._create_resolved_question_parsing_message()} """ ) reasoning = await self.get_llm("default", "llm").invoke(prompt) @@ -389,6 +397,8 @@ async def _numeric_prompt_to_forecast( f""" The text given to you is trying to give a forecast distribution for a numeric question. - This text is trying to answer the numeric question: "{question.question_text}". + {self._create_single_distribution_parsing_message(question)} + {self._create_resolved_question_parsing_message()} - When parsing the text, please make sure to give the values (the ones assigned to percentiles) in terms of the correct units. - The units for the forecast are: {question.unit_of_measure} - Your work will be shown publicly with these units stated verbatim after the numbers your parse. @@ -483,6 +493,8 @@ async def _date_prompt_to_forecast( f""" The text given to you is trying to give a forecast distribution for a date question. - This text is trying to answer the question: "{question.question_text}". + {self._create_single_distribution_parsing_message(question)} + {self._create_resolved_question_parsing_message()} - As an example, someone else guessed that the answer will be between {question.lower_bound} and {question.upper_bound}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". - The output is given as dates/times please format it into a valid datetime parsable string. Assume midnight UTC if no hour is given. - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. @@ -509,6 +521,24 @@ async def _date_prompt_to_forecast( ) return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) + def _create_resolved_question_parsing_message(self) -> str: + return "- If the text concludes that the question has already resolved *in the past* (i.e. it treats the question as decided rather than something to forecast), please DO NOT return a parsed output, even if a final forecast is also given. Instead indicate that the answer is not explicitly given in the text.\n" + + def _create_single_distribution_parsing_message( + self, question: NumericQuestion | DateQuestion + ) -> str: + message = ( + "- The text may contain multiple percentile distributions (e.g. forecasts for several related questions/entities, or intermediate drafts before a final answer). You must return exactly ONE distribution: the single final distribution that answers the question stated above.\n" + "- Never merge or concatenate percentile lists that refer to different entities, options, or scenarios. Each percentile should appear at most once in your output.\n" + "- If there are multiple final distributions and you cannot tell which one answers the question stated above, do not guess or combine them. Instead indicate that the answer is not explicitly given in the text." + ) + if question.group_question_option is not None: + message += ( + f'\n- This question is specifically about "{question.group_question_option}" (one subquestion within a group of related questions). ' + f'If the text gives distributions for multiple subjects, return only the distribution for "{question.group_question_option}".' + ) + return message + def _create_upper_and_lower_bound_messages( self, question: NumericQuestion | DateQuestion ) -> tuple[str, str]: diff --git a/run_bots.py b/run_bots.py index 3ea6146e..5fce81b6 100644 --- a/run_bots.py +++ b/run_bots.py @@ -54,6 +54,9 @@ 40280, # https://www.metaculus.com/questions/40280/ is rejected since noisy workflow errors 39138, # https://www.metaculus.com/questions/39138/ is rejected the best value is way out of bounds, and bots are constrained to not be able to make these forecasts ] +POST_IDS_TO_NOT_RAISE_ERRORS_FOR = [ + # 43335, # https://www.metaculus.com/questions/43335/ is still forecasted but should not fail the workflow if it errors +] class ScheduleConfig: @@ -173,10 +176,24 @@ async def configure_and_run_bot( for i, question_report in enumerate(zip(questions, all_reports)): question, report = question_report if isinstance(report, BaseException) and "TimeoutError" in str(report): + logger.warning( + f"TimeoutError occurred for question {question.id_of_post}, retrying..." + ) new_report = await bot.forecast_question(question, return_exceptions=True) all_reports[i] = new_report - bot.log_report_summary(all_reports) + bot.log_report_summary(all_reports, raise_errors=False) + + errors_to_raise = [ + report + for question, report in zip(questions, all_reports) + if isinstance(report, BaseException) + and question.id_of_post not in POST_IDS_TO_NOT_RAISE_ERRORS_FOR + ] + if errors_to_raise: + raise RuntimeError( + f"{len(errors_to_raise)} errors occurred while forecasting: {errors_to_raise}" + ) return all_reports @@ -1209,7 +1226,7 @@ def get_default_bot_dict() -> dict[str, RunBotConfig]: # NOSONAR # **flex_price_settings, ), ), - "tournaments": [AllowedTourn.METACULUS_CUP], + "tournaments": TournConfig.NONE, # NOTE: gpt-5 (gpt-5-2025-08-07) deprecated by OpenAI, API shutoff Dec 10, 2026 }, "METAC_GPT_5": { "estimated_cost_per_question": roughly_gpt_5_cost, @@ -1221,7 +1238,7 @@ def get_default_bot_dict() -> dict[str, RunBotConfig]: # NOSONAR # **flex_price_settings, ), ), - "tournaments": TournConfig.NONE, + "tournaments": TournConfig.NONE, # NOTE: gpt-5 (gpt-5-2025-08-07) deprecated by OpenAI, API shutoff Dec 10, 2026 }, "METAC_GPT_5_MINI": { "estimated_cost_per_question": roughly_gpt_4o_mini_cost, @@ -1232,7 +1249,7 @@ def get_default_bot_dict() -> dict[str, RunBotConfig]: # NOSONAR **flex_price_settings, ), ), - "tournaments": TournConfig.NONE, + "tournaments": TournConfig.NONE, # NOTE: gpt-5-mini (gpt-5-mini-2025-08-07) deprecated by OpenAI, API shutoff Dec 10, 2026 }, "METAC_GPT_5_NANO": { "estimated_cost_per_question": roughly_deepseek_r1_cost, @@ -1242,7 +1259,7 @@ def get_default_bot_dict() -> dict[str, RunBotConfig]: # NOSONAR temperature=default_temperature, ), ), - "tournaments": TournConfig.NONE, + "tournaments": TournConfig.NONE, # NOTE: gpt-5-nano (gpt-5-nano-2025-08-07) deprecated by OpenAI, API shutoff Dec 10, 2026 }, "METAC_CLAUDE_4_SONNET_HIGH_16K": { "estimated_cost_per_question": 0.33980, @@ -1414,7 +1431,7 @@ def get_default_bot_dict() -> dict[str, RunBotConfig]: # NOSONAR llm=gpt_5_with_search, bot_type="research_only", ), - "tournaments": TournConfig.experimental, + "tournaments": TournConfig.NONE, # NOTE: gpt-5 (gpt-5-2025-08-07) deprecated by OpenAI, API shutoff Dec 10, 2026 }, "METAC_GROK_4_LIVE_SEARCH": { "estimated_cost_per_question": 3 * roughly_one_call_to_grok_4_llm, diff --git a/scripts/skills/analyze_bot_failures/SKILL.md b/scripts/skills/analyze_bot_failures/SKILL.md new file mode 100644 index 00000000..caf964b7 --- /dev/null +++ b/scripts/skills/analyze_bot_failures/SKILL.md @@ -0,0 +1,66 @@ +--- +name: analyze-bot-failures +description: Analyze failures from the Metaculus bot forecasting GitHub Actions workflow. Pulls failed job logs, aggregates failure reasons, and investigates whether real bugs need fixing. Use when the user asks why bot runs/workflows are failing, wants a summary of forecasting errors, or mentions analyzing bot failure logs from GitHub Actions. +--- + +# Analyze Bot Forecasting Failures + +Workflow for diagnosing failures in the `run-bot-aib-tournament.yaml` GitHub Actions workflow, which runs ~30 bot jobs every 30 minutes via `run_bots.py`. + +## Step 1: Pull and aggregate the failure logs + +A GitHub token is required (the job-log API rejects unauthenticated requests). Check `GITHUB_TOKEN` env var or `gh auth token`; if neither works, ask the user to run `gh auth login`. + +```bash +poetry run python scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py --since 1d +``` + +Useful options: +- `--since 12h|2d|1w` or an ISO datetime (default `1d`) +- `--run-id ` to analyze one specific run +- `--max-runs ` cap on runs fetched (default 50) + +Output goes to `logs/workflow_failure_analysis//`: +- `report.md` — counts by category/bot/question, plus failure groups (deduped by normalized signature) with an example message, traceback, and the deepest repo code frame +- `failures.json` — every parsed failure with full messages and signatures (machine-readable) +- `raw_logs/run_.log` — complete raw log of each failed job + +## Step 2: Read the report, then verify against raw logs + +Read `report.md` first. The parser extracts full tracebacks when present (falling back to `❌ Exception: ... | Message: ...` summary lines, then to the raw log tail), so each failure group usually includes a "deepest repo frame" — the last traceback frame inside this repo rather than a dependency — which is the first place to look in the code. Still **spot-check 2-3 raw logs** for the most frequent signatures to see surrounding context (retry attempts, which LLM call failed). Failures of type `UnparsedFailure` mean nothing parseable was found — the job likely failed at the infrastructure level (poetry install failures, token resolution, job-level timeout at 55 min, cancellation) — read their raw logs directly. + +## Step 3: Triage — transient noise vs real bug + +Likely transient (usually not worth fixing, just note frequency): +- Provider 5xx / overloaded / rate-limit errors that hit one bot in one run +- Occasional timeouts that succeeded on the in-run retry (`run_bots.py` retries reports whose error contains "TimeoutError") + +Likely real bugs (investigate the code): +- The same bot failing across most runs (config/model-name/credentials issue in its `RunBotConfig` in `run_bots.py`) +- Structured-output or validation errors clustering on one question type (binary/MC/numeric/group) — points at parsing or prompt code +- Prediction validation errors (probabilities out of bounds, CDF/percentile issues) — points at report data models +- The same question ID recurring across bots/runs — candidate for `POST_IDS_TO_SKIP` or `POST_IDS_TO_NOT_RAISE_ERRORS_FOR` in `run_bots.py` +- Tracebacks ending inside `forecasting_tools/` rather than a provider SDK + +## Step 4: Map errors to code + +| Symptom | Where to look | +|---|---| +| Bot config, skip lists, question selection, final RuntimeError | `run_bots.py` | +| Forecast orchestration, retries, report summary format | `forecasting_tools/forecast_bots/forecast_bot.py` | +| Structured output parsing failures | `forecasting_tools/helpers/structure_output.py` | +| LLM call errors, model names, retries, token limits | `forecasting_tools/ai_models/` (esp. `general_llm.py`) | +| AskNews/research errors, cache misses | `forecasting_tools/helpers/` and `.github/scripts/precache_asknews.py` | +| Metaculus API / publish errors | `forecasting_tools/helpers/metaculus_client.py` | +| Job setup/infra failures, secrets, timeouts | `.github/workflows/run-bot-launcher.yaml`, `run-bot-aib-tournament.yaml` | + +It may not be in any of the above, so be willing to search the codebase. + +## Step 5: Summarize findings + +Report back with: +1. Failure counts by category and bot (from `report.md`) +2. **Real bugs found**: each with evidence (log excerpt + code location) and a proposed fix +3. **Transient noise**: what to ignore and why +4. **Question skip candidates**: recurring question IDs with failure counts +5. Do not change code or skip lists without confirming the proposed fixes with the user first diff --git a/scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py b/scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py new file mode 100644 index 00000000..65670038 --- /dev/null +++ b/scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py @@ -0,0 +1,663 @@ +""" +Pulls logs of failed bot jobs from the GitHub Actions tournament workflow, +parses out the failure reasons, and writes an aggregated markdown/json report. + +Requires a GitHub token with repo read access (the job-log endpoint rejects +unauthenticated requests even on public repos). The token is resolved from the +GITHUB_TOKEN env var, then from `gh auth token`. + +Example usage: + poetry run python scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py --since 2d + poetry run python scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py --run-id 27362307940 +""" + +import argparse +import json +import logging +import os +import re +import subprocess +import sys +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from pathlib import Path + +import requests + +logger = logging.getLogger(__name__) + +GITHUB_API = "https://api.github.com" +DEFAULT_REPO = "Metaculus/forecasting-tools" +DEFAULT_WORKFLOW = "run-bot-aib-tournament.yaml" +DEFAULT_OUTPUT_DIR = Path("logs/workflow_failure_analysis") +LOG_TAIL_LINES_FOR_UNPARSED_FAILURES = 60 + +ERROR_CATEGORY_PATTERNS: dict[str, list[str]] = { + "timeout": [r"TimeoutError", r"timed?[\s_-]?out", r"Timeout"], + "rate_limit": [r"429", r"rate[\s_-]?limit", r"RateLimitError", r"quota"], + "provider_5xx": [ + r"\b50[0-49]\b", + r"InternalServerError", + r"ServiceUnavailable", + r"upstream", + r"overloaded", + ], + "auth_or_credits": [ + r"\b401\b", + r"\b402\b", + r"\b403\b", + r"AuthenticationError", + r"insufficient[\s_-]?credits", + r"API key", + ], + "bad_request_or_model_refusal": [ + r"\b400\b", + r"BadRequestError", + r"InvalidRequestError", + r"refus", + r"content[\s_-]?policy", + ], + "structured_output_parsing": [ + r"structure[d]?[\s_-]?output", + r"ValidationError", + r"JSONDecodeError", + r"Failed to parse", + r"pydantic", + ], + "prediction_validation": [ + r"probabilit(y|ies).{0,80}(sum|bound|range|between)", + r"out of bounds", + r"cdf", + r"percentile", + r"AssertionError", + r"bad probability", + ], + "asknews_or_research": [r"asknews", r"AskNews", r"news[\s_-]?result", r"exa\.ai"], + "metaculus_api": [r"metaculus.{0,80}(api|post|publish)", r"Failed to post"], + "connection": [ + r"ConnectionError", + r"ConnectTimeout", + r"SSLError", + r"Connection reset", + r"APIConnectionError", + ], +} + +QUESTION_URL_PATTERN = re.compile(r"https://www\.metaculus\.com/questions/\d+/") +GH_TIMESTAMP_PREFIX_PATTERN = re.compile( + r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z\s?" +) +PYTHON_LOG_RECORD_PATTERN = re.compile(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} - ") +FAILED_FORECAST_LINE_PATTERN = re.compile( + r"❌ Exception: (?P\w+) \| Message: (?P.*)" +) +TRACEBACK_FRAME_PATTERN = re.compile( + r'File "(?P[^"]+)", line (?P\d+)' +) +EXCEPTION_LINE_PATTERN = re.compile( + r"^(?P[A-Za-z_][\w.]*(?:Error|Exception|Exit|Interrupt|Warning)?): (?P.+)" +) +GITHUB_RUNNER_REPO_ROOT_PATTERN = re.compile(r"^/home/runner/work/[^/]+/[^/]+/") + + +@dataclass +class RepoFrame: + file_path: str + line_number: int + + +@dataclass +class FailureEvent: + bot_name: str + run_id: int + job_name: str + job_url: str + exception_type: str + message: str + categories: list[str] + question_url: str | None + traceback_text: str | None + deepest_repo_frame: RepoFrame | None + + +@dataclass +class FailureGroup: + signature: str + events: list[FailureEvent] + + +@dataclass +class JobAnalysis: + bot_name: str + run_id: int + job_url: str + log_path: str + events: list[FailureEvent] + + +def resolve_github_token() -> str: + env_token = os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN") + if env_token: + return env_token + try: + result = subprocess.run( + ["gh", "auth", "token"], capture_output=True, text=True, check=True + ) + return result.stdout.strip() + except (subprocess.CalledProcessError, FileNotFoundError): + raise RuntimeError( + "No GitHub token found. Set GITHUB_TOKEN or run `gh auth login`. " + "A token is required to download job logs (even for public repos)." + ) + + +def github_get( + path: str, token: str, params: dict | None = None, as_text: bool = False +) -> dict | str: + response = requests.get( + f"{GITHUB_API}{path}", + headers={ + "Authorization": f"Bearer {token}", + "Accept": "application/vnd.github+json", + }, + params=params, + timeout=60, + ) + response.raise_for_status() + return response.text if as_text else response.json() + + +def parse_since_to_datetime(since: str) -> datetime: + match = re.fullmatch(r"(\d+)([hdw])", since) + if match: + amount, unit = int(match.group(1)), match.group(2) + unit_to_delta = { + "h": timedelta(hours=amount), + "d": timedelta(days=amount), + "w": timedelta(weeks=amount), + } + return datetime.now(timezone.utc) - unit_to_delta[unit] + return datetime.fromisoformat(since).astimezone(timezone.utc) + + +def list_workflow_runs( + repo: str, workflow: str, token: str, since: datetime, max_runs: int +) -> list[dict]: + runs: list[dict] = [] + page = 1 + while len(runs) < max_runs: + data = github_get( + f"/repos/{repo}/actions/workflows/{workflow}/runs", + token, + params={ + "per_page": 100, + "page": page, + "created": f">={since.isoformat()}", + }, + ) + assert isinstance(data, dict) + page_runs = data.get("workflow_runs", []) + if not page_runs: + break + runs.extend(page_runs) + page += 1 + return runs[:max_runs] + + +def list_failed_jobs(repo: str, run_id: int, token: str) -> list[dict]: + jobs: list[dict] = [] + page = 1 + while True: + data = github_get( + f"/repos/{repo}/actions/runs/{run_id}/jobs", + token, + params={"per_page": 100, "page": page, "filter": "latest"}, + ) + assert isinstance(data, dict) + page_jobs = data.get("jobs", []) + if not page_jobs: + break + jobs.extend(page_jobs) + page += 1 + return [job for job in jobs if job.get("conclusion") == "failure"] + + +def download_job_log(repo: str, job_id: int, token: str) -> str: + log_text = github_get( + f"/repos/{repo}/actions/jobs/{job_id}/logs", token, as_text=True + ) + assert isinstance(log_text, str) + return log_text + + +def strip_github_timestamps(log_text: str) -> str: + return "\n".join( + GH_TIMESTAMP_PREFIX_PATTERN.sub("", line) for line in log_text.splitlines() + ) + + +def categorize_error(error_text: str) -> list[str]: + categories = [ + category + for category, patterns in ERROR_CATEGORY_PATTERNS.items() + if any(re.search(pattern, error_text, re.IGNORECASE) for pattern in patterns) + ] + return categories or ["uncategorized"] + + +def extract_bot_name(job_name: str) -> str: + return job_name.split("/")[0].strip() + + +def extract_traceback_blocks(log_lines: list[str]) -> list[list[str]]: + blocks: list[list[str]] = [] + current_block: list[str] | None = None + for line in log_lines: + if line.startswith("Traceback (most recent call last)"): + if current_block is not None: + blocks.append(current_block) + current_block = [line] + elif current_block is not None: + if PYTHON_LOG_RECORD_PATTERN.match(line) or line.startswith("##["): + blocks.append(current_block) + current_block = None + else: + current_block.append(line) + if current_block is not None: + blocks.append(current_block) + return blocks + + +def extract_deepest_repo_frame(traceback_lines: list[str]) -> RepoFrame | None: + repo_frames = [] + for line in traceback_lines: + frame_match = TRACEBACK_FRAME_PATTERN.search(line) + if not frame_match: + continue + file_path = frame_match.group("file_path") + is_dependency_frame = ".venv" in file_path or "site-packages" in file_path + if is_dependency_frame: + continue + relative_path = GITHUB_RUNNER_REPO_ROOT_PATTERN.sub("", file_path) + repo_frames.append( + RepoFrame( + file_path=relative_path, + line_number=int(frame_match.group("line_number")), + ) + ) + return repo_frames[-1] if repo_frames else None + + +def extract_exception_from_traceback(traceback_lines: list[str]) -> tuple[str, str]: + for line_index in range(len(traceback_lines) - 1, -1, -1): + exception_match = EXCEPTION_LINE_PATTERN.match(traceback_lines[line_index]) + if exception_match: + message_continuation = traceback_lines[line_index + 1 :] + full_message = "\n".join( + [exception_match.group("message")] + message_continuation + ).strip() + exception_type = exception_match.group("exception_type").split(".")[-1] + return exception_type, full_message + return "UnknownException", traceback_lines[-1] if traceback_lines else "" + + +def extract_question_url(text: str) -> str | None: + url_match = QUESTION_URL_PATTERN.search(text) + return url_match.group(0) if url_match else None + + +def parse_traceback_failure_events( + log_lines: list[str], bot_name: str, run_id: int, job_name: str, job_url: str +) -> list[FailureEvent]: + events = [] + for block_lines in extract_traceback_blocks(log_lines): + exception_type, message = extract_exception_from_traceback(block_lines) + traceback_text = "\n".join(block_lines) + events.append( + FailureEvent( + bot_name=bot_name, + run_id=run_id, + job_name=job_name, + job_url=job_url, + exception_type=exception_type, + message=message, + categories=categorize_error(f"{exception_type} {message}"), + question_url=extract_question_url(message) + or extract_question_url(traceback_text), + traceback_text=traceback_text, + deepest_repo_frame=extract_deepest_repo_frame(block_lines), + ) + ) + return events + + +def parse_short_summary_failure_events( + log_lines: list[str], bot_name: str, run_id: int, job_name: str, job_url: str +) -> list[FailureEvent]: + events = [] + for line_index, line in enumerate(log_lines): + line_match = FAILED_FORECAST_LINE_PATTERN.search(line) + if not line_match: + continue + message_lines = [line_match.group("message")] + for continuation_line in log_lines[line_index + 1 :]: + is_new_section = ( + "✅" in continuation_line + or "❌" in continuation_line + or continuation_line.startswith("Stats for passing reports") + or continuation_line.startswith("----") + or PYTHON_LOG_RECORD_PATTERN.match(continuation_line) + ) + if is_new_section: + break + message_lines.append(continuation_line) + message = "\n".join(message_lines).strip() + exception_type = line_match.group("exception_type") + events.append( + FailureEvent( + bot_name=bot_name, + run_id=run_id, + job_name=job_name, + job_url=job_url, + exception_type=exception_type, + message=message, + categories=categorize_error(f"{exception_type} {message}"), + question_url=extract_question_url(message), + traceback_text=None, + deepest_repo_frame=None, + ) + ) + return events + + +def make_unparsed_failure_event( + log_lines: list[str], bot_name: str, run_id: int, job_name: str, job_url: str +) -> FailureEvent: + log_tail = "\n".join(log_lines[-LOG_TAIL_LINES_FOR_UNPARSED_FAILURES:]) + return FailureEvent( + bot_name=bot_name, + run_id=run_id, + job_name=job_name, + job_url=job_url, + exception_type="UnparsedFailure", + message=( + "No traceback or per-question failure line found. The job likely failed " + "at the infrastructure level (setup, token resolution, job cancellation/" + "timeout) or the log format changed. Log tail attached as traceback_text." + ), + categories=categorize_error(log_tail), + question_url=None, + traceback_text=log_tail, + deepest_repo_frame=None, + ) + + +def parse_failures_from_log( + log_text: str, bot_name: str, run_id: int, job_name: str, job_url: str +) -> list[FailureEvent]: + log_lines = strip_github_timestamps(log_text).splitlines() + events = parse_traceback_failure_events( + log_lines, bot_name, run_id, job_name, job_url + ) + if not events: + events = parse_short_summary_failure_events( + log_lines, bot_name, run_id, job_name, job_url + ) + if not events: + events = [ + make_unparsed_failure_event(log_lines, bot_name, run_id, job_name, job_url) + ] + return events + + +def normalize_message_to_signature(exception_type: str, message: str) -> str: + no_urls = QUESTION_URL_PATTERN.sub("", message) + no_request_ids = re.sub(r"(req|chatcmpl|gen)-[\w-]+", "", no_urls) + no_numbers = re.sub(r"\d+", "", no_request_ids) + return f"{exception_type}: {no_numbers[:300]}" + + +def group_failures(events: list[FailureEvent]) -> list[FailureGroup]: + groups_by_signature: dict[str, FailureGroup] = {} + for event in events: + signature = normalize_message_to_signature(event.exception_type, event.message) + if signature not in groups_by_signature: + groups_by_signature[signature] = FailureGroup( + signature=signature, events=[] + ) + groups_by_signature[signature].events.append(event) + return sorted(groups_by_signature.values(), key=lambda group: -len(group.events)) + + +def sanitize_filename(name: str) -> str: + return re.sub(r"[^\w.-]", "_", name) + + +def count_by(items: list[str]) -> list[tuple[str, int]]: + counts: dict[str, int] = {} + for item in items: + counts[item] = counts.get(item, 0) + 1 + return sorted(counts.items(), key=lambda pair: -pair[1]) + + +def truncate_text(text: str, max_length: int) -> str: + return text if len(text) < max_length else text[:max_length] + "...(truncated)" + + +def build_failure_group_section(group: FailureGroup) -> list[str]: + example = group.events[0] + bots_affected = sorted({event.bot_name for event in group.events}) + section_lines = [ + f"### [{len(group.events)}x] {example.exception_type} | " + f"categories={','.join(example.categories)}", + f"Bots affected: {', '.join(bots_affected)}", + ] + if example.deepest_repo_frame: + section_lines.append( + f"Deepest repo frame: `{example.deepest_repo_frame.file_path}:" + f"{example.deepest_repo_frame.line_number}`" + ) + section_lines.append(f"\n```\n{truncate_text(example.message, 2000)}\n```\n") + if example.traceback_text: + section_lines.append( + f"
Example traceback / log tail\n\n" + f"```\n{truncate_text(example.traceback_text, 3000)}\n```\n\n
\n" + ) + return section_lines + + +def build_report(job_analyses: list[JobAnalysis], output_dir: Path) -> str: + all_events = [event for analysis in job_analyses for event in analysis.events] + groups = group_failures(all_events) + + report_lines = [ + "# Bot Workflow Failure Report", + f"\nGenerated: {datetime.now(timezone.utc).isoformat()}", + f"\nFailed jobs analyzed: {len(job_analyses)}", + f"Individual failures parsed: {len(all_events)}", + f"Unique failure signatures: {len(groups)}", + "\n## Failures by category\n", + ] + category_occurrences = [ + category for event in all_events for category in event.categories + ] + for category, count in count_by(category_occurrences): + report_lines.append(f"- {category}: {count}") + + report_lines.append("\n## Failures by bot\n") + for bot_name, count in count_by([event.bot_name for event in all_events]): + report_lines.append(f"- {bot_name}: {count}") + + question_urls = [event.question_url for event in all_events if event.question_url] + if question_urls: + report_lines.append( + "\n## Questions appearing in failures " + "(recurring ones are POST_IDS_TO_SKIP candidates)\n" + ) + for question_url, count in count_by(question_urls): + report_lines.append(f"- {question_url} : {count}") + + report_lines.append("\n## Failure groups (most frequent first)\n") + for group in groups: + report_lines.extend(build_failure_group_section(group)) + + report_lines.append("\n## Per-job details\n") + for analysis in job_analyses: + unparsed_note = ( + " | NOTE: only an unparsed log tail was extracted, read the raw log" + if any( + event.exception_type == "UnparsedFailure" for event in analysis.events + ) + else "" + ) + report_lines.append( + f"- **{analysis.bot_name}** (run {analysis.run_id}): " + f"{len(analysis.events)} failures parsed | " + f"[job link]({analysis.job_url}) | raw log: `{analysis.log_path}`" + f"{unparsed_note}" + ) + + report_path = output_dir / "report.md" + report_path.write_text("\n".join(report_lines)) + + json_path = output_dir / "failures.json" + json_path.write_text( + json.dumps( + [ + { + "bot_name": event.bot_name, + "run_id": event.run_id, + "job_name": event.job_name, + "job_url": event.job_url, + "exception_type": event.exception_type, + "categories": event.categories, + "question_url": event.question_url, + "signature": normalize_message_to_signature( + event.exception_type, event.message + ), + "deepest_repo_frame": ( + f"{event.deepest_repo_frame.file_path}:" + f"{event.deepest_repo_frame.line_number}" + if event.deepest_repo_frame + else None + ), + "message": event.message, + } + for event in all_events + ], + indent=2, + ) + ) + return str(report_path) + + +def analyze_failed_jobs_for_run( + repo: str, run: dict, token: str, raw_logs_dir: Path +) -> list[JobAnalysis]: + failed_jobs = list_failed_jobs(repo, run["id"], token) + logger.info( + f"Run {run['id']} ({run['created_at']}): {len(failed_jobs)} failed jobs" + ) + job_analyses: list[JobAnalysis] = [] + for job in failed_jobs: + bot_name = extract_bot_name(job["name"]) + log_path = raw_logs_dir / f"run{run['id']}_{sanitize_filename(bot_name)}.log" + try: + log_text = download_job_log(repo, job["id"], token) + except requests.HTTPError as http_error: + logger.warning(f"Could not download log for job {job['id']}: {http_error}") + continue + log_path.write_text(log_text) + events = parse_failures_from_log( + log_text, bot_name, run["id"], job["name"], job["html_url"] + ) + job_analyses.append( + JobAnalysis( + bot_name=bot_name, + run_id=run["id"], + job_url=job["html_url"], + log_path=str(log_path), + events=events, + ) + ) + return job_analyses + + +def analyze_runs( + repo: str, + workflow: str, + since: str, + max_runs: int, + output_dir: Path, + run_id: int | None = None, +) -> str: + token = resolve_github_token() + if run_id is not None: + runs = [github_get(f"/repos/{repo}/actions/runs/{run_id}", token)] + else: + since_datetime = parse_since_to_datetime(since) + all_runs = list_workflow_runs(repo, workflow, token, since_datetime, max_runs) + runs = [ + run + for run in all_runs + if run.get("conclusion") not in ("success", None) + or run.get("status") != "completed" + ] + logger.info( + f"Found {len(all_runs)} runs since {since_datetime.isoformat()}, " + f"{len(runs)} with failures" + ) + + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + analysis_dir = output_dir / timestamp + raw_logs_dir = analysis_dir / "raw_logs" + raw_logs_dir.mkdir(parents=True, exist_ok=True) + + job_analyses: list[JobAnalysis] = [] + for run in runs: + assert isinstance(run, dict) + job_analyses.extend(analyze_failed_jobs_for_run(repo, run, token, raw_logs_dir)) + + if not job_analyses: + logger.info("No failed jobs found in the selected window.") + + report_path = build_report(job_analyses, analysis_dir) + logger.info(f"Report written to {report_path}") + logger.info(f"Raw logs saved under {raw_logs_dir}") + return report_path + + +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") + parser = argparse.ArgumentParser( + description="Analyze failed bot forecasting jobs from GitHub Actions" + ) + parser.add_argument( + "--since", + default="1d", + help="Time window like 12h, 2d, 1w, or an ISO datetime (default: 1d)", + ) + parser.add_argument( + "--run-id", type=int, default=None, help="Analyze a single specific run id" + ) + parser.add_argument("--repo", default=DEFAULT_REPO) + parser.add_argument("--workflow", default=DEFAULT_WORKFLOW) + parser.add_argument("--max-runs", type=int, default=50) + parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) + args = parser.parse_args() + + try: + analyze_runs( + repo=args.repo, + workflow=args.workflow, + since=args.since, + max_runs=args.max_runs, + output_dir=args.output_dir, + run_id=args.run_id, + ) + except RuntimeError as error: + sys.exit(str(error)) + + +if __name__ == "__main__": + main()