From 6029f2eac0b4c62e6d816e9f87a213bcaf1eca47 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 19 Jan 2026 14:20:24 +0000 Subject: [PATCH 1/4] fix: handle validation errors gracefully and filter empty links - Fix UnboundLocalError in sort_yaml.py by validating conferences individually instead of via list comprehension - Skip invalid conferences instead of failing the entire process - Filter out ICS calendar entries without valid links at import time --- utils/import_python_official.py | 7 +++++++ utils/sort_yaml.py | 14 ++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/utils/import_python_official.py b/utils/import_python_official.py index 7bdbe898cb..1045616836 100644 --- a/utils/import_python_official.py +++ b/utils/import_python_official.py @@ -194,6 +194,13 @@ def ics_to_dataframe() -> pd.DataFrame: except Exception as e: logger.error(f"Error cleaning DataFrame: {e}") + # Filter out entries with empty or missing links + initial_count = len(df) + df = df[df["link"].str.len() > 0] + filtered_count = initial_count - len(df) + if filtered_count > 0: + logger.info(f"Filtered out {filtered_count} entries without valid links") + return df diff --git a/utils/sort_yaml.py b/utils/sort_yaml.py index af2456d9c2..48f2038f46 100644 --- a/utils/sort_yaml.py +++ b/utils/sort_yaml.py @@ -302,14 +302,16 @@ def sort_data(base="", prefix="", skip_links=False): data[i] = order_keywords(q) logger.info("✅ Validating conference data with Pydantic schema") + new_data = [] validation_errors = 0 - try: - new_data = [Conference(**q) for q in data] - except pydantic.ValidationError as e: - validation_errors += 1 - logger.error(f"❌ Validation error in conference: {e}") - logger.debug(f"Invalid data: \n{yaml.dump(q, default_flow_style=False)}") + for q in data: + try: + new_data.append(Conference(**q)) + except pydantic.ValidationError as e: + validation_errors += 1 + logger.error(f"❌ Validation error in conference: {e}") + logger.debug(f"Invalid data: 
\n{yaml.dump(q, default_flow_style=False)}") if validation_errors > 0: logger.warning(f"⚠️ {validation_errors} conferences failed validation and were skipped") From bdd0b199d023c62b8f146ab0408a28f3d886e801 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 19 Jan 2026 14:22:01 +0000 Subject: [PATCH 2/4] feat: fill missing links from historical conference data When an ICS calendar entry has no link, look up the conference name in existing YAML data and use the historical link with the year replaced. This allows conferences without descriptions to still have valid links if we've seen them before. --- utils/import_python_official.py | 83 ++++++++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 7 deletions(-) diff --git a/utils/import_python_official.py b/utils/import_python_official.py index 1045616836..fd5fd9a546 100644 --- a/utils/import_python_official.py +++ b/utils/import_python_official.py @@ -37,6 +37,67 @@ logger = get_tqdm_logger(__name__) +def fill_links_from_history(df_ics: pd.DataFrame, df_yml: pd.DataFrame) -> pd.DataFrame: + """Fill missing links in ICS data from historical conference data. + + For conferences without links, look up the conference name in historical data + and use that link, replacing the historical year in it with the entry's year. 
+ + Parameters + ---------- + df_ics : pd.DataFrame + DataFrame with ICS conference data (may have empty links) + df_yml : pd.DataFrame + DataFrame with existing conference data from YAML files + + Returns + ------- + pd.DataFrame + DataFrame with missing links filled where historical data exists + """ + if df_yml.empty: + return df_ics + + # Create a lookup of conference names to their most recent links + # Group by normalized conference name and get the most recent entry + historical_links = {} + for _, row in df_yml.iterrows(): + conf_name = row.get("conference", "") + link = row.get("link", "") + year = row.get("year", 0) + + if conf_name and link: + # Keep the most recent link for each conference + if conf_name not in historical_links or year > historical_links[conf_name][1]: + historical_links[conf_name] = (link, year) + + filled_count = 0 + for idx, row in df_ics.iterrows(): + link = row.get("link", "") + if not link or len(str(link).strip()) == 0: + conf_name = row.get("conference", "") + target_year = row.get("year", datetime.now(tz=timezone.utc).year) + + if conf_name in historical_links: + hist_link, hist_year = historical_links[conf_name] + # Replace the historical year with the target year in the link + new_link = re.sub( + rf"\b{hist_year}\b", + str(target_year), + str(hist_link), + ) + df_ics.at[idx, "link"] = new_link + filled_count += 1 + logger.debug( + f"Filled link for '{conf_name}' from historical data: {new_link}", + ) + + if filled_count > 0: + logger.info(f"Filled {filled_count} missing links from historical conference data") + + return df_ics + + def ics_to_dataframe() -> pd.DataFrame: """Parse an .ics file and return a DataFrame with the event data. 
@@ -194,13 +255,6 @@ def ics_to_dataframe() -> pd.DataFrame: except Exception as e: logger.error(f"Error cleaning DataFrame: {e}") - # Filter out entries with empty or missing links - initial_count = len(df) - df = df[df["link"].str.len() > 0] - filtered_count = initial_count - len(df) - if filtered_count > 0: - logger.info(f"Filtered out {filtered_count} entries without valid links") - return df @@ -248,6 +302,21 @@ def main(year=None, base="") -> bool: logger.warning("No conference data retrieved from calendar") return False + # Try to fill missing links from historical conference data + logger.info("Filling missing links from historical data") + df_ics = fill_links_from_history(df_ics, df_yml) + + # Filter out entries with empty or missing links + initial_count = len(df_ics) + df_ics = df_ics[df_ics["link"].str.len() > 0] + filtered_count = initial_count - len(df_ics) + if filtered_count > 0: + logger.info(f"Filtered out {filtered_count} entries without valid links") + + if df_ics.empty: + logger.warning("No conferences with valid links after filtering") + return False + except Exception as e: logger.error(f"Failed to initialize import process: {e}") return False From ac32b813c07f0876dfcecb0ec9f435cb54bf3de1 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 19 Jan 2026 14:33:33 +0000 Subject: [PATCH 3/4] style: fix ruff SIM102 and PERF203 warnings - Combine nested if statements in fill_links_from_history (SIM102) - Extract validate_conference helper to avoid try-except in loop (PERF203) --- utils/import_python_official.py | 11 +++++++---- utils/sort_yaml.py | 16 +++++++++------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/utils/import_python_official.py b/utils/import_python_official.py index fd5fd9a546..54e6922c29 100644 --- a/utils/import_python_official.py +++ b/utils/import_python_official.py @@ -66,10 +66,13 @@ def fill_links_from_history(df_ics: pd.DataFrame, df_yml: pd.DataFrame) -> pd.Da link = row.get("link", "") year = 
row.get("year", 0) - if conf_name and link: - # Keep the most recent link for each conference - if conf_name not in historical_links or year > historical_links[conf_name][1]: - historical_links[conf_name] = (link, year) + # Keep the most recent link for each conference + if ( + conf_name + and link + and (conf_name not in historical_links or year > historical_links[conf_name][1]) + ): + historical_links[conf_name] = (link, year) filled_count = 0 for idx, row in df_ics.iterrows(): diff --git a/utils/sort_yaml.py b/utils/sort_yaml.py index 48f2038f46..6b1a84b48f 100644 --- a/utils/sort_yaml.py +++ b/utils/sort_yaml.py @@ -301,17 +301,19 @@ def sort_data(base="", prefix="", skip_links=False): for i, q in enumerate(data.copy()): data[i] = order_keywords(q) - logger.info("✅ Validating conference data with Pydantic schema") - new_data = [] - validation_errors = 0 - - for q in data: + def validate_conference(q: dict) -> Conference | None: + """Validate a single conference entry, returning None if invalid.""" try: - new_data.append(Conference(**q)) + return Conference(**q) except pydantic.ValidationError as e: - validation_errors += 1 logger.error(f"❌ Validation error in conference: {e}") logger.debug(f"Invalid data: \n{yaml.dump(q, default_flow_style=False)}") + return None + + logger.info("✅ Validating conference data with Pydantic schema") + validated = [validate_conference(q) for q in data] + new_data = [c for c in validated if c is not None] + validation_errors = len(validated) - len(new_data) if validation_errors > 0: logger.warning(f"⚠️ {validation_errors} conferences failed validation and were skipped") From fe9685916f0f2a1c4d05ea37bbc005f402bea7a4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Jan 2026 14:34:09 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utils/import_python_official.py | 6 +----- 1 file 
changed, 1 insertion(+), 5 deletions(-) diff --git a/utils/import_python_official.py b/utils/import_python_official.py index 54e6922c29..6966ae811d 100644 --- a/utils/import_python_official.py +++ b/utils/import_python_official.py @@ -67,11 +67,7 @@ def fill_links_from_history(df_ics: pd.DataFrame, df_yml: pd.DataFrame) -> pd.Da year = row.get("year", 0) # Keep the most recent link for each conference - if ( - conf_name - and link - and (conf_name not in historical_links or year > historical_links[conf_name][1]) - ): + if conf_name and link and (conf_name not in historical_links or year > historical_links[conf_name][1]): historical_links[conf_name] = (link, year) filled_count = 0