def fill_links_from_history(df_ics: pd.DataFrame, df_yml: pd.DataFrame) -> pd.DataFrame:
    """Fill missing links in ICS data from historical conference data.

    For conferences whose ICS entry has no link, look up the conference
    name in the historical YAML data, take the most recent link recorded
    there, and rewrite its year reference to the target year.

    Parameters
    ----------
    df_ics : pd.DataFrame
        DataFrame with ICS conference data (links may be empty or NaN).
        Filled in place and also returned.
    df_yml : pd.DataFrame
        DataFrame with existing conference data from YAML files.

    Returns
    -------
    pd.DataFrame
        ``df_ics`` with missing links filled where historical data exists.
    """

    def _blank(value) -> bool:
        # NaN/None behave surprisingly under truthiness (`not nan` is False,
        # `str(nan)` is "nan"), so test explicitly: missing pandas values AND
        # empty/whitespace-only strings both count as "no link".
        return pd.isna(value) or not str(value).strip()

    if df_yml.empty:
        return df_ics

    # Most recent (link, year) per conference name.
    historical_links: dict = {}
    for _, yml_row in df_yml.iterrows():
        conf_name = yml_row.get("conference", "")
        link = yml_row.get("link", "")
        year = yml_row.get("year", 0)
        if _blank(conf_name) or _blank(link) or pd.isna(year):
            continue
        year = int(year)  # avoid float years ("2023.0") leaking into regexes
        if conf_name not in historical_links or year > historical_links[conf_name][1]:
            historical_links[conf_name] = (str(link), year)

    current_year = datetime.now(tz=timezone.utc).year
    filled_count = 0
    for idx, row in df_ics.iterrows():
        if not _blank(row.get("link", "")):
            continue  # already has a usable link
        conf_name = row.get("conference", "")
        if conf_name not in historical_links:
            continue
        hist_link, hist_year = historical_links[conf_name]
        target_year = row.get("year", current_year)
        target_year = current_year if pd.isna(target_year) else int(target_year)
        if hist_year > 0:
            # Swap the recorded year for the target year; \b keeps us from
            # touching digits embedded inside longer numbers.
            new_link = re.sub(rf"\b{hist_year}\b", str(target_year), hist_link)
        else:
            new_link = hist_link  # no trustworthy year to substitute
        df_ics.at[idx, "link"] = new_link
        filled_count += 1
        logger.debug(f"Filled link for '{conf_name}' from historical data: {new_link}")

    if filled_count > 0:
        logger.info(f"Filled {filled_count} missing links from historical conference data")

    return df_ics
default_flow_style=False)}") + validated = [validate_conference(q) for q in data] + new_data = [c for c in validated if c is not None] + validation_errors = len(validated) - len(new_data) if validation_errors > 0: logger.warning(f"⚠️ {validation_errors} conferences failed validation and were skipped")