|
37 | 37 | logger = get_tqdm_logger(__name__) |
38 | 38 |
|
39 | 39 |
|
def _is_blank_cell(value) -> bool:
    """Return True when a DataFrame cell holds no usable text.

    Covers ``None``, ``NaN``/``pd.NA``/``NaT`` and empty or whitespace-only
    strings. NaN needs explicit handling because it is truthy and its string
    form ("nan") is non-empty, so plain truthiness/length checks miss it.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return True
    return not str(value).strip()


def _coerce_year(value):
    """Return ``value`` as a positive ``int`` year, or ``None`` if unusable.

    Years read from mixed-dtype columns can arrive as ``2024``, ``2024.0``,
    ``"2024"`` or NaN; normalising them keeps both the regex pattern and the
    replacement text free of artefacts such as ``2024.0`` or ``nan``.
    """
    if _is_blank_cell(value):
        return None
    try:
        year = int(float(value))
    except (TypeError, ValueError, OverflowError):
        return None
    return year if year > 0 else None


def fill_links_from_history(df_ics: pd.DataFrame, df_yml: pd.DataFrame) -> pd.DataFrame:
    """Fill missing links in ICS data from historical conference data.

    For conferences without links, look up the conference name (exact match)
    in historical data and use the link of the most recent historical entry,
    replacing references to that entry's year with the row's target year.

    Parameters
    ----------
    df_ics : pd.DataFrame
        DataFrame with ICS conference data (may have empty or NaN links).
        It is updated in place and also returned.
    df_yml : pd.DataFrame
        DataFrame with existing conference data from YAML files.

    Returns
    -------
    pd.DataFrame
        ``df_ics`` with missing links filled where historical data exists.
    """
    if df_yml.empty:
        return df_ics

    # Map each conference name to (link, year) of its most recent usable entry.
    # Entries without a valid year rank as 0, so any dated entry wins over them;
    # on ties the first entry seen is kept.
    historical_links = {}
    for _, row in df_yml.iterrows():
        conf_name = row.get("conference", "")
        link = row.get("link", "")
        if _is_blank_cell(conf_name) or _is_blank_cell(link):
            continue
        year = _coerce_year(row.get("year")) or 0
        known = historical_links.get(conf_name)
        if known is None or year > known[1]:
            historical_links[conf_name] = (str(link), year)

    if not historical_links:
        return df_ics

    current_year = datetime.now(tz=timezone.utc).year
    filled_count = 0
    for idx, row in df_ics.iterrows():
        if not _is_blank_cell(row.get("link", "")):
            continue

        conf_name = row.get("conference", "")
        if _is_blank_cell(conf_name) or conf_name not in historical_links:
            continue

        hist_link, hist_year = historical_links[conf_name]
        # Fall back to the current year when the row has no usable year.
        target_year = _coerce_year(row.get("year")) or current_year

        if hist_year:
            # Replace the historical year with the target year in the link
            new_link = re.sub(rf"\b{hist_year}\b", str(target_year), hist_link)
        else:
            # Unknown historical year: there is nothing meaningful to
            # substitute, so reuse the link unchanged rather than rewriting
            # stray "0" tokens in the URL.
            new_link = hist_link

        df_ics.at[idx, "link"] = new_link
        filled_count += 1
        logger.debug(
            f"Filled link for '{conf_name}' from historical data: {new_link}",
        )

    if filled_count > 0:
        logger.info(f"Filled {filled_count} missing links from historical conference data")

    return df_ics
| 99 | + |
| 100 | + |
40 | 101 | def ics_to_dataframe() -> pd.DataFrame: |
41 | 102 | """Parse an .ics file and return a DataFrame with the event data. |
42 | 103 |
|
@@ -194,13 +255,6 @@ def ics_to_dataframe() -> pd.DataFrame: |
194 | 255 | except Exception as e: |
195 | 256 | logger.error(f"Error cleaning DataFrame: {e}") |
196 | 257 |
|
197 | | - # Filter out entries with empty or missing links |
198 | | - initial_count = len(df) |
199 | | - df = df[df["link"].str.len() > 0] |
200 | | - filtered_count = initial_count - len(df) |
201 | | - if filtered_count > 0: |
202 | | - logger.info(f"Filtered out {filtered_count} entries without valid links") |
203 | | - |
204 | 258 | return df |
205 | 259 |
|
206 | 260 |
|
@@ -248,6 +302,21 @@ def main(year=None, base="") -> bool: |
248 | 302 | logger.warning("No conference data retrieved from calendar") |
249 | 303 | return False |
250 | 304 |
|
| 305 | + # Try to fill missing links from historical conference data |
| 306 | + logger.info("Filling missing links from historical data") |
| 307 | + df_ics = fill_links_from_history(df_ics, df_yml) |
| 308 | + |
| 309 | + # Filter out entries with empty or missing links |
| 310 | + initial_count = len(df_ics) |
| 311 | + df_ics = df_ics[df_ics["link"].str.len() > 0] |
| 312 | + filtered_count = initial_count - len(df_ics) |
| 313 | + if filtered_count > 0: |
| 314 | + logger.info(f"Filtered out {filtered_count} entries without valid links") |
| 315 | + |
| 316 | + if df_ics.empty: |
| 317 | + logger.warning("No conferences with valid links after filtering") |
| 318 | + return False |
| 319 | + |
251 | 320 | except Exception as e: |
252 | 321 | logger.error(f"Failed to initialize import process: {e}") |
253 | 322 | return False |
|
0 commit comments