Skip to content

Commit bdd0b19

Browse files
committed
feat: fill missing links from historical conference data
When an ICS calendar entry has no link, look up the conference name in existing YAML data and use the historical link with the year replaced. This allows conferences without descriptions to still have valid links if we've seen them before.
1 parent 6029f2e commit bdd0b19

1 file changed

Lines changed: 76 additions & 7 deletions

File tree

utils/import_python_official.py

Lines changed: 76 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,67 @@
3737
logger = get_tqdm_logger(__name__)
3838

3939

40+
def fill_links_from_history(df_ics: pd.DataFrame, df_yml: pd.DataFrame) -> pd.DataFrame:
    """Fill missing links in ICS data from historical conference data.

    For conferences without links, look up the conference name in historical data
    and use that link, replacing any references to the historical year with the
    target year of the ICS entry.

    A link counts as missing when it is ``None``, NaN/NA, empty, or
    whitespace-only. Historical rows whose link is missing in the same sense are
    ignored, so a placeholder such as ``"nan"`` is never written as a link.

    Parameters
    ----------
    df_ics : pd.DataFrame
        DataFrame with ICS conference data (may have empty links). It is updated
        in place and also returned.
    df_yml : pd.DataFrame
        DataFrame with existing conference data from YAML files. The columns
        ``conference``, ``link`` and ``year`` are read when present.

    Returns
    -------
    pd.DataFrame
        ``df_ics`` with missing links filled where historical data exists
    """
    if df_yml.empty:
        return df_ics

    # Map each conference name to (link, year) of its most recent usable entry.
    # Names are matched exactly; a year of 0 means "unknown".
    historical_links = {}
    for _, row in df_yml.iterrows():
        conf_name = row.get("conference", "")
        link = row.get("link", "")
        if _is_blank_value(conf_name) or _is_blank_value(link):
            continue

        year = _coerce_year(row.get("year", 0)) or 0
        known = historical_links.get(conf_name)
        # Strict comparison: on equal years the first entry seen is kept.
        if known is None or year > known[1]:
            historical_links[conf_name] = (str(link), year)

    # Fallback year for ICS rows that carry no usable year; computed once.
    current_year = datetime.now(tz=timezone.utc).year

    filled_count = 0
    for idx, row in df_ics.iterrows():
        if not _is_blank_value(row.get("link", "")):
            continue

        conf_name = row.get("conference", "")
        if _is_blank_value(conf_name) or conf_name not in historical_links:
            continue

        hist_link, hist_year = historical_links[conf_name]
        target_year = _coerce_year(row.get("year", current_year)) or current_year

        new_link = hist_link
        if hist_year > 0:
            # Digit lookarounds instead of ``\b``: a word boundary does not exist
            # between a letter/underscore and a digit, so ``\b2024\b`` would miss
            # URLs such as "https://ep2024.europython.eu". The lookarounds still
            # refuse to touch longer digit runs (e.g. "120245").
            new_link = re.sub(
                rf"(?<!\d){hist_year}(?!\d)",
                str(target_year),
                hist_link,
            )

        df_ics.at[idx, "link"] = new_link
        filled_count += 1
        logger.debug(
            f"Filled link for '{conf_name}' from historical data: {new_link}",
        )

    if filled_count > 0:
        logger.info(f"Filled {filled_count} missing links from historical conference data")

    return df_ics


def _is_blank_value(value) -> bool:
    """Return True when *value* is None, NaN/NA, or an empty/whitespace-only string."""
    if value is None:
        return True
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return True
    return not str(value).strip()


def _coerce_year(value):
    """Return *value* as an int year (e.g. 2024.0 or "2024" -> 2024), or None if unusable."""
    if _is_blank_value(value):
        return None
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return None
99+
100+
40101
def ics_to_dataframe() -> pd.DataFrame:
41102
"""Parse an .ics file and return a DataFrame with the event data.
42103
@@ -194,13 +255,6 @@ def ics_to_dataframe() -> pd.DataFrame:
194255
except Exception as e:
195256
logger.error(f"Error cleaning DataFrame: {e}")
196257

197-
# Filter out entries with empty or missing links
198-
initial_count = len(df)
199-
df = df[df["link"].str.len() > 0]
200-
filtered_count = initial_count - len(df)
201-
if filtered_count > 0:
202-
logger.info(f"Filtered out {filtered_count} entries without valid links")
203-
204258
return df
205259

206260

@@ -248,6 +302,21 @@ def main(year=None, base="") -> bool:
248302
logger.warning("No conference data retrieved from calendar")
249303
return False
250304

305+
# Try to fill missing links from historical conference data
306+
logger.info("Filling missing links from historical data")
307+
df_ics = fill_links_from_history(df_ics, df_yml)
308+
309+
# Filter out entries with empty or missing links
310+
initial_count = len(df_ics)
311+
df_ics = df_ics[df_ics["link"].str.len() > 0]
312+
filtered_count = initial_count - len(df_ics)
313+
if filtered_count > 0:
314+
logger.info(f"Filtered out {filtered_count} entries without valid links")
315+
316+
if df_ics.empty:
317+
logger.warning("No conferences with valid links after filtering")
318+
return False
319+
251320
except Exception as e:
252321
logger.error(f"Failed to initialize import process: {e}")
253322
return False

0 commit comments

Comments
 (0)