Skip to content

Commit f7443b1

Browse files
fix(merge): move fuzzy match and merge logic inside year loop (#208)
The fuzzy_match() and merge_conferences() calls were incorrectly placed outside the for loop due to wrong indentation. This caused:
- The merge logic ran only once, after the loop had completed.
- It used the last value of y (year+9), which often had no data.
- Both DataFrames passed to fuzzy_match were empty for that year.

Fixed by:
1. Indenting lines 305-361 to be inside the for loop.
2. Changing df_merged["year"] = year to df_merged["year"] = y.

Now the fuzzy matching and merge operations run for each year where both YAML and ICS data exist.

Co-authored-by: Claude <noreply@anthropic.com>
1 parent 71105d3 commit f7443b1

1 file changed

Lines changed: 39 additions & 39 deletions

File tree

utils/import_python_official.py

Lines changed: 39 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -302,52 +302,52 @@ def main(year=None, base="") -> bool:
302302
)
303303
continue
304304

305-
df_merged, df_remote, merge_report = fuzzy_match(
306-
df_yml[df_yml["year"] == y],
307-
df_ics.loc[df_ics["year"] == y],
308-
)
309-
logger.info(
310-
f"Merge report: {merge_report.exact_matches} exact, "
311-
f"{merge_report.fuzzy_matches} fuzzy, {merge_report.no_matches} no match",
312-
)
313-
df_merged["year"] = year
314-
diff_idx = df_merged.index.difference(df_remote.index)
315-
df_missing = df_merged.loc[diff_idx, :].sort_values("start")
316-
df_merged = df_merged.drop(["conference"], axis=1)
317-
df_merged = deduplicate(df_merged)
318-
df_remote = deduplicate(df_remote)
319-
df_merged = merge_conferences(df_merged, df_remote)
320-
321-
# Concatenate the new data with the existing data
322-
df_new = pd.concat([df_new, df_merged], ignore_index=True)
323-
for _index, row in df_missing.iterrows():
324-
325-
reverse_title_data = reverse_titles.get(row["conference"])
326-
if reverse_title_data is None:
327-
reverse_title = f"{row['conference']} {row['year']}"
328-
else:
329-
# Get the first variation from the reverse title data
330-
reverse_title_data = reverse_title_data.get("variations")
331-
if reverse_title_data:
332-
reverse_title = f"{reverse_title_data[0]} {row['year']}"
333-
else:
305+
df_merged, df_remote, merge_report = fuzzy_match(
306+
df_yml[df_yml["year"] == y],
307+
df_ics.loc[df_ics["year"] == y],
308+
)
309+
logger.info(
310+
f"Merge report: {merge_report.exact_matches} exact, "
311+
f"{merge_report.fuzzy_matches} fuzzy, {merge_report.no_matches} no match",
312+
)
313+
df_merged["year"] = y
314+
diff_idx = df_merged.index.difference(df_remote.index)
315+
df_missing = df_merged.loc[diff_idx, :].sort_values("start")
316+
df_merged = df_merged.drop(["conference"], axis=1)
317+
df_merged = deduplicate(df_merged)
318+
df_remote = deduplicate(df_remote)
319+
df_merged = merge_conferences(df_merged, df_remote)
320+
321+
# Concatenate the new data with the existing data
322+
df_new = pd.concat([df_new, df_merged], ignore_index=True)
323+
for _index, row in df_missing.iterrows():
324+
325+
reverse_title_data = reverse_titles.get(row["conference"])
326+
if reverse_title_data is None:
334327
reverse_title = f"{row['conference']} {row['year']}"
328+
else:
329+
# Get the first variation from the reverse title data
330+
reverse_title_data = reverse_title_data.get("variations")
331+
if reverse_title_data:
332+
reverse_title = f"{reverse_title_data[0]} {row['year']}"
333+
else:
334+
reverse_title = f"{row['conference']} {row['year']}"
335335

336-
timezone_str = row["timezone"] if isinstance(row["timezone"], str) else "UTC"
337-
dates = f'{create_nice_date(row)["date"]} ({timezone_str})'
338-
link = f'<a href="{row["link"]}">{row["conference"]}</a>'
339-
out = f""" * name of the event: {reverse_title}
336+
timezone_str = row["timezone"] if isinstance(row["timezone"], str) else "UTC"
337+
dates = f'{create_nice_date(row)["date"]} ({timezone_str})'
338+
link = f'<a href="{row["link"]}">{row["conference"]}</a>'
339+
out = f""" * name of the event: {reverse_title}
340340
* type of event: conference
341341
* focus on Python: yes
342342
* approximate number of attendees: Unknown
343343
* location (incl. country): {row["place"]}
344344
* dates/times/recurrence (incl. time zone): {dates})
345345
* HTML link using the format <a href="http://url/">name of the event</a>: {link}"""
346-
with Path("missing_conferences.txt").open("a") as f:
347-
f.write(out + "\n\n")
348-
Path(".tmp").mkdir(exist_ok=True, parents=True)
349-
Path(".tmp", f"{reverse_title}.ics".lower().replace(" ", "-")).write_text(
350-
f"""BEGIN:VCALENDAR
346+
with Path("missing_conferences.txt").open("a") as f:
347+
f.write(out + "\n\n")
348+
Path(".tmp").mkdir(exist_ok=True, parents=True)
349+
Path(".tmp", f"{reverse_title}.ics".lower().replace(" ", "-")).write_text(
350+
f"""BEGIN:VCALENDAR
351351
VERSION:2.0
352352
BEGIN:VEVENT
353353
SUMMARY:{reverse_title}
@@ -357,7 +357,7 @@ def main(year=None, base="") -> bool:
357357
LOCATION:{ row.place }
358358
END:VEVENT
359359
END:VCALENDAR""",
360-
)
360+
)
361361
processed_years += 1
362362

363363
logger.info(f"Fuzzy matching complete: processed {processed_years} years")

0 commit comments

Comments
 (0)