diff --git a/utils/tidy_conf/deduplicate.py b/utils/tidy_conf/deduplicate.py
index e0852b9fc0..868fab0810 100644
--- a/utils/tidy_conf/deduplicate.py
+++ b/utils/tidy_conf/deduplicate.py
@@ -3,7 +3,7 @@
 
 def merge_near_duplicates(group):
     # Fill missing values with the next value then take the first row
-    with pd.option_context('future.no_silent_downcasting', True):
+    with pd.option_context("future.no_silent_downcasting", True):
         group = group.bfill().ffill().infer_objects(copy=False)
     return group.iloc[0]
 
diff --git a/utils/tidy_conf/subs.py b/utils/tidy_conf/subs.py
index 3deeef596d..2bcbddc3e7 100644
--- a/utils/tidy_conf/subs.py
+++ b/utils/tidy_conf/subs.py
@@ -20,6 +20,11 @@ def auto_add_sub(data):
 
 
 def load_subs():
-    with Path("utils", "tidy_conf", "data", "subs.yml").open(encoding="utf-8") as file:
+    """Load sub keywords from subs.yml.
+
+    Uses module-relative path for robustness regardless of working directory.
+    """
+    subs_path = Path(__file__).parent / "data" / "subs.yml"
+    with subs_path.open(encoding="utf-8") as file:
         data = yaml.safe_load(file)
     return data
diff --git a/utils/tidy_conf/utils.py b/utils/tidy_conf/utils.py
index 12aa8d8347..0ee7eafa4d 100644
--- a/utils/tidy_conf/utils.py
+++ b/utils/tidy_conf/utils.py
@@ -98,7 +98,65 @@ def query_yes_no(question, default="no"):
             sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
 
 
-def fill_missing_required(df):
+def _load_subs_keywords():
+    """Load sub keywords from subs.yml for auto-detection.
+
+    Returns an empty dict if loading fails or subs.yml has no content.
+    """
+    try:
+        from .subs import load_subs
+
+        return load_subs() or {}  # safe_load gives None for an empty file
+    except (FileNotFoundError, ImportError):
+        return {}
+
+
+def _auto_detect_sub(conference_name: str) -> str | None:
+    """Auto-detect sub category based on conference name.
+
+    Parameters
+    ----------
+    conference_name : str
+        Name of the conference. A non-string value (e.g. NaN for a missing
+        name) never matches.
+
+    Returns
+    -------
+    str | None
+        Sub category string if matched, None otherwise.
+    """
+    # "conference" is itself a required field and may still be NaN (a float)
+    # here; return None so the caller can fall back instead of raising.
+    if not isinstance(conference_name, str):
+        return None
+    keywords = _load_subs_keywords()
+    name_lower = conference_name.lower()
+    for sub_key, sub_keywords in keywords.items():
+        if any(word in name_lower for word in sub_keywords):
+            return sub_key
+    return None
+
+
+# Default sub value for conferences that don't match any keyword
+DEFAULT_SUB = "PY"
+
+
+def fill_missing_required(df: pd.DataFrame) -> pd.DataFrame:
+    """Fill missing required fields in the DataFrame.
+
+    In non-interactive environments (CI), uses auto-detection and defaults
+    instead of prompting for user input.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame with conference data
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with missing required fields filled.
+    """
     required = [
         "conference",
         "year",
@@ -110,9 +168,28 @@ def fill_missing_required(df):
         "sub",
     ]
 
+    is_interactive = sys.stdin.isatty()
+
     for i, row in df.copy().iterrows():
         for keyword in required:
             if pd.isna(row[keyword]):
+                # Handle sub field specially - try auto-detection first
+                if keyword == "sub":
+                    detected_sub = _auto_detect_sub(row["conference"])
+                    if detected_sub:
+                        df.loc[i, keyword] = detected_sub
+                        continue
+                    # Use default if no match and non-interactive
+                    if not is_interactive:
+                        df.loc[i, keyword] = DEFAULT_SUB
+                        continue
+
+                # In non-interactive mode, skip prompting for other fields
+                if not is_interactive:
+                    # Leave as NaN - will be caught by validation later
+                    continue
+
+                # Interactive mode - prompt user
                 user_input = input(
                     f"What's the value of '{keyword}' for conference '{row['conference']}' check {row['link']} ?: ",
                 )