Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion utils/tidy_conf/deduplicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

def merge_near_duplicates(group):
# Fill missing values with the next value then take the first row
with pd.option_context('future.no_silent_downcasting', True):
with pd.option_context("future.no_silent_downcasting", True):
group = group.bfill().ffill().infer_objects(copy=False)
return group.iloc[0]

Expand Down
7 changes: 6 additions & 1 deletion utils/tidy_conf/subs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ def auto_add_sub(data):


def load_subs():
    """Load sub keywords from subs.yml.

    The file is resolved relative to this module rather than the current
    working directory, so the call works no matter where the process was
    started from.

    Returns
    -------
    The parsed YAML content of ``data/subs.yml`` (whatever
    ``yaml.safe_load`` produces for that file).

    Raises
    ------
    FileNotFoundError
        If ``data/subs.yml`` does not exist next to this module.
    """
    subs_path = Path(__file__).parent / "data" / "subs.yml"
    with subs_path.open(encoding="utf-8") as file:
        return yaml.safe_load(file)
74 changes: 73 additions & 1 deletion utils/tidy_conf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,60 @@ def query_yes_no(question, default="no"):
sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")


def fill_missing_required(df):
def _load_subs_keywords():
    """Load sub keywords from subs.yml for auto-detection.

    Returns
    -------
    dict
        The mapping returned by ``load_subs()``. An empty dict is returned
        when the ``subs`` module cannot be imported, when the YAML file is
        missing, or when the file parses to nothing, so callers can fall
        back to DEFAULT_SUB.
    """
    try:
        # Imported lazily so a missing or broken subs module degrades to "no keywords".
        from .subs import load_subs

        # An empty subs.yml parses to None; normalise so callers always
        # receive something with .items().
        return load_subs() or {}
    except (FileNotFoundError, ImportError):
        return {}


def _auto_detect_sub(conference_name: str) -> str | None:
"""Auto-detect sub category based on conference name.

Parameters
----------
conference_name : str
Name of the conference

Returns
-------
str | None
Sub category string if matched, None otherwise.
"""
keywords = _load_subs_keywords()
name_lower = conference_name.lower()
for sub_key, sub_keywords in keywords.items():
if any(word in name_lower for word in sub_keywords):
return sub_key
return None


# Default sub value for conferences that don't match any keyword.
# fill_missing_required() assigns it only when stdin is not a TTY; in an
# interactive session the user is prompted for the value instead.
DEFAULT_SUB = "PY"


def fill_missing_required(df: pd.DataFrame) -> pd.DataFrame:
"""Fill missing required fields in the DataFrame.

In non-interactive environments (CI), uses auto-detection and defaults
instead of prompting for user input.

Parameters
----------
df : pd.DataFrame
DataFrame with conference data

Returns
-------
pd.DataFrame
DataFrame with missing required fields filled.
"""
required = [
"conference",
"year",
Expand All @@ -110,9 +163,28 @@ def fill_missing_required(df):
"sub",
]

is_interactive = sys.stdin.isatty()

for i, row in df.copy().iterrows():
for keyword in required:
if pd.isna(row[keyword]):
# Handle sub field specially - try auto-detection first
if keyword == "sub":
detected_sub = _auto_detect_sub(row["conference"])
if detected_sub:
df.loc[i, keyword] = detected_sub
continue
# Use default if no match and non-interactive
if not is_interactive:
df.loc[i, keyword] = DEFAULT_SUB
continue

# In non-interactive mode, skip prompting for other fields
if not is_interactive:
# Leave as NaN - will be caught by validation later
continue

# Interactive mode - prompt user
user_input = input(
f"What's the value of '{keyword}' for conference '{row['conference']}' check {row['link']} ?: ",
)
Expand Down