feat: author、source表规则#441
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces two new validation rules, RuleAuthorFieldValidation and RuleSourceFieldValidation, to validate OpenAlex author and source metadata fields. The review feedback focuses on aligning the validation logic with the OpenAlex schema by correctly handling optional or null fields (such as orcid, institution, host_organization, and created_date), removing an unused parameter in a date helper function, and ensuring consistent integer validation for year and work counts in the source rule.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| def check_orcid(orcid: Any) -> ValidationResult: | ||
| if orcid is None: | ||
| return _fail("value is null") | ||
| if not isinstance(orcid, str): | ||
| return _fail("value must be a string") | ||
| if orcid == "": | ||
| return _ok() | ||
| if not URL_RE.fullmatch(orcid): | ||
| return _fail("value is not a valid URL") | ||
| return _ok() |
There was a problem hiding this comment.
在 OpenAlex 中,作者的 ORCID 是可选的,如果未绑定则为 null。当前的校验逻辑在 orcid 为 None 时会直接报错,而允许 orcid 为空字符串 ""。建议统一处理 None 和 "",避免对没有 ORCID 的正常作者记录报错。
def check_orcid(orcid: Any) -> ValidationResult:
if orcid is None or orcid == "":
return _ok()
if not isinstance(orcid, str):
return _fail("value must be a string")
if not URL_RE.fullmatch(orcid):
return _fail("value is not a valid URL")
return _ok()| def check_affiliations(affiliations: Any) -> ValidationResult: | ||
| if affiliations is None: | ||
| return _fail("value is null") | ||
| if not isinstance(affiliations, list): | ||
| return _fail("value must be a list") | ||
| for idx, item in enumerate(affiliations): | ||
| if not isinstance(item, dict): | ||
| return _fail(f"item[{idx}] must be an object") | ||
| institution = item.get("institution") | ||
| years = item.get("years") | ||
| if not isinstance(institution, dict): | ||
| return _fail(f"item[{idx}].institution must be an object") | ||
| if not isinstance(years, list): | ||
| return _fail(f"item[{idx}].years must be a list") | ||
| for year_idx, year in enumerate(years): | ||
| if not _is_int(year): | ||
| return _fail(f"item[{idx}].years[{year_idx}] must be an integer") | ||
| return _ok() |
There was a problem hiding this comment.
在 OpenAlex 中,如果作者的机构未能成功匹配到数据库中的机构,affiliations 中的 institution 字段会为 null。当前的校验逻辑直接使用 isinstance(institution, dict) 判定,这会导致 institution 为 null 时校验失败。建议允许 institution 为 None。
| def check_affiliations(affiliations: Any) -> ValidationResult: | |
| if affiliations is None: | |
| return _fail("value is null") | |
| if not isinstance(affiliations, list): | |
| return _fail("value must be a list") | |
| for idx, item in enumerate(affiliations): | |
| if not isinstance(item, dict): | |
| return _fail(f"item[{idx}] must be an object") | |
| institution = item.get("institution") | |
| years = item.get("years") | |
| if not isinstance(institution, dict): | |
| return _fail(f"item[{idx}].institution must be an object") | |
| if not isinstance(years, list): | |
| return _fail(f"item[{idx}].years must be a list") | |
| for year_idx, year in enumerate(years): | |
| if not _is_int(year): | |
| return _fail(f"item[{idx}].years[{year_idx}] must be an integer") | |
| return _ok() | |
| def check_affiliations(affiliations: Any) -> ValidationResult: | |
| if affiliations is None: | |
| return _fail("value is null") | |
| if not isinstance(affiliations, list): | |
| return _fail("value must be a list") | |
| for idx, item in enumerate(affiliations): | |
| if not isinstance(item, dict): | |
| return _fail(f"item[{idx}] must be an object") | |
| institution = item.get("institution") | |
| years = item.get("years") | |
| if institution is not None and not isinstance(institution, dict): | |
| return _fail(f"item[{idx}].institution must be an object") | |
| if not isinstance(years, list): | |
| return _fail(f"item[{idx}].years must be a list") | |
| for year_idx, year in enumerate(years): | |
| if not _is_int(year): | |
| return _fail(f"item[{idx}].years[{year_idx}] must be an integer") | |
| return _ok() |
| def check_host_organization(host_organization: Any) -> ValidationResult: | ||
| if host_organization is None: | ||
| return _fail("value is null") | ||
| if not isinstance(host_organization, str): | ||
| return _fail("value must be a string") | ||
| if host_organization == "": | ||
| return _ok() | ||
| if not URL_RE.fullmatch(host_organization): | ||
| return _fail("value is not a valid URL") | ||
| return _ok() |
There was a problem hiding this comment.
在 OpenAlex 中,Source 的 host_organization 字段是可选的,如果不存在则为 null。当前的校验逻辑在 host_organization 为 None 时会直接报错,而允许其为空字符串 ""。建议统一处理 None 和 "",避免对没有 host_organization 的正常 Source 记录报错。
def check_host_organization(host_organization: Any) -> ValidationResult:
if host_organization is None or host_organization == "":
return _ok()
if not isinstance(host_organization, str):
return _fail("value must be a string")
if not URL_RE.fullmatch(host_organization):
return _fail("value is not a valid URL")
return _ok()| def _check_date_yyyy_mm_dd(value: Any, field_name: str) -> ValidationResult: | ||
| if value is None: | ||
| return _fail("value is null") | ||
| if not isinstance(value, str): | ||
| return _fail("value must be a string") | ||
| if not re.fullmatch(r"\d{4}-\d{2}-\d{2}", value): | ||
| return _fail("value must match YYYY-MM-DD") | ||
| try: | ||
| datetime.strptime(value, "%Y-%m-%d") | ||
| except ValueError: | ||
| return _fail("value is not a valid calendar date") | ||
| return _ok() | ||
|
|
||
|
|
||
| def check_updated_date(updated_date: Any) -> ValidationResult: | ||
| return _check_date_yyyy_mm_dd(updated_date, "updated_date") | ||
|
|
||
|
|
||
| def check_created_date(created_date: Any) -> ValidationResult: | ||
| return _check_date_yyyy_mm_dd(created_date, "created_date") |
There was a problem hiding this comment.
_check_date_yyyy_mm_dd 函数中的 field_name 参数未被使用。建议将其移除,并简化调用方的传参。
| def _check_date_yyyy_mm_dd(value: Any, field_name: str) -> ValidationResult: | |
| if value is None: | |
| return _fail("value is null") | |
| if not isinstance(value, str): | |
| return _fail("value must be a string") | |
| if not re.fullmatch(r"\d{4}-\d{2}-\d{2}", value): | |
| return _fail("value must match YYYY-MM-DD") | |
| try: | |
| datetime.strptime(value, "%Y-%m-%d") | |
| except ValueError: | |
| return _fail("value is not a valid calendar date") | |
| return _ok() | |
| def check_updated_date(updated_date: Any) -> ValidationResult: | |
| return _check_date_yyyy_mm_dd(updated_date, "updated_date") | |
| def check_created_date(created_date: Any) -> ValidationResult: | |
| return _check_date_yyyy_mm_dd(created_date, "created_date") | |
| def _check_date_yyyy_mm_dd(value: Any) -> ValidationResult: | |
| if value is None: | |
| return _fail("value is null") | |
| if not isinstance(value, str): | |
| return _fail("value must be a string") | |
| if not re.fullmatch(r"\d{4}-\d{2}-\d{2}", value): | |
| return _fail("value must match YYYY-MM-DD") | |
| try: | |
| datetime.strptime(value, "%Y-%m-%d") | |
| except ValueError: | |
| return _fail("value is not a valid calendar date") | |
| return _ok() | |
| def check_updated_date(updated_date: Any) -> ValidationResult: | |
| return _check_date_yyyy_mm_dd(updated_date) | |
| def check_created_date(created_date: Any) -> ValidationResult: | |
| return _check_date_yyyy_mm_dd(created_date) |
| def check_counts_by_year(counts_by_year: Any) -> ValidationResult: | ||
| if counts_by_year is None: | ||
| return _fail("value is null") | ||
| if not isinstance(counts_by_year, list): | ||
| return _fail("value must be a list") | ||
| required_keys = {"year", "works_count", "oa_works_count", "cited_by_count"} | ||
| for idx, item in enumerate(counts_by_year): | ||
| if not isinstance(item, dict): | ||
| return _fail(f"counts_by_year[{idx}] must be an object") | ||
| missing = required_keys - set(item.keys()) | ||
| if missing: | ||
| return _fail(f"counts_by_year[{idx}] missing keys {sorted(missing)}") | ||
| return _ok() |
There was a problem hiding this comment.
在 rule_source.py 中,check_counts_by_year 仅校验了必需的 key 是否存在,但未校验其值是否为整数。这与 rule_author.py 中的校验逻辑不一致。建议增加对 year、works_count、oa_works_count 和 cited_by_count 字段的整数类型校验。
def check_counts_by_year(counts_by_year: Any) -> ValidationResult:
if counts_by_year is None:
return _fail("value is null")
if not isinstance(counts_by_year, list):
return _fail("value must be a list")
required_keys = {"year", "works_count", "oa_works_count", "cited_by_count"}
for idx, item in enumerate(counts_by_year):
if not isinstance(item, dict):
return _fail(f"counts_by_year[{idx}] must be an object")
missing = required_keys - set(item.keys())
if missing:
return _fail(f"counts_by_year[{idx}] missing keys {sorted(missing)}")
for key in ("year", "works_count", "oa_works_count", "cited_by_count"):
val = item.get(key)
if not (isinstance(val, int) and not isinstance(val, bool)):
return _fail(f"counts_by_year[{idx}].{key} must be an integer")
return _ok()| def check_created_date(created_date: Any) -> ValidationResult: | ||
| if created_date == "": | ||
| return _ok() | ||
| return _check_date_yyyy_mm_dd(created_date) |
There was a problem hiding this comment.
在 check_created_date 中,当 created_date 为 "" 时允许通过,但如果为 None,则会因为调用 _check_date_yyyy_mm_dd 而报错。建议统一将 None 和 "" 均视为合法的空值。
| def check_created_date(created_date: Any) -> ValidationResult: | |
| if created_date == "": | |
| return _ok() | |
| return _check_date_yyyy_mm_dd(created_date) | |
| def check_created_date(created_date: Any) -> ValidationResult: | |
| if created_date is None or created_date == "": | |
| return _ok() | |
| return _check_date_yyyy_mm_dd(created_date) |
No description provided.