From d6de13661519c6d5698f9165fb4123f1a1ec73d8 Mon Sep 17 00:00:00 2001 From: guhuaiyu Date: Mon, 8 Jun 2026 11:46:38 +0800 Subject: [PATCH] Add SciBase rule report validators --- dingo/exec/local.py | 18 + dingo/exec/spark.py | 26 +- dingo/model/model.py | 24 +- dingo/model/rule/scibase/__init__.py | 2 +- .../scibase/assets/ebook_unique_mapping.csv | 21 + .../rule/scibase/assets/osi_arxiv_mapping.csv | 61 + .../scibase/assets/paper_unique_mapping.csv | 42 + .../rule/scibase/assets/patent_mapping.csv | 109 + .../assets/union_unique_data_mapping.csv | 109 + dingo/model/rule/scibase/meta_ebook_unique.py | 1520 ++++++++ dingo/model/rule/scibase/meta_paper_data.py | 3408 +++++++++++++++++ dingo/model/rule/scibase/meta_paper_unique.py | 2278 +++++++++++ .../rule/scibase/meta_patent_parsed_info.py | 1720 +++++++++ dingo/model/rule/scibase/report_utils.py | 163 + dingo/model/rule/scibase/rule_quanliang.py | 655 ---- .../rule/scibase/union_unique_meta_data.py | 2548 ++++++++++++ docs/metrics.md | 2 +- setup.py | 6 + 18 files changed, 12044 insertions(+), 668 deletions(-) create mode 100644 dingo/model/rule/scibase/assets/ebook_unique_mapping.csv create mode 100644 dingo/model/rule/scibase/assets/osi_arxiv_mapping.csv create mode 100644 dingo/model/rule/scibase/assets/paper_unique_mapping.csv create mode 100644 dingo/model/rule/scibase/assets/patent_mapping.csv create mode 100644 dingo/model/rule/scibase/assets/union_unique_data_mapping.csv create mode 100644 dingo/model/rule/scibase/meta_ebook_unique.py create mode 100644 dingo/model/rule/scibase/meta_paper_data.py create mode 100644 dingo/model/rule/scibase/meta_paper_unique.py create mode 100644 dingo/model/rule/scibase/meta_patent_parsed_info.py create mode 100644 dingo/model/rule/scibase/report_utils.py delete mode 100644 dingo/model/rule/scibase/rule_quanliang.py create mode 100644 dingo/model/rule/scibase/union_unique_meta_data.py diff --git a/dingo/exec/local.py b/dingo/exec/local.py index 5f11b1ea..8cf0f53f 100644 
--- a/dingo/exec/local.py +++ b/dingo/exec/local.py @@ -176,6 +176,24 @@ def evaluate_single_data(self, dingo_id: str, eval_fields: dict, eval_type: str, model_cls = Model.rule_name_map.get(e_c_i.name) model = model_cls() # 实例化类为对象,避免多线程配置覆盖 Model.set_config_rule(model, e_c_i.config) + if getattr(model_cls, "__module__", "").startswith("dingo.model.rule.scibase."): + if "dynamic_config" not in model.__dict__: + model.dynamic_config = model.dynamic_config.model_copy(deep=True) + if model.dynamic_config.parameters is None: + model.dynamic_config.parameters = {} + model.dynamic_config.parameters.setdefault( + "_dingo_dataset_sql_config", + self.input_args.dataset.sql_config.model_dump(), + ) + model.dynamic_config.parameters.setdefault( + "_dingo_dataset_s3_config", + self.input_args.dataset.s3_config.model_dump(), + ) + model.dynamic_config.parameters.setdefault("_dingo_dataset_source", self.input_args.dataset.source) + model.dynamic_config.parameters.setdefault("_dingo_dataset_format", self.input_args.dataset.format) + model.dynamic_config.parameters.setdefault("_dingo_input_path", self.input_args.input_path) + setattr(model_cls, "dynamic_config", model.dynamic_config) + model = model_cls elif eval_type == 'llm': model_cls = Model.llm_name_map.get(e_c_i.name) model = model_cls() diff --git a/dingo/exec/spark.py b/dingo/exec/spark.py index dd57b6be..b2235a31 100644 --- a/dingo/exec/spark.py +++ b/dingo/exec/spark.py @@ -236,8 +236,30 @@ def evaluate_item(self, eval_fields: dict, eval_type: str, map_data: dict, eval_ for e_c_i in eval_list: if eval_type == 'rule': - model = Model.rule_name_map.get(e_c_i.name) - Model.set_config_rule(model, e_c_i.config) + model_cls = Model.rule_name_map.get(e_c_i.name) + if getattr(model_cls, "__module__", "").startswith("dingo.model.rule.scibase."): + model = model_cls() + Model.set_config_rule(model, e_c_i.config) + if "dynamic_config" not in model.__dict__: + model.dynamic_config = model.dynamic_config.model_copy(deep=True) + if 
model.dynamic_config.parameters is None: + model.dynamic_config.parameters = {} + model.dynamic_config.parameters.setdefault( + "_dingo_dataset_sql_config", + self.input_args.dataset.sql_config.model_dump(), + ) + model.dynamic_config.parameters.setdefault( + "_dingo_dataset_s3_config", + self.input_args.dataset.s3_config.model_dump(), + ) + model.dynamic_config.parameters.setdefault("_dingo_dataset_source", self.input_args.dataset.source) + model.dynamic_config.parameters.setdefault("_dingo_dataset_format", self.input_args.dataset.format) + model.dynamic_config.parameters.setdefault("_dingo_input_path", self.input_args.input_path) + setattr(model_cls, "dynamic_config", model.dynamic_config) + model = model_cls + else: + model = model_cls + Model.set_config_rule(model, e_c_i.config) elif eval_type == 'llm': model = Model.llm_name_map.get(e_c_i.name) Model.set_config_llm(model, e_c_i.config) diff --git a/dingo/model/model.py b/dingo/model/model.py index fbcbc7a8..9bd9cec1 100644 --- a/dingo/model/model.py +++ b/dingo/model/model.py @@ -124,16 +124,22 @@ def load_model(cls): if cls.module_loaded: return this_module_directory = os.path.dirname(os.path.abspath(__file__)) - # rule auto register - for file in os.listdir(os.path.join(this_module_directory, "rule")): - path = os.path.join(this_module_directory, "rule", file) - if ( - os.path.isfile(path) - and file.endswith(".py") - and not file == "__init__.py" - ): + # rule auto register - recursively scan subdirectories + rule_base_dir = os.path.join(this_module_directory, "rule") + for root, dirs, files in os.walk(rule_base_dir): + dirs[:] = [d for d in dirs if d != "__pycache__"] + + for file in files: + if not file.endswith(".py") or file == "__init__.py": + continue + rel_path = os.path.relpath(root, rule_base_dir) + if rel_path == ".": + module_name = f"dingo.model.rule.{file[:-3]}" + else: + rel_module = rel_path.replace(os.sep, ".") + module_name = f"dingo.model.rule.{rel_module}.{file[:-3]}" try: - 
importlib.import_module("dingo.model.rule." + file.split(".")[0]) + importlib.import_module(module_name) except ModuleNotFoundError as e: log.debug(e) diff --git a/dingo/model/rule/scibase/__init__.py b/dingo/model/rule/scibase/__init__.py index ac7fae13..cef22734 100644 --- a/dingo/model/rule/scibase/__init__.py +++ b/dingo/model/rule/scibase/__init__.py @@ -1 +1 @@ -"""Quanliang/scibase rule implementations.""" +"""SciBase QA rule implementations.""" diff --git a/dingo/model/rule/scibase/assets/ebook_unique_mapping.csv b/dingo/model/rule/scibase/assets/ebook_unique_mapping.csv new file mode 100644 index 00000000..7b1a02fa --- /dev/null +++ b/dingo/model/rule/scibase/assets/ebook_unique_mapping.csv @@ -0,0 +1,21 @@ +字段名,数据类型,聚合策略,策略参数,源字段名,去重 / 聚合处理逻辑 +isbns,array,isbn_normalize,,,数组聚合,全部转换为13位ISBN格式,10位前面加978,13位保留,其他丢弃,全局去重 +isbn13,string,isbn_min,,,唯一去重键;从isbns数组中取最小的归一化13位ISBN +title,string,freq_lex_max,min_len=2;max_len=1000,,在非空值里取词频最高;剔除长度<2或>1000极值;词频相同取字典序最大值 +abstract,string,freq_lex_max,min_len=10;max_len=10000,,在非空值里取词频最高;剔除长度<10或>10000极值;词频相同取字典序最大值 +language,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 +type,array,dedup_array,lower=true,,统一小写后数组聚合去重 +author,array,dedup_array,,,数组聚合去重 +contributors,array,dedup_array,,,数组聚合去重 +indexed_in,array,dedup_array,,,数组聚合去重 +identifiers,"map",merge_map,,,"key去重,相同key取max(value)" +publication_publisher,array,dedup_array,,publisher,数组聚合去重;原字段名称publisher +publication_published_year,int,freq_int_max,min_val=1000;max_val=CURRENT_YEAR;extract_year=true,published_year,在非空值里取词频最高;剔除<1000或>当年极值;词频相同取最大值 +publication_published_place,array,dedup_array,,published_place,数组聚合去重;原字段名称published_place +publication_published_country,array,dedup_array,,published_country,数组聚合去重;原字段名称published_country +publication_pages,int,max_int,,pages,取本书多版本中的最大页数 +subjects,array,dedup_array,,,数组聚合去重 +genre,array,dedup_array,,,数组聚合去重 +category,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 
+access_oa_url,array,dedup_array,,oa_url,数组聚合去重;原字段名称oa_url +dt,string,latest_dt,,,保留最新分区日期 diff --git a/dingo/model/rule/scibase/assets/osi_arxiv_mapping.csv b/dingo/model/rule/scibase/assets/osi_arxiv_mapping.csv new file mode 100644 index 00000000..e83733d2 --- /dev/null +++ b/dingo/model/rule/scibase/assets/osi_arxiv_mapping.csv @@ -0,0 +1,61 @@ +预期字段,arxiv对应字段,字段值数据类型 +track_id,数仓自己赋予,String +title,title,String +abstract,abstract,String +language,无,String +doi,doi;在 doi 为空时,使用"10.48550/arxiv."拼接doc_id,String +type,无,List[string] +author,author 解析为作者数组(字符串拆分),List[string] +identifiers,oaiId->oai_identifier、"arxivId->paper_id去掉http前缀",Object +indexed_in,新增字符串"arxiv",List[string] +published_date,updated,String +published_year,updated中年份,s3是RFC 1123 时间格式,需要转化为 yyyy-mm-dd 格式和db做对比,Integer +venue,,Object +venue.name,journal_ref :从 journal_ref 解析期刊/会议名(后续处理),String +venue.type,无,String +venue.issn,无,List[string] +venue.publisher,无,List[string] +venue.biblio,,Object +venue.biblio.volume,从 journal_ref解析(后续处理),String +venue.biblio.issue,从 journal_ref解析(后续处理),String +venue.biblio.pages,从 journal_ref解析(后续处理),String +access_is_oa,布尔值true,String +access_oa_status,空字符串,String +access_oa_url,pdf_url(get_pdf=0 时为""),String +access_license,license_url 将协议链接映射为对应可选值填入,String +keywords,无,List[string] +fieldsOfStudy,无,List[object] +s2FieldsOfStudy,无,List[object] +primary_topic,无,Object +topics,无,List[object] +concepts,无,List[object] +subject,无,String +major,无,String +major_2,无,String +major_3,无,String +category,无,String +area,无,String +grade_class,无,String +grade,无,String +origin_id,doc_id,String +origin_osi,取值"arxiv",String +origin_db_source,无,String +reference_count,无,Integer +citation_count,无,Integer +influential_citation_count,无,Integer +references,无,List[string] +related_works,无,List[string] +citation_normalized_percentile,无,Object +cited_by_percentile_year,无,Object +fwci,无,Float +cited_by_api_url,无,String +locations,,List[object] 
+locations.type,对pdf_url来说,get_pdf值为1时,值为download,get_pdf值为0时,保持空字符串;对source_url来说,get_source值为1时值为download,get_source值为0时保持空字符串。,String +locations.url,"pdf_url,source_url",String +locations.license,license_url 将协议链接映射为对应可选值,String +locations.is_oa,对pdf_url来说,get_pdf值为1时,值为true,get_pdf值为0时为false;对source_url来说,get_source值为1时值为true,get_source值为0时为false。,String +classifications,,Object +mesh,无,List[object] +msc_class,msc_class,String +acm_class,acm_class,String +arxiv_category,category,List[string] diff --git a/dingo/model/rule/scibase/assets/paper_unique_mapping.csv b/dingo/model/rule/scibase/assets/paper_unique_mapping.csv new file mode 100644 index 00000000..01395384 --- /dev/null +++ b/dingo/model/rule/scibase/assets/paper_unique_mapping.csv @@ -0,0 +1,42 @@ +字段名,数据类型,聚合策略,策略参数,源字段名,去重 / 聚合处理逻辑 +doi,string,key_lower,,,唯一去重键,精确匹配,统一小写 +identifiers,"map",merge_identifiers,,,"MAP聚合,key去重,doi/DOI/mag/MAG小写后与origin_osi拼接,相同key取max(value)" +indexed_in,array,dedup_array,,,数组聚合并去重 +type,array,dedup_array,lower=true,,统一小写后数组聚合去重 +title,string,freq_lex_max,min_len=2;max_len=1000,,在非空值里取词频最高;剔除长度<2或>1000极值;词频相同取字典序最大值 +abstract,string,freq_lex_max,min_len=10;max_len=10000,,在非空值里取词频最高;剔除长度<10或>10000极值;词频相同取字典序最大值 +author,"array>",dedup_struct,,,数组聚合去重 +language,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 +published_year,int,freq_int_max,min_val=1000;max_val=CURRENT_YEAR,,在非空值里取词频最高;剔除<1000或>当年极值;词频相同取最大值 +published_date,string,freq_date,,,在非空值里取词频最高的出版日期;剔除年份异常值;词频相同取字典序最大值 +venue_name,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 +venue_type,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 +venue_issn,array,dedup_array,,,数组聚合去重 +venue_publisher,array,dedup_array,,,数组聚合去重 +access_license,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 +biblio_volume,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 +biblio_issue,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 +biblio_pages,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 +access_is_oa,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 
+access_oa_status,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 +access_oa_url,array,dedup_array,,,数组聚合去重 +locations,"array>",dedup_locations,,,"STRUCT转成STRING再去重,key已排序" +keywords,array,dedup_array,,,数组聚合去重 +fieldsOfStudy,"array>",dedup_map,,,MAP转成STRING再去重 +s2fieldsofstudy,"array>",dedup_map,,,MAP转成STRING再去重 +primary_topic,"STRUCT,field:STRUCT,domain:STRUCT>",freq_struct,,,在非空值里取词频最高;词频相同取字典序最大值 +topics,"ARRAY,field:STRUCT,domain:STRUCT>>",dedup_struct,,,STRUCT转成STRING再去重 +concepts,"array>",dedup_map,,,MAP转成STRING再去重 +category,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 +reference_count,int,freq_int_max,,,在非空值里取词频最高;词频相同取最大值 +citation_count,int,freq_int_max,,,在非空值里取词频最高;词频相同取最大值 +influential_citation_count,int,freq_int_max,,,在非空值里取词频最高;词频相同取最大值 +references,array,dedup_array,,,数组聚合去重 +related_works,array,dedup_array,,,数组聚合去重 +citation_normalized_percentile,"MAP",merge_map,,,"MAP聚合,key去重,相同key取max(value)" +cited_by_percentile_year,"MAP",merge_map,,,"MAP聚合,key去重,相同key取max(value)" +fwci,"decimal(15,4)",freq_decimal_max,,,在非空值里取词频最高;词频相同取最大值 +cited_by_api_url,string,freq_lex_max,,,在非空值里取词频最高;词频相同取字典序最大值 +mesh,"ARRAY>",dedup_map,,,MAP转成STRING再去重 +classifications,"STRUCT>,msc_class:STRING,acm_class:STRING,arxiv_category:ARRAY>",random_pick_cls,,,"mesh随机取一条不为空的值;msc_class/acm_class/arxiv_category从arxiv记录取" +dt,string,latest_dt,,,保留最新分区日期 diff --git a/dingo/model/rule/scibase/assets/patent_mapping.csv b/dingo/model/rule/scibase/assets/patent_mapping.csv new file mode 100644 index 00000000..0f2bd1b2 --- /dev/null +++ b/dingo/model/rule/scibase/assets/patent_mapping.csv @@ -0,0 +1,109 @@ +预期字段名,xml映射字段,数据类型,字段描述,有效性规则,可空,模块 +document_number,business:BibliographicData/business:PublicationReference[@dataFormat='original']/base:DocumentID → WIPOST3Code+DocNumber+Kind;回退 standard/original 或根 @country+@docNumber+@kind,string,文件号(包含所有公开/公告阶段的专利文件编号(如公开号、专利号)。),非空;同一局内唯一;保留原始格式,否*,著录信息 
+document_kind_text,business:BibliographicData/business:SpecificBibliographicData/business:OriginalKindCode,string,文件种类文字描述,与 13 一致,否*,著录信息 +document_kind_code,PublicationReference(original)/base:DocumentID/base:Kind;回退根 @kind,string,ST.16 文件种类代码,必须是合法 ST.16 代码,可选值A1、B、U、S、A、B1、B2,否*,著录信息 +document_status_code,business:Abstract/@status;回退根 @status,string,数据版本状态标识,"不改变专利本身的法律效力,仅反映数据文件的版本可靠性。例如: +A:原始首次发布的数据; +C:因内容错误(如印刷错误、技术描述修正)发布的更正版数据; +D:数据已废弃(通常因重大错误)。",,著录信息 +document_wipo_country_code,PublicationReference(original)/base:DocumentID/base:WIPOST3Code;回退根 @country,string,ST.3,,是,著录信息 +publication_date,PublicationReference/base:DocumentID/base:Date(优先 dataFormat=standard/original);回退根 @datePublication,string,,,,著录信息 +publication_language,根 @lang,string,公开语言,ISO 639 两位语言码,是,著录信息 +publication_office_code,PublicationReference/@sourceDB 或 DocumentID/WIPOST3Code;回退根 @country,string,公布局/组织代码,ST.3 两位代码或官方组织标识,条件否**,著录信息 +correction_info,business:BibliographicData/business:PublicAvailabilityDate/business:GrantTerms/business:Disclaimer/base:Text,object,更正/勘误信息,结构需符合 ST.50,是,著录信息 +invention_title,business:BibliographicData/business:InventionTitle[@lang=publication_language],string,发明名称,非空;建议长度限制,否*,技术信息 +ipc,business:ClassificationIPC/business:MainClassification|FurtherClassification[@dataFormat='original'],list[string],IPC(国际专利分类),必须是合法分类符号,是*,技术信息 +ipc_text,business:ClassificationIPC/base:Text(按行拆分),list[string],,,,技术信息 +ipc_edition_statement,business:ClassificationIPC/base:EditionStatement,string,IPC版本号,,,技术信息 +ipcr_classifications,business:ClassificationIPCRDetails/business:ClassificationIPCR/*,list[object],IPCR(改革后 IPC 分类详细信息),建议每项包含 classification_symbol、classification_version_date、classification_level、classification_value、action_date 等结构化信息;保留原始分类号,是,技术信息 +patent_national_classifications,business:ClassificationNational/business:MainClassification[@dataFormat='original'],list[string],国家专利分类号,保留原始格式;建议记录分类体系来源或国家/地区代码,是,技术信息 
+patent_domestic_classifications,business:ClassificationDomestic|DomesticClassification|DomesticPatentClassification/business:MainClassification,list[string],国内/本国专利分类号,保留原始格式;适用于局内自有分类体系,是,技术信息 +patent_fi_classifications,business:ClassificationFI|FIClassification|ClassificationFIData/business:MainClassification|base:Text,list[string],FI 分类号,保留原始格式;适用于日本 FI 分类体系,是,技术信息 +patent_cpc_classifications,business:PatentClassificationDetails/business:PatentClassification[business:ClassificationScheme/@scheme='CPC'];SearchField 下同类节点,list[string],CPC(合作专利分类)分类号,必须是合法 CPC 分类符号;保留原始格式,是,技术信息 +patent_locarno_classes,business:ClassificationLocarno/business:MainClassification,list[string],Locarno Classification工业外观分类号,,是,技术信息 +prior_art_references,business:ReferencesCited/business:Citation/business:ApplicationCitation/business:PublicationReference,list[object],对比文件/现有技术文献,每项建议含 citation_text、doc_number、category,是,技术信息 +search_field,business:SearchField/business:ClassificationNational;PatentClassificationDetails,list[string],检索领域/检索分类,与分类体系一致,是,技术信息 +,,,,,,技术信息 +abstract,business:Abstract/base:Paragraphs,string,摘要全文,非结构化文本,是,全文内容 +description,business:Description/business:Heading|base:Paragraphs|base:Image(递归 business 子节),list[object],说明书全文,非结构化文本,是,全文内容 +claims,business:Claims/business:Claim/business:ClaimText,list[object],权利要求项,每项至少含 claim_id+claim_num+claim_text,是,全文内容 +drawings,business:Drawings/base:Figure/base:Image,list[object],附图信息,每项至少含 figure_id+image_file;可带 width/height/orientation,是,全文内容 +chemistry,//Chemistry|business:ChemistrySection|ChemicalFormulae|ChemicalFormula 内 Chemistry/base:Image,list[object],化学相关专利独有的item信息,,是,全文内容 +content,根下 Abstract+Description+Drawings+Claims 按文档顺序拼接(标题用 invention_title),string,处理后专利全文,,是,全文内容 +applicants,business:Parties/business:ApplicantDetails/business:Applicant/base:AddressBook,list[object],申请人,每项至少 name;建议带 country,条件否**,当事人 
+assignees,business:Parties/business:AssigneeDetails/business:Assignee/base:AddressBook,list[object],专利权人/所有人,每项至少 name,条件否**,当事人 +inventors,business:Parties/business:InventorDetails/business:Inventor/base:AddressBook,list[object],发明人,已知即填;每项至少 name,是,当事人 +designers,business:Parties/business:DesignerDetails/business:Designer/base:AddressBook,list[object],设计人,,是,当事人 +patent_agents,business:Parties/business:AgentDetails/business:Agent|business:AgentDetails/business:Agent/business:Agency,list[object],代理人,每项至少 name,是,当事人 +patent_agency,business:AgentDetails/business:PatentAgency/base:AddressBook;business:CustomerNumber,list[object],代理机构,,是,当事人 +priority_numbers,business:PriorityClaim/base:DocumentID/base:DocNumber,list[string],优先权申请号,至少 1 个;格式按 ST.10/C、ST.34,否*,优先权 +priority_filing_dates,business:PriorityClaim/base:DocumentID/base:Date,array,优先权申请日,日期按 ST.2,否*,优先权 +priority_office_codes,business:PriorityClaim/base:OfficeCode|business:GeneratingOffice|base:WIPOST3Code,list[string],优先权受理局代码,ST.3 两位代码;PCT 用 WO,否*,优先权 +priority_country_codes,business:PriorityClaim/base:DocumentID/base:WIPOST3Code,list[string],区域/国际优先权指定国家代码,ST.3;至少一个缔约方/WTO 成员,是,优先权 +public_availability_group,business:PublicAvailabilityDate/*(未映射到具名字段的子节点日期),object,公开/公告日期分组,仅作容器,是,公开/公告/授权时间 +public_availability_unexamined_view_date,business:PublicAvailabilityDate/*Unexamined*View* → base:Date,date,未审查文献阅览公开日,日期按 ST.2,条件否**,公开/公告/授权时间 +public_availability_examined_view_date,business:PublicAvailabilityDate/*Examined*View* → base:Date,date,已审查文献阅览公开日,日期按 ST.2,条件否**,公开/公告/授权时间 +public_availability_unexamined_print_date,business:PublicAvailabilityDate/*Unexamined*Print* → base:Date,date,未审查文献印刷公开日,日期按 ST.2,条件否**,公开/公告/授权时间 +public_availability_examined_print_date,business:PublicAvailabilityDate/*Examined*Print* → base:Date,date,已审查文献印刷公开日,日期按 ST.2,条件否**,公开/公告/授权时间 +grant_publication_date,PublicAvailabilityDate 授权日节点;或 kind∈B/B1/B2 时 publication_date,date,授权文献公开日,日期按 ST.2,条件否**,公开/公告/授权时间 
+claims_only_public_date,business:PublicAvailabilityDate/*ClaimsOnly* → base:Date,date,仅权利要求公开日,日期按 ST.2,是,公开/公告/授权时间 +granted_view_date,business:PublicAvailabilityDate/*Granted*View* → base:Date,date,已授权文献阅览公开日,日期按 ST.2,条件否**,公开/公告/授权时间 +corrected_document_issue_date,business:PublicAvailabilityDate/*Corrected* → base:Date,date,更正文献发行日,日期按 ST.2,否*,公开/公告/授权时间 +spc_basic_patent_reference,business:SupplementaryProtectionCertificate|business:SPC/business:BasicPatent|UnderlyingPatent,object,SPC 基础专利引用,含基础专利号,可带公开号,是,SPC(补充保护证书) +spc_first_national_marketing_auth,SPC/business:FirstNationalMarketingAuthorization|NationalMarketingAuth,object,SPC 首次国家上市许可,含 number+date,是,SPC(补充保护证书) +spc_first_regional_marketing_auth,SPC/business:FirstRegionalMarketingAuthorization|RegionalMarketingAuth,object,SPC 首次区域上市许可(仅欧盟等区域体系需用),含 number+date,可选 country_of_origin(ST.3),是,SPC(补充保护证书) +spc_expiry_or_duration,SPC/business:Expiry|Duration|TermOfProtection,object,SPC 到期日或期限,至少含 expiry_date 或 duration 之一,是,SPC(补充保护证书) +spc_protected_product_name,SPC/business:Product|ProductName,string,SPC 保护产品名称,非空时建议与基础专利一致性校验,是,SPC(补充保护证书) +spc_application_date,SPC/business:ApplicationReference/base:DocumentID/base:Date,string,SPC申请日(验证是否符合6个月时限),,,SPC(补充保护证书) +spc_country_code,SPC/base:WIPOST3Code 或 @country,string,SPC管辖国家,,,SPC(补充保护证书) +spc_underlying_patent_number,SPC BasicPatent/DocumentID/base:DocNumber,string,关联的基础专利号(SPC法律效力的唯一依据),,,SPC(补充保护证书) +application_numbers,business:ApplicationReference[@dataFormat='original']/base:DocumentID → WIPOST3Code+DocNumber,list[string],申请号,至少 1 个;保留原格式,否*,申请 +filing_dates,business:ApplicationReference/base:DocumentID/base:Date,list[string],申请日,日期按 ST.2;与 21 可一一对应,否*,申请 +original_filing_language,ApplicationReference/@lang;回退根 @lang,string,原始申请语言,ISO 639 两位语言码,是,申请 +effective_rights_date,ApplicationReference/base:DocumentID/base:Date(首个非空),date,权利生效日,日期按 ST.2,是,申请 
+previous_application_reference,business:PreviousApplicationReference|ParentApplicationReference|ProvisionalApplicationReference;ApplicationReference[@applType=provisional|parent|previous],object,PLT 5(7) 先前申请引用,需含 office_code+application_number,可选 filing_date,是,申请 +pct_designated_states,business:PctOrRegionalFilingData//DesignatedState|base:WIPOST3Code|CountryCode,list[string],PCT 指定国,ST.3 两位代码,是,PCT +regional_designated_states,business:RegionalFilingData//DesignatedState|base:WIPOST3Code,list[string],区域专利指定缔约国,ST.3 两位代码,是,PCT +pct_national_phase_date,business:PctNationalPhaseEntry|NationalPhaseEntry/base:Date,date,PCT 进入国家/地区阶段日期,日期按 ST.2,是,PCT +pct_filing_data,business:PctOrRegionalFilingData/base:DocumentID/base:Date,list[object],PCT 国际申请提交数据,每项含 filing_date+application_number,可选 language,是,PCT +pct_publication_data,business:PctOrRegionalFilingData/business:PctPublication/base:DocumentID,list[object],PCT 国际公开数据,每项含 publication_date+publication_number,可选 language,是,PCT +pct_effect_ceased_date,business:PctRefiledRevised|RefiledRevisedApplication/base:Date,date,PCT 国际申请失效/未进阶段确认日,日期按 ST.2,是,PCT +search_report_deferred_publication_date,business:SearchReportDifferentPublication/base:Date,date,检索报告延迟公开日,日期按 ST.2,是,PCT +regional_filing_data,business:RegionalFilingData/base:DocumentID/base:Date,list[object],区域申请提交数据,每项含 filing_date+application_number,可选 language,是,PCT +regional_publication_data,business:RegionalFilingData/business:RegionalPublication/base:DocumentID,list[object],区域申请/授权公开数据,每项含 publication_date+publication_number,可选 language,是,PCT +microorganism_deposit_info,business:BiologicalDeposit|MicroorganismDeposit|MicroorganismDepositDetails|DepositInstitution(容器),object,微生物保藏信息,仅作容器,是,微生物保藏 +microorganism_deposit_no,保藏节点 base:DepositNumber|business:DepositNumber|base:AccessionNumber,string,微生物保藏编号,,,微生物保藏 +microorganism_deposit_address,保藏节点 base:AddressBook/base:Address,string,微生物保藏地址,,,微生物保藏 +microorganism_deposit_date,保藏节点 
base:Date|business:DepositDate,string,微生物保藏时间,,,微生物保藏 +microorganism_deposit_unit,保藏节点 base:Name|business:InstitutionName|AddressBook,string,微生物保藏单位名称,,,微生物保藏 +microorganism_deposit_unit_code,保藏节点 base:WIPOST3Code|business:InstitutionCode,string,微生物保藏单位编号,,,微生物保藏 +microorganism_deposit_survival_status,保藏节点 business:RevivalStatus|@revivalStatus,string,存活情况,,,微生物保藏 +,,,,,,微生物保藏 +addition_relation,business:Addition|AdditionApplication|AdditionRelation → DocumentID|ApplicationReference,object,追加/附加专利关联,含前案号,宜带申请日,是,分案与分组 +division_relation,business:Division|DivisionApplication|DivisionRelation,object,分案来源关联,含母案号,宜带申请日,是,分案与分组 +continuation_relation,business:Continuation|ContinuationApplication|ContinuationInPart|ContinuationRelation;或 Description 标题 RELATED APPLICATION,object,continued/continuation 关联,含前案号+申请日,是,分案与分组 +reissue_publication_number,business:ReissuePublication|ReissuePublicationReference,string,再版/再授权前公开号,非空时应为合法文献号,是,分案与分组 +same_application_previous_publication_number,business:SameApplicationPreviousPublication|SameApplicationPreviousPublicationNumber|PreviousPublicationSameApplication,string,同一申请先前公开号,非空时应为合法文献号,是,分案与分组 +substitute_application_relation,business:Substitute|SubstituteApplication|SubstituteApplicationRelation,object,替代申请关联,含前案号+申请日,是,分案与分组 +utility_model_base_relation,business:UtilityModel|UtilityModelBasis|UtilityModelBase,object,实用新型基础专利/申请关联,含案号,宜带申请日,是,分案与分组 +attachments,//business:Attachment,list[object],附件列表,每项建议含 attachment_type+attachment_name+file_name+order;可带 copies/pages,是,附件与变更 +bibliographic_changes,business:BibliographicChange/base:Date + 节点全文,list[object],著录项目变更,每项建议含 change_seq+change_item+before_value+after_value,是,附件与变更 +,,,,,,附件与变更 +sha256 ,,,,,,库信息 +"origin_url +",,,,,,库信息 +"origin_path +",,,,,,库信息 +file_format,,,,,,库信息 +"file_type +",,,,,,库信息 +"obtain_timestamp +",,,,,,库信息 +content_type,,,,,,库信息 +content_length,,,,,,库信息 +process_status,,,,,,库信息 +processed_path ,,,,,,库信息 +page_cnt,,,,,,库信息 +is_broken,,,,,,库信息 
+dt,,,,,,库信息 +patent_source,,,,,,库信息 diff --git a/dingo/model/rule/scibase/assets/union_unique_data_mapping.csv b/dingo/model/rule/scibase/assets/union_unique_data_mapping.csv new file mode 100644 index 00000000..3f3785c3 --- /dev/null +++ b/dingo/model/rule/scibase/assets/union_unique_data_mapping.csv @@ -0,0 +1,109 @@ +统一字段名,字段值数据类型,源字段映射(论文),源字段映射(图书),源字段映射(星河),字段有效性规则,备注,字段来源 +unique_id,String,'paper:{doi}','ebook:{isbn13}',,,全量表唯一标识,用于跨 Metadata 与 Fulltext 数据关联、去重与检索。,全量表生成字段 +metadata_type,String,,,,"可选值:""paper"",""ebook""",【Metadata表】元数据来源类型。论文来源取值 paper,图书来源取值 ebook。数据类型:String。,Metadata表 +doi,String,doi,,doi,"格式参考10.1016/s0021-9258(19)52451-6,需均保持小写,无前缀“https://doi.org/”部分 +规范:星河图书馆qa测试代码","【Metadata表】数字对象唯一标识符(DOI),主要用于论文等学术资源定位。数据类型:String。来源映射:OSI论文字段=doi。 +【Fulltext表】全文解析得到的 DOI。数据类型:string。","Metadata表, Fulltext表" +isbns,List[string],,isbns,,"符合 ISBN 校验规则。13 位与 10 位两种格式。格式参考[ + ""9781426208072"", + ""1426208073"" + ] +规范:星河图书馆qa测试代码",【Metadata表】图书 ISBN 列表,可能包含多个 ISBN。数据类型:List[string]。来源映射:OSI图书字段=isbns。,Metadata表 +isbn13,String,,isbn13,,"格式参考""9781426208072"" +规范:星河图书馆qa测试代码",【Metadata表】13 位 ISBN,图书资源的标准编号。数据类型:String。来源映射:OSI图书字段=isbn13。,Metadata表 +title,String,title,title,title,"不包含不可见字符 +规范:星河图书馆qa测试代码","【Metadata表】资源标题/题名。数据类型:String。来源映射:OSI图书字段=title;OSI论文字段=title。 +【Fulltext表】全文解析得到的标题。数据类型:string。","Metadata表, Fulltext表" +abstract,String,abstract,abstract,abstract,"不包含不可见字符 +规范:星河图书馆qa测试代码","【Metadata表】摘要、简介或内容概述。数据类型:String。来源映射:OSI图书字段=abstract;OSI论文字段=abstract。 +【Fulltext表】全文解析得到的摘要。数据类型:string。","Metadata表, Fulltext表" +language,String,language,language,language,"有效性规则见元数据目录格式标准【WIP】 +ISO 639-1语言代码标识,其无法覆盖范围则应用iso ISO 639-3代码。规范:星河图书馆qa测试代码","【Metadata表】资源语言。数据类型:String。来源映射:OSI图书字段=language;OSI论文字段=language。 +【Fulltext表】全文解析得到的语言。数据类型:string。","Metadata表, Fulltext表" +type,String,type,type,,,【Metadata表】资源类型或文献类型。数据类型:String。来源映射:OSI图书字段=type;OSI论文字段=type。,Metadata表 +author,List[object],author,author,author,"2026.06.01 
author字段格式修改为List[object]。每个对象都有两个key-value对,key分别为:""name""、""orcid"",其中name存储作者姓名,orcid存储作者的开放学者身份标识。举例:[ + { + ""name"": ""Alan Aspuru-Guzik"", + ""orcid"": ""https://orcid.org/0000-0002-8277-4434"" + }]。 +每个string指向一个作者,不能多个人名放在同个string下 +每个string不包含分割性质字符,规范:星河图书馆qa测试代码","【Metadata表】作者列表。数据类型:List[object]。来源映射:OSI图书字段=author;OSI论文字段=author。 +【Fulltext表】全文解析得到的作者列表。数据类型:List[object]。","Metadata表, Fulltext表" +contributors,List[string],,contributors,,"每个string指向一个作者,不能多个人名放在同个string下 +每个string不包含分割性质字符,规范:星河图书馆qa测试代码",【Metadata表】贡献者列表,如编者、译者等。数据类型:List[string]。来源映射:OSI图书字段=contributors。,Metadata表 +indexed_in,List[string],indexed_in,indexed_in,,,【Metadata表】收录/索引来源列表。数据类型:List[string]。来源映射:OSI图书字段=indexed_in;OSI论文字段=indexed_in。非对外字段。,Metadata表 +identifiers,Object,identifiers,identifiers,,,【Metadata表】其他外部标识符集合。数据类型:Object。来源映射:OSI图书字段=identifiers;OSI论文字段=identifiers。非对外字段。,Metadata表 +locations,List[object],locations,,,"每个object都有4个属性: +type:可选值:""download"",""reader"",""display"","""" +url:正则规范 r'^[Hh][Tt][Tt][Pp][Ss]?://[^/$.?#][\s\S]*$' +license:可选值: cc-by、cc-by-nc、cc-by-sa、cc-by-nd、cc-by-nc-sa、cc-by-nc-nd、other-oa、cc0、""""、public-domain、publisher-specific-oa、publisher-specific、wiley-specific、elsevier-specific、oup-specific、acs-specific、rsc-specific、iop-specific、other-oa、unspecified-oa、implied-oa、nonexclusive-distrib 。以下协议类型非论文许可而为数据许可(gpl-v1、gpl-v2、gpl-v3、mit)政府许可(ogl-c)未知许可类型(pd),但允许出现在可选值中。 +is_oa:可选值:""true"" ""false"" ""unknown""",【Metadata表】资源可访问位置、来源链接或馆藏/开放获取位置列表。数据类型:List[object]。来源映射:OSI论文字段=locations。,Metadata表 +access_is_oa,String,access_is_oa,,,可选值:"true" "false" "",【Metadata表】是否开放获取。数据类型:String。来源映射:OSI论文字段=is_oa。,Metadata表 +access_oa_status,String,access_oa_status,,,可选值:diamond、gold、green、hybrid、bronze、closed、"",【Metadata表】开放获取状态。数据类型:String。来源映射:OSI论文字段=oa_status。,Metadata表 +access_oa_url,List[string],access_oa_url,access_oa_url,,正则规范 r'^[Hh][Tt][Tt][Pp][Ss]?://[^/$.?#][\s\S]*$',【Metadata表】开放获取 
URL。数据类型:List[string]。来源映射:OSI图书字段=oa_url;OSI论文字段=oa_url。,Metadata表 +access_license,String,access_license,,,"可选值: cc-by、cc-by-nc、cc-by-sa、cc-by-nd、cc-by-nc-sa、cc-by-nc-nd、other-oa、cc0、""""、public-domain、publisher-specific-oa、publisher-specific、wiley-specific、elsevier-specific、oup-specific、acs-specific、rsc-specific、iop-specific、other-oa、unspecified-oa、implied-oa、nonexclusive-distrib 。以下协议类型非论文许可而为数据许可(gpl-v1、gpl-v2、gpl-v3、mit)政府许可(ogl-c)未知许可类型(pd),但允许出现在可选值中。",【Metadata表】开放获取或使用许可协议。数据类型:String。来源映射:OSI论文字段=license。,Metadata表 +publication_published_date,String,published_date,,,格式为"1951-11-01"即"yyyy-mm-dd",【Metadata表】出版/发表日期。数据类型:String。来源映射:OSI论文字段=published_date。,Metadata表 +publication_published_year,Integer,published_year,publication_published_year,,格式为1951,大于0,小于2100,【Metadata表】出版/发表年份。数据类型:Integer。来源映射:OSI图书字段=published_year;OSI论文字段=published_year。,Metadata表 +publication_published_place,List[string],,publication_published_place,,,【Metadata表】出版地。数据类型:List[string]。来源映射:OSI图书字段=published_place。,Metadata表 +publication_published_country,List[string],,publication_published_country,,,【Metadata表】出版国家/地区。数据类型:List[string]。来源映射:OSI图书字段=published_country。,Metadata表 +publication_venue_name,String,venue_name,,,,【Metadata表】发表载体名称,如期刊、会议或图书系列名称。数据类型:String。来源映射:OSI论文字段=venue.name。,Metadata表 +publication_venue_name_unified,String,,,,期刊字典标准化如果publication_venue_name在target_journal_name列,那么本字段就是target_journal_name列,不在则取publication_venue_name,归一化的期刊统一命名,全量表生成字段 +publication_venue_type,String,venue_type,,,,【Metadata表】发表载体类型。数据类型:String。来源映射:OSI论文字段=venue.type。,Metadata表 +publication_venue_issn,List[string],venue_issn,,,"其中string元素:由8位数字组成。8位数字分为前后两段各4位,中间用连接号相连,""xxxx-xxxx"",(前7位数字为单纯的数字序号,无任何特殊含义,第八个数字是根据前七个数字按模数 11算法计算得出的检验码。若计算结果为10,第八个数字可为“X”) +规范:每个string元素 星河图书馆qa测试代码",【Metadata表】发表载体 ISSN 列表。数据类型:List[string]。来源映射:OSI论文字段=venue.issn。,Metadata表 
+publication_publisher,List[string],venue_publisher,publication_publisher,,,【Metadata表】出版方/出版社。数据类型:List[string]。来源映射:OSI图书字段=publisher;OSI论文字段=venue.publisher。,Metadata表 +publication_venue_biblio_volume,String,biblio_volume,,,格式示例为"193",可以转化为数字,【Metadata表】期刊/会议卷号。数据类型:String。来源映射:OSI论文字段=venue.biblio.volume。,Metadata表 +publication_venue_biblio_issue,String,biblio_issue,,,格式示例为"1",可以转化为数字,【Metadata表】期刊/会议期号。数据类型:String。来源映射:OSI论文字段=venue.biblio.issue。,Metadata表 +publication_venue_biblio_pages,String,biblio_pages,,,格式示例为"265-275",【Metadata表】期刊/会议页码范围。数据类型:String。来源映射:OSI论文字段=venue.biblio.pages。,Metadata表 +publication_pages,Integer,,publication_pages,,大于0,【Metadata表】图书页数或资源总页数。数据类型:Integer。来源映射:OSI图书字段=pages。,Metadata表 +keywords,List[string],keywords,,,,【Metadata表】关键词列表。数据类型:List[string]。来源映射:OSI论文字段=keywords。,Metadata表 +fieldsOfStudy,List[object],fieldsOfStudy,,,,【Metadata表】学科领域信息。数据类型:List[object]。来源映射:OSI论文字段=fieldsOfStudy。,Metadata表 +s2fieldsofstudy,List[object],s2fieldsofstudy,,,,【Metadata表】Semantic Scholar 学科领域信息,内部字段。数据类型:List[object]。来源映射:OSI论文字段=s2FieldsOfStudy。非对外字段。,Metadata表 +primary_topic,Object,primary_topic,,,,【Metadata表】主要主题信息。数据类型:Object。来源映射:OSI论文字段=primary_topic。,Metadata表 +topics,List[object],topics,,,,【Metadata表】主题列表。数据类型:List[object]。来源映射:OSI论文字段=topics。,Metadata表 +concepts,List[object],concepts,,,,【Metadata表】概念标签列表。数据类型:List[object]。来源映射:OSI论文字段=concepts。,Metadata表 +grade_class,String,/,,grade_class,"有效性规则见元数据目录格式标准【WIP】 +规范:星河图书馆qa测试代码","【Metadata表】等级分类。数据类型:String。来源映射:OSI图书字段=grade_class;OSI论文字段=grade_class。 +【Fulltext表】全文解析得到的等级分类。数据类型:string。","Metadata表, Fulltext表" +grade,String,/,,grade,"有效性规则见元数据目录格式标准【WIP】 +规范:星河图书馆qa测试代码","【Metadata表】等级。数据类型:String。来源映射:OSI图书字段=grade;OSI论文字段=grade。 +【Fulltext表】全文解析得到的等级。数据类型:string。","Metadata表, Fulltext表" +subjects,List[string],,subjects,,,【Metadata表】图书主题词列表。数据类型:List[string]。来源映射:OSI图书字段=subjects。,Metadata表 
+genre,List[string],,genre,,,【Metadata表】图书体裁/类型列表。数据类型:List[string]。来源映射:OSI图书字段=genre。,Metadata表 +reference_count,Integer,reference_count,,,,【Metadata表】引用文献数,Metadata表 +citation_count,Integer,citation_count,,,,【Metadata表】被引次数,指一篇文章被其他文章引用的次数,是衡量文章影响力的重要指标。,Metadata表 +influential_citation_count,Integer,influential_citation_count,,,,【Metadata表】高影响力被引次数,Metadata表 +fwci,Float,fwci,,,,【Metadata表】一篇文献的“领域加权引用影响”Field-Weighted Citation Impact,Metadata表 +citations,List[object],/,,,,【Metadata表】引用本篇论文的文献列表,Metadata表 +references,List[string],references,,,"string元素中存储url +规范:url星河图书馆qa测试代码",【Metadata表】本论文引用的文献列表,Metadata表 +related_works,List[string],related_works,,,"string元素中存储url +规范:url星河图书馆qa测试代码",【Metadata表】本论文相关工作列表,Metadata表 +citation_normalized_percentile,Object,citation_normalized_percentile,,,,【Metadata表】一篇文献的“被引百分位”,表示该文献的引用次数在其文献类型、出版年份和学科子领域构成的可比集合中所处的百分位位置。例如 0.999 62 意味着该文献的引用次数高于 99.962% 的同类型、同年、同领域文献,因此属于全球前 0.038%。,Metadata表 +cited_by_percentile_year,Object,cited_by_percentile_year,,,,"【Metadata表】按“出版年份”细分的被引百分位区间。min: 99, max: 100 表示该文献在同年份的所有可比文献中,处于 99–100 百分位段,即“最顶尖 1%”。",Metadata表 +cited_by_api_url,String,cited_by_api_url,,,"string元素中存储url +规范:url星河图书馆qa测试代码",【Metadata表】获取“被哪些文献引用”的 API 端点(可直接调用 OpenAlex API,获取所有引用该文献的作品(works)列表。),Metadata表 +classifications,Object,classifications,/,/,,【Metadata表】, +supplementary_material,List[object],/,/,supplementary_material,每个对象都有三个key-value对,key分别为:supplementary_material_name、supplementary_material_url、supplementary_material_path,分别存储补充材料文件名、补充材料链接、补充材料的对象存储路径,【Fulltext表】全文对应的补充材料文件,Fulltext表 +access_xinghe_repository_has_fulltext,Boolean,,,/,可选值有布尔值true、布尔值false,默认值为false,【Metadata表】有全文数据。数据类型:Boolean。来源映射:OSI图书字段=xinghe_repository_has_fulltext;OSI论文字段=xinghe_repository_has_fulltext。,Metadata表 +access_xinghe_repository_sha256,String,,,sha256,默认值空列表;当xinghe_repository_has_fulltext 为true时,此字段不可以为空。,"【Metadata表】全文文件 SHA-256 
哈希值,用于文件唯一识别、去重与关联。数据类型:List[string]。来源映射:OSI图书字段=access_xinghe_repository_sha256;OSI论文字段=access_xinghe_repository_sha256。 +【Fulltext表】数据类型:string。","Metadata表, Fulltext表" +access_xinghe_repository_origin_path,String,,,origin_path,默认值"";当xinghe_repository_has_fulltext 为true时,此字段不可以为空。,【Fulltext表】全文原始文件存储路径,内部字段。数据类型:string。非对外字段。,Fulltext表 +access_xinghe_repository_page_cnt,Integer,,,page_cnt,,【Fulltext表】全文页数。数据类型:int。,Fulltext表 +access_xinghe_repository_process_status,Integer,,,process_status,1-已处理;0-未处理,默认值为0,【Fulltext表】全文处理状态,内部字段。数据类型:int。非对外字段。,Fulltext表 +access_xinghe_repository_processed_path,String,,,processed_path,当access_xinghe_repository_process_status为1,此字段不允许为空,【Fulltext表】全文处理后文件路径,内部字段。数据类型:string。非对外字段。,Fulltext表 +access_xinghe_repository_origin_url,String,,,origin_url,,【Fulltext表】全文原始来源 URL。数据类型:string。,Fulltext表 +access_xinghe_repository_file_format,String,,,file_format,示例(仅示例非可选值范围):pdf、mp4、ogg、docx;使用python-magic 进行文档类别识别,可选范围即magic识别库中包含的文档类型+unknown,【Fulltext表】全文文件格式,如 pdf、html 等。数据类型:string。,Fulltext表 +access_xinghe_repository_file_type,String,,,file_type,"范围: +paper +ebook +textbook +patent +report +other",【Fulltext表】全文文件类型。数据类型:string。,Fulltext表 +access_xinghe_repository_content_type,String,,,content_type,取值:https://mimetype.io/all-types,【Fulltext表】文件 MIME Content-Type。数据类型:string。,Fulltext表 +access_xinghe_repository_content_length,long,,,content_length,,【Fulltext表】文件内容长度/大小。数据类型:long。,Fulltext表 +access_xinghe_repository_is_broken,Integer,,,is_broken,具体取值范围与定义见文件完整性校验,【Fulltext表】全文文件是否损坏或不可正常解析。数据类型:int。枚举值:0、1、2、3,详见文件完整性校验,Fulltext表 +access_xinghe_repository_obtain_timestamp,timestamp (seconds),,,obtain_timestamp,,【Fulltext表】全文获取时间戳(秒),内部字段。数据类型:timestamp (seconds)。非对外字段。Unix epoch seconds,可以是整数(精确到秒),也可以是浮点数(小数部分可精确至微秒)。,Fulltext表 +access_xinghe_repository_model_name,String,/,/,model_name,目前可选值为【数据字典】数据域-模型名称-学科科目,【Fulltext表】全文解析使用的模型名称,内部字段。数据类型:string。非对外字段。,Fulltext表 
+access_xinghe_repository_model_version,String,/,/,model_version,目前可选值为【数据字典】数据域-模型名称-学科科目,【Fulltext表】全文解析使用的模型版本,内部字段。数据类型:string。非对外字段。,Fulltext表 diff --git a/dingo/model/rule/scibase/meta_ebook_unique.py b/dingo/model/rule/scibase/meta_ebook_unique.py new file mode 100644 index 00000000..89216d8a --- /dev/null +++ b/dingo/model/rule/scibase/meta_ebook_unique.py @@ -0,0 +1,1520 @@ +#!/usr/bin/env python3 +"""Self-contained meta_ebook unique DB validator. + +Field aggregation rules are driven by ../doc/ebook_unique_mapping.csv. +""" +from __future__ import annotations + +import csv +import re +import argparse +import json +import sys +import time +from collections import Counter +from dataclasses import dataclass +from datetime import date, datetime +from decimal import Decimal +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple + +SKIP_COMPARE_STRATEGIES = frozenset({"random_pick_cls"}) +ORDER_INSENSITIVE_COMPARE_STRATEGIES = frozenset({"dedup_array", "isbn_normalize"}) +StrategyHandler = Callable[[List[Dict[str, Any]], "FieldRule", Dict[str, Any]], Any] + + +@dataclass +class FieldRule: + field_name: str + data_type: str + strategy: str + params: Dict[str, Any] + source_field: str + description: str + + @property + def effective_source(self) -> str: + return self.source_field or self.field_name + + +def _parse_params(raw: str) -> Dict[str, Any]: + if not raw: + return {} + params: Dict[str, Any] = {} + for pair in raw.split(";"): + pair = pair.strip() + if "=" not in pair: + continue + key, val = pair.split("=", 1) + key, val = key.strip(), val.strip() + if val.lower() == "true": + params[key] = True + elif val.lower() == "false": + params[key] = False + elif val.lstrip("-").isdigit(): + params[key] = int(val) + else: + params[key] = val + return params + + +def load_field_rules( + path: Path, + *, + field_column: str = "字段名", + type_column: str = "数据类型", + strategy_column: str = "聚合策略", + params_column: 
def aggregate_by_rules(
    records: List[Dict[str, Any]],
    rules: Sequence[FieldRule],
    handlers: Dict[str, StrategyHandler],
) -> Dict[str, Any]:
    """Build one aggregated record by applying each rule's strategy handler.

    Rules are processed in order, and every handler receives the partially
    built result, so a later rule can read fields produced by earlier ones.

    Args:
        records: Source rows belonging to one group.
        rules: Field rules; each provides ``strategy`` and ``field_name``.
        handlers: Mapping from strategy name to handler callable.

    Returns:
        Dict mapping each rule's ``field_name`` to its handler's return value.

    Raises:
        ValueError: If a rule names a strategy with no registered handler.
    """
    aggregated: Dict[str, Any] = {}
    for rule in rules:
        strategy_fn = handlers.get(rule.strategy)
        if strategy_fn is not None:
            aggregated[rule.field_name] = strategy_fn(records, rule, aggregated)
            continue
        message = (
            f"Unknown aggregation strategy {rule.strategy!r} "
            f"for field {rule.field_name!r}"
        )
        raise ValueError(message)
    return aggregated
def safe_filename_token(value: Optional[Any]) -> str:
    """Turn an arbitrary value into a token that is safe inside a file name.

    ``None`` and the empty string map to ``"all"``. Otherwise the value is
    stringified, every run of characters outside ``[0-9A-Za-z_-]`` becomes a
    single underscore, and leading/trailing underscores are stripped. If
    nothing is left, ``"all"`` is returned.
    """
    if value in (None, ""):
        return "all"
    sanitized = re.sub(r"[^0-9A-Za-z_-]+", "_", str(value))
    trimmed = sanitized.strip("_")
    return trimmed if trimmed else "all"
def compact_record_for_report(record: Dict[str, Any]) -> Dict[str, Any]:
    """Return a trimmed copy of *record* holding only the report fields.

    Only a fixed set of identifying columns is considered, in a fixed order.
    A column is dropped when its value is ``None``, an empty string, an
    empty list, or an empty dict.
    """
    report_fields = (
        "track_id",
        "isbn13",
        "isbns",
        "origin_osi",
        "origin_id",
        "title",
        "type",
        "author",
        "contributors",
        "published_year",
        "published_date",
        "publisher",
        "dt",
    )
    compact: Dict[str, Any] = {}
    for name in report_fields:
        current = record.get(name)
        if current in (None, "", [], {}):
            continue
        compact[name] = current
    return compact
def write_report_summary(report_path: Path, result: Dict[str, Any], mismatch_rows: Sequence[Dict[str, Any]]) -> None:
    """Write the JSON and Markdown summaries that accompany a detail report.

    The summary is built with ``build_report_summary``. The JSON file gets the
    summary with its keys passed through ``localize_report_keys``; the
    Markdown file gets a human-readable digest (header figures, status and
    field distributions, missing-record samples, per-field samples).

    Args:
        report_path: Path of the detail report; the summary file locations
            are derived from it via ``summary_paths``.
        result: Validation result dict (read with ``.get`` only).
        mismatch_rows: Problem rows the summary is computed from.
    """

    def _sample_header(sample: Dict[str, Any]) -> str:
        # Shared bullet line for both missing samples and field samples.
        return (
            f"- ISBN13 `{sample.get('key')}`, expected_key=`{sample.get('expected_key')}`, "
            f"source_count={sample.get('source_count')}, status=`{sample.get('status')}`"
        )

    summary_json_path, summary_md_path = summary_paths(report_path)
    summary = build_report_summary(report_path, result, mismatch_rows)

    # Make sure the output directories exist so a first run does not fail
    # with FileNotFoundError before anything has been written.
    summary_json_path.parent.mkdir(parents=True, exist_ok=True)
    summary_md_path.parent.mkdir(parents=True, exist_ok=True)

    with summary_json_path.open("w", encoding="utf-8") as f:
        json.dump(localize_report_keys(summary), f, ensure_ascii=False, indent=2, cls=JsonEncoder)

    dt_check = result.get('dt_check') or {}
    lines = [
        "# Ebook 去重校验报告摘要",
        "",
        f"- 分区: `{result.get('dt')}`",
        f"- 抽样: `{result.get('sample_mode')}`, 数量 `{result.get('sample_size')}`",
        f"- 结果: 已校验 `{result.get('checked')}`,通过 `{result.get('passed')}`,失败 `{result.get('failed')}`",
        f"- 缺失: 源表 `{result.get('missing_source')}`,目标表 `{result.get('missing_target')}`",
        f"- 明细报告: `{report_path}`",
        f"- 报告目录: `{report_path.parent}`",
        f"- 源表记录数分桶: `{_json_inline(result.get('source_count_buckets'))}`",
        "",
        "## Count 校验",
        "",
        f"- source_distinct_skipped: `{dt_check.get('source_distinct_skipped')}`",
        f"- count_mismatches: `{len(dt_check.get('count_mismatches') or [])}`",
        "",
        "## 状态分布",
        "",
    ]
    for status, count in summary["status_counts"].items():
        lines.append(f"- `{status}`: {count}")
    if not summary["status_counts"]:
        lines.append("- 无")
    lines.extend(["", "## 字段问题分布", ""])
    for field, count in summary["field_counts"].items():
        lines.append(f"- `{field}`: {count}")
    if not summary["field_counts"]:
        lines.append("- 无")
    if summary.get("missing_samples"):
        lines.extend(["", "## 缺失样例", ""])
        for sample in summary["missing_samples"]:
            lines.append(_sample_header(sample))
            for name in ("source_records", "target_records", "expected_record"):
                if sample.get(name) is not None:
                    lines.append(f" - {name}: `{_json_inline(sample.get(name))}`")
    lines.extend(["", "## 字段问题样例", ""])
    for field, samples in summary["field_samples"].items():
        lines.append(f"### {field} ({summary['field_counts'].get(field)})")
        lines.append("")
        for sample in samples:
            lines.append(_sample_header(sample))
            lines.append(f" - expected: `{_json_inline(sample.get('expected'))}`")
            lines.append(f" - actual: `{_json_inline(sample.get('actual'))}`")
        lines.append("")
    with summary_md_path.open("w", encoding="utf-8") as f:
        # Drop trailing blank lines, then end the file with a single newline.
        f.write("\n".join(lines).rstrip() + "\n")
def dedup_str_array(values: Iterable[Any], lower: bool = False) -> List[str]:
    """Flatten, stringify, de-duplicate and sort a mix of scalars and lists.

    Each element of *values* is either a list, whose members are taken
    individually (one level only), or a scalar. ``None`` and values whose
    string form is empty are skipped.

    Args:
        values: Scalars and/or lists of scalars.
        lower: When true, lower-case every string before de-duplication.

    Returns:
        The distinct strings in ascending order.
    """
    unique = set()
    for entry in values:
        members = entry if isinstance(entry, list) else [entry]
        for member in members:
            if member is None:
                continue
            text = str(member)
            if text:
                unique.add(text.lower() if lower else text)
    return sorted(unique)
def extract_year(value: Any) -> Optional[int]:
    """Extract a plausible publication year from *value*.

    ``int`` inputs are used directly. Anything else is stringified and
    searched for the first four-digit run matching ``1ddd`` or ``20dd``.
    The year is accepted only when it lies between 1000 and
    ``CURRENT_YEAR`` inclusive.

    Returns:
        The year, or ``None`` when the input is ``None``/empty, contains no
        matching digits, or falls outside the accepted range.
    """
    if value is None or value == "":
        return None
    if not isinstance(value, int):
        found = re.search(r"(1\d{3}|20\d{2})", str(value).strip())
        if found is None:
            return None
        candidate = int(found.group(1))
    else:
        candidate = value
    return candidate if 1000 <= candidate <= CURRENT_YEAR else None
result: Dict[str, Any], +) -> List[str]: + return dedup_str_array( + [r.get(rule.effective_source, []) for r in records], + lower=rule.params.get("lower", False), + ) + + +def _handle_merge_map( + records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any], +) -> Dict[str, str]: + return merge_identifiers([r.get(rule.effective_source, {}) for r in records]) + + +def _handle_max_int( + records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any], +) -> Optional[int]: + vals = [v for r in records for v in [parse_int(r.get(rule.effective_source))] if v is not None] + return max(vals) if vals else None + + +def _handle_latest_dt( + records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any], +) -> str: + src = rule.effective_source + vals = [str(r.get(src, "")) for r in records if is_non_empty(r.get(src))] + return max(vals) if vals else "" + + +def _handle_isbn_normalize( + records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any], +) -> List[str]: + raw = dedup_str_array([r.get(rule.effective_source, []) for r in records]) + normalized = [v for v in (normalize_isbn_to_13(s) for s in raw) if v is not None] + return sorted(set(normalized)) + + +def _handle_isbn_min( + records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any], +) -> str: + isbns = result.get("isbns", []) + if isbns: + return isbns[0] + return str(records[0].get("isbn13", "")) if records else "" + + +STRATEGY_HANDLERS: Dict[str, StrategyHandler] = { + "freq_lex_max": _handle_freq_lex_max, + "freq_int_max": _handle_freq_int_max, + "dedup_array": _handle_dedup_array, + "merge_map": _handle_merge_map, + "max_int": _handle_max_int, + "latest_dt": _handle_latest_dt, + "isbn_normalize": _handle_isbn_normalize, + "isbn_min": _handle_isbn_min, +} + + +# ---- aggregation ---- + + +def aggregate_group(records: List[Dict[str, Any]], rules: Sequence[FieldRule]) -> Dict[str, Any]: + return aggregate_by_rules(records, rules, STRATEGY_HANDLERS) + + +# ---- DB 
def load_config(path: Path) -> Dict[str, Any]:
    """Load the JSON configuration file at *path*.

    Returns:
        The parsed JSON document.

    Raises:
        FileNotFoundError: If *path* does not exist; the message tells the
            user to copy the template at ``TEMPLATE_CONFIG_PATH``.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    message = (
        f"Config file not found: {path}\n"
        f"Copy the template and fill in credentials:\n"
        f" cp {TEMPLATE_CONFIG_PATH} {path}"
    )
    raise FileNotFoundError(message)
def qualify_table_name(
    table: str,
    catalog: Optional[str],
    database: str = "dws",
) -> str:
    """Expand *table* to a ``catalog.database.table`` style name.

    *table* is split on ``.`` and empty segments are ignored:

    * one segment: prefixed with *database*, and with *catalog* if given;
    * two segments: prefixed with *catalog* if given, else returned as-is;
    * anything else (three or more segments, or none): returned unchanged.
    """
    segments = [piece.strip() for piece in table.split(".") if piece.strip()]
    count = len(segments)
    if count == 1:
        prefix = f"{catalog}.{database}" if catalog else database
        return f"{prefix}.{segments[0]}"
    if count == 2 and catalog:
        return f"{catalog}.{segments[0]}.{segments[1]}"
    return table
def source_canonical_isbn13_expr(array_field: str = "`isbns`") -> str:
    """Return the SQL expression that derives a source row's canonical ISBN-13.

    Each element of *array_field* is trimmed and stripped of hyphens. A
    13-digit value is kept, a 10-character ISBN gets ``978`` prepended, and
    anything else becomes NULL. NULL and empty results are filtered out, the
    rest de-duplicated, and the minimum taken.

    Args:
        array_field: SQL reference to the array column holding raw ISBNs.
    """
    stripped = "regexp_replace(trim(x), '-', '')"
    case_expr = " ".join(
        (
            "CASE",
            "WHEN " + stripped + " REGEXP '^[0-9]{13}$' THEN " + stripped,
            "WHEN " + stripped + " REGEXP '^[0-9]{9}[0-9Xx]$' THEN concat('978', " + stripped + ")",
            "ELSE NULL END",
        )
    )
    mapped = "array_map(x -> " + case_expr + ", " + array_field + ")"
    kept = "array_filter(" + mapped + ", x -> x IS NOT NULL AND x != '')"
    return "array_min(array_distinct(" + kept + "))"
{int(mod_max)}" + + +def _sample_order_clause(*, high_first: bool = False, key_expr: str = "sample_key") -> str: + if high_first: + return f"source_count DESC, CRC32({key_expr})" + return f"CRC32({key_expr})" + + +def build_target_key_query( + table: str, + dt: Optional[str], + limit: Optional[int], + *, + hash_mod_base: Optional[int] = None, + hash_mod_max: Optional[int] = None, +) -> Tuple[str, List[Any]]: + params: List[Any] = [] + sql = ( + f"SELECT `isbn13` AS sample_key FROM {quote_identifier(table)} " + "WHERE 1=1" + f"{_key_not_null_clause('`isbn13`')}" + f"{_dt_clause(dt, params)}" + f"{_hash_sample_predicate(hash_mod_base, hash_mod_max, key_expr='`isbn13`')}" + f" ORDER BY {_sample_order_clause(key_expr='`isbn13`')}{_limit_clause(limit)}" + ) + return sql, params + + +def build_target_first_key_query( + table: str, + dt: Optional[str], + limit: Optional[int], +) -> Tuple[str, List[Any]]: + params: List[Any] = [] + sql = ( + f"SELECT `isbn13` AS sample_key FROM {quote_identifier(table)} " + "WHERE 1=1" + f"{_key_not_null_clause('`isbn13`')}" + f"{_dt_clause(dt, params)}" + f"{_limit_clause(limit)}" + ) + return sql, params + + +def build_random_key_query( + table: str, + dt: Optional[str], + limit: Optional[int], + *, + hash_mod_base: Optional[int] = None, + hash_mod_max: Optional[int] = None, +) -> Tuple[str, List[Any]]: + return build_target_key_query( + table, + dt, + limit, + hash_mod_base=hash_mod_base, + hash_mod_max=hash_mod_max, + ) + + +def build_duplicate_key_query( + table: str, + dt: Optional[str], + limit: Optional[int], + *, + high_first: bool, + hash_mod_base: Optional[int] = None, + hash_mod_max: Optional[int] = None, +) -> Tuple[str, List[Any]]: + params: List[Any] = [] + key_expr = source_canonical_isbn13_expr() + sql = ( + "SELECT sample_key, COUNT(*) AS source_count FROM (" + f"SELECT {key_expr} AS sample_key FROM {quote_identifier(table)} WHERE 1=1" + f"{_dt_clause(dt, params)}" + ") keyed WHERE 1=1" + 
def build_field_conflict_key_query(
    table: str,
    dt: Optional[str],
    limit: Optional[int],
    *,
    hash_mod_base: Optional[int] = None,
    hash_mod_max: Optional[int] = None,
) -> Tuple[str, List[Any]]:
    """Build the query that samples keys whose source rows disagree.

    A key is selected when it has more than one source row and at least one
    of the compared columns has more than one distinct value in the group.
    Results are ordered by ``_sample_order_clause(high_first=True)``.

    Args:
        table: Source table name, quoted via ``quote_identifier``.
        dt: Optional partition value; adds a bound ``dt`` filter when set.
        limit: Optional row limit.
        hash_mod_base: Optional modulus for hash-based sub-sampling.
        hash_mod_max: Optional upper bound for the hash remainder.

    Returns:
        ``(sql, params)`` where *params* are the bound parameters.
    """
    compared_columns = (
        "title",
        "abstract",
        "language",
        "published_year",
        "pages",
        "category",
    )
    params: List[Any] = []
    key_expr = source_canonical_isbn13_expr()
    column_list = ", ".join(f"`{col}`" for col in compared_columns)
    conflict_sql = " OR ".join(f"COUNT(DISTINCT `{col}`) > 1" for col in compared_columns)

    inner_select = (
        f"SELECT {key_expr} AS sample_key, {column_list} "
        f"FROM {quote_identifier(table)} WHERE 1=1{_dt_clause(dt, params)}"
    )
    outer_filter = _key_not_null_clause('sample_key') + _hash_sample_predicate(
        hash_mod_base, hash_mod_max, key_expr='sample_key'
    )
    ordering = _sample_order_clause(high_first=True) + _limit_clause(limit)
    sql = "".join(
        [
            "SELECT sample_key, COUNT(*) AS source_count FROM (",
            inner_select,
            ") keyed WHERE 1=1",
            outer_filter,
            " GROUP BY sample_key HAVING COUNT(*) > 1 AND ",
            f"({conflict_sql}) ",
            f"ORDER BY {ordering}",
        ]
    )
    return sql, params
{quote_identifier(table)} WHERE 1=1" + f"{_dt_clause(dt, params)}" + ") keyed WHERE 1=1" + f"{_key_not_null_clause('sample_key')}" + f"{_hash_sample_predicate(hash_mod_base, hash_mod_max, key_expr='sample_key')}" + f" GROUP BY sample_key HAVING {having} " + f"ORDER BY {_sample_order_clause()}{_limit_clause(limit)}" + ) + return sql, params + + +def _append_sample_key( + keys: List[str], + seen: set, + key: str, + *, + sample_size: Optional[int], +) -> bool: + if not key or key in seen: + return False + seen.add(key) + keys.append(key) + return sample_size is not None and len(keys) >= sample_size + + +def fetch_sample_keys( + conn: Any, + *, + source_table: str, + target_table: str, + dt: Optional[str], + sample_mode: str, + sample_size: Optional[int], + hash_mod_base: Optional[int] = None, + hash_mod_max: Optional[int] = None, +) -> List[str]: + hash_kw = {"hash_mod_base": hash_mod_base, "hash_mod_max": hash_mod_max} + + if sample_mode == "target-first": + sql, params = build_target_first_key_query(target_table, dt, sample_size) + query_plan: List[Tuple[str, Tuple[str, List[Any]]]] = [("target-first", (sql, params))] + elif sample_mode == "target-random": + sql, params = build_target_key_query(target_table, dt, sample_size, **hash_kw) + query_plan: List[Tuple[str, Tuple[str, List[Any]]]] = [("target-random", (sql, params))] + elif sample_mode == "count-buckets": + per_bucket = None if sample_size is None else max(1, sample_size // 3) + query_plan = [ + ("count=1", build_count_bucket_key_query(source_table, dt, per_bucket, bucket="one", **hash_kw)), + ("count=2", build_count_bucket_key_query(source_table, dt, per_bucket, bucket="two", **hash_kw)), + ("count>2", build_count_bucket_key_query(source_table, dt, per_bucket, bucket="multi", **hash_kw)), + ] + elif sample_mode == "mixed": + per_bucket = None if sample_size is None else max(1, sample_size // 6) + query_plan = [ + ("count=1", build_count_bucket_key_query(source_table, dt, per_bucket, bucket="one", 
**hash_kw)), + ("count=2", build_count_bucket_key_query(source_table, dt, per_bucket, bucket="two", **hash_kw)), + ("count>2", build_count_bucket_key_query(source_table, dt, per_bucket, bucket="multi", **hash_kw)), + ("field-conflict", build_field_conflict_key_query(source_table, dt, per_bucket, **hash_kw)), + ("high-duplicate", build_duplicate_key_query(source_table, dt, per_bucket, high_first=True, **hash_kw)), + ("target-random", build_random_key_query(target_table, dt, per_bucket, **hash_kw)), + ] + else: + raise ValueError(f"Unsupported sample_mode: {sample_mode}") + + keys: List[str] = [] + seen: set = set() + + for idx, (label, (sql, params)) in enumerate(query_plan, start=1): + _log( + f"[info] 抽样 SQL {idx}/{len(query_plan)} [{label}] 开始执行" + f"(dt={dt!r}, mode={sample_mode})…" + ) + t0 = time.monotonic() + rows = fetch_records(conn, sql, params) + for row in rows: + if _append_sample_key(keys, seen, str(row.get("sample_key") or ""), sample_size=sample_size): + _log( + f"[info] 抽样 SQL {idx}/{len(query_plan)} [{label}] 完成," + f"耗时 {time.monotonic() - t0:.1f}s,已收集 {len(keys)} 个 key" + ) + return keys + _log( + f"[info] 抽样 SQL {idx}/{len(query_plan)} [{label}] 完成," + f"耗时 {time.monotonic() - t0:.1f}s,当前共 {len(keys)} 个 key" + ) + return keys + + +def build_target_record_query(table: str, isbn13: Any, dt: Optional[str]) -> Tuple[str, List[Any]]: + params: List[Any] = [] + if dt is not None: + params.append(dt) + params.append(str(isbn13)) + dt_sql = " AND `dt` = %s" if dt is not None else "" + sql = ( + f"SELECT * FROM {quote_identifier(table)} WHERE 1=1" + f"{dt_sql} AND `isbn13` = %s LIMIT 1" + ) + return sql, params + + +def build_source_query(table: str, isbn13: Any, dt: Optional[str]) -> Tuple[str, List[Any]]: + params: List[Any] = [] + key_expr = source_canonical_isbn13_expr() + if dt is not None: + params.append(dt) + params.append(str(isbn13)) + dt_sql = " AND `dt` = %s" if dt is not None else "" + return ( + "SELECT * FROM (" + f"SELECT *, {key_expr} AS 
sample_key FROM {quote_identifier(table)} WHERE 1=1{dt_sql}" + ") keyed WHERE sample_key = %s", + params, + ) + + +def build_source_batch_query( + table: str, + sample_keys: Sequence[str], + dt: Optional[str], +) -> Tuple[str, List[Any]]: + if not sample_keys: + raise ValueError("sample_keys must not be empty") + + key_expr = source_canonical_isbn13_expr() + sample_key_sql = " UNION ALL ".join("SELECT %s AS sample_key" for _ in sample_keys) + params: List[Any] = [str(key) for key in sample_keys] + if dt is not None: + params.append(dt) + dt_sql = " AND `dt` = %s" if dt is not None else "" + + sql = ( + f"WITH sample_keys AS ({sample_key_sql}), " + "source_keyed AS (" + f"SELECT *, {key_expr} AS sample_key FROM {quote_identifier(table)} WHERE 1=1{dt_sql}" + ") " + "SELECT source_keyed.* FROM source_keyed " + "JOIN sample_keys ON source_keyed.sample_key = sample_keys.sample_key" + ) + return sql, params + + +def group_source_rows_by_sample_key( + rows: Sequence[Dict[str, Any]], +) -> Dict[str, List[Dict[str, Any]]]: + grouped: Dict[str, List[Dict[str, Any]]] = {} + for row in rows: + key = str(row.get("sample_key") or "") + if not key: + continue + grouped.setdefault(key, []).append(row) + return grouped + + +def normalize_order_insensitive_value(value: Any) -> Any: + value = canonicalize(value) + if isinstance(value, list): + dedup_map: Dict[str, Any] = {} + for item in value: + if item is None or item == "": + continue + dedup_map[canonical_json(item)] = item + return [dedup_map[key] for key in sorted(dedup_map)] + return value + + +def normalize_empty_for_compare(value: Any, data_type: str) -> Any: + type_text = (data_type or "").strip().lower() + if value is None: + return None + if type_text in ("string", "varchar", "char", "text"): + return None if isinstance(value, str) and value.strip() == "" else value + if type_text.startswith("array"): + if value == []: + return None + if isinstance(value, str) and value.strip() in ("", "[]"): + return None + return value 
+ + +def compare_records( + expected: Dict[str, Any], + actual: Dict[str, Any], + order_insensitive_fields: Optional[set] = None, + field_types: Optional[Dict[str, str]] = None, +) -> Dict[str, Dict[str, Any]]: + mismatches: Dict[str, Dict[str, Any]] = {} + order_insensitive_fields = order_insensitive_fields or set() + field_types = field_types or {} + for field, expected_value in expected.items(): + if field in order_insensitive_fields: + expected_value = normalize_order_insensitive_value(expected_value) + actual_value = normalize_order_insensitive_value(actual.get(field)) + else: + actual_value = canonicalize(actual.get(field)) + expected_value = normalize_empty_for_compare(expected_value, field_types.get(field, "")) + actual_value = normalize_empty_for_compare(actual_value, field_types.get(field, "")) + if expected_value != actual_value: + mismatches[field] = {"expected": expected_value, "actual": actual_value} + return mismatches + + +def validate_dt_partitions( + conn: Any, + source_table: str, + target_table: str, + dt: Optional[str], + *, + skip_source_distinct: bool = False, +) -> Dict[str, Any]: + """Check dt partition coverage and key counts between source and target.""" + params: List[Any] = [] + dt_filter = _dt_clause(dt, params) + + src_map: Dict[str, int] = {} + if not skip_source_distinct: + key_expr = source_canonical_isbn13_expr() + src_sql = ( + "SELECT `dt`, COUNT(DISTINCT sample_key) AS key_count FROM (" + f"SELECT `dt`, {key_expr} AS sample_key FROM {quote_identifier(source_table)}" + f" WHERE 1=1{dt_filter}" + ") keyed WHERE 1=1" + f"{_key_not_null_clause('sample_key')}" + " GROUP BY `dt` ORDER BY `dt`" + ) + src_rows = fetch_records(conn, src_sql, params) + src_map = {str(r["dt"]): int(r["key_count"]) for r in src_rows} + + tgt_sql = ( + f"SELECT `dt`, COUNT(*) AS row_count" + f" FROM {quote_identifier(target_table)}" + f" WHERE 1=1{dt_filter} GROUP BY `dt` ORDER BY `dt`" + ) + tgt_rows = fetch_records(conn, tgt_sql, params) + tgt_map = 
{str(r["dt"]): int(r["row_count"]) for r in tgt_rows} + all_dts = sorted(set(src_map) | set(tgt_map)) + + mismatches: List[Dict[str, Any]] = [] + for d in all_dts: + src_cnt = src_map.get(d) + tgt_cnt = tgt_map.get(d) + if src_cnt != tgt_cnt: + mismatches.append({ + "dt": d, + "source_key_count": src_cnt, + "target_row_count": tgt_cnt, + }) + + return { + "source_dt_count": len(src_map), + "target_dt_count": len(tgt_map), + "missing_in_target": sorted(set(src_map) - set(tgt_map)), + "extra_in_target": sorted(set(tgt_map) - set(src_map)), + "count_mismatches": mismatches, + "source_distinct_skipped": skip_source_distinct, + } + + +def discover_dt_values(conn: Any, table: str) -> List[str]: + sql = ( + f"SELECT DISTINCT `dt` FROM {quote_identifier(table)} " + "WHERE `dt` IS NOT NULL AND `dt` != '' ORDER BY `dt`" + ) + return [str(r["dt"]) for r in fetch_records(conn, sql)] + + +def validate_db( + *, + config_path: Path, + source_table: str, + target_table: str, + dt: Optional[str], + limit: Optional[int], + sample_mode: str, + report_path: Optional[Path], + mapping_csv: Path = DEFAULT_MAPPING_CSV, + skip_dt_check: bool = False, + skip_source_distinct: bool = False, + hash_mod_base: Optional[int] = 100, + hash_mod_max: Optional[int] = 2, +) -> Dict[str, Any]: + rules = load_field_rules(mapping_csv) + output_fields = output_fields_from_rules(rules) + order_insensitive_fields = order_insensitive_fields_from_rules(rules) + field_types = {rule.field_name: rule.data_type for rule in rules} + cfg = load_config(config_path) + mysql_cfg = cfg.get("mysql", {}) if isinstance(cfg.get("mysql"), dict) else {} + catalog = mysql_cfg.get("catalog") + database = str(mysql_cfg.get("database") or "dws") + source_table = qualify_table_name(source_table, catalog, database) + target_table = qualify_table_name(target_table, catalog, database) + hash_enabled = bool(hash_mod_base and hash_mod_max and hash_mod_max > 0) + _log( + f"[info] 图书去重校验开始:dt={dt!r}, limit={limit}, 
sample_mode={sample_mode}, " + f"hash_sample={'on' if hash_enabled else 'off'}, " + f"skip_dt_check={skip_dt_check}, source={source_table}, target={target_table}" + ) + with connect_starrocks(config_path) as conn: + _log("[info] StarRocks 连接成功") + if dt is not None: + dt_list = [dt] + else: + _log("[info] 正在发现源表 dt 分区…") + dt_list = discover_dt_values(conn, source_table) + _log(f"[info] 自动发现 {len(dt_list)} 个 dt 分区,逐分区验证") + + if skip_dt_check: + dt_check = {"skipped": True} + _log("[info] 跳过分区行数统计(--skip-dt-check)") + else: + _log("[info] 正在统计目标/源分区行数(源表 DISTINCT 可较慢,可用 --skip-source-distinct 跳过)…") + t0 = time.monotonic() + dt_check = validate_dt_partitions( + conn, + source_table, + target_table, + dt, + skip_source_distinct=skip_source_distinct, + ) + _log(f"[info] 分区统计完成,耗时 {time.monotonic() - t0:.1f}s") + + checked = passed = failed = missing_source = missing_target = 0 + source_count_buckets = {"one": 0, "two": 0, "multi": 0} + mismatch_rows: List[Dict[str, Any]] = [] + + for partition_dt in dt_list: + _log(f"[info] 分区 {partition_dt}:开始抽样 key…") + sample_keys = fetch_sample_keys( + conn, + source_table=source_table, + target_table=target_table, + dt=partition_dt, + sample_mode=sample_mode, + sample_size=limit, + hash_mod_base=hash_mod_base if hash_enabled else None, + hash_mod_max=hash_mod_max if hash_enabled else None, + ) + _log(f"[info] 分区 {partition_dt}:抽到 {len(sample_keys)} 个 ISBN13,开始批量拉取源记录…") + t0 = time.monotonic() + source_rows_by_key: Dict[str, List[Dict[str, Any]]] = {} + if sample_keys: + source_sql, source_params = build_source_batch_query(source_table, sample_keys, partition_dt) + source_rows_by_key = group_source_rows_by_sample_key( + fetch_records(conn, source_sql, source_params) + ) + _log( + f"[info] 分区 {partition_dt}:源记录批量拉取完成,耗时 " + f"{time.monotonic() - t0:.1f}s,命中 {len(source_rows_by_key)}/{len(sample_keys)} 个 key" + ) + _log(f"[info] 分区 {partition_dt}:开始逐条比对…") + + for isbn13 in sample_keys: + source_rows = 
source_rows_by_key.get(isbn13, []) + checked += 1 + if checked == 1 or checked % 20 == 0: + _log(f"[info] 分区 {partition_dt}:已比对 {checked}/{len(sample_keys)} 条") + + if not source_rows: + missing_source += 1 + mismatch_rows.append({ + "key": isbn13, + "dt": partition_dt, + "status": "missing_source", + "source_count": 0, + "mismatches": {}, + }) + continue + + if len(source_rows) == 1: + source_count_buckets["one"] += 1 + elif len(source_rows) == 2: + source_count_buckets["two"] += 1 + else: + source_count_buckets["multi"] += 1 + normalized_source = [{key: normalize_json_like(value) for key, value in row.items()} for row in source_rows] + aggregated = aggregate_group(normalized_source, rules) + expected = comparable_record(aggregated, output_fields) + expected_isbn13 = str(expected.get("isbn13") or isbn13) + target_sql, target_params = build_target_record_query(target_table, expected_isbn13, partition_dt) + target_rows = fetch_records(conn, target_sql, target_params) + if not target_rows: + missing_target += 1 + mismatch_rows.append({ + "key": isbn13, + "expected_key": expected_isbn13, + "dt": partition_dt, + "status": "missing_target", + "source_count": len(source_rows), + "source_records": normalized_source, + "expected_record": expected, + "mismatches": {}, + }) + continue + + target_row = target_rows[0] + actual = comparable_record(target_row, output_fields) + mismatches = compare_records(expected, actual, order_insensitive_fields, field_types) + if mismatches: + failed += 1 + mismatch_rows.append( + { + "key": isbn13, + "expected_key": expected_isbn13, + "dt": partition_dt, + "status": "field_mismatch", + "source_count": len(source_rows), + "mismatches": mismatches, + } + ) + else: + passed += 1 + + if report_path is not None: + report_path.parent.mkdir(parents=True, exist_ok=True) + with report_path.open("w", encoding="utf-8") as f: + for row in mismatch_rows: + f.write(json.dumps(localize_report_keys(row), ensure_ascii=False, cls=JsonEncoder) + "\n") + 
(report_path.parent / "source_field_warning.jsonl").write_text("", encoding="utf-8") + + result = { + "status": "ok", + "kind": "ebook", + "source_table": source_table, + "target_table": target_table, + "key_field": "isbn13", + "dt": dt, + "validated_partitions": dt_list, + "sample_mode": sample_mode, + "sample_size": limit, + "dt_check": dt_check, + "checked": checked, + "passed": passed, + "failed": failed, + "missing_source": missing_source, + "missing_target": missing_target, + "source_count_buckets": source_count_buckets, + "report_path": str(report_path) if report_path is not None else None, + "sample_mismatches": mismatch_rows[:5], + } + if report_path is not None: + write_report_summary(report_path, result, mismatch_rows) + print(json.dumps(result, ensure_ascii=False, cls=JsonEncoder)) + return result + + +# ---- CLI ---- + + +def cli() -> None: + config_parser = argparse.ArgumentParser(add_help=False) + config_parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH) + config_args, _ = config_parser.parse_known_args() + cfg = load_config(config_args.config) if config_args.config.exists() else {} + ebook_cfg = cfg.get("unique_ebook", {}) + + default_csv = ebook_cfg.get("mapping_csv") + if default_csv: + default_csv = PROJECT_ROOT / default_csv + else: + default_csv = DEFAULT_MAPPING_CSV + + parser = argparse.ArgumentParser(description="Validate meta_ebook unique DB table by ISBN13.") + parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH, help="shared settings JSON path") + parser.add_argument("--mapping-csv", type=Path, default=default_csv, help="field mapping CSV") + parser.add_argument("--source-table", default=ebook_cfg.get("source_table", DEFAULT_SOURCE_TABLE)) + parser.add_argument("--target-table", default=ebook_cfg.get("target_table", DEFAULT_TARGET_TABLE)) + parser.add_argument("--dt", default=ebook_cfg.get("dt"), help="dt partition filter") + parser.add_argument("--limit", type=int, 
default=int(ebook_cfg.get("limit", 600))) + parser.add_argument( + "--sample-mode", + choices=("count-buckets", "mixed", "target-random", "target-first"), + default=ebook_cfg.get("sample_mode", "count-buckets"), + help="count-buckets: 1/2/N 源行分桶;mixed: 加深抽样;target-random: 目标表稳定排序抽样;target-first: 目标表 LIMIT 抽样(smoke 最快)", + ) + parser.add_argument("--full", action="store_true", help="validate all target rows") + parser.add_argument("--skip-dt-check", action="store_true", default=bool(ebook_cfg.get("skip_dt_check"))) + parser.add_argument( + "--skip-source-distinct", + action="store_true", + default=bool(ebook_cfg.get("skip_source_distinct")), + help="dt 统计时跳过源表 COUNT(DISTINCT canonical_isbn13)", + ) + parser.add_argument( + "--no-sample-hash", + action="store_true", + help="关闭 CRC32 哈希预过滤(默认 mod 100 取 2,约 2%% 子集)", + ) + parser.add_argument( + "--sample-hash-mod-base", + type=int, + default=int(ebook_cfg.get("sample_hash_mod_base", 100)), + ) + parser.add_argument( + "--sample-hash-mod-max", + type=int, + default=int(ebook_cfg.get("sample_hash_mod_max", 2)), + ) + parser.add_argument("--report", type=Path, default=ebook_cfg.get("report_path"), help="JSONL report path") + args = parser.parse_args() + + hash_mod_base = None if args.no_sample_hash else args.sample_hash_mod_base + hash_mod_max = None if args.no_sample_hash else args.sample_hash_mod_max + report_path = Path(args.report) if args.report else default_report_path( + args.dt, + "count-buckets" if args.full else args.sample_mode, + args.full, + ) + + validate_db( + config_path=args.config, + source_table=args.source_table, + target_table=args.target_table, + dt=args.dt, + limit=None if args.full else args.limit, + sample_mode="count-buckets" if args.full else args.sample_mode, + report_path=report_path, + mapping_csv=args.mapping_csv, + skip_dt_check=args.skip_dt_check, + skip_source_distinct=args.skip_source_distinct, + hash_mod_base=hash_mod_base, + hash_mod_max=hash_mod_max, + ) + + +from 
dingo.config.input_args import EvaluatorRuleArgs +from dingo.io.input import Data, RequiredField +from dingo.io.output.eval_detail import EvalDetail, QualityLabel +from dingo.model.model import Model +from dingo.model.rule.base import BaseRule +from dingo.model.rule.scibase.report_utils import bool_param, int_param, write_temp_settings + + +@Model.rule_register( + "QUALITY_BAD_EFFECTIVENESS", + ["sci_base_qa_test", "meta_ebook_unique"], +) +class RuleSciBaseMetaEbookUniqueReport(BaseRule): + _metric_info = { + "category": "Rule-Based Metadata Quality Metrics", + "quality_dimension": "EFFECTIVENESS", + "metric_name": "RuleSciBaseMetaEbookUniqueReport", + "description": "Run SciBase ebook ISBN unique DB validation and write reports.", + "paper_title": "", + "paper_url": "", + "paper_authors": "", + "evaluation_results": "", + } + + _required_fields = [RequiredField.METADATA] + dynamic_config = EvaluatorRuleArgs(parameters={}) + + @classmethod + def eval(cls, input_data: Data) -> EvalDetail: + del input_data + params = cls.dynamic_config.parameters or {} + full = bool_param(params, "full", False) + sample_mode = str(params.get("sample_mode") or "count-buckets") + dt = params.get("dt") + report_path = Path(params["report_path"]) if params.get("report_path") else None + if report_path is None and params.get("output_dir"): + report_path = Path(str(params["output_dir"])) / "source_field_mismatch.jsonl" + if report_path is None: + report_path = default_report_path(dt, "count-buckets" if full else sample_mode, full) + + config_path = write_temp_settings(params) + result = validate_db( + config_path=config_path, + source_table=str(params.get("source_table") or DEFAULT_SOURCE_TABLE), + target_table=str(params.get("target_table") or DEFAULT_TARGET_TABLE), + dt=dt, + limit=None if full else int_param(params, "limit", 600), + sample_mode="count-buckets" if full else sample_mode, + report_path=report_path, + mapping_csv=Path(str(params.get("mapping_csv") or DEFAULT_MAPPING_CSV)), 
+ skip_dt_check=bool_param(params, "skip_dt_check", False), + skip_source_distinct=bool_param(params, "skip_source_distinct", False), + hash_mod_base=None if bool_param(params, "no_sample_hash", False) else int_param(params, "sample_hash_mod_base", 100), + hash_mod_max=None if bool_param(params, "no_sample_hash", False) else int_param(params, "sample_hash_mod_max", 2), + ) + bad = any( + int(result.get(key) or 0) > 0 + for key in ("failed", "missing_source", "missing_target") + ) + count_mismatches = ((result.get("dt_check") or {}).get("count_mismatches") or []) + bad = bad or bool(count_mismatches) + reason = [str(report_path.parent), f"checked={result.get('checked')}", f"failed={result.get('failed')}"] + if bad: + return EvalDetail( + metric=cls.__name__, + status=True, + label=[f"{cls.metric_type}.{cls.__name__}"], + reason=reason, + ) + return EvalDetail(metric=cls.__name__, label=[QualityLabel.QUALITY_GOOD], reason=reason) + + +if __name__ == "__main__": + cli() diff --git a/dingo/model/rule/scibase/meta_paper_data.py b/dingo/model/rule/scibase/meta_paper_data.py new file mode 100644 index 00000000..ff20066b --- /dev/null +++ b/dingo/model/rule/scibase/meta_paper_data.py @@ -0,0 +1,3408 @@ +#!/usr/bin/env python3 +"""Single-file verifier for S3 arxiv data loaded into the paper source table. + +Generated from /Users/guhuaiyu/PycharmProjects/osi_test without modifying that source project. +This file validates S3 arxiv metadata against the paper source table. +Runtime dependencies: pymysql, duckdb, pyarrow, boto3. 
+""" +from __future__ import annotations + + +# ---- osi_verify/common.py ---- + + +import json +from datetime import datetime +from typing import Any, Dict + + +def sql_literal(value: str) -> str: + return value.replace("'", "''") + + +def json_loads_maybe(v: Any) -> Any: + if v is None or isinstance(v, (dict, list)): + return v + if isinstance(v, (bytes, bytearray)): + v = v.decode("utf-8", errors="replace") + if isinstance(v, str): + s = v.strip() + if s and s[0] in "{[": + try: + return json.loads(s) + except json.JSONDecodeError: + pass + return v + + +def normalize_scalar(v: Any) -> Any: + if v is None: + return None + if isinstance(v, bool): + return v + if isinstance(v, datetime): + return v.isoformat(sep=" ", timespec="seconds") + if isinstance(v, (int, float)) and not isinstance(v, bool): + return v + s = str(v).strip() + return s if s else None + + +def get_first(row: Dict[str, Any], *keys: str) -> Any: + for k in keys: + if k in row and row[k] is not None: + return row[k] + return None + + +def as_bool_flag(v: Any) -> bool: + if isinstance(v, bool): + return v + if v is None: + return False + if isinstance(v, (int, float)): + return int(v) == 1 + return str(v).strip() in ("1", "true", "True", "yes", "Y") + + +def oa_flag_str(flag: bool) -> str: + return "true" if flag else "false" + + +# ---- osi_verify/retry.py ---- + + +import sys +import time +from dataclasses import dataclass +from typing import Any, Callable, Dict, Optional, TypeVar + +T = TypeVar("T") + +try: + import pymysql +except ImportError: + pymysql = None # type: ignore + +try: + import duckdb +except ImportError: + duckdb = None # type: ignore + +try: + from botocore.exceptions import BotoCoreError, ClientError, ConnectionError as BotoConnectionError +except ImportError: + BotoCoreError = ClientError = BotoConnectionError = () # type: ignore + + +@dataclass(frozen=True) +class RetryConfig: + enabled: bool = True + max_attempts: int = 3 + initial_delay_sec: float = 1.0 + backoff_factor: 
float = 2.0 + max_delay_sec: float = 30.0 + + def __post_init__(self) -> None: + max_attempts = max(1, int(self.max_attempts)) + initial_delay = max(0.0, float(self.initial_delay_sec)) + backoff = max(1.0, float(self.backoff_factor)) + max_delay = max(initial_delay, max(0.0, float(self.max_delay_sec))) + object.__setattr__(self, "max_attempts", max_attempts) + object.__setattr__(self, "initial_delay_sec", initial_delay) + object.__setattr__(self, "backoff_factor", backoff) + object.__setattr__(self, "max_delay_sec", max_delay) + + @classmethod + def disabled(cls) -> "RetryConfig": + return cls(enabled=False, max_attempts=1) + + def attempts(self) -> int: + return max(1, self.max_attempts) if self.enabled else 1 + + +def _positive_int(value: Any, default: int) -> int: + try: + parsed = int(value) + except (TypeError, ValueError): + return default + return max(1, parsed) + + +def _non_negative_float(value: Any, default: float) -> float: + try: + parsed = float(value) + except (TypeError, ValueError): + return default + return max(0.0, parsed) + + +def _min_float(value: Any, default: float, minimum: float) -> float: + try: + parsed = float(value) + except (TypeError, ValueError): + return default + return max(minimum, parsed) + + +def load_retry_config(settings: Dict[str, Any]) -> RetryConfig: + raw = settings.get("retry") + if not isinstance(raw, dict): + return RetryConfig() + return RetryConfig( + enabled=bool(raw.get("enabled", True)), + max_attempts=_positive_int(raw.get("max_attempts", 3), 3), + initial_delay_sec=_non_negative_float(raw.get("initial_delay_sec", 1.0), 1.0), + backoff_factor=_min_float(raw.get("backoff_factor", 2.0), 2.0, 1.0), + max_delay_sec=_non_negative_float(raw.get("max_delay_sec", 30.0), 30.0), + ) + + +def _exc_message(exc: BaseException) -> str: + return str(exc).lower() + + +def is_mysql_retryable(exc: BaseException) -> bool: + if pymysql is None: + return False + if isinstance(exc, pymysql.err.OperationalError): + code = exc.args[0] if 
exc.args else None + if code in (2003, 2006, 2013): + return True + if isinstance(exc, pymysql.err.ProgrammingError): + msg = _exc_message(exc) + return any( + token in msg + for token in ( + "timeout", + "timed out", + "connection", + "lost connection", + "brpc", + "host is down", + "not connected", + "could not determine master", + "master from helpers", + "no alive backend", + "frontend", + ) + ) + if isinstance(exc, (TimeoutError, ConnectionError, OSError)): + return True + return False + + +def is_s3_retryable(exc: BaseException) -> bool: + if duckdb is not None and isinstance(exc, duckdb.IOException): + msg = _exc_message(exc) + return any( + token in msg + for token in ( + "connection", + "failed to read", + "timeout", + "network", + "io error", + ) + ) + if isinstance(exc, (BotoConnectionError, TimeoutError, ConnectionError, OSError)): + return True + if isinstance(exc, ClientError): + code = str(exc.response.get("Error", {}).get("Code", "")) + status = int(exc.response.get("ResponseMetadata", {}).get("HTTPStatusCode", 0) or 0) + if status in (408, 429, 500, 502, 503, 504): + return True + return code in { + "RequestTimeout", + "RequestTimeoutException", + "Throttling", + "ThrottlingException", + "SlowDown", + "InternalError", + "ServiceUnavailable", + } + if isinstance(exc, BotoCoreError): + return True + return False + + +def retry_call( + fn: Callable[[], T], + config: Optional[RetryConfig], + *, + label: str, + retryable: Callable[[BaseException], bool], +) -> T: + cfg = config or RetryConfig() + attempts = cfg.attempts() + delay = cfg.initial_delay_sec + last_exc: Optional[BaseException] = None + + for attempt in range(1, attempts + 1): + try: + return fn() + except Exception as exc: + last_exc = exc + if attempt >= attempts or not cfg.enabled or not retryable(exc): + raise + print( + f"[retry] {label} 失败 ({type(exc).__name__}: {exc})," + f"{delay:.1f}s 后重试 ({attempt}/{attempts})", + file=sys.stderr, + ) + time.sleep(delay) + delay = min(delay * 
cfg.backoff_factor, cfg.max_delay_sec) + + assert last_exc is not None + raise last_exc + + +# ---- osi_verify/config.py ---- + + +import json +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Optional, Union + +PROJECT_ROOT = Path(__file__).resolve().parent +ASSETS_DIR = PROJECT_ROOT / "assets" +REPORT_ROOT = Path("report") +DEFAULT_ICEBERG_CATALOG = "lakehouse_iceberg" +DEFAULT_SETTINGS_JSON = Path("sci_base_qa_test_config.json") +DT_RE = re.compile(r"dt=([^/]+)") + + +@dataclass(frozen=True) +class TargetConfig: + name: str + kind: str + description: str + mapping_csv: Path + database: str + table: str + catalog: Optional[str] + origin_osi: str + source_id_field: str + transform: str + mapping_target_column: str + mapping_source_column: str + s3_settings: Dict[str, Any] + s3_subpath: Optional[str] + s3_path: Optional[str] + s3_format: Optional[str] + + +def resolve_project_path(value: Optional[Union[str, Path]]) -> Optional[Path]: + if value is None: + return None + path = Path(value).expanduser() + if path.is_absolute(): + return path + return PROJECT_ROOT / path + + +def load_settings(path: Path) -> Dict[str, Any]: + if not path.exists(): + return {} + return json.loads(path.read_text(encoding="utf-8")) + + +def load_arxiv_target_config(settings: Dict[str, Any]) -> TargetConfig: + arxiv_settings = settings.get("osi_arxiv", {}) if isinstance(settings.get("osi_arxiv"), dict) else {} + table_settings = settings.get("table", {}) if isinstance(settings.get("table"), dict) else {} + mapping_settings = settings.get("mapping", {}) if isinstance(settings.get("mapping"), dict) else {} + s3_settings = arxiv_settings.get("s3", {}) if isinstance(arxiv_settings.get("s3"), dict) else {} + for key in ("config_file", "path", "subpath", "format"): + if key in arxiv_settings and arxiv_settings[key] not in (None, ""): + s3_settings[key] = arxiv_settings[key] + mapping_csv = resolve_project_path( + 
arxiv_settings.get("mapping_csv") + or table_settings.get("mapping_csv") + or mapping_settings.get("csv") + or str(ASSETS_DIR / "osi_arxiv_mapping.csv") + ) + if mapping_csv is None: + mapping_csv = ASSETS_DIR / "osi_arxiv_mapping.csv" + return TargetConfig( + name="osi_axiv", + kind="osi_axiv", + description="S3 arxiv 数据到论文源数据表校验", + mapping_csv=mapping_csv, + database=str(arxiv_settings.get("database") or table_settings.get("database") or "dws"), + table=str(arxiv_settings.get("target_table") or arxiv_settings.get("table") or table_settings.get("table") or "dws_meta_paper_data_acc_d"), + catalog=str(arxiv_settings.get("catalog") or table_settings.get("catalog") or DEFAULT_ICEBERG_CATALOG), + origin_osi="arxiv", + source_id_field="doc_id", + transform="osi_arxiv", + mapping_target_column=str(arxiv_settings.get("mapping_target_column") or mapping_settings.get("target_column") or "预期字段"), + mapping_source_column=str(arxiv_settings.get("mapping_source_column") or mapping_settings.get("source_column") or "arxiv对应字段"), + s3_settings=dict(s3_settings), + s3_subpath=s3_settings.get("subpath"), + s3_path=s3_settings.get("path"), + s3_format=s3_settings.get("format"), + ) + + + +def _merge_present(base: Dict[str, Any], overrides: Dict[str, Any]) -> Dict[str, Any]: + out = dict(base) + for k, v in overrides.items(): + if v is not None and v != "": + out[k] = v + return out + + +def _strip_endpoint_scheme(value: str) -> str: + return value.removeprefix("https://").removeprefix("http://").rstrip("/") + + +def _parse_bool(value: Any) -> bool: + if isinstance(value, bool): + return value + return str(value).strip().lower() in {"1", "true", "yes", "y", "on"} + + +def parse_mysql_config(path: Path) -> Dict[str, Any]: + cfg: Dict[str, Any] = {} + for line in path.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line or ":" not in line: + continue + key, val = line.split(":", 1) + key, val = key.strip(), val.strip() + if key in ("账号", "用户名", "user"): + 
cfg["user"] = val + elif key in ("密码", "password"): + cfg["password"] = val + elif key in ("地址", "host"): + if ":" in val: + host, port = val.rsplit(":", 1) + cfg["host"] = host + cfg["port"] = int(port) + else: + cfg["host"] = val + elif key in ("catalog", "iceberg_catalog", "catalog名"): + cfg["catalog"] = val + if "port" not in cfg: + cfg["port"] = 3306 + missing = [k for k in ("user", "password", "host") if k not in cfg] + if missing: + raise ValueError(f"MySQL 配置缺少字段: {missing}(文件: {path})") + return cfg + + +def load_mysql_config(path: Optional[Path], inline: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + cfg: Dict[str, Any] = {} + if path and path.exists(): + cfg.update(parse_mysql_config(path)) + if inline: + cfg = _merge_present(cfg, inline) + host = cfg.get("host") + if isinstance(host, str) and ":" in host: + raise ValueError("MySQL host 请只配置主机名/IP,端口请通过 port 单独配置") + if "port" not in cfg: + cfg["port"] = 3306 + else: + cfg["port"] = int(cfg["port"]) + missing = [k for k in ("user", "password", "host") if k not in cfg] + if missing: + source = path or "inline settings" + raise ValueError(f"MySQL 配置缺少字段: {missing}(来源: {source})") + return cfg + + +def parse_s3_config(path: Path) -> Dict[str, Any]: + cfg: Dict[str, Any] = {} + for line in path.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line: + continue + if line.startswith("s3://"): + cfg["default_path"] = line if line.endswith("/") else line + "/" + continue + sep = ":" if ":" in line else (":" if ":" in line else None) + if not sep: + continue + key, val = line.split(sep, 1) + key, val = key.strip().upper(), val.strip() + if key in ("AK", "ACCESS_KEY", "AWS_ACCESS_KEY_ID"): + cfg["access_key"] = val + elif key in ("SK", "SECRET_KEY", "AWS_SECRET_ACCESS_KEY"): + cfg["secret_key"] = val + elif key in ("ENDPOINT", "S3_ENDPOINT"): + cfg["endpoint"] = _strip_endpoint_scheme(val) + elif key in ("USE_SSL", "S3_USE_SSL"): + cfg["use_ssl"] = _parse_bool(val) + elif key in 
("VERIFY_SSL", "S3_VERIFY_SSL"): + cfg["verify_ssl"] = _parse_bool(val) + missing = [k for k in ("access_key", "secret_key", "endpoint") if k not in cfg] + if missing: + raise ValueError(f"S3 配置缺少字段: {missing}(文件: {path})") + if "default_path" not in cfg: + cfg["default_path"] = "s3://lakehouse-scibase/" + return cfg + + +def load_s3_config(path: Optional[Path], inline: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + cfg: Dict[str, Any] = {} + if path and path.exists(): + cfg.update(parse_s3_config(path)) + if inline: + aliases = { + "ak": "access_key", + "sk": "secret_key", + "bucket_path": "default_path", + "path": "default_path", + } + normalized = {aliases.get(k, k): v for k, v in inline.items()} + if "endpoint" in normalized and normalized["endpoint"]: + normalized["endpoint"] = _strip_endpoint_scheme(str(normalized["endpoint"])) + if "use_ssl" in normalized: + normalized["use_ssl"] = _parse_bool(normalized["use_ssl"]) + if "verify_ssl" in normalized: + normalized["verify_ssl"] = _parse_bool(normalized["verify_ssl"]) + cfg = _merge_present(cfg, normalized) + missing = [k for k in ("access_key", "secret_key", "endpoint") if k not in cfg] + if missing: + source = path or "inline settings" + raise ValueError(f"S3 配置缺少字段: {missing}(来源: {source})") + if "default_path" not in cfg: + cfg["default_path"] = "s3://lakehouse-scibase/" + return cfg + + +def resolve_s3_path(base: str, subpath: Optional[str]) -> str: + base = base.rstrip("/") + if not subpath: + return base + "/" + return base + "/" + subpath.strip("/") + "/" + + +def apply_s3_dt_to_path(s3_path: str, s3_dt: Optional[str]) -> str: + if not s3_dt: + return s3_path + if "YYYY-MM-DD" in s3_path: + return s3_path.replace("YYYY-MM-DD", s3_dt) + if DT_RE.search(s3_path): + return DT_RE.sub(f"dt={s3_dt}", s3_path, count=1) + return s3_path + + +def extract_partition_dt(s3_subpath: Optional[str], override: Optional[str] = None) -> Optional[str]: + if override: + return override + if not s3_subpath: + return 
# Tail of `extract_partition_dt`, whose opening lines sit before this span
# (kept verbatim so no text is lost):
#         None
#     m = DT_RE.search(s3_subpath)
#     return m.group(1) if m else None


# ---- osi_verify/mapping.py ----


import csv
from dataclasses import dataclass
from pathlib import Path
from typing import List, Sequence


# Container fields that are compared as a whole; their dotted children are skipped.
CONTAINER_COMPARE_FIELDS = {"locations", "classifications"}
CONTAINER_CHILD_PREFIXES = tuple(name + "." for name in CONTAINER_COMPARE_FIELDS)
# A source note containing any of these markers excludes the field from comparison.
NON_COMPARE_MARKERS = ("后续处理",)
# Source notes that mark a field as having no upstream counterpart.
DEFAULT_EMPTY_SOURCE_MARKERS = {"无", "/"}


@dataclass(frozen=True)
class MappingRule:
    """One row of a mapping CSV: a target field and how it should be checked."""

    # Field name exactly as written in the target column (already stripped).
    target_field: str
    # Free-text note from the source column describing the upstream field.
    source_note: str
    # Field name used for comparison (see `canonical_field`).
    compare_field: str
    # Declared value type from the type column; empty when not given.
    value_type: str = ""
    # Whether the field takes part in comparison (see `should_compare`).
    compare: bool = True


def canonical_field(field: str) -> str:
    """Return the field name with surrounding whitespace removed."""
    return field.strip()


def should_compare(field: str, source_note: str) -> bool:
    """Decide whether a mapped field takes part in the comparison.

    A field is excluded when its name is empty, when it is a dotted child of
    a container field, when its source note carries a non-compare marker, or
    when it has no source note and is not itself a container field.
    """
    if not field or field.startswith(CONTAINER_CHILD_PREFIXES):
        return False
    for marker in NON_COMPARE_MARKERS:
        if marker in source_note:
            return False
    return bool(source_note) or field in CONTAINER_COMPARE_FIELDS


def load_mapping_rules(
    path: Path,
    *,
    target_column: str = "预期字段",
    source_column: str = "arxiv对应字段",
    type_column: str = "字段值数据类型",
) -> List[MappingRule]:
    """Read a mapping CSV and return one `MappingRule` per non-empty target.

    Args:
        path: CSV file, read as UTF-8 with an optional BOM.
        target_column: Header of the column holding the target field name.
        source_column: Header of the column holding the source note.
        type_column: Header of the column holding the declared value type.

    Returns:
        Rules in file order; rows whose target cell is blank are skipped.

    Raises:
        ValueError: If the file has no header or lacks ``target_column``.
    """

    def cell(record: dict, column: str) -> str:
        # Missing columns and short rows yield None; treat both as empty text.
        return (record.get(column) or "").strip()

    collected: List[MappingRule] = []
    with path.open(encoding="utf-8-sig", newline="") as handle:
        reader = csv.DictReader(handle)
        header = reader.fieldnames
        if not header or target_column not in header:
            available = ", ".join(header or [])
            raise ValueError(
                f"映射文件 {path} 缺少目标字段列 {target_column!r}"
                f"(可用列: {available})"
            )
        for record in reader:
            target = cell(record, target_column)
            if not target:
                continue
            note = cell(record, source_column)
            collected.append(
                MappingRule(
                    target_field=target,
                    source_note=note,
                    compare_field=canonical_field(target),
                    value_type=cell(record, type_column),
                    compare=should_compare(target, note),
                )
            )
    return collected


# Head of the next definition; its body continues after this span:
# def compare_fields_from_rules(rules:
Sequence[MappingRule]) -> List[str]: + fields: List[str] = [] + seen = set() + for rule in rules: + if not rule.compare: + continue + if rule.compare_field in seen: + continue + seen.add(rule.compare_field) + fields.append(rule.compare_field) + return fields + + +def default_empty_field_types_from_rules(rules: Sequence[MappingRule]) -> dict[str, str]: + """映射来源为“无”的字段:按声明类型做默认空值校验。""" + fields: dict[str, str] = {} + for rule in rules: + if not rule.compare or rule.source_note not in DEFAULT_EMPTY_SOURCE_MARKERS: + continue + if rule.compare_field in fields: + continue + fields[rule.compare_field] = rule.value_type + return fields + + +# ---- osi_verify/transform.py ---- + + +import json +import re +from datetime import datetime +from typing import Any, Dict, List, Optional, Tuple + + +ARXIV_ABS_RE = re.compile( + r"^(?:https?://)?(?:arxiv\.org/abs/|export\.arxiv\.org/abs/)?", + re.I, +) +ARXIV_DOI_PREFIX = "10.48550/arxiv." + +# 校验字段名 -> 湖仓表实际列名(当二者不一致时) +DB_COLUMN_ALIASES: Dict[str, str] = { + "s2FieldsOfStudy": "s2fieldsofstudy", +} + + +def strip_arxiv_id(paper_id: Optional[str]) -> Optional[str]: + if not paper_id: + return None + s = ARXIV_ABS_RE.sub("", str(paper_id).strip()) + return s.strip("/") or None + + +def parse_datetime_value(updated: Any) -> Optional[datetime]: + if updated is None: + return None + if isinstance(updated, datetime): + return updated + if hasattr(updated, "year") and hasattr(updated, "month") and hasattr(updated, "day"): + return datetime(updated.year, updated.month, updated.day) + s = str(updated).strip() + if not s: + return None + if re.match(r"^\d{4}-\d{2}-\d{2}", s): + try: + return datetime.strptime(s[:10], "%Y-%m-%d") + except ValueError: + pass + for fmt in ( + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%d", + "%a, %d %b %Y %H:%M:%S %Z", + ): + try: + sample = s[:30] if "GMT" in s else s[:19] + return datetime.strptime(sample, fmt) + except ValueError: + continue + m = re.match(r"(\d{4})", s) + return 
# Final expression of `parse_datetime_value`, whose opening lines sit before
# this span (kept verbatim so no text is lost):
#     datetime(int(m.group(1)), 1, 1) if m else None


def parse_date_iso(updated: Any) -> Optional[str]:
    """Normalize a GMT or other date value to ``YYYY-MM-DD``.

    Returns None when `parse_datetime_value` cannot interpret the input.
    """
    parsed = parse_datetime_value(updated)
    if not parsed:
        return None
    return parsed.strftime("%Y-%m-%d")


def parse_year(updated: Any) -> Optional[int]:
    """Return the year of a date value, or None when it cannot be parsed."""
    parsed = parse_datetime_value(updated)
    if not parsed:
        return None
    return parsed.year


def _normalize_author_name(name: str) -> str:
    """Strip whitespace and a leading ``and`` from one author name.

    Splitting ``"A, and B"`` on commas leaves ``"and B"``; this removes that
    leftover conjunction.
    """
    trimmed = name.strip()
    has_and_prefix = trimmed.lower().startswith("and ")
    return trimmed[4:].strip() if has_and_prefix else trimmed


def parse_authors(author: Any) -> List[str]:
    """Turn an author value into a list of cleaned, non-empty names.

    A list is taken element by element (each converted with ``str``); any
    other value is converted to text and split on commas, semicolons and the
    word ``and``. None and blank text give an empty list.
    """
    if author is None:
        return []
    if isinstance(author, list):
        candidates = [str(item) for item in author]
    else:
        text = str(author).strip()
        if not text:
            return []
        candidates = re.split(r"[,;]\s*|\s+and\s+", text, flags=re.I)
    names: List[str] = []
    for candidate in candidates:
        cleaned = _normalize_author_name(candidate)
        if cleaned:
            names.append(cleaned)
    return names


# Allowed product license_url values (2025.09.01); the empty string is allowed.
LICENSE_ALLOWED: frozenset = frozenset(
    {
        "cc-by",
        "cc-by-nc",
        "cc-by-sa",
        "cc-by-nd",
        "cc-by-nc-sa",
        "cc-by-nc-nd",
        "other-oa",
        "cc0",
        "",
        "public-domain",
        "publisher-specific-oa",
        "nonexclusive-distrib",
    }
)

# S3 license_url values and historical aliases -> canonical allowed value.
DEFAULT_LICENSE_MAP: Dict[str, str] = {
    "http://arxiv.org/licenses/nonexclusive-distrib/1.0/": "nonexclusive-distrib",
    "https://arxiv.org/licenses/nonexclusive-distrib/1.0/": "nonexclusive-distrib",
    "arxiv-nonexclusive-distrib-1.0": "nonexclusive-distrib",
    "http://creativecommons.org/licenses/by/4.0/": "cc-by",
    "https://creativecommons.org/licenses/by/4.0/": "cc-by",
    "http://creativecommons.org/licenses/by/3.0/": "cc-by",
    "https://creativecommons.org/licenses/by/3.0/": "cc-by",
    "CC-BY-4.0": "cc-by",
    "CC-BY-3.0": "cc-by",
    "http://creativecommons.org/licenses/by-nc/4.0/": "cc-by-nc",
    "https://creativecommons.org/licenses/by-nc/4.0/": "cc-by-nc",
    "http://creativecommons.org/licenses/by-sa/4.0/": "cc-by-sa",
    "https://creativecommons.org/licenses/by-sa/4.0/": "cc-by-sa",
    "http://creativecommons.org/licenses/by-nd/4.0/": "cc-by-nd",
    "https://creativecommons.org/licenses/by-nd/4.0/": "cc-by-nd",
    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "cc-by-nc-sa",
    "https://creativecommons.org/licenses/by-nc-sa/4.0/": "cc-by-nc-sa",
    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "cc-by-nc-nd",
    "https://creativecommons.org/licenses/by-nc-nd/4.0/": "cc-by-nc-nd",
    "http://creativecommons.org/publicdomain/zero/1.0/": "cc0",
    "https://creativecommons.org/publicdomain/zero/1.0/": "cc0",
    "CC0-1.0": "cc0",
}

# Fallback URL patterns, tried in order; the more specific variants come
# first so that e.g. "by-nc-sa" is not captured by the "by-nc" rule.
_CC_LICENSE_URL_RULES: List[Tuple[re.Pattern, str]] = [
    (re.compile(r"creativecommons\.org/licenses/by-nc-sa", re.I), "cc-by-nc-sa"),
    (re.compile(r"creativecommons\.org/licenses/by-nc-nd", re.I), "cc-by-nc-nd"),
    (re.compile(r"creativecommons\.org/licenses/by-nc(?:/|$)", re.I), "cc-by-nc"),
    (re.compile(r"creativecommons\.org/licenses/by-sa", re.I), "cc-by-sa"),
    (re.compile(r"creativecommons\.org/licenses/by-nd", re.I), "cc-by-nd"),
    (re.compile(r"creativecommons\.org/licenses/by(?:/|$)", re.I), "cc-by"),
    (re.compile(r"creativecommons\.org/publicdomain/zero", re.I), "cc0"),
    (re.compile(r"arxiv\.org/licenses/nonexclusive-distrib", re.I), "nonexclusive-distrib"),
]


def normalize_license_value(v: Any, license_map: Dict[str, str]) -> str:
    """Normalize an S3 URL, alias or DB value to a license_url value.

    Resolution order: exact key in ``license_map``; the same key without
    trailing slashes; the lowercased text if it is an allowed value; the
    first matching URL pattern; otherwise the lowercased text unchanged.
    None and blank input give the empty string.
    """
    if v is None:
        return ""
    text = str(v).strip()
    if not text:
        return ""
    for key in (text, text.rstrip("/")):
        if key in license_map:
            return license_map[key]
    lowered = text.lower()
    if lowered in LICENSE_ALLOWED:
        return lowered
    matches = (canon for pattern, canon in _CC_LICENSE_URL_RULES if pattern.search(text))
    return next(matches, lowered)


# Head of the next definition; its body continues after this span:
# def license_out_of_allowed_warning(value: str, *, source: str = "S3") -> Optional[str]:
#     if value and value not in LICENSE_ALLOWED:
#         allowed = ", ".join(sorted(LICENSE_ALLOWED - {""}))
#         return (
f"[WARN] license_url {source} 值 '{value}' 不在产品可选值内" + f"({allowed}),属上游数据,不判定为开发缺陷" + ) + return None + + +def map_license_url(url: Any, license_map: Dict[str, str]) -> str: + return normalize_license_value(url, license_map) + + + + + +def build_doi(row: Dict[str, Any]) -> Optional[str]: + """与落库一致:S3 有 doi 直接用;否则 10.48550/arxiv.{doc_id}。""" + doi = get_first(row, "doi") + if doi is not None and str(doi).strip(): + return str(doi).strip() + doc_id = get_first(row, "doc_id") + if doc_id is not None and str(doc_id).strip(): + return f"{ARXIV_DOI_PREFIX}{str(doc_id).strip()}" + return None + + +def normalize_doi(v: Any) -> Optional[str]: + """DOI 比对忽略大小写。""" + v = normalize_scalar(v) + if v is None: + return None + return str(v).strip().lower() + + +def normalize_indexed_in(v: Any) -> List[str]: + """与落库一致:List[string]。""" + v = json_loads_maybe(v) + if v is None: + return [] + if isinstance(v, list): + return [str(x).strip() for x in v if str(x).strip()] + s = str(v).strip() + return [s] if s else [] + + +def build_identifiers(row: Dict[str, Any]) -> Dict[str, str]: + """ + 与落库一致:map,oaiId <- oai_identifier,arxivId <- paper_id(去掉 https:// 等前缀)。 + """ + out: Dict[str, str] = {} + oai = get_first(row, "oai_identifier") + if oai: + out["oaiId"] = str(oai).strip() + aid = strip_arxiv_id(get_first(row, "paper_id")) + if aid: + out["arxivId"] = aid + return out + + +def normalize_identifiers(v: Any) -> Dict[str, str]: + """比对用:统一为 {oaiId, arxivId} map。""" + v = json_loads_maybe(v) + if v is None: + return {} + if isinstance(v, dict): + out: Dict[str, str] = {} + oai = v.get("oaiId") or v.get("oai_id") + if oai: + out["oaiId"] = str(oai).strip() + arxiv = v.get("arxivId") or v.get("arxiv_id") + if arxiv: + aid = strip_arxiv_id(arxiv) or str(arxiv).strip() + if aid: + out["arxivId"] = aid + return out + if isinstance(v, list): + out = {} + for item in v: + if not isinstance(item, dict): + continue + t = str(item.get("type", "")).lower() + val = item.get("value") + if not 
val: + continue + if t in ("oai_identifier", "oaiid", "oai_id"): + out["oaiId"] = str(val).strip() + elif t in ("arxiv_id", "arxivid"): + aid = strip_arxiv_id(val) or str(val).strip() + if aid: + out["arxivId"] = aid + return out + return {} + + +def build_locations(row: Dict[str, Any], license_map: Dict[str, str]) -> List[Dict[str, Any]]: + locs: List[Dict[str, Any]] = [] + get_pdf = as_bool_flag(get_first(row, "get_pdf")) + get_source = as_bool_flag(get_first(row, "get_source")) + lic = map_license_url(get_first(row, "license_url"), license_map) + pdf_url = get_first(row, "pdf_url") + if pdf_url: + locs.append( + { + "type": "download" if get_pdf else "", + "url": str(pdf_url), + "license": lic, + "is_oa": oa_flag_str(get_pdf), + } + ) + source_url = get_first(row, "source_url") + if source_url: + locs.append( + { + "type": "download" if get_source else "", + "url": str(source_url), + "license": lic, + "is_oa": oa_flag_str(get_source), + } + ) + return locs + + +def normalize_locations(v: Any, license_map: Dict[str, str]) -> List[Dict[str, Any]]: + """比对用:统一 locations,is_oa 为 string,license 为标准可选值。""" + v = json_loads_maybe(v) + if not isinstance(v, list): + return [] + out: List[Dict[str, Any]] = [] + for item in v: + if not isinstance(item, dict): + continue + loc = dict(item) + if "is_oa" in loc: + loc["is_oa"] = oa_flag_str(as_bool_flag(loc["is_oa"])) + if "license" in loc: + loc["license"] = normalize_license_value(loc["license"], license_map) + out.append(loc) + return out + + +def _classification_field(row: Dict[str, Any], key: str) -> Any: + """从 S3 行取 classifications 子字段;category -> arxiv_category。""" + if key == "arxiv_category": + raw = get_first(row, "category") + else: + raw = get_first(row, key) + if raw is None: + return None + if isinstance(raw, str) and not raw.strip(): + return None + if key == "arxiv_category": + if isinstance(raw, list): + return [str(x).strip() for x in raw if str(x).strip()] + return [raw.strip()] if str(raw).strip() else 
None + return raw + + +def build_classifications(row: Dict[str, Any]) -> Dict[str, Any]: + """与落库一致:固定 Object,含 mesh / msc_class / acm_class / arxiv_category。""" + return { + "mesh": _classification_field(row, "mesh"), + "msc_class": _classification_field(row, "msc_class"), + "acm_class": _classification_field(row, "acm_class"), + "arxiv_category": _classification_field(row, "arxiv_category"), + } + + +def normalize_classifications(v: Any) -> Dict[str, Any]: + """比对用:统一四类 key,空值归一为 null;category 别名 -> arxiv_category。""" + v = json_loads_maybe(v) + if not isinstance(v, dict): + v = {} + raw_cat = v.get("arxiv_category") + if raw_cat is None and "category" in v: + raw_cat = v.get("category") + + def norm_scalar(val: Any) -> Any: + if val is None: + return None + if isinstance(val, str) and not val.strip(): + return None + return val + + def norm_category(val: Any) -> Any: + if val is None: + return None + if isinstance(val, list): + items = [str(x).strip() for x in val if str(x).strip()] + return items or None + s = str(val).strip() + return [s] if s else None + + return { + "mesh": norm_scalar(v.get("mesh")), + "msc_class": norm_scalar(v.get("msc_class")), + "acm_class": norm_scalar(v.get("acm_class")), + "arxiv_category": norm_category(raw_cat), + } + + +def build_track_id(row: Dict[str, Any]) -> Optional[str]: + """与落库一致:track_id = arxiv:{doc_id}。""" + oid = get_first(row, "doc_id") + if oid is None or not str(oid).strip(): + return None + return f"arxiv:{str(oid).strip()}" + + +def arxiv_empty_field_defaults() -> Dict[str, Any]: + """arxiv 源无对应字段时,落库表中的默认/空值(与湖仓表现一致)。""" + return { + "language": "", + "type": [], + "keywords": [], + "fieldsOfStudy": [], + "s2FieldsOfStudy": [], + "primary_topic": {}, + "topics": [], + "concepts": [], + "subject": "", + "major": "", + "major_2": "", + "major_3": "", + "category": "", + "area": "", + "grade_class": "", + "grade": "", + "origin_db_source": "", + "reference_count": None, + "citation_count": None, + 
"influential_citation_count": None, + "fwci": None, + "references": [], + "related_works": [], + "citation_normalized_percentile": {}, + "cited_by_percentile_year": {}, + "cited_by_api_url": "", + "venue_name": "", + "venue_type": "", + "venue_issn": [], + "venue_publisher": [], + "venue.type": "", + "venue.issn": [], + "venue.publisher": [], + "biblio_volume": "", + "biblio_issue": "", + "biblio_pages": "", + "mesh": None, + "msc_class": None, + "acm_class": None, + "arxiv_category": None, + } + + +def transform_arxiv_row(row: Dict[str, Any], license_map: Dict[str, str]) -> Dict[str, Any]: + updated = get_first(row, "updated") + get_pdf = as_bool_flag(get_first(row, "get_pdf")) + pdf_url = get_first(row, "pdf_url") or "" + expected: Dict[str, Any] = arxiv_empty_field_defaults() + expected.update({ + "track_id": build_track_id(row), + "title": get_first(row, "title"), + "abstract": get_first(row, "abstract"), + "doi": build_doi(row), + "author": parse_authors(get_first(row, "authors")), + "identifiers": build_identifiers(row), + "indexed_in": ["arxiv"], + "published_date": parse_date_iso(updated), + "published_year": parse_year(updated), + "access_is_oa": "true", + "access_oa_status": "", + "access_oa_url": str(pdf_url) if get_pdf else "", + "access_license": map_license_url(get_first(row, "license_url"), license_map), + "origin_id": get_first(row, "doc_id"), + "origin_osi": "arxiv", + "locations": build_locations(row, license_map), + "classifications": build_classifications(row), + "mesh": _classification_field(row, "mesh"), + "msc_class": _classification_field(row, "msc_class"), + "acm_class": _classification_field(row, "acm_class"), + "arxiv_category": _classification_field(row, "arxiv_category"), + }) + return expected + + +# ---- osi_verify/transforms/registry.py ---- + + +from typing import Any, Callable, Dict + + +TransformFn = Callable[[Dict[str, Any], Dict[str, str]], Dict[str, Any]] + +TRANSFORMS: Dict[str, TransformFn] = { + "osi_arxiv": 
# Closing lines of the `TRANSFORMS` mapping opened before this span
# (kept verbatim so no text is lost):
#     transform_arxiv_row,
# }


def transform_row(row: Dict[str, Any], license_map: Dict[str, str], transform: str) -> Dict[str, Any]:
    """Apply the transform registered under ``transform`` to one source row.

    Raises:
        ValueError: If ``transform`` is not a key of ``TRANSFORMS``.
    """
    try:
        converter = TRANSFORMS[transform]
    except KeyError as e:
        raise ValueError(f"不支持的 transform: {transform}") from e
    else:
        return converter(row, license_map)


# ---- osi_verify/compare.py ----


import json
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional, Sequence, Tuple


def mysql_column_for_field(field: str, columns: Sequence[str]) -> Optional[str]:
    """Find the table column that stores ``field``.

    Candidates are tried in this order: the field itself, its alias from
    ``DB_COLUMN_ALIASES``, the field with dots replaced by underscores, the
    alias with dots replaced by underscores, and the alias' top-level
    segment. Returns None when none of them is a column.
    """
    aliased = DB_COLUMN_ALIASES.get(field, field)
    candidates = (
        field,
        aliased,
        field.replace(".", "_"),
        aliased.replace(".", "_"),
        aliased.split(".")[0],
    )
    for candidate in candidates:
        if candidate in columns:
            return candidate
    return None


def get_nested_value(obj: Any, path: str) -> Any:
    """Follow a dotted ``path`` through nested dicts.

    Returns None as soon as a non-dict is reached before the path ends, or
    when a key is missing.
    """
    node = obj
    for key in path.split("."):
        if isinstance(node, dict):
            node = node.get(key)
        else:
            return None
    return node


def get_mysql_field_value(mysql_row: Dict[str, Any], field: str, columns: Sequence[str]) -> Any:
    """Read the value of ``field`` from a DB row, decoding JSON cells.

    For a dotted field stored inside its top-level column, the remainder of
    the path is looked up in the decoded dict. Otherwise the full dotted path
    is looked up in the decoded dict, or in the row itself when the cell did
    not decode to a dict. Returns None when no column matches.
    """
    column = mysql_column_for_field(field, columns)
    if not column:
        return None
    value = json_loads_maybe(mysql_row.get(column))
    if "." not in field:
        return value
    head, _, remainder = field.partition(".")
    if column == head and isinstance(value, dict):
        return get_nested_value(value, remainder)
    container = value if isinstance(value, dict) else mysql_row
    return get_nested_value(container, field)


def normalize_scalar(v: Any) -> Any:
    """Normalize a scalar for comparison.

    None and booleans pass through, datetimes become
    ``YYYY-MM-DD HH:MM:SS`` text, numbers pass through, and anything else
    becomes stripped text (None when that text is empty).
    """
    if v is None or isinstance(v, bool):
        return v
    if isinstance(v, datetime):
        return v.isoformat(sep=" ", timespec="seconds")
    if isinstance(v, (int, float)):
        return v
    text = str(v).strip()
    return text or None


def normalize_json(v: Any) -> Any:
    """Recursively normalize a JSON-like value.

    Dicts are rebuilt in sorted item order, lists keep their order, and
    leaves go through `normalize_scalar`.
    """
    parsed = json_loads_maybe(v)
    if isinstance(parsed, dict):
        return {key: normalize_json(item) for key, item in sorted(parsed.items())}
    if isinstance(parsed, list):
        return [normalize_json(item) for item in parsed]
    return normalize_scalar(parsed)


def is_empty_value(v: Any) -> bool:
    """Return True for None, an empty list or dict, or blank text."""
    normalized = normalize_json(v)
    if normalized is None:
        return True
    if isinstance(normalized, (list, dict)):
        return not normalized
    if isinstance(normalized, str):
        return not normalized.strip()
    return False


def is_empty_value_for_type(v: Any, value_type: str) -> bool:
    """Check whether ``v`` is the empty value for its declared type.

    None is empty for every type. List types require an empty list, object
    types an empty dict, string types blank text; numeric and boolean types
    are empty only when None. Unknown types fall back to `is_empty_value`.
    """
    parsed = json_loads_maybe(v)
    kind = value_type.strip().lower()
    if parsed is None:
        return True
    if kind.startswith("list"):
        return isinstance(parsed, list) and not parsed
    if kind in {"object", "dict", "map"}:
        return isinstance(parsed, dict) and not parsed
    if kind in {"string", "str"}:
        return isinstance(parsed, str) and not parsed.strip()
    if kind in {"integer", "int", "float", "double", "number", "boolean", "bool"}:
        return parsed is None
    return is_empty_value(parsed)


def _format_diff_value(v: Any, max_len: int = 500) -> str:
    """Render a value for a mismatch message, truncated to ``max_len``."""
    if v is None:
        return "null"
    if isinstance(v, (dict, list)):
        text = json.dumps(v, ensure_ascii=False, default=str)
    elif isinstance(v, str):
        text = repr(v)
    else:
        text = str(v)
    if len(text) > max_len:
        return text[: max_len - 3] + "..."
    return text
@dataclass
class FieldMismatch:
    """A single field that differs between the expected and the stored row.

    ``s3`` holds the expected value after transformation; ``db`` holds the
    actual Iceberg/MySQL value.
    """

    # Name of the compared field.
    field: str
    # Expected value derived from the S3 row.
    s3: Any
    # Value read from the database row.
    db: Any

    def to_dict(self) -> Dict[str, Any]:
        """Return the mismatch as a plain dict with keys field, s3 and db."""
        return {"field": self.field, "s3": self.s3, "db": self.db}

    def __str__(self) -> str:
        """Render both sides through `_format_diff_value` on one line."""
        return f"{self.field}: S3={_format_diff_value(self.s3)} | DB={_format_diff_value(self.db)}"


def values_equal(
    s3_val: Any,
    db_val: Any,
    field: str,
    license_map: Optional[Dict[str, str]] = None,
    empty_value_type: Optional[str] = None,
) -> Tuple[bool, Optional[FieldMismatch], Optional[str]]:
    """Compare the expected and stored value of one field.

    Args:
        s3_val: Expected value derived from the S3 row.
        db_val: Value read from the database row.
        field: Field name; selects the field-specific normalization below.
        license_map: License alias map; ``DEFAULT_LICENSE_MAP`` when falsy.
        empty_value_type: Declared type for fields that are expected to be
            empty; enables the typed-empty shortcut.

    Returns:
        ``(equal, mismatch, warning)``. ``mismatch`` is set only when
        ``equal`` is False; ``warning`` is set only by the license checks.
    """
    license_map = license_map or DEFAULT_LICENSE_MAP
    s3_n = normalize_json(s3_val)
    db_n = normalize_json(db_val)
    # Typed-empty shortcut: both sides empty for the declared type is a pass.
    if empty_value_type:
        s3_typed_empty = is_empty_value_for_type(s3_val, empty_value_type)
        db_typed_empty = is_empty_value_for_type(db_val, empty_value_type)
        if s3_typed_empty and db_typed_empty:
            return True, None, None
    # Past the shortcut, an empty value on either side (or both) is a mismatch.
    if is_empty_value(s3_n) or is_empty_value(db_n):
        return False, FieldMismatch(field, s3_val, db_val), None
    if field == "doi":
        s3_n = normalize_doi(s3_val)
        db_n = normalize_doi(db_val)
        if s3_n == db_n:
            return True, None, None
        return False, FieldMismatch(field, s3_n, db_n), None
    if field == "identifiers":
        s3_n = normalize_identifiers(s3_val)
        db_n = normalize_identifiers(db_val)
        if s3_n == db_n:
            return True, None, None
        return False, FieldMismatch(field, s3_n, db_n), None
    if field == "indexed_in":
        s3_n = normalize_indexed_in(s3_val)
        db_n = normalize_indexed_in(db_val)
        if s3_n == db_n:
            return True, None, None
        return False, FieldMismatch(field, s3_n, db_n), None
    if field in ("published_date", "publication_published_date"):
        s3_n = parse_date_iso(s3_val)
        db_n = parse_date_iso(db_val)
        if s3_n == db_n:
            return True, None, None
        return False, FieldMismatch(field, s3_n, db_n), None
    if field == "access_license":
        s3_n = normalize_license_value(s3_val, license_map)
        db_n = normalize_license_value(db_val, license_map)
        warn = license_out_of_allowed_warning(s3_n, source="S3")
        # A warning on the S3 value passes the field without comparing to DB.
        if warn:
            return True, None, warn
        if s3_n == db_n:
            return True, None, None
        return False, FieldMismatch(field, s3_n, db_n), None
    if field == "locations":
        s3_n = normalize_locations(s3_val, license_map)
        db_n = normalize_locations(db_val, license_map)
        for i, loc in enumerate(s3_n):
            lic = loc.get("license", "")
            w = license_out_of_allowed_warning(lic, source=f"S3 locations[{i}]")
            # The first location that triggers a warning passes the whole
            # field; later locations and the DB value are not inspected.
            if w:
                return True, None, w
        if s3_n == db_n:
            return True, None, None
        return False, FieldMismatch(field, s3_n, db_n), None
    if field == "classifications":
        s3_n = normalize_classifications(s3_val)
        db_n = normalize_classifications(db_val)
        if s3_n == db_n:
            return True, None, None
        return False, FieldMismatch(field, s3_n, db_n), None
    if field == "author":
        # Compared on the normalize_json results, so list order matters.
        if s3_n == db_n:
            return True, None, None
        return False, FieldMismatch(field, s3_n, db_n), None
    if field == "access_is_oa":
        s3_s = oa_flag_str(as_bool_flag(s3_val))
        db_s = oa_flag_str(as_bool_flag(db_val))
        if s3_s == db_s:
            return True, None, None
        return False, FieldMismatch(field, s3_s, db_s), None
    if field in ("published_year", "publication_published_year"):
        # Values that cannot be converted to int fall through to a mismatch.
        try:
            if int(s3_n or 0) == int(db_n or 0):
                return True, None, None
        except (TypeError, ValueError):
            pass
        return False, FieldMismatch(field, s3_n, db_n), None
    # Default: compare the scalar-normalized forms.
    if normalize_scalar(s3_n) == normalize_scalar(db_n):
        return True, None, None
    return False, FieldMismatch(field, s3_n, db_n), None


def compare_fields_for_table(
    columns: Sequence[str],
    mapping_rules: Sequence[Any],
) -> List[str]:
    """Build the field list from the current target's mapping rules.

    Only fields that `mysql_column_for_field` can resolve to one of
    ``columns`` are kept, in the order the rules yield them.
    """
    requested = compare_fields_from_rules(mapping_rules)
    return [f for f in requested if mysql_column_for_field(f, columns)]


# Head of the next definition; its body continues after this span:
# def check_track_id(
#     expected_tid: Any,
#     db_tid: Any,
#     origin_id: Any,
#     track_registry: Dict[str, str],
# ) -> List[FieldMismatch]:
#     """track_id 非空、与期望值一致、本次校验批次内唯一。"""
#     failures:
List[FieldMismatch] = [] + exp = normalize_scalar(expected_tid) + db = normalize_scalar(db_tid) + if not db: + failures.append(FieldMismatch("track_id", exp, db_tid)) + return failures + if exp is not None and db != exp: + failures.append(FieldMismatch("track_id", exp, db)) + tid = str(db) + oid = str(origin_id) if origin_id is not None else "" + prev = track_registry.get(tid) + if prev is not None and prev != oid: + failures.append( + FieldMismatch( + "track_id", + exp, + f"duplicate: also used by origin_id={prev}", + ) + ) + else: + track_registry[tid] = oid + return failures + + +@dataclass +class RowResult: + origin_id: Any + ok: bool + jsonl_file: str = "" + missing_in_mysql: bool = False + failures: List[FieldMismatch] = field(default_factory=list) + passes: List[str] = field(default_factory=list) + warnings: List[str] = field(default_factory=list) + + @property + def status(self) -> str: + if self.ok: + return "PASS" + return "MISSING" if self.missing_in_mysql else "FAIL" + + +def compare_row( + s3_row: Dict[str, Any], + mysql_row: Optional[Dict[str, Any]], + license_map: Dict[str, str], + *, + track_registry: Optional[Dict[str, str]] = None, + compare_fields: Optional[Sequence[str]] = None, + default_empty_field_types: Optional[Dict[str, str]] = None, + transform: str = "osi_arxiv", +) -> RowResult: + expected = transform_row(s3_row, license_map, transform) + origin_id = expected.get("origin_id") + if mysql_row is None: + return RowResult(origin_id=origin_id, ok=False, missing_in_mysql=True) + columns = list(mysql_row.keys()) + if compare_fields is None: + raise ValueError("compare_fields 不能为空;字段校验必须由当前 target 的 mapping CSV 生成") + fields = list(compare_fields) + empty_field_types = default_empty_field_types or {} + failures, passes, warnings = [], [], [] + registry = track_registry if track_registry is not None else {} + if "track_id" in fields: + failures.extend( + check_track_id( + expected.get("track_id"), + get_mysql_field_value(mysql_row, "track_id", 
columns), + origin_id, + registry, + ) + ) + fields = [f for f in fields if f != "track_id"] + for fld in fields: + exp_val = expected.get(fld) + ok, mismatch, warn = values_equal( + exp_val, + get_mysql_field_value(mysql_row, fld, columns), + fld, + license_map, + empty_value_type=empty_field_types.get(fld), + ) + if warn: + warnings.append(warn) + if ok: + passes.append(fld) + elif mismatch: + failures.append(mismatch) + return RowResult( + origin_id=origin_id, + ok=not failures, + failures=failures, + passes=passes, + warnings=warnings, + ) + + +# ---- osi_verify/mysql_session.py ---- + + +from typing import Any, Callable, Dict, Optional, TypeVar + + +try: + import pymysql + from pymysql.cursors import DictCursor +except ImportError: + pymysql = None # type: ignore + DictCursor = None # type: ignore + +T = TypeVar("T") + + +class MySQLSession: + """带重试的 MySQL/StarRocks 会话;连接断开时自动重连。""" + + def __init__( + self, + cfg: Dict[str, Any], + database: Optional[str], + *, + catalog: Optional[str] = None, + retry_config: Optional[RetryConfig] = None, + ): + self.cfg = cfg + self.database = database + self.catalog = catalog + self.retry_config = retry_config or RetryConfig() + self._conn: Any = None + + @property + def conn(self) -> Any: + if self._conn is None: + self.connect() + return self._conn + + def connect(self) -> Any: + if pymysql is None: + raise RuntimeError("请安装 pymysql: pip install pymysql") + kwargs: Dict[str, Any] = dict( + host=self.cfg["host"], + port=self.cfg["port"], + user=self.cfg["user"], + password=self.cfg["password"], + charset="utf8mb4", + cursorclass=DictCursor, + connect_timeout=30, + read_timeout=300 if self.catalog else 60, + ) + if not self.catalog and self.database: + kwargs["database"] = self.database + + def _connect(): + return pymysql.connect(**kwargs) + + self._conn = retry_call( + _connect, + self.retry_config, + label="MySQL 连接", + retryable=is_mysql_retryable, + ) + return self._conn + + def reconnect(self) -> None: + self.close() 
+ self.connect() + + def close(self) -> None: + if self._conn is not None: + try: + self._conn.close() + except Exception: + pass + self._conn = None + + def run(self, fn: Callable[[Any], T], *, label: str) -> T: + def attempt() -> T: + try: + return fn(self.conn) + except Exception as exc: + if is_mysql_retryable(exc): + self.close() + raise + + return retry_call( + attempt, + self.retry_config, + label=label, + retryable=is_mysql_retryable, + ) + + def __enter__(self) -> "MySQLSession": + self.connect() + return self + + def __exit__(self, exc_type, exc, tb) -> None: + self.close() + + +# ---- osi_verify/s3_reader.py ---- + + +import json +import random +import sys +from typing import Any, Dict, Generator, List, Optional, Tuple + +try: + import duckdb +except ImportError: + duckdb = None # type: ignore + + +RANGE_SAMPLE_CHUNK_BYTES = 1024 * 1024 +RANGE_SAMPLE_MAX_ATTEMPT_FACTOR = 20 +BOTOCORE_RETRY_ATTEMPTS = 2 + + +def rows_from_cursor(cur) -> List[Dict[str, Any]]: + cols = [d[0] for d in cur.description] + return [dict(zip(cols, row)) for row in cur.fetchall()] + + +def configure_duckdb_s3(con: "duckdb.DuckDBPyConnection", s3_cfg: Dict[str, Any]) -> None: + con.execute("INSTALL httpfs; LOAD httpfs;") + ep = sql_literal(s3_cfg["endpoint"]) + ak = sql_literal(s3_cfg["access_key"]) + sk = sql_literal(s3_cfg["secret_key"]) + use_ssl = bool(s3_cfg.get("use_ssl", True)) + con.execute(f"SET s3_endpoint='{ep}';") + con.execute(f"SET s3_access_key_id='{ak}';") + con.execute(f"SET s3_secret_access_key='{sk}';") + con.execute(f"SET s3_use_ssl={'true' if use_ssl else 'false'};") + con.execute("SET s3_url_style='path';") + con.execute("SET s3_region='us-east-1';") + + +def _detect_s3_format( + con: "duckdb.DuckDBPyConnection", + s3_path: str, + *, + s3_cfg: Optional[Dict[str, Any]] = None, + retry_config: Optional[RetryConfig] = None, +) -> str: + if s3_cfg: + if list_s3_files_boto3(s3_path, s3_cfg, ".jsonl", retry_config=retry_config): + return "jsonl" + if 
list_s3_files_boto3(s3_path, s3_cfg, ".parquet", retry_config=retry_config): + return "parquet" + raise FileNotFoundError(f"S3 路径下未找到 .jsonl 或 .parquet 文件: {s3_path}") + + base = sql_literal(s3_path.rstrip("/")) + + def _jsonl_count() -> int: + return int(con.execute(f"SELECT count(*) FROM glob('{base}/*.jsonl')").fetchone()[0]) + + def _parquet_count() -> int: + return int(con.execute(f"SELECT count(*) FROM glob('{base}/**/*.parquet')").fetchone()[0]) + + if retry_call(_jsonl_count, retry_config, label="S3 探测 jsonl", retryable=is_s3_retryable): + return "jsonl" + if retry_call(_parquet_count, retry_config, label="S3 探测 parquet", retryable=is_s3_retryable): + return "parquet" + raise FileNotFoundError(f"S3 路径下未找到 .jsonl 或 .parquet 文件: {s3_path}") + + +def open_duckdb_s3(s3_cfg: Optional[Dict[str, Any]]) -> "duckdb.DuckDBPyConnection": + if duckdb is None: + raise RuntimeError("请安装 duckdb: pip install duckdb pyarrow") + con = duckdb.connect() + if s3_cfg: + configure_duckdb_s3(con, s3_cfg) + return con + + +def jsonl_basename(s3_uri: str) -> str: + return s3_uri.rsplit("/", 1)[-1] + + +def parse_s3_uri(uri: str) -> Tuple[str, str]: + u = uri.replace("\\", "/") + if not u.startswith("s3://"): + raise ValueError(f"非 S3 URI: {uri}") + rest = u[5:] + bucket, _, key = rest.partition("/") + if not bucket or not key: + raise ValueError(f"无法解析 S3 URI: {uri}") + return bucket, key + + +def _suppress_insecure_request_warning() -> None: + """内网 Ceph 常使用自签证书,verify=False 时抑制 urllib3 重复告警。""" + try: + import urllib3 + except ImportError: + return + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + +def s3_boto_client(s3_cfg: Dict[str, Any], *, retry_config: Optional[RetryConfig] = None): + try: + import boto3 + from botocore.config import Config + except ImportError as e: + raise RuntimeError("S3 流式/Range 抽样需要 boto3: pip install boto3") from e + cfg = retry_config or RetryConfig() + use_ssl = bool(s3_cfg.get("use_ssl", True)) + verify_ssl = 
bool(s3_cfg.get("verify_ssl", False)) + if use_ssl and not verify_ssl: + _suppress_insecure_request_warning() + scheme = "https" if use_ssl else "http" + return boto3.client( + "s3", + endpoint_url=f"{scheme}://{s3_cfg['endpoint']}", + aws_access_key_id=s3_cfg["access_key"], + aws_secret_access_key=s3_cfg["secret_key"], + region_name="us-east-1", + config=Config( + s3={"addressing_style": "path"}, + signature_version="s3v4", + retries={ + "max_attempts": BOTOCORE_RETRY_ATTEMPTS if cfg.enabled else 1, + "mode": "standard", + }, + ), + verify=verify_ssl, + ) + + +def list_s3_files_boto3( + s3_path: str, + s3_cfg: Dict[str, Any], + suffix: str, + *, + retry_config: Optional[RetryConfig] = None, +) -> List[str]: + bucket, prefix = parse_s3_uri(s3_path.rstrip("/") + "/") + suffix_lc = suffix.lower() + + def _list() -> List[str]: + client = s3_boto_client(s3_cfg, retry_config=retry_config) + files: List[str] = [] + continuation_token: Optional[str] = None + while True: + kwargs: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix} + if continuation_token: + kwargs["ContinuationToken"] = continuation_token + response = client.list_objects_v2(**kwargs) + for item in response.get("Contents", []): + key = item.get("Key") + if key and str(key).lower().endswith(suffix_lc): + files.append(f"s3://{bucket}/{key}") + if not response.get("IsTruncated"): + break + continuation_token = response.get("NextContinuationToken") + if not continuation_token: + break + return sorted(files) + + return retry_call( + _list, + retry_config, + label=f"S3 列出 {suffix}", + retryable=is_s3_retryable, + ) + + +def sample_jsonl_rows_sequential_stream( + s3_uri: str, + s3_cfg: Dict[str, Any], + sample_size: int, + *, + retry_config: Optional[RetryConfig] = None, +) -> List[Dict[str, Any]]: + """流式读取 jsonl 前 N 条(不扫全文件)。""" + if sample_size <= 0: + return [] + bucket, key = parse_s3_uri(s3_uri) + client = s3_boto_client(s3_cfg, retry_config=retry_config) + + def _read() -> List[Dict[str, Any]]: + rows: 
List[Dict[str, Any]] = [] + body = client.get_object(Bucket=bucket, Key=key)["Body"] + for raw in body.iter_lines(): + if not raw or not raw.strip(): + continue + rows.append(json.loads(raw)) + if len(rows) >= sample_size: + break + return rows + + return retry_call(_read, retry_config, label=f"S3 顺序读 {jsonl_basename(s3_uri)}", retryable=is_s3_retryable) + + +def _json_line_from_range(payload: bytes, *, offset: int, object_size: int) -> Optional[bytes]: + if not payload: + return None + start = 0 + if offset > 0: + first_newline = payload.find(b"\n") + if first_newline < 0: + return None + start = first_newline + 1 + end = payload.find(b"\n", start) + if end < 0: + if offset + len(payload) >= object_size: + end = len(payload) + else: + return None + line = payload[start:end].strip() + return line or None + + +def sample_jsonl_rows_s3_range( + s3_uri: str, + s3_cfg: Dict[str, Any], + sample_size: int, + *, + retry_config: Optional[RetryConfig] = None, +) -> List[Dict[str, Any]]: + """通过 S3 Range 近似随机抽样,不全量扫描大 JSONL 文件。""" + if sample_size <= 0: + return [] + bucket, key = parse_s3_uri(s3_uri) + client = s3_boto_client(s3_cfg, retry_config=retry_config) + + def _head_size() -> int: + return int(client.head_object(Bucket=bucket, Key=key)["ContentLength"]) + + object_size = retry_call(_head_size, retry_config, label=f"S3 head {jsonl_basename(s3_uri)}", retryable=is_s3_retryable) + if object_size <= 0: + return [] + + rows: List[Dict[str, Any]] = [] + seen = set() + attempts = 0 + max_attempts = max(sample_size * RANGE_SAMPLE_MAX_ATTEMPT_FACTOR, sample_size) + while len(rows) < sample_size and attempts < max_attempts: + attempts += 1 + offset = random.randint(0, max(0, object_size - 1)) + end = min(object_size - 1, offset + RANGE_SAMPLE_CHUNK_BYTES - 1) + + def _read_range(off: int = offset, end_byte: int = end) -> bytes: + return client.get_object( + Bucket=bucket, + Key=key, + Range=f"bytes={off}-{end_byte}", + )["Body"].read() + + body = retry_call( + _read_range, + 
retry_config, + label=f"S3 Range 读 {jsonl_basename(s3_uri)}", + retryable=is_s3_retryable, + ) + raw_line = _json_line_from_range(body, offset=offset, object_size=object_size) + if raw_line is None or raw_line in seen: + continue + try: + row = json.loads(raw_line.decode("utf-8")) + except (UnicodeDecodeError, json.JSONDecodeError): + continue + seen.add(raw_line) + rows.append(row) + return rows + + +def list_s3_jsonl_files( + con: "duckdb.DuckDBPyConnection", + s3_path: str, + *, + s3_cfg: Optional[Dict[str, Any]] = None, + retry_config: Optional[RetryConfig] = None, +) -> List[str]: + if s3_cfg: + files = list_s3_files_boto3(s3_path, s3_cfg, ".jsonl", retry_config=retry_config) + if not files: + raise FileNotFoundError(f"未找到 jsonl: {s3_path}") + return files + + base = sql_literal(s3_path.rstrip("/")) + + def _list() -> List[str]: + return [ + r[0] + for r in con.execute( + f"SELECT file FROM glob('{base}/*.jsonl') ORDER BY file" + ).fetchall() + ] + + files = retry_call(_list, retry_config, label="S3 列出 jsonl", retryable=is_s3_retryable) + if not files: + raise FileNotFoundError(f"未找到 jsonl: {s3_path}") + return files + + +def sample_jsonl_rows( + con: "duckdb.DuckDBPyConnection", + fpath: str, + sample_size: int, + *, + sequential: bool = False, + s3_cfg: Optional[Dict[str, Any]] = None, + retry_config: Optional[RetryConfig] = None, +) -> List[Dict[str, Any]]: + """从单个 jsonl 抽取最多 sample_size 行。""" + if sample_size <= 0: + return [] + if sequential and s3_cfg and fpath.startswith("s3://"): + return sample_jsonl_rows_sequential_stream( + fpath, s3_cfg, sample_size, retry_config=retry_config + ) + if not sequential and s3_cfg and fpath.startswith("s3://"): + return sample_jsonl_rows_s3_range( + fpath, s3_cfg, sample_size, retry_config=retry_config + ) + inner = f"SELECT * FROM read_json_auto('{sql_literal(fpath)}')" + if sequential: + sql = f"SELECT * FROM ({inner}) LIMIT {int(sample_size)}" + else: + sql = f"SELECT * FROM ({inner}) ORDER BY random() LIMIT 
{int(sample_size)}" + + def _sample() -> List[Dict[str, Any]]: + return rows_from_cursor(con.execute(sql)) + + return retry_call( + _sample, + retry_config, + label=f"S3 DuckDB 抽样 {jsonl_basename(fpath)}", + retryable=is_s3_retryable, + ) + + +def iter_s3_batches( + *, + parquet_glob: Optional[str], + s3_path: Optional[str], + s3_cfg: Optional[Dict[str, Any]], + s3_format: str, + full: bool, + limit: int, + batch_size: int, + sequential: bool = False, + retry_config: Optional[RetryConfig] = None, +) -> Generator[Tuple[str, List[Dict[str, Any]]], None, None]: + """按批产出 (jsonl_s3_uri, rows)。""" + con = open_duckdb_s3(s3_cfg) + + if s3_path: + base = s3_path.rstrip("/") + fmt = s3_format if s3_format != "auto" else _detect_s3_format( + con, + base, + s3_cfg=s3_cfg, + retry_config=retry_config, + ) + if fmt == "jsonl": + files = list_s3_jsonl_files(con, base, s3_cfg=s3_cfg, retry_config=retry_config) + if full: + print( + f"S3 数据格式: jsonl,全量 {len(files)} 个文件,batch_size={batch_size}", + file=sys.stderr, + ) + else: + mode = f"顺序前 {limit} 条" if sequential else f"随机 {limit} 条" + print( + f"S3 数据格式: jsonl,抽样 {len(files)} 个文件,每文件{mode}", + file=sys.stderr, + ) + for fpath in files: + if full: + offset = 0 + while True: + path_lit = sql_literal(fpath) + off = offset + bs = batch_size + + def _read_batch() -> List[Dict[str, Any]]: + cur = con.execute( + f"SELECT * FROM read_json_auto('{path_lit}') " + f"LIMIT {int(bs)} OFFSET {int(off)}" + ) + return rows_from_cursor(cur) + + rows = retry_call( + _read_batch, + retry_config, + label=f"S3 DuckDB 全量批 {jsonl_basename(fpath)}", + retryable=is_s3_retryable, + ) + if not rows: + break + yield fpath, rows + offset += len(rows) + if len(rows) < batch_size: + break + else: + basename = jsonl_basename(fpath) + print(f" [抽样] {basename} ...", file=sys.stderr, flush=True) + rows = sample_jsonl_rows( + con, + fpath, + limit, + sequential=sequential, + s3_cfg=s3_cfg, + retry_config=retry_config, + ) + if rows: + print( + f" [抽样] {basename}: 
{len(rows)} 条", + file=sys.stderr, + ) + yield fpath, rows + return + + path_expr = f"'{sql_literal(base)}/**/*.parquet'" + sql = f"SELECT * FROM read_parquet({path_expr})" + if not full and limit > 0: + sql += f" ORDER BY random() LIMIT {int(limit)}" + + def _read_parquet() -> List[Dict[str, Any]]: + return rows_from_cursor(con.execute(sql)) + + rows = retry_call(_read_parquet, retry_config, label="S3 读 parquet", retryable=is_s3_retryable) + yield "parquet", rows + return + + if parquet_glob: + reader = "read_json_auto" if parquet_glob.endswith(".jsonl") else "read_parquet" + sql = f"SELECT * FROM {reader}('{sql_literal(parquet_glob)}')" + if not full and limit > 0: + sql += f" ORDER BY random() LIMIT {int(limit)}" + + def _read_glob() -> List[Dict[str, Any]]: + return rows_from_cursor(con.execute(sql)) + + rows = _read_glob() + yield parquet_glob, rows + return + + raise ValueError("请指定 --parquet-glob、--s3-path,或提供 s3 配置文件") + + +# ---- osi_verify/db.py ---- + + +import sys +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Sequence, Tuple + + + +@dataclass(frozen=True) +class TableRef: + """库表引用:Iceberg 经 StarRocks 时为 catalog.schema.table。""" + + catalog: Optional[str] + schema: str + table: str + + @property + def sql_name(self) -> str: + if self.catalog: + return f"{self.catalog}.{self.schema}.{self.table}" + return f"`{self.schema}`.`{self.table}`" + + @property + def display_name(self) -> str: + return self.sql_name + + +def resolve_table_ref( + catalog: Optional[str], schema: str, table: str +) -> TableRef: + return TableRef(catalog=catalog or None, schema=schema, table=table) + + +def discover_table(session: MySQLSession, table_ref: TableRef) -> TableRef: + if table_ref.catalog: + + def _probe(conn) -> None: + with conn.cursor() as cur: + cur.execute(f"SELECT 1 FROM {table_ref.sql_name} LIMIT 1") + + session.run(_probe, label="MySQL 探活表") + return table_ref + if table_ref.table: + return table_ref + + def _discover(conn) -> 
TableRef: + with conn.cursor() as cur: + cur.execute( + """ + SELECT TABLE_NAME, COUNT(*) AS cnt + FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = DATABASE() + AND COLUMN_NAME IN ('origin_id', 'origin_osi') + GROUP BY TABLE_NAME + HAVING cnt >= 2 + ORDER BY cnt DESC + """ + ) + rows = cur.fetchall() + if not rows: + raise RuntimeError("未能自动发现含 origin_id/origin_osi 的表,请用 --table 指定") + return TableRef(catalog=None, schema=table_ref.schema, table=rows[0]["TABLE_NAME"]) + + return session.run(_discover, label="MySQL 发现表") + + + +def fetch_mysql_rows_by_ids( + session: MySQLSession, + table_ref: TableRef, + origin_ids: Sequence[Any], + origin_osi: str = "arxiv", + target_dt: Optional[str] = None, +) -> Dict[str, Dict[str, Any]]: + ids = [str(i) for i in origin_ids if i is not None] + if not ids: + return {} + + def _fetch(conn) -> Dict[str, Dict[str, Any]]: + placeholders = ",".join(["%s"] * len(ids)) + sql = ( + f"SELECT * FROM {table_ref.sql_name} " + f"WHERE origin_osi = %s AND origin_id IN ({placeholders})" + ) + params: List[Any] = [origin_osi, *ids] + if target_dt: + sql += " AND dt = %s" + params.append(target_dt) + with conn.cursor() as cur: + cur.execute(sql, params) + rows = cur.fetchall() + return {str(r["origin_id"]): r for r in rows} + + return session.run(_fetch, label="MySQL 批量查询") + + +def count_s3_jsonl_lines_boto3( + s3_uri: str, + s3_cfg: Dict[str, Any], + *, + retry_config: Optional[RetryConfig] = None, +) -> int: + """boto3 流式按换行符计数(不解析 JSON,适合大 jsonl)。""" + bucket, key = parse_s3_uri(s3_uri) + client = s3_boto_client(s3_cfg, retry_config=retry_config) + + def _count() -> int: + n = 0 + body = client.get_object(Bucket=bucket, Key=key)["Body"] + for chunk in body.iter_chunks(chunk_size=16 * 1024 * 1024): + n += chunk.count(b"\n") + return n + + return retry_call(_count, retry_config, label=f"S3 计数 {jsonl_basename(s3_uri)}", retryable=is_s3_retryable) + + +def count_s3_partition( + con: "duckdb.DuckDBPyConnection", + s3_path: str, + files: 
Optional[List[str]] = None, + *, + s3_cfg: Optional[Dict[str, Any]] = None, + retry_config: Optional[RetryConfig] = None, +) -> Tuple[int, Dict[str, int]]: + """统计分区内 S3 行数(按 jsonl 文件)。有 s3_cfg 时用 boto3 流式计数。""" + files = files or list_s3_jsonl_files( + con, + s3_path, + s3_cfg=s3_cfg, + retry_config=retry_config, + ) + per_file: Dict[str, int] = {} + total = 0 + use_boto = bool(s3_cfg) + if use_boto: + print(" [S3 计数] 使用 boto3 流式按行计数", file=sys.stderr) + for fpath in files: + if use_boto and fpath.startswith("s3://"): + n = count_s3_jsonl_lines_boto3(fpath, s3_cfg, retry_config=retry_config) + else: + path_lit = sql_literal(fpath) + + def _duck_count() -> int: + return int( + con.execute( + f"SELECT count(*) FROM read_json_auto('{path_lit}')" + ).fetchone()[0] + ) + + n = retry_call( + _duck_count, + retry_config, + label=f"S3 DuckDB 计数 {jsonl_basename(fpath)}", + retryable=is_s3_retryable, + ) + name = jsonl_basename(fpath) + per_file[name] = n + total += n + print(f" [S3 计数] {name}: {n:,} 行", file=sys.stderr) + return total, per_file + + +def table_columns(session: MySQLSession, table_ref: TableRef) -> List[str]: + def _columns(conn) -> List[str]: + with conn.cursor() as cur: + if table_ref.catalog: + cur.execute(f"DESCRIBE {table_ref.sql_name}") + return [r["Field"] for r in cur.fetchall()] + cur.execute( + """ + SELECT COLUMN_NAME FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = %s + """, + (table_ref.table,), + ) + return [r["COLUMN_NAME"] for r in cur.fetchall()] + + return session.run(_columns, label="MySQL 读取表结构") + + +def count_mysql_origin( + session: MySQLSession, + table_ref: TableRef, + partition_dt: Optional[str], + origin_osi: str = "arxiv", +) -> Tuple[int, str]: + """ + 统计落库表中该分区指定 origin_osi 记录数。 + 优先使用 dt / partition_dt / data_dt 等列;否则用 DATE(updated)=partition_dt。 + """ + cols = table_columns(session, table_ref) + where = "origin_osi = %s" + desc = f"origin_osi='{origin_osi}'" + base_params: Tuple[Any, ...] 
= (origin_osi,) + + dt_cols = [c for c in ("dt", "partition_dt", "data_dt", "crawl_dt", "batch_dt") if c in cols] + if partition_dt and dt_cols: + c = dt_cols[0] + where += f" AND `{c}` = %s" + desc += f" AND {c}='{partition_dt}'" + params: Tuple[Any, ...] = base_params + (partition_dt,) + elif partition_dt and "updated" in cols: + where += " AND DATE(`updated`) = %s" + desc += f" AND DATE(updated)='{partition_dt}'" + params = base_params + (partition_dt,) + elif partition_dt and "published_date" in cols: + where += " AND DATE(`published_date`) = %s" + desc += f" AND DATE(published_date)='{partition_dt}'" + params = base_params + (partition_dt,) + elif partition_dt: + print( + f"[warn] 表 {table_ref.display_name} 无 dt/updated 等分区字段,仅按 origin_osi 统计总数", + file=sys.stderr, + ) + params = base_params + else: + params = base_params + + def _count(conn) -> int: + with conn.cursor() as cur: + cur.execute(f"SELECT COUNT(*) AS n FROM {table_ref.sql_name} WHERE {where}", params) + return int(cur.fetchone()["n"]) + + n = session.run(_count, label="MySQL 统计行数") + return n, desc + + +# ---- osi_verify/report.py ---- + + +import json +from collections import Counter, defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + + + +@dataclass +class ReportContext: + target: str + target_kind: str + transform: str + table_name: str + origin_osi: str + s3_path: str + mapping_csv: str + config_path: str + + def to_dict(self) -> Dict[str, str]: + return { + "target": self.target, + "target_kind": self.target_kind, + "transform": self.transform, + "table_name": self.table_name, + "origin_osi": self.origin_osi, + "s3_path": self.s3_path, + "mapping_csv": self.mapping_csv, + "config_path": self.config_path, + } + + +@dataclass +class FileStats: + total: int = 0 + pass_n: int = 0 + fail_n: int = 0 + miss_n: int = 0 + + +@dataclass +class CountSummary: + context: ReportContext + s3_dt: Optional[str] + target_dt: Optional[str] + s3_path: str + 
s3_total: int + s3_per_file: Dict[str, int] + mysql_total: int + mysql_filter: str + checked_rows: int = 0 + + +def print_count_summary(cs: CountSummary) -> None: + diff = cs.s3_total - cs.mysql_total + print("\n" + "=" * 60) + print("数据总量校验") + print("=" * 60) + print(f"Target : {cs.context.target} ({cs.context.target_kind})") + print(f"目标表 : {cs.context.table_name}") + print(f"origin_osi : {cs.context.origin_osi}") + print(f"映射 CSV : {cs.context.mapping_csv}") + print(f"S3 分区 dt : {cs.s3_dt or '(未解析)'}") + print(f"目标表 dt : {cs.target_dt or '(未指定)'}") + print(f"S3 路径 : {cs.s3_path}") + print(f"S3 jsonl 文件数 : {len(cs.s3_per_file)}") + print(f"S3 总行数 : {cs.s3_total:,}") + for name, n in sorted(cs.s3_per_file.items()): + print(f" - {name}: {n:,}") + print(f"MySQL 过滤条件 : {cs.mysql_filter}") + print(f"MySQL 行数 : {cs.mysql_total:,}") + print(f"S3 - MySQL 差异 : {diff:+,}") + if diff != 0: + print(" >> 总量不一致,请检查落库任务是否漏跑/重复或分区字段过滤条件") + else: + print(" >> 总量一致") + if cs.checked_rows and cs.checked_rows != cs.s3_total: + print( + f" >> 本次仅校验抽样 {cs.checked_rows:,} 条," + f"字段级结果不代表全量(加 --full 做全量字段校验)" + ) + + +def print_anomaly_table(anomalies: List[RowResult], max_show: int = 50) -> None: + print("\n" + "=" * 60) + print(f"落库异常明细(共 {len(anomalies)} 条,展示前 {min(len(anomalies), max_show)} 条)") + print("=" * 60) + print(f"{'jsonl 文件':<32} {'origin_id':<16} {'状态':<8} 异常摘要") + print("-" * 60) + for r in anomalies[:max_show]: + brief = (str(r.failures[0])[:60] if r.failures else "数据库无该 origin_id 记录") + print( + f"{r.jsonl_file:<32} {str(r.origin_id):<16} " + f"{r.status:<8} {brief}" + ) + if len(anomalies) > max_show: + print(f"... 
另有 {len(anomalies) - max_show} 条,见 --report 文件") + + +SummaryKey = Tuple[str, str, str, str] + + +def _summary_value(v: Any, max_len: int = 500) -> str: + if v is None: + return "null" + if isinstance(v, (dict, list)): + text = json.dumps(v, ensure_ascii=False, default=str) + else: + text = repr(v) if isinstance(v, str) else str(v) + return text if len(text) <= max_len else text[: max_len - 3] + "..." + + +def _mismatch_key(mismatch: FieldMismatch) -> SummaryKey: + return ( + "mismatch", + mismatch.field, + _summary_value(mismatch.s3), + _summary_value(mismatch.db), + ) + + +def _print_summary_key(count: int, key: SummaryKey) -> None: + kind, label, s3_val, db_val = key + if kind == "mismatch": + print(f"{count:>8} {label}:") + print(f"{'':>8} S3: {s3_val}") + print(f"{'':>8} DB: {db_val}") + return + print(f"{count:>8} {label}") + + +def print_anomaly_summary(anomalies: List[RowResult], max_examples: int = 3) -> None: + print("\n" + "=" * 60) + print(f"落库异常/Warning 类型汇总(共 {len(anomalies)} 条记录)") + print("=" * 60) + if not anomalies: + print("无异常") + return + + type_counts: Counter[SummaryKey] = Counter() + examples: Dict[SummaryKey, List[str]] = defaultdict(list) + + for r in anomalies: + if r.missing_in_mysql: + key = ("message", "MySQL 缺失: 数据库无该 origin_id 记录", "", "") + type_counts[key] += 1 + if len(examples[key]) < max_examples: + examples[key].append(f"{r.jsonl_file} origin_id={r.origin_id}") + + for w in r.warnings: + key = ("message", f"Warning: {w}", "", "") + type_counts[key] += 1 + if len(examples[key]) < max_examples: + examples[key].append(f"{r.jsonl_file} origin_id={r.origin_id}") + + for m in r.failures: + key = _mismatch_key(m) + type_counts[key] += 1 + if len(examples[key]) < max_examples: + examples[key].append(f"{r.jsonl_file} origin_id={r.origin_id}") + + print(f"{'次数':>8} 错误类型") + print("-" * 60) + for key, count in type_counts.most_common(): + _print_summary_key(count, key) + if max_examples > 0 and examples.get(key): + print(f"{'':>8} 样例: {', 
'.join(examples[key])}") + print("\n完整逐条明细请查看 --report 输出的 JSONL 文件") + + +def print_file_stats(stats: Dict[str, FileStats]) -> None: + print("\n" + "=" * 60) + print("按 jsonl 文件统计(本次已校验行)") + print("=" * 60) + print(f"{'jsonl 文件':<36} {'校验':>8} {'通过':>8} {'失败':>8} {'缺失':>8}") + print("-" * 60) + for name in sorted(stats): + s = stats[name] + print(f"{name:<36} {s.total:>8} {s.pass_n:>8} {s.fail_n:>8} {s.miss_n:>8}") + + +def print_run_context(ctx: ReportContext) -> None: + print("\n" + "=" * 60) + print("校验上下文") + print("=" * 60) + print(f"Target : {ctx.target} ({ctx.target_kind})") + print(f"Transform : {ctx.transform}") + print(f"目标表 : {ctx.table_name}") + print(f"origin_osi : {ctx.origin_osi}") + print(f"S3 路径 : {ctx.s3_path or '(local/parquet-glob)'}") + print(f"映射 CSV : {ctx.mapping_csv}") + print(f"配置文件 : {ctx.config_path}") + + +def safe_filename_token(value: Any) -> str: + text = "unknown" if value in (None, "") else str(value) + return re.sub(r"[^0-9A-Za-z_-]+", "_", text).strip("_") or "unknown" + + +def default_osi_report_path(target: str, s3_dt: Optional[str], target_dt: Optional[str]) -> Path: + dt_tag = f"s3_{safe_filename_token(s3_dt)}_target_{safe_filename_token(target_dt)}" + report_dir = REPORT_ROOT / f"meta_paper_data_{safe_filename_token(target)}_{dt_tag}" + return report_dir / "source_field_mismatch.jsonl" + + +def summary_paths(report_path: Path) -> Tuple[Path, Path]: + return report_path.parent / "summary.json", report_path.parent / "readable_summary.md" + + +REPORT_KEY_LABELS = { + "report": "报告路径", + "context": "校验上下文", + "target": "目标名称", + "target_kind": "目标类型", + "transform": "转换逻辑", + "table_name": "目标表", + "origin_osi": "来源标识", + "s3_path": "S3路径", + "mapping_csv": "映射文件", + "config_path": "配置文件", + "s3_dt": "S3分区", + "target_dt": "目标表分区", + "partition_dt": "目标表分区", + "checked": "已校验数", + "passed": "通过数", + "failed": "失败数", + "missing": "目标表缺失数", + "warnings": "Warning数量", + "count_summary": "Count校验", + "s3_total": "S3总行数", + 
"mysql_total": "目标表行数", + "diff": "数量差异", + "mysql_filter": "目标表过滤条件", + "checked_rows": "已校验行数", + "s3_file_count": "S3文件数", + "file_stats": "文件统计", + "status_counts": "状态分布", + "field_counts": "字段问题分布", + "field_samples": "字段问题样例", + "warning_counts": "Warning分布", + "warning_samples": "Warning样例", + "jsonl_file": "JSONL文件", + "jsonl_s3_uri": "JSONL S3路径", + "origin_id": "来源ID", + "status": "状态", + "field_diffs": "字段差异", + "field": "字段", + "s3": "S3值", + "db": "目标表值", + "expected": "预期值", + "actual": "实际值", + "missing_in_mysql": "目标表缺失", +} + + +def localize_report_keys(value: Any) -> Any: + if isinstance(value, dict): + return { + REPORT_KEY_LABELS.get(str(key), str(key)): localize_report_keys(val) + for key, val in value.items() + } + if isinstance(value, list): + return [localize_report_keys(item) for item in value] + return value + + +TOP_FIELD_LIMIT = 20 +TOP_SAMPLE_FIELD_LIMIT = 5 +SAMPLES_PER_FIELD = 3 + + +def build_osi_report_summary( + *, + report_path: Optional[Path], + context: ReportContext, + s3_dt: Optional[str], + target_dt: Optional[str], + total: int, + ok_n: int, + fail_n: int, + miss_n: int, + warn_n: int, + count_summary: Optional[CountSummary], + per_file: Dict[str, FileStats], + notable_results: List[RowResult], +) -> Dict[str, Any]: + status_counts = Counter() + field_counts = Counter() + warning_counts = Counter() + field_samples: Dict[str, List[Dict[str, Any]]] = defaultdict(list) + warning_samples: Dict[str, List[Dict[str, Any]]] = defaultdict(list) + + for row in notable_results: + status_counts[row.status if not row.ok else "WARN"] += 1 + for warning in row.warnings: + warning_counts[warning] += 1 + samples = warning_samples[warning] + if len(samples) < SAMPLES_PER_FIELD: + samples.append({"jsonl_file": row.jsonl_file, "origin_id": row.origin_id}) + if row.missing_in_mysql: + field_counts["missing_in_mysql"] += 1 + samples = field_samples["missing_in_mysql"] + if len(samples) < SAMPLES_PER_FIELD: + samples.append({"jsonl_file": 
row.jsonl_file, "origin_id": row.origin_id}) + for mismatch in row.failures: + field_counts[mismatch.field] += 1 + samples = field_samples[mismatch.field] + if len(samples) < SAMPLES_PER_FIELD: + samples.append( + { + "jsonl_file": row.jsonl_file, + "origin_id": row.origin_id, + "s3": mismatch.s3, + "db": mismatch.db, + } + ) + + count_payload = None + if count_summary: + count_payload = { + "s3_dt": count_summary.s3_dt, + "target_dt": count_summary.target_dt, + "s3_total": count_summary.s3_total, + "mysql_total": count_summary.mysql_total, + "diff": count_summary.s3_total - count_summary.mysql_total, + "mysql_filter": count_summary.mysql_filter, + "checked_rows": count_summary.checked_rows, + "s3_file_count": len(count_summary.s3_per_file), + } + + sorted_field_counts = dict(field_counts.most_common()) + sorted_warning_counts = dict(warning_counts.most_common()) + top_sample_fields = set(list(sorted_field_counts)[:TOP_SAMPLE_FIELD_LIMIT]) + top_sample_warnings = set(list(sorted_warning_counts)[:TOP_SAMPLE_FIELD_LIMIT]) + return { + "report": str(report_path) if report_path else None, + "context": context.to_dict(), + "s3_dt": s3_dt, + "target_dt": target_dt, + "partition_dt": target_dt, + "checked": total, + "passed": ok_n, + "failed": fail_n, + "missing": miss_n, + "warnings": warn_n, + "count_summary": count_payload, + "file_stats": { + name: { + "total": stats.total, + "passed": stats.pass_n, + "failed": stats.fail_n, + "missing": stats.miss_n, + } + for name, stats in sorted(per_file.items()) + }, + "status_counts": dict(status_counts.most_common()), + "field_counts": sorted_field_counts, + "field_count_total": len(sorted_field_counts), + "field_samples": { + field: field_samples[field] + for field in sorted_field_counts + if field in top_sample_fields + }, + "warning_counts": sorted_warning_counts, + "warning_type_total": len(sorted_warning_counts), + "warning_samples": { + warning: warning_samples[warning] + for warning in sorted_warning_counts + if warning 
in top_sample_warnings + }, + } + + +def write_osi_report_summary(report_path: Path, summary: Dict[str, Any]) -> None: + summary_json_path, summary_md_path = summary_paths(report_path) + summary_json_path.parent.mkdir(parents=True, exist_ok=True) + with summary_json_path.open("w", encoding="utf-8") as f: + json.dump(localize_report_keys(summary), f, ensure_ascii=False, indent=2, default=str) + + count_summary = summary.get("count_summary") or {} + lines = [ + "# S3 数据到论文源数据表校验报告摘要", + "", + f"- 目标表: `{summary.get('context', {}).get('table_name')}`", + f"- 分区: S3=`{summary.get('s3_dt')}`, 目标表=`{summary.get('target_dt')}`", + f"- 结果: 已校验 `{summary.get('checked')}`,通过 `{summary.get('passed')}`,失败 `{summary.get('failed')}`,缺失 `{summary.get('missing')}`", + f"- Warning: `{summary.get('warnings')}`", + f"- 明细报告: `{summary.get('report')}`", + f"- 报告目录: `{Path(str(summary.get('report'))).parent if summary.get('report') else None}`", + "", + "## Count 校验", + "", + ] + if count_summary: + lines.extend( + [ + f"- s3_total: `{count_summary.get('s3_total')}`", + f"- mysql_total: `{count_summary.get('mysql_total')}`", + f"- diff: `{count_summary.get('diff')}`", + f"- checked_rows: `{count_summary.get('checked_rows')}`", + f"- mysql_filter: `{count_summary.get('mysql_filter')}`", + ] + ) + else: + lines.append("- 未执行或已跳过") + lines.extend(["", "## 状态分布", ""]) + for status, count in (summary.get("status_counts") or {}).items(): + lines.append(f"- `{status}`: {count}") + if not summary.get("status_counts"): + lines.append("- 无") + lines.extend(["", "## 字段问题分布", ""]) + for field, count in (summary.get("field_counts") or {}).items(): + lines.append(f"- `{field}`: {count}") + if not summary.get("field_counts"): + lines.append("- 无") + lines.extend(["", "## 字段问题样例", ""]) + for field, samples in (summary.get("field_samples") or {}).items(): + count = (summary.get("field_counts") or {}).get(field, len(samples)) + lines.append(f"### {field} ({count})") + lines.append("") + for sample in 
samples: + lines.append( + f"- origin_id `{sample.get('origin_id')}`, jsonl_file=`{sample.get('jsonl_file')}`" + ) + if "s3" in sample or "db" in sample: + lines.append(f" - s3: `{json.dumps(sample.get('s3'), ensure_ascii=False, default=str)}`") + lines.append(f" - db: `{json.dumps(sample.get('db'), ensure_ascii=False, default=str)}`") + lines.append("") + + if summary.get("warnings"): + lines.extend(["", "## Warning 分布", ""]) + for warning, count in (summary.get("warning_counts") or {}).items(): + lines.append(f"- `{warning}`: {count}") + if not summary.get("warning_counts"): + lines.append("- 无") + lines.extend(["", "## Warning 样例", ""]) + for warning, samples in (summary.get("warning_samples") or {}).items(): + count = (summary.get("warning_counts") or {}).get(warning, len(samples)) + lines.append(f"### {warning} ({count})") + lines.append("") + for sample in samples: + lines.append( + f"- origin_id `{sample.get('origin_id')}`, jsonl_file=`{sample.get('jsonl_file')}`" + ) + lines.append("") + with summary_md_path.open("w", encoding="utf-8") as f: + f.write("\n".join(lines).rstrip() + "\n") + + +# ---- osi_verify/runner.py ---- + + +import json +import sys +from argparse import Namespace +from datetime import datetime +from typing import Any, Dict, List, Optional, Sequence + + + +def run_verification( + *, + args: Namespace, + target_config: TargetConfig, + mysql_settings: Dict[str, Any], + s3_settings: Dict[str, Any], + mapping_rules: Sequence[MappingRule], + requested_compare_fields: Sequence[str], + license_map: Dict[str, str], + retry_config: Optional[RetryConfig] = None, +) -> int: + s3_cfg, s3_path = None, args.s3_path + if args.parquet_glob: + s3_path = None + elif args.s3_config.exists() or s3_settings: + inline_s3 = { + k: v + for k, v in s3_settings.items() + if k not in {"config_file", "subpath", "format", "path"} + } + if args.s3_path: + inline_s3["default_path"] = args.s3_path + s3_cfg = load_s3_config(args.s3_config, inline_s3) + s3_path = 
resolve_s3_path(s3_path or s3_cfg["default_path"], args.s3_subpath) + s3_path = apply_s3_dt_to_path(s3_path, args.s3_dt or args.partition_dt) + print(f"S3: endpoint={s3_cfg['endpoint']} path={s3_path}", file=sys.stderr) + elif not s3_path: + print("请指定 --parquet-glob、--s3-path,或提供 s3 配置文件", file=sys.stderr) + return 2 + else: + s3_path = apply_s3_dt_to_path(s3_path, args.s3_dt or args.partition_dt) + + row_limit = 0 if args.full else (max(args.limit, 1000) if args.origin_id else args.limit) + origin_filter = set(args.origin_id) if args.origin_id else None + source_id_field = target_config.source_id_field + + def align_filtered(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + if not origin_filter: + return rows + return [r for r in rows if str(get_first(r, source_id_field)) in origin_filter] + + s3_dt = extract_partition_dt(args.s3_subpath or s3_path, args.s3_dt or args.partition_dt) + target_dt = args.target_dt or args.partition_dt or s3_dt + + def batch_kwargs() -> Dict[str, Any]: + return { + "parquet_glob": args.parquet_glob, + "s3_path": s3_path, + "s3_cfg": s3_cfg, + "s3_format": args.s3_format, + "full": args.full, + "limit": row_limit, + "batch_size": args.batch_size, + "sequential": args.sequential, + "retry_config": retry_config, + } + + dry_run_context = ReportContext( + target=target_config.name, + target_kind=target_config.kind, + transform=target_config.transform, + table_name=resolve_table_ref( + args.catalog if args.catalog else None, + args.database, + args.table, + ).display_name, + origin_osi=target_config.origin_osi, + s3_path=s3_path or args.parquet_glob or "", + mapping_csv=str(args.mapping_csv), + config_path=str(args.config), + ) + + if args.dry_run: + print_run_context(dry_run_context) + shown = 0 + for src, batch in iter_s3_batches(**batch_kwargs()): + batch = align_filtered(batch) + if not batch: + continue + for row in batch: + exp = transform_row(row, license_map, target_config.transform) + print( + f"\n--- [{shown}] 
{jsonl_basename(src)} " + f"origin_id={exp.get('origin_id')} ---" + ) + print(json.dumps(exp, ensure_ascii=False, indent=2, default=str)) + shown += 1 + print(f"共展示 {shown} 条", file=sys.stderr) + return 0 + + inline_mysql = { + k: v + for k, v in mysql_settings.items() + if k not in {"config_file", "database", "table"} + } + mysql_cfg = load_mysql_config(args.mysql_config, inline_mysql) + catalog = (args.catalog or mysql_cfg.get("catalog") or "").strip() or None + table_ref = resolve_table_ref(catalog, args.database, args.table) + if retry_config and retry_config.enabled: + print( + f"[info] 连接重试已启用: max_attempts={retry_config.max_attempts}, " + f"initial_delay={retry_config.initial_delay_sec}s, " + f"backoff={retry_config.backoff_factor}x", + file=sys.stderr, + ) + mysql_session = MySQLSession( + mysql_cfg, + args.database, + catalog=catalog, + retry_config=retry_config, + ) + report_context = ReportContext( + target=target_config.name, + target_kind=target_config.kind, + transform=target_config.transform, + table_name=table_ref.display_name, + origin_osi=target_config.origin_osi, + s3_path=s3_path or args.parquet_glob or "", + mapping_csv=str(args.mapping_csv), + config_path=str(args.config), + ) + report_path = args.report + if report_path is None: + report_path = default_osi_report_path(target_config.name, s3_dt, target_dt) + report_path.parent.mkdir(parents=True, exist_ok=True) + warning_report_path = report_path.parent / "source_field_warning.jsonl" + report_fp = report_path.open("w", encoding="utf-8") + warning_report_fp = warning_report_path.open("w", encoding="utf-8") + ok_n = miss_n = fail_n = warn_n = total = 0 + anomalies: List[RowResult] = [] + notable_results: List[RowResult] = [] + per_file: Dict[str, FileStats] = {} + count_summary: Optional[CountSummary] = None + track_registry: Dict[str, str] = {} + active_compare_fields: Optional[List[str]] = None + active_default_empty_field_types = default_empty_field_types_from_rules(mapping_rules) + + try: + 
mysql_session.connect() + table_ref = discover_table(mysql_session, table_ref) + report_context.table_name = table_ref.display_name + mode = f"StarRocks Iceberg catalog={catalog}" if catalog else "MySQL" + print(f"使用表 ({mode}): {table_ref.display_name}") + print_run_context(report_context) + + if s3_path and not args.skip_count: + print("\n正在统计 S3 分区行数(可能较慢)...", file=sys.stderr) + con = open_duckdb_s3(s3_cfg) + jsonl_files = list_s3_jsonl_files( + con, + s3_path, + s3_cfg=s3_cfg, + retry_config=retry_config, + ) + s3_total, s3_per_file = count_s3_partition( + con, + s3_path, + jsonl_files, + s3_cfg=s3_cfg, + retry_config=retry_config, + ) + mysql_total, mysql_filter = count_mysql_origin( + mysql_session, + table_ref, + target_dt, + origin_osi=target_config.origin_osi, + ) + count_summary = CountSummary( + context=report_context, + s3_dt=s3_dt, + target_dt=target_dt, + s3_path=s3_path, + s3_total=s3_total, + s3_per_file=s3_per_file, + mysql_total=mysql_total, + mysql_filter=mysql_filter, + ) + print_count_summary(count_summary) + + if args.count_only: + if count_summary: + write_osi_report_summary( + report_path, + build_osi_report_summary( + report_path=report_path, + context=report_context, + s3_dt=s3_dt, + target_dt=target_dt, + total=0, + ok_n=0, + fail_n=0, + miss_n=0, + warn_n=0, + count_summary=count_summary, + per_file=per_file, + notable_results=notable_results, + ), + ) + print(f"\n汇总报告: {summary_paths(report_path)[0]}") + return 0 if count_summary and count_summary.s3_total == count_summary.mysql_total else 1 + + for src, batch in iter_s3_batches(**batch_kwargs()): + batch = align_filtered(batch) + if not batch: + continue + fname = jsonl_basename(src) + if fname not in per_file: + per_file[fname] = FileStats() + ids = [get_first(r, source_id_field) for r in batch] + mysql_map = fetch_mysql_rows_by_ids( + mysql_session, + table_ref, + ids, + origin_osi=target_config.origin_osi, + target_dt=target_dt, + ) + if mysql_map and active_compare_fields is None: 
+ active_compare_fields = compare_fields_for_table( + list(next(iter(mysql_map.values())).keys()), + mapping_rules, + ) + skipped = [ + f for f in requested_compare_fields + if f not in active_compare_fields + ] + print( + f"[info] 字段比对共 {len(active_compare_fields)} 列" + + (f",表无列跳过: {', '.join(skipped)}" if skipped else ""), + file=sys.stderr, + ) + for row in batch: + oid = str(get_first(row, source_id_field)) + result = compare_row( + row, + mysql_map.get(oid), + license_map, + track_registry=track_registry, + compare_fields=active_compare_fields, + default_empty_field_types=active_default_empty_field_types, + transform=target_config.transform, + ) + result.jsonl_file = fname + total += 1 + fs = per_file[fname] + fs.total += 1 + if result.warnings: + warn_n += len(result.warnings) + notable_results.append(result) + if result.ok: + ok_n += 1 + fs.pass_n += 1 + elif result.missing_in_mysql: + miss_n += 1 + fs.miss_n += 1 + anomalies.append(result) + if not result.warnings: + notable_results.append(result) + else: + fail_n += 1 + fs.fail_n += 1 + anomalies.append(result) + if not result.warnings: + notable_results.append(result) + if report_fp and not result.ok: + payload: Dict[str, Any] = { + "status": result.status, + "context": report_context.to_dict(), + "s3_dt": s3_dt, + "target_dt": target_dt, + "partition_dt": target_dt, + "jsonl_file": fname, + "jsonl_s3_uri": src, + "origin_id": result.origin_id, + } + if result.failures: + payload["field_diffs"] = [m.to_dict() for m in result.failures] + report_fp.write(json.dumps(localize_report_keys(payload), ensure_ascii=False) + "\n") + if warning_report_fp and result.warnings: + warning_payload: Dict[str, Any] = { + "status": "warning", + "context": report_context.to_dict(), + "s3_dt": s3_dt, + "target_dt": target_dt, + "partition_dt": target_dt, + "jsonl_file": fname, + "jsonl_s3_uri": src, + "origin_id": result.origin_id, + "warnings": result.warnings, + } + warning_report_fp.write( + 
json.dumps(localize_report_keys(warning_payload), ensure_ascii=False) + "\n" + ) + if args.verbose_failures and (not result.ok or result.warnings): + tag = result.status if not result.ok else "WARN" + print( + f"\n[{tag}] {fname} origin_id={result.origin_id}" + ) + for w in result.warnings: + print(f" ! {w}") + for m in result.failures: + print(f" - {m}") + print( + f"[进度] 已校验 {total} 条(通过 {ok_n} / 失败 {fail_n} / 缺失 {miss_n}" + f" / warning {warn_n}) 当前: {fname}", + file=sys.stderr, + ) + + if count_summary: + count_summary.checked_rows = total + print_count_summary(count_summary) + + print("\n" + "=" * 60) + print("字段校验汇总") + print("=" * 60) + print(f"已校验行数 : {total:,}") + print(f"通过 : {ok_n:,}") + print(f"字段不一致 : {fail_n:,}") + print(f"MySQL 缺失 : {miss_n:,}") + print(f"Warning : {warn_n:,}(license 超出可选值,不记为缺陷)") + + print_file_stats(per_file) + print_anomaly_summary(notable_results, max_examples=args.max_show) + + if report_fp: + print(f"\n完整异常报告: {report_path}") + write_osi_report_summary( + report_path, + build_osi_report_summary( + report_path=report_path, + context=report_context, + s3_dt=s3_dt, + target_dt=target_dt, + total=total, + ok_n=ok_n, + fail_n=fail_n, + miss_n=miss_n, + warn_n=warn_n, + count_summary=count_summary, + per_file=per_file, + notable_results=notable_results, + ), + ) + print(f"汇总报告: {summary_paths(report_path)[0]}") + count_ok = not count_summary or count_summary.s3_total == count_summary.mysql_total + return 0 if fail_n == 0 and miss_n == 0 and count_ok else 1 + finally: + mysql_session.close() + if report_fp: + report_fp.close() + if warning_report_fp: + warning_report_fp.close() + + +# ---- osi_verify/cli.py ---- + + +import argparse +import json +import sys +from pathlib import Path +from typing import Any, Dict, Optional, Sequence + + + +def _nested(settings: Dict[str, Any], *keys: str, default: Any = None) -> Any: + cur: Any = settings + for key in keys: + if not isinstance(cur, dict) or key not in cur: + return default + cur = 
def _merge_target_s3_settings(
    global_s3_settings: Dict[str, Any],
    target_s3_settings: Dict[str, Any],
) -> Dict[str, Any]:
    """Overlay target-specific S3 settings on a copy of the global ones.

    Target entries whose value is ``None`` or the empty string are ignored,
    so the global value (if any) survives. Neither input dict is mutated.

    Args:
        global_s3_settings: Base settings shared by every target.
        target_s3_settings: Per-target overrides.

    Returns:
        A new dict with the global keys first, followed by any keys that
        only the target defines.
    """
    overrides = {
        name: setting
        for name, setting in target_s3_settings.items()
        if setting is not None and setting != ""
    }
    return {**global_s3_settings, **overrides}
_merged_arxiv_options( + settings, + "report", + ("report_path", "summary_only", "verbose_failures", "max_show"), + ) + retry_config = load_retry_config(settings) + + parser = argparse.ArgumentParser(description="校验 S3 arxiv 数据到论文源数据表的一致性") + parser.add_argument("--config", type=Path, default=config_args.config, help="可选自动化配置文件") + parser.add_argument("--mysql-config", type=Path, default=resolve_project_path(mysql_settings.get("config_file")) or PROJECT_ROOT / "mysql") + parser.add_argument("--mapping-csv", type=Path, default=target_config.mapping_csv) + parser.add_argument( + "--database", + default=target_config.database, + help="库名(Iceberg 模式下为 schema,如 dws)", + ) + parser.add_argument("--table", default=target_config.table) + parser.add_argument( + "--catalog", + default=target_config.catalog if target_config.catalog is not None else mysql_settings.get("catalog", DEFAULT_ICEBERG_CATALOG), + help="StarRocks Iceberg catalog(默认 lakehouse_iceberg);传空字符串则用原生库连接", + ) + parser.add_argument("--s3-config", type=Path, default=resolve_project_path(s3_settings.get("config_file")) or PROJECT_ROOT / "s3") + parser.add_argument("--parquet-glob", default=run_settings.get("parquet_glob")) + parser.add_argument("--s3-path", default=target_config.s3_path or s3_settings.get("path") or run_settings.get("s3_path")) + parser.add_argument("--s3-subpath", default=target_config.s3_subpath or s3_settings.get("subpath")) + parser.add_argument("--s3-format", choices=("auto", "jsonl", "parquet"), default=target_config.s3_format or s3_settings.get("format", "auto")) + parser.add_argument( + "--limit", + type=int, + default=int(run_settings.get("limit", 200)), + help="抽样模式:每个 jsonl 随机抽查条数(默认 200);与 --full 互斥", + ) + parser.add_argument( + "--sequential", + action="store_true", + default=bool(run_settings.get("sequential", False)), + help="顺序抽取:每个 jsonl 取文件开头前 N 条(配合 --limit,比随机抽样快)", + ) + parser.add_argument( + "--full", + action="store_true", + default=bool(run_settings.get("full", 
False)), + help="全量:读取分区内全部 jsonl 文件、全部行(分批处理)", + ) + parser.add_argument( + "--batch-size", + type=int, + default=int(run_settings.get("batch_size", 500)), + help="全量模式每批处理条数(默认 500)", + ) + parser.add_argument( + "--report", + type=Path, + default=resolve_project_path(report_settings.get("path") or report_settings.get("report_path")) if (report_settings.get("path") or report_settings.get("report_path")) else None, + help="将 FAIL/MISSING 记录写入 JSONL 报告文件", + ) + parser.add_argument( + "--summary-only", + action="store_true", + default=bool(report_settings.get("summary_only", False)), + help="兼容旧参数:当前默认仅打印汇总,不逐条打印失败详情", + ) + parser.add_argument( + "--verbose-failures", + action="store_true", + default=bool(report_settings.get("verbose_failures", False)), + help="逐条打印 FAIL/WARN 明细;默认只打印错误类型汇总", + ) + parser.add_argument("--license-map", type=Path, default=resolve_project_path(settings.get("license_map")) if settings.get("license_map") else None) + parser.add_argument("--dry-run", action="store_true", default=bool(run_settings.get("dry_run", False))) + parser.add_argument("--origin-id", action="append", default=run_settings.get("origin_ids")) + parser.add_argument( + "--partition-dt", + default=run_settings.get("partition_dt"), + help="兼容旧参数:同时作为 S3 分区和目标表分区默认值;建议改用 --s3-dt / --target-dt", + ) + parser.add_argument( + "--s3-dt", + default=run_settings.get("s3_dt"), + help="S3 数据分区日期;默认从 S3 path 中的 dt= 解析", + ) + parser.add_argument( + "--target-dt", + default=run_settings.get("target_dt"), + help="论文源数据表 dt;默认沿用 --partition-dt,未指定时再沿用 S3 dt", + ) + parser.add_argument( + "--skip-count", + action="store_true", + default=bool(run_settings.get("skip_count", False)), + help="跳过 S3/MySQL 总量统计(大分区计数较慢)", + ) + parser.add_argument( + "--count-only", + action="store_true", + default=bool(run_settings.get("count_only", False)), + help="仅做总量统计,不做字段级校验", + ) + parser.add_argument( + "--max-show", + type=int, + default=int(report_settings.get("max_show", 3)), + 
help="错误类型汇总中每类最多展示多少个样例 origin_id", + ) + parser.add_argument( + "--retry-max-attempts", + type=int, + default=retry_config.max_attempts, + help="连接/查询失败时的最大重试次数(含首次,默认 3)", + ) + parser.add_argument( + "--retry-initial-delay", + type=float, + default=retry_config.initial_delay_sec, + help="重试初始等待秒数(默认 1.0)", + ) + parser.add_argument( + "--no-retry", + action="store_true", + help="禁用数据库与 S3 连接重试", + ) + args = parser.parse_args(remaining_argv) + if args.summary_only and "--verbose-failures" not in remaining_argv: + args.verbose_failures = False + if args.full: + print("[info] 全量模式:读取分区内全部 jsonl,忽略 --limit", file=sys.stderr) + if args.full and args.dry_run: + print("[warn] 全量 dry-run 可能极慢,建议加 --summary-only", file=sys.stderr) + + if not args.mapping_csv.exists(): + print(f"映射文件不存在: {args.mapping_csv}", file=sys.stderr) + return 2 + + try: + mapping_rules = load_mapping_rules( + args.mapping_csv, + target_column=target_config.mapping_target_column, + source_column=target_config.mapping_source_column, + ) + except ValueError as e: + print(str(e), file=sys.stderr) + return 2 + requested_compare_fields = compare_fields_from_rules(mapping_rules) + print( + f"目标 target={target_config.name} kind={target_config.kind} " + f"transform={target_config.transform}" + ) + print( + f"已加载映射规则 {len(mapping_rules)} 条,启用字段校验 {len(requested_compare_fields)} 列" + ) + license_map = dict(DEFAULT_LICENSE_MAP) + if args.license_map: + license_map.update(json.loads(args.license_map.read_text(encoding="utf-8"))) + effective_retry = RetryConfig( + enabled=not args.no_retry and retry_config.enabled, + max_attempts=args.retry_max_attempts, + initial_delay_sec=args.retry_initial_delay, + backoff_factor=retry_config.backoff_factor, + max_delay_sec=retry_config.max_delay_sec, + ) + try: + return run_verification( + args=args, + target_config=target_config, + mysql_settings=mysql_settings, + s3_settings=s3_settings, + mapping_rules=mapping_rules, + requested_compare_fields=requested_compare_fields, 
def _dingo_append_cli_option(argv: list[str], flag: str, value: Any) -> None:
    """Append ``flag`` and ``str(value)`` to ``argv`` unless the value is unset.

    ``None`` and the empty string count as unset and leave ``argv``
    untouched. Any other value (including ``0`` and ``False``) is
    stringified and appended in place.
    """
    if value is None:
        return
    if value != "":
        argv += [flag, str(value)]
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    ["sci_base_qa_test", "meta_paper_data"],
)
class RuleSciBaseMetaPaperDataReport(BaseRule):
    """Dingo rule that drives the S3 paper source-data verification CLI.

    The evaluated record itself is ignored: the rule turns
    ``dynamic_config.parameters`` into command-line arguments for ``main``
    and reports the resulting exit code.
    """

    _metric_info = {
        "category": "Rule-Based Metadata Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleSciBaseMetaPaperDataReport",
        "description": "Run SciBase S3 paper source-data validation and write reports.",
        "paper_title": "",
        "paper_url": "",
        "paper_authors": "",
        "evaluation_results": "",
    }

    _required_fields = [RequiredField.METADATA]
    dynamic_config = EvaluatorRuleArgs(parameters={})

    @classmethod
    def eval(cls, input_data: Data) -> EvalDetail:
        """Run the verification CLI once and wrap its exit code in an EvalDetail.

        Args:
            input_data: Unused; the run is configured entirely by
                ``cls.dynamic_config.parameters``.

        Returns:
            An ``EvalDetail`` labelled ``QUALITY_GOOD`` when ``main`` returns
            0, otherwise one with ``status=True`` and this rule's label. The
            reason lists the exit code and the report directory.

        Raises:
            RuntimeError: If neither an S3 path nor ``parquet_glob`` is
                configured.
        """
        del input_data
        params = cls.dynamic_config.parameters or {}
        config_path = write_temp_settings(params, include_s3=True)

        # An explicit report_path wins; output_dir only supplies a default file name.
        if params.get("report_path"):
            report_path = Path(params["report_path"])
        elif params.get("output_dir"):
            report_path = Path(str(params["output_dir"])) / "source_field_mismatch.jsonl"
        else:
            report_path = None

        s3_path = s3_path_from_dingo(params)
        parquet_glob = params.get("parquet_glob")
        if not (s3_path or parquet_glob):
            raise RuntimeError(
                "S3 path is required for RuleSciBaseMetaPaperDataReport. "
                "Set evaluator config parameters.s3_path, or run with dataset.source=s3 "
                "so input_path and dataset.s3_config.s3_bucket can be combined."
            )

        mapping_csv = params.get("mapping_csv") or ASSETS_DIR / "osi_arxiv_mapping.csv"
        database = params.get("database") or "dws"
        table = params.get("target_table") or params.get("table") or "dws_meta_paper_data_acc_d"
        argv = [
            "--config",
            str(config_path),
            "--mapping-csv",
            str(mapping_csv),
            "--database",
            str(database),
            "--table",
            str(table),
        ]

        # Valued options, in the order they are placed on the command line.
        valued_options = (
            ("--catalog", params.get("catalog", DEFAULT_ICEBERG_CATALOG)),
            ("--s3-path", s3_path),
            ("--s3-subpath", params.get("s3_subpath")),
            ("--s3-format", params.get("s3_format")),
            ("--parquet-glob", parquet_glob),
            ("--partition-dt", params.get("partition_dt")),
            ("--s3-dt", params.get("s3_dt")),
            ("--target-dt", params.get("target_dt")),
            ("--limit", int_param(params, "limit", 200)),
            ("--batch-size", int_param(params, "batch_size", 500)),
            ("--max-show", int_param(params, "max_show", 3)),
            ("--report", report_path),
            ("--license-map", params.get("license_map")),
            ("--retry-max-attempts", params.get("retry_max_attempts")),
            ("--retry-initial-delay", params.get("retry_initial_delay")),
        )
        for option, option_value in valued_options:
            _dingo_append_cli_option(argv, option, option_value)
        _dingo_append_origin_ids(argv, params.get("origin_id") or params.get("origin_ids"))

        # Boolean switches: (CLI flag, parameter key), all defaulting to off.
        switches = (
            ("--sequential", "sequential"),
            ("--full", "full"),
            ("--dry-run", "dry_run"),
            ("--skip-count", "skip_count"),
            ("--count-only", "count_only"),
            ("--summary-only", "summary_only"),
            ("--verbose-failures", "verbose_failures"),
            ("--no-retry", "no_retry"),
        )
        for switch, param_key in switches:
            _dingo_append_cli_flag(argv, switch, bool_param(params, param_key, False))

        exit_code = main(argv)
        reason = [
            f"exit_code={exit_code}",
            str(report_path.parent if report_path else REPORT_ROOT),
        ]
        if exit_code == 0:
            return EvalDetail(metric=cls.__name__, label=[QualityLabel.QUALITY_GOOD], reason=reason)
        return EvalDetail(
            metric=cls.__name__,
            status=True,
            label=[f"{cls.metric_type}.{cls.__name__}"],
            reason=reason,
        )
def _parse_params(raw: str) -> Dict[str, Any]:
    """Parse a ``key=value;key=value`` strategy-parameter string into a dict.

    Pairs are separated by ``;`` and split on the first ``=``; pairs without
    ``=`` are skipped. Keys and values are stripped of surrounding
    whitespace. Values are converted as follows:

    * ``true`` / ``false`` (any case) become ``True`` / ``False``;
    * values that look numeric and that ``int()`` accepts become ``int``;
    * everything else stays a string.

    Args:
        raw: The raw parameter cell; an empty value yields an empty dict.

    Returns:
        Mapping of parameter name to its converted value.
    """
    params: Dict[str, Any] = {}
    if not raw:
        return params
    for pair in raw.split(";"):
        key, sep, val = pair.partition("=")
        if not sep:
            continue
        key, val = key.strip(), val.strip()
        lowered = val.lower()
        if lowered in ("true", "false"):
            params[key] = lowered == "true"
        elif val.lstrip("-").isdigit():
            # The isdigit() gate is looser than int(): "--5" or "²" pass it
            # but are not valid integer literals. Keep such values as text
            # instead of aborting the whole mapping load.
            try:
                params[key] = int(val)
            except ValueError:
                params[key] = val
        else:
            params[key] = val
    return params
def aggregate_by_rules(
    records: List[Dict[str, Any]],
    rules: Sequence[FieldRule],
    handlers: Dict[str, StrategyHandler],
) -> Dict[str, Any]:
    """Build one aggregated record by applying each rule's strategy handler.

    Rules are processed in order. Each handler receives the source records,
    the rule, and the partially built result, so later fields can read
    earlier ones.

    Args:
        records: Source records to aggregate.
        rules: Field rules naming the output field and its strategy.
        handlers: Strategy name to handler callable.

    Returns:
        Mapping of ``rule.field_name`` to the handler's return value.

    Raises:
        ValueError: If a rule names a strategy with no registered handler.
    """
    aggregated: Dict[str, Any] = {}
    for rule in rules:
        strategy_fn = handlers.get(rule.strategy)
        if strategy_fn is not None:
            aggregated[rule.field_name] = strategy_fn(records, rule, aggregated)
            continue
        raise ValueError(
            f"Unknown aggregation strategy {rule.strategy!r} "
            f"for field {rule.field_name!r}"
        )
    return aggregated
def summary_paths(report_path: Path) -> Tuple[Path, Path]:
    """Return the JSON and Markdown summary paths that sit next to a report.

    Both files live in ``report_path``'s parent directory and are named
    ``summary.json`` and ``readable_summary.md`` respectively.
    """
    report_dir = report_path.parent
    json_summary = report_dir / "summary.json"
    markdown_summary = report_dir / "readable_summary.md"
    return json_summary, markdown_summary
def compact_record_for_report(record: Dict[str, Any]) -> Dict[str, Any]:
    """Keep only the identifying fields of a record that carry a value.

    Only the fixed set of fields below is considered, in this order. A field
    is dropped when it is missing or equal to ``None``, ``""``, ``[]`` or
    ``{}``; other falsy values such as ``0`` are kept.
    """
    compact: Dict[str, Any] = {}
    for name in (
        "track_id",
        "origin_osi",
        "origin_id",
        "title",
        "published_year",
        "published_date",
        "venue_name",
    ):
        field_value = record.get(name)
        if field_value in (None, "", [], {}):
            continue
        compact[name] = field_value
    return compact
def build_report_summary(
    report_path: Path,
    result: Dict[str, Any],
    mismatch_rows: Sequence[Dict[str, Any]],
) -> Dict[str, Any]:
    """Condense the per-row mismatch records into a summary dict.

    Args:
        report_path: Location of the detailed report; stored as a string.
        result: Overall validation result. Its ``sample_mismatches`` and
            ``dt_check`` entries are left out of the copied ``result``
            section, and ``dt_check`` is summarised through
            ``compact_dt_check`` instead.
        mismatch_rows: One dict per problem row.

    Returns:
        A dict with status and per-field counts (most frequent first), up to
        ``SAMPLES_PER_FIELD`` samples for each of the
        ``TOP_SAMPLE_FIELD_LIMIT`` most frequent fields, and up to
        ``SAMPLES_PER_FIELD`` samples of rows missing on either side.
    """
    status_counts: Counter = Counter()
    field_counts: Counter = Counter()
    field_samples: Dict[str, List[Dict[str, Any]]] = {}
    missing_samples: List[Dict[str, Any]] = []

    for row in mismatch_rows:
        status_counts[str(row.get("status") or "unknown")] += 1
        # Identification columns shared by every kind of sample.
        row_identity = {
            "key": row.get("key"),
            "dt": row.get("dt"),
            "source_count": row.get("source_count"),
            "status": row.get("status"),
        }

        is_missing = row.get("status") in ("missing_target", "missing_source")
        if is_missing and len(missing_samples) < SAMPLES_PER_FIELD:
            missing_samples.append(
                {
                    **row_identity,
                    "source_records": compact_records_for_report(row.get("source_records")),
                    "target_records": compact_records_for_report(row.get("target_records")),
                }
            )

        for field, diff in (row.get("mismatches") or {}).items():
            field_counts[field] += 1
            samples = field_samples.setdefault(field, [])
            if len(samples) >= SAMPLES_PER_FIELD:
                continue
            # A non-dict diff carries no expected/actual pair.
            diff_values = diff if isinstance(diff, dict) else {}
            samples.append(
                {
                    **row_identity,
                    "expected": diff_values.get("expected"),
                    "actual": diff_values.get("actual"),
                }
            )

    ranked_fields = field_counts.most_common()
    sorted_field_counts = dict(ranked_fields)
    top_sample_fields = {name for name, _ in ranked_fields[:TOP_SAMPLE_FIELD_LIMIT]}
    excluded_result_keys = ("sample_mismatches", "dt_check")
    compact_result = {
        name: value for name, value in result.items() if name not in excluded_result_keys
    }
    count_check = compact_dt_check(result.get("dt_check"))

    return {
        "report": str(report_path),
        "total_problem_rows": len(mismatch_rows),
        "result": compact_result,
        "count_check": count_check,
        "status_counts": dict(status_counts.most_common()),
        "field_counts": sorted_field_counts,
        "field_count_total": len(sorted_field_counts),
        "field_samples": {
            field: field_samples[field]
            for field in sorted_field_counts
            if field in top_sample_fields and field in field_samples
        },
        "missing_samples": missing_samples,
    }
class JsonEncoder(json.JSONEncoder):
    """JSON encoder that also handles ``Decimal`` and ``date``/``datetime``."""

    def default(self, obj: Any) -> Any:
        """Convert values the stock encoder rejects.

        ``date`` and ``datetime`` become ISO-8601 strings. A ``Decimal``
        becomes an ``int`` when it has no fractional part and a ``float``
        otherwise. Anything else is passed to the base class, which raises
        ``TypeError``.
        """
        if isinstance(obj, (date, datetime)):
            return obj.isoformat()
        if not isinstance(obj, Decimal):
            return super().default(obj)
        is_whole = obj == obj.to_integral_value()
        return int(obj) if is_whole else float(obj)
def _most_frequent_max(values: List[Any]) -> Any:
    """Return the greatest item among those tied for the highest frequency.

    *values* must be non-empty; the public choosers filter and guard first.
    """
    counts = Counter(values)
    top = max(counts.values())
    return max(item for item, freq in counts.items() if freq == top)


def choose_freq_then_lex_max(values: Iterable[str]) -> str:
    """Pick the most frequent string; ties go to the lexicographically largest.

    Empty markers (``""`` and ``"{}"``) are ignored. Returns ``""`` when nothing
    usable remains.
    """
    kept = [v for v in values if v not in ("", "{}")]
    return _most_frequent_max(kept) if kept else ""


def choose_freq_then_max_int(values: Iterable[int]) -> Optional[int]:
    """Pick the most frequent int; ties go to the largest. ``None`` if no ints."""
    kept = [v for v in values if isinstance(v, int)]
    return _most_frequent_max(kept) if kept else None


def choose_freq_then_max_decimal(values: Iterable[Decimal]) -> Optional[Decimal]:
    """Pick the most frequent Decimal; ties go to the largest. ``None`` if none."""
    kept = [v for v in values if isinstance(v, Decimal)]
    return _most_frequent_max(kept) if kept else None


def normalize_doi(doi: Any) -> str:
    """Reduce a raw DOI-ish value to a lowercase ``10.…`` key.

    The value is stringified, trimmed, lowercased and HTML-unescaped; any text
    before the first ``10.`` is dropped and the result is cut at the first
    whitespace or ``< > " & ;`` character. Returns ``""`` for ``None`` and for
    the empty markers ``""`` / ``"{}"``.
    """
    if doi is None:
        return ""
    text = html.unescape(str(doi).strip().lower())
    if text in ("", "{}"):
        return ""
    prefix_at = text.find("10.")
    if prefix_at >= 0:
        text = text[prefix_at:]
    text = re.split(r"[\s<>\"&;]", text, maxsplit=1)[0].strip()
    return "" if text in ("", "{}") else text


def parse_int(value: Any) -> Optional[int]:
    """Parse *value* as an int, returning ``None`` when it is not one.

    ``None``, booleans, the empty markers ``""`` / ``"{}"`` and anything whose
    string form ``int()`` rejects all yield ``None``.
    """
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, str) and value in ("", "{}"):
        return None
    if isinstance(value, int):
        return value
    try:
        return int(str(value))
    except ValueError:
        return None


def parse_decimal(value: Any) -> Optional[Decimal]:
    """Parse *value* as a finite ``Decimal``, returning ``None`` otherwise.

    ``None``, the empty markers ``""`` / ``"{}"``, unparsable text and
    non-finite results (NaN / Infinity) all yield ``None``.
    """
    if value is None or (isinstance(value, str) and value in ("", "{}")):
        return None
    try:
        parsed = Decimal(str(value))
    except (InvalidOperation, ValueError):
        return None
    # NaN cannot be ordered (``max`` raises InvalidOperation) and Infinity
    # cannot be converted to int, so neither is a usable numeric candidate.
    if not parsed.is_finite():
        return None
    return parsed
def year_from_date_str(s: str) -> Optional[int]:
    """Extract a plausible publication year from the first four characters of *s*.

    Returns ``None`` when *s* is shorter than four characters, when the prefix
    is not made of decimal digits, or when the year falls outside
    ``1000..CURRENT_YEAR``.
    """
    if len(s) < 4:
        return None
    year_txt = s[:4]
    # ``isdigit`` also accepts characters such as superscript digits that
    # ``int`` rejects with ValueError; ``isdecimal`` matches what ``int`` parses.
    if not year_txt.isdecimal():
        return None
    year = int(year_txt)
    if year < 1000 or year > CURRENT_YEAR:
        return None
    return year


def canonical_json(value: Any) -> str:
    """Serialize *value* to compact JSON with sorted keys and raw non-ASCII text."""
    return json.dumps(value, ensure_ascii=False, sort_keys=True, separators=(",", ":"))


def dedup_str_array(values: Iterable[Any], lower: bool = False) -> List[str]:
    """Flatten *values* into a sorted list of unique strings.

    Each item may be a scalar or a list of scalars. A string that decodes as a
    JSON list is expanded into its elements. ``None`` and the empty markers
    ``""`` / ``"{}"`` / ``"[]"`` are dropped; everything else is stringified
    and, when *lower* is true, lowercased.
    """
    unique: set = set()

    def add_value(raw: Any) -> None:
        if raw is None:
            return
        text = str(raw)
        if text in ("", "{}", "[]"):
            return
        unique.add(text.lower() if lower else text)

    def add_element(element: Any) -> None:
        # A string holding a JSON array contributes its elements, not itself.
        if isinstance(element, str):
            try:
                decoded = json.loads(element)
            except json.JSONDecodeError:
                decoded = None
            if isinstance(decoded, list):
                for inner in decoded:
                    add_value(inner)
                return
        add_value(element)

    for item in values:
        for element in item if isinstance(item, list) else [item]:
            add_element(element)
    return sorted(unique)


# ---- paper-specific complex value helpers ----


def dedup_complex_to_string(values: Iterable[Any]) -> List[str]:
    """Collect unique string forms of possibly structured values, sorted.

    Strings are kept verbatim; any other non-``None`` value is rendered with
    :func:`canonical_json`. ``None`` and the empty markers ``""`` / ``"{}"``
    (including an empty dict once serialized) are dropped.
    """
    unique: set = set()
    for item in values:
        for element in item if isinstance(item, list) else [item]:
            if element is None:
                continue
            unique.add(element if isinstance(element, str) else canonical_json(element))
    unique.difference_update(("", "{}"))
    return sorted(unique)
candidates: + if candidate is None or (isinstance(candidate, str) and candidate in ("", "{}")): + continue + + obj: Optional[Dict[str, Any]] = None + if isinstance(candidate, dict): + obj = candidate + elif isinstance(candidate, str): + try: + parsed = json.loads(candidate) + except json.JSONDecodeError: + parsed = None + if isinstance(parsed, dict): + obj = parsed + + if obj is None: + continue + + normalized = { + "type": "" if obj.get("type") is None or str(obj.get("type")) == "{}" else str(obj.get("type")), + "url": "" if obj.get("url") is None or str(obj.get("url")) == "{}" else str(obj.get("url")), + "license": "" + if obj.get("license") is None or str(obj.get("license")) == "{}" + else str(obj.get("license")), + "is_oa": "" if obj.get("is_oa") is None or str(obj.get("is_oa")) == "{}" else str(obj.get("is_oa")), + } + dedup_map[canonical_json(normalized)] = normalized + + return [dedup_map[k] for k in sorted(dedup_map.keys())] + + +def dedup_map_array(values: Iterable[Any]) -> List[Dict[str, str]]: + dedup_map: Dict[str, Dict[str, str]] = {} + for item in values: + candidates = item if isinstance(item, list) else [item] + for candidate in candidates: + if candidate is None or (isinstance(candidate, str) and candidate in ("", "{}")): + continue + + objects: List[Dict[str, Any]] = [] + if isinstance(candidate, dict): + objects = [candidate] + elif isinstance(candidate, list): + objects = [obj for obj in candidate if isinstance(obj, dict)] + elif isinstance(candidate, str): + try: + parsed = json.loads(candidate) + except json.JSONDecodeError: + parsed = None + if isinstance(parsed, dict): + objects = [parsed] + elif isinstance(parsed, list): + objects = [obj for obj in parsed if isinstance(obj, dict)] + + if not objects: + continue + + for obj in objects: + normalized = { + str(k): "" + if v is None or (isinstance(v, str) and v == "{}") + else stringify_map_value(v) + for k, v in obj.items() + } + dedup_map[canonical_json(normalized)] = normalized + + return 
def choose_freq_then_lex_max_struct(values: Iterable[Any]) -> Dict[str, Any]:
    """Pick the most frequent non-empty struct among *values*.

    Each value may be a dict or a JSON string holding a dict. Candidates are
    compared through their canonical JSON form; ties go to the lexicographically
    largest serialization. Returns ``{}`` when no candidate exists.
    """
    serialized: List[str] = [
        canonical_json(obj)
        for obj in (_parse_struct_obj(value) for value in values)
        if obj
    ]
    if not serialized:
        return {}

    winner = choose_freq_then_lex_max(serialized)
    try:
        decoded = json.loads(winner)
    except json.JSONDecodeError:
        return {}
    if isinstance(decoded, dict):
        return decoded
    return {}


def _parse_struct_obj(value: Any) -> Optional[Dict[str, Any]]:
    """Return *value* as a dict, decoding JSON strings; ``None`` if not a dict."""
    if isinstance(value, dict):
        return value
    if not isinstance(value, str) or value in ("", "{}"):
        return None
    try:
        decoded = json.loads(value)
    except json.JSONDecodeError:
        return None
    return decoded if isinstance(decoded, dict) else None


def _normalize_topic_node(value: Any) -> Optional[Dict[str, Any]]:
    """Reduce a topic node to its ``id`` / ``display_name`` pair.

    Returns ``None`` when the node cannot be parsed or both fields are empty.
    """
    obj = _parse_struct_obj(value)
    if obj is None:
        return None
    node = {name: canonicalize(obj.get(name)) for name in ("id", "display_name")}
    if any(is_non_empty(field_value) for field_value in node.values()):
        return node
    return None


def empty_primary_topic_struct() -> Dict[str, Any]:
    """Return a fresh primary-topic struct with every field set to ``None``."""
    return dict.fromkeys(
        ("id", "display_name", "score", "subfield", "field", "domain")
    )
def choose_freq_then_lex_max_primary_topic(values: Iterable[Any]) -> Dict[str, Any]:
    """Pick the most frequent normalized primary-topic struct among *values*.

    Unparsable or empty inputs are skipped, as are structs whose normalized
    fields are all empty. Candidates are compared via canonical JSON; ties go
    to the lexicographically largest. Falls back to the all-``None`` struct.
    """
    serialized: List[str] = []
    for raw in values:
        parsed = _parse_struct_obj(raw)
        if not parsed:
            continue
        topic = normalize_primary_topic_struct(parsed)
        has_content = any(is_non_empty(field_value) for field_value in topic.values())
        if has_content:
            serialized.append(canonical_json(topic))

    if not serialized:
        return empty_primary_topic_struct()

    decoded = json.loads(choose_freq_then_lex_max(serialized))
    if isinstance(decoded, dict):
        return decoded
    return empty_primary_topic_struct()
def normalize_origin_osi(value: Any) -> str:
    """Normalize an ``origin_osi`` tag: trimmed, lowercased, ``semantic*`` folded.

    Returns ``""`` for ``None`` and for the empty markers ``""`` / ``"{}"``.
    """
    if value is None:
        return ""
    tag = str(value).strip().lower()
    if tag in ("", "{}"):
        return ""
    return "semantic" if tag.startswith("semantic") else tag


def stringify_map_value(value: Any) -> str:
    """Render a map value as a string, using compact JSON for containers."""
    return stringify_map_value_with_style(value, compact=True)


def stringify_map_value_with_style(value: Any, compact: Optional[bool]) -> str:
    """Render *value* as a string.

    ``None`` becomes ``""``, booleans become ``"true"`` / ``"false"``, dicts and
    lists become JSON (without separator whitespace only when *compact* is
    exactly ``True``), and everything else goes through ``str``.
    """
    if value is None:
        return ""
    if isinstance(value, bool):
        return "true" if value else "false"
    if not isinstance(value, (dict, list)):
        return str(value)
    dump_options: Dict[str, Any] = {"ensure_ascii": False}
    if compact is True:
        dump_options["separators"] = (",", ":")
    return json.dumps(value, **dump_options)


def detect_json_compact_style(raw: str) -> bool:
    """Report whether *raw* JSON text has no whitespace after ``:`` or ``,``.

    Separators inside string literals are ignored (backslash escapes are
    honoured). Returns ``False`` at the first separator followed by whitespace.
    """
    inside_string = False
    pending_escape = False

    for position, char in enumerate(raw):
        if inside_string:
            if pending_escape:
                pending_escape = False
            elif char == "\\":
                pending_escape = True
            elif char == '"':
                inside_string = False
        elif char == '"':
            inside_string = True
        elif char in (":", ","):
            # Slicing yields "" at end of input, and "".isspace() is False.
            if raw[position + 1:position + 2].isspace():
                return False

    return True


def merge_string_map(values: Iterable[Any]) -> Dict[str, str]:
    """Merge dicts (or JSON-encoded dicts) into one string-to-string map.

    For each key the lexicographically largest stringified value wins. Entries
    with a ``None`` key or value, or with key ``""`` / ``"{}"``, are skipped; a
    value that stringifies to ``"{}"`` is stored as ``""``.
    """
    result: Dict[str, str] = {}
    for entry in values:
        if isinstance(entry, str):
            try:
                entry = json.loads(entry)
            except json.JSONDecodeError:
                entry = None
        if not isinstance(entry, dict):
            continue
        for raw_key, raw_value in entry.items():
            if raw_key is None or raw_value is None:
                continue
            name = str(raw_key)
            text = stringify_map_value(raw_value)
            if name in ("", "{}"):
                continue
            if text == "{}":
                text = ""
            if name in result and not text > result[name]:
                continue
            result[name] = text
    return result
# ---- strategy handlers ----


def _handle_key_lower(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> str:
    """Return the first non-empty normalized DOI found in the rule's source field."""
    source_field = rule.effective_source
    for record in records:
        key = normalize_doi(record.get(source_field, ""))
        if key:
            return key
    return ""


def _handle_freq_lex_max(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> str:
    """Choose the most frequent string value (ties: lexicographic max).

    Values are stringified; those outside the optional ``min_len`` / ``max_len``
    rule params are skipped, and for the ``access_is_oa`` field the value
    ``unknown`` (any case) is skipped as well.
    """
    source_field = rule.effective_source
    min_len = rule.params.get("min_len")
    max_len = rule.params.get("max_len")
    skip_unknown = rule.field_name == "access_is_oa"

    accepted: List[str] = []
    for record in records:
        raw = record.get(source_field)
        if not is_non_empty(raw):
            continue
        text = str(raw)
        too_short = min_len is not None and len(text) < min_len
        too_long = max_len is not None and len(text) > max_len
        if too_short or too_long:
            continue
        if skip_unknown and text.lower() == "unknown":
            continue
        accepted.append(text)
    return choose_freq_then_lex_max(accepted)
def _handle_freq_decimal_max(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> Any:
    """Choose the most frequent decimal (ties: largest) as a JSON number, or ``None``."""
    source_field = rule.effective_source
    parsed = (parse_decimal(record.get(source_field)) for record in records)
    winner = choose_freq_then_max_decimal([d for d in parsed if d is not None])
    if winner is None:
        return None
    return decimal_to_json_number(winner)


def _handle_freq_date(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> str:
    """Choose the most frequent date string whose leading year is acceptable."""
    source_field = rule.effective_source
    dates: List[str] = [
        raw
        for raw in (record.get(source_field) for record in records)
        if isinstance(raw, str) and raw and year_from_date_str(raw) is not None
    ]
    return choose_freq_then_lex_max(dates)


def _handle_freq_struct(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> Dict[str, Any]:
    """Choose the most frequent struct; ``primary_topic`` uses its own normalizer."""
    raw_values = [record.get(rule.effective_source) for record in records]
    if rule.field_name == "primary_topic":
        return choose_freq_then_lex_max_primary_topic(raw_values)
    return choose_freq_then_lex_max_struct(raw_values)


def _handle_dedup_array(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> List[str]:
    """Deduplicate string arrays, lowercasing when the rule's ``lower`` param is set."""
    raw_values = [record.get(rule.effective_source, []) for record in records]
    lowercase = rule.params.get("lower", False)
    return dedup_str_array(raw_values, lower=lowercase)


def _handle_dedup_map(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> List[Dict[str, str]]:
    """Deduplicate arrays of string maps across all records."""
    raw_values = [record.get(rule.effective_source, []) for record in records]
    return dedup_map_array(raw_values)


def _handle_dedup_struct(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Deduplicate arrays of structs across all records."""
    raw_values = [record.get(rule.effective_source, []) for record in records]
    return dedup_struct_array(raw_values)
def _handle_merge_map(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> Dict[str, str]:
    """Merge the rule's source maps from every record into one string map."""
    raw_maps = [record.get(rule.effective_source) for record in records]
    return merge_string_map(raw_maps)


def _handle_merge_identifiers(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> Dict[str, str]:
    """Merge identifier maps, pairing each with its record's ``origin_osi``."""
    source_field = rule.effective_source
    identifier_maps: List[Any] = []
    origins: List[Any] = []
    for record in records:
        identifier_maps.append(record.get(source_field))
        origins.append(record.get("origin_osi"))
    return merge_identifiers(identifier_maps, origins)


def _handle_latest_dt(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> str:
    """Return the greatest non-empty source value as a string, or ``""``."""
    source_field = rule.effective_source
    raw_values = (record.get(source_field, "") for record in records)
    return max((str(raw) for raw in raw_values if is_non_empty(raw)), default="")


def _handle_random_pick_cls(
    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
) -> None:
    """Produce no expected value; this handler always returns ``None``."""
    return None
{path}\n" + f"Copy the template and fill in credentials:\n" + f" cp {TEMPLATE_CONFIG_PATH} {path}" + ) + with path.open("r", encoding="utf-8") as f: + return json.load(f) + + +def connect_starrocks(config_path: Path): + if pymysql is None: + raise RuntimeError("pymysql is required. Install pymysql before running DB validation.") + cfg = load_config(config_path) + mysql_cfg = cfg["mysql"] + retry_cfg = cfg.get("retry", {}) if isinstance(cfg.get("retry"), dict) else {} + max_attempts = max(1, int(retry_cfg.get("max_attempts", 3))) + delay = max(0.0, float(retry_cfg.get("initial_delay_sec", 2.0))) + backoff = max(1.0, float(retry_cfg.get("backoff_factor", 2.0))) + read_timeout = int(mysql_cfg.get("read_timeout_sec", 600)) + + def _is_retryable_connect_error(exc: Exception) -> bool: + if pymysql is None: + return False + if isinstance(exc, pymysql.err.OperationalError): + code = exc.args[0] if exc.args else None + if code in (2003, 2006, 2013): + return True + msg = str(exc).lower() + return any(token in msg for token in ("lost connection", "can't connect", "timed out", "timeout")) + + for attempt in range(1, max_attempts + 1): + try: + # Do not pass database= on connect: this StarRocks endpoint drops + # auth when a default schema is selected; use fully-qualified table names in SQL. 
def qualify_table_name(
    table: str,
    catalog: Optional[str],
    database: str = "dws",
) -> str:
    """Resolve table to catalog.database.table for StarRocks Iceberg queries.

    A bare table name gets *database* (and *catalog*, when given) prepended; a
    ``db.table`` name gets *catalog* prepended when given. Names that already
    have three or more parts, or no usable part at all, are returned unchanged.
    """
    parts = [part.strip() for part in table.split(".") if part.strip()]
    if len(parts) == 1:
        prefix = f"{catalog}.{database}" if catalog else database
        return f"{prefix}.{parts[0]}"
    if len(parts) == 2 and catalog:
        db_name, table_name = parts
        return f"{catalog}.{db_name}.{table_name}"
    return table


def quote_identifier(identifier: str) -> str:
    """Backtick-quote each dot-separated part of *identifier*.

    Embedded backticks are doubled.

    Raises:
        ValueError: when *identifier* contains no non-blank part.
    """
    parts = [part.strip() for part in identifier.split(".") if part.strip()]
    if not parts:
        raise ValueError(f"Invalid identifier: {identifier!r}")
    return ".".join("`" + part.replace("`", "``") + "`" for part in parts)


def fetch_records(conn: Any, sql: str, params: Sequence[Any] = ()) -> List[Dict[str, Any]]:
    """Run *sql* with *params* and return the rows as column-name dicts.

    Returns ``[]`` when the statement produced no result set.
    """
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        description = cursor.description
        if description is None:
            return []
        names = [column[0] for column in description]
        return [dict(zip(names, row)) for row in cursor.fetchall()]


def normalize_json_like(value: Any) -> Any:
    """Decode bytes and parse strings that look like a JSON object or array.

    A string is parsed only when its stripped form starts with ``[`` or ``{``;
    if parsing fails, or for any other input, the (decoded) value is returned
    as is.
    """
    if isinstance(value, (bytes, bytearray)):
        value = value.decode("utf-8", errors="replace")
    if not isinstance(value, str):
        return value
    stripped = value.strip()
    if not stripped or stripped[0] not in "[{":
        return value
    try:
        return json.loads(stripped)
    except json.JSONDecodeError:
        return value


def canonicalize(value: Any) -> Any:
    """Convert *value* into a plain, comparable JSON-style structure.

    JSON-looking strings are parsed, ``Decimal`` becomes ``int`` (whole numbers)
    or ``float``, dates become ISO strings, dict keys are stringified and
    ordered, and containers are canonicalized recursively.
    """
    value = normalize_json_like(value)
    if isinstance(value, Decimal):
        # Infinity equals its own integral value, yet ``int(Infinity)`` raises
        # OverflowError; only finite decimals may take the integer path.
        if value.is_finite() and value == value.to_integral_value():
            return int(value)
        return float(value)
    if isinstance(value, (date, datetime)):
        return value.isoformat()
    if isinstance(value, dict):
        ordered = sorted(value.items(), key=lambda item: str(item[0]))
        return {str(key): canonicalize(item) for key, item in ordered}
    if isinstance(value, list):
        return [canonicalize(item) for item in value]
    return value


def comparable_record(record: Dict[str, Any], fields: Iterable[str]) -> Dict[str, Any]:
    """Project *record* onto *fields*, canonicalizing each value (missing -> ``None``)."""
    return {field: canonicalize(record.get(field)) for field in fields}


def _dt_clause(dt: Optional[str], params: List[Any]) -> str:
    """Return the ``dt`` filter fragment, appending *dt* to *params* when used."""
    if dt is None:
        return ""
    params.append(dt)
    return " AND `dt` = %s"


def _limit_clause(limit: Optional[int]) -> str:
    """Return a ``LIMIT`` fragment, or ``""`` when *limit* is ``None``."""
    if limit is None:
        return ""
    return f" LIMIT {int(limit)}"


def _doi_not_null_clause() -> str:
    """Return the fragment excluding NULL and empty ``doi`` values."""
    return " AND `doi` IS NOT NULL AND `doi` != ''"
def _doi_key_not_null_clause(alias: Optional[str] = None) -> str:
    """Return the fragment excluding NULL, empty and ``{}`` cleaned DOI keys."""
    expr = doi_key_expr(alias)
    conditions = ("IS NOT NULL", "!= ''", "!= '{}'")
    return "".join(f" AND {expr} {condition}" for condition in conditions)


def _hash_sample_predicate(mod_base: Optional[int], mod_max: Optional[int]) -> str:
    """Narrow scan on Iceberg dt partitions by cleaned DOI key.

    Returns ``""`` unless both *mod_base* and *mod_max* are set and *mod_max*
    is positive.
    """
    sampling_enabled = bool(mod_base) and bool(mod_max) and mod_max > 0
    if not sampling_enabled:
        return ""
    return f" AND (ABS(CRC32({doi_key_expr()})) MOD {int(mod_base)}) < {int(mod_max)}"


def _sample_order_clause(*, high_first: bool = False) -> str:
    """Return the ORDER BY expression: key CRC, optionally after ``source_count DESC``."""
    by_hash = f"CRC32({doi_key_expr()})"
    if not high_first:
        return by_hash
    return f"source_count DESC, {by_hash}"


def build_target_key_query(
    table: str,
    dt: Optional[str],
    limit: Optional[int],
    *,
    hash_mod_base: Optional[int] = None,
    hash_mod_max: Optional[int] = None,
) -> Tuple[str, List[Any]]:
    """Build the SQL and params selecting sample keys from *table* in CRC order."""
    params: List[Any] = []
    pieces = [
        f"SELECT {doi_key_expr()} AS sample_key FROM {quote_identifier(table)} ",
        "WHERE 1=1",
        _doi_key_not_null_clause(),
        _dt_clause(dt, params),
        _hash_sample_predicate(hash_mod_base, hash_mod_max),
        f" ORDER BY {_sample_order_clause()}{_limit_clause(limit)}",
    ]
    return "".join(pieces), params


def build_target_first_key_query(
    table: str,
    dt: Optional[str],
    limit: Optional[int],
) -> Tuple[str, List[Any]]:
    """Build the SQL and params selecting sample keys from *table* with no ordering."""
    params: List[Any] = []
    pieces = [
        f"SELECT {doi_key_expr()} AS sample_key FROM {quote_identifier(table)} ",
        "WHERE 1=1",
        _doi_key_not_null_clause(),
        _dt_clause(dt, params),
        _limit_clause(limit),
    ]
    return "".join(pieces), params
def build_duplicate_key_query(
    table: str,
    dt: Optional[str],
    limit: Optional[int],
    *,
    high_first: bool,
    hash_mod_base: Optional[int] = None,
    hash_mod_max: Optional[int] = None,
) -> Tuple[str, List[Any]]:
    """Build the SQL and params selecting keys that occur more than once in *table*."""
    params: List[Any] = []
    key_expr = doi_key_expr()
    pieces = [
        f"SELECT {key_expr} AS sample_key, COUNT(*) AS source_count FROM {quote_identifier(table)} ",
        "WHERE 1=1",
        _doi_key_not_null_clause(),
        _dt_clause(dt, params),
        _hash_sample_predicate(hash_mod_base, hash_mod_max),
        f" GROUP BY {key_expr} HAVING COUNT(*) > 1 ",
        f"ORDER BY {_sample_order_clause(high_first=high_first)}{_limit_clause(limit)}",
    ]
    return "".join(pieces), params


def build_field_conflict_key_query(
    table: str,
    dt: Optional[str],
    limit: Optional[int],
    *,
    hash_mod_base: Optional[int] = None,
    hash_mod_max: Optional[int] = None,
) -> Tuple[str, List[Any]]:
    """Build the SQL and params selecting duplicated keys with conflicting scalars.

    A key qualifies when it has more than one row and at least one of the
    listed scalar columns has more than one distinct value.
    """
    params: List[Any] = []
    conflict_columns = (
        "title",
        "abstract",
        "language",
        "published_year",
        "published_date",
        "venue_name",
        "venue_type",
        "access_is_oa",
        "access_oa_status",
        "citation_count",
        "reference_count",
        "fwci",
    )
    any_conflict = " OR ".join(
        f"COUNT(DISTINCT `{column}`) > 1" for column in conflict_columns
    )
    key_expr = doi_key_expr()
    pieces = [
        f"SELECT {key_expr} AS sample_key, COUNT(*) AS source_count FROM {quote_identifier(table)} ",
        "WHERE 1=1",
        _doi_key_not_null_clause(),
        _dt_clause(dt, params),
        _hash_sample_predicate(hash_mod_base, hash_mod_max),
        f" GROUP BY {key_expr} HAVING COUNT(*) > 1 AND ",
        f"({any_conflict}) ",
        f"ORDER BY {_sample_order_clause(high_first=True)}{_limit_clause(limit)}",
    ]
    return "".join(pieces), params
def _append_sample_key(
    keys: List[str],
    seen: set,
    key: str,
    *,
    sample_size: Optional[int],
) -> bool:
    """Record *key* if it is new and non-empty.

    Returns ``True`` only when the key was added and *keys* has now reached
    *sample_size*; duplicates, empty keys and an unlimited sample return ``False``.
    """
    if key and key not in seen:
        seen.add(key)
        keys.append(key)
        return sample_size is not None and len(keys) >= sample_size
    return False


def fetch_sample_keys(
    conn: Any,
    *,
    source_table: str,
    target_table: str,
    dt: Optional[str],
    sample_mode: str,
    sample_size: Optional[int],
    hash_mod_base: Optional[int] = None,
    hash_mod_max: Optional[int] = None,
) -> List[str]:
    """Collect unique sample DOI keys according to *sample_mode*.

    Builds one or more sampling queries, runs them in order and accumulates
    normalized keys, stopping as soon as *sample_size* keys were gathered.

    Raises:
        ValueError: when *sample_mode* is not a supported mode.
    """
    hash_kw = {"hash_mod_base": hash_mod_base, "hash_mod_max": hash_mod_max}

    def bucket_query(bucket: str, quota: Optional[int]) -> Tuple[str, List[Any]]:
        return build_count_bucket_key_query(source_table, dt, quota, bucket=bucket, **hash_kw)

    query_plan: List[Tuple[str, Tuple[str, List[Any]]]]
    if sample_mode == "target-first":
        query_plan = [
            ("target-first", build_target_first_key_query(target_table, dt, sample_size)),
        ]
    elif sample_mode == "target-random":
        query_plan = [
            ("target-random", build_target_key_query(target_table, dt, sample_size, **hash_kw)),
        ]
    elif sample_mode == "count-buckets":
        # Split the requested sample evenly across the three count buckets.
        per_bucket = None if sample_size is None else max(1, sample_size // 3)
        query_plan = [
            ("count=1", bucket_query("one", per_bucket)),
            ("count=2", bucket_query("two", per_bucket)),
            ("count>2", bucket_query("multi", per_bucket)),
        ]
    elif sample_mode == "mixed":
        # Six sources share the requested sample size.
        per_bucket = None if sample_size is None else max(1, sample_size // 6)
        query_plan = [
            ("count=1", bucket_query("one", per_bucket)),
            ("count=2", bucket_query("two", per_bucket)),
            ("count>2", bucket_query("multi", per_bucket)),
            (
                "field-conflict",
                build_field_conflict_key_query(source_table, dt, per_bucket, **hash_kw),
            ),
            (
                "high-duplicate",
                build_duplicate_key_query(source_table, dt, per_bucket, high_first=True, **hash_kw),
            ),
            ("target-random", build_random_key_query(target_table, dt, per_bucket, **hash_kw)),
        ]
    else:
        raise ValueError(f"Unsupported sample_mode: {sample_mode}")

    keys: List[str] = []
    seen: set = set()
    total = len(query_plan)

    for position, (label, (sql, params)) in enumerate(query_plan, start=1):
        tag = f"[info] 抽样 SQL {position}/{total} [{label}]"
        _log(f"{tag} 开始执行" f"(dt={dt!r}, mode={sample_mode})…")
        started = time.monotonic()
        quota_reached = False
        for row in fetch_records(conn, sql, params):
            candidate = normalize_doi(row.get("sample_key"))
            if _append_sample_key(keys, seen, candidate, sample_size=sample_size):
                quota_reached = True
                break
        elapsed = time.monotonic() - started
        if quota_reached:
            _log(f"{tag} 完成," f"耗时 {elapsed:.1f}s,已收集 {len(keys)} 个 key")
            return keys
        _log(f"{tag} 完成," f"耗时 {elapsed:.1f}s,当前共 {len(keys)} 个 key")
    return keys
def build_source_query(table: str, doi: Any, dt: Optional[str]) -> Tuple[str, List[Any]]:
    """Build the SQL and params fetching every source row for one DOI key."""
    params: List[Any] = [] if dt is None else [dt]
    params.append(normalize_doi(doi))
    dt_sql = "" if dt is None else " AND `dt` = %s"
    sql = f"SELECT * FROM {quote_identifier(table)} WHERE 1=1{dt_sql} AND {doi_key_expr()} = %s"
    return sql, params


def _build_batch_query(
    table: str,
    sample_keys: Sequence[str],
    dt: Optional[str],
    alias: str,
) -> Tuple[str, List[Any]]:
    """Build a batch lookup joining *table* (as *alias*) to an inline key list."""
    if not sample_keys:
        raise ValueError("sample_keys must not be empty")

    # One placeholder row per key; the keys themselves travel as parameters.
    key_rows = " UNION ALL ".join(["SELECT %s AS sample_key"] * len(sample_keys))
    params: List[Any] = [normalize_doi(key) for key in sample_keys]
    dt_sql = ""
    if dt is not None:
        params.append(dt)
        dt_sql = f" AND {alias}.`dt` = %s"

    sql = (
        f"WITH sample_keys AS ({key_rows}) "
        f"SELECT {alias}.* FROM {quote_identifier(table)} {alias} "
        f"JOIN sample_keys k ON {doi_key_expr(alias)} = k.sample_key "
        f"WHERE 1=1{dt_sql}"
    )
    return sql, params


def build_source_batch_query(
    table: str,
    sample_keys: Sequence[str],
    dt: Optional[str],
) -> Tuple[str, List[Any]]:
    """Build the SQL and params fetching source rows for all *sample_keys*.

    Raises:
        ValueError: when *sample_keys* is empty.
    """
    return _build_batch_query(table, sample_keys, dt, "s")


def build_target_batch_query(
    table: str,
    sample_keys: Sequence[str],
    dt: Optional[str],
) -> Tuple[str, List[Any]]:
    """Build the SQL and params fetching target rows for all *sample_keys*.

    Raises:
        ValueError: when *sample_keys* is empty.
    """
    return _build_batch_query(table, sample_keys, dt, "t")
Any]]] = {} + for row in rows: + key = normalize_doi(row.get("doi")) + if not key: + continue + grouped.setdefault(key, []).append(row) + return grouped + + +def _parse_classifications(raw: Any) -> Optional[Dict[str, Any]]: + if raw is None: + return None + if isinstance(raw, str): + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return None + if isinstance(parsed, dict): + return parsed + return None + if isinstance(raw, dict): + return raw + return None + + +def validate_classifications( + source_records: List[Dict[str, Any]], + target_row: Dict[str, Any], +) -> Dict[str, Dict[str, Any]]: + """Validate classifications field with random-pick semantics. + + CSV rules: + - classifications.mesh: randomly pick one non-empty classifications.mesh value; null if all empty + - msc_class, acm_class, arxiv_category: pick from arxiv records only + """ + actual_cls = _parse_classifications(normalize_json_like(target_row.get("classifications"))) + mismatches: Dict[str, Dict[str, Any]] = {} + + mesh_candidates: List[str] = [] + arxiv_sub_candidates: Dict[str, List[str]] = { + "msc_class": [], "acm_class": [], "arxiv_category": [], + } + + def candidate_expectation(candidates: Iterable[str]) -> Any: + values: List[Any] = [] + for raw in sorted(set(candidates)): + try: + values.append(json.loads(raw)) + except json.JSONDecodeError: + values.append(raw) + if len(values) == 1: + return values[0] + return {"any_of": values} + + for r in source_records: + c = _parse_classifications(r.get("classifications")) + if c is not None and is_non_empty(c.get("mesh")): + mesh_candidates.append(canonical_json(canonicalize(c["mesh"]))) + origin = normalize_origin_osi(r.get("origin_osi")) + if origin == "arxiv": + if c is None: + continue + for sub in arxiv_sub_candidates: + if is_non_empty(c.get(sub)): + arxiv_sub_candidates[sub].append(canonical_json(canonicalize(c[sub]))) + + if actual_cls is None: + has_any_cls = any(_parse_classifications(r.get("classifications")) is not 
None for r in source_records) + if has_any_cls: + mismatches["classifications"] = {"expected": "non-null struct", "actual": None} + return mismatches + + actual_mesh = actual_cls.get("mesh") + if mesh_candidates: + expected_mesh = candidate_expectation(mesh_candidates) + if not is_non_empty(actual_mesh): + mismatches["classifications.mesh"] = { + "expected": expected_mesh, + "actual": actual_mesh, + } + elif canonical_json(canonicalize(actual_mesh)) not in set(mesh_candidates): + mismatches["classifications.mesh"] = { + "expected": expected_mesh, + "actual": actual_mesh, + } + else: + if is_non_empty(actual_mesh): + mismatches["classifications.mesh"] = {"expected": None, "actual": actual_mesh} + + for sub, candidates in arxiv_sub_candidates.items(): + actual_sub = actual_cls.get(sub) + unique_candidates = set(candidates) + if unique_candidates: + expected_sub = candidate_expectation(unique_candidates) + if not is_non_empty(actual_sub): + mismatches[f"classifications.{sub}"] = { + "expected": expected_sub, + "actual": actual_sub, + } + elif canonical_json(canonicalize(actual_sub)) not in unique_candidates: + mismatches[f"classifications.{sub}"] = { + "expected": expected_sub, + "actual": actual_sub, + } + else: + if is_non_empty(actual_sub): + mismatches[f"classifications.{sub}"] = { + "expected": "empty (no arxiv source)", + "actual": actual_sub, + } + + return mismatches + + +def normalize_order_insensitive_value(value: Any) -> Any: + value = canonicalize(value) + if isinstance(value, list): + return sorted(value, key=canonical_json) + return value + + +def normalize_mesh_empty_values(value: Any) -> Any: + value = canonicalize(value) + if isinstance(value, dict): + return { + key: normalize_mesh_empty_values(None if val == "" else val) + for key, val in value.items() + } + if isinstance(value, list): + return [normalize_mesh_empty_values(item) for item in value] + return None if value == "" else value + + +def normalize_empty_for_compare(value: Any, data_type: 
def validate_dt_partitions(
    conn: Any,
    source_table: str,
    target_table: str,
    dt: Optional[str],
    *,
    skip_source_distinct: bool = False,
    count_mode: str = "hash-buckets",
    count_buckets: int = 100,
) -> Dict[str, Any]:
    """Check dt partition coverage and key counts between source and target.

    Args:
        conn: DB connection handed to ``fetch_records``.
        source_table: Source table name (quoted via ``quote_identifier``).
        target_table: Target table name (quoted via ``quote_identifier``).
        dt: Partition to check; ``None`` applies no ``dt`` filter.
        skip_source_distinct: When true, forces ``count_mode`` to ``"skip"``.
        count_mode: ``"exact"`` runs one ``COUNT(DISTINCT key)`` grouped by
            ``dt``; ``"hash-buckets"`` sums per-bucket distinct counts, where a
            bucket is ``ABS(CRC32(key)) MOD count_buckets``, and also counts
            joined source/target keys per bucket; ``"skip"`` does not query
            the source at all.
        count_buckets: Number of hash buckets for ``"hash-buckets"``.

    Returns:
        A dict with partition counts, ``missing_in_target``,
        ``extra_in_target``, ``count_mismatches``, the per-bucket source counts
        and failures, and (hash-bucket mode only) the matched-key statistics.
        When the source count is skipped, ``count_mismatches`` and
        ``extra_in_target`` are empty because there is nothing to compare
        the target against.

    Raises:
        ValueError: For an unsupported ``count_mode``, for ``"hash-buckets"``
            without ``dt``, or for a non-positive ``count_buckets``.
    """
    params: List[Any] = []
    dt_filter = _dt_clause(dt, params)

    src_map: Dict[str, int] = {}
    bucket_counts: Dict[str, List[Dict[str, Any]]] = {}
    failed_buckets: List[Dict[str, Any]] = []
    matched_key_count: Optional[int] = None
    key_gap_failed = False
    if skip_source_distinct:
        count_mode = "skip"

    if count_mode == "exact":
        key_expr = doi_key_expr()
        src_sql = (
            f"SELECT `dt`, COUNT(DISTINCT {key_expr}) AS key_count"
            f" FROM {quote_identifier(source_table)}"
            f" WHERE 1=1{_doi_key_not_null_clause()}{dt_filter} GROUP BY `dt` ORDER BY `dt`"
        )
        src_rows = fetch_records(conn, src_sql, params)
        src_map = {str(r["dt"]): int(r["key_count"]) for r in src_rows}
    elif count_mode == "hash-buckets":
        if dt is None:
            raise ValueError("--count-mode hash-buckets requires --dt")
        if count_buckets <= 0:
            raise ValueError("--count-buckets must be positive")
        src_map[str(dt)] = 0
        matched_key_count = 0
        bucket_counts[str(dt)] = []
        source_key_expr = doi_key_expr()
        target_key_expr_t = doi_key_expr("t")
        for bucket in range(count_buckets):
            bucket_params: List[Any] = [dt, bucket]
            bucket_sql = (
                f"SELECT COUNT(DISTINCT {source_key_expr}) AS key_count"
                f" FROM {quote_identifier(source_table)}"
                " WHERE 1=1"
                f"{_doi_key_not_null_clause()}"
                " AND `dt` = %s"
                f" AND (ABS(CRC32({source_key_expr})) MOD {int(count_buckets)}) = %s"
            )
            _log(
                f"[info] source distinct hash bucket {bucket + 1}/{count_buckets} "
                f"开始执行(dt={dt!r})…"
            )
            t0 = time.monotonic()
            # Best effort per bucket: a failing bucket is recorded and the
            # remaining buckets still run.
            try:
                rows = fetch_records(conn, bucket_sql, bucket_params)
                row = rows[0] if rows else None
                key_count = int(row.get("key_count") or 0) if row else 0
                src_map[str(dt)] += key_count
                bucket_counts[str(dt)].append({"bucket": bucket, "key_count": key_count})
                _log(
                    f"[info] source distinct hash bucket {bucket + 1}/{count_buckets} "
                    f"完成,耗时 {time.monotonic() - t0:.1f}s,key_count={key_count}"
                )
                join_sql = (
                    "SELECT COUNT(*) AS key_count"
                    " FROM ("
                    f" SELECT DISTINCT {source_key_expr} AS doi_key"
                    f" FROM {quote_identifier(source_table)}"
                    " WHERE 1=1"
                    f"{_doi_key_not_null_clause()}"
                    " AND `dt` = %s"
                    f" AND (ABS(CRC32({source_key_expr})) MOD {int(count_buckets)}) = %s"
                    " ) s"
                    f" JOIN {quote_identifier(target_table)} t"
                    f" ON t.`dt` = %s AND {target_key_expr_t} = s.doi_key"
                )
                join_t0 = time.monotonic()
                join_rows = fetch_records(conn, join_sql, [dt, bucket, dt])
                join_row = join_rows[0] if join_rows else None
                joined_count = int(join_row.get("key_count") or 0) if join_row else 0
                matched_key_count += joined_count
                _log(
                    f"[info] matched key hash bucket {bucket + 1}/{count_buckets} "
                    f"完成,耗时 {time.monotonic() - join_t0:.1f}s,key_count={joined_count}"
                )
            except Exception as exc:
                key_gap_failed = True
                failed_buckets.append({"dt": str(dt), "bucket": bucket, "error": str(exc)})
                _log(
                    f"[warn] source distinct hash bucket {bucket + 1}/{count_buckets} "
                    f"失败,耗时 {time.monotonic() - t0:.1f}s:{exc}"
                )
    elif count_mode == "skip":
        pass
    else:
        raise ValueError(f"Unsupported count_mode: {count_mode}")

    tgt_sql = (
        f"SELECT `dt`, COUNT(*) AS row_count"
        f" FROM {quote_identifier(target_table)}"
        f" WHERE 1=1{dt_filter} GROUP BY `dt` ORDER BY `dt`"
    )
    tgt_rows = fetch_records(conn, tgt_sql, params)
    tgt_map = {str(r["dt"]): int(r["row_count"]) for r in tgt_rows}

    # In skip mode src_map is empty by construction. Comparing against it would
    # flag every target partition as a mismatch (source count None) and as
    # "extra", so the comparison is only meaningful when the source was counted.
    source_counted = count_mode != "skip"

    mismatches: List[Dict[str, Any]] = []
    if source_counted:
        for d in sorted(set(src_map) | set(tgt_map)):
            src_cnt = src_map.get(d)
            tgt_cnt = tgt_map.get(d)
            if src_cnt != tgt_cnt:
                mismatches.append({
                    "dt": d,
                    "source_key_count": src_cnt,
                    "target_row_count": tgt_cnt,
                })

    result = {
        "source_dt_count": len(src_map),
        "target_dt_count": len(tgt_map),
        "missing_in_target": sorted(set(src_map) - set(tgt_map)),
        "extra_in_target": sorted(set(tgt_map) - set(src_map)) if source_counted else [],
        "count_mismatches": mismatches,
        "source_distinct_skipped": count_mode == "skip",
        "source_count_mode": count_mode,
        "source_count_buckets": count_buckets if count_mode == "hash-buckets" else None,
        "source_bucket_counts": bucket_counts,
        "source_failed_buckets": failed_buckets,
    }
    if count_mode == "hash-buckets" and dt is not None and matched_key_count is not None:
        target_count = tgt_map.get(str(dt))
        source_count = src_map.get(str(dt))
        result["matched_key_count"] = matched_key_count
        result["key_gap_failed"] = key_gap_failed
        # Gap figures are only reported when every bucket succeeded; partial
        # sums would understate the source side.
        if not key_gap_failed and source_count is not None and target_count is not None:
            result["source_missing_in_target_key_count"] = max(source_count - matched_key_count, 0)
            result["target_extra_key_count"] = max(target_count - matched_key_count, 0)
    return result
f"hash_sample={'on' if hash_enabled else 'off'}, " + f"skip_dt_check={skip_dt_check}, count_mode={count_mode}, " + f"source={source_table}, target={target_table}" + ) + with connect_starrocks(config_path) as conn: + _log("[info] StarRocks 连接成功") + if dt is not None: + dt_list = [dt] + else: + _log("[info] 正在发现源表 dt 分区…") + dt_list = discover_dt_values(conn, source_table) + _log(f"[info] 自动发现 {len(dt_list)} 个 dt 分区,逐分区验证") + + if skip_dt_check: + dt_check = {"skipped": True} + _log("[info] 跳过分区行数统计(--skip-dt-check)") + else: + _log("[info] 正在统计目标分区行数(源表 DISTINCT 可较慢,可用 --skip-source-distinct 跳过)…") + t0 = time.monotonic() + dt_check = validate_dt_partitions( + conn, + source_table, + target_table, + dt, + skip_source_distinct=skip_source_distinct, + count_mode=count_mode, + count_buckets=count_buckets, + ) + _log(f"[info] 分区统计完成,耗时 {time.monotonic() - t0:.1f}s") + + checked = passed = failed = missing_source = missing_target = 0 + source_count_buckets = {"one": 0, "two": 0, "multi": 0} + mismatch_rows: List[Dict[str, Any]] = [] + + for partition_dt in dt_list: + _log(f"[info] 分区 {partition_dt}:开始抽样 key…") + sample_keys = fetch_sample_keys( + conn, + source_table=source_table, + target_table=target_table, + dt=partition_dt, + sample_mode=sample_mode, + sample_size=limit, + hash_mod_base=hash_mod_base if hash_enabled else None, + hash_mod_max=hash_mod_max if hash_enabled else None, + ) + _log(f"[info] 分区 {partition_dt}:抽到 {len(sample_keys)} 个 DOI,开始批量拉取源/目标记录…") + t0 = time.monotonic() + source_rows_by_key: Dict[str, List[Dict[str, Any]]] = {} + target_rows_by_key: Dict[str, List[Dict[str, Any]]] = {} + if sample_keys: + source_sql, source_params = build_source_batch_query(source_table, sample_keys, partition_dt) + source_rows_by_key = group_rows_by_doi(fetch_records(conn, source_sql, source_params)) + target_sql, target_params = build_target_batch_query(target_table, sample_keys, partition_dt) + target_rows_by_key = group_rows_by_doi(fetch_records(conn, target_sql, 
target_params)) + _log( + f"[info] 分区 {partition_dt}:批量拉取完成,耗时 {time.monotonic() - t0:.1f}s," + f"源命中 {len(source_rows_by_key)}/{len(sample_keys)}," + f"目标命中 {len(target_rows_by_key)}/{len(sample_keys)}" + ) + _log(f"[info] 分区 {partition_dt}:开始逐条比对…") + + for doi in sample_keys: + sample_key = normalize_doi(doi) + target_rows = target_rows_by_key.get(sample_key, []) + source_rows = source_rows_by_key.get(sample_key, []) + checked += 1 + if checked == 1 or checked % 20 == 0: + _log(f"[info] 分区 {partition_dt}:已比对 {checked}/{len(sample_keys)} 条") + + if len(source_rows) == 1: + source_count_buckets["one"] += 1 + elif len(source_rows) == 2: + source_count_buckets["two"] += 1 + elif len(source_rows) > 2: + source_count_buckets["multi"] += 1 + + if not target_rows: + missing_target += 1 + mismatch_rows.append({ + "key": doi, + "dt": partition_dt, + "status": "missing_target", + "source_count": len(source_rows), + "source_records": [ + {key: normalize_json_like(value) for key, value in row.items()} + for row in source_rows + ], + "mismatches": {}, + }) + continue + if not source_rows: + missing_source += 1 + mismatch_rows.append({ + "key": doi, + "dt": partition_dt, + "status": "missing_source", + "source_count": 0, + "target_records": [ + {key: normalize_json_like(value) for key, value in row.items()} + for row in target_rows + ], + "mismatches": {}, + }) + continue + + target_row = target_rows[0] + normalized_source = [{key: normalize_json_like(value) for key, value in row.items()} for row in source_rows] + aggregated = aggregate_group(normalized_source, rules) + expected = comparable_record(aggregated, output_fields) + actual = comparable_record(target_row, output_fields) + mismatches = compare_records(expected, actual, order_insensitive_fields, field_types) + if has_cls: + cls_mismatches = validate_classifications(normalized_source, target_row) + mismatches.update(cls_mismatches) + if mismatches: + failed += 1 + mismatch_rows.append( + { + "key": doi, + "dt": 
partition_dt, + "status": "field_mismatch", + "source_count": len(source_rows), + "mismatches": mismatches, + } + ) + else: + passed += 1 + + if report_path is not None: + report_path.parent.mkdir(parents=True, exist_ok=True) + with report_path.open("w", encoding="utf-8") as f: + for row in mismatch_rows: + f.write(json.dumps(localize_report_keys(row), ensure_ascii=False, cls=JsonEncoder) + "\n") + (report_path.parent / "source_field_warning.jsonl").write_text("", encoding="utf-8") + + result = { + "status": "ok", + "kind": "paper", + "source_table": source_table, + "target_table": target_table, + "key_field": "doi", + "dt": dt, + "validated_partitions": dt_list, + "sample_mode": sample_mode, + "sample_size": limit, + "dt_check": dt_check, + "checked": checked, + "passed": passed, + "failed": failed, + "missing_source": missing_source, + "missing_target": missing_target, + "source_count_buckets": source_count_buckets, + "report_path": str(report_path) if report_path is not None else None, + "sample_mismatches": mismatch_rows[:5], + } + if report_path is not None: + write_report_summary(report_path, result, mismatch_rows) + print(json.dumps(result, ensure_ascii=False, cls=JsonEncoder)) + return result + + +# ---- CLI ---- + + +def cli() -> None: + config_parser = argparse.ArgumentParser(add_help=False) + config_parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH) + config_args, _ = config_parser.parse_known_args() + cfg = load_config(config_args.config) if config_args.config.exists() else {} + paper_cfg = cfg.get("unique_paper", {}) + + default_csv = paper_cfg.get("mapping_csv") + if default_csv: + default_csv = PROJECT_ROOT / default_csv + else: + default_csv = DEFAULT_MAPPING_CSV + + parser = argparse.ArgumentParser(description="Validate meta_paper unique DB table by DOI.") + parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH, help="shared settings JSON path") + parser.add_argument("--mapping-csv", type=Path, 
default=default_csv, help="field mapping CSV") + parser.add_argument("--source-table", default=paper_cfg.get("source_table", DEFAULT_SOURCE_TABLE)) + parser.add_argument("--target-table", default=paper_cfg.get("target_table", DEFAULT_TARGET_TABLE)) + parser.add_argument("--dt", default=paper_cfg.get("dt"), help="dt partition filter") + parser.add_argument("--limit", type=int, default=int(paper_cfg.get("limit", 600))) + parser.add_argument( + "--sample-mode", + choices=("count-buckets", "mixed", "target-random", "target-first"), + default=paper_cfg.get("sample_mode", "count-buckets"), + help="count-buckets: 1/2/N 源行分桶;mixed: 加深抽样;target-random: 目标表稳定排序抽样;target-first: 目标表 LIMIT 抽样(smoke 最快)", + ) + parser.add_argument("--full", action="store_true", help="validate all target rows") + parser.add_argument("--skip-dt-check", action="store_true", default=bool(paper_cfg.get("skip_dt_check"))) + parser.add_argument( + "--skip-source-distinct", + action="store_true", + default=bool(paper_cfg.get("skip_source_distinct")), + help="dt 统计时跳过源表 COUNT(DISTINCT doi),等价于 --count-mode skip", + ) + parser.add_argument( + "--count-mode", + choices=("exact", "skip", "hash-buckets"), + default=paper_cfg.get("count_mode", "hash-buckets"), + help="源表 distinct DOI 计数模式:exact 单条 COUNT(DISTINCT),hash-buckets 分桶精确统计,skip 跳过", + ) + parser.add_argument( + "--count-buckets", + type=int, + default=int(paper_cfg.get("count_buckets", 100)), + help="--count-mode hash-buckets 时的 hash 分桶数", + ) + parser.add_argument( + "--no-sample-hash", + action="store_true", + help="关闭 CRC32 哈希预过滤(默认 mod 100 取 2,约 2%% 子集)", + ) + parser.add_argument( + "--sample-hash-mod-base", + type=int, + default=int(paper_cfg.get("sample_hash_mod_base", 100)), + ) + parser.add_argument( + "--sample-hash-mod-max", + type=int, + default=int(paper_cfg.get("sample_hash_mod_max", 2)), + ) + parser.add_argument("--report", type=Path, default=paper_cfg.get("report_path"), help="JSONL report path") + args = parser.parse_args() + + 
hash_mod_base = None if args.no_sample_hash else args.sample_hash_mod_base + hash_mod_max = None if args.no_sample_hash else args.sample_hash_mod_max + count_mode = "skip" if args.skip_source_distinct else args.count_mode + report_path = Path(args.report) if args.report else default_report_path( + args.dt, + "count-buckets" if args.full else args.sample_mode, + args.full, + ) + + validate_db( + config_path=args.config, + source_table=args.source_table, + target_table=args.target_table, + dt=args.dt, + limit=None if args.full else args.limit, + sample_mode="count-buckets" if args.full else args.sample_mode, + report_path=report_path, + mapping_csv=args.mapping_csv, + skip_dt_check=args.skip_dt_check, + skip_source_distinct=args.skip_source_distinct, + count_mode=count_mode, + count_buckets=args.count_buckets, + hash_mod_base=hash_mod_base, + hash_mod_max=hash_mod_max, + ) + + +from dingo.config.input_args import EvaluatorRuleArgs +from dingo.io.input import Data, RequiredField +from dingo.io.output.eval_detail import EvalDetail, QualityLabel +from dingo.model.model import Model +from dingo.model.rule.base import BaseRule +from dingo.model.rule.scibase.report_utils import bool_param, int_param, write_temp_settings + + +@Model.rule_register( + "QUALITY_BAD_EFFECTIVENESS", + ["sci_base_qa_test", "meta_paper_unique"], +) +class RuleSciBaseMetaPaperUniqueReport(BaseRule): + _metric_info = { + "category": "Rule-Based Metadata Quality Metrics", + "quality_dimension": "EFFECTIVENESS", + "metric_name": "RuleSciBaseMetaPaperUniqueReport", + "description": "Run SciBase paper DOI unique DB validation and write reports.", + "paper_title": "", + "paper_url": "", + "paper_authors": "", + "evaluation_results": "", + } + + _required_fields = [RequiredField.METADATA] + dynamic_config = EvaluatorRuleArgs(parameters={}) + + @classmethod + def eval(cls, input_data: Data) -> EvalDetail: + del input_data + params = cls.dynamic_config.parameters or {} + full = bool_param(params, "full", 
False) + sample_mode = str(params.get("sample_mode") or "count-buckets") + dt = params.get("dt") + report_path = Path(params["report_path"]) if params.get("report_path") else None + if report_path is None and params.get("output_dir"): + report_path = Path(str(params["output_dir"])) / "source_field_mismatch.jsonl" + if report_path is None: + report_path = default_report_path(dt, "count-buckets" if full else sample_mode, full) + + config_path = write_temp_settings(params) + count_mode = "skip" if bool_param(params, "skip_source_distinct", False) else str(params.get("count_mode") or "hash-buckets") + result = validate_db( + config_path=config_path, + source_table=str(params.get("source_table") or DEFAULT_SOURCE_TABLE), + target_table=str(params.get("target_table") or DEFAULT_TARGET_TABLE), + dt=dt, + limit=None if full else int_param(params, "limit", 600), + sample_mode="count-buckets" if full else sample_mode, + report_path=report_path, + mapping_csv=Path(str(params.get("mapping_csv") or DEFAULT_MAPPING_CSV)), + skip_dt_check=bool_param(params, "skip_dt_check", False), + skip_source_distinct=bool_param(params, "skip_source_distinct", False), + count_mode=count_mode, + count_buckets=int_param(params, "count_buckets", 100), + hash_mod_base=None if bool_param(params, "no_sample_hash", False) else int_param(params, "sample_hash_mod_base", 100), + hash_mod_max=None if bool_param(params, "no_sample_hash", False) else int_param(params, "sample_hash_mod_max", 2), + ) + bad = any( + int(result.get(key) or 0) > 0 + for key in ("failed", "missing_source", "missing_target") + ) + count_mismatches = ((result.get("dt_check") or {}).get("count_mismatches") or []) + bad = bad or bool(count_mismatches) + reason = [str(report_path.parent), f"checked={result.get('checked')}", f"failed={result.get('failed')}"] + if bad: + return EvalDetail( + metric=cls.__name__, + status=True, + label=[f"{cls.metric_type}.{cls.__name__}"], + reason=reason, + ) + return EvalDetail(metric=cls.__name__, 
label=[QualityLabel.QUALITY_GOOD], reason=reason) + + +if __name__ == "__main__": + cli() diff --git a/dingo/model/rule/scibase/meta_patent_parsed_info.py b/dingo/model/rule/scibase/meta_patent_parsed_info.py new file mode 100644 index 00000000..76f3b00f --- /dev/null +++ b/dingo/model/rule/scibase/meta_patent_parsed_info.py @@ -0,0 +1,1720 @@ +#!/usr/bin/env python3 +"""Validate parsed patent fields against the raw XML stored in `content`. + +Field extraction rules are driven by ../doc/patent_mapping.csv. The script is +intentionally conservative: fields with a confident XML extractor are compared; +metadata/library fields and unsupported free-form rules are reported as skipped. +""" +from __future__ import annotations + +import argparse +import csv +import json +import re +import sys +import time +import xml.etree.ElementTree as ET +from collections import Counter +from dataclasses import dataclass +from datetime import date, datetime +from decimal import Decimal +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple + +try: + import pymysql +except ImportError: # pragma: no cover - runtime dependency check + pymysql = None # type: ignore + + +PROJECT_ROOT = Path(__file__).resolve().parent +ASSETS_DIR = PROJECT_ROOT / "assets" +DEFAULT_CONFIG_PATH = Path("sci_base_qa_test_config.json") +TEMPLATE_CONFIG_PATH = ASSETS_DIR / "settings.template.json" +DEFAULT_MAPPING_CSV = ASSETS_DIR / "patent_mapping.csv" +REPORT_ROOT = Path("report") +DEFAULT_TABLE = "test.iceberg_test_patent_parsed_info_acc_d" +DEFAULT_XML_FIELD = "xml_content" + +LIBRARY_MODULE = "库信息" +SKIP_FIELDS = { + "content", # table content is processed full text; raw XML lives in xml_content for this table. 
class JsonEncoder(json.JSONEncoder):
    """JSON encoder that also serializes ``Decimal``, ``date`` and ``datetime``.

    * Finite ``Decimal`` values with no fractional part become ``int``;
      other finite values become ``float``.
    * Non-finite ``Decimal`` values (infinities, NaN) become ``float`` so that
      ``json`` renders them the same way it renders non-finite floats.
    * ``date`` / ``datetime`` values become their ``isoformat()`` string.

    Anything else is delegated to ``json.JSONEncoder.default``, which raises
    ``TypeError``.
    """

    def default(self, obj: Any) -> Any:
        if isinstance(obj, Decimal):
            if not obj.is_finite():
                # An infinite Decimal equals its own to_integral_value(), so
                # the integer branch below would call int() on it and raise
                # OverflowError.
                return float(obj)
            if obj == obj.to_integral_value():
                return int(obj)
            return float(obj)
        if isinstance(obj, (date, datetime)):
            return obj.isoformat()
        return super().default(obj)
def qualify_table_name(table: str, catalog: Optional[str], database: str = "dws") -> str:
    """Expand ``table`` to ``[catalog.]database.table`` form.

    The name is split on ``.``; segments are stripped and empty ones dropped.
    A bare table name gets ``database`` prepended, and a one- or two-part name
    gets ``catalog`` prepended when a catalog is given. Names that already
    have three or more parts are left fully qualified as they are.

    The result is built from the cleaned segments, so stray whitespace or
    empty segments in the input do not leak into the returned name.

    Args:
        table: Possibly partially qualified table name.
        catalog: Catalog to prepend, or ``None``/empty for none.
        database: Database used when ``table`` has no database part.

    Returns:
        The qualified name, or ``table`` unchanged when it has no usable
        segment at all.
    """
    parts = [part.strip() for part in table.split(".") if part.strip()]
    if not parts:
        return table
    if len(parts) == 1:
        parts.insert(0, database)
    if len(parts) == 2 and catalog:
        parts.insert(0, catalog)
    return ".".join(parts)


def quote_identifier(identifier: str) -> str:
    """Backtick-quote every dotted segment of ``identifier``.

    Segments are stripped, empty segments are dropped, and embedded backticks
    are doubled.

    Raises:
        ValueError: If ``identifier`` contains no non-empty segment.
    """
    parts = [part.strip() for part in identifier.split(".") if part.strip()]
    if not parts:
        raise ValueError(f"Invalid identifier: {identifier!r}")
    return ".".join(f"`{part.replace('`', '``')}`" for part in parts)
+def fetch_records(conn: Any, sql: str, params: Sequence[Any] = ()) -> List[Dict[str, Any]]: + with conn.cursor() as cursor: + cursor.execute(sql, params) + if cursor.description is None: + return [] + cols = [field[0] for field in cursor.description] + return [dict(zip(cols, row)) for row in cursor.fetchall()] + + +def load_patent_rules(path: Path) -> List[PatentRule]: + rules: List[PatentRule] = [] + with path.open(encoding="utf-8-sig", newline="") as f: + reader = csv.DictReader(f) + required = {"预期字段名", "xml映射字段", "数据类型", "字段描述", "有效性规则", "可空", "模块"} + missing = required - set(reader.fieldnames or []) + if missing: + raise ValueError(f"映射文件 {path} 缺少列: {', '.join(sorted(missing))}") + for row in reader: + field_name = clean_header_value(row.get("预期字段名")) + if not field_name: + continue + rules.append( + PatentRule( + field_name=field_name, + xml_mapping=clean_header_value(row.get("xml映射字段")), + data_type=clean_header_value(row.get("数据类型")), + description=clean_header_value(row.get("字段描述")), + validation_rule=clean_header_value(row.get("有效性规则")), + nullable=clean_header_value(row.get("可空")), + module=clean_header_value(row.get("模块")), + ) + ) + return rules + + +def clean_header_value(value: Any) -> str: + if value is None: + return "" + return str(value).strip().strip('"').strip() + + +def safe_filename_token(value: Optional[Any]) -> str: + text = "all" if value in (None, "") else str(value) + return re.sub(r"[^0-9A-Za-z_-]+", "_", text).strip("_") or "all" + + +def normalize_sample_mode(value: Any) -> str: + text = str(value or SAMPLE_MODE_BRANCH_COVERAGE).strip() + normalized = SAMPLE_MODE_ALIASES.get(text.lower()) or SAMPLE_MODE_ALIASES.get(text) + if normalized is None: + raise ValueError( + f"Unsupported sample_mode: {value!r}. " + f"Use {SAMPLE_MODE_RANDOM!r} or {SAMPLE_MODE_BRANCH_COVERAGE!r}." 
def local_name(tag: str) -> str:
    """Return ``tag`` without its ``{namespace}`` or ``prefix:`` qualifier."""
    return tag.rsplit("}", 1)[-1] if "}" in tag else tag.split(":", 1)[-1]


def norm_name(name: str) -> str:
    """Lower-case ``name`` and drop every character outside ``[a-z0-9]``."""
    return re.sub(r"[^a-z0-9]", "", name.lower())


def node_name(node: ET.Element) -> str:
    """Return the normalized local tag name of ``node``."""
    return norm_name(local_name(node.tag))


def text_content(node: Optional[ET.Element]) -> str:
    """Return all text under ``node`` joined by single spaces ("" for ``None``)."""
    if node is None:
        return ""
    return normalize_space(" ".join(t for t in node.itertext() if t and t.strip()))


def normalize_space(value: Any) -> str:
    """Collapse whitespace runs in ``str(value)`` and trim; ``None`` gives ""."""
    if value is None:
        return ""
    return re.sub(r"\s+", " ", str(value)).strip()


def attr_value(node: Optional[ET.Element], name: str) -> str:
    """Return the attribute whose normalized local name matches ``name``.

    Matching ignores namespace, case and punctuation (see ``norm_name``).
    Returns "" when ``node`` is ``None`` or no attribute matches.
    """
    if node is None:
        return ""
    wanted = norm_name(name)
    for key, value in node.attrib.items():
        if norm_name(local_name(key)) == wanted:
            return normalize_space(value)
    return ""


def children(node: ET.Element, *names: str) -> List[ET.Element]:
    """Return the direct children of ``node`` whose normalized name is in ``names``."""
    wanted = {norm_name(name) for name in names}
    return [child for child in list(node) if node_name(child) in wanted]


def descendants(node: ET.Element, *names: str) -> List[ET.Element]:
    """Return all descendants of ``node`` (excluding ``node``) matching ``names``."""
    wanted = {norm_name(name) for name in names}
    return [elem for elem in node.iter() if elem is not node and node_name(elem) in wanted]


def first_descendant(node: ET.Element, *names: str) -> Optional[ET.Element]:
    """Return the first descendant, in ``iter()`` order, matching ``names``.

    Returns ``None`` when nothing matches. Iteration stops at the first hit
    instead of collecting every matching descendant first.
    """
    wanted = {norm_name(name) for name in names}
    return next(
        (elem for elem in node.iter() if elem is not node and node_name(elem) in wanted),
        None,
    )
def mapping_element_names(mapping: str) -> set:
    """Extract the XML element names mentioned in a free-text mapping expression.

    Attribute selectors (``@name`` with an optional ``='value'``) and the
    helper words dataFormat/original/standard/root are stripped first. Of the
    remaining identifier-like tokens, only those whose local (un-prefixed)
    name starts with an upper-case letter, or is one of a few known
    lower-case field names, are kept.
    """
    if not mapping:
        return set()

    ignored = {"business", "base", "xml"}
    lowercase_fields = {"lang", "status", "country", "docNumber", "kind", "datePublication"}

    # Drop attribute selectors and prose-like helper words but keep node tokens.
    without_attrs = re.sub(r"[@][A-Za-z0-9_:-]+(?:='[^']*')?", "", mapping)
    without_noise = re.sub(
        r"\bdataFormat\b|\boriginal\b|\bstandard\b|\broot\b|根",
        " ",
        without_attrs,
        flags=re.I,
    )
    qualified_tokens = re.findall(
        r"(?:[A-Za-z_][A-Za-z0-9_-]*:)?[A-Za-z_][A-Za-z0-9_-]*",
        without_noise,
    )
    # Tokens can only hold an optional "prefix:"; they never contain "}",
    # so dropping the prefix is all that is needed to get the local name.
    bare_names = (token.split(":", 1)[-1] for token in qualified_tokens)
    return {
        bare
        for bare in bare_names
        if bare not in ignored
        and bare
        and (bare[0].isupper() or bare in lowercase_fields)
    }
def first_by_path(root: ET.Element, path_names: Sequence[str], attrs: Optional[Dict[str, str]] = None) -> Optional[ET.Element]:
    """Resolve a loose descendant path below ``root`` and return one matching node.

    Each entry of ``path_names`` is matched (by normalised name) against any
    descendant of the nodes found for the previous entry, not only direct
    children. Among the final candidates, the first one whose attributes
    equal every pair in ``attrs`` is returned. When none satisfies ``attrs``,
    the first candidate is returned anyway. ``None`` means some path step
    matched nothing.
    """
    frontier: List[ET.Element] = [root]
    for segment in path_names:
        target = norm_name(segment)
        frontier = [
            candidate
            for anchor in frontier
            for candidate in anchor.iter()
            if candidate is not anchor and node_name(candidate) == target
        ]
        if not frontier:
            return None

    required = attrs or {}
    for candidate in frontier:
        satisfied = True
        for key, value in required.items():
            if attr_value(candidate, key) != value:
                satisfied = False
                break
        if satisfied:
            return candidate
    # No candidate carried the requested attributes: fall back to the first.
    return frontier[0] if frontier else None
def choose_doc_id(nodes: Sequence[ET.Element]) -> Optional[ET.Element]:
    """Pick the preferred node from ``nodes`` based on ``dataFormat`` attributes.

    Preference order:
      1. the first node that carries ``dataFormat="original"`` on itself or
         on any descendant;
      2. otherwise the first node that carries ``dataFormat="standard"`` in
         the same way;
      3. otherwise the first node.

    Returns ``None`` when ``nodes`` is empty.
    """
    if not nodes:
        return None
    first_standard: Optional[ET.Element] = None
    for node in nodes:
        # Element.iter() already yields ``node`` itself before its descendants.
        formats = {attr_value(elem, "dataFormat").lower() for elem in node.iter()}
        if "original" in formats:
            return node
        if first_standard is None and "standard" in formats:
            first_standard = node
    # Explicit None test: an Element without children is falsy.
    return first_standard if first_standard is not None else nodes[0]
def unique_nonempty(values: Iterable[Any]) -> List[str]:
    """Return the distinct non-empty normalised strings of ``values``.

    Every value goes through ``normalize_space``; empty results are dropped
    and only the first occurrence of each string is kept, in input order.
    """
    # A dict keeps insertion order, so it doubles as an ordered "seen" set.
    ordered: Dict[str, None] = {}
    for raw in values:
        cleaned = normalize_space(raw)
        if cleaned:
            ordered.setdefault(cleaned, None)
    return list(ordered)
def extract_document_status_code(root: ET.Element, rule: PatentRule) -> ExtractResult:
    """Extract the document status code from the XML.

    The first non-empty ``status`` attribute found on an ``Abstract``
    descendant wins (branch ``abstract_status``). Otherwise the ``status``
    attribute of the root element is used (branch ``root_status``).
    ``rule`` is accepted for the common extractor signature and is not read.
    """
    abstract_statuses = (attr_value(abstract, "status") for abstract in descendants(root, "Abstract"))
    abstract_status = next((status for status in abstract_statuses if status), "")
    if abstract_status:
        return branch_result(abstract_status, "abstract_status")
    return branch_result(root_attr(root, "status"), "root_status")
def extract_ipc_text(root: ET.Element, rule: PatentRule) -> ExtractResult:
    """Collect the distinct text lines of ``Text`` nodes under ``ClassificationIPC``.

    Each physical line of a ``Text`` node becomes its own entry.
    ``unique_nonempty`` then normalises whitespace, drops blanks and
    de-duplicates. ``rule`` is accepted for the common extractor signature
    and is not read.
    """
    vals: List[str] = []
    for ipc_node in descendants(root, "ClassificationIPC"):
        for node in descendants(ipc_node, "Text"):
            # Split on line breaks BEFORE normalising whitespace:
            # text_content() collapses every whitespace run (newlines
            # included) into one space, which made splitlines() a no-op.
            raw_text = " ".join(t for t in node.itertext() if t and t.strip())
            vals.extend(raw_text.splitlines())
    return branch_result(unique_nonempty(vals), "classification_ipc_text")
def extract_description(root: ET.Element, rule: PatentRule) -> ExtractResult:
    """Extract the text of every ``Description`` descendant as ``{"seq", "text"}`` items.

    ``seq`` is the 1-based position among all ``Description`` nodes, so nodes
    without text are skipped but still consume a number. ``rule`` is accepted
    for the common extractor signature and is not read.
    """
    numbered_texts = (
        (position, text_content(node))
        for position, node in enumerate(descendants(root, "Description"), start=1)
    )
    sections = [{"seq": position, "text": body} for position, body in numbered_texts if body]
    return branch_result(sections, "description")
def extract_parties(root: ET.Element, rule: PatentRule) -> ExtractResult:
    """Extract party records (applicants, inventors, agents, ...) for ``rule.field_name``.

    The field name selects which element names are searched. For each
    matching node, the name and optional country are read from its
    ``AddressBook`` descendant when one exists, otherwise from the node
    itself. Entries with neither a name nor any text are skipped. The result
    is de-duplicated by ``dedup_dicts``. The reported branch is the local tag
    name of the first matching node, or ``"empty"`` when there is none.
    """
    field_to_names = {
        "applicants": ("Applicant",),
        "assignees": ("Assignee",),
        "inventors": ("Inventor",),
        "designers": ("Designer",),
        "patent_agents": ("Agent", "Agency"),
        "patent_agency": ("PatentAgency",),
    }
    names = field_to_names.get(rule.field_name, ())
    people = []
    branch = "empty"
    for node in descendants(root, *names):
        if branch == "empty":
            branch = local_name(node.tag)
        address_book = first_descendant(node, "AddressBook")
        # Compare against None explicitly: an Element with no child elements
        # is falsy, so `found or node` would ignore a childless AddressBook.
        if address_book is None:
            address_book = node
        name = first_descendant_text(address_book, "Name") or first_descendant_text(address_book, "LastName")
        country = first_descendant_text(address_book, "CountryCode") or first_descendant_text(address_book, "WIPOST3Code")
        text = text_content(address_book)
        if name or text:
            item = {"name": name or text}
            if country:
                item["country"] = country
            people.append(item)
    return branch_result(dedup_dicts(people), branch)
def extract_public_availability_date(root: ET.Element, rule: PatentRule) -> ExtractResult:
    """Extract one public-availability date selected by ``rule.field_name``.

    Each supported field maps to a tuple of name tokens. Inside every
    ``PublicAvailabilityDate`` container, the first node whose normalised
    name contains all tokens and that has a non-empty ``Date`` descendant
    supplies the value. Its local tag name becomes the branch label. An
    unknown field name or no match yields an empty result with branch
    ``"empty"``.
    """
    token_map = {
        "public_availability_unexamined_view_date": ("unexamined", "view"),
        "public_availability_examined_view_date": ("examined", "view"),
        "public_availability_unexamined_print_date": ("unexamined", "print"),
        "public_availability_examined_print_date": ("examined", "print"),
        "claims_only_public_date": ("claimsonly",),
        "granted_view_date": ("granted", "view"),
        "corrected_document_issue_date": ("corrected",),
    }
    tokens = token_map.get(rule.field_name, ())
    if not tokens:
        return ExtractResult("", branch="empty")
    # "examined" is a substring of "unexamined". A plain containment test lets
    # the examined_* fields pick up dates of unexamined publications, so that
    # token is matched only when it is not preceded by "un".
    token_patterns = [
        re.compile(r"(?<!un)examined") if token == "examined" else re.compile(re.escape(token))
        for token in tokens
    ]
    for container in descendants(root, "PublicAvailabilityDate"):
        for node in container.iter():
            name = node_name(node)
            if all(pattern.search(name) for pattern in token_patterns):
                date_text = first_descendant_text(node, "Date")
                if date_text:
                    return branch_result(date_text, local_name(node.tag))
    return ExtractResult("", branch="empty")
def extract_date_by_container(root: ET.Element, rule: PatentRule) -> ExtractResult:
    """Extract a date from the first suitable container element for ``rule.field_name``.

    The field name selects the candidate container element names. The first
    container that has a non-empty ``Date`` descendant, or failing that a
    non-empty ``DepositDate`` descendant, supplies the value, and its local
    tag name becomes the branch label. Returns an empty result with branch
    ``"empty"`` when nothing is found.
    """
    containers_by_field = {
        "pct_national_phase_date": ("PctNationalPhaseEntry", "NationalPhaseEntry"),
        "pct_effect_ceased_date": ("PctRefiledRevised", "RefiledRevisedApplication"),
        "search_report_deferred_publication_date": ("SearchReportDifferentPublication",),
        "spc_application_date": ("SPC",),
        "microorganism_deposit_date": ("BiologicalDeposit", "MicroorganismDeposit", "MicroorganismDepositDetails", "DepositInstitution"),
    }
    container_names = containers_by_field.get(rule.field_name, ())
    for container in descendants(root, *container_names):
        date_text = first_descendant_text(container, "Date")
        if not date_text:
            date_text = first_descendant_text(container, "DepositDate")
        if date_text:
            return branch_result(date_text, local_name(container.tag))
    return ExtractResult("", branch="empty")
def dedup_dicts(items: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """De-duplicate dicts by canonical JSON, ignoring empty values.

    Keys whose value is empty (per ``is_non_empty``) are removed, and dicts
    left with no keys are dropped. Dicts with the same canonical JSON
    collapse into one, with the last occurrence kept. The result is ordered
    by that canonical JSON string.
    """
    by_signature: Dict[str, Dict[str, Any]] = {}
    for entry in items:
        trimmed = {name: val for name, val in entry.items() if is_non_empty(val)}
        if trimmed:
            by_signature[canonical_json(trimmed)] = trimmed
    ordered_pairs = sorted(by_signature.items(), key=lambda pair: pair[0])
    return [trimmed for _, trimmed in ordered_pairs]
def _decode_xml_bytes(data: bytes) -> str:
    """Decode raw XML bytes, honouring a BOM or declared encoding when UTF-8 fails."""
    try:
        return data.decode("utf-8")
    except UnicodeDecodeError:
        pass
    if data.startswith((b"\xff\xfe", b"\xfe\xff")):
        # UTF-16 byte order mark.
        try:
            return data.decode("utf-16")
        except UnicodeDecodeError:
            pass
    declared = re.match(
        rb"""\s*<\?xml[^>]*?\bencoding\s*=\s*["']([A-Za-z][A-Za-z0-9._-]*)["']""",
        data[:200],
    )
    if declared:
        try:
            return data.decode(declared.group(1).decode("ascii"))
        except (LookupError, UnicodeDecodeError):
            pass
    # Last resort, identical to the previous behaviour for undecodable input.
    return data.decode("utf-8", errors="replace")


def parse_xml(raw: Any) -> ET.Element:
    """Parse ``raw`` (str, bytes or bytearray) into an ElementTree root element.

    Bytes are decoded as strict UTF-8 first. If that fails, a UTF-16 BOM or
    the encoding named in the XML declaration is tried before falling back to
    UTF-8 with replacement characters. Without this, a document declared as
    e.g. ISO-8859-1 or GBK had its non-ASCII text replaced by U+FFFD. A
    leading XML declaration is removed before parsing.

    Raises:
        ValueError: If ``raw`` is empty or only whitespace.
        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
    """
    if isinstance(raw, (bytes, bytearray)):
        raw = _decode_xml_bytes(bytes(raw))
    text = str(raw or "").strip()
    if not text:
        raise ValueError("empty XML content")
    text = re.sub(r"^\s*<\?xml[^>]*\?>", "", text, count=1).lstrip()
    return ET.fromstring(text)
def compact_text_for_compare(value: Any) -> str:
    """Reduce ``value`` to a lower-cased string of letters and digits for fuzzy comparison.

    All strings found by ``flatten_strings`` are joined, whitespace-normalised
    and lower-cased. Whitespace, punctuation and symbols are then removed.
    Letters and digits of every script are kept. The earlier whitelist
    (ASCII, kana, Han) erased e.g. Hangul, Cyrillic and accented Latin text,
    so two different texts in those scripts both became ``""`` and compared
    as equal.
    """
    text = " ".join(flatten_strings(value))
    text = normalize_space(text).lower()
    return "".join(
        ch
        for ch in text
        # The explicit ranges keep every character the old whitelist kept
        # (a few kana-block marks are not alphanumeric).
        if ch.isalnum() or "\u3040" <= ch <= "\u30ff" or "\u3400" <= ch <= "\u9fff"
    )
def compare_values(expected: Any, actual: Any, data_type: str, field_name: str = "") -> Optional[Dict[str, Any]]:
    """Compare an XML-derived value with the stored field value.

    Args:
        expected: Value extracted from the XML.
        actual: Value read from the parsed row.
        data_type: Declared data type of the field; matched case-insensitively.
        field_name: Field being compared; enables the lenient long-text
            comparison for "abstract", "description" and "claims".

    Returns:
        ``None`` when the values are considered equal, otherwise a dict with
        ``expected`` and ``actual`` (plus ``reason`` for the emptiness cases).
    """
    # Both sides are canonicalised first, so every later branch sees the same shapes.
    expected = canonicalize(expected)
    actual = canonicalize(actual)
    type_text = data_type.lower()
    # Emptiness is settled before any type-specific comparison.
    if not is_non_empty(expected) and not is_non_empty(actual):
        return None
    if not is_non_empty(expected) and is_non_empty(actual):
        return {"expected": expected, "actual": actual, "reason": "xml_empty_but_field_nonempty"}
    if is_non_empty(expected) and not is_non_empty(actual):
        return {"expected": expected, "actual": actual, "reason": "xml_nonempty_but_field_empty"}
    # Any type mentioning "date" is compared via normalize_dateish.
    # NOTE(review): normalize_dateish only reduces a value to digits when it
    # holds exactly 8 digits in total; multi-valued or timestamp values fall
    # back to plain text comparison here. Confirm that is intended.
    if "date" in type_text:
        if normalize_dateish(expected) != normalize_dateish(actual):
            return {"expected": expected, "actual": actual}
        return None
    # Long free-text fields pass as soon as text_equivalent accepts them;
    # otherwise they continue through the type-based checks below.
    if field_name in {"abstract", "description", "claims"} and text_equivalent(expected, actual, field_name):
        return None
    # ORDER_INSENSITIVE_TYPES is defined elsewhere in the file; presumably a
    # str or tuple of type-name prefixes, as str.startswith requires.
    if type_text.startswith(ORDER_INSENSITIVE_TYPES):
        expected_set = set(flatten_strings(expected))
        actual_set = set(flatten_strings(actual))
        # Subset check: the stored field may hold more entries than the XML yields.
        if expected_set and not expected_set.issubset(actual_set):
            return {"expected": sorted(expected_set), "actual": sorted(actual_set)}
        return None
    if type_text == "object":
        expected_tokens = set(flatten_strings(expected))
        actual_tokens = set(flatten_strings(actual))
        # Objects only need one leaf string in common to be accepted.
        if expected_tokens and not expected_tokens.intersection(actual_tokens):
            return {"expected": expected, "actual": actual}
        return None
    # Scalar fallback: equal after whitespace normalisation, or accepted by text_equivalent.
    expected_text = normalize_space(expected)
    actual_text = normalize_space(actual)
    if expected_text != actual_text and not text_equivalent(expected, actual, field_name):
        return {"expected": expected, "actual": actual}
    return None
def extract_row_branches(
    row: Dict[str, Any],
    rules: Sequence[PatentRule],
    *,
    xml_field: str,
    include_xml_field: bool,
    selected_fields: Optional[set],
) -> Dict[str, str]:
    """Return only the per-field branch labels computed by ``validate_row``.

    The mismatches and warnings that ``validate_row`` also produces are
    discarded; all arguments are forwarded unchanged.
    """
    outcome = validate_row(
        row,
        rules,
        xml_field=xml_field,
        include_xml_field=include_xml_field,
        selected_fields=selected_fields,
    )
    # validate_row returns (mismatches, warnings, branches).
    return outcome[2]
def summarize_branch_coverage(field_branch_counts: Dict[str, Counter]) -> Dict[str, Any]:
    """Condense per-field branch counters into a coverage summary.

    Fields whose counter is empty are left out.  ``by_field`` maps each
    remaining field (in sorted name order) to the number of distinct branches
    seen for it.

    Returns:
        A dict with ``field_count``, ``total_branch_count`` and ``by_field``.
    """
    distinct_per_field: Dict[str, int] = {}
    for name in sorted(field_branch_counts):
        seen = field_branch_counts[name]
        if seen:
            distinct_per_field[name] = len(seen)

    total_branches = 0
    for branch_total in distinct_per_field.values():
        total_branches += branch_total

    return {
        "field_count": len(distinct_per_field),
        "total_branch_count": total_branches,
        "by_field": distinct_per_field,
    }
def truncate_value(value: Any, max_chars: int = 600) -> Any:
    """Recursively shorten a value so it stays readable in a report.

    The value is first passed through ``canonicalize``.  Dict values are
    truncated recursively, lists keep their first five (truncated) items plus
    a ``"... (N more)"`` marker, and strings longer than ``max_chars`` are cut
    and suffixed with the number of dropped characters.  Anything else is
    returned as canonicalized.
    """
    normalized = canonicalize(value)

    if isinstance(normalized, dict):
        shortened = {}
        for key, item in normalized.items():
            shortened[key] = truncate_value(item, max_chars=max_chars)
        return shortened

    if isinstance(normalized, list):
        head = [truncate_value(item, max_chars=max_chars) for item in normalized[:5]]
        hidden = len(normalized) - 5
        if hidden > 0:
            head.append(f"... ({hidden} more)")
        return head

    if not isinstance(normalized, str):
        return normalized

    excess = len(normalized) - max_chars
    if excess > 0:
        return normalized[:max_chars] + f"... ({excess} more chars)"
    return normalized
def validate_db(
    *,
    config_path: Path,
    table: str,
    dt: Optional[str],
    limit: Optional[int],
    sample_mode: str,
    report_path: Optional[Path],
    mapping_csv: Path = DEFAULT_MAPPING_CSV,
    xml_field: str = DEFAULT_XML_FIELD,
    key_field: str = "document_number",
    include_xml_field: bool = False,
    fields: Optional[Sequence[str]] = None,
) -> Dict[str, Any]:
    """Validate parsed patent columns against their raw XML and write reports.

    For each ``dt`` partition (the given one, or every partition discovered in
    ``table`` when ``dt`` is None) a sample of rows is fetched, each row is
    checked with ``validate_row``, and mismatches / warnings are collected.

    Args:
        config_path: Settings JSON used for ``load_config`` and the DB connection.
        table: Target table; qualified with the configured catalog/database.
        dt: Partition to validate, or None to validate every partition.
        limit: Rows kept per partition, or None for no LIMIT.
        sample_mode: Sampling strategy, normalized via ``normalize_sample_mode``.
        report_path: JSONL mismatch report path; None disables report files.
        mapping_csv: Field mapping CSV passed to ``load_patent_rules``.
        xml_field: Column holding the raw XML.
        key_field: Column used as the row key in reports.
        include_xml_field: Whether the XML column itself is compared.
        fields: Optional allowlist of field names to validate.

    Returns:
        The result summary dict, which is also printed as one JSON line.
    """
    sample_mode = normalize_sample_mode(sample_mode)
    rules = load_patent_rules(mapping_csv)
    selected_fields = set(fields) if fields else None
    cfg = load_config(config_path)
    mysql_cfg = cfg.get("mysql", {}) if isinstance(cfg.get("mysql"), dict) else {}
    catalog = mysql_cfg.get("catalog")
    database = str(mysql_cfg.get("database") or "dws")
    table = qualify_table_name(table, catalog, database)
    _log(
        f"[info] 专利 XML 字段校验开始:dt={dt!r}, limit={limit}, sample_mode={sample_mode}, "
        f"table={table}, xml_field={xml_field}"
    )

    checked = passed = failed = xml_parse_failed = 0
    mismatch_rows: List[Dict[str, Any]] = []
    warning_rows: List[Dict[str, Any]] = []
    field_branch_counts: Dict[str, Counter] = {}

    with connect_starrocks(config_path) as conn:
        _log("[info] StarRocks 连接成功")
        dt_list = [dt] if dt is not None else discover_dt_values(conn, table)
        if dt is None:
            _log(f"[info] 自动发现 {len(dt_list)} 个 dt 分区,逐分区验证")

        for partition_dt in dt_list:
            _log(f"[info] 分区 {partition_dt}:开始抽样记录…")
            query_limit = limit
            if sample_mode == SAMPLE_MODE_BRANCH_COVERAGE and limit is not None:
                # Over-fetch candidates so branch selection has rows to choose from.
                query_limit = max(int(limit), int(limit) * BRANCH_COVERAGE_CANDIDATE_MULTIPLIER)
            sql, params = build_sample_query(
                table,
                partition_dt,
                query_limit,
                key_field=key_field,
                xml_field=xml_field,
                sample_mode=sample_mode,
            )
            t0 = time.monotonic()
            rows = fetch_records(conn, sql, params)
            if sample_mode == SAMPLE_MODE_BRANCH_COVERAGE:
                candidate_count = len(rows)
                rows = select_branch_coverage_rows(
                    rows,
                    rules,
                    limit=limit,
                    xml_field=xml_field,
                    include_xml_field=include_xml_field,
                    selected_fields=selected_fields,
                )
                _log(
                    f"[info] 分区 {partition_dt}:branch 候选 {candidate_count} 条,"
                    f"保留 {len(rows)} 条"
                )
            _log(f"[info] 分区 {partition_dt}:抽到 {len(rows)} 条,耗时 {time.monotonic() - t0:.1f}s,开始解析 XML…")
            for idx, row in enumerate(rows, start=1):
                checked += 1
                if idx == 1 or idx % 20 == 0:
                    _log(f"[info] 分区 {partition_dt}:已比对 {idx}/{len(rows)} 条")
                key = row.get(key_field) or row.get("sha256") or f"{partition_dt}:{idx}"
                mismatches, warnings, branches = validate_row(
                    row,
                    rules,
                    xml_field=xml_field,
                    include_xml_field=include_xml_field,
                    selected_fields=selected_fields,
                )
                for field, branch in branches.items():
                    if not branch:
                        continue
                    field_branch_counts.setdefault(field, Counter())[branch] += 1
                # Count parse failures from the branch marker that validate_row
                # sets only when parsing raised.  Key presence in ``mismatches``
                # is not enough: with include_xml_field=True an ordinary value
                # mismatch on the XML column uses the same key.
                if branches.get(xml_field) == "xml_parse_failed":
                    xml_parse_failed += 1
                if warnings:
                    warning_rows.append({"key": key, "dt": partition_dt, "warnings": warnings})
                if mismatches:
                    failed += 1
                    mismatch_rows.append(
                        {
                            "key": key,
                            "dt": partition_dt,
                            "status": "field_mismatch",
                            "record": compact_record_for_report(row, xml_field),
                            "mismatches": mismatches,
                        }
                    )
                else:
                    passed += 1

    if report_path is not None:
        report_path.parent.mkdir(parents=True, exist_ok=True)
        with report_path.open("w", encoding="utf-8") as f:
            for row in mismatch_rows:
                f.write(json.dumps(row, ensure_ascii=False, cls=JsonEncoder) + "\n")
        # Warnings go to a sibling file next to the mismatch report.
        warning_path = report_path.parent / "xml_field_warning.jsonl"
        with warning_path.open("w", encoding="utf-8") as f:
            for row in warning_rows:
                f.write(json.dumps(row, ensure_ascii=False, cls=JsonEncoder) + "\n")

    result = {
        "status": "ok",
        "kind": "patent_xml",
        "table": table,
        "key_field": key_field,
        "xml_field": xml_field,
        "dt": dt,
        "sample_mode": sample_mode,
        "sample_size": limit,
        "checked": checked,
        "passed": passed,
        "failed": failed,
        "xml_parse_failed": xml_parse_failed,
        "warning_rows": len(warning_rows),
        "branch_coverage": summarize_branch_coverage(field_branch_counts),
        "report_path": str(report_path) if report_path is not None else None,
        "sample_mismatches": compact_mismatch_rows(mismatch_rows),
    }
    if report_path is not None:
        write_report_summary(report_path, result, mismatch_rows, warning_rows)
    print(json.dumps(result, ensure_ascii=False, cls=JsonEncoder))
    return result
default=patent_cfg.get("table", DEFAULT_TABLE)) + parser.add_argument("--dt", default=patent_cfg.get("dt"), help="dt partition filter") + parser.add_argument("--limit", type=int, default=int(patent_cfg.get("limit", 200))) + parser.add_argument("--full", action="store_true", help="validate all sampled partition rows without LIMIT") + parser.add_argument("--xml-field", default=patent_cfg.get("xml_field", DEFAULT_XML_FIELD)) + parser.add_argument("--key-field", default=patent_cfg.get("key_field", "document_number")) + parser.add_argument( + "--sample-mode", + choices=(SAMPLE_MODE_RANDOM, SAMPLE_MODE_BRANCH_COVERAGE), + default=normalize_sample_mode(patent_cfg.get("sample_mode", SAMPLE_MODE_BRANCH_COVERAGE)), + help="random: 随机抽样;branch-coverage: 覆盖所有 branch 抽样", + ) + parser.add_argument( + "--fields", + default=patent_cfg.get("fields"), + help="comma separated field allowlist, e.g. document_number,publication_date", + ) + parser.add_argument("--include-xml-field", action="store_true", help="also compare the XML field itself") + parser.add_argument("--report", type=Path, default=patent_cfg.get("report_path"), help="JSONL report path") + args = parser.parse_args() + + fields = None + if args.fields: + fields = [field.strip() for field in str(args.fields).split(",") if field.strip()] + report_path = Path(args.report) if args.report else default_report_path( + args.dt, + args.sample_mode, + args.full, + ) + validate_db( + config_path=args.config, + table=args.table, + dt=args.dt, + limit=None if args.full else args.limit, + sample_mode=args.sample_mode, + report_path=report_path, + mapping_csv=args.mapping_csv, + xml_field=args.xml_field, + key_field=args.key_field, + include_xml_field=args.include_xml_field, + fields=fields, + ) + + +from dingo.config.input_args import EvaluatorRuleArgs +from dingo.io.input import Data, RequiredField +from dingo.io.output.eval_detail import EvalDetail +from dingo.model.model import Model +from dingo.model.rule.base import BaseRule +from 
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    ["sci_base_qa_test", "meta_patent_parsed_info"],
)
class RuleSciBaseMetaPatentParsedInfoReport(BaseRule):
    """Dingo rule that runs the patent XML parsed-field validation report."""

    _metric_info = {
        "category": "Rule-Based Metadata Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleSciBaseMetaPatentParsedInfoReport",
        "description": "Run SciBase patent XML parsed-field validation with branch coverage sampling.",
        "paper_title": "",
        "paper_url": "",
        "paper_authors": "",
        "evaluation_results": "",
    }

    _required_fields = [RequiredField.METADATA]
    dynamic_config = EvaluatorRuleArgs(parameters={})

    @classmethod
    def eval(cls, input_data: Data) -> EvalDetail:
        """Run ``validate_db`` with the rule's parameters and report the outcome.

        ``input_data`` is ignored; everything is driven by
        ``cls.dynamic_config.parameters``.  The report path is taken from
        ``report_path``, else ``output_dir``, else ``default_report_path``.

        Returns:
            An ``EvalDetail`` whose status is True when any row failed
            validation or any XML failed to parse.
        """
        del input_data
        params = cls.dynamic_config.parameters or {}
        full = bool_param(params, "full", False)
        sample_mode = normalize_sample_mode(params.get("sample_mode", SAMPLE_MODE_BRANCH_COVERAGE))
        report_path = Path(params["report_path"]) if params.get("report_path") else None
        if report_path is None and params.get("output_dir"):
            report_path = Path(str(params["output_dir"])) / "xml_field_mismatch.jsonl"
        if report_path is None:
            report_path = default_report_path(
                params.get("dt"),
                sample_mode,
                full,
            )

        config_path = write_temp_settings(params)
        try:
            result = validate_db(
                config_path=config_path,
                table=str(params.get("target_table") or params.get("table") or DEFAULT_TABLE),
                dt=params.get("dt"),
                limit=None if full else int_param(params, "limit", 200),
                sample_mode=sample_mode,
                report_path=report_path,
                mapping_csv=Path(str(params.get("mapping_csv") or DEFAULT_MAPPING_CSV)),
                xml_field=str(params.get("xml_field") or DEFAULT_XML_FIELD),
                key_field=str(params.get("key_field") or "document_number"),
                include_xml_field=bool_param(params, "include_xml_field", False),
                fields=_fields_param(params.get("fields")),
            )
        finally:
            # write_temp_settings creates the file with delete=False and it
            # holds the database password, so remove it once validation is
            # done.  Cleanup is best-effort and must not mask a validation error.
            try:
                config_path.unlink()
            except OSError:
                pass
        branch_coverage = result.get("branch_coverage") or {}
        is_bad = bool(result.get("failed") or result.get("xml_parse_failed"))
        return EvalDetail(
            metric=cls.__name__,
            status=is_bad,
            label=[
                f"{cls.metric_type}.{cls.__name__}" if is_bad else "QUALITY_GOOD",
            ],
            reason=[
                str(report_path.parent),
                f"checked={result.get('checked')}",
                f"failed={result.get('failed')}",
                f"branch_fields={branch_coverage.get('field_count', 0)}",
                f"branch_total={branch_coverage.get('total_branch_count', 0)}",
            ],
        )
dict(params.get("_dingo_dataset_sql_config") or params.get("sql_config") or {}) + if not config.get("host") or not config.get("username"): + raise RuntimeError( + "SQL config is required for this SciBase validator. " + "Set dataset.sql_config in the Dingo input config." + ) + return config + + +def dingo_s3_config(params: Dict[str, Any]) -> Dict[str, Any]: + config = dict(params.get("_dingo_dataset_s3_config") or params.get("s3_config") or {}) + return config + + +def s3_path_from_dingo(params: Dict[str, Any]) -> Optional[str]: + explicit_path = params.get("s3_path") + if explicit_path: + return str(explicit_path) + + s3_config = dingo_s3_config(params) + input_path = params.get("_dingo_input_path") + if input_path: + input_path_str = str(input_path).strip() + if input_path_str.startswith("s3://"): + return input_path_str + if params.get("_dingo_dataset_source") == "s3": + bucket = str(s3_config.get("s3_bucket") or "").strip().strip("/") + if bucket: + return f"s3://{bucket}/{input_path_str.lstrip('/')}" + + bucket = str(s3_config.get("s3_bucket") or "").strip().strip("/") + if bucket and params.get("s3_subpath"): + return f"s3://{bucket}/" + return None + + +def _connect_args_dict(raw: Any) -> Dict[str, str]: + if not raw: + return {} + text = str(raw) + if text.startswith("?"): + text = text[1:] + return dict(parse_qsl(text, keep_blank_values=True)) + + +def mysql_settings_from_dingo(params: Dict[str, Any]) -> Dict[str, Any]: + sql_config = dingo_sql_config(params) + connect_args = _connect_args_dict(sql_config.get("connect_args")) + settings = { + "host": sql_config.get("host"), + "port": int(sql_config.get("port") or 0), + "user": sql_config.get("username"), + "password": sql_config.get("password"), + "database": sql_config.get("database") or "dws", + "charset": connect_args.get("charset", "utf8mb4"), + } + for key in ("catalog", "connect_timeout", "read_timeout", "read_timeout_sec"): + if params.get(key) is not None: + settings[key] = params[key] + return 
def bool_param(params: Dict[str, Any], key: str, default: bool = False) -> bool:
    """Read ``params[key]`` as a boolean, using ``default`` when the key is absent.

    Real booleans are returned unchanged.  Strings are true only when, after
    trimming and lower-casing, they are one of ``1``, ``true``, ``yes`` or
    ``y``.  Every other value goes through ``bool()``.
    """
    raw = params.get(key, default)
    if isinstance(raw, bool):
        return raw
    if not isinstance(raw, str):
        return bool(raw)
    truthy_words = {"1", "true", "yes", "y"}
    return raw.strip().lower() in truthy_words
a/dingo/model/rule/scibase/rule_quanliang.py b/dingo/model/rule/scibase/rule_quanliang.py deleted file mode 100644 index 601fb7be..00000000 --- a/dingo/model/rule/scibase/rule_quanliang.py +++ /dev/null @@ -1,655 +0,0 @@ -import json -import re -from datetime import datetime -from pathlib import Path -from typing import Any, Dict, List - -from dingo.config.input_args import EvaluatorRuleArgs -from dingo.io.input import Data, RequiredField -from dingo.io.output.eval_detail import EvalDetail, QualityLabel -from dingo.model.model import Model -from dingo.model.rule.base import BaseRule - -URL_RE = re.compile(r"^[Hh][Tt][Tt][Pp][Ss]?://[^/$.?#][\s\S]*$") -DOI_RE = re.compile(r"^10\.\d{4,9}/([^A-Z\s\|]*)$") -INVISIBLE_RE = re.compile(r"[\u2000-\u200F\u202F\u205F\u3000\uFEFF\u00A0\u2060-\u206F\xa0]") -PAGE_RANGE_RE = re.compile(r"^\d+-\d+$") -ISSN_RE = re.compile(r"^\d{4}-\d{3}[\dX]$") -AUTHOR_SEP_RE = re.compile(r"[|;;]") - -OA_BOOL_VALUES = {"true", "false", "unknown"} -METADATA_TYPE_VALUES = {"paper", "ebook"} -OA_STATUS_VALUES = {"diamond", "gold", "green", "hybrid", "bronze", "closed", ""} -LOC_TYPE_VALUES = {"download", "reader", "display", ""} -JSON_LIST_FIELDS = { - "isbns", - "author", - "contributors", - "locations", - "access_oa_url", - "publication_venue_issn", - "references", - "related_works", -} -LICENSE_VALUES = { - "cc-by", - "cc-by-nc", - "cc-by-sa", - "cc-by-nd", - "cc-by-nc-sa", - "cc-by-nc-nd", - "other-oa", - "cc0", - "", - "public-domain", - "publisher-specific-oa", - "publisher-specific", - "wiley-specific", - "elsevier-specific", - "oup-specific", - "acs-specific", - "rsc-specific", - "iop-specific", - "unspecified-oa", - "implied-oa", - "nonexclusive-distrib", - "gpl-v1", - "gpl-v2", - "gpl-v3", - "mit", - "ogl-c", - "pd", -} -ACCESS_LICENSE_VALUES = set(LICENSE_VALUES) -GRADE_CLASS_VALUES = {"k12", "higher-edu", "vocational-edu", "other", ""} -GRADE_VALUES = {"小学", "初中", "高中", ""} - -_DEFAULT_LANGUAGE_VALUES = {"zh", "en", "ja", "de", "fr", 
"es", "ru", "ko", "ar"} -ASSETS_DIR = Path(__file__).resolve().parent / "assets" - - -def _load_language_allowed_values() -> set[str]: - base = ASSETS_DIR / "to_iso-639.json" - if not base.exists(): - return set(_DEFAULT_LANGUAGE_VALUES) - try: - with base.open("r", encoding="utf-8") as f: - values = json.load(f) - if isinstance(values, dict): - return set(str(v) for v in values.values() if isinstance(v, str)) - except (TypeError, ValueError, json.JSONDecodeError): - return set(_DEFAULT_LANGUAGE_VALUES) - return set(_DEFAULT_LANGUAGE_VALUES) - - -def _load_journal_mapping() -> Dict[str, str]: - csv_path = ASSETS_DIR / "journal_name_mapping_execute_20260512.csv" - if not csv_path.exists(): - return {} - # Lazy import to avoid top-level optional dependency / heavier import. - import csv - - mapping: Dict[str, str] = {} - with csv_path.open("r", encoding="utf-8", newline="") as f: - for row in csv.DictReader(f): - source_name = row.get("source_journal_name") - target_name = row.get("target_journal_name") - if source_name and target_name: - mapping[source_name] = target_name - return mapping - - -LANGUAGE_ALLOWED_VALUES = _load_language_allowed_values() -JOURNAL_NAME_MAPPING = _load_journal_mapping() - - -def _valid_isbn10(code: str) -> bool: - if not re.fullmatch(r"\d{9}[\dXx]", code): - return False - total = sum((10 - idx) * int(ch) for idx, ch in enumerate(code[:9])) - check = code[9].upper() - check_value = 10 if check == "X" else int(check) - total += check_value - return total % 11 == 0 - - -def _valid_isbn13(code: str) -> bool: - if not re.fullmatch(r"\d{13}", code): - return False - if not (code.startswith("978") or code.startswith("979")): - return False - total = sum(int(ch) * (1 if idx % 2 == 0 else 3) for idx, ch in enumerate(code)) - return total % 10 == 0 - - -def _valid_issn(code: str) -> bool: - if not ISSN_RE.fullmatch(code): - return False - digits = code.replace("-", "") - total = sum(int(ch) * (8 - idx) for idx, ch in enumerate(digits[:7])) - 
def check_doi(doi: Any, metadata_type: Any) -> bool:
    """Return True when ``doi`` is invalid for the given metadata type.

    Records whose ``metadata_type`` is not in ``METADATA_TYPE_VALUES`` are
    never flagged.  A missing or empty DOI is flagged only for ``"paper"``.
    A present DOI must be a string, contain no upper-case characters, not
    embed the ``https://doi.org/`` prefix, and fully match ``DOI_RE``.
    """
    if metadata_type not in METADATA_TYPE_VALUES:
        return False
    doi_is_mandatory = metadata_type == "paper"

    if doi is None:
        return doi_is_mandatory
    if not isinstance(doi, str):
        return True
    if not doi:
        return doi_is_mandatory

    lowered = doi.lower()
    if doi != lowered or "https://doi.org/" in lowered:
        return True
    return DOI_RE.fullmatch(doi) is None
def check_locations(locations: Any) -> bool:
    """Return True when ``locations`` is invalid.

    ``locations`` must be a list (an empty list is accepted).  Every item must
    be a dict carrying ``type``, ``url``, ``license`` and ``is_oa``; ``type``,
    ``license`` and ``is_oa`` must belong to their allowed value sets and
    ``url`` must be a string fully matching ``URL_RE``.

    Unhashable field values (for example a list or dict decoded from JSON) are
    reported as invalid instead of raising ``TypeError`` from the set
    membership test.
    """

    def _allowed(value: Any, allowed_values: Any) -> bool:
        # Set membership raises TypeError for unhashable values; such a value
        # can never be one of the allowed strings.
        try:
            return value in allowed_values
        except TypeError:
            return False

    if not isinstance(locations, list):  # also covers None
        return True

    required_keys = ("type", "url", "license", "is_oa")
    for item in locations:
        if not isinstance(item, dict):
            return True
        if any(key not in item for key in required_keys):
            return True
        if not _allowed(item["type"], LOC_TYPE_VALUES):
            return True
        url = item["url"]
        if not (isinstance(url, str) and URL_RE.fullmatch(url)):
            return True
        if not _allowed(item["license"], LICENSE_VALUES):
            return True
        if not _allowed(item["is_oa"], OA_BOOL_VALUES):
            return True
    return False
def check_publication_published_year(publication_published_year: Any) -> bool:
    """Return True when the published year is invalid.

    ``None`` is accepted.  Any other value must be an ``int`` (booleans are
    rejected) strictly between 0 and 2100.
    """
    year = publication_published_year
    if year is None:
        return False
    if isinstance(year, bool) or not isinstance(year, int):
        return True
    return year <= 0 or year >= 2100
None: - return True - if not isinstance(publication_venue_biblio_volume, str): - return True - if publication_venue_biblio_volume == "": - return False - try: - int(publication_venue_biblio_volume) - return False - except (TypeError, ValueError): - return True - - -def check_publication_venue_biblio_issue(publication_venue_biblio_issue: Any) -> bool: - if publication_venue_biblio_issue is None: - return True - if not isinstance(publication_venue_biblio_issue, str): - return True - if publication_venue_biblio_issue == "": - return False - try: - int(publication_venue_biblio_issue) - return False - except (TypeError, ValueError): - return True - - -def check_publication_venue_biblio_pages(publication_venue_biblio_pages: Any) -> bool: - if publication_venue_biblio_pages is None: - return True - if not isinstance(publication_venue_biblio_pages, str): - return True - if publication_venue_biblio_pages == "": - return False - if not PAGE_RANGE_RE.fullmatch(publication_venue_biblio_pages): - return True - start, end = [int(x.strip()) for x in publication_venue_biblio_pages.split("-")] - return start <= 0 or end <= 0 or start > end - - -def check_publication_pages(publication_pages: Any) -> bool: - if publication_pages is None: - return False - if not isinstance(publication_pages, int) or isinstance(publication_pages, bool): - return True - return publication_pages <= 0 - - -def check_publication_venue_name_unified( - publication_venue_name_unified: Any, publication_venue_name: Any -) -> bool: - if publication_venue_name_unified is None: - return True - if not isinstance(publication_venue_name_unified, str): - return True - if publication_venue_name is not None and not isinstance(publication_venue_name, str): - return True - expected_target = None - if isinstance(publication_venue_name, str) and publication_venue_name != "": - expected_target = JOURNAL_NAME_MAPPING.get(publication_venue_name, publication_venue_name) - if publication_venue_name_unified == "": - return False 
- if expected_target is None: - return True - return publication_venue_name_unified != expected_target - - -def check_grade_class(grade_class: Any) -> bool: - if grade_class is None: - return True - if not isinstance(grade_class, str): - return True - if grade_class == "": - return False - return grade_class not in GRADE_CLASS_VALUES - - -def check_grade(grade: Any, grade_class: Any) -> bool: - if grade is None: - return True - if not isinstance(grade, str): - return True - if grade_class is not None and not isinstance(grade_class, str): - return True - if grade == "": - return False - if grade not in GRADE_VALUES: - return True - if grade_class != "k12" and grade != "": - return True - return False - - -def check_references(references: Any) -> bool: - if references is None: - return True - if not (isinstance(references, list) and all(isinstance(x, str) for x in references)): - return True - if len(references) == 0: - return False - return any(not URL_RE.fullmatch(item) for item in references) - - -def check_related_works(related_works: Any) -> bool: - if related_works is None: - return True - if not (isinstance(related_works, list) and all(isinstance(x, str) for x in related_works)): - return True - if len(related_works) == 0: - return False - return any(not URL_RE.fullmatch(item) for item in related_works) - - -def check_cited_by_api_url(cited_by_api_url: Any) -> bool: - if cited_by_api_url is None: - return True - if not isinstance(cited_by_api_url, str): - return True - if cited_by_api_url == "": - return False - return not bool(URL_RE.fullmatch(cited_by_api_url)) - - -def check_access_xinghe_repository_sha256( - access_xinghe_repository_sha256: Any, access_xinghe_repository_has_fulltext: Any -) -> bool: - if access_xinghe_repository_sha256 is None: - return True - if not isinstance(access_xinghe_repository_has_fulltext, bool): - return True - has_fulltext = access_xinghe_repository_has_fulltext - if isinstance(access_xinghe_repository_sha256, str): - if not 
has_fulltext: - return False - return access_xinghe_repository_sha256 == "" - if not ( - isinstance(access_xinghe_repository_sha256, list) - and all(isinstance(x, str) for x in access_xinghe_repository_sha256) - ): - return True - if not has_fulltext: - return False - return len(access_xinghe_repository_sha256) == 0 - - -def check_access_xinghe_repository_origin_path( - access_xinghe_repository_origin_path: Any, access_xinghe_repository_has_fulltext: Any -) -> bool: - if not isinstance(access_xinghe_repository_origin_path, str): - return True - if not isinstance(access_xinghe_repository_has_fulltext, bool): - return True - if not access_xinghe_repository_has_fulltext: - return False - return access_xinghe_repository_origin_path.strip() == "" - - -def _normalize_json_like_field(value: Any) -> Any: - if not isinstance(value, str): - return value - stripped = value.strip() - if not stripped: - return value - if stripped[0] not in ("[", "{"): - return value - try: - return json.loads(stripped) - except (TypeError, ValueError, json.JSONDecodeError): - cleaned = stripped.replace("\r", " ").replace("\n", " ").replace("\t", " ") - cleaned = "".join(ch if ord(ch) >= 32 else " " for ch in cleaned) - invalid_escape_re = re.compile(r'\\(?!["\\/bfnrtu])') - for _ in range(10): - next_cleaned = invalid_escape_re.sub(r"\\\\", cleaned) - if next_cleaned == cleaned: - break - cleaned = next_cleaned - try: - return json.loads(cleaned) - except (TypeError, ValueError, json.JSONDecodeError): - return value - - -def _normalize_bool_field(value: Any) -> Any: - if isinstance(value, bool): - return value - if isinstance(value, int): - if value in (0, 1): - return bool(value) - return value - if isinstance(value, str): - lowered = value.strip().lower() - if lowered in ("1", "true"): - return True - if lowered in ("0", "false"): - return False - return value - - -def normalize_record(record: Dict[str, Any]) -> Dict[str, Any]: - normalized = dict(record) - for field in JSON_LIST_FIELDS: - if 
field in normalized: - normalized[field] = _normalize_json_like_field(normalized.get(field)) - normalized["access_xinghe_repository_has_fulltext"] = _normalize_bool_field( - normalized.get("access_xinghe_repository_has_fulltext") - ) - return normalized - - -FIELD_VALIDATORS = { - "metadata_type": lambda record: check_metadata_type(record.get("metadata_type")), - "doi": lambda record: check_doi(record.get("doi"), record.get("metadata_type")), - "isbns": lambda record: check_isbns(record.get("isbns"), record.get("metadata_type")), - "isbn13": lambda record: check_isbn13(record.get("isbn13"), record.get("metadata_type")), - "title": lambda record: check_title(record.get("title")), - "abstract": lambda record: check_abstract(record.get("abstract")), - "language": lambda record: check_language(record.get("language")), - "author": lambda record: check_author(record.get("author")), - "contributors": lambda record: check_contributors(record.get("contributors")), - "locations": lambda record: check_locations(record.get("locations")), - "access_is_oa": lambda record: check_access_is_oa(record.get("access_is_oa"), record.get("metadata_type")), - "access_oa_status": lambda record: check_access_oa_status(record.get("access_oa_status")), - "access_oa_url": lambda record: check_access_oa_url(record.get("access_oa_url")), - "access_license": lambda record: check_access_license(record.get("access_license")), - "publication_published_date": lambda record: check_publication_published_date( - record.get("publication_published_date") - ), - "publication_published_year": lambda record: check_publication_published_year( - record.get("publication_published_year") - ), - "publication_venue_issn": lambda record: check_publication_venue_issn(record.get("publication_venue_issn")), - "publication_venue_biblio_volume": lambda record: check_publication_venue_biblio_volume( - record.get("publication_venue_biblio_volume") - ), - "publication_venue_biblio_issue": lambda record: 
check_publication_venue_biblio_issue( - record.get("publication_venue_biblio_issue") - ), - "publication_venue_biblio_pages": lambda record: check_publication_venue_biblio_pages( - record.get("publication_venue_biblio_pages") - ), - "publication_pages": lambda record: check_publication_pages(record.get("publication_pages")), - "publication_venue_name_unified": lambda record: check_publication_venue_name_unified( - record.get("publication_venue_name_unified"), - record.get("publication_venue_name"), - ), - "grade_class": lambda record: check_grade_class(record.get("grade_class")), - "grade": lambda record: check_grade(record.get("grade"), record.get("grade_class")), - "references": lambda record: check_references(record.get("references")), - "related_works": lambda record: check_related_works(record.get("related_works")), - "cited_by_api_url": lambda record: check_cited_by_api_url(record.get("cited_by_api_url")), - "access_xinghe_repository_sha256": lambda record: check_access_xinghe_repository_sha256( - record.get("access_xinghe_repository_sha256"), - record.get("access_xinghe_repository_has_fulltext"), - ), - "access_xinghe_repository_origin_path": lambda record: check_access_xinghe_repository_origin_path( - record.get("access_xinghe_repository_origin_path"), - record.get("access_xinghe_repository_has_fulltext"), - ), -} - - -@Model.rule_register("QUALITY_BAD_EFFECTIVENESS", ["xinghe", "quanliang"]) -class RuleQuanliangFieldValidation(BaseRule): - _metric_info = { - "category": "Rule-Based Metadata Quality Metrics", - "quality_dimension": "EFFECTIVENESS", - "metric_name": "RuleQuanliangFieldValidation", - "description": "Validate Quanliang metadata fields and report invalid fields", - "paper_title": "", - "paper_url": "", - "paper_authors": "", - "evaluation_results": "", - } - - _required_fields = [RequiredField.METADATA] - dynamic_config = EvaluatorRuleArgs(key_list=list(FIELD_VALIDATORS.keys())) - - @classmethod - def eval(cls, input_data: Data) -> EvalDetail: 
- res = EvalDetail(metric=cls.__name__) - normalized = normalize_record(input_data.to_dict()) - selected_fields = cls.dynamic_config.key_list or [] - bad_fields: List[str] = [] - reasons: List[str] = [] - for field in selected_fields: - if field not in FIELD_VALIDATORS: - bad_fields.append(field) - reasons.append("unsupported field") - continue - if field not in normalized: - bad_fields.append(field) - reasons.append("missing field") - continue - if FIELD_VALIDATORS[field](normalized): - bad_fields.append(field) - reasons.append(f"{field} invalid") - - if bad_fields: - res.status = True - res.label = bad_fields - res.reason = reasons - else: - res.label = [QualityLabel.QUALITY_GOOD] - return res diff --git a/dingo/model/rule/scibase/union_unique_meta_data.py b/dingo/model/rule/scibase/union_unique_meta_data.py new file mode 100644 index 00000000..3e222483 --- /dev/null +++ b/dingo/model/rule/scibase/union_unique_meta_data.py @@ -0,0 +1,2548 @@ +#!/usr/bin/env python3 +"""DB validator for unified metadata and Xinghe fulltext union table. + +The validator is read-only. It compares the unified target table with three +source tables (paper unique, ebook unique, Xinghe fulltext), validates target +field values, and reports target field NULL / empty rates. 
+""" +from __future__ import annotations + +import argparse +import csv +import html +import json +import re +import sys +import time +from collections import defaultdict +from dataclasses import dataclass +from datetime import date, datetime +from decimal import Decimal +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple + +try: + import pymysql +except ImportError: # pragma: no cover - runtime dependency check + pymysql = None # type: ignore + + +PROJECT_ROOT = Path(__file__).resolve().parent +ASSETS_DIR = PROJECT_ROOT / "assets" +DEFAULT_CONFIG_PATH = Path("sci_base_qa_test_config.json") +TEMPLATE_CONFIG_PATH = ASSETS_DIR / "settings.template.json" +DEFAULT_MAPPING_CSV = ASSETS_DIR / "union_unique_data_mapping.csv" +DEFAULT_JOURNAL_MAPPING_CSV = ASSETS_DIR / "journal_name_mapping_execute_20260512.csv" +REPORT_ROOT = Path("report") +DEFAULT_PAPER_TABLE = "dws_meta_paper_doi_unique_acc_d" +DEFAULT_EBOOK_TABLE = "dws_meta_ebook_isbn_unique_acc_d" +DEFAULT_XINGHE_TABLE = "ads_xinghe_library_acc" +DEFAULT_TARGET_TABLE = "ads_meta_unified_unique_meta_data_acc_d" +XINGHE_SUPPLEMENT_FIELDS = { + "doi", + "title", + "abstract", + "language", + "author", + "grade_class", + "grade", + "supplementary_material", +} +IGNORED_TARGET_EXTRA_FIELDS = {"dt", "mesh"} +LICENSE_ALLOWED: Set[str] = { + "cc-by", + "cc-by-nc", + "cc-by-sa", + "cc-by-nd", + "cc-by-nc-sa", + "cc-by-nc-nd", + "other-oa", + "cc0", + "", + "public-domain", + "publisher-specific-oa", + "publisher-specific", + "wiley-specific", + "elsevier-specific", + "oup-specific", + "acs-specific", + "rsc-specific", + "iop-specific", + "unspecified-oa", + "implied-oa", + "nonexclusive-distrib", + "gpl-v1", + "gpl-v2", + "gpl-v3", + "mit", + "ogl-c", + "pd", +} +DEFAULT_LICENSE_MAP: Dict[str, str] = { + "http://arxiv.org/licenses/nonexclusive-distrib/1.0/": "nonexclusive-distrib", + "https://arxiv.org/licenses/nonexclusive-distrib/1.0/": "nonexclusive-distrib", + 
"arxiv-nonexclusive-distrib-1.0": "nonexclusive-distrib", + "http://creativecommons.org/licenses/by/4.0/": "cc-by", + "https://creativecommons.org/licenses/by/4.0/": "cc-by", + "http://creativecommons.org/licenses/by/3.0/": "cc-by", + "https://creativecommons.org/licenses/by/3.0/": "cc-by", + "CC-BY-4.0": "cc-by", + "CC-BY-3.0": "cc-by", + "CCBY": "cc-by", + "http://creativecommons.org/licenses/by-nc/4.0/": "cc-by-nc", + "https://creativecommons.org/licenses/by-nc/4.0/": "cc-by-nc", + "CCBYNC": "cc-by-nc", + "http://creativecommons.org/licenses/by-sa/4.0/": "cc-by-sa", + "https://creativecommons.org/licenses/by-sa/4.0/": "cc-by-sa", + "CCBYSA": "cc-by-sa", + "http://creativecommons.org/licenses/by-nd/4.0/": "cc-by-nd", + "https://creativecommons.org/licenses/by-nd/4.0/": "cc-by-nd", + "CCBYND": "cc-by-nd", + "http://creativecommons.org/licenses/by-nc-sa/4.0/": "cc-by-nc-sa", + "https://creativecommons.org/licenses/by-nc-sa/4.0/": "cc-by-nc-sa", + "CCBYNCSA": "cc-by-nc-sa", + "http://creativecommons.org/licenses/by-nc-nd/4.0/": "cc-by-nc-nd", + "https://creativecommons.org/licenses/by-nc-nd/4.0/": "cc-by-nc-nd", + "CCBYNCND": "cc-by-nc-nd", + "http://creativecommons.org/publicdomain/zero/1.0/": "cc0", + "https://creativecommons.org/publicdomain/zero/1.0/": "cc0", + "CC0-1.0": "cc0", + "CC0": "cc0", +} +CC_LICENSE_URL_RULES: List[Tuple[re.Pattern, str]] = [ + (re.compile(r"creativecommons\.org/licenses/by-nc-sa", re.I), "cc-by-nc-sa"), + (re.compile(r"creativecommons\.org/licenses/by-nc-nd", re.I), "cc-by-nc-nd"), + (re.compile(r"creativecommons\.org/licenses/by-nc(?:/|$)", re.I), "cc-by-nc"), + (re.compile(r"creativecommons\.org/licenses/by-sa", re.I), "cc-by-sa"), + (re.compile(r"creativecommons\.org/licenses/by-nd", re.I), "cc-by-nd"), + (re.compile(r"creativecommons\.org/licenses/by(?:/|$)", re.I), "cc-by"), + (re.compile(r"creativecommons\.org/publicdomain/zero", re.I), "cc0"), + (re.compile(r"arxiv\.org/licenses/nonexclusive-distrib", re.I), 
class JsonEncoder(json.JSONEncoder):
    """JSON encoder that also serializes ``Decimal``, ``date`` and ``datetime``.

    * ``Decimal`` with an integral value is emitted as ``int``; any other
      finite ``Decimal`` is emitted as ``float``.
    * Non-finite ``Decimal`` values (``Infinity``, ``-Infinity``, ``NaN``,
      ``sNaN``) are emitted as the matching ``float`` instead of raising.
    * ``date`` / ``datetime`` are emitted as ISO-8601 strings.
    """

    def default(self, obj: Any) -> Any:
        """Return a JSON-serializable stand-in for ``obj``.

        Raises:
            TypeError: from ``json.JSONEncoder.default`` for unsupported types.
        """
        if isinstance(obj, Decimal):
            # Non-finite values must be handled before the integral test:
            # int(Decimal("Infinity")) raises OverflowError and a signalling
            # NaN raises InvalidOperation on comparison.
            if obj.is_nan():
                return float("nan")
            if obj.is_infinite():
                return float(obj)
            if obj == obj.to_integral_value():
                return int(obj)
            return float(obj)
        if isinstance(obj, (date, datetime)):
            return obj.isoformat()
        return super().default(obj)
"boolean"): + return { + "string": "string", + "integer": "int", + "long": "bigint", + "float": "float", + "boolean": "boolean", + }[lower] + if lower.startswith("timestamp"): + return "bigint" + return lower or text + + +def _is_field_ref(value: str) -> bool: + if not value or value in ("-", "/"): + return False + if any("\u4e00" <= c <= "\u9fff" for c in value): + return False + if "'" in value: + return False + return True + + +def load_union_specs( + path: Path, + *, + field_col: str = "", + type_col: str = "", + paper_col: str = "", + ebook_col: str = "", + xinghe_col: str = "", +) -> List[UnionFieldSpec]: + specs: List[UnionFieldSpec] = [] + with path.open(encoding="utf-8-sig", newline="") as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames or [] + if not field_col: + field_col = "统一字段名" if "统一字段名" in fieldnames else "字段名" + if not type_col: + type_col = "字段值数据类型" if "字段值数据类型" in fieldnames else "数据类型" + if not paper_col: + paper_col = "源字段映射(论文)" if "源字段映射(论文)" in fieldnames else "论文表对应字段" + if not ebook_col: + ebook_col = "源字段映射(图书)" if "源字段映射(图书)" in fieldnames else "图书表对应字段" + if not xinghe_col: + xinghe_col = "源字段映射(星河)" if "源字段映射(星河)" in fieldnames else "星河全文表对应字段" + if not reader.fieldnames or field_col not in reader.fieldnames: + available = ", ".join(fn for fn in (reader.fieldnames or []) if fn.strip()) + raise ValueError( + f"映射文件 {path} 缺少字段列 {field_col!r}(可用列: {available})" + ) + for row in reader: + name = (row.get(field_col) or "").strip() + if not name: + continue + specs.append( + UnionFieldSpec( + field_name=name, + data_type=normalize_data_type((row.get(type_col) or "").strip()), + paper_source=(row.get(paper_col) or "").strip(), + ebook_source=(row.get(ebook_col) or "").strip(), + xinghe_source=(row.get(xinghe_col) or "").strip(), + ) + ) + return specs + + +def build_field_maps( + specs: Sequence[UnionFieldSpec], + metadata_type: str, +) -> Tuple[Dict[str, str], Dict[str, str]]: + metadata_map: Dict[str, str] = {} + 
def build_empty_output(specs: Sequence[UnionFieldSpec], metadata_type: str) -> Dict[str, Any]:
    """Create the blank target row that merged values are written into.

    Each spec contributes one key named after ``spec.field_name``; the
    placeholder is ``False`` when ``spec.data_type`` is ``"boolean"`` and
    ``None`` otherwise. ``metadata_type`` is stored last, so it replaces any
    placeholder produced for a spec of the same name.
    """
    blank: Dict[str, Any] = {
        item.field_name: (False if item.data_type == "boolean" else None)
        for item in specs
    }
    blank["metadata_type"] = metadata_type
    return blank
def apply_field_map(
    output: Dict[str, Any],
    record: Optional[Dict[str, Any]],
    field_map: Dict[str, str],
    *,
    source_kind: str = "",
    overwrite: bool = True,
    fallback_only_fields: Optional[Set[str]] = None,
) -> None:
    """Copy mapped source values from ``record`` into ``output`` in place.

    Args:
        output: Target row being assembled; mutated in place.
        record: Source row, or ``None`` (in which case nothing happens).
        field_map: Maps a source field reference to the target field name.
        source_kind: Forwarded unchanged to ``get_source_value``.
        overwrite: When ``False`` every target field is fill-only, i.e. it is
            written only while its current value is deep-empty.
        fallback_only_fields: Target fields that are fill-only even when
            ``overwrite`` is ``True``.

    Source values that resolve to ``None`` never touch ``output``.
    """
    if record is None:
        return
    protected = fallback_only_fields or set()
    for src, dst in field_map.items():
        value = get_source_value(record, src, source_kind)
        if value is None:
            continue
        # Fill-only targets keep whatever non-empty value they already hold.
        fill_only = not overwrite or dst in protected
        if fill_only and not is_deep_empty(output.get(dst)):
            continue
        output[dst] = value
def lookup_journal_name_unified(value: Any) -> Any:
    """Map a venue name to its unified journal name, falling back to ``value``.

    Deep-empty input is returned untouched. Otherwise the journal mapping is
    loaded into ``JOURNAL_NAME_MAPPING_CACHE`` on first use (the global is
    defined outside this block and is presumably initialised to ``None``),
    the name has its whitespace collapsed, and the exact-match table is tried
    before the normalized-key table. A missing or falsy hit in both tables
    yields the original ``value``.
    """
    global JOURNAL_NAME_MAPPING_CACHE
    if is_deep_empty(value):
        return value
    if JOURNAL_NAME_MAPPING_CACHE is None:
        JOURNAL_NAME_MAPPING_CACHE = load_journal_name_mapping()
    exact_map, normalized_map = JOURNAL_NAME_MAPPING_CACHE
    collapsed = " ".join(str(value).split())
    unified = exact_map.get(collapsed)
    if not unified:
        # Only compute the normalized key when the exact lookup missed.
        unified = normalized_map.get(normalize_journal_lookup_key(collapsed))
    return unified if unified else value
def qualify_table_name(
    table: str,
    catalog: Optional[str],
    database: str = "dws",
) -> str:
    """Resolve table to catalog.database.table for StarRocks Iceberg queries.

    The name is split on ``.`` and blank segments are ignored:

    * one segment    -> ``[catalog.]database.table``
    * two segments   -> ``catalog.db.table`` when a catalog is given,
      otherwise the input is returned unchanged
    * anything else  -> the input is returned unchanged
    """
    segments = [seg for seg in (piece.strip() for piece in table.split(".")) if seg]
    count = len(segments)
    if count == 1:
        name = segments[0]
        return f"{catalog}.{database}.{name}" if catalog else f"{database}.{name}"
    if count == 2 and catalog:
        return f"{catalog}.{segments[0]}.{segments[1]}"
    return table
def normalize_json_like(value: Any) -> Any:
    """Decode a JSON-looking string (or UTF-8 bytes) into Python objects.

    ``bytes`` / ``bytearray`` are decoded as UTF-8 with replacement of invalid
    sequences. A string whose stripped form starts with ``[`` or ``{`` is
    parsed with ``json.loads``; when parsing fails the (decoded, unstripped)
    string is returned. Every other value is returned as is.
    """
    text = value.decode("utf-8", errors="replace") if isinstance(value, (bytes, bytearray)) else value
    if not isinstance(text, str):
        return text
    candidate = text.strip()
    if not candidate.startswith(("[", "{")):
        return text
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return text
    return parsed
def normalize_license_value(value: Any) -> str:
    """Reduce a raw license string or URL to its canonical short code.

    Resolution order, first match wins:

    1. ``None`` or blank input -> ``""``.
    2. ``DEFAULT_LICENSE_MAP`` lookup by the stripped text, then by the text
       without trailing ``/``, then by the upper-cased text with every
       non-alphanumeric character removed.
    3. The lower-cased text when it is a member of ``LICENSE_ALLOWED``.
    4. The first ``CC_LICENSE_URL_RULES`` pattern found in the text.
    5. Otherwise the lower-cased text, unchanged.
    """
    if value is None:
        return ""
    text = str(value).strip()
    if not text:
        return ""
    lookup_keys = (
        text,
        text.rstrip("/"),
        re.sub(r"[^A-Za-z0-9]", "", text).upper(),
    )
    for key in lookup_keys:
        if key in DEFAULT_LICENSE_MAP:
            return DEFAULT_LICENSE_MAP[key]
    lowered = text.lower()
    if lowered in LICENSE_ALLOWED:
        return lowered
    return next(
        (canonical for pattern, canonical in CC_LICENSE_URL_RULES if pattern.search(text)),
        lowered,
    )
def is_deep_empty(value: Any) -> bool:
    """Return ``True`` when ``value`` carries no data at any nesting level.

    The value is first passed through ``normalize_json_like``. ``None``,
    strings that strip to ``""``, ``"[]"`` or ``"{}"``, and lists / dicts
    whose members are all deep-empty (including empty containers) count as
    empty. Any other value, such as a number or a boolean, is not empty.
    """
    parsed = normalize_json_like(value)
    if parsed is None:
        return True
    if isinstance(parsed, str):
        return parsed.strip() in ("", "[]", "{}")
    if isinstance(parsed, dict):
        children = parsed.values()
    elif isinstance(parsed, list):
        children = parsed
    else:
        return False
    return all(is_deep_empty(child) for child in children)
def split_limit(limit: Optional[int], parts: int) -> List[Optional[int]]:
    """Split a row limit into ``parts`` near-equal shares.

    The shares sum to ``max(0, limit)``; when the limit does not divide
    evenly, the leading shares each receive one extra row.

    Args:
        limit: Total number of rows to distribute, or ``None`` for "no limit".
            Negative limits are treated as ``0``.
        parts: Number of shares to produce.

    Returns:
        A list with ``parts`` entries. Every entry is ``None`` when ``limit``
        is ``None``. An empty list is returned when ``parts`` is not positive.
    """
    if parts <= 0:
        # There is nothing to distribute across. Negative ``parts`` already
        # produced an empty list; ``parts == 0`` used to raise
        # ZeroDivisionError and now behaves the same way.
        return []
    if limit is None:
        return [None] * parts
    base, remainder = divmod(max(0, int(limit)), parts)
    return [base + (1 if i < remainder else 0) for i in range(parts)]
def source_coverage_counts(
    conn: Any,
    *,
    paper_table: str,
    ebook_table: str,
    xinghe_table: str,
    target_table: str,
    target_dt: Optional[str],
    paper_dt: Optional[str],
    ebook_dt: Optional[str],
) -> Dict[str, Any]:
    """Compare the target table's row count with the count implied by its sources.

    The expected target size is the sum of four numbers: the paper table row
    count, the ebook table row count, and the number of distinct xinghe keys
    (``doi`` for papers, ``isbn`` matched against ``isbn13`` for ebooks) that
    have no counterpart in the respective metadata table.

    Args:
        conn: Database connection handed to the counting helpers.
        paper_table: Paper metadata table name.
        ebook_table: Ebook metadata table name.
        xinghe_table: Xinghe full-text table name.
        target_table: Merged target table name.
        target_dt: Partition filter for the target table (``None`` = no filter).
        paper_dt: Partition filter for the paper table (``None`` = no filter).
        ebook_dt: Partition filter for the ebook table (``None`` = no filter).

    Returns:
        A dict with the individual counts, the expected and actual target
        counts, and ``target_count_diff`` (actual minus expected).
    """
    paper_rows = count_table(conn, paper_table, paper_dt)
    ebook_rows = count_table(conn, ebook_table, ebook_dt)
    target_rows = count_table(conn, target_table, target_dt)

    def _xinghe_only(
        metadata_table: str,
        xinghe_key: str,
        metadata_key: str,
        metadata_dt: Optional[str],
    ) -> int:
        # Distinct xinghe keys that are absent from the given metadata table.
        return count_xinghe_only_distinct_key(
            conn,
            xinghe_table=xinghe_table,
            metadata_table=metadata_table,
            xinghe_key_field=xinghe_key,
            metadata_key_field=metadata_key,
            metadata_dt=metadata_dt,
        )

    paper_fallback = _xinghe_only(paper_table, "doi", "doi", paper_dt)
    ebook_fallback = _xinghe_only(ebook_table, "isbn", "isbn13", ebook_dt)
    expected_rows = paper_rows + ebook_rows + paper_fallback + ebook_fallback
    return {
        "paper_source": paper_rows,
        "ebook_source": ebook_rows,
        "xinghe_only_paper_count": paper_fallback,
        "xinghe_only_ebook_count": ebook_fallback,
        "expected_target_count": expected_rows,
        "actual_target_count": target_rows,
        "target_count_diff": target_rows - expected_rows,
    }
def fetch_target_samples(
    conn: Any,
    *,
    target_table: str,
    dt: Optional[str],
    limit: Optional[int],
    sample_mode: str = "natural",
) -> List[Dict[str, Any]]:
    """Fetch sample rows from the target table for validation.

    Without a limit, a single query covering both metadata types is issued.
    With a limit, the budget is split between ``paper`` and ``ebook`` (paper
    receives the extra row for odd limits) and one query is run per type;
    a type whose share is ``0`` is not queried.

    NOTE(review): when ``limit`` is ``None`` the ``sample_mode`` argument is
    not forwarded to the query builder, so the builder's default mode is used
    — confirm this is intended.

    Args:
        conn: Database connection passed to ``fetch_records``.
        target_table: Target table name.
        dt: Partition filter, or ``None`` for no filter.
        limit: Total number of rows wanted, or ``None`` for no limit.
        sample_mode: Sampling mode forwarded to the per-type queries.

    Returns:
        The fetched rows, paper rows first when a limit is given.
    """
    if limit is None:
        query, bind_values = build_target_sample_query(target_table, dt, None)
        return fetch_records(conn, query, bind_values)

    shares = split_limit(limit, 2)
    collected: List[Dict[str, Any]] = []
    for kind, share in zip(("paper", "ebook"), shares):
        if share != 0:
            query, bind_values = build_target_sample_query(target_table, dt, share, kind, sample_mode)
            collected += fetch_records(conn, query, bind_values)
    return collected
def build_xinghe_missing_target_sample_query(
    xinghe_table: str,
    target_table: str,
    *,
    metadata_type: str,
    xinghe_key_field: str,
    dt: Optional[str],
    limit: Optional[int],
) -> Tuple[str, List[Any]]:
    """Build the query listing xinghe rows that have no matching target row.

    A xinghe row matches a target row when the target ``unique_id`` equals
    ``"<metadata_type>:<xinghe key>"``. Rows with a NULL or empty key are
    excluded. Results are ordered by key, ``sha256`` and ``origin_path``.

    Args:
        xinghe_table: Xinghe table name (aliased ``x`` in the query).
        target_table: Target table name (aliased ``t`` in the query).
        metadata_type: Prefix used to rebuild the target ``unique_id``.
        xinghe_key_field: Key column of the xinghe table.
        dt: Target partition to join against, or ``None`` for any partition.
        limit: Maximum number of rows, or ``None`` for no limit.

    Returns:
        ``(sql, params)`` where ``params`` holds ``dt`` when it is given.
    """
    key_column = f"x.`{xinghe_key_field}`"
    if dt is None:
        bind_values: List[Any] = []
        dt_filter = ""
    else:
        bind_values = [dt]
        dt_filter = "AND t.`dt` = %s"
    fragments = [
        f"SELECT {key_column} AS sample_key, ",
        "x.`data_date` AS data_date, x.`sha256` AS sha256, x.`origin_path` AS origin_path ",
        f"FROM {quote_identifier(xinghe_table)} x ",
        f"LEFT JOIN {quote_identifier(target_table)} t ",
        f"ON t.`unique_id` = CONCAT('{metadata_type}:', {key_column}) ",
        f"{dt_filter} ",
        f"WHERE {key_column} IS NOT NULL AND {key_column} != ''",
        " AND t.`unique_id` IS NULL ",
        f"ORDER BY {key_column}, x.`sha256`, x.`origin_path`{_limit_clause(limit)}",
    ]
    return "".join(fragments), bind_values
def chunked(values: Sequence[Any], size: int) -> Iterable[Sequence[Any]]:
    """Yield consecutive slices of ``values`` with at most ``size`` items each.

    The final slice may be shorter than ``size``. Slices are produced with
    ordinary slicing, so their type follows the type of ``values``.
    """
    starts = range(0, len(values), size)
    for start in starts:
        stop = start + size
        yield values[start:stop]
def embedded_key_like_patterns(key: Any) -> List[str]:
    """Build SQL ``LIKE`` patterns for keys that contain angle brackets.

    The key is normalized with ``normalize_key_text`` and lower-cased. Keys
    without ``<`` or ``>`` produce no patterns. Otherwise the key and its
    HTML-escaped form (quotes left untouched) are each wrapped in ``%...%``,
    de-duplicated and returned in sorted order.

    NOTE(review): ``%`` and ``_`` inside the key are not escaped, so they act
    as LIKE wildcards — confirm that broader matches are acceptable.
    """
    lowered = normalize_key_text(key).lower()
    # An empty key has no angle brackets either, so one guard covers both cases.
    if "<" not in lowered and ">" not in lowered:
        return []
    escaped = html.escape(lowered, quote=False).lower()
    patterns: List[str] = []
    for candidate in sorted({lowered, escaped}):
        if candidate:
            patterns.append(f"%{candidate}%")
    return patterns
def fetch_xinghe_records(
    conn: Any,
    *,
    table: str,
    metadata_type: str,
    key: Any,
    dt: Optional[str],
    limit: int = 100,
) -> List[Dict[str, Any]]:
    """Fetch the xinghe rows stored under a single paper or ebook key.

    Papers are matched case-insensitively on ``doi`` (the key is stringified
    and lower-cased); every other metadata type is matched exactly on
    ``isbn``. Rows are ordered by ``sha256`` then ``origin_path``.

    Args:
        conn: Database connection passed to ``fetch_records``.
        table: Xinghe table name.
        metadata_type: ``"paper"`` selects the DOI lookup; anything else
            selects the ISBN lookup.
        key: Key value to look up.
        dt: Accepted for signature compatibility; the query built here does
            not filter on it.
        limit: Maximum number of rows to return.

    Returns:
        The matching rows as returned by ``fetch_records``.
    """
    if metadata_type == "paper":
        bind_values: List[Any] = [str(key).lower()]
        where = "LOWER(`doi`) = %s"
    else:
        bind_values = [key]
        where = "`isbn` = %s"
    select_part = f"SELECT * FROM {quote_identifier(table)} WHERE {where}"
    order_part = f" ORDER BY `sha256`, `origin_path` LIMIT {int(limit)}"
    return fetch_records(conn, select_part + order_part, bind_values)
# Pairs of (xinghe column, target column) tried in this order when deciding
# which xinghe row a target row was built from.
XINGHE_TARGET_MATCH_FIELDS = (
    ("sha256", "access_xinghe_repository_sha256"),
    ("origin_path", "access_xinghe_repository_origin_path"),
    ("processed_path", "access_xinghe_repository_processed_path"),
    ("origin_url", "access_xinghe_repository_origin_url"),
)
JOURNAL_NAME_MAPPING_CACHE: Optional[Tuple[Dict[str, str], Dict[str, str]]] = None


def choose_xinghe_record_for_target(
    target_row: Dict[str, Any],
    xinghe_rows: Sequence[Dict[str, Any]],
) -> Optional[Dict[str, Any]]:
    """Pick the xinghe row that corresponds to ``target_row``.

    The repository fields in ``XINGHE_TARGET_MATCH_FIELDS`` are tried in
    order. For the first target field that is set, the first candidate whose
    source field has the same stripped string value wins. If no field yields
    a match, a lone candidate is returned as-is.

    Returns:
        The chosen row, or ``None`` when there are no candidates or the
        choice between several candidates is ambiguous.
    """
    if not xinghe_rows:
        return None

    def _is_blank(value: Any) -> bool:
        return value in (None, "")

    for source_field, target_field in XINGHE_TARGET_MATCH_FIELDS:
        wanted = target_row.get(target_field)
        if _is_blank(wanted):
            continue
        wanted_text = str(wanted).strip()

        def _matches(candidate: Dict[str, Any]) -> bool:
            found = candidate.get(source_field)
            return not _is_blank(found) and str(found).strip() == wanted_text

        hit = next((candidate for candidate in xinghe_rows if _matches(candidate)), None)
        if hit is not None:
            return hit

    return xinghe_rows[0] if len(xinghe_rows) == 1 else None
def validate_source_field_mapping(
    conn: Any,
    *,
    specs: Sequence[UnionFieldSpec],
    paper_table: str,
    ebook_table: str,
    xinghe_table: str,
    target_table: str,
    target_dt: Optional[str],
    paper_dt: Optional[str],
    ebook_dt: Optional[str],
    limit: Optional[int],
    target_sample_mode: str = "natural",
) -> Dict[str, Any]:
    """Check sampled target rows against values recomputed from the sources.

    Target rows are sampled, the metadata and xinghe rows they refer to are
    loaded in batches, the expected row is rebuilt with
    ``expected_for_target_row_from_sources`` and compared field by field with
    the stored target row.

    Args:
        conn: Database connection passed to every fetch helper.
        specs: Field specs; their ``field_name`` values define the compared
            fields and their ``data_type`` values drive normalization.
        paper_table: Paper metadata table name.
        ebook_table: Ebook metadata table name.
        xinghe_table: Xinghe table name.
        target_table: Target table name.
        target_dt: Target partition; also used for a metadata table whose own
            partition argument is ``None``.
        paper_dt: Paper partition, or ``None`` to fall back to ``target_dt``.
        ebook_dt: Ebook partition, or ``None`` to fall back to ``target_dt``.
        limit: Sample size handed to ``fetch_target_samples``.
        target_sample_mode: Sampling mode handed to ``fetch_target_samples``.

    Returns:
        A dict with the ``checked`` / ``passed`` / ``failed`` / ``skipped``
        counters, at most the first 100 ``warnings``, and every entry of
        ``mismatches`` (skipped rows and field mismatches alike).
    """
    target_rows = fetch_target_samples(
        conn,
        target_table=target_table,
        dt=target_dt,
        limit=limit,
        sample_mode=target_sample_mode,
    )
    log_step(f"source field mapping 抽到目标样本 {len(target_rows)} 条")
    # First pass: collect the lookup keys per metadata type and the repository
    # sha256 values so the source rows can be loaded with batch queries.
    keys_by_type: Dict[str, List[Any]] = {"paper": [], "ebook": []}
    repository_sha_values: List[Any] = []
    for target_row in target_rows:
        metadata_type = target_row.get("metadata_type")
        if metadata_type == "paper":
            keys_by_type["paper"].append(target_key_for_row(target_row, "paper"))
        elif metadata_type == "ebook":
            keys_by_type["ebook"].append(target_key_for_row(target_row, "ebook"))
        sha256 = target_row.get("access_xinghe_repository_sha256")
        if sha256 not in (None, ""):
            repository_sha_values.append(sha256)
    metadata_records = {
        "paper": fetch_metadata_records_batch(
            conn,
            table=paper_table,
            metadata_type="paper",
            keys=keys_by_type["paper"],
            dt=paper_dt if paper_dt is not None else target_dt,
        ),
        "ebook": fetch_metadata_records_batch(
            conn,
            table=ebook_table,
            metadata_type="ebook",
            keys=keys_by_type["ebook"],
            dt=ebook_dt if ebook_dt is not None else target_dt,
        ),
    }
    xinghe_records = {
        "paper": fetch_xinghe_records_batch(
            conn,
            table=xinghe_table,
            metadata_type="paper",
            keys=keys_by_type["paper"],
        ),
        "ebook": fetch_xinghe_records_batch(
            conn,
            table=xinghe_table,
            metadata_type="ebook",
            keys=keys_by_type["ebook"],
        ),
    }
    xinghe_records_by_sha = fetch_xinghe_records_by_sha_batch(
        conn,
        table=xinghe_table,
        sha_values=repository_sha_values,
    )
    log_step(
        "source batch 查询完成:"
        f"paper metadata={len(metadata_records['paper'])}, "
        f"ebook metadata={len(metadata_records['ebook'])}, "
        f"paper xinghe={len(xinghe_records['paper'])}, "
        f"ebook xinghe={len(xinghe_records['ebook'])}, "
        f"sha xinghe={len(xinghe_records_by_sha)}"
    )
    compare_fields = [spec.field_name for spec in specs]
    field_types = {spec.field_name: spec.data_type for spec in specs}
    checked = passed = failed = skipped = 0
    mismatches: List[Dict[str, Any]] = []
    warnings: List[Dict[str, Any]] = []
    # Per-key caches so the LIKE-based fallback queries run at most once per key.
    paper_metadata_embedded_cache: Dict[str, List[Dict[str, Any]]] = {}
    paper_xinghe_embedded_cache: Dict[str, List[Dict[str, Any]]] = {}

    # Second pass: resolve the source rows for each sample and compare.
    for target_row in target_rows:
        checked += 1
        metadata_type = target_row.get("metadata_type")
        lookup_key = normalize_lookup_key(target_key_for_row(target_row, str(metadata_type)), str(metadata_type))
        metadata_record = metadata_records.get(str(metadata_type), {}).get(lookup_key)
        xinghe_rows = xinghe_records.get(str(metadata_type), {}).get(lookup_key, [])
        if metadata_type == "paper" and lookup_key:
            # Paper-only fallback: when the exact key lookup found nothing,
            # search for rows whose DOI contains the key.
            if metadata_record is None:
                if lookup_key not in paper_metadata_embedded_cache:
                    paper_metadata_embedded_cache[lookup_key] = fetch_paper_metadata_records_by_embedded_key(
                        conn,
                        table=paper_table,
                        key=lookup_key,
                        dt=paper_dt if paper_dt is not None else target_dt,
                    )
                metadata_record = choose_metadata_record_for_target(
                    target_row,
                    paper_metadata_embedded_cache[lookup_key],
                    specs=specs,
                    metadata_type="paper",
                )
            if not xinghe_rows:
                if lookup_key not in paper_xinghe_embedded_cache:
                    paper_xinghe_embedded_cache[lookup_key] = fetch_paper_xinghe_records_by_embedded_key(
                        conn,
                        table=xinghe_table,
                        key=lookup_key,
                    )
                xinghe_rows = paper_xinghe_embedded_cache[lookup_key]
        if not xinghe_rows:
            # Last resort for any metadata type: use the xinghe rows that
            # share the repository sha256 recorded on the target row.
            sha256 = target_row.get("access_xinghe_repository_sha256")
            if sha256 not in (None, ""):
                xinghe_rows = xinghe_records_by_sha.get(str(sha256), [])
        expected, row_warnings = expected_for_target_row_from_sources(
            row=target_row,
            specs=specs,
            metadata_record=metadata_record,
            xinghe_rows=xinghe_rows,
            target_dt=target_dt,
        )
        unique_id = target_row.get("unique_id")
        if row_warnings:
            warnings.append({"unique_id": unique_id, **row_warnings})
        if expected is None:
            # No expected row could be built; the returned dict carries the
            # reason, so it is recorded both as a warning and as a skipped entry.
            skipped += 1
            mismatches.append({"unique_id": unique_id, "status": "skipped", **row_warnings})
            continue
        expected_cmp = comparable_record(expected, compare_fields)
        actual_cmp = comparable_record(target_row, compare_fields)
        row_mismatches = compare_records(expected_cmp, actual_cmp, field_types)
        if row_mismatches:
            failed += 1
            mismatches.append(
                {
                    "unique_id": unique_id,
                    "dt": target_row.get("dt"),
                    "metadata_type": target_row.get("metadata_type"),
                    "status": "field_mismatch",
                    "mismatches": row_mismatches,
                }
            )
        else:
            passed += 1

    return {
        "checked": checked,
        "passed": passed,
        "failed": failed,
        "skipped": skipped,
        # Only the first 100 warnings are returned.
        "warnings": warnings[:100],
        "mismatches": mismatches,
    }
def empty_condition_sql(quoted_field: str, data_type: str) -> Optional[str]:
    """Return the SQL predicate that is true when a column holds an "empty" value.

    The type text is compared case-insensitively after trimming.

    Args:
        quoted_field: Column reference, already quoted for use in SQL.
        data_type: Declared column type; a falsy value counts as unknown.

    Returns:
        A ``TRIM(CAST(... AS VARCHAR)) = ''`` test for ``string`` / ``text``
        and for types starting with ``varchar`` or ``char``; a
        ``CARDINALITY(...) = 0`` test for types starting with ``array`` or
        ``list``; ``None`` for every other type.
    """
    normalized = (data_type or "").strip().lower()
    is_textual = normalized in {"string", "text"} or normalized.startswith(("varchar", "char"))
    if is_textual:
        return f"TRIM(CAST({quoted_field} AS VARCHAR)) = ''"
    if normalized.startswith(("array", "list")):
        return f"CARDINALITY({quoted_field}) = 0"
    return None
def null_empty_rates_from_row(row: Dict[str, Any], specs: Sequence[UnionFieldSpec]) -> List[Dict[str, Any]]:
    """Turn one aggregate result row into per-field NULL/empty statistics.

    The row is expected to carry ``total`` plus, for the spec at position
    ``i``, the columns ``n_<i>`` (NULL count) and ``e_<i>`` (empty count).
    Missing or falsy columns count as ``0``; rates are ``0.0`` when the total
    is ``0``.

    Args:
        row: Aggregate row keyed by column alias.
        specs: Field specs, in the same order used to build the aliases.

    Returns:
        One dict per spec with the field name, total, counts and rates.
    """
    row_total = int(row.get("total") or 0)

    def _count(column: str) -> int:
        return int(row.get(column) or 0)

    def _share(count: int) -> float:
        return count / row_total if row_total else 0.0

    report: List[Dict[str, Any]] = []
    for position, spec in enumerate(specs):
        nulls = _count(f"n_{position}")
        empties = _count(f"e_{position}")
        report.append(
            {
                "field": spec.field_name,
                "total": row_total,
                "null_count": nulls,
                "empty_count": empties,
                "null_rate": _share(nulls),
                "empty_rate": _share(empties),
            }
        )
    return report
# Report key -> Chinese display label used when writing report files.
# Several keys intentionally share a label (e.g. "dt"/"target_dt",
# "output_dir"/"report"); localize_report_keys disambiguates them when both
# occur in the same dict.
REPORT_KEY_LABELS = {
    "status": "状态",
    "config_path": "配置文件",
    "mapping_csv": "映射文件",
    "paper_table": "论文源表",
    "ebook_table": "图书源表",
    "xinghe_table": "星河全文表",
    "target_table": "目标表",
    "dt": "目标表分区",
    "target_dt": "目标表分区",
    "paper_dt": "论文源表分区",
    "ebook_dt": "图书源表分区",
    "sample_size": "抽样数量",
    "coverage_mode": "覆盖统计模式",
    "null_empty_mode": "空值率统计模式",
    "missing_sample_mode": "缺失样例模式",
    "target_sample_mode": "目标表抽样模式",
    "schema_check": "Schema检查",
    "missing_fields": "缺失字段",
    "extra_fields": "多余字段",
    "expected_count": "预期字段数",
    "actual_count": "实际字段数",
    "coverage_counts": "覆盖统计",
    "paper_source": "论文去重表记录数",
    "ebook_source": "图书去重表记录数",
    "xinghe_only_paper_count": "星河表去重论文兜底数",
    "xinghe_only_ebook_count": "星河表去重图书兜底数",
    "expected_target_count": "理论全量表记录数",
    "actual_target_count": "实际全量表记录数",
    "target_count_diff": "全量表记录数差异",
    "source_field_mapping": "源字段映射校验",
    "checked": "已校验数",
    "passed": "通过数",
    "failed": "失败数",
    "skipped": "跳过数",
    "warning_count": "Warning数量",
    "field_quality": "字段质量校验",
    "fail_rate": "失败率",
    "field_error_summary": "字段错误汇总",
    "reason": "原因",
    "output_dir": "报告目录",
    "details": "明细",
    "null_empty_rates": "空值率统计",
    "top_null_empty_rates": "Top空值率统计",
    "field": "字段",
    "total": "总数",
    "null_count": "NULL数量",
    "empty_count": "空字符串/空集合数量",
    "null_rate": "NULL比例",
    "empty_rate": "空值比例",
    "null_empty_rate": "NULL和空值合计比例",
    "missing_target_samples": "缺失目标样例",
    "mismatches": "字段差异",
    "warnings": "Warning明细",
    "unique_id": "唯一ID",
    "metadata_type": "元数据类型",
    "expected": "预期值",
    "actual": "实际值",
    "report": "报告目录",
    "total_problem_rows": "问题记录数",
    "status_counts": "状态分布",
    "field_counts": "字段问题分布",
    "field_samples": "字段问题样例",
    "warning_samples": "Warning样例",
    "missing_target_sample_counts": "缺失目标样例数量",
    "sample_key": "样例key",
    "data_date": "数据日期",
    "sha256": "sha256",
    "origin_path": "原始路径",
    "error_type": "错误类型",
    "error": "错误信息",
}


# Parent keys (raw and localized spellings) whose direct child keys are data
# field names and therefore must not be translated.
FIELD_NAME_KEY_CONTAINERS = {
    "field_counts",
    "字段问题分布",
    "field_samples",
    "字段问题样例",
    "mismatches",
    "字段差异",
}


def localize_report_keys(value: Any, parent_key: Optional[str] = None) -> Any:
    """Recursively replace report dict keys with their display labels.

    Dict keys are stringified and looked up in ``REPORT_KEY_LABELS``; unknown
    keys are kept as they are. Keys directly below a parent listed in
    ``FIELD_NAME_KEY_CONTAINERS`` are kept untranslated. Lists are processed
    element-wise with the same parent key; all other values are returned
    unchanged.

    When two keys of one dict resolve to the same output name, the first one
    keeps it and each later one is emitted as ``"<name>(<original key>)"``.
    Previously the later value silently replaced the earlier one.

    Args:
        value: Report payload (dict, list or scalar).
        parent_key: Original key under which ``value`` was stored, if any.

    Returns:
        A new structure with localized keys; scalars are returned as-is.
    """
    if isinstance(value, dict):
        keep_raw = parent_key in FIELD_NAME_KEY_CONTAINERS
        localized: Dict[str, Any] = {}
        for key, val in value.items():
            raw = str(key)
            label = raw if keep_raw else REPORT_KEY_LABELS.get(raw, raw)
            if label in localized:
                # Distinct source keys can share one label; keep both values
                # instead of letting the later one overwrite the earlier one.
                label = f"{label}({raw})"
            localized[label] = localize_report_keys(val, raw)
        return localized
    if isinstance(value, list):
        return [localize_report_keys(item, parent_key) for item in value]
    return value
= 3 + + +def top_null_empty_rates(rates: Sequence[Dict[str, Any]], limit: int = 10) -> List[Dict[str, Any]]: + rows: List[Dict[str, Any]] = [] + for row in rates: + if row.get("skipped") or row.get("error"): + continue + total = int(row.get("total") or 0) + null_count = int(row.get("null_count") or 0) + empty_count = int(row.get("empty_count") or 0) + total_rate = (null_count + empty_count) / total if total else 0.0 + rows.append({**row, "null_empty_rate": total_rate}) + rows.sort( + key=lambda item: ( + float(item.get("null_empty_rate") or 0), + int(item.get("null_count") or 0) + int(item.get("empty_count") or 0), + str(item.get("field") or ""), + ), + reverse=True, + ) + return rows[:limit] + + +def build_readable_report_summary(result: Dict[str, Any]) -> Dict[str, Any]: + details = result["details"] + mismatch_rows = details["source_field_mapping"]["mismatches"] + warnings = details["source_field_mapping"]["warnings"] + status_counts: Dict[str, int] = {} + field_counts: Dict[str, int] = {} + field_samples: Dict[str, List[Dict[str, Any]]] = defaultdict(list) + + for row in mismatch_rows: + status = str(row.get("status") or "unknown") + status_counts[status] = status_counts.get(status, 0) + 1 + mismatches = row.get("mismatches") or {} + for field, diff in mismatches.items(): + field_counts[field] = field_counts.get(field, 0) + 1 + if len(field_samples[field]) >= SAMPLES_PER_FIELD: + continue + field_samples[field].append( + { + "unique_id": row.get("unique_id") or row.get("唯一ID"), + "metadata_type": row.get("metadata_type") or row.get("元数据类型"), + "dt": row.get("dt") or row.get("目标表分区"), + "status": status, + "expected": diff.get("expected") if isinstance(diff, dict) else None, + "actual": diff.get("actual") if isinstance(diff, dict) else None, + } + ) + + sorted_field_counts = dict(sorted(field_counts.items(), key=lambda item: (-item[1], item[0]))) + sorted_status_counts = dict(sorted(status_counts.items(), key=lambda item: (-item[1], item[0]))) + missing_samples 
= details.get("missing_target_samples") or {} + if isinstance(missing_samples, dict) and missing_samples.get("skipped"): + missing_sample_counts = {"skipped": 1} + else: + missing_sample_counts = { + name: len(rows) if isinstance(rows, list) else 0 + for name, rows in missing_samples.items() + } + null_empty_rates = details.get("null_empty_rates") or [] + return { + "status": result.get("status"), + "report": result.get("output_dir"), + "mapping_csv": result.get("mapping_csv"), + "paper_table": result.get("paper_table"), + "ebook_table": result.get("ebook_table"), + "xinghe_table": result.get("xinghe_table"), + "target_table": result.get("target_table"), + "target_dt": result.get("target_dt") or result.get("dt"), + "paper_dt": result.get("paper_dt"), + "ebook_dt": result.get("ebook_dt"), + "sample_size": result.get("sample_size"), + "coverage_mode": result.get("coverage_mode"), + "null_empty_mode": result.get("null_empty_mode"), + "missing_sample_mode": result.get("missing_sample_mode"), + "target_sample_mode": result.get("target_sample_mode"), + "schema_check": result.get("schema_check"), + "coverage_counts": result.get("coverage_counts"), + "source_field_mapping": result.get("source_field_mapping"), + "total_problem_rows": len(mismatch_rows), + "status_counts": sorted_status_counts, + "field_counts": sorted_field_counts, + "field_count_total": len(sorted_field_counts), + "field_samples": { + field: field_samples[field] + for field in sorted_field_counts + if field in field_samples + }, + "warning_count": len(warnings), + "warning_samples": warnings[:5], + "null_empty_rates": null_empty_rates, + "top_null_empty_rates": top_null_empty_rates(null_empty_rates), + "missing_target_sample_counts": missing_sample_counts, + "field_quality": result.get("field_quality"), + } + + +def _pct(value: Any) -> str: + try: + return f"{float(value) * 100:.2f}%" + except (TypeError, ValueError): + return "N/A" + + +def _first_present(row: Dict[str, Any], *keys: str) -> Any: + for key 
in keys: + if key in row: + return row.get(key) + return None + + +def build_readable_report_markdown(summary: Dict[str, Any]) -> str: + lines: List[str] = ["# 全量元数据 Union 校验报告摘要", ""] + schema_check = summary.get("schema_check") or {} + coverage_counts = summary.get("coverage_counts") or {} + lines.extend( + [ + f"- 目标分区: `{summary.get('target_dt')}`", + f"- 源分区: paper=`{summary.get('paper_dt')}`, ebook=`{summary.get('ebook_dt')}`", + f"- 抽样数量: `{summary.get('sample_size')}`", + f"- 空值率统计: mode=`{summary.get('null_empty_mode')}`", + f"- 字段不一致记录数: `{summary.get('total_problem_rows')}`", + f"- 报告目录: `{summary.get('report')}`", + "", + ] + ) + + lines.append("## 重点结论") + lines.append("") + missing_fields = _first_present(schema_check, "missing_fields", "缺失字段") or [] + extra_fields = _first_present(schema_check, "extra_fields", "多余字段") or [] + lines.append( + f"- Schema: 缺失 `{len(missing_fields)}` 个字段," + f"多余 `{len(extra_fields)}` 个字段" + ) + expected_target_count = coverage_counts.get("expected_target_count") + actual_target_count = coverage_counts.get("actual_target_count") or coverage_counts.get("target") + target_count_diff = coverage_counts.get("target_count_diff") + if expected_target_count is not None and actual_target_count is not None: + lines.append( + f"- 目标表数量: 理论 `{expected_target_count}`," + f"实际 `{actual_target_count}`,差异 `{target_count_diff}`" + ) + top_fields = list((summary.get("field_counts") or {}).items())[:5] + if top_fields: + lines.append( + "- Top字段问题: " + + ";".join(f"`{field}`={count}" for field, count in top_fields) + ) + else: + lines.append("- Top字段问题: 无") + lines.append("") + + lines.append("## Schema 对比") + lines.append("") + lines.append(f"- 预期字段数: `{_first_present(schema_check, 'expected_count', '预期字段数')}`") + lines.append(f"- 实际字段数: `{_first_present(schema_check, 'actual_count', '实际字段数')}`") + for field in missing_fields[:10]: + lines.append(f"- missing: `{field}`") + for field in extra_fields[:10]: + lines.append(f"- extra: 
`{field}`") + if len(extra_fields) > 10: + lines.append(f"- extra 其余 `{len(extra_fields) - 10}` 个见 summary.json") + lines.append("") + + lines.append("## 覆盖率统计") + lines.append("") + for key, value in coverage_counts.items(): + lines.append(f"- `{key}`: {value}") + lines.append("") + + lines.append("## NULL/空值率统计") + lines.append("") + null_empty_rates = summary.get("null_empty_rates") or [] + if null_empty_rates and isinstance(null_empty_rates[0], dict) and null_empty_rates[0].get("skipped"): + if null_empty_rates[0].get("status") == "failed": + lines.append(f"- 统计失败: `{null_empty_rates[0].get('error_type')}`") + lines.append(f"- 原因: `{null_empty_rates[0].get('error')}`") + else: + lines.append(f"- 未统计: `{null_empty_rates[0].get('reason')}`") + lines.append("- 如需输出实际比例,运行时加 `--null-empty-mode exact`") + else: + rate_rows = [] + for row in null_empty_rates: + if row.get("error") or row.get("错误"): + continue + total = int(_first_present(row, "total", "总数") or 0) + null_count = int(_first_present(row, "null_count", "NULL数量") or 0) + empty_count = int(_first_present(row, "empty_count", "空字符串/空集合数量") or 0) + null_empty_rate = _first_present(row, "null_empty_rate", "NULL和空值合计比例") + if null_empty_rate is None: + null_empty_rate = (null_count + empty_count) / total if total else 0.0 + rate_rows.append( + { + **row, + "field": _first_present(row, "field", "字段"), + "total": total, + "null_count": null_count, + "empty_count": empty_count, + "null_rate": _first_present(row, "null_rate", "NULL比例"), + "empty_rate": _first_present(row, "empty_rate", "空值比例"), + "null_empty_rate": null_empty_rate, + } + ) + rate_rows.sort( + key=lambda row: ( + float(row.get("null_empty_rate") or 0), + int(row.get("null_count") or 0) + int(row.get("empty_count") or 0), + str(row.get("field") or ""), + ), + reverse=True, + ) + for row in rate_rows: + lines.append( + f"- `{row.get('field')}`: NULL `{row.get('null_count')}` " + f"({_pct(row.get('null_rate'))}),空值 `{row.get('empty_count')}` " + 
f"({_pct(row.get('empty_rate'))}),合计 `{_pct(row.get('null_empty_rate'))}`" + ) + if not rate_rows: + lines.append("- 无或未统计") + lines.append("") + + lines.append("## 状态分布") + lines.append("") + for status, count in (summary.get("status_counts") or {}).items(): + lines.append(f"- `{status}`: {count}") + if not summary.get("status_counts"): + lines.append("- 无") + lines.append("") + + lines.append("## 字段问题分布") + lines.append("") + for field, count in (summary.get("field_counts") or {}).items(): + lines.append(f"- `{field}`: {count}") + if not summary.get("field_counts"): + lines.append("- 无") + lines.append("") + + lines.append("## 字段问题样例") + lines.append("") + for field, samples in (summary.get("field_samples") or {}).items(): + count = (summary.get("field_counts") or {}).get(field, len(samples)) + lines.append(f"### {field} ({count})") + lines.append("") + for sample in samples: + lines.append( + f"- unique_id `{sample.get('unique_id')}`, metadata_type=`{sample.get('metadata_type')}`, " + f"dt=`{sample.get('dt')}`, status=`{sample.get('status')}`" + ) + lines.append(f" - expected: `{_json_inline(sample.get('expected'))}`") + lines.append(f" - actual: `{_json_inline(sample.get('actual'))}`") + lines.append("") + + if summary.get("warning_count"): + lines.append("## Warning 样例") + lines.append("") + lines.append(f"- warning_count: `{summary.get('warning_count')}`") + for warning in summary.get("warning_samples") or []: + lines.append(f"- `{_json_inline(warning)}`") + lines.append("") + + lines.append("## 缺失样例数量") + lines.append("") + for key, count in (summary.get("missing_target_sample_counts") or {}).items(): + lines.append(f"- `{key}`: {count}") + lines.append("") + return "\n".join(lines).rstrip() + "\n" + + +def write_report(output_dir: Path, result: Dict[str, Any]) -> None: + details = result["details"] + write_jsonl(output_dir / "source_field_mismatch.jsonl", details["source_field_mapping"]["mismatches"]) + write_jsonl(output_dir / "source_field_warning.jsonl", 
def write_report(output_dir: Path, result: Dict[str, Any]) -> None:
    """Write the mismatch/warning JSONL files, ``summary.json`` and ``readable_summary.md``.

    Args:
        output_dir: Directory receiving the four report files.
        result: Validation result; ``result["details"]["source_field_mapping"]``
            must provide ``mismatches`` and ``warnings``.
    """
    mapping_details = result["details"]["source_field_mapping"]
    write_jsonl(output_dir / "source_field_mismatch.jsonl", mapping_details["mismatches"])
    write_jsonl(output_dir / "source_field_warning.jsonl", mapping_details["warnings"])
    readable_summary = build_readable_report_summary(result)
    write_json(output_dir / "summary.json", readable_summary)
    with (output_dir / "readable_summary.md").open("w", encoding="utf-8") as f:
        f.write(build_readable_report_markdown(readable_summary))


def validate_db(
    *,
    config_path: Path,
    paper_table: str,
    ebook_table: str,
    xinghe_table: str,
    target_table: str,
    dt: Optional[str],
    paper_dt: Optional[str],
    ebook_dt: Optional[str],
    limit: Optional[int],
    output_dir: Optional[Path],
    mapping_csv: Path = DEFAULT_MAPPING_CSV,
    coverage_mode: str = "exact",
    null_empty_mode: str = "exact",
    missing_sample_mode: str = "skip",
    target_sample_mode: str = "natural",
) -> Dict[str, Any]:
    """Validate the unified metadata target table against its source tables.

    Steps, all on one connection from ``connect_starrocks``: schema check,
    optional coverage counts, sampled source-field mapping check, the
    field-quality placeholder, optional null/empty rates and optional
    missing-target samples. Coverage and null/empty statistics are best
    effort: a failure is logged and stored as a "failed" placeholder so the
    report can still be produced.

    Args:
        config_path: Settings file read by ``load_config`` / ``connect_starrocks``.
        paper_table, ebook_table, xinghe_table, target_table: Table names;
            qualified with the configured ``mysql.catalog`` and a default
            schema (``dws`` for the two sources, ``ads`` for the others).
        dt: Target table partition filter.
        paper_dt, ebook_dt: Source table partition filters.
        limit: Sample size for sampled checks; ``None`` means no limit.
            Missing-target sampling falls back to 200 when it is falsy.
        output_dir: When not ``None``, reports are written there.
        mapping_csv: Field mapping CSV passed to ``load_union_specs``.
        coverage_mode: ``"exact"`` runs coverage counts; anything else skips.
        null_empty_mode: ``"exact"`` runs null/empty rates; anything else skips.
        missing_sample_mode: ``"sample"`` collects samples; anything else skips.
        target_sample_mode: Forwarded to ``validate_source_field_mapping``.

    Returns:
        The result dict (also printed as JSON without its ``details`` entry).
    """
    specs = load_union_specs(mapping_csv)
    cfg = load_config(config_path)
    mysql_cfg = cfg.get("mysql", {}) if isinstance(cfg.get("mysql"), dict) else {}
    catalog = mysql_cfg.get("catalog")
    paper_table = qualify_table_name(paper_table, catalog, "dws")
    ebook_table = qualify_table_name(ebook_table, catalog, "dws")
    xinghe_table = qualify_table_name(xinghe_table, catalog, "ads")
    target_table = qualify_table_name(target_table, catalog, "ads")

    # The former ``reconnected_conn`` variable was never assigned a
    # connection, so its try/finally cleanup was dead code and is gone.
    with connect_starrocks(config_path) as conn:
        with timed_step("schema 校验"):
            schema_check = validate_schema(conn, target_table=target_table, specs=specs)

        if coverage_mode == "exact":
            try:
                with timed_step("coverage 总量统计"):
                    coverage_counts = source_coverage_counts(
                        conn,
                        paper_table=paper_table,
                        ebook_table=ebook_table,
                        xinghe_table=xinghe_table,
                        target_table=target_table,
                        target_dt=dt,
                        paper_dt=paper_dt,
                        ebook_dt=ebook_dt,
                    )
            except Exception as exc:
                # Deliberate best effort: keep going with the sampled report.
                coverage_counts = failed_coverage_counts(exc)
                log_step(
                    "coverage 总量统计失败,继续生成抽样报告:"
                    f"{type(exc).__name__}: {exc}"
                )
        else:
            coverage_counts = skipped_coverage_counts("coverage_mode=skip")
            log_step("coverage 总量统计已跳过(使用 --coverage-mode exact 开启)")

        with timed_step("source 字段映射抽样校验"):
            source_field_mapping = validate_source_field_mapping(
                conn,
                specs=specs,
                paper_table=paper_table,
                ebook_table=ebook_table,
                xinghe_table=xinghe_table,
                target_table=target_table,
                target_dt=dt,
                paper_dt=paper_dt,
                ebook_dt=ebook_dt,
                limit=limit,
                target_sample_mode=target_sample_mode,
            )

        field_quality = validate_target_field_values(
            conn,
            target_table=target_table,
            dt=dt,
            limit=limit,
        )

        if null_empty_mode == "exact":
            try:
                with timed_step("null/empty rates 统计"):
                    null_empty_rates = build_null_empty_rates(
                        conn,
                        target_table=target_table,
                        specs=specs,
                        dt=dt,
                    )
            except Exception as exc:
                # Deliberate best effort: the report is still generated.
                null_empty_rates = failed_null_empty_rates(exc)
                log_step(
                    "null/empty rates 统计失败,继续生成报告:"
                    f"{type(exc).__name__}: {exc}"
                )
        else:
            null_empty_rates = skipped_null_empty_rates("null_empty_mode=skip")
            log_step("null/empty rates 统计已跳过(使用 --null-empty-mode exact 开启)")

        if missing_sample_mode == "sample":
            with timed_step("missing target 样例抽取"):
                missing_target_samples = validate_missing_target_samples(
                    conn,
                    paper_table=paper_table,
                    ebook_table=ebook_table,
                    xinghe_table=xinghe_table,
                    target_table=target_table,
                    target_dt=dt,
                    paper_dt=paper_dt,
                    ebook_dt=ebook_dt,
                    limit=limit or 200,
                )
        else:
            missing_target_samples = {"skipped": True, "reason": "missing_sample_mode=skip"}
            log_step("missing target 样例抽取已跳过")

    # Nothing below queries the database, so it runs after the connection
    # context has exited.
    result = {
        "status": "ok",
        "config_path": str(config_path),
        "mapping_csv": str(mapping_csv),
        "paper_table": paper_table,
        "ebook_table": ebook_table,
        "xinghe_table": xinghe_table,
        "target_table": target_table,
        "dt": dt,
        "target_dt": dt,
        "paper_dt": paper_dt,
        "ebook_dt": ebook_dt,
        "sample_size": limit,
        "coverage_mode": coverage_mode,
        "null_empty_mode": null_empty_mode,
        "missing_sample_mode": missing_sample_mode,
        "target_sample_mode": target_sample_mode,
        "schema_check": schema_check,
        "coverage_counts": coverage_counts,
        "source_field_mapping": {
            "checked": source_field_mapping["checked"],
            "passed": source_field_mapping["passed"],
            "failed": source_field_mapping["failed"],
            "skipped": source_field_mapping["skipped"],
            "warning_count": len(source_field_mapping["warnings"]),
        },
        "field_quality": {
            "checked": field_quality["checked"],
            "passed": field_quality["passed"],
            "failed": field_quality["failed"],
            "fail_rate": field_quality["fail_rate"],
            "field_error_summary": field_quality["field_error_summary"],
            "skipped": field_quality.get("skipped", False),
            "reason": field_quality.get("reason"),
        },
        "output_dir": str(output_dir) if output_dir else None,
        "details": {
            "source_field_mapping": source_field_mapping,
            "field_quality": field_quality,
            "null_empty_rates": null_empty_rates,
            "missing_target_samples": missing_target_samples,
        },
    }
    if output_dir is not None:
        write_report(output_dir, result)
    # Console output omits the bulky per-row details.
    print(json.dumps({k: v for k, v in result.items() if k != "details"}, ensure_ascii=False, cls=JsonEncoder))
    return result
def cli() -> None:
    """Parse command-line options and run ``validate_db``.

    ``--config`` is parsed first so that the ``union_unique_meta_data``
    section of that file can supply the defaults of every other option. A
    missing, null or empty config value falls back to the built-in default,
    the same way the rule class resolves its parameters.
    """
    config_parser = argparse.ArgumentParser(add_help=False)
    config_parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH)
    config_args, _ = config_parser.parse_known_args()
    cfg = load_config(config_args.config) if config_args.config.exists() else {}
    union_cfg = cfg.get("union_unique_meta_data")
    if not isinstance(union_cfg, dict):
        # Tolerate a missing or malformed section instead of crashing on .get().
        union_cfg = {}

    default_csv = union_cfg.get("mapping_csv")
    if default_csv:
        default_csv = PROJECT_ROOT / default_csv
    else:
        default_csv = DEFAULT_MAPPING_CSV

    # ``"limit": null`` must not reach int(); only a real value is converted.
    configured_limit = union_cfg.get("limit")
    default_limit = 3000 if configured_limit is None else int(configured_limit)

    parser = argparse.ArgumentParser(
        description="Validate unified metadata target table against DB sources."
    )
    parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH, help="shared settings JSON path")
    parser.add_argument("--mapping-csv", type=Path, default=default_csv, help="field mapping CSV")
    parser.add_argument("--paper-table", default=union_cfg.get("paper_table") or DEFAULT_PAPER_TABLE)
    parser.add_argument("--ebook-table", default=union_cfg.get("ebook_table") or DEFAULT_EBOOK_TABLE)
    parser.add_argument("--xinghe-table", default=union_cfg.get("xinghe_table") or DEFAULT_XINGHE_TABLE)
    parser.add_argument("--target-table", default=union_cfg.get("target_table") or DEFAULT_TARGET_TABLE)
    parser.add_argument("--dt", default=union_cfg.get("dt"), help="target table dt partition filter")
    parser.add_argument(
        "--paper-dt",
        default=union_cfg.get("paper_dt"),
        help="paper unique source dt partition filter; defaults to --dt when omitted",
    )
    parser.add_argument(
        "--ebook-dt",
        default=union_cfg.get("ebook_dt"),
        help="ebook unique source dt partition filter; defaults to --dt when omitted",
    )
    parser.add_argument("--limit", type=int, default=default_limit, help="sample size")
    parser.add_argument("--full", action="store_true", help="validate all target rows for sampled checks")
    parser.add_argument("--output-dir", type=Path, default=union_cfg.get("output_dir"), help="report directory")
    parser.add_argument(
        "--coverage-mode",
        choices=("skip", "exact"),
        default=union_cfg.get("coverage_mode") or "exact",
        help="coverage count mode; exact runs full count and missing-target count SQL, then continues on timeout/error",
    )
    parser.add_argument(
        "--null-empty-mode",
        choices=("skip", "exact"),
        default=union_cfg.get("null_empty_mode") or "exact",
        help="null/empty rate mode; exact scans target fields",
    )
    parser.add_argument(
        "--missing-sample-mode",
        choices=("sample", "skip"),
        default=union_cfg.get("missing_sample_mode") or "skip",
        help="whether to collect source-has-target-missing samples",
    )
    parser.add_argument(
        "--target-sample-mode",
        choices=("natural", "hash"),
        default=union_cfg.get("target_sample_mode") or "natural",
        help="target sample mode; natural is fastest, hash adds CRC32 filter",
    )
    args = parser.parse_args()

    # Source partitions follow the target partition unless given explicitly.
    paper_dt = args.paper_dt or args.dt
    ebook_dt = args.ebook_dt or args.dt
    output_dir = Path(args.output_dir) if args.output_dir else default_output_dir(
        args.dt,
        paper_dt,
        ebook_dt,
        args.limit,
        args.full,
    )

    validate_db(
        config_path=args.config,
        paper_table=args.paper_table,
        ebook_table=args.ebook_table,
        xinghe_table=args.xinghe_table,
        target_table=args.target_table,
        dt=args.dt,
        paper_dt=paper_dt,
        ebook_dt=ebook_dt,
        # --full disables sampling altogether.
        limit=None if args.full else args.limit,
        output_dir=output_dir,
        mapping_csv=args.mapping_csv,
        coverage_mode=args.coverage_mode,
        null_empty_mode=args.null_empty_mode,
        missing_sample_mode=args.missing_sample_mode,
        target_sample_mode=args.target_sample_mode,
    )
from dingo.config.input_args import EvaluatorRuleArgs
from dingo.io.input import Data, RequiredField
from dingo.io.output.eval_detail import EvalDetail, QualityLabel
from dingo.model.model import Model
from dingo.model.rule.base import BaseRule
from dingo.model.rule.scibase.report_utils import bool_param, int_param, write_temp_settings


@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    ["sci_base_qa_test", "union_unique_meta_data"],
)
class RuleSciBaseUnionUniqueMetaDataReport(BaseRule):
    """Rule wrapper that runs the unified-metadata DB validation and writes its reports.

    All settings come from ``dynamic_config.parameters``; the evaluated
    record itself is not inspected.
    """

    _metric_info = {
        "category": "Rule-Based Metadata Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleSciBaseUnionUniqueMetaDataReport",
        "description": "Run SciBase unified metadata DB validation and write reports.",
        "paper_title": "",
        "paper_url": "",
        "paper_authors": "",
        "evaluation_results": "",
    }

    _required_fields = [RequiredField.METADATA]
    dynamic_config = EvaluatorRuleArgs(parameters={})

    @classmethod
    def eval(cls, input_data: Data) -> EvalDetail:
        """Run ``validate_db`` with the configured parameters and grade the outcome.

        The result is reported as bad when the schema check lists missing
        fields or type mismatches, or when the mapping or field-quality
        checks count at least one failure. The reason always carries the
        report directory and both failure counters.
        """
        del input_data  # the rule validates database tables, not the record
        params = cls.dynamic_config.parameters or {}

        def _text(key: str, fallback: Any) -> str:
            return str(params.get(key) or fallback)

        full = bool_param(params, "full", False)
        dt = params.get("dt")
        paper_dt = params.get("paper_dt") or dt
        ebook_dt = params.get("ebook_dt") or dt

        if params.get("output_dir"):
            output_dir = Path(str(params["output_dir"]))
        else:
            output_dir = default_output_dir(
                dt,
                paper_dt,
                ebook_dt,
                int_param(params, "limit", 3000),
                full,
            )

        config_path = write_temp_settings(params)
        result = validate_db(
            config_path=config_path,
            paper_table=_text("paper_table", DEFAULT_PAPER_TABLE),
            ebook_table=_text("ebook_table", DEFAULT_EBOOK_TABLE),
            xinghe_table=_text("xinghe_table", DEFAULT_XINGHE_TABLE),
            target_table=_text("target_table", DEFAULT_TARGET_TABLE),
            dt=dt,
            paper_dt=paper_dt,
            ebook_dt=ebook_dt,
            # ``full`` switches sampling off; the limit is only read otherwise.
            limit=None if full else int_param(params, "limit", 3000),
            output_dir=output_dir,
            mapping_csv=Path(_text("mapping_csv", DEFAULT_MAPPING_CSV)),
            coverage_mode=_text("coverage_mode", "exact"),
            null_empty_mode=_text("null_empty_mode", "exact"),
            missing_sample_mode=_text("missing_sample_mode", "skip"),
            target_sample_mode=_text("target_sample_mode", "natural"),
        )

        mapping_summary = result.get("source_field_mapping") or {}
        field_quality = result.get("field_quality") or {}
        schema_check = result.get("schema_check") or {}

        schema_broken = bool(schema_check.get("missing_fields") or schema_check.get("type_mismatches"))
        mapping_broken = int(mapping_summary.get("failed") or 0) > 0
        quality_broken = int(field_quality.get("failed") or 0) > 0

        reason = [
            str(output_dir),
            f"mapping_failed={mapping_summary.get('failed')}",
            f"field_failed={field_quality.get('failed')}",
        ]
        if not (schema_broken or mapping_broken or quality_broken):
            return EvalDetail(metric=cls.__name__, label=[QualityLabel.QUALITY_GOOD], reason=reason)
        return EvalDetail(
            metric=cls.__name__,
            status=True,
            label=[f"{cls.metric_type}.{cls.__name__}"],
            reason=reason,
        )
__name__ == "__main__": + cli() diff --git a/docs/metrics.md b/docs/metrics.md index 3acdf41d..a97b0a0c 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -127,6 +127,7 @@ This document provides comprehensive information about all quality metrics used | Type | Metric | Description | Paper Source | Evaluation Results | Examples | |------|--------|-------------|--------------|-------------------|----------| | `QUALITY_BAD_EFFECTIVENESS` | RuleMetadataSimilarity | 检查元数据字段与基准数据的相似度匹配,阈值默认为0.6 | Internal Implementation | N/A | N/A | +| `QUALITY_BAD_EFFECTIVENESS` | RuleSciBaseMetaPaperUniqueReport, RuleSciBaseMetaEbookUniqueReport, RuleSciBaseUnionUniqueMetaDataReport, RuleSciBaseMetaPaperDataReport, RuleSciBaseMetaPatentParsedInfoReport | Validate SciBase paper unique, ebook unique, unified metadata, S3 paper-source, and patent XML parsed-field records while writing per-record reports | Internal Implementation | N/A | N/A | ### Rule-Based RESUME Quality Metrics @@ -159,4 +160,3 @@ This document provides comprehensive information about all quality metrics used | `AgentFactCheck` | AgentFactCheck | Agent-based hallucination detection with autonomous web search | Internal Implementation | N/A | N/A | | `ArticleFactChecker` | ArticleFactChecker | Article-level fact checking with autonomous claims extraction and verification | Internal Implementation | N/A | N/A | | `LLMCustomMetric` | LLMCustomMetric | Unified metric for user customization | Internal Implementation | N/A | N/A | - diff --git a/setup.py b/setup.py index 357285fc..01e747cc 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,12 @@ def _read_requirements(path): url="https://github.com/MigoXLab/dingo", packages=find_packages(), include_package_data=True, + package_data={ + "dingo": [ + "model/rule/scibase/assets/*.csv", + "model/rule/scibase/assets/*.json", + ], + }, classifiers=[ "Programming Language :: Python :: 3", "Operating System :: OS Independent",