From d6de13661519c6d5698f9165fb4123f1a1ec73d8 Mon Sep 17 00:00:00 2001
From: guhuaiyu <guhuaiyu@pjlab.org.cn>
Date: Mon, 8 Jun 2026 11:46:38 +0800
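Subject: [PATCH] Add SciBase rule report validators

- Scan dingo/model/rule recursively (skipping __pycache__) so rule modules
  inside subpackages such as dingo/model/rule/scibase are auto-registered.
- In the local and Spark executors, pass the dataset SQL/S3 config, source,
  format and input path to SciBase rules via dynamic_config.parameters.
- Add the SciBase mapping CSV assets and new rule modules; remove
  rule_quanliang.py.
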
---
 dingo/exec/local.py                           |   18 +
 dingo/exec/spark.py                           |   26 +-
 dingo/model/model.py                          |   24 +-
 dingo/model/rule/scibase/__init__.py          |    2 +-
 .../scibase/assets/ebook_unique_mapping.csv   |   21 +
 .../rule/scibase/assets/osi_arxiv_mapping.csv |   61 +
 .../scibase/assets/paper_unique_mapping.csv   |   42 +
 .../rule/scibase/assets/patent_mapping.csv    |  109 +
 .../assets/union_unique_data_mapping.csv      |  109 +
 dingo/model/rule/scibase/meta_ebook_unique.py | 1520 ++++++++
 dingo/model/rule/scibase/meta_paper_data.py   | 3408 +++++++++++++++++
 dingo/model/rule/scibase/meta_paper_unique.py | 2278 +++++++++++
 .../rule/scibase/meta_patent_parsed_info.py   | 1720 +++++++++
 dingo/model/rule/scibase/report_utils.py      |  163 +
 dingo/model/rule/scibase/rule_quanliang.py    |  655 ----
 .../rule/scibase/union_unique_meta_data.py    | 2548 ++++++++++++
 docs/metrics.md                               |    2 +-
 setup.py                                      |    6 +
 18 files changed, 12044 insertions(+), 668 deletions(-)
 create mode 100644 dingo/model/rule/scibase/assets/ebook_unique_mapping.csv
 create mode 100644 dingo/model/rule/scibase/assets/osi_arxiv_mapping.csv
 create mode 100644 dingo/model/rule/scibase/assets/paper_unique_mapping.csv
 create mode 100644 dingo/model/rule/scibase/assets/patent_mapping.csv
 create mode 100644 dingo/model/rule/scibase/assets/union_unique_data_mapping.csv
 create mode 100644 dingo/model/rule/scibase/meta_ebook_unique.py
 create mode 100644 dingo/model/rule/scibase/meta_paper_data.py
 create mode 100644 dingo/model/rule/scibase/meta_paper_unique.py
 create mode 100644 dingo/model/rule/scibase/meta_patent_parsed_info.py
 create mode 100644 dingo/model/rule/scibase/report_utils.py
 delete mode 100644 dingo/model/rule/scibase/rule_quanliang.py
 create mode 100644 dingo/model/rule/scibase/union_unique_meta_data.py

diff --git a/dingo/exec/local.py b/dingo/exec/local.py
index 5f11b1ea..8cf0f53f 100644
--- a/dingo/exec/local.py
+++ b/dingo/exec/local.py
@@ -176,6 +176,24 @@ def evaluate_single_data(self, dingo_id: str, eval_fields: dict, eval_type: str,
                 model_cls = Model.rule_name_map.get(e_c_i.name)
                 model = model_cls()  # 实例化类为对象，避免多线程配置覆盖
                 Model.set_config_rule(model, e_c_i.config)
+                if getattr(model_cls, "__module__", "").startswith("dingo.model.rule.scibase."):
+                    if "dynamic_config" not in model.__dict__:
+                        model.dynamic_config = model.dynamic_config.model_copy(deep=True)
+                    if model.dynamic_config.parameters is None:
+                        model.dynamic_config.parameters = {}
+                    model.dynamic_config.parameters.setdefault(
+                        "_dingo_dataset_sql_config",
+                        self.input_args.dataset.sql_config.model_dump(),
+                    )
+                    model.dynamic_config.parameters.setdefault(
+                        "_dingo_dataset_s3_config",
+                        self.input_args.dataset.s3_config.model_dump(),
+                    )
+                    model.dynamic_config.parameters.setdefault("_dingo_dataset_source", self.input_args.dataset.source)
+                    model.dynamic_config.parameters.setdefault("_dingo_dataset_format", self.input_args.dataset.format)
+                    model.dynamic_config.parameters.setdefault("_dingo_input_path", self.input_args.input_path)
+                    setattr(model_cls, "dynamic_config", model.dynamic_config)
+                    model = model_cls
             elif eval_type == 'llm':
                 model_cls = Model.llm_name_map.get(e_c_i.name)
                 model = model_cls()
diff --git a/dingo/exec/spark.py b/dingo/exec/spark.py
index dd57b6be..b2235a31 100644
--- a/dingo/exec/spark.py
+++ b/dingo/exec/spark.py
@@ -236,8 +236,30 @@ def evaluate_item(self, eval_fields: dict, eval_type: str, map_data: dict, eval_
 
         for e_c_i in eval_list:
             if eval_type == 'rule':
-                model = Model.rule_name_map.get(e_c_i.name)
-                Model.set_config_rule(model, e_c_i.config)
+                model_cls = Model.rule_name_map.get(e_c_i.name)
+                if getattr(model_cls, "__module__", "").startswith("dingo.model.rule.scibase."):
+                    model = model_cls()
+                    Model.set_config_rule(model, e_c_i.config)
+                    if "dynamic_config" not in model.__dict__:
+                        model.dynamic_config = model.dynamic_config.model_copy(deep=True)
+                    if model.dynamic_config.parameters is None:
+                        model.dynamic_config.parameters = {}
+                    model.dynamic_config.parameters.setdefault(
+                        "_dingo_dataset_sql_config",
+                        self.input_args.dataset.sql_config.model_dump(),
+                    )
+                    model.dynamic_config.parameters.setdefault(
+                        "_dingo_dataset_s3_config",
+                        self.input_args.dataset.s3_config.model_dump(),
+                    )
+                    model.dynamic_config.parameters.setdefault("_dingo_dataset_source", self.input_args.dataset.source)
+                    model.dynamic_config.parameters.setdefault("_dingo_dataset_format", self.input_args.dataset.format)
+                    model.dynamic_config.parameters.setdefault("_dingo_input_path", self.input_args.input_path)
+                    setattr(model_cls, "dynamic_config", model.dynamic_config)
+                    model = model_cls
+                else:
+                    model = model_cls
+                    Model.set_config_rule(model, e_c_i.config)
             elif eval_type == 'llm':
                 model = Model.llm_name_map.get(e_c_i.name)
                 Model.set_config_llm(model, e_c_i.config)
diff --git a/dingo/model/model.py b/dingo/model/model.py
index fbcbc7a8..9bd9cec1 100644
--- a/dingo/model/model.py
+++ b/dingo/model/model.py
@@ -124,16 +124,22 @@ def load_model(cls):
         if cls.module_loaded:
             return
         this_module_directory = os.path.dirname(os.path.abspath(__file__))
-        # rule auto register
-        for file in os.listdir(os.path.join(this_module_directory, "rule")):
-            path = os.path.join(this_module_directory, "rule", file)
-            if (
-                os.path.isfile(path)
-                and file.endswith(".py")
-                and not file == "__init__.py"
-            ):
+        # rule auto register - recursively scan subdirectories
+        rule_base_dir = os.path.join(this_module_directory, "rule")
+        for root, dirs, files in os.walk(rule_base_dir):
+            dirs[:] = [d for d in dirs if d != "__pycache__"]
+
+            for file in files:
+                if not file.endswith(".py") or file == "__init__.py":
+                    continue
+                rel_path = os.path.relpath(root, rule_base_dir)
+                if rel_path == ".":
+                    module_name = f"dingo.model.rule.{file[:-3]}"
+                else:
+                    rel_module = rel_path.replace(os.sep, ".")
+                    module_name = f"dingo.model.rule.{rel_module}.{file[:-3]}"
                 try:
-                    importlib.import_module("dingo.model.rule." + file.split(".")[0])
+                    importlib.import_module(module_name)
                 except ModuleNotFoundError as e:
                     log.debug(e)
 
diff --git a/dingo/model/rule/scibase/__init__.py b/dingo/model/rule/scibase/__init__.py
index ac7fae13..cef22734 100644
--- a/dingo/model/rule/scibase/__init__.py
+++ b/dingo/model/rule/scibase/__init__.py
@@ -1 +1 @@
-"""Quanliang/scibase rule implementations."""
+"""SciBase QA rule implementations."""
diff --git a/dingo/model/rule/scibase/assets/ebook_unique_mapping.csv b/dingo/model/rule/scibase/assets/ebook_unique_mapping.csv
new file mode 100644
index 00000000..7b1a02fa
--- /dev/null
+++ b/dingo/model/rule/scibase/assets/ebook_unique_mapping.csv
@@ -0,0 +1,21 @@
+﻿字段名,数据类型,聚合策略,策略参数,源字段名,去重 / 聚合处理逻辑
+isbns,array<string>,isbn_normalize,,,数组聚合，全部转换为13位ISBN格式，10位前面加978，13位保留，其他丢弃，全局去重
+isbn13,string,isbn_min,,,唯一去重键；从isbns数组中取最小的归一化13位ISBN
+title,string,freq_lex_max,min_len=2;max_len=1000,,在非空值里取词频最高；剔除长度<2或>1000极值；词频相同取字典序最大值
+abstract,string,freq_lex_max,min_len=10;max_len=10000,,在非空值里取词频最高；剔除长度<10或>10000极值；词频相同取字典序最大值
+language,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+type,array<string>,dedup_array,lower=true,,统一小写后数组聚合去重
+author,array<string>,dedup_array,,,数组聚合去重
+contributors,array<string>,dedup_array,,,数组聚合去重
+indexed_in,array<string>,dedup_array,,,数组聚合去重
+identifiers,"map<string,string>",merge_map,,,"key去重，相同key取max(value)"
+publication_publisher,array<string>,dedup_array,,publisher,数组聚合去重；原字段名称publisher
+publication_published_year,int,freq_int_max,min_val=1000;max_val=CURRENT_YEAR;extract_year=true,published_year,在非空值里取词频最高；剔除<1000或>当年极值；词频相同取最大值
+publication_published_place,array<string>,dedup_array,,published_place,数组聚合去重；原字段名称published_place
+publication_published_country,array<string>,dedup_array,,published_country,数组聚合去重；原字段名称published_country
+publication_pages,int,max_int,,pages,取本书多版本中的最大页数
+subjects,array<string>,dedup_array,,,数组聚合去重
+genre,array<string>,dedup_array,,,数组聚合去重
+category,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+access_oa_url,array<string>,dedup_array,,oa_url,数组聚合去重；原字段名称oa_url
+dt,string,latest_dt,,,保留最新分区日期
diff --git a/dingo/model/rule/scibase/assets/osi_arxiv_mapping.csv b/dingo/model/rule/scibase/assets/osi_arxiv_mapping.csv
new file mode 100644
index 00000000..e83733d2
--- /dev/null
+++ b/dingo/model/rule/scibase/assets/osi_arxiv_mapping.csv
@@ -0,0 +1,61 @@
+预期字段,arxiv对应字段,字段值数据类型
+track_id,数仓自己赋予,String
+title,title,String
+abstract,abstract,String
+language,无,String
+doi,doi；在 doi 为空时，使用"10.48550/arxiv."拼接doc_id,String
+type,无,List[string]
+author,author 解析为作者数组（字符串拆分）,List[string]
+identifiers,oaiId->oai_identifier、"arxivId->paper_id去掉http前缀",Object
+indexed_in,新增字符串"arxiv",List[string]
+published_date,updated,String
+published_year,updated中年份，s3是RFC 1123 时间格式，需要转化为 yyyy-mm-dd 格式和db做对比,Integer
+venue,,Object
+venue.name,journal_ref ：从 journal_ref 解析期刊/会议名(后续处理),String
+venue.type,无,String
+venue.issn,无,List[string]
+venue.publisher,无,List[string]
+venue.biblio,,Object
+venue.biblio.volume,从 journal_ref解析(后续处理),String
+venue.biblio.issue,从 journal_ref解析(后续处理),String
+venue.biblio.pages,从 journal_ref解析(后续处理),String
+access_is_oa,布尔值true,String
+access_oa_status,空字符串,String
+access_oa_url,pdf_url（get_pdf=0 时为""）,String
+access_license,license_url 将协议链接映射为对应可选值填入,String
+keywords,无,List[string]
+fieldsOfStudy,无,List[object]
+s2FieldsOfStudy,无,List[object]
+primary_topic,无,Object
+topics,无,List[object]
+concepts,无,List[object]
+subject,无,String
+major,无,String
+major_2,无,String
+major_3,无,String
+category,无,String
+area,无,String
+grade_class,无,String
+grade,无,String
+origin_id,doc_id,String
+origin_osi,取值"arxiv",String
+origin_db_source,无,String
+reference_count,无,Integer
+citation_count,无,Integer
+influential_citation_count,无,Integer
+references,无,List[string]
+related_works,无,List[string]
+citation_normalized_percentile,无,Object
+cited_by_percentile_year,无,Object
+fwci,无,Float
+cited_by_api_url,无,String
+locations,,List[object]
+locations.type,对pdf_url来说，get_pdf值为1时，值为download，get_pdf值为0时，保持空字符串；对source_url来说，get_source值为1时值为download，get_source值为0时保持空字符串。,String
+locations.url,"pdf_url,source_url",String
+locations.license,license_url 将协议链接映射为对应可选值,String
+locations.is_oa,对pdf_url来说，get_pdf值为1时，值为true，get_pdf值为0时为false；对source_url来说，get_source值为1时值为true，get_source值为0时为false。,String
+classifications,,Object
+mesh,无,List[object]
+msc_class,msc_class,String
+acm_class,acm_class,String
+arxiv_category,category,List[string]
diff --git a/dingo/model/rule/scibase/assets/paper_unique_mapping.csv b/dingo/model/rule/scibase/assets/paper_unique_mapping.csv
new file mode 100644
index 00000000..01395384
--- /dev/null
+++ b/dingo/model/rule/scibase/assets/paper_unique_mapping.csv
@@ -0,0 +1,42 @@
+﻿字段名,数据类型,聚合策略,策略参数,源字段名,去重 / 聚合处理逻辑
+doi,string,key_lower,,,唯一去重键，精确匹配，统一小写
+identifiers,"map<string,string>",merge_identifiers,,,"MAP聚合，key去重，doi/DOI/mag/MAG小写后与origin_osi拼接，相同key取max(value)"
+indexed_in,array<string>,dedup_array,,,数组聚合并去重
+type,array<string>,dedup_array,lower=true,,统一小写后数组聚合去重
+title,string,freq_lex_max,min_len=2;max_len=1000,,在非空值里取词频最高；剔除长度<2或>1000极值；词频相同取字典序最大值
+abstract,string,freq_lex_max,min_len=10;max_len=10000,,在非空值里取词频最高；剔除长度<10或>10000极值；词频相同取字典序最大值
+author,"array<struct<name:string,orcid:string>>",dedup_struct,,,数组聚合去重
+language,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+published_year,int,freq_int_max,min_val=1000;max_val=CURRENT_YEAR,,在非空值里取词频最高；剔除<1000或>当年极值；词频相同取最大值
+published_date,string,freq_date,,,在非空值里取词频最高的出版日期；剔除年份异常值；词频相同取字典序最大值
+venue_name,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+venue_type,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+venue_issn,array<string>,dedup_array,,,数组聚合去重
+venue_publisher,array<string>,dedup_array,,,数组聚合去重
+access_license,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+biblio_volume,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+biblio_issue,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+biblio_pages,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+access_is_oa,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+access_oa_status,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+access_oa_url,array<string>,dedup_array,,,数组聚合去重
+locations,"array<struct<type:string,url:string,license:string,is_oa:string>>",dedup_locations,,,"STRUCT转成STRING再去重，key已排序"
+keywords,array<string>,dedup_array,,,数组聚合去重
+fieldsOfStudy,"array<map<string,string>>",dedup_map,,,MAP转成STRING再去重
+s2fieldsofstudy,"array<map<string,string>>",dedup_map,,,MAP转成STRING再去重
+primary_topic,"STRUCT<id:STRING,display_name:STRING,score:DECIMAL(10,4),subfield:STRUCT<id:STRING,display_name:STRING>,field:STRUCT<id:STRING,display_name:STRING>,domain:STRUCT<id:STRING,display_name:STRING>>",freq_struct,,,在非空值里取词频最高；词频相同取字典序最大值
+topics,"ARRAY<STRUCT<id:STRING,display_name:STRING,score:DECIMAL(10,4),subfield:STRUCT<id:STRING,display_name:STRING>,field:STRUCT<id:STRING,display_name:STRING>,domain:STRUCT<id:STRING,display_name:STRING>>>",dedup_struct,,,STRUCT转成STRING再去重
+concepts,"array<map<string,string>>",dedup_map,,,MAP转成STRING再去重
+category,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+reference_count,int,freq_int_max,,,在非空值里取词频最高；词频相同取最大值
+citation_count,int,freq_int_max,,,在非空值里取词频最高；词频相同取最大值
+influential_citation_count,int,freq_int_max,,,在非空值里取词频最高；词频相同取最大值
+references,array<string>,dedup_array,,,数组聚合去重
+related_works,array<string>,dedup_array,,,数组聚合去重
+citation_normalized_percentile,"MAP<STRING,STRING>",merge_map,,,"MAP聚合，key去重，相同key取max(value)"
+cited_by_percentile_year,"MAP<STRING,STRING>",merge_map,,,"MAP聚合，key去重，相同key取max(value)"
+fwci,"decimal(15,4)",freq_decimal_max,,,在非空值里取词频最高；词频相同取最大值
+cited_by_api_url,string,freq_lex_max,,,在非空值里取词频最高；词频相同取字典序最大值
+mesh,"ARRAY<MAP<STRING,STRING>>",dedup_map,,,MAP转成STRING再去重
+classifications,"STRUCT<mesh:ARRAY<MAP<STRING,STRING>>,msc_class:STRING,acm_class:STRING,arxiv_category:ARRAY<STRING>>",random_pick_cls,,,"mesh随机取一条不为空的值；msc_class/acm_class/arxiv_category从arxiv记录取"
+dt,string,latest_dt,,,保留最新分区日期
diff --git a/dingo/model/rule/scibase/assets/patent_mapping.csv b/dingo/model/rule/scibase/assets/patent_mapping.csv
new file mode 100644
index 00000000..0f2bd1b2
--- /dev/null
+++ b/dingo/model/rule/scibase/assets/patent_mapping.csv
@@ -0,0 +1,109 @@
+预期字段名,xml映射字段,数据类型,字段描述,有效性规则,可空,模块
+document_number,business:BibliographicData/business:PublicationReference[@dataFormat='original']/base:DocumentID → WIPOST3Code+DocNumber+Kind；回退 standard/original 或根 @country+@docNumber+@kind,string,文件号（包含所有公开/公告阶段的专利文件编号（如公开号、专利号）。）,非空；同一局内唯一；保留原始格式,否*,著录信息
+document_kind_text,business:BibliographicData/business:SpecificBibliographicData/business:OriginalKindCode,string,文件种类文字描述,与 13 一致,否*,著录信息
+document_kind_code,PublicationReference(original)/base:DocumentID/base:Kind；回退根 @kind,string,ST.16 文件种类代码,必须是合法 ST.16 代码，可选值A1、B、U、S、A、B1、B2,否*,著录信息
+document_status_code,business:Abstract/@status；回退根 @status,string,数据版本状态标识,"不改变专利本身的法律效力，仅反映数据文件的版本可靠性。例如： 
+A：原始首次发布的数据； 
+C：因内容错误（如印刷错误、技术描述修正）发布的更正版数据； 
+D：数据已废弃（通常因重大错误）。",,著录信息
+document_wipo_country_code,PublicationReference(original)/base:DocumentID/base:WIPOST3Code；回退根 @country,string,ST.3,,是,著录信息
+publication_date,PublicationReference/base:DocumentID/base:Date（优先 dataFormat=standard/original）；回退根 @datePublication,string,,,,著录信息
+publication_language,根 @lang,string,公开语言,ISO 639 两位语言码,是,著录信息
+publication_office_code,PublicationReference/@sourceDB 或 DocumentID/WIPOST3Code；回退根 @country,string,公布局/组织代码,ST.3 两位代码或官方组织标识,条件否**,著录信息
+correction_info,business:BibliographicData/business:PublicAvailabilityDate/business:GrantTerms/business:Disclaimer/base:Text,object,更正/勘误信息,结构需符合 ST.50,是,著录信息
+invention_title,business:BibliographicData/business:InventionTitle[@lang=publication_language],string,发明名称,非空；建议长度限制,否*,技术信息
+ipc,business:ClassificationIPC/business:MainClassification|FurtherClassification[@dataFormat='original'],list[string],IPC(国际专利分类),必须是合法分类符号,是*,技术信息
+ipc_text,business:ClassificationIPC/base:Text（按行拆分）,list[string],,,,技术信息
+ipc_edition_statement,business:ClassificationIPC/base:EditionStatement,string,IPC版本号,,,技术信息
+ipcr_classifications,business:ClassificationIPCRDetails/business:ClassificationIPCR/*,list[object],IPCR（改革后 IPC 分类详细信息）,建议每项包含 classification_symbol、classification_version_date、classification_level、classification_value、action_date 等结构化信息；保留原始分类号,是,技术信息
+patent_national_classifications,business:ClassificationNational/business:MainClassification[@dataFormat='original'],list[string],国家专利分类号,保留原始格式；建议记录分类体系来源或国家/地区代码,是,技术信息
+patent_domestic_classifications,business:ClassificationDomestic|DomesticClassification|DomesticPatentClassification/business:MainClassification,list[string],国内/本国专利分类号,保留原始格式；适用于局内自有分类体系,是,技术信息
+patent_fi_classifications,business:ClassificationFI|FIClassification|ClassificationFIData/business:MainClassification|base:Text,list[string],FI 分类号,保留原始格式；适用于日本 FI 分类体系,是,技术信息
+patent_cpc_classifications,business:PatentClassificationDetails/business:PatentClassification[business:ClassificationScheme/@scheme='CPC']；SearchField 下同类节点,list[string],CPC（合作专利分类）分类号,必须是合法 CPC 分类符号；保留原始格式,是,技术信息
+patent_locarno_classes,business:ClassificationLocarno/business:MainClassification,list[string],Locarno Classification工业外观分类号,,是,技术信息
+prior_art_references,business:ReferencesCited/business:Citation/business:ApplicationCitation/business:PublicationReference,list[object],对比文件/现有技术文献,每项建议含 citation_text、doc_number、category,是,技术信息
+search_field,business:SearchField/business:ClassificationNational；PatentClassificationDetails,list[string],检索领域/检索分类,与分类体系一致,是,技术信息
+,,,,,,技术信息
+abstract,business:Abstract/base:Paragraphs,string,摘要全文,非结构化文本,是,全文内容
+description,business:Description/business:Heading|base:Paragraphs|base:Image（递归 business 子节）,list[object],说明书全文,非结构化文本,是,全文内容
+claims,business:Claims/business:Claim/business:ClaimText,list[object],权利要求项,每项至少含 claim_id+claim_num+claim_text,是,全文内容
+drawings,business:Drawings/base:Figure/base:Image,list[object],附图信息,每项至少含 figure_id+image_file；可带 width/height/orientation,是,全文内容
+chemistry,//Chemistry|business:ChemistrySection|ChemicalFormulae|ChemicalFormula 内 Chemistry/base:Image,list[object],化学相关专利独有的item信息,,是,全文内容
+content,根下 Abstract+Description+Drawings+Claims 按文档顺序拼接（标题用 invention_title）,string,处理后专利全文,,是,全文内容
+applicants,business:Parties/business:ApplicantDetails/business:Applicant/base:AddressBook,list[object],申请人,每项至少 name；建议带 country,条件否**,当事人
+assignees,business:Parties/business:AssigneeDetails/business:Assignee/base:AddressBook,list[object],专利权人/所有人,每项至少 name,条件否**,当事人
+inventors,business:Parties/business:InventorDetails/business:Inventor/base:AddressBook,list[object],发明人,已知即填；每项至少 name,是,当事人
+designers,business:Parties/business:DesignerDetails/business:Designer/base:AddressBook,list[object],设计人,,是,当事人
+patent_agents,business:Parties/business:AgentDetails/business:Agent|business:AgentDetails/business:Agent/business:Agency,list[object],代理人,每项至少 name,是,当事人
+patent_agency,business:AgentDetails/business:PatentAgency/base:AddressBook；business:CustomerNumber,list[object],代理机构,,是,当事人
+priority_numbers,business:PriorityClaim/base:DocumentID/base:DocNumber,list[string],优先权申请号,至少 1 个；格式按 ST.10/C、ST.34,否*,优先权
+priority_filing_dates,business:PriorityClaim/base:DocumentID/base:Date,array<date>,优先权申请日,日期按 ST.2,否*,优先权
+priority_office_codes,business:PriorityClaim/base:OfficeCode|business:GeneratingOffice|base:WIPOST3Code,list[string],优先权受理局代码,ST.3 两位代码；PCT 用 WO,否*,优先权
+priority_country_codes,business:PriorityClaim/base:DocumentID/base:WIPOST3Code,list[string],区域/国际优先权指定国家代码,ST.3；至少一个缔约方/WTO 成员,是,优先权
+public_availability_group,business:PublicAvailabilityDate/*（未映射到具名字段的子节点日期）,object,公开/公告日期分组,仅作容器,是,公开/公告/授权时间
+public_availability_unexamined_view_date,business:PublicAvailabilityDate/*Unexamined*View* → base:Date,date,未审查文献阅览公开日,日期按 ST.2,条件否**,公开/公告/授权时间
+public_availability_examined_view_date,business:PublicAvailabilityDate/*Examined*View* → base:Date,date,已审查文献阅览公开日,日期按 ST.2,条件否**,公开/公告/授权时间
+public_availability_unexamined_print_date,business:PublicAvailabilityDate/*Unexamined*Print* → base:Date,date,未审查文献印刷公开日,日期按 ST.2,条件否**,公开/公告/授权时间
+public_availability_examined_print_date,business:PublicAvailabilityDate/*Examined*Print* → base:Date,date,已审查文献印刷公开日,日期按 ST.2,条件否**,公开/公告/授权时间
+grant_publication_date,PublicAvailabilityDate 授权日节点；或 kind∈B/B1/B2 时 publication_date,date,授权文献公开日,日期按 ST.2,条件否**,公开/公告/授权时间
+claims_only_public_date,business:PublicAvailabilityDate/*ClaimsOnly* → base:Date,date,仅权利要求公开日,日期按 ST.2,是,公开/公告/授权时间
+granted_view_date,business:PublicAvailabilityDate/*Granted*View* → base:Date,date,已授权文献阅览公开日,日期按 ST.2,条件否**,公开/公告/授权时间
+corrected_document_issue_date,business:PublicAvailabilityDate/*Corrected* → base:Date,date,更正文献发行日,日期按 ST.2,否*,公开/公告/授权时间
+spc_basic_patent_reference,business:SupplementaryProtectionCertificate|business:SPC/business:BasicPatent|UnderlyingPatent,object,SPC 基础专利引用,含基础专利号，可带公开号,是,SPC（补充保护证书）
+spc_first_national_marketing_auth,SPC/business:FirstNationalMarketingAuthorization|NationalMarketingAuth,object,SPC 首次国家上市许可,含 number+date,是,SPC（补充保护证书）
+spc_first_regional_marketing_auth,SPC/business:FirstRegionalMarketingAuthorization|RegionalMarketingAuth,object,SPC 首次区域上市许可（仅欧盟等区域体系需用）,含 number+date，可选 country_of_origin(ST.3),是,SPC（补充保护证书）
+spc_expiry_or_duration,SPC/business:Expiry|Duration|TermOfProtection,object,SPC 到期日或期限,至少含 expiry_date 或 duration 之一,是,SPC（补充保护证书）
+spc_protected_product_name,SPC/business:Product|ProductName,string,SPC 保护产品名称,非空时建议与基础专利一致性校验,是,SPC（补充保护证书）
+spc_application_date,SPC/business:ApplicationReference/base:DocumentID/base:Date,string,SPC申请日（验证是否符合6个月时限）,,,SPC（补充保护证书）
+spc_country_code,SPC/base:WIPOST3Code 或 @country,string,SPC管辖国家,,,SPC（补充保护证书）
+spc_underlying_patent_number,SPC BasicPatent/DocumentID/base:DocNumber,string,关联的基础专利号（SPC法律效力的唯一依据）,,,SPC（补充保护证书）
+application_numbers,business:ApplicationReference[@dataFormat='original']/base:DocumentID → WIPOST3Code+DocNumber,list[string],申请号,至少 1 个；保留原格式,否*,申请
+filing_dates,business:ApplicationReference/base:DocumentID/base:Date,list[string],申请日,日期按 ST.2；与 21 可一一对应,否*,申请
+original_filing_language,ApplicationReference/@lang；回退根 @lang,string,原始申请语言,ISO 639 两位语言码,是,申请
+effective_rights_date,ApplicationReference/base:DocumentID/base:Date（首个非空）,date,权利生效日,日期按 ST.2,是,申请
+previous_application_reference,business:PreviousApplicationReference|ParentApplicationReference|ProvisionalApplicationReference；ApplicationReference[@applType=provisional|parent|previous],object,PLT 5(7) 先前申请引用,需含 office_code+application_number，可选 filing_date,是,申请
+pct_designated_states,business:PctOrRegionalFilingData//DesignatedState|base:WIPOST3Code|CountryCode,list[string],PCT 指定国,ST.3 两位代码,是,PCT
+regional_designated_states,business:RegionalFilingData//DesignatedState|base:WIPOST3Code,list[string],区域专利指定缔约国,ST.3 两位代码,是,PCT
+pct_national_phase_date,business:PctNationalPhaseEntry|NationalPhaseEntry/base:Date,date,PCT 进入国家/地区阶段日期,日期按 ST.2,是,PCT
+pct_filing_data,business:PctOrRegionalFilingData/base:DocumentID/base:Date,list[object],PCT 国际申请提交数据,每项含 filing_date+application_number，可选 language,是,PCT
+pct_publication_data,business:PctOrRegionalFilingData/business:PctPublication/base:DocumentID,list[object],PCT 国际公开数据,每项含 publication_date+publication_number，可选 language,是,PCT
+pct_effect_ceased_date,business:PctRefiledRevised|RefiledRevisedApplication/base:Date,date,PCT 国际申请失效/未进阶段确认日,日期按 ST.2,是,PCT
+search_report_deferred_publication_date,business:SearchReportDifferentPublication/base:Date,date,检索报告延迟公开日,日期按 ST.2,是,PCT
+regional_filing_data,business:RegionalFilingData/base:DocumentID/base:Date,list[object],区域申请提交数据,每项含 filing_date+application_number，可选 language,是,PCT
+regional_publication_data,business:RegionalFilingData/business:RegionalPublication/base:DocumentID,list[object],区域申请/授权公开数据,每项含 publication_date+publication_number，可选 language,是,PCT
+microorganism_deposit_info,business:BiologicalDeposit|MicroorganismDeposit|MicroorganismDepositDetails|DepositInstitution（容器）,object,微生物保藏信息,仅作容器,是,微生物保藏
+microorganism_deposit_no,保藏节点 base:DepositNumber|business:DepositNumber|base:AccessionNumber,string,微生物保藏编号,,,微生物保藏
+microorganism_deposit_address,保藏节点 base:AddressBook/base:Address,string,微生物保藏地址,,,微生物保藏
+microorganism_deposit_date,保藏节点 base:Date|business:DepositDate,string,微生物保藏时间,,,微生物保藏
+microorganism_deposit_unit,保藏节点 base:Name|business:InstitutionName|AddressBook,string,微生物保藏单位名称,,,微生物保藏
+microorganism_deposit_unit_code,保藏节点 base:WIPOST3Code|business:InstitutionCode,string,微生物保藏单位编号,,,微生物保藏
+microorganism_deposit_survival_status,保藏节点 business:RevivalStatus|@revivalStatus,string,存活情况,,,微生物保藏
+,,,,,,微生物保藏
+addition_relation,business:Addition|AdditionApplication|AdditionRelation → DocumentID|ApplicationReference,object,追加/附加专利关联,含前案号，宜带申请日,是,分案与分组
+division_relation,business:Division|DivisionApplication|DivisionRelation,object,分案来源关联,含母案号，宜带申请日,是,分案与分组
+continuation_relation,business:Continuation|ContinuationApplication|ContinuationInPart|ContinuationRelation；或 Description 标题 RELATED APPLICATION,object,continued/continuation 关联,含前案号+申请日,是,分案与分组
+reissue_publication_number,business:ReissuePublication|ReissuePublicationReference,string,再版/再授权前公开号,非空时应为合法文献号,是,分案与分组
+same_application_previous_publication_number,business:SameApplicationPreviousPublication|SameApplicationPreviousPublicationNumber|PreviousPublicationSameApplication,string,同一申请先前公开号,非空时应为合法文献号,是,分案与分组
+substitute_application_relation,business:Substitute|SubstituteApplication|SubstituteApplicationRelation,object,替代申请关联,含前案号+申请日,是,分案与分组
+utility_model_base_relation,business:UtilityModel|UtilityModelBasis|UtilityModelBase,object,实用新型基础专利/申请关联,含案号，宜带申请日,是,分案与分组
+attachments,//business:Attachment,list[object],附件列表,每项建议含 attachment_type+attachment_name+file_name+order；可带 copies/pages,是,附件与变更
+bibliographic_changes,business:BibliographicChange/base:Date + 节点全文,list[object],著录项目变更,每项建议含 change_seq+change_item+before_value+after_value,是,附件与变更
+,,,,,,附件与变更
+sha256,,,,,,库信息
+"origin_url 
+",,,,,,库信息
+"origin_path
+",,,,,,库信息
+file_format,,,,,,库信息
+"file_type
+",,,,,,库信息
+"obtain_timestamp
+",,,,,,库信息
+content_type,,,,,,库信息
+content_length,,,,,,库信息
+process_status,,,,,,库信息
+processed_path,,,,,,库信息
+page_cnt,,,,,,库信息
+is_broken,,,,,,库信息
+dt,,,,,,库信息
+patent_source,,,,,,库信息
diff --git a/dingo/model/rule/scibase/assets/union_unique_data_mapping.csv b/dingo/model/rule/scibase/assets/union_unique_data_mapping.csv
new file mode 100644
index 00000000..3f3785c3
--- /dev/null
+++ b/dingo/model/rule/scibase/assets/union_unique_data_mapping.csv
@@ -0,0 +1,109 @@
+统一字段名,字段值数据类型,源字段映射(论文),源字段映射(图书),源字段映射(星河),字段有效性规则,备注,字段来源
+unique_id,String,'paper:{doi}','ebook:{isbn13}',,,全量表唯一标识，用于跨 Metadata 与 Fulltext 数据关联、去重与检索。,全量表生成字段
+metadata_type,String,,,,"可选值：""paper"",""ebook""",【Metadata表】元数据来源类型。论文来源取值 paper，图书来源取值 ebook。数据类型：String。,Metadata表
+doi,String,doi,,doi,"格式参考10.1016/s0021-9258(19)52451-6，需均保持小写，无前缀“https://doi.org/”部分
+规范：星河图书馆qa测试代码","【Metadata表】数字对象唯一标识符（DOI），主要用于论文等学术资源定位。数据类型：String。来源映射：OSI论文字段=doi。
+【Fulltext表】全文解析得到的 DOI。数据类型：string。","Metadata表, Fulltext表"
+isbns,List[string],,isbns,,"符合 ISBN 校验规则。13 位与 10 位两种格式。格式参考[
+        ""9781426208072"",
+        ""1426208073""
+      ]    
+规范：星河图书馆qa测试代码",【Metadata表】图书 ISBN 列表，可能包含多个 ISBN。数据类型：List[string]。来源映射：OSI图书字段=isbns。,Metadata表
+isbn13,String,,isbn13,,"格式参考""9781426208072""
+规范：星河图书馆qa测试代码",【Metadata表】13 位 ISBN，图书资源的标准编号。数据类型：String。来源映射：OSI图书字段=isbn13。,Metadata表
+title,String,title,title,title,"不包含不可见字符
+规范：星河图书馆qa测试代码","【Metadata表】资源标题/题名。数据类型：String。来源映射：OSI图书字段=title；OSI论文字段=title。
+【Fulltext表】全文解析得到的标题。数据类型：string。","Metadata表, Fulltext表"
+abstract,String,abstract,abstract,abstract,"不包含不可见字符
+规范：星河图书馆qa测试代码","【Metadata表】摘要、简介或内容概述。数据类型：String。来源映射：OSI图书字段=abstract；OSI论文字段=abstract。
+【Fulltext表】全文解析得到的摘要。数据类型：string。","Metadata表, Fulltext表"
+language,String,language,language,language,"有效性规则见元数据目录格式标准【WIP】
+ISO 639-1语言代码标识，其无法覆盖范围则应用iso ISO 639-3代码。规范：星河图书馆qa测试代码","【Metadata表】资源语言。数据类型：String。来源映射：OSI图书字段=language；OSI论文字段=language。
+【Fulltext表】全文解析得到的语言。数据类型：string。","Metadata表, Fulltext表"
+type,String,type,type,,,【Metadata表】资源类型或文献类型。数据类型：String。来源映射：OSI图书字段=type；OSI论文字段=type。,Metadata表
+author,List[object],author,author,author,"2026.06.01 author字段格式修改为List[object]。每个对象都有两个key-value对，key分别为：""name""、""orcid""，其中name存储作者姓名，orcid存储作者的开放学者身份标识。举例：[
+    {
+      ""name"": ""Alan Aspuru-Guzik"",
+      ""orcid"": ""https://orcid.org/0000-0002-8277-4434""
+    }]。
+每个string指向一个作者，不能多个人名放在同个string下
+每个string不包含分割性质字符，规范：星河图书馆qa测试代码","【Metadata表】作者列表。数据类型：List[object]。来源映射：OSI图书字段=author；OSI论文字段=author。
+【Fulltext表】全文解析得到的作者列表。数据类型：List[object]。","Metadata表, Fulltext表"
+contributors,List[string],,contributors,,"每个string指向一个作者，不能多个人名放在同个string下
+每个string不包含分割性质字符，规范：星河图书馆qa测试代码",【Metadata表】贡献者列表，如编者、译者等。数据类型：List[string]。来源映射：OSI图书字段=contributors。,Metadata表
+indexed_in,List[string],indexed_in,indexed_in,,,【Metadata表】收录/索引来源列表。数据类型：List[string]。来源映射：OSI图书字段=indexed_in；OSI论文字段=indexed_in。非对外字段。,Metadata表
+identifiers,Object,identifiers,identifiers,,,【Metadata表】其他外部标识符集合。数据类型：Object。来源映射：OSI图书字段=identifiers；OSI论文字段=identifiers。非对外字段。,Metadata表
+locations,List[object],locations,,,"每个object都有4个属性：
+type：可选值：""download"",""reader"",""display"",""""
+url：正则规范 r'^[Hh][Tt][Tt][Pp][Ss]?://[^/$.?#][\s\S]*$'
+license：可选值: cc-by、cc-by-nc、cc-by-sa、cc-by-nd、cc-by-nc-sa、cc-by-nc-nd、other-oa、cc0、""""、public-domain、publisher-specific-oa、publisher-specific、wiley-specific、elsevier-specific、oup-specific、acs-specific、rsc-specific、iop-specific、other-oa、unspecified-oa、implied-oa、nonexclusive-distrib 。以下协议类型非论文许可而为数据许可（gpl-v1、gpl-v2、gpl-v3、mit）政府许可（ogl-c）未知许可类型（pd）,但允许出现在可选值中。
+is_oa：可选值：""true"" ""false"" ""unknown""",【Metadata表】资源可访问位置、来源链接或馆藏/开放获取位置列表。数据类型：List[object]。来源映射：OSI论文字段=locations。,Metadata表
+access_is_oa,String,access_is_oa,,,可选值："true" "false" "",【Metadata表】是否开放获取。数据类型：String。来源映射：OSI论文字段=is_oa。,Metadata表
+access_oa_status,String,access_oa_status,,,可选值：diamond、gold、green、hybrid、bronze、closed、"",【Metadata表】开放获取状态。数据类型：String。来源映射：OSI论文字段=oa_status。,Metadata表
+access_oa_url,List[string],access_oa_url,access_oa_url,,正则规范 r'^[Hh][Tt][Tt][Pp][Ss]?://[^/$.?#][\s\S]*$',【Metadata表】开放获取 URL。数据类型：List[string]。来源映射：OSI图书字段=oa_url；OSI论文字段=oa_url。,Metadata表
+access_license,String,access_license,,,"可选值: cc-by、cc-by-nc、cc-by-sa、cc-by-nd、cc-by-nc-sa、cc-by-nc-nd、other-oa、cc0、""""、public-domain、publisher-specific-oa、publisher-specific、wiley-specific、elsevier-specific、oup-specific、acs-specific、rsc-specific、iop-specific、other-oa、unspecified-oa、implied-oa、nonexclusive-distrib 。以下协议类型非论文许可而为数据许可（gpl-v1、gpl-v2、gpl-v3、mit）政府许可（ogl-c）未知许可类型（pd）,但允许出现在可选值中。",【Metadata表】开放获取或使用许可协议。数据类型：String。来源映射：OSI论文字段=license。,Metadata表
+publication_published_date,String,published_date,,,格式为"1951-11-01"即"yyyy-mm-dd",【Metadata表】出版/发表日期。数据类型：String。来源映射：OSI论文字段=published_date。,Metadata表
+publication_published_year,Integer,published_year,publication_published_year,,格式为1951，大于0，小于2100,【Metadata表】出版/发表年份。数据类型：Integer。来源映射：OSI图书字段=published_year；OSI论文字段=published_year。,Metadata表
+publication_published_place,List[string],,publication_published_place,,,【Metadata表】出版地。数据类型：List[string]。来源映射：OSI图书字段=published_place。,Metadata表
+publication_published_country,List[string],,publication_published_country,,,【Metadata表】出版国家/地区。数据类型：List[string]。来源映射：OSI图书字段=published_country。,Metadata表
+publication_venue_name,String,venue_name,,,,【Metadata表】发表载体名称，如期刊、会议或图书系列名称。数据类型：String。来源映射：OSI论文字段=venue.name。,Metadata表
+publication_venue_name_unified,String,,,,期刊字典标准化如果publication_venue_name在target_journal_name列，那么本字段就是target_journal_name列，不在则取publication_venue_name,归一化的期刊统一命名,全量表生成字段
+publication_venue_type,String,venue_type,,,,【Metadata表】发表载体类型。数据类型：String。来源映射：OSI论文字段=venue.type。,Metadata表
+publication_venue_issn,List[string],venue_issn,,,"其中string元素：由8位数字组成。8位数字分为前后两段各4位，中间用连接号相连，""xxxx-xxxx""，（前7位数字为单纯的数字序号，无任何特殊含义，第八个数字是根据前七个数字按模数 11算法计算得出的检验码。若计算结果为10，第八个数字可为“X”）
+规范：每个string元素 星河图书馆qa测试代码",【Metadata表】发表载体 ISSN 列表。数据类型：List[string]。来源映射：OSI论文字段=venue.issn。,Metadata表
+publication_publisher,List[string],venue_publisher,publication_publisher,,,【Metadata表】出版方/出版社。数据类型：List[string]。来源映射：OSI图书字段=publisher；OSI论文字段=venue.publisher。,Metadata表
+publication_venue_biblio_volume,String,biblio_volume,,,格式示例为"193"，可以转化为数字,【Metadata表】期刊/会议卷号。数据类型：String。来源映射：OSI论文字段=venue.biblio.volume。,Metadata表
+publication_venue_biblio_issue,String,biblio_issue,,,格式示例为"1"，可以转化为数字,【Metadata表】期刊/会议期号。数据类型：String。来源映射：OSI论文字段=venue.biblio.issue。,Metadata表
+publication_venue_biblio_pages,String,biblio_pages,,,格式示例为"265-275",【Metadata表】期刊/会议页码范围。数据类型：String。来源映射：OSI论文字段=venue.biblio.pages。,Metadata表
+publication_pages,Integer,,publication_pages,,大于0,【Metadata表】图书页数或资源总页数。数据类型：Integer。来源映射：OSI图书字段=pages。,Metadata表
+keywords,List[string],keywords,,,,【Metadata表】关键词列表。数据类型：List[string]。来源映射：OSI论文字段=keywords。,Metadata表
+fieldsOfStudy,List[object],fieldsOfStudy,,,,【Metadata表】学科领域信息。数据类型：List[object]。来源映射：OSI论文字段=fieldsOfStudy。,Metadata表
+s2fieldsofstudy,List[object],s2fieldsofstudy,,,,【Metadata表】Semantic Scholar 学科领域信息，内部字段。数据类型：List[object]。来源映射：OSI论文字段=s2FieldsOfStudy。非对外字段。,Metadata表
+primary_topic,Object,primary_topic,,,,【Metadata表】主要主题信息。数据类型：Object。来源映射：OSI论文字段=primary_topic。,Metadata表
+topics,List[object],topics,,,,【Metadata表】主题列表。数据类型：List[object]。来源映射：OSI论文字段=topics。,Metadata表
+concepts,List[object],concepts,,,,【Metadata表】概念标签列表。数据类型：List[object]。来源映射：OSI论文字段=concepts。,Metadata表
+grade_class,String,/,,grade_class,"有效性规则见元数据目录格式标准【WIP】
+规范：星河图书馆qa测试代码","【Metadata表】等级分类。数据类型：String。来源映射：OSI图书字段=grade_class；OSI论文字段=grade_class。
+【Fulltext表】全文解析得到的等级分类。数据类型：string。","Metadata表, Fulltext表"
+grade,String,/,,grade,"有效性规则见元数据目录格式标准【WIP】
+规范：星河图书馆qa测试代码","【Metadata表】等级。数据类型：String。来源映射：OSI图书字段=grade；OSI论文字段=grade。
+【Fulltext表】全文解析得到的等级。数据类型：string。","Metadata表, Fulltext表"
+subjects,List[string],,subjects,,,【Metadata表】图书主题词列表。数据类型：List[string]。来源映射：OSI图书字段=subjects。,Metadata表
+genre,List[string],,genre,,,【Metadata表】图书体裁/类型列表。数据类型：List[string]。来源映射：OSI图书字段=genre。,Metadata表
+reference_count,Integer,reference_count,,,,【Metadata表】引用文献数,Metadata表
+citation_count,Integer,citation_count,,,,【Metadata表】被引次数，指一篇文章被其他文章引用的次数，是衡量文章影响力的重要指标。,Metadata表
+influential_citation_count,Integer,influential_citation_count,,,,【Metadata表】高影响力被引次数,Metadata表
+fwci,Float,fwci,,,,【Metadata表】一篇文献的“领域加权引用影响”Field-Weighted Citation Impact,Metadata表
+citations,List[object],/,,,,【Metadata表】引用本篇论文的文献列表,Metadata表
+references,List[string],references,,,"string元素中存储url
+规范：url星河图书馆qa测试代码",【Metadata表】本论文引用的文献列表,Metadata表
+related_works,List[string],related_works,,,"string元素中存储url
+规范：url星河图书馆qa测试代码",【Metadata表】本论文相关工作列表,Metadata表
+citation_normalized_percentile,Object,citation_normalized_percentile,,,,【Metadata表】一篇文献的“被引百分位”，表示该文献的引用次数在其文献类型、出版年份和学科子领域构成的可比集合中所处的百分位位置。例如 0.999 62 意味着该文献的引用次数高于 99.962% 的同类型、同年、同领域文献，因此属于全球前 0.038%。,Metadata表
+cited_by_percentile_year,Object,cited_by_percentile_year,,,,"【Metadata表】按“出版年份”细分的被引百分位区间。min: 99, max: 100 表示该文献在同年份的所有可比文献中，处于 99–100 百分位段，即“最顶尖 1%”。",Metadata表
+cited_by_api_url,String,cited_by_api_url,,,"string元素中存储url
+规范：url星河图书馆qa测试代码",【Metadata表】获取“被哪些文献引用”的 API 端点（可直接调用 OpenAlex API，获取所有引用该文献的作品（works）列表。）,Metadata表
+classifications,Object,classifications,/,/,,【Metadata表】,
+supplementary_material,List[object],/,/,supplementary_material,每个对象都有三个key-value对，key分别为：supplementary_material_name、supplementary_material_url、supplementary_material_path，分别存储补充材料文件名、补充材料链接、补充材料的对象存储路径,【Fulltext表】全文对应的补充材料文件,Fulltext表
+access_xinghe_repository_has_fulltext,Boolean,,,/,可选值有布尔值true、布尔值false，默认值为false,【Metadata表】有全文数据。数据类型：Boolean。来源映射：OSI图书字段=xinghe_repository_has_fulltext；OSI论文字段=xinghe_repository_has_fulltext。,Metadata表
+access_xinghe_repository_sha256,String,,,sha256,默认值空列表；当xinghe_repository_has_fulltext 为true时，此字段不可以为空。,"【Metadata表】全文文件 SHA-256 哈希值，用于文件唯一识别、去重与关联。数据类型：List[string]。来源映射：OSI图书字段=access_xinghe_repository_sha256；OSI论文字段=access_xinghe_repository_sha256。
+【Fulltext表】数据类型：string。","Metadata表, Fulltext表"
+access_xinghe_repository_origin_path,String,,,origin_path,默认值""；当xinghe_repository_has_fulltext 为true时，此字段不可以为空。,【Fulltext表】全文原始文件存储路径，内部字段。数据类型：string。非对外字段。,Fulltext表
+access_xinghe_repository_page_cnt,Integer,,,page_cnt,,【Fulltext表】全文页数。数据类型：int。,Fulltext表
+access_xinghe_repository_process_status,Integer,,,process_status,1-已处理；0-未处理，默认值为0,【Fulltext表】全文处理状态，内部字段。数据类型：int。非对外字段。,Fulltext表
+access_xinghe_repository_processed_path,String,,,processed_path,当access_xinghe_repository_process_status为1，此字段不允许为空,【Fulltext表】全文处理后文件路径，内部字段。数据类型：string。非对外字段。,Fulltext表
+access_xinghe_repository_origin_url,String,,,origin_url,,【Fulltext表】全文原始来源 URL。数据类型：string。,Fulltext表
+access_xinghe_repository_file_format,String,,,file_format,示例（仅示例，非可选值范围）：pdf、mp4、ogg、docx。使用 python-magic 进行文档类别识别，可选范围即 magic 识别库中包含的文档类型 + unknown,【Fulltext表】全文文件格式，如 pdf、html 等。数据类型：string。,Fulltext表
+access_xinghe_repository_file_type,String,,,file_type,"范围：
+paper
+ebook
+textbook
+patent
+report
+other",【Fulltext表】全文文件类型。数据类型：string。,Fulltext表
+access_xinghe_repository_content_type,String,,,content_type,取值：https://mimetype.io/all-types,【Fulltext表】文件 MIME Content-Type。数据类型：string。,Fulltext表
+access_xinghe_repository_content_length,long,,,content_length,,【Fulltext表】文件内容长度/大小。数据类型：long。,Fulltext表
+access_xinghe_repository_is_broken,Integer,,,is_broken,具体取值范围与定义见文件完整性校验,【Fulltext表】全文文件是否损坏或不可正常解析。数据类型：int。枚举值：0、1、2、3，详见文件完整性校验,Fulltext表
+access_xinghe_repository_obtain_timestamp,timestamp (seconds),,,obtain_timestamp,,【Fulltext表】全文获取时间戳（秒），内部字段。数据类型：timestamp (seconds)。非对外字段。Unix epoch seconds，可以是整数（精确到秒），也可以是浮点数（小数部分可精确至微秒）。,Fulltext表
+access_xinghe_repository_model_name,String,/,/,model_name,目前可选值为【数据字典】数据域-模型名称-学科科目,【Fulltext表】全文解析使用的模型名称，内部字段。数据类型：string。非对外字段。,Fulltext表
+access_xinghe_repository_model_version,String,/,/,model_version,目前可选值为【数据字典】数据域-模型名称-学科科目,【Fulltext表】全文解析使用的模型版本，内部字段。数据类型：string。非对外字段。,Fulltext表
diff --git a/dingo/model/rule/scibase/meta_ebook_unique.py b/dingo/model/rule/scibase/meta_ebook_unique.py
new file mode 100644
index 00000000..89216d8a
--- /dev/null
+++ b/dingo/model/rule/scibase/meta_ebook_unique.py
@@ -0,0 +1,1520 @@
+#!/usr/bin/env python3
+"""Self-contained meta_ebook unique DB validator.
+
+Field aggregation rules are driven by assets/ebook_unique_mapping.csv (see DEFAULT_MAPPING_CSV).
+"""
+from __future__ import annotations
+
+import csv
+import re
+import argparse
+import json
+import sys
+import time
+from collections import Counter
+from dataclasses import dataclass
+from datetime import date, datetime
+from decimal import Decimal
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
+
+# Strategies whose fields are left out of the compared output field list
+# (consumed by output_fields_from_rules).
+SKIP_COMPARE_STRATEGIES = frozenset({"random_pick_cls"})
+# Strategies whose fields are reported as order-insensitive
+# (consumed by order_insensitive_fields_from_rules).
+ORDER_INSENSITIVE_COMPARE_STRATEGIES = frozenset({"dedup_array", "isbn_normalize"})
+# Shared handler signature: (records, rule, result built so far) -> aggregated value.
+StrategyHandler = Callable[[List[Dict[str, Any]], "FieldRule", Dict[str, Any]], Any]
+
+
+@dataclass
+class FieldRule:
+    """One row of the field mapping CSV: how a single output field is aggregated."""
+
+    field_name: str  # output field name
+    data_type: str  # declared data type, kept as raw text from the CSV
+    strategy: str  # aggregation strategy key, looked up in the handler table
+    params: Dict[str, Any]  # parsed strategy parameters
+    source_field: str  # source column name; empty means "same as field_name"
+    description: str  # free-text description of the aggregation logic
+
+    @property
+    def effective_source(self) -> str:
+        """Return the source column to read, falling back to ``field_name``."""
+        if self.source_field:
+            return self.source_field
+        return self.field_name
+
+
+def _parse_params(raw: str) -> Dict[str, Any]:
+    """Parse a ``key=value;key=value`` parameter string into a dict.
+
+    Pairs are separated by ``;``; pairs without ``=`` are ignored. Keys and
+    values are stripped. Values are coerced as follows:
+
+    * ``true`` / ``false`` (case-insensitive) become booleans;
+    * integer-looking text (optional leading ``-``) becomes ``int``;
+    * anything else is kept as the stripped string.
+
+    Args:
+        raw: Raw parameter text; an empty value yields ``{}``.
+
+    Returns:
+        Mapping of parameter name to coerced value. Later duplicates of a
+        key overwrite earlier ones.
+    """
+    if not raw:
+        return {}
+    params: Dict[str, Any] = {}
+    for pair in raw.split(";"):
+        key, sep, val = pair.partition("=")
+        if not sep:
+            continue
+        key, val = key.strip(), val.strip()
+        lowered = val.lower()
+        if lowered == "true":
+            params[key] = True
+        elif lowered == "false":
+            params[key] = False
+        elif val.lstrip("-").isdigit():
+            # The isdigit() pre-check also accepts text int() rejects, such as
+            # "--5" or superscript digits; keep those as strings rather than
+            # aborting the whole mapping load with a ValueError.
+            try:
+                params[key] = int(val)
+            except ValueError:
+                params[key] = val
+        else:
+            params[key] = val
+    return params
+
+
+def load_field_rules(
+    path: Path,
+    *,
+    field_column: str = "字段名",
+    type_column: str = "数据类型",
+    strategy_column: str = "聚合策略",
+    params_column: str = "策略参数",
+    source_column: str = "源字段名",
+    desc_column: str = "去重 / 聚合处理逻辑",
+) -> List[FieldRule]:
+    """Load aggregation rules from a mapping CSV.
+
+    The file is read as UTF-8 (a BOM is tolerated). Rows whose field-name
+    cell is blank are skipped; every other cell is stripped, and a missing
+    cell is treated as an empty string.
+
+    Args:
+        path: Location of the mapping CSV.
+        field_column: Header of the column holding the output field name.
+        type_column: Header of the data type column.
+        strategy_column: Header of the aggregation strategy column.
+        params_column: Header of the strategy parameter column.
+        source_column: Header of the source field column.
+        desc_column: Header of the description column.
+
+    Returns:
+        Rules in file order.
+
+    Raises:
+        ValueError: If the CSV has no header or lacks ``field_column``.
+    """
+
+    def _cell(row: Dict[str, Any], column: str) -> str:
+        # Missing or None cells collapse to "" before stripping.
+        return (row.get(column) or "").strip()
+
+    with path.open(encoding="utf-8-sig", newline="") as handle:
+        reader = csv.DictReader(handle)
+        header = reader.fieldnames or []
+        if field_column not in header:
+            available = ", ".join(col for col in header if col.strip())
+            raise ValueError(
+                f"映射文件 {path} 缺少字段列 {field_column!r}（可用列: {available}）"
+            )
+        loaded: List[FieldRule] = []
+        for row in reader:
+            name = _cell(row, field_column)
+            if not name:
+                continue
+            loaded.append(
+                FieldRule(
+                    field_name=name,
+                    data_type=_cell(row, type_column),
+                    strategy=_cell(row, strategy_column),
+                    params=_parse_params(_cell(row, params_column)),
+                    source_field=_cell(row, source_column),
+                    description=_cell(row, desc_column),
+                )
+            )
+    return loaded
+
+
+def output_fields_from_rules(rules: Sequence[FieldRule]) -> List[str]:
+    """Return, in rule order, the field names that take part in comparison.
+
+    Rules whose strategy is listed in ``SKIP_COMPARE_STRATEGIES`` are omitted.
+    """
+    names: List[str] = []
+    for rule in rules:
+        if rule.strategy in SKIP_COMPARE_STRATEGIES:
+            continue
+        names.append(rule.field_name)
+    return names
+
+
+def order_insensitive_fields_from_rules(rules: Sequence[FieldRule]) -> set:
+    """Return the set of field names whose strategy is order-insensitive.
+
+    A field qualifies when its strategy is listed in
+    ``ORDER_INSENSITIVE_COMPARE_STRATEGIES``.
+    """
+    selected = set()
+    for rule in rules:
+        if rule.strategy in ORDER_INSENSITIVE_COMPARE_STRATEGIES:
+            selected.add(rule.field_name)
+    return selected
+
+
+def aggregate_by_rules(
+    records: List[Dict[str, Any]],
+    rules: Sequence[FieldRule],
+    handlers: Dict[str, StrategyHandler],
+) -> Dict[str, Any]:
+    """Aggregate a group of records into one record, one rule at a time.
+
+    Rules are applied in order, and each handler receives the result built so
+    far, so a later rule can read the output of an earlier one.
+
+    Args:
+        records: Source records belonging to one group.
+        rules: Field rules to apply, in order.
+        handlers: Strategy name to handler mapping.
+
+    Returns:
+        Mapping of field name to aggregated value.
+
+    Raises:
+        ValueError: If a rule names a strategy with no handler.
+    """
+    aggregated: Dict[str, Any] = {}
+    for rule in rules:
+        handler = handlers.get(rule.strategy)
+        if handler is not None:
+            aggregated[rule.field_name] = handler(records, rule, aggregated)
+            continue
+        raise ValueError(
+            f"Unknown aggregation strategy {rule.strategy!r} "
+            f"for field {rule.field_name!r}"
+        )
+    return aggregated
+
+try:
+    import pymysql
+except ImportError:  # pragma: no cover - runtime dependency check
+    pymysql = None  # type: ignore
+
+
+# Captured once at import time; used as the upper bound for plausible years.
+CURRENT_YEAR = datetime.now().year
+# Directory containing this module; bundled assets live beneath it.
+PROJECT_ROOT = Path(__file__).resolve().parent
+ASSETS_DIR = PROJECT_ROOT / "assets"
+# Relative path, so it resolves against the current working directory.
+DEFAULT_CONFIG_PATH = Path("sci_base_qa_test_config.json")
+# Only referenced in the "config not found" hint of load_config.
+# NOTE(review): confirm settings.template.json is actually shipped in assets/.
+TEMPLATE_CONFIG_PATH = ASSETS_DIR / "settings.template.json"
+DEFAULT_MAPPING_CSV = ASSETS_DIR / "ebook_unique_mapping.csv"
+# Relative path, so reports land under the current working directory.
+REPORT_ROOT = Path("report")
+DEFAULT_SOURCE_TABLE = "dws_meta_ebook_data_acc_d"
+DEFAULT_TARGET_TABLE = "dws_meta_ebook_isbn_unique_acc_d"
+
+
+def safe_filename_token(value: Optional[Any]) -> str:
+    """Turn ``value`` into a token that is safe to embed in a file name.
+
+    ``None`` and ``""`` map to ``"all"``. Otherwise every run of characters
+    outside ``[0-9A-Za-z_-]`` becomes a single ``_`` and leading/trailing
+    underscores are removed; if nothing is left the result is ``"all"``.
+    """
+    if value in (None, ""):
+        text = "all"
+    else:
+        text = str(value)
+    cleaned = re.sub(r"[^0-9A-Za-z_-]+", "_", text).strip("_")
+    return cleaned if cleaned else "all"
+
+
+def default_report_path(dt: Optional[str], sample_mode: str, full: bool) -> Path:
+    """Build the default path of the mismatch detail report.
+
+    The report directory name embeds the partition and the run mode, both
+    sanitized with ``safe_filename_token``; ``full`` overrides ``sample_mode``.
+    """
+    if full:
+        mode = "full"
+    else:
+        mode = sample_mode
+    dt_token = safe_filename_token(dt)
+    mode_token = safe_filename_token(mode)
+    return REPORT_ROOT / f"meta_ebook_unique_dt_{dt_token}_{mode_token}" / "source_field_mismatch.jsonl"
+
+
+def _json_inline(value: Any) -> str:
+    """Serialize ``value`` as compact JSON text, keeping non-ASCII characters as-is."""
+    encoded = json.dumps(value, cls=JsonEncoder, ensure_ascii=False)
+    return encoded
+
+
+def summary_paths(report_path: Path) -> Tuple[Path, Path]:
+    """Return the (JSON summary, Markdown summary) paths next to ``report_path``."""
+    folder = report_path.parent
+    json_path = folder / "summary.json"
+    markdown_path = folder / "readable_summary.md"
+    return json_path, markdown_path
+
+
+# Internal report key -> Chinese label used when writing the JSON summary.
+# Keys that are not listed here are emitted unchanged by localize_report_keys.
+# Note: "report" and "report_path" share the same label, so a dict holding
+# both keys would collapse to a single entry after localization.
+REPORT_KEY_LABELS = {
+    "report": "报告路径",
+    "total_problem_rows": "问题记录数",
+    "result": "校验结果",
+    "status_counts": "状态分布",
+    "field_counts": "字段问题分布",
+    "field_samples": "字段问题样例",
+    "key": "键值",
+    "expected_key": "预期键值",
+    "dt": "分区日期",
+    "source_count": "源表记录数",
+    "status": "状态",
+    "expected": "预期值",
+    "actual": "实际值",
+    "kind": "校验类型",
+    "source_table": "源表",
+    "target_table": "目标表",
+    "key_field": "去重键字段",
+    "validated_partitions": "已校验分区",
+    "sample_mode": "抽样模式",
+    "sample_size": "抽样数量",
+    "dt_check": "分区检查",
+    "checked": "已校验数",
+    "passed": "通过数",
+    "failed": "失败数",
+    "missing_source": "源表缺失数",
+    "missing_target": "目标表缺失数",
+    "source_count_buckets": "源表记录数分桶",
+    "missing_samples": "缺失样例",
+    "source_records": "源表记录",
+    "target_records": "目标表记录",
+    "expected_record": "预期记录",
+    "report_path": "报告路径",
+    "sample_mismatches": "问题样例",
+    "mismatches": "字段差异",
+    "source_dt_count": "源表分区数",
+    "target_dt_count": "目标表分区数",
+    "missing_in_target": "目标表缺失分区",
+    "extra_in_target": "目标表多余分区",
+    "count_mismatches": "数量不一致明细",
+    "source_distinct_skipped": "源表去重计数已跳过",
+}
+
+
+def localize_report_keys(value: Any) -> Any:
+    """Recursively replace dict keys with their labels from ``REPORT_KEY_LABELS``.
+
+    Dicts and lists are rebuilt recursively; keys are converted with ``str``
+    and kept as that string when no label exists. Any other value is
+    returned untouched.
+    """
+    if isinstance(value, list):
+        return [localize_report_keys(item) for item in value]
+    if not isinstance(value, dict):
+        return value
+    localized: Dict[str, Any] = {}
+    for key, val in value.items():
+        name = str(key)
+        localized[REPORT_KEY_LABELS.get(name, name)] = localize_report_keys(val)
+    return localized
+
+
+# NOTE(review): not referenced in the nearby report helpers; confirm where it is used.
+TOP_FIELD_LIMIT = 20
+# Number of most frequent mismatching fields whose samples are kept in the summary.
+TOP_SAMPLE_FIELD_LIMIT = 5
+# Maximum samples collected per field, and for missing-row samples.
+SAMPLES_PER_FIELD = 3
+
+
+def compact_record_for_report(record: Dict[str, Any]) -> Dict[str, Any]:
+    """Reduce a record to a fixed set of identifying fields for the report.
+
+    Only the listed keys are considered, in this order, and a key is dropped
+    when its value is ``None``, ``""``, ``[]`` or ``{}``.
+    """
+    wanted = (
+        "track_id",
+        "isbn13",
+        "isbns",
+        "origin_osi",
+        "origin_id",
+        "title",
+        "type",
+        "author",
+        "contributors",
+        "published_year",
+        "published_date",
+        "publisher",
+        "dt",
+    )
+    compact: Dict[str, Any] = {}
+    for key in wanted:
+        val = record.get(key)
+        if val in (None, "", [], {}):
+            continue
+        compact[key] = val
+    return compact
+
+
+def compact_records_for_report(records: Any) -> Any:
+    """Compact one record or a list of records for the report.
+
+    A dict is compacted directly. A list is mapped element by element, and
+    non-dict elements are dropped. Any other value is returned unchanged.
+    """
+    if isinstance(records, dict):
+        return compact_record_for_report(records)
+    if isinstance(records, list):
+        compacted = []
+        for entry in records:
+            if isinstance(entry, dict):
+                compacted.append(compact_record_for_report(entry))
+        return compacted
+    return records
+
+
+def build_report_summary(
+    report_path: Path,
+    result: Dict[str, Any],
+    mismatch_rows: Sequence[Dict[str, Any]],
+) -> Dict[str, Any]:
+    """Condense the validation result and mismatch rows into a summary dict.
+
+    Args:
+        report_path: Path of the detail report; stored as a string.
+        result: Validation result; copied without its ``sample_mismatches`` key.
+        mismatch_rows: Problem rows, each optionally carrying ``status`` and a
+            ``mismatches`` mapping of field name to diff.
+
+    Returns:
+        Dict with the total row count, per-status and per-field counts (most
+        frequent first), capped samples for the most frequent fields, and
+        capped samples of rows missing on either side.
+    """
+    # Rows without a status are counted under "unknown".
+    status_counts = Counter(str(row.get("status") or "unknown") for row in mismatch_rows)
+    field_counts: Counter = Counter()
+    field_samples: Dict[str, List[Dict[str, Any]]] = {}
+    missing_samples: List[Dict[str, Any]] = []
+    for row in mismatch_rows:
+        # Keep only the first few missing rows, in input order.
+        if row.get("status") in ("missing_target", "missing_source") and len(missing_samples) < SAMPLES_PER_FIELD:
+            missing_samples.append(
+                {
+                    "key": row.get("key"),
+                    "expected_key": row.get("expected_key"),
+                    "dt": row.get("dt"),
+                    "source_count": row.get("source_count"),
+                    "status": row.get("status"),
+                    "source_records": compact_records_for_report(row.get("source_records")),
+                    "target_records": compact_records_for_report(row.get("target_records")),
+                    "expected_record": compact_records_for_report(row.get("expected_record")),
+                }
+            )
+        for field, diff in (row.get("mismatches") or {}).items():
+            field_counts[field] += 1
+            samples = field_samples.setdefault(field, [])
+            # Every occurrence is counted, but only the first few are sampled.
+            if len(samples) < SAMPLES_PER_FIELD:
+                samples.append(
+                    {
+                        "key": row.get("key"),
+                        "expected_key": row.get("expected_key"),
+                        "dt": row.get("dt"),
+                        "source_count": row.get("source_count"),
+                        "status": row.get("status"),
+                        # A non-dict diff yields None for both sides.
+                        "expected": diff.get("expected") if isinstance(diff, dict) else None,
+                        "actual": diff.get("actual") if isinstance(diff, dict) else None,
+                    }
+                )
+    # most_common() orders by descending count; dict preserves that order.
+    sorted_field_counts = dict(field_counts.most_common())
+    top_sample_fields = set(list(sorted_field_counts)[:TOP_SAMPLE_FIELD_LIMIT])
+    return {
+        "report": str(report_path),
+        "total_problem_rows": len(mismatch_rows),
+        "result": {k: v for k, v in result.items() if k != "sample_mismatches"},
+        "status_counts": dict(status_counts.most_common()),
+        "field_counts": sorted_field_counts,
+        "field_count_total": len(sorted_field_counts),
+        # Samples are emitted only for the most frequent fields, in count order.
+        "field_samples": {
+            field: field_samples[field]
+            for field in sorted_field_counts
+            if field in top_sample_fields and field in field_samples
+        },
+        "missing_samples": missing_samples,
+    }
+
+
+def write_report_summary(report_path: Path, result: Dict[str, Any], mismatch_rows: Sequence[Dict[str, Any]]) -> None:
+    """Write the JSON and Markdown summaries next to the detail report.
+
+    The JSON file holds the summary with keys localized through
+    ``localize_report_keys``; the Markdown file is a human-readable digest
+    built from the unlocalized summary. Both files are created in
+    ``report_path.parent``, which this function does not create.
+
+    Args:
+        report_path: Path of the detail report.
+        result: Validation result dict.
+        mismatch_rows: Problem rows passed on to ``build_report_summary``.
+    """
+    summary_json_path, summary_md_path = summary_paths(report_path)
+    summary = build_report_summary(report_path, result, mismatch_rows)
+    with summary_json_path.open("w", encoding="utf-8") as f:
+        json.dump(localize_report_keys(summary), f, ensure_ascii=False, indent=2, cls=JsonEncoder)
+
+    # Header section: run parameters and overall counters.
+    lines = [
+        "# Ebook 去重校验报告摘要",
+        "",
+        f"- 分区: `{result.get('dt')}`",
+        f"- 抽样: `{result.get('sample_mode')}`, 数量 `{result.get('sample_size')}`",
+        f"- 结果: 已校验 `{result.get('checked')}`，通过 `{result.get('passed')}`，失败 `{result.get('failed')}`",
+        f"- 缺失: 源表 `{result.get('missing_source')}`，目标表 `{result.get('missing_target')}`",
+        f"- 明细报告: `{report_path}`",
+        f"- 报告目录: `{report_path.parent}`",
+        f"- 源表记录数分桶: `{_json_inline(result.get('source_count_buckets'))}`",
+        "",
+        "## Count 校验",
+        "",
+        f"- source_distinct_skipped: `{(result.get('dt_check') or {}).get('source_distinct_skipped')}`",
+        f"- count_mismatches: `{len((result.get('dt_check') or {}).get('count_mismatches') or [])}`",
+        "",
+        "## 状态分布",
+        "",
+    ]
+    for status, count in summary["status_counts"].items():
+        lines.append(f"- `{status}`: {count}")
+    if not summary["status_counts"]:
+        lines.append("- 无")
+    lines.extend(["", "## 字段问题分布", ""])
+    for field, count in summary["field_counts"].items():
+        lines.append(f"- `{field}`: {count}")
+    if not summary["field_counts"]:
+        lines.append("- 无")
+    # The missing-sample section is emitted only when there are samples.
+    if summary.get("missing_samples"):
+        lines.extend(["", "## 缺失样例", ""])
+        for sample in summary["missing_samples"]:
+            lines.append(
+                f"- ISBN13 `{sample.get('key')}`, expected_key=`{sample.get('expected_key')}`, "
+                f"source_count={sample.get('source_count')}, status=`{sample.get('status')}`"
+            )
+            for name in ("source_records", "target_records", "expected_record"):
+                if sample.get(name) is not None:
+                    lines.append(f"  - {name}: `{_json_inline(sample.get(name))}`")
+    lines.extend(["", "## 字段问题样例", ""])
+    for field, samples in summary["field_samples"].items():
+        lines.append(f"### {field} ({summary['field_counts'].get(field)})")
+        lines.append("")
+        for sample in samples:
+            lines.append(
+                f"- ISBN13 `{sample.get('key')}`, expected_key=`{sample.get('expected_key')}`, "
+                f"source_count={sample.get('source_count')}, status=`{sample.get('status')}`"
+            )
+            lines.append(f"  - expected: `{_json_inline(sample.get('expected'))}`")
+            lines.append(f"  - actual: `{_json_inline(sample.get('actual'))}`")
+            lines.append("")
+    # Trailing blank lines are trimmed so the file ends with one newline.
+    with summary_md_path.open("w", encoding="utf-8") as f:
+        f.write("\n".join(lines).rstrip() + "\n")
+
+
+class JsonEncoder(json.JSONEncoder):
+    """JSON encoder that also understands ``Decimal``, ``date`` and ``datetime``."""
+
+    def default(self, obj: Any) -> Any:
+        """Convert otherwise unsupported objects to JSON-friendly values.
+
+        Dates and datetimes become ISO 8601 strings. A ``Decimal`` with no
+        fractional part becomes ``int``, any other ``Decimal`` becomes
+        ``float``. Everything else is delegated to the base class, which
+        raises ``TypeError``.
+        """
+        if isinstance(obj, (date, datetime)):
+            return obj.isoformat()
+        if not isinstance(obj, Decimal):
+            return super().default(obj)
+        integral = obj.to_integral_value()
+        return int(obj) if obj == integral else float(obj)
+
+
+# ---- common scalar/array helpers ----
+
+
+def is_non_empty(value: Any) -> bool:
+    """Tell whether ``value`` carries data.
+
+    ``None``, the empty string, and empty lists/dicts count as empty; every
+    other value (including ``0`` and ``False``) counts as non-empty.
+    """
+    if value is None:
+        return False
+    if isinstance(value, (str, list, dict)):
+        return len(value) > 0
+    return True
+
+
+def choose_freq_then_lex_max(values: Iterable[str]) -> str:
+    """Pick the most frequent non-empty string; ties go to the lexicographic maximum.
+
+    Non-string and empty-string items are ignored. Returns ``""`` when no
+    usable item remains.
+    """
+    tally = Counter(v for v in values if isinstance(v, str) and v != "")
+    if not tally:
+        return ""
+    top = max(tally.values())
+    return max(text for text, seen in tally.items() if seen == top)
+
+
+def choose_freq_then_max_int(values: Iterable[int]) -> Optional[int]:
+    """Pick the most frequent integer; ties go to the largest value.
+
+    Items that are not ``int`` instances are ignored. Returns ``None`` when
+    no integer remains.
+    """
+    tally = Counter(v for v in values if isinstance(v, int))
+    if not tally:
+        return None
+    top = max(tally.values())
+    return max(number for number, seen in tally.items() if seen == top)
+
+
+def dedup_str_array(values: Iterable[Any], lower: bool = False) -> List[str]:
+    """Flatten, stringify and de-duplicate values into a sorted list.
+
+    Each item may be a list (its members are used) or a scalar (used
+    directly). ``None`` entries and entries whose string form is empty are
+    dropped.
+
+    Args:
+        values: Items to merge.
+        lower: When true, strings are lower-cased before de-duplication.
+
+    Returns:
+        Unique strings in ascending order.
+    """
+    seen = set()
+    for item in values:
+        members = item if isinstance(item, list) else [item]
+        for member in members:
+            if member is None:
+                continue
+            text = str(member)
+            if text == "":
+                continue
+            seen.add(text.lower() if lower else text)
+    return sorted(seen)
+
+
+def merge_identifiers(values: Iterable[Any]) -> Dict[str, str]:
+    """Merge identifier dicts, keeping the largest string value per key.
+
+    Non-dict items and ``None`` values are skipped. Values are compared as
+    strings, so ``"9"`` beats ``"12"``.
+    """
+    merged: Dict[str, str] = {}
+    for item in values:
+        if isinstance(item, dict):
+            for key, raw in item.items():
+                if raw is None:
+                    continue
+                text = str(raw)
+                current = merged.get(key)
+                # Stored values are always strings, so None means "not seen yet".
+                if current is None or text > current:
+                    merged[key] = text
+    return merged
+
+
+def parse_int(value: Any) -> Optional[int]:
+    """Convert ``value`` to ``int`` when it is unambiguous.
+
+    ``None``, ``""``, booleans, and anything whose stripped string form is
+    not an integer literal yield ``None``. Integers are returned as-is.
+    """
+    if value is None or value == "" or isinstance(value, bool):
+        return None
+    if isinstance(value, int):
+        return value
+    text = str(value).strip()
+    try:
+        return int(text) if text else None
+    except ValueError:
+        return None
+
+
+# ---- ebook-specific normalization helpers ----
+
+
+def normalize_isbn_to_13(raw: Any) -> Optional[str]:
+    """Normalize an ISBN to a 13-character form.
+
+    Hyphens are removed first. A 13-digit value is kept; a 10-character value
+    (nine digits plus a digit or ``X``/``x``) gets ``978`` prepended; any
+    other input is discarded (``None``).
+
+    NOTE(review): the ISBN-10 check character is kept verbatim and the
+    ISBN-13 check digit is not recomputed -- presumably this mirrors the
+    pipeline being validated; confirm before changing.
+    """
+    if raw is None:
+        return None
+    digits = str(raw).strip().replace("-", "")
+    length = len(digits)
+    if length == 13:
+        return digits if digits.isdigit() else None
+    if length == 10:
+        body, check = digits[:9], digits[9]
+        if body.isdigit() and (check.isdigit() or check.upper() == "X"):
+            return "978" + digits
+    return None
+
+
+def extract_year(value: Any) -> Optional[int]:
+    """Extract a plausible four-digit year from ``value``.
+
+    Integers are used directly; for anything else the first ``1xxx`` or
+    ``20xx`` match in the stripped string form is taken. The year must lie
+    between 1000 and ``CURRENT_YEAR`` inclusive, otherwise ``None`` is
+    returned.
+    """
+    if value is None or value == "":
+        return None
+    if not isinstance(value, int):
+        found = re.search(r"(1\d{3}|20\d{2})", str(value).strip())
+        if found is None:
+            return None
+        candidate = int(found.group(1))
+    else:
+        candidate = value
+    return candidate if 1000 <= candidate <= CURRENT_YEAR else None
+
+
+# ---- strategy handlers ----
+
+
+def _handle_freq_lex_max(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> Any:
+    """Aggregate a text field: most frequent value, ties to the lexicographic max.
+
+    Empty values are ignored. The optional ``min_len`` / ``max_len`` rule
+    parameters drop values whose string length falls outside the bounds.
+    ``result`` is unused.
+    """
+    src = rule.effective_source
+    min_len, max_len = rule.params.get("min_len"), rule.params.get("max_len")
+
+    def _within_bounds(text: str) -> bool:
+        if min_len is not None and len(text) < min_len:
+            return False
+        if max_len is not None and len(text) > max_len:
+            return False
+        return True
+
+    texts = [str(record.get(src)) for record in records if is_non_empty(record.get(src))]
+    return choose_freq_then_lex_max([text for text in texts if _within_bounds(text)])
+
+
+def _handle_freq_int_max(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> Any:
+    """Aggregate an integer field: most frequent value, ties to the largest.
+
+    Rule parameters:
+        ``min_val`` / ``max_val``: optional inclusive bounds; the literal
+            string ``"CURRENT_YEAR"`` as ``max_val`` means the current year.
+        ``extract_year``: when true, values are parsed with ``extract_year``
+            instead of ``parse_int``.
+
+    ``result`` is unused.
+    """
+    src = rule.effective_source
+    options = rule.params
+    lower_bound = options.get("min_val")
+    upper_bound = options.get("max_val")
+    if isinstance(upper_bound, str) and upper_bound == "CURRENT_YEAR":
+        upper_bound = CURRENT_YEAR
+    convert = extract_year if options.get("extract_year", False) else parse_int
+    kept: List[int] = []
+    for record in records:
+        number = convert(record.get(src))
+        if number is None:
+            continue
+        if (lower_bound is not None and number < lower_bound) or (
+            upper_bound is not None and number > upper_bound
+        ):
+            continue
+        kept.append(number)
+    return choose_freq_then_max_int(kept)
+
+
+def _handle_dedup_array(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> List[str]:
+    """Aggregate an array field into a sorted list of unique strings.
+
+    The ``lower`` rule parameter lower-cases values before de-duplication.
+    ``result`` is unused.
+    """
+    source = rule.effective_source
+    column = [record.get(source, []) for record in records]
+    lowercase = rule.params.get("lower", False)
+    return dedup_str_array(column, lower=lowercase)
+
+
+def _handle_merge_map(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> Dict[str, str]:
+    """Aggregate a map field by merging the per-record dicts with ``merge_identifiers``.
+
+    ``result`` is unused.
+    """
+    source = rule.effective_source
+    maps = [record.get(source, {}) for record in records]
+    return merge_identifiers(maps)
+
+
+def _handle_max_int(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> Optional[int]:
+    """Aggregate an integer field by taking the largest parseable value.
+
+    Returns ``None`` when no record yields an integer. ``result`` is unused.
+    """
+    best: Optional[int] = None
+    for record in records:
+        number = parse_int(record.get(rule.effective_source))
+        if number is not None and (best is None or number > best):
+            best = number
+    return best
+
+
+def _handle_latest_dt(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> str:
+    """Aggregate a partition/date field by taking the largest string value.
+
+    Empty values are ignored; ``""`` is returned when none remain. The
+    comparison is on strings. ``result`` is unused.
+    """
+    src = rule.effective_source
+    texts: List[str] = []
+    for record in records:
+        raw = record.get(src)
+        if is_non_empty(raw):
+            texts.append(str(raw))
+    if not texts:
+        return ""
+    return max(texts)
+
+
+def _handle_isbn_normalize(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> List[str]:
+    """Aggregate ISBN arrays into a sorted list of unique normalized ISBNs.
+
+    Raw values are de-duplicated, normalized with ``normalize_isbn_to_13``,
+    and values that cannot be normalized are dropped. ``result`` is unused.
+    """
+    source = rule.effective_source
+    raw_values = dedup_str_array([record.get(source, []) for record in records])
+    converted = set()
+    for text in raw_values:
+        isbn = normalize_isbn_to_13(text)
+        if isbn is not None:
+            converted.add(isbn)
+    return sorted(converted)
+
+
+def _handle_isbn_min(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> str:
+    """Pick the group's ISBN-13 key.
+
+    Uses the first entry of the already aggregated ``isbns`` field in
+    ``result``, so the ``isbns`` rule must run before this one (presumably
+    ``isbns`` is sorted, making this the smallest ISBN -- verify against the
+    mapping CSV). When ``isbns`` is empty or absent, falls back to the
+    ``isbn13`` value of the first record.
+
+    Args:
+        records: Source records of one group.
+        rule: The rule being applied (unused).
+        result: Aggregation result built so far.
+
+    Returns:
+        The chosen ISBN, or ``""`` when nothing is available.
+    """
+    isbns = result.get("isbns", [])
+    if isbns:
+        return isbns[0]
+    if not records:
+        return ""
+    fallback = records[0].get("isbn13")
+    # A key present with a NULL value must not become the literal "None".
+    return "" if fallback is None else str(fallback)
+
+
+# Strategy name (as written in the mapping CSV) -> handler implementing it.
+# aggregate_by_rules raises ValueError for a strategy that is not listed here.
+STRATEGY_HANDLERS: Dict[str, StrategyHandler] = {
+    "freq_lex_max": _handle_freq_lex_max,
+    "freq_int_max": _handle_freq_int_max,
+    "dedup_array": _handle_dedup_array,
+    "merge_map": _handle_merge_map,
+    "max_int": _handle_max_int,
+    "latest_dt": _handle_latest_dt,
+    "isbn_normalize": _handle_isbn_normalize,
+    "isbn_min": _handle_isbn_min,
+}
+
+
+# ---- aggregation ----
+
+
+def aggregate_group(records: List[Dict[str, Any]], rules: Sequence[FieldRule]) -> Dict[str, Any]:
+    """Aggregate one group of records using the ebook strategy handlers."""
+    aggregated = aggregate_by_rules(records, rules, handlers=STRATEGY_HANDLERS)
+    return aggregated
+
+
+# ---- DB validation helpers ----
+
+
+def _log(message: str) -> None:
+    """Write *message* plus a newline to stderr and flush right away."""
+    sys.stderr.write(f"{message}\n")
+    sys.stderr.flush()
+
+
+def load_config(path: Path) -> Dict[str, Any]:
+    """Read and parse the JSON settings file at *path*.
+
+    Raises:
+        FileNotFoundError: if *path* does not exist; the message explains how
+            to create it from the template config.
+    """
+    if path.exists():
+        with path.open("r", encoding="utf-8") as handle:
+            return json.load(handle)
+    raise FileNotFoundError(
+        f"Config file not found: {path}\n"
+        f"Copy the template and fill in credentials:\n"
+        f"  cp {TEMPLATE_CONFIG_PATH} {path}"
+    )
+
+
+def connect_starrocks(config_path: Path):
+    """Open a pymysql connection to StarRocks, retrying transient failures.
+
+    Connection settings are read from the ``mysql`` section of the JSON config
+    at *config_path*. The optional ``retry`` section may provide
+    ``max_attempts`` (default 3), ``initial_delay_sec`` (default 2.0) and
+    ``backoff_factor`` (default 2.0); the delay is multiplied by the backoff
+    factor after every failed attempt.
+
+    Returns:
+        The object returned by ``pymysql.connect``.
+
+    Raises:
+        RuntimeError: if pymysql is not available.
+        FileNotFoundError: if the config file does not exist.
+        Exception: the connect error itself, when it is not retryable or the
+            last attempt has failed.
+    """
+    if pymysql is None:
+        raise RuntimeError("pymysql is required. Install pymysql before running DB validation.")
+    cfg = load_config(config_path)
+    mysql_cfg = cfg["mysql"]
+    retry_cfg = cfg.get("retry", {}) if isinstance(cfg.get("retry"), dict) else {}
+    max_attempts = max(1, int(retry_cfg.get("max_attempts", 3)))
+    delay = max(0.0, float(retry_cfg.get("initial_delay_sec", 2.0)))
+    backoff = max(1.0, float(retry_cfg.get("backoff_factor", 2.0)))
+    read_timeout = int(mysql_cfg.get("read_timeout_sec", 600))
+
+    def _is_retryable_connect_error(exc: Exception) -> bool:
+        # pymysql is known to be importable here: the guard above raised otherwise.
+        if isinstance(exc, pymysql.err.OperationalError):
+            code = exc.args[0] if exc.args else None
+            # MySQL client error codes: 2003 cannot connect, 2006 server gone
+            # away, 2013 lost connection during query.
+            if code in (2003, 2006, 2013):
+                return True
+        # Fall back to message matching for errors that carry no such code.
+        msg = str(exc).lower()
+        return any(token in msg for token in ("lost connection", "can't connect", "timed out", "timeout"))
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            # Do not pass database= on connect: this StarRocks endpoint drops
+            # auth when a default schema is selected; use fully-qualified table names in SQL.
+            return pymysql.connect(
+                host=mysql_cfg["host"],
+                port=int(mysql_cfg["port"]),
+                user=mysql_cfg["user"],
+                password=mysql_cfg["password"],
+                charset=mysql_cfg.get("charset", "utf8mb4"),
+                connect_timeout=30,
+                read_timeout=read_timeout,
+            )
+        except Exception as exc:
+            if attempt >= max_attempts or not _is_retryable_connect_error(exc):
+                raise
+            # Report on stderr like every other progress message, so stdout
+            # stays reserved for the JSON result printed by validate_db.
+            _log(
+                f"[retry] MySQL 连接失败 ({type(exc).__name__}: {exc})，"
+                f"{delay:.1f}s 后重试 ({attempt}/{max_attempts})"
+            )
+            time.sleep(delay)
+            delay *= backoff
+
+    # Unreachable in practice: the loop either returns or re-raises.
+    raise RuntimeError("MySQL connection retry exhausted unexpectedly")
+
+
+def qualify_table_name(
+    table: str,
+    catalog: Optional[str],
+    database: str = "dws",
+) -> str:
+    """Expand *table* towards ``catalog.database.table`` form.
+
+    * one name part: prefixed with *database*, and with *catalog* when set;
+    * two parts (``db.table``): prefixed with *catalog* when set, otherwise
+      returned unchanged;
+    * three or more parts, or no usable part at all: returned unchanged.
+
+    Parts are split on ``.`` and stripped; empty parts are ignored.
+    """
+    segments = [name for name in (piece.strip() for piece in table.split(".")) if name]
+    count = len(segments)
+    if count == 1:
+        qualified = f"{database}.{segments[0]}"
+        return f"{catalog}.{qualified}" if catalog else qualified
+    if count == 2 and catalog:
+        return f"{catalog}.{segments[0]}.{segments[1]}"
+    # Already fully qualified, a bare db.table without catalog, or nothing to qualify.
+    return table
+
+
+def quote_identifier(identifier: str) -> str:
+    """Backtick-quote every dot-separated part of *identifier*.
+
+    Parts are stripped, empty parts are skipped, and embedded backticks are
+    doubled.
+
+    Raises:
+        ValueError: if no non-empty part remains.
+    """
+    quoted = []
+    for raw in identifier.split("."):
+        name = raw.strip()
+        if name:
+            quoted.append("`" + name.replace("`", "``") + "`")
+    if not quoted:
+        raise ValueError(f"Invalid identifier: {identifier!r}")
+    return ".".join(quoted)
+
+
+def fetch_records(conn: Any, sql: str, params: Sequence[Any] = ()) -> List[Dict[str, Any]]:
+    """Execute *sql* with *params* and return every row as a column-name dict.
+
+    Returns an empty list when the cursor reports no result description.
+    """
+    with conn.cursor() as cur:
+        cur.execute(sql, params)
+        description = cur.description
+        if description is None:
+            return []
+        column_names = [column[0] for column in description]
+        records: List[Dict[str, Any]] = []
+        for values in cur.fetchall():
+            records.append(dict(zip(column_names, values)))
+        return records
+
+
+def normalize_json_like(value: Any) -> Any:
+    """Decode bytes and parse strings that look like a JSON array or object.
+
+    Bytes are decoded as UTF-8 with replacement characters. A string whose
+    stripped form starts with ``[`` or ``{`` is parsed as JSON; if parsing
+    fails the (decoded) string is returned as is. Every other value is
+    returned untouched.
+    """
+    text = value.decode("utf-8", errors="replace") if isinstance(value, (bytes, bytearray)) else value
+    if not isinstance(text, str):
+        return text
+    candidate = text.strip()
+    if not candidate.startswith(("[", "{")):
+        return text
+    try:
+        return json.loads(candidate)
+    except json.JSONDecodeError:
+        return text
+
+
+def canonicalize(value: Any) -> Any:
+    """Convert a DB or aggregated value into a plain, comparable Python value.
+
+    * JSON-looking strings and bytes are first decoded by ``normalize_json_like``;
+    * ``Decimal`` becomes ``int`` when it is a finite whole number, else ``float``;
+    * ``date`` and ``datetime`` become ISO-8601 strings;
+    * dicts get string keys, ordered by key text, with values canonicalized;
+    * lists are canonicalized element by element.
+
+    Anything else is returned unchanged.
+    """
+    value = normalize_json_like(value)
+    if isinstance(value, Decimal):
+        # Infinity equals its own integral value, so without the finiteness
+        # check int() would raise OverflowError; non-finite values go to float.
+        if value.is_finite() and value == value.to_integral_value():
+            return int(value)
+        return float(value)
+    if isinstance(value, (date, datetime)):
+        return value.isoformat()
+    if isinstance(value, dict):
+        return {str(k): canonicalize(v) for k, v in sorted(value.items(), key=lambda item: str(item[0]))}
+    if isinstance(value, list):
+        return [canonicalize(v) for v in value]
+    return value
+
+
+def canonical_json(value: Any) -> str:
+    """Serialize *value* as compact JSON with sorted keys and raw non-ASCII text."""
+    compact_separators = (",", ":")
+    return json.dumps(
+        value,
+        separators=compact_separators,
+        sort_keys=True,
+        ensure_ascii=False,
+    )
+
+
+def comparable_record(record: Dict[str, Any], fields: Iterable[str]) -> Dict[str, Any]:
+    """Project *record* onto *fields*, canonicalizing every value.
+
+    Fields absent from *record* are canonicalized from None.
+    """
+    projected: Dict[str, Any] = {}
+    for name in fields:
+        projected[name] = canonicalize(record.get(name))
+    return projected
+
+
+def _dt_clause(dt: Optional[str], params: List[Any]) -> str:
+    """Return the ``dt`` filter fragment and register its bind parameter.
+
+    When *dt* is None nothing is appended to *params* and an empty string is
+    returned; otherwise *dt* is appended and the placeholder clause returned.
+    """
+    if dt is None:
+        return ""
+    params.append(dt)
+    return " AND `dt` = %s"
+
+
+def _limit_clause(limit: Optional[int]) -> str:
+    """Return a ``LIMIT n`` fragment, or an empty string when *limit* is None."""
+    if limit is None:
+        return ""
+    # int() keeps anything non-numeric out of the generated SQL.
+    return " LIMIT " + str(int(limit))
+
+
+def source_canonical_isbn13_expr(array_field: str = "`isbns`") -> str:
+    """Build the SQL expression yielding one canonical ISBN-13 per source row.
+
+    For every element of *array_field* the expression trims it and removes
+    hyphens, keeps 13-digit values as they are, prefixes ``978`` to values
+    shaped like an ISBN-10, and maps anything else to NULL. It then filters
+    out NULL and empty results, de-duplicates, and takes the array minimum.
+    Intended to mirror ``normalize_isbn_to_13`` followed by a minimum.
+    """
+    stripped = "regexp_replace(trim(x), '-', '')"
+    when_isbn13 = "WHEN " + stripped + " REGEXP '^[0-9]{13}$' THEN " + stripped
+    when_isbn10 = (
+        "WHEN " + stripped + " REGEXP '^[0-9]{9}[0-9Xx]$' THEN concat('978', " + stripped + ")"
+    )
+    case_expr = " ".join(("CASE", when_isbn13, when_isbn10, "ELSE NULL END"))
+    mapped = f"array_map(x -> {case_expr}, {array_field})"
+    keep_non_empty = "x -> x IS NOT NULL AND x != ''"
+    return "array_min(array_distinct(array_filter(" + mapped + ", " + keep_non_empty + ")))"
+
+
+def _key_not_null_clause(key_expr: str) -> str:
+    """Return the SQL fragment requiring *key_expr* to be neither NULL nor empty."""
+    conditions = (f"{key_expr} IS NOT NULL", f"{key_expr} != ''")
+    return "".join(f" AND {condition}" for condition in conditions)
+
+
+def _hash_sample_predicate(
+    mod_base: Optional[int],
+    mod_max: Optional[int],
+    *,
+    key_expr: str = "`isbn13`",
+) -> str:
+    """Return a CRC32-modulo filter fragment, or an empty string when disabled.
+
+    The filter keeps rows whose ``ABS(CRC32(key)) MOD mod_base`` is below
+    *mod_max*. It is disabled when either bound is missing or zero, or when
+    *mod_max* is not positive.
+    """
+    sampling_enabled = bool(mod_base) and bool(mod_max) and mod_max > 0
+    if not sampling_enabled:
+        return ""
+    bucket = f"ABS(CRC32({key_expr})) MOD {int(mod_base)}"
+    return f" AND ({bucket}) < {int(mod_max)}"
+
+
+def _sample_order_clause(*, high_first: bool = False, key_expr: str = "sample_key") -> str:
+    """Return the ORDER BY expression list used when sampling keys.
+
+    Rows are ordered by ``CRC32(key_expr)``; with *high_first* the list is
+    preceded by ``source_count DESC``.
+    """
+    hash_order = f"CRC32({key_expr})"
+    return f"source_count DESC, {hash_order}" if high_first else hash_order
+
+
+def build_target_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    *,
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> Tuple[str, List[Any]]:
+    """Build the query sampling non-empty ``isbn13`` keys from the target table.
+
+    Keys are optionally restricted to one ``dt`` and to a CRC32 hash bucket,
+    ordered by ``CRC32(isbn13)`` and capped by *limit*.
+
+    Returns:
+        ``(sql, params)`` where *params* holds the bind values for *sql*.
+    """
+    params: List[Any] = []
+    key_column = "`isbn13`"
+    pieces = [
+        f"SELECT `isbn13` AS sample_key FROM {quote_identifier(table)} ",
+        "WHERE 1=1",
+        _key_not_null_clause(key_column),
+        _dt_clause(dt, params),
+        _hash_sample_predicate(hash_mod_base, hash_mod_max, key_expr=key_column),
+        f" ORDER BY {_sample_order_clause(key_expr=key_column)}",
+        _limit_clause(limit),
+    ]
+    return "".join(pieces), params
+
+
+def build_target_first_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+) -> Tuple[str, List[Any]]:
+    """Build the unordered ``isbn13`` key query against the target table.
+
+    No ORDER BY and no hash filter are applied, only the non-empty key check,
+    the optional ``dt`` filter and the optional LIMIT.
+
+    Returns:
+        ``(sql, params)`` where *params* holds the bind values for *sql*.
+    """
+    params: List[Any] = []
+    pieces = [
+        f"SELECT `isbn13` AS sample_key FROM {quote_identifier(table)} ",
+        "WHERE 1=1",
+        _key_not_null_clause("`isbn13`"),
+        _dt_clause(dt, params),
+        _limit_clause(limit),
+    ]
+    return "".join(pieces), params
+
+
+def build_random_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    *,
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> Tuple[str, List[Any]]:
+    """Alias of ``build_target_key_query`` with identical arguments and result."""
+    hash_options = {"hash_mod_base": hash_mod_base, "hash_mod_max": hash_mod_max}
+    return build_target_key_query(table, dt, limit, **hash_options)
+
+
+def build_duplicate_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    *,
+    high_first: bool,
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> Tuple[str, List[Any]]:
+    """Build the query selecting canonical keys that occur in more than one source row.
+
+    The inner query derives ``sample_key`` per source row (optionally for one
+    ``dt``); the outer query drops empty keys, applies the optional hash
+    filter, groups by key, keeps groups with more than one row and orders
+    them, by descending count first when *high_first* is set.
+
+    Returns:
+        ``(sql, params)`` where *params* holds the bind values for *sql*.
+    """
+    params: List[Any] = []
+    canonical_key = source_canonical_isbn13_expr()
+    keyed_rows = (
+        f"SELECT {canonical_key} AS sample_key FROM {quote_identifier(table)} WHERE 1=1"
+        + _dt_clause(dt, params)
+    )
+    outer_filter = _key_not_null_clause("sample_key") + _hash_sample_predicate(
+        hash_mod_base, hash_mod_max, key_expr="sample_key"
+    )
+    ordering = _sample_order_clause(high_first=high_first) + _limit_clause(limit)
+    sql = "".join([
+        "SELECT sample_key, COUNT(*) AS source_count FROM (",
+        keyed_rows,
+        ") keyed WHERE 1=1",
+        outer_filter,
+        " GROUP BY sample_key HAVING COUNT(*) > 1 ",
+        f"ORDER BY {ordering}",
+    ])
+    return sql, params
+
+
+def build_field_conflict_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    *,
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> Tuple[str, List[Any]]:
+    """Build the query selecting duplicated keys whose source rows disagree.
+
+    A key qualifies when it has more than one source row and at least one of
+    the checked columns has more than one distinct value in the group.
+    Results are ordered by descending row count, then by key hash.
+
+    Returns:
+        ``(sql, params)`` where *params* holds the bind values for *sql*.
+    """
+    params: List[Any] = []
+    canonical_key = source_canonical_isbn13_expr()
+    compared_columns = ("title", "abstract", "language", "published_year", "pages", "category")
+    column_list = ", ".join(f"`{column}`" for column in compared_columns)
+    any_conflict = " OR ".join(f"COUNT(DISTINCT `{column}`) > 1" for column in compared_columns)
+    keyed_rows = (
+        f"SELECT {canonical_key} AS sample_key, {column_list} FROM {quote_identifier(table)} WHERE 1=1"
+        + _dt_clause(dt, params)
+    )
+    outer_filter = _key_not_null_clause("sample_key") + _hash_sample_predicate(
+        hash_mod_base, hash_mod_max, key_expr="sample_key"
+    )
+    ordering = _sample_order_clause(high_first=True) + _limit_clause(limit)
+    sql = "".join([
+        "SELECT sample_key, COUNT(*) AS source_count FROM (",
+        keyed_rows,
+        ") keyed WHERE 1=1",
+        outer_filter,
+        " GROUP BY sample_key HAVING COUNT(*) > 1 AND ",
+        f"({any_conflict}) ",
+        f"ORDER BY {ordering}",
+    ])
+    return sql, params
+
+
+def build_count_bucket_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    *,
+    bucket: str,
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> Tuple[str, List[Any]]:
+    """Build the query selecting canonical keys by their source-row count.
+
+    *bucket* selects the group size: ``"one"`` (exactly 1 row), ``"two"``
+    (exactly 2 rows) or ``"multi"`` (more than 2 rows).
+
+    Returns:
+        ``(sql, params)`` where *params* holds the bind values for *sql*.
+
+    Raises:
+        ValueError: if *bucket* is not one of the supported names.
+    """
+    params: List[Any] = []
+    canonical_key = source_canonical_isbn13_expr()
+    having = None
+    for bucket_name, condition in (
+        ("one", "COUNT(*) = 1"),
+        ("two", "COUNT(*) = 2"),
+        ("multi", "COUNT(*) > 2"),
+    ):
+        if bucket == bucket_name:
+            having = condition
+            break
+    if having is None:
+        raise ValueError(f"Unsupported count bucket: {bucket}")
+    keyed_rows = (
+        f"SELECT {canonical_key} AS sample_key FROM {quote_identifier(table)} WHERE 1=1"
+        + _dt_clause(dt, params)
+    )
+    outer_filter = _key_not_null_clause("sample_key") + _hash_sample_predicate(
+        hash_mod_base, hash_mod_max, key_expr="sample_key"
+    )
+    ordering = _sample_order_clause() + _limit_clause(limit)
+    sql = "".join([
+        "SELECT sample_key, COUNT(*) AS source_count FROM (",
+        keyed_rows,
+        ") keyed WHERE 1=1",
+        outer_filter,
+        f" GROUP BY sample_key HAVING {having} ",
+        f"ORDER BY {ordering}",
+    ])
+    return sql, params
+
+
+def _append_sample_key(
+    keys: List[str],
+    seen: set,
+    key: str,
+    *,
+    sample_size: Optional[int],
+) -> bool:
+    """Record *key* if it is new and report whether the sample is now full.
+
+    Empty and already-seen keys are ignored and yield False. Otherwise the
+    key is added to *seen* and *keys*, and True is returned only when
+    *sample_size* is set and *keys* has reached it.
+    """
+    is_new = bool(key) and key not in seen
+    if not is_new:
+        return False
+    keys.append(key)
+    seen.add(key)
+    if sample_size is None:
+        return False
+    return len(keys) >= sample_size
+
+
+def fetch_sample_keys(
+    conn: Any,
+    *,
+    source_table: str,
+    target_table: str,
+    dt: Optional[str],
+    sample_mode: str,
+    sample_size: Optional[int],
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> List[str]:
+    """Run the sampling queries for *sample_mode* and collect distinct keys.
+
+    Modes:
+        * ``target-first``: one unordered LIMIT query on the target table;
+        * ``target-random``: one hash-ordered query on the target table;
+        * ``count-buckets``: three source queries (1, 2, more than 2 rows per
+          key), each limited to a third of *sample_size*;
+        * ``mixed``: the three bucket queries plus field-conflict,
+          high-duplicate and target-random queries, each limited to a sixth
+          of *sample_size*.
+
+    Queries run in plan order and collection stops as soon as *sample_size*
+    keys have been gathered. Progress is logged through ``_log``.
+
+    Raises:
+        ValueError: if *sample_mode* is not one of the modes above.
+    """
+    hash_kw = {"hash_mod_base": hash_mod_base, "hash_mod_max": hash_mod_max}
+    query_plan: List[Tuple[str, Tuple[str, List[Any]]]]
+
+    def _count_bucket_plan(per_bucket: Optional[int]) -> List[Tuple[str, Tuple[str, List[Any]]]]:
+        # One source-table query per row-count bucket, in fixed order.
+        return [
+            (label, build_count_bucket_key_query(source_table, dt, per_bucket, bucket=bucket, **hash_kw))
+            for label, bucket in (("count=1", "one"), ("count=2", "two"), ("count>2", "multi"))
+        ]
+
+    if sample_mode == "target-first":
+        query_plan = [("target-first", build_target_first_key_query(target_table, dt, sample_size))]
+    elif sample_mode == "target-random":
+        query_plan = [("target-random", build_target_key_query(target_table, dt, sample_size, **hash_kw))]
+    elif sample_mode == "count-buckets":
+        share = None if sample_size is None else max(1, sample_size // 3)
+        query_plan = _count_bucket_plan(share)
+    elif sample_mode == "mixed":
+        share = None if sample_size is None else max(1, sample_size // 6)
+        query_plan = _count_bucket_plan(share)
+        query_plan.append(
+            ("field-conflict", build_field_conflict_key_query(source_table, dt, share, **hash_kw))
+        )
+        query_plan.append(
+            ("high-duplicate", build_duplicate_key_query(source_table, dt, share, high_first=True, **hash_kw))
+        )
+        query_plan.append(
+            ("target-random", build_random_key_query(target_table, dt, share, **hash_kw))
+        )
+    else:
+        raise ValueError(f"Unsupported sample_mode: {sample_mode}")
+
+    collected: List[str] = []
+    already_seen: set = set()
+    total = len(query_plan)
+
+    for idx, (label, (sql, params)) in enumerate(query_plan, start=1):
+        _log(
+            f"[info] 抽样 SQL {idx}/{total} [{label}] 开始执行"
+            f"（dt={dt!r}, mode={sample_mode}）…"
+        )
+        started = time.monotonic()
+        for row in fetch_records(conn, sql, params):
+            candidate = str(row.get("sample_key") or "")
+            sample_full = _append_sample_key(collected, already_seen, candidate, sample_size=sample_size)
+            if not sample_full:
+                continue
+            # Target reached: skip the remaining rows and queries.
+            _log(
+                f"[info] 抽样 SQL {idx}/{total} [{label}] 完成，"
+                f"耗时 {time.monotonic() - started:.1f}s，已收集 {len(collected)} 个 key"
+            )
+            return collected
+        _log(
+            f"[info] 抽样 SQL {idx}/{total} [{label}] 完成，"
+            f"耗时 {time.monotonic() - started:.1f}s，当前共 {len(collected)} 个 key"
+        )
+    return collected
+
+
+def build_target_record_query(table: str, isbn13: Any, dt: Optional[str]) -> Tuple[str, List[Any]]:
+    """Build the single-row lookup of *isbn13* in the target table.
+
+    Returns:
+        ``(sql, params)``; *params* lists *dt* first (when given) and then the
+        key as a string, matching the placeholder order in *sql*.
+    """
+    params: List[Any] = []
+    dt_sql = _dt_clause(dt, params)
+    params.append(str(isbn13))
+    sql = f"SELECT * FROM {quote_identifier(table)} WHERE 1=1{dt_sql} AND `isbn13` = %s LIMIT 1"
+    return sql, params
+
+
+def build_source_query(table: str, isbn13: Any, dt: Optional[str]) -> Tuple[str, List[Any]]:
+    """Build the query fetching all source rows whose canonical key is *isbn13*.
+
+    The inner query adds the derived ``sample_key`` column to every source row
+    (optionally for one ``dt``); the outer query filters on it.
+
+    Returns:
+        ``(sql, params)``; *params* lists *dt* first (when given) and then the
+        key as a string, matching the placeholder order in *sql*.
+    """
+    canonical_key = source_canonical_isbn13_expr()
+    params: List[Any] = []
+    dt_sql = _dt_clause(dt, params)
+    params.append(str(isbn13))
+    keyed_rows = f"SELECT *, {canonical_key} AS sample_key FROM {quote_identifier(table)} WHERE 1=1{dt_sql}"
+    sql = "SELECT * FROM (" + keyed_rows + ") keyed WHERE sample_key = %s"
+    return sql, params
+
+
+def build_source_batch_query(
+    table: str,
+    sample_keys: Sequence[str],
+    dt: Optional[str],
+) -> Tuple[str, List[Any]]:
+    """Build one query fetching the source rows of every key in *sample_keys*.
+
+    The keys are supplied as a ``UNION ALL`` of bound literals in a CTE and
+    joined against the source rows keyed by their canonical ISBN-13.
+
+    Returns:
+        ``(sql, params)``; *params* lists the keys (as strings) first and then
+        *dt* when given, matching the placeholder order in *sql*.
+
+    Raises:
+        ValueError: if *sample_keys* is empty.
+    """
+    if not sample_keys:
+        raise ValueError("sample_keys must not be empty")
+
+    canonical_key = source_canonical_isbn13_expr()
+    key_rows = " UNION ALL ".join(["SELECT %s AS sample_key"] * len(sample_keys))
+    params: List[Any] = [str(key) for key in sample_keys]
+    dt_sql = _dt_clause(dt, params)
+
+    keyed_rows = f"SELECT *, {canonical_key} AS sample_key FROM {quote_identifier(table)} WHERE 1=1{dt_sql}"
+    sql = "".join([
+        f"WITH sample_keys AS ({key_rows}), ",
+        "source_keyed AS (",
+        keyed_rows,
+        ") ",
+        "SELECT source_keyed.* FROM source_keyed ",
+        "JOIN sample_keys ON source_keyed.sample_key = sample_keys.sample_key",
+    ])
+    return sql, params
+
+
+def group_source_rows_by_sample_key(
+    rows: Sequence[Dict[str, Any]],
+) -> Dict[str, List[Dict[str, Any]]]:
+    """Group *rows* by the string form of their ``sample_key`` column.
+
+    Rows whose key is missing or falsy are skipped. Groups keep the order in
+    which their key first appears, and rows keep their input order.
+    """
+    buckets: Dict[str, List[Dict[str, Any]]] = {}
+    for row in rows:
+        label = str(row.get("sample_key") or "")
+        if label:
+            if label not in buckets:
+                buckets[label] = []
+            buckets[label].append(row)
+    return buckets
+
+
+def normalize_order_insensitive_value(value: Any) -> Any:
+    """Canonicalize *value* and, for lists, make it independent of order.
+
+    List elements that are None or an empty string are dropped, duplicates
+    (by canonical JSON text) are collapsed, and the survivors are returned
+    ordered by that JSON text. Non-list values are only canonicalized.
+    """
+    normalized = canonicalize(value)
+    if not isinstance(normalized, list):
+        return normalized
+    by_json: Dict[str, Any] = {}
+    for element in normalized:
+        if element is None or element == "":
+            continue
+        by_json[canonical_json(element)] = element
+    ordered = sorted(by_json.items(), key=lambda pair: pair[0])
+    return [element for _, element in ordered]
+
+
+def normalize_empty_for_compare(value: Any, data_type: str) -> Any:
+    """Map the "empty" representations of a typed value to None.
+
+    * None stays None for every type;
+    * for string types (``string``, ``varchar``, ``char``, ``text``, with or
+      without a length such as ``varchar(255)``) a blank string becomes None;
+    * for array types (names starting with ``array``) an empty list, a blank
+      string and the text ``[]`` become None.
+
+    Every other value is returned unchanged. Type names are matched
+    case-insensitively.
+    """
+    type_text = (data_type or "").strip().lower()
+    if value is None:
+        return None
+    # Compare on the base name so parameterized declarations like
+    # "varchar(255)" get the same treatment as plain "varchar".
+    base_type = type_text.split("(", 1)[0].strip()
+    if base_type in ("string", "varchar", "char", "text"):
+        return None if isinstance(value, str) and value.strip() == "" else value
+    if type_text.startswith("array"):
+        if value == []:
+            return None
+        if isinstance(value, str) and value.strip() in ("", "[]"):
+            return None
+    return value
+
+
+def compare_records(
+    expected: Dict[str, Any],
+    actual: Dict[str, Any],
+    order_insensitive_fields: Optional[set] = None,
+    field_types: Optional[Dict[str, str]] = None,
+) -> Dict[str, Dict[str, Any]]:
+    """Compare *actual* against *expected* field by field.
+
+    Only the fields present in *expected* are checked. Fields listed in
+    *order_insensitive_fields* are compared after order-insensitive
+    normalization on both sides; for the others the actual value is
+    canonicalized and the expected value used as given. Both sides then go
+    through the type-aware empty-value normalization.
+
+    Returns:
+        A mapping ``field -> {"expected": ..., "actual": ...}`` holding only
+        the fields that differ; empty when the records match.
+    """
+    unordered = order_insensitive_fields if order_insensitive_fields else set()
+    types = field_types if field_types else {}
+    differences: Dict[str, Dict[str, Any]] = {}
+    for name in expected:
+        data_type = types.get(name, "")
+        if name in unordered:
+            want = normalize_order_insensitive_value(expected[name])
+            got = normalize_order_insensitive_value(actual.get(name))
+        else:
+            want = expected[name]
+            got = canonicalize(actual.get(name))
+        want = normalize_empty_for_compare(want, data_type)
+        got = normalize_empty_for_compare(got, data_type)
+        if want != got:
+            differences[name] = {"expected": want, "actual": got}
+    return differences
+
+
+def validate_dt_partitions(
+    conn: Any,
+    source_table: str,
+    target_table: str,
+    dt: Optional[str],
+    *,
+    skip_source_distinct: bool = False,
+) -> Dict[str, Any]:
+    """Compare per-``dt`` distinct source keys with per-``dt`` target rows.
+
+    The source side counts distinct non-empty canonical ISBN-13 keys per
+    partition; the target side counts rows per partition. With
+    *skip_source_distinct* the source query is not run and the source map
+    stays empty, so every target partition then appears in
+    ``extra_in_target`` and ``count_mismatches`` (with a None source count).
+
+    Returns:
+        A dict with ``source_dt_count``, ``target_dt_count``,
+        ``missing_in_target``, ``extra_in_target``, ``count_mismatches`` and
+        ``source_distinct_skipped``.
+    """
+    # Both statements contain the same single optional dt placeholder, so
+    # they can share one parameter list.
+    params: List[Any] = []
+    dt_filter = _dt_clause(dt, params)
+
+    source_counts: Dict[str, int] = {}
+    if not skip_source_distinct:
+        canonical_key = source_canonical_isbn13_expr()
+        src_sql = "".join([
+            "SELECT `dt`, COUNT(DISTINCT sample_key) AS key_count FROM (",
+            f"SELECT `dt`, {canonical_key} AS sample_key FROM {quote_identifier(source_table)}",
+            f" WHERE 1=1{dt_filter}",
+            ") keyed WHERE 1=1",
+            _key_not_null_clause("sample_key"),
+            " GROUP BY `dt` ORDER BY `dt`",
+        ])
+        for row in fetch_records(conn, src_sql, params):
+            source_counts[str(row["dt"])] = int(row["key_count"])
+
+    tgt_sql = "".join([
+        "SELECT `dt`, COUNT(*) AS row_count",
+        f" FROM {quote_identifier(target_table)}",
+        f" WHERE 1=1{dt_filter} GROUP BY `dt` ORDER BY `dt`",
+    ])
+    target_counts: Dict[str, int] = {}
+    for row in fetch_records(conn, tgt_sql, params):
+        target_counts[str(row["dt"])] = int(row["row_count"])
+
+    source_dts = set(source_counts)
+    target_dts = set(target_counts)
+    count_mismatches: List[Dict[str, Any]] = [
+        {
+            "dt": partition,
+            "source_key_count": source_counts.get(partition),
+            "target_row_count": target_counts.get(partition),
+        }
+        for partition in sorted(source_dts | target_dts)
+        if source_counts.get(partition) != target_counts.get(partition)
+    ]
+
+    return {
+        "source_dt_count": len(source_counts),
+        "target_dt_count": len(target_counts),
+        "missing_in_target": sorted(source_dts - target_dts),
+        "extra_in_target": sorted(target_dts - source_dts),
+        "count_mismatches": count_mismatches,
+        "source_distinct_skipped": skip_source_distinct,
+    }
+
+
+def discover_dt_values(conn: Any, table: str) -> List[str]:
+    """Return the distinct non-empty ``dt`` values of *table*, ordered by ``dt``."""
+    quoted_table = quote_identifier(table)
+    sql = f"SELECT DISTINCT `dt` FROM {quoted_table} " + "WHERE `dt` IS NOT NULL AND `dt` != '' ORDER BY `dt`"
+    partitions: List[str] = []
+    for row in fetch_records(conn, sql):
+        partitions.append(str(row["dt"]))
+    return partitions
+
+
+def validate_db(
+    *,
+    config_path: Path,
+    source_table: str,
+    target_table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    sample_mode: str,
+    report_path: Optional[Path],
+    mapping_csv: Path = DEFAULT_MAPPING_CSV,
+    skip_dt_check: bool = False,
+    skip_source_distinct: bool = False,
+    hash_mod_base: Optional[int] = 100,
+    hash_mod_max: Optional[int] = 2,
+) -> Dict[str, Any]:
+    """Validate the de-duplicated ebook table against its source by sampling.
+
+    For each partition (the given *dt*, or every ``dt`` found in the source
+    table) a set of ISBN-13 keys is sampled, the matching source rows are
+    fetched in one batch and aggregated with the mapping rules, and the
+    aggregate is compared with the target row for the same key.
+
+    Args:
+        config_path: JSON settings file holding the connection details.
+        source_table: source table name; qualified with catalog and database
+            from the config when not already qualified.
+        target_table: target table name, qualified the same way.
+        dt: partition to validate, or None to validate all source partitions.
+        limit: sample size per partition, or None for no limit.
+        sample_mode: sampling mode understood by ``fetch_sample_keys``.
+        report_path: JSONL file receiving one line per problem row, or None
+            to write no report files.
+        mapping_csv: CSV describing the field rules.
+        skip_dt_check: skip the per-partition count comparison.
+        skip_source_distinct: skip the source DISTINCT count in that comparison.
+        hash_mod_base: modulus of the CRC32 sampling filter.
+        hash_mod_max: upper bound (exclusive) of accepted hash buckets; the
+            filter is disabled unless both hash values are truthy and this
+            one is positive.
+
+    Returns:
+        The summary dict, which is also printed to stdout as one JSON line.
+    """
+    rules = load_field_rules(mapping_csv)
+    output_fields = output_fields_from_rules(rules)
+    order_insensitive_fields = order_insensitive_fields_from_rules(rules)
+    field_types = {rule.field_name: rule.data_type for rule in rules}
+    cfg = load_config(config_path)
+    mysql_cfg = cfg.get("mysql", {}) if isinstance(cfg.get("mysql"), dict) else {}
+    catalog = mysql_cfg.get("catalog")
+    database = str(mysql_cfg.get("database") or "dws")
+    source_table = qualify_table_name(source_table, catalog, database)
+    target_table = qualify_table_name(target_table, catalog, database)
+    hash_enabled = bool(hash_mod_base and hash_mod_max and hash_mod_max > 0)
+    _log(
+        f"[info] 图书去重校验开始：dt={dt!r}, limit={limit}, sample_mode={sample_mode}, "
+        f"hash_sample={'on' if hash_enabled else 'off'}, "
+        f"skip_dt_check={skip_dt_check}, source={source_table}, target={target_table}"
+    )
+    with connect_starrocks(config_path) as conn:
+        _log("[info] StarRocks 连接成功")
+        if dt is not None:
+            dt_list = [dt]
+        else:
+            _log("[info] 正在发现源表 dt 分区…")
+            dt_list = discover_dt_values(conn, source_table)
+            _log(f"[info] 自动发现 {len(dt_list)} 个 dt 分区，逐分区验证")
+
+        if skip_dt_check:
+            dt_check = {"skipped": True}
+            _log("[info] 跳过分区行数统计（--skip-dt-check）")
+        else:
+            _log("[info] 正在统计目标/源分区行数（源表 DISTINCT 可较慢，可用 --skip-source-distinct 跳过）…")
+            t0 = time.monotonic()
+            dt_check = validate_dt_partitions(
+                conn,
+                source_table,
+                target_table,
+                dt,
+                skip_source_distinct=skip_source_distinct,
+            )
+            _log(f"[info] 分区统计完成，耗时 {time.monotonic() - t0:.1f}s")
+
+        # Totals accumulated over all validated partitions.
+        checked = passed = failed = missing_source = missing_target = 0
+        source_count_buckets = {"one": 0, "two": 0, "multi": 0}
+        mismatch_rows: List[Dict[str, Any]] = []
+
+        for partition_dt in dt_list:
+            _log(f"[info] 分区 {partition_dt}：开始抽样 key…")
+            sample_keys = fetch_sample_keys(
+                conn,
+                source_table=source_table,
+                target_table=target_table,
+                dt=partition_dt,
+                sample_mode=sample_mode,
+                sample_size=limit,
+                hash_mod_base=hash_mod_base if hash_enabled else None,
+                hash_mod_max=hash_mod_max if hash_enabled else None,
+            )
+            _log(f"[info] 分区 {partition_dt}：抽到 {len(sample_keys)} 个 ISBN13，开始批量拉取源记录…")
+            t0 = time.monotonic()
+            source_rows_by_key: Dict[str, List[Dict[str, Any]]] = {}
+            if sample_keys:
+                source_sql, source_params = build_source_batch_query(source_table, sample_keys, partition_dt)
+                source_rows_by_key = group_source_rows_by_sample_key(
+                    fetch_records(conn, source_sql, source_params)
+                )
+            _log(
+                f"[info] 分区 {partition_dt}：源记录批量拉取完成，耗时 "
+                f"{time.monotonic() - t0:.1f}s，命中 {len(source_rows_by_key)}/{len(sample_keys)} 个 key"
+            )
+            _log(f"[info] 分区 {partition_dt}：开始逐条比对…")
+
+            for position, isbn13 in enumerate(sample_keys, start=1):
+                source_rows = source_rows_by_key.get(isbn13, [])
+                checked += 1
+                # Progress is reported per partition; `checked` spans all
+                # partitions and would overshoot len(sample_keys).
+                if position == 1 or position % 20 == 0:
+                    _log(f"[info] 分区 {partition_dt}：已比对 {position}/{len(sample_keys)} 条")
+
+                if not source_rows:
+                    missing_source += 1
+                    mismatch_rows.append({
+                        "key": isbn13,
+                        "dt": partition_dt,
+                        "status": "missing_source",
+                        "source_count": 0,
+                        "mismatches": {},
+                    })
+                    continue
+
+                if len(source_rows) == 1:
+                    source_count_buckets["one"] += 1
+                elif len(source_rows) == 2:
+                    source_count_buckets["two"] += 1
+                else:
+                    source_count_buckets["multi"] += 1
+                normalized_source = [{key: normalize_json_like(value) for key, value in row.items()} for row in source_rows]
+                aggregated = aggregate_group(normalized_source, rules)
+                expected = comparable_record(aggregated, output_fields)
+                # Look the target row up by the aggregated key, falling back
+                # to the sampled key when the aggregate has none.
+                expected_isbn13 = str(expected.get("isbn13") or isbn13)
+                target_sql, target_params = build_target_record_query(target_table, expected_isbn13, partition_dt)
+                target_rows = fetch_records(conn, target_sql, target_params)
+                if not target_rows:
+                    missing_target += 1
+                    mismatch_rows.append({
+                        "key": isbn13,
+                        "expected_key": expected_isbn13,
+                        "dt": partition_dt,
+                        "status": "missing_target",
+                        "source_count": len(source_rows),
+                        "source_records": normalized_source,
+                        "expected_record": expected,
+                        "mismatches": {},
+                    })
+                    continue
+
+                target_row = target_rows[0]
+                actual = comparable_record(target_row, output_fields)
+                mismatches = compare_records(expected, actual, order_insensitive_fields, field_types)
+                if mismatches:
+                    failed += 1
+                    mismatch_rows.append(
+                        {
+                            "key": isbn13,
+                            "expected_key": expected_isbn13,
+                            "dt": partition_dt,
+                            "status": "field_mismatch",
+                            "source_count": len(source_rows),
+                            "mismatches": mismatches,
+                        }
+                    )
+                else:
+                    passed += 1
+
+    if report_path is not None:
+        report_path.parent.mkdir(parents=True, exist_ok=True)
+        with report_path.open("w", encoding="utf-8") as f:
+            for row in mismatch_rows:
+                f.write(json.dumps(localize_report_keys(row), ensure_ascii=False, cls=JsonEncoder) + "\n")
+        # The warning report is always (re)created empty next to the mismatch report.
+        (report_path.parent / "source_field_warning.jsonl").write_text("", encoding="utf-8")
+
+    result = {
+        "status": "ok",
+        "kind": "ebook",
+        "source_table": source_table,
+        "target_table": target_table,
+        "key_field": "isbn13",
+        "dt": dt,
+        "validated_partitions": dt_list,
+        "sample_mode": sample_mode,
+        "sample_size": limit,
+        "dt_check": dt_check,
+        "checked": checked,
+        "passed": passed,
+        "failed": failed,
+        "missing_source": missing_source,
+        "missing_target": missing_target,
+        "source_count_buckets": source_count_buckets,
+        "report_path": str(report_path) if report_path is not None else None,
+        "sample_mismatches": mismatch_rows[:5],
+    }
+    if report_path is not None:
+        write_report_summary(report_path, result, mismatch_rows)
+    print(json.dumps(result, ensure_ascii=False, cls=JsonEncoder))
+    return result
+
+
+# ---- CLI ----
+
+
+def cli() -> None:
+    """Parse command-line options and run ``validate_db``.
+
+    ``--config`` is parsed once up front so the ``unique_ebook`` section of
+    that file can supply the defaults of the remaining options. ``--full``
+    removes the sample limit and forces the ``count-buckets`` mode.
+    """
+    bootstrap = argparse.ArgumentParser(add_help=False)
+    bootstrap.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH)
+    known_args, _ = bootstrap.parse_known_args()
+    cfg = load_config(known_args.config) if known_args.config.exists() else {}
+    ebook_cfg = cfg.get("unique_ebook", {})
+
+    configured_csv = ebook_cfg.get("mapping_csv")
+    default_csv = PROJECT_ROOT / configured_csv if configured_csv else DEFAULT_MAPPING_CSV
+
+    parser = argparse.ArgumentParser(description="Validate meta_ebook unique DB table by ISBN13.")
+    add = parser.add_argument
+    add("--config", type=Path, default=DEFAULT_CONFIG_PATH, help="shared settings JSON path")
+    add("--mapping-csv", type=Path, default=default_csv, help="field mapping CSV")
+    add("--source-table", default=ebook_cfg.get("source_table", DEFAULT_SOURCE_TABLE))
+    add("--target-table", default=ebook_cfg.get("target_table", DEFAULT_TARGET_TABLE))
+    add("--dt", default=ebook_cfg.get("dt"), help="dt partition filter")
+    add("--limit", type=int, default=int(ebook_cfg.get("limit", 600)))
+    add(
+        "--sample-mode",
+        choices=("count-buckets", "mixed", "target-random", "target-first"),
+        default=ebook_cfg.get("sample_mode", "count-buckets"),
+        help="count-buckets: 1/2/N 源行分桶；mixed: 加深抽样；target-random: 目标表稳定排序抽样；target-first: 目标表 LIMIT 抽样（smoke 最快）",
+    )
+    add("--full", action="store_true", help="validate all target rows")
+    add("--skip-dt-check", action="store_true", default=bool(ebook_cfg.get("skip_dt_check")))
+    add(
+        "--skip-source-distinct",
+        action="store_true",
+        default=bool(ebook_cfg.get("skip_source_distinct")),
+        help="dt 统计时跳过源表 COUNT(DISTINCT canonical_isbn13)",
+    )
+    add(
+        "--no-sample-hash",
+        action="store_true",
+        help="关闭 CRC32 哈希预过滤（默认 mod 100 取 2，约 2%% 子集）",
+    )
+    add(
+        "--sample-hash-mod-base",
+        type=int,
+        default=int(ebook_cfg.get("sample_hash_mod_base", 100)),
+    )
+    add(
+        "--sample-hash-mod-max",
+        type=int,
+        default=int(ebook_cfg.get("sample_hash_mod_max", 2)),
+    )
+    add("--report", type=Path, default=ebook_cfg.get("report_path"), help="JSONL report path")
+    args = parser.parse_args()
+
+    # --no-sample-hash switches the hash filter off by passing no bounds.
+    if args.no_sample_hash:
+        hash_mod_base = hash_mod_max = None
+    else:
+        hash_mod_base = args.sample_hash_mod_base
+        hash_mod_max = args.sample_hash_mod_max
+
+    effective_mode = "count-buckets" if args.full else args.sample_mode
+    if args.report:
+        report_path = Path(args.report)
+    else:
+        report_path = default_report_path(args.dt, effective_mode, args.full)
+
+    validate_db(
+        config_path=args.config,
+        source_table=args.source_table,
+        target_table=args.target_table,
+        dt=args.dt,
+        limit=None if args.full else args.limit,
+        sample_mode=effective_mode,
+        report_path=report_path,
+        mapping_csv=args.mapping_csv,
+        skip_dt_check=args.skip_dt_check,
+        skip_source_distinct=args.skip_source_distinct,
+        hash_mod_base=hash_mod_base,
+        hash_mod_max=hash_mod_max,
+    )
+
+
+from dingo.config.input_args import EvaluatorRuleArgs
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail, QualityLabel
+from dingo.model.model import Model
+from dingo.model.rule.base import BaseRule
+from dingo.model.rule.scibase.report_utils import bool_param, int_param, write_temp_settings
+
+
+@Model.rule_register(
+    "QUALITY_BAD_EFFECTIVENESS",
+    ["sci_base_qa_test", "meta_ebook_unique"],
+)
+class RuleSciBaseMetaEbookUniqueReport(BaseRule):  # Rule that runs validate_db() from rule parameters and reports the outcome as an EvalDetail.
+    _metric_info = {
+        "category": "Rule-Based Metadata Quality Metrics",
+        "quality_dimension": "EFFECTIVENESS",
+        "metric_name": "RuleSciBaseMetaEbookUniqueReport",
+        "description": "Run SciBase ebook ISBN unique DB validation and write reports.",
+        "paper_title": "",
+        "paper_url": "",
+        "paper_authors": "",
+        "evaluation_results": "",
+    }
+
+    _required_fields = [RequiredField.METADATA]
+    dynamic_config = EvaluatorRuleArgs(parameters={})  # parameters are read by eval(); empty by default
+
+    @classmethod
+    def eval(cls, input_data: Data) -> EvalDetail:  # Validate the tables named in the parameters; the input record itself is not inspected.
+        del input_data  # unused: the check is driven entirely by dynamic_config.parameters
+        params = cls.dynamic_config.parameters or {}
+        full = bool_param(params, "full", False)
+        sample_mode = str(params.get("sample_mode") or "count-buckets")
+        dt = params.get("dt")
+        report_path = Path(params["report_path"]) if params.get("report_path") else None  # 1st choice: explicit report_path
+        if report_path is None and params.get("output_dir"):
+            report_path = Path(str(params["output_dir"])) / "source_field_mismatch.jsonl"  # 2nd choice: fixed file name under output_dir
+        if report_path is None:
+            report_path = default_report_path(dt, "count-buckets" if full else sample_mode, full)  # last resort: derived default path
+
+        config_path = write_temp_settings(params)  # NOTE(review): presumably materializes params as a settings file - confirm in report_utils
+        result = validate_db(
+            config_path=config_path,
+            source_table=str(params.get("source_table") or DEFAULT_SOURCE_TABLE),
+            target_table=str(params.get("target_table") or DEFAULT_TARGET_TABLE),
+            dt=dt,
+            limit=None if full else int_param(params, "limit", 600),  # full mode removes the row limit
+            sample_mode="count-buckets" if full else sample_mode,  # full mode always uses count-buckets
+            report_path=report_path,
+            mapping_csv=Path(str(params.get("mapping_csv") or DEFAULT_MAPPING_CSV)),
+            skip_dt_check=bool_param(params, "skip_dt_check", False),
+            skip_source_distinct=bool_param(params, "skip_source_distinct", False),
+            hash_mod_base=None if bool_param(params, "no_sample_hash", False) else int_param(params, "sample_hash_mod_base", 100),
+            hash_mod_max=None if bool_param(params, "no_sample_hash", False) else int_param(params, "sample_hash_mod_max", 2),
+        )
+        bad = any(  # any non-zero failure counter marks the run as bad
+            int(result.get(key) or 0) > 0
+            for key in ("failed", "missing_source", "missing_target")
+        )
+        count_mismatches = ((result.get("dt_check") or {}).get("count_mismatches") or [])
+        bad = bad or bool(count_mismatches)  # dt-check count mismatches also mark the run as bad
+        reason = [str(report_path.parent), f"checked={result.get('checked')}", f"failed={result.get('failed')}"]
+        if bad:
+            return EvalDetail(  # bad run: status=True plus a label built from metric_type and the class name
+                metric=cls.__name__,
+                status=True,
+                label=[f"{cls.metric_type}.{cls.__name__}"],
+                reason=reason,
+            )
+        return EvalDetail(metric=cls.__name__, label=[QualityLabel.QUALITY_GOOD], reason=reason)
+
+
+if __name__ == "__main__":  # run the command-line entry point only when executed as a script
+    cli()
diff --git a/dingo/model/rule/scibase/meta_paper_data.py b/dingo/model/rule/scibase/meta_paper_data.py
new file mode 100644
index 00000000..ff20066b
--- /dev/null
+++ b/dingo/model/rule/scibase/meta_paper_data.py
@@ -0,0 +1,3408 @@
+#!/usr/bin/env python3
+"""Single-file verifier for S3 arxiv data loaded into the paper source table.
+
+Generated from /Users/guhuaiyu/PycharmProjects/osi_test without modifying that source project.
+This file validates S3 arxiv metadata against the paper source table.
+Runtime dependencies: pymysql, duckdb, pyarrow, boto3.
+"""
+from __future__ import annotations
+
+
+# ---- osi_verify/common.py ----
+
+
+import json
+from datetime import datetime
+from typing import Any, Dict
+
+
+def sql_literal(value: str) -> str:  # Escape *value* for use inside a single-quoted SQL string literal.
+    return value.replace("'", "''")  # doubles quotes only; NOTE(review): backslashes are left as-is - confirm no target dialect treats "\" as an escape
+
+
+def json_loads_maybe(v: Any) -> Any:
+    """Parse str/bytes that look like a JSON object or array; otherwise return the value (bytes decoded to str)."""
+    if isinstance(v, (bytes, bytearray)):
+        v = v.decode("utf-8", errors="replace")
+    if not isinstance(v, str):
+        return v
+    text = v.strip()
+    if not text.startswith(("{", "[")):
+        return v
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        return v
+
+
+def normalize_scalar(v: Any) -> Any:
+    """Normalize a scalar for comparison: blank text -> None, datetime -> 'YYYY-MM-DD HH:MM:SS' text."""
+    if v is None:
+        return None
+    if isinstance(v, datetime):
+        return v.isoformat(sep=" ", timespec="seconds")
+    # bool is a subclass of int, so True/False also pass through this branch untouched.
+    if isinstance(v, (int, float)):
+        return v
+    text = str(v).strip()
+    return text or None
+
+
+def get_first(row: Dict[str, Any], *keys: str) -> Any:
+    """Return the first non-None ``row[key]`` among *keys* (in argument order), or None if there is none."""
+    # Keys missing from the row are skipped, exactly like keys whose value is None.
+    present = (row[name] for name in keys if name in row)
+    return next((value for value in present if value is not None), None)
+
+
+def as_bool_flag(v: Any) -> bool:
+    """Interpret a loosely typed flag: numbers that truncate to 1 and a fixed set of strings are True."""
+    if v is None or isinstance(v, bool):
+        return bool(v)
+    if isinstance(v, (int, float)):
+        return int(v) == 1
+    # Matching is case-sensitive and limited to exactly these spellings.
+    return str(v).strip() in ("1", "true", "True", "yes", "Y")
+
+
+def oa_flag_str(flag: bool) -> str:  # Render a flag as the lowercase text "true" / "false".
+    return "true" if flag else "false"
+
+
+# ---- osi_verify/retry.py ----
+
+
+import sys
+import time
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, Optional, TypeVar
+
+T = TypeVar("T")  # return type of the callable wrapped by retry_call
+
+try:
+    import pymysql
+except ImportError:
+    pymysql = None  # type: ignore  # optional dependency: None when not installed
+
+try:
+    import duckdb
+except ImportError:
+    duckdb = None  # type: ignore  # optional dependency: None when not installed
+
+try:
+    from botocore.exceptions import BotoCoreError, ClientError, ConnectionError as BotoConnectionError
+except ImportError:
+    BotoCoreError = ClientError = BotoConnectionError = ()  # type: ignore  # empty tuples make isinstance() checks against them always False
+
+
+@dataclass(frozen=True)
+class RetryConfig:  # Immutable retry/backoff settings; numeric fields are coerced and clamped after construction.
+    enabled: bool = True  # when False, attempts() reports a single attempt
+    max_attempts: int = 3  # clamped to >= 1
+    initial_delay_sec: float = 1.0  # clamped to >= 0
+    backoff_factor: float = 2.0  # clamped to >= 1
+    max_delay_sec: float = 30.0  # clamped to >= the clamped initial_delay_sec
+
+    def __post_init__(self) -> None:  # coerce every numeric field and enforce the minimums listed above
+        max_attempts = max(1, int(self.max_attempts))
+        initial_delay = max(0.0, float(self.initial_delay_sec))
+        backoff = max(1.0, float(self.backoff_factor))
+        max_delay = max(initial_delay, max(0.0, float(self.max_delay_sec)))
+        object.__setattr__(self, "max_attempts", max_attempts)  # frozen dataclass: plain attribute assignment would raise
+        object.__setattr__(self, "initial_delay_sec", initial_delay)
+        object.__setattr__(self, "backoff_factor", backoff)
+        object.__setattr__(self, "max_delay_sec", max_delay)
+
+    @classmethod
+    def disabled(cls) -> "RetryConfig":  # configuration that never retries
+        return cls(enabled=False, max_attempts=1)
+
+    def attempts(self) -> int:  # total number of calls to make: 1 when disabled, else max_attempts (at least 1)
+        return max(1, self.max_attempts) if self.enabled else 1
+
+
+def _positive_int(value: Any, default: int) -> int:
+    """Coerce *value* to an int >= 1; return *default* when it cannot be converted."""
+    try:
+        return max(1, int(value))
+    except (TypeError, ValueError, OverflowError):  # OverflowError: int(float("inf"))
+        return default
+
+
+def _non_negative_float(value: Any, default: float) -> float:
+    """Coerce *value* to a float >= 0; return *default* when it cannot be converted."""
+    try:
+        return max(0.0, float(value))
+    except (TypeError, ValueError, OverflowError):  # OverflowError: float() of an int too large for a double
+        return default
+
+
+def _min_float(value: Any, default: float, minimum: float) -> float:
+    """Coerce *value* to a float no smaller than *minimum*; return *default* when it cannot be converted."""
+    try:
+        return max(minimum, float(value))
+    except (TypeError, ValueError, OverflowError):  # OverflowError: float() of an int too large for a double
+        return default
+
+
+def load_retry_config(settings: Dict[str, Any]) -> RetryConfig:
+    """Build a RetryConfig from ``settings["retry"]``; defaults apply when that entry is not a dict."""
+    if not isinstance(raw := settings.get("retry"), dict):
+        return RetryConfig()
+    return RetryConfig(
+        enabled=bool(raw.get("enabled", True)),
+        max_attempts=_positive_int(raw.get("max_attempts", 3), 3),
+        initial_delay_sec=_non_negative_float(raw.get("initial_delay_sec", 1.0), 1.0),
+        backoff_factor=_min_float(raw.get("backoff_factor", 2.0), 2.0, 1.0),
+        max_delay_sec=_non_negative_float(raw.get("max_delay_sec", 30.0), 30.0),
+    )
+
+
+def _exc_message(exc: BaseException) -> str:  # Lower-cased str(exc), used for case-insensitive token matching.
+    return str(exc).lower()
+
+
+def is_mysql_retryable(exc: BaseException) -> bool:  # True when *exc* looks like a transient MySQL/connection failure.
+    if pymysql is None:  # without pymysql nothing is classified as retryable, including plain OSError/TimeoutError
+        return False
+    if isinstance(exc, pymysql.err.OperationalError):
+        code = exc.args[0] if exc.args else None
+        if code in (2003, 2006, 2013):  # NOTE(review): presumably the client codes for cannot-connect / server-gone / lost-connection - verify
+            return True
+    if isinstance(exc, pymysql.err.ProgrammingError):
+        msg = _exc_message(exc)
+        return any(  # ProgrammingError is retried only when its message contains one of these tokens
+            token in msg
+            for token in (
+                "timeout",
+                "timed out",
+                "connection",
+                "lost connection",
+                "brpc",
+                "host is down",
+                "not connected",
+                "could not determine master",
+                "master from helpers",
+                "no alive backend",
+                "frontend",
+            )
+        )
+    if isinstance(exc, (TimeoutError, ConnectionError, OSError)):  # generic network/OS level errors
+        return True
+    return False
+
+
+def is_s3_retryable(exc: BaseException) -> bool:  # True when *exc* looks like a transient S3 / DuckDB I/O failure.
+    if duckdb is not None and isinstance(exc, duckdb.IOException):
+        msg = _exc_message(exc)
+        return any(  # a DuckDB IOException is retried only when its message contains one of these tokens
+            token in msg
+            for token in (
+                "connection",
+                "failed to read",
+                "timeout",
+                "network",
+                "io error",
+            )
+        )
+    if isinstance(exc, (BotoConnectionError, TimeoutError, ConnectionError, OSError)):  # network/OS level errors
+        return True
+    if isinstance(exc, ClientError):
+        code = str(exc.response.get("Error", {}).get("Code", ""))
+        status = int(exc.response.get("ResponseMetadata", {}).get("HTTPStatusCode", 0) or 0)
+        if status in (408, 429, 500, 502, 503, 504):  # retry on these HTTP status codes
+            return True
+        return code in {  # otherwise retry only for these service error codes
+            "RequestTimeout",
+            "RequestTimeoutException",
+            "Throttling",
+            "ThrottlingException",
+            "SlowDown",
+            "InternalError",
+            "ServiceUnavailable",
+        }
+    if isinstance(exc, BotoCoreError):  # any remaining botocore error is treated as retryable
+        return True
+    return False
+
+
+def retry_call(  # Call fn(), retrying retryable failures with capped exponential backoff; the last error is re-raised.
+    fn: Callable[[], T],
+    config: Optional[RetryConfig],  # None (or any falsy value) falls back to RetryConfig()
+    *,
+    label: str,  # text used in the retry message printed to stderr
+    retryable: Callable[[BaseException], bool],  # predicate deciding whether a failure may be retried
+) -> T:
+    cfg = config or RetryConfig()
+    attempts = cfg.attempts()
+    delay = cfg.initial_delay_sec
+    last_exc: Optional[BaseException] = None
+
+    for attempt in range(1, attempts + 1):
+        try:
+            return fn()
+        except Exception as exc:
+            last_exc = exc
+            if attempt >= attempts or not cfg.enabled or not retryable(exc):  # give up: last attempt, retries disabled, or not retryable
+                raise
+            print(
+                f"[retry] {label} 失败 ({type(exc).__name__}: {exc})，"
+                f"{delay:.1f}s 后重试 ({attempt}/{attempts})",
+                file=sys.stderr,
+            )
+            time.sleep(delay)
+            delay = min(delay * cfg.backoff_factor, cfg.max_delay_sec)  # grow the delay, capped at max_delay_sec
+
+    assert last_exc is not None  # only reached if the loop body never ran or never returned/raised
+    raise last_exc
+
+
+# ---- osi_verify/config.py ----
+
+
+import json
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+PROJECT_ROOT = Path(__file__).resolve().parent  # directory containing this module
+ASSETS_DIR = PROJECT_ROOT / "assets"  # assets directory next to this module
+REPORT_ROOT = Path("report")  # relative path, i.e. resolved against the current working directory
+DEFAULT_ICEBERG_CATALOG = "lakehouse_iceberg"
+DEFAULT_SETTINGS_JSON = Path("sci_base_qa_test_config.json")
+DT_RE = re.compile(r"dt=([^/]+)")  # captures the value of a "dt=<value>" path segment (up to the next "/")
+
+
+@dataclass(frozen=True)
+class TargetConfig:  # Immutable description of one verification target (table, mapping CSV and S3 source settings).
+    name: str
+    kind: str
+    description: str
+    mapping_csv: Path  # CSV file holding the field mapping
+    database: str
+    table: str
+    catalog: Optional[str]
+    origin_osi: str
+    source_id_field: str
+    transform: str
+    mapping_target_column: str  # header of the mapping CSV column naming the target field
+    mapping_source_column: str  # header of the mapping CSV column naming the source field
+    s3_settings: Dict[str, Any]
+    s3_subpath: Optional[str]
+    s3_path: Optional[str]
+    s3_format: Optional[str]
+
+
+def resolve_project_path(value: Optional[Union[str, Path]]) -> Optional[Path]:
+    """Expand ``~`` and anchor relative paths at PROJECT_ROOT; None passes through unchanged."""
+    if value is None:
+        return None
+    candidate = Path(value).expanduser()
+    # Joining onto an absolute right-hand operand yields that operand, so absolute paths are returned as-is.
+    return PROJECT_ROOT / candidate
+
+
+def load_settings(path: Path) -> Dict[str, Any]:
+    """Return the JSON document stored at *path*, or an empty dict when the path does not exist."""
+    # Only a missing path yields {}; unreadable or malformed files still raise.
+    return json.loads(path.read_text(encoding="utf-8")) if path.exists() else {}
+
+
+def load_arxiv_target_config(settings: Dict[str, Any]) -> TargetConfig:
+    """Build the arxiv TargetConfig from *settings*; ``osi_arxiv`` entries win over ``table`` / ``mapping`` ones."""
+    arxiv_settings = settings.get("osi_arxiv", {}) if isinstance(settings.get("osi_arxiv"), dict) else {}
+    table_settings = settings.get("table", {}) if isinstance(settings.get("table"), dict) else {}
+    mapping_settings = settings.get("mapping", {}) if isinstance(settings.get("mapping"), dict) else {}
+    # Work on a copy of the nested s3 dict so the overrides below never leak into the caller's settings;
+    s3_settings = dict(arxiv_settings["s3"]) if isinstance(arxiv_settings.get("s3"), dict) else {}
+    for key in ("config_file", "path", "subpath", "format"):
+        if key in arxiv_settings and arxiv_settings[key] not in (None, ""):
+            s3_settings[key] = arxiv_settings[key]
+    mapping_csv = resolve_project_path(
+        arxiv_settings.get("mapping_csv")
+        or table_settings.get("mapping_csv")
+        or mapping_settings.get("csv")
+        or str(ASSETS_DIR / "osi_arxiv_mapping.csv")
+    )
+    return TargetConfig(
+        name="osi_axiv",
+        kind="osi_axiv",
+        description="S3 arxiv 数据到论文源数据表校验",
+        mapping_csv=mapping_csv,
+        database=str(arxiv_settings.get("database") or table_settings.get("database") or "dws"),
+        table=str(arxiv_settings.get("target_table") or arxiv_settings.get("table") or table_settings.get("table") or "dws_meta_paper_data_acc_d"),
+        catalog=str(arxiv_settings.get("catalog") or table_settings.get("catalog") or DEFAULT_ICEBERG_CATALOG),
+        origin_osi="arxiv",
+        source_id_field="doc_id",
+        transform="osi_arxiv",
+        mapping_target_column=str(arxiv_settings.get("mapping_target_column") or mapping_settings.get("target_column") or "预期字段"),
+        mapping_source_column=str(arxiv_settings.get("mapping_source_column") or mapping_settings.get("source_column") or "arxiv对应字段"),
+        s3_settings=dict(s3_settings),
+        s3_subpath=s3_settings.get("subpath"),
+        s3_path=s3_settings.get("path"),
+        s3_format=s3_settings.get("format"),
+    )
+
+
+
+def _merge_present(base: Dict[str, Any], overrides: Dict[str, Any]) -> Dict[str, Any]:
+    """Return a copy of *base* updated with the *overrides* entries that are neither None nor ''."""
+    merged = dict(base)
+    # Filtering first keeps blank overrides from erasing values already present in base.
+    merged.update({k: v for k, v in overrides.items() if v is not None and v != ""})
+    return merged
+
+
+def _strip_endpoint_scheme(value: str) -> str:  # Drop a leading "https://" / "http://" and any trailing "/" from an endpoint.
+    return value.removeprefix("https://").removeprefix("http://").rstrip("/")
+
+
+def _parse_bool(value: Any) -> bool:
+    """Return bools unchanged; otherwise True only for the text 1/true/yes/y/on (case-insensitive)."""
+    truthy = {"1", "true", "yes", "y", "on"}
+    return value if isinstance(value, bool) else str(value).strip().lower() in truthy
+
+
+def parse_mysql_config(path: Path) -> Dict[str, Any]:  # Parse a "key：value" text file into user/password/host/port/catalog settings.
+    cfg: Dict[str, Any] = {}
+    for line in path.read_text(encoding="utf-8").splitlines():
+        line = line.strip()
+        if not line or "：" not in line:  # only lines containing the full-width colon separator are considered
+            continue
+        key, val = line.split("：", 1)
+        key, val = key.strip(), val.strip()
+        if key in ("账号", "用户名", "user"):
+            cfg["user"] = val
+        elif key in ("密码", "password"):
+            cfg["password"] = val
+        elif key in ("地址", "host"):
+            if ":" in val:  # "host:port" form: split on the last ASCII colon
+                host, port = val.rsplit(":", 1)
+                cfg["host"] = host
+                cfg["port"] = int(port)
+            else:
+                cfg["host"] = val
+        elif key in ("catalog", "iceberg_catalog", "catalog名"):
+            cfg["catalog"] = val
+    if "port" not in cfg:
+        cfg["port"] = 3306  # default port when the file gives none
+    missing = [k for k in ("user", "password", "host") if k not in cfg]
+    if missing:
+        raise ValueError(f"MySQL 配置缺少字段: {missing}（文件: {path}）")  # raised when user/password/host is absent
+    return cfg
+
+
+def load_mysql_config(path: Optional[Path], inline: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:  # File settings overlaid with non-blank inline settings.
+    cfg: Dict[str, Any] = {}
+    if path and path.exists():  # a missing file is not an error; inline values may supply everything
+        cfg.update(parse_mysql_config(path))
+    if inline:
+        cfg = _merge_present(cfg, inline)  # None / "" inline values do not override file values
+    host = cfg.get("host")
+    if isinstance(host, str) and ":" in host:  # after merging, host must not carry a port
+        raise ValueError("MySQL host 请只配置主机名/IP，端口请通过 port 单独配置")
+    if "port" not in cfg:
+        cfg["port"] = 3306
+    else:
+        cfg["port"] = int(cfg["port"])  # inline ports may arrive as text
+    missing = [k for k in ("user", "password", "host") if k not in cfg]
+    if missing:
+        source = path or "inline settings"
+        raise ValueError(f"MySQL 配置缺少字段: {missing}（来源: {source}）")  # raised when user/password/host is absent
+    return cfg
+
+
+def parse_s3_config(path: Path) -> Dict[str, Any]:  # Parse a key/value text file into S3 access settings.
+    cfg: Dict[str, Any] = {}
+    for line in path.read_text(encoding="utf-8").splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        if line.startswith("s3://"):  # a bare s3:// line sets the default path (always stored with a trailing "/")
+            cfg["default_path"] = line if line.endswith("/") else line + "/"
+            continue
+        sep = "：" if "：" in line else (":" if ":" in line else None)  # the full-width colon takes precedence over the ASCII colon
+        if not sep:
+            continue
+        key, val = line.split(sep, 1)
+        key, val = key.strip().upper(), val.strip()  # keys are matched case-insensitively
+        if key in ("AK", "ACCESS_KEY", "AWS_ACCESS_KEY_ID"):
+            cfg["access_key"] = val
+        elif key in ("SK", "SECRET_KEY", "AWS_SECRET_ACCESS_KEY"):
+            cfg["secret_key"] = val
+        elif key in ("ENDPOINT", "S3_ENDPOINT"):
+            cfg["endpoint"] = _strip_endpoint_scheme(val)
+        elif key in ("USE_SSL", "S3_USE_SSL"):
+            cfg["use_ssl"] = _parse_bool(val)
+        elif key in ("VERIFY_SSL", "S3_VERIFY_SSL"):
+            cfg["verify_ssl"] = _parse_bool(val)
+    missing = [k for k in ("access_key", "secret_key", "endpoint") if k not in cfg]
+    if missing:
+        raise ValueError(f"S3 配置缺少字段: {missing}（文件: {path}）")  # raised when access_key/secret_key/endpoint is absent
+    if "default_path" not in cfg:
+        cfg["default_path"] = "s3://lakehouse-scibase/"  # fallback when the file has no s3:// line
+    return cfg
+
+
+def load_s3_config(path: Optional[Path], inline: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:  # File settings overlaid with non-blank inline settings.
+    cfg: Dict[str, Any] = {}
+    if path and path.exists():  # a missing file is not an error; inline values may supply everything
+        cfg.update(parse_s3_config(path))
+    if inline:
+        aliases = {  # short inline key names mapped to the canonical ones
+            "ak": "access_key",
+            "sk": "secret_key",
+            "bucket_path": "default_path",
+            "path": "default_path",
+        }
+        normalized = {aliases.get(k, k): v for k, v in inline.items()}  # if several aliases collide, the later inline key wins
+        if "endpoint" in normalized and normalized["endpoint"]:
+            normalized["endpoint"] = _strip_endpoint_scheme(str(normalized["endpoint"]))
+        if "use_ssl" in normalized:
+            normalized["use_ssl"] = _parse_bool(normalized["use_ssl"])
+        if "verify_ssl" in normalized:
+            normalized["verify_ssl"] = _parse_bool(normalized["verify_ssl"])
+        cfg = _merge_present(cfg, normalized)  # None / "" inline values do not override file values
+    missing = [k for k in ("access_key", "secret_key", "endpoint") if k not in cfg]
+    if missing:
+        source = path or "inline settings"
+        raise ValueError(f"S3 配置缺少字段: {missing}（来源: {source}）")  # raised when access_key/secret_key/endpoint is absent
+    if "default_path" not in cfg:
+        cfg["default_path"] = "s3://lakehouse-scibase/"  # fallback when neither source gives a path
+    return cfg
+
+
+def resolve_s3_path(base: str, subpath: Optional[str]) -> str:
+    """Join *base* and the optional *subpath* into a prefix that always ends with '/'."""
+    root = base.rstrip("/") + "/"
+    # A missing or empty subpath leaves just the normalized base prefix.
+    return root + subpath.strip("/") + "/" if subpath else root
+
+
+def apply_s3_dt_to_path(s3_path: str, s3_dt: Optional[str]) -> str:
+    """Substitute *s3_dt* for every 'YYYY-MM-DD' placeholder, else for the first ``dt=...`` segment of *s3_path*."""
+    if not s3_dt:
+        return s3_path
+    if "YYYY-MM-DD" in s3_path:
+        return s3_path.replace("YYYY-MM-DD", s3_dt)
+    # Callable replacement: s3_dt is inserted literally instead of being parsed as a regex template (\1, \g<...>).
+    return DT_RE.sub(lambda _match: f"dt={s3_dt}", s3_path, count=1)
+
+
+def extract_partition_dt(s3_subpath: Optional[str], override: Optional[str] = None) -> Optional[str]:
+    """Return *override* when given, else the value of the first ``dt=...`` segment of *s3_subpath*."""
+    if override:
+        return override
+    # An absent or empty subpath has nothing to search.
+    found = DT_RE.search(s3_subpath) if s3_subpath else None
+    return found.group(1) if found else None
+
+
+# ---- osi_verify/mapping.py ----
+
+
+import csv
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Sequence
+
+
+CONTAINER_COMPARE_FIELDS = {"locations", "classifications"}  # container fields compared as a whole
+CONTAINER_CHILD_PREFIXES = tuple(f"{field}." for field in CONTAINER_COMPARE_FIELDS)  # "<container>." prefixes of their child rows
+NON_COMPARE_MARKERS = ("后续处理",)  # source notes containing one of these markers exclude the row from comparison
+DEFAULT_EMPTY_SOURCE_MARKERS = {"无", "/"}  # source notes meaning "no source field"
+
+
+@dataclass(frozen=True)
+class MappingRule:  # One row of the field-mapping CSV.
+    target_field: str  # target field name as written in the CSV
+    source_note: str  # text of the source column for this row
+    compare_field: str  # canonical field name used when comparing
+    value_type: str = ""  # declared value type from the CSV, if any
+    compare: bool = True  # False when the row is excluded from comparison
+
+
+def canonical_field(field: str) -> str:  # Canonical form of a field name: surrounding whitespace removed.
+    return field.strip()
+
+
+def should_compare(field: str, source_note: str) -> bool:
+    """Decide whether a mapping row takes part in value comparison."""
+    # Nameless rows and children of container fields ("locations.x", ...) are never compared on their own.
+    if not field or field.startswith(CONTAINER_CHILD_PREFIXES):
+        return False
+    # Rows whose source note carries a non-compare marker are skipped.
+    if any(marker in source_note for marker in NON_COMPARE_MARKERS):
+        return False
+    # Without a source note only the container fields themselves are compared.
+    return bool(source_note) or field in CONTAINER_COMPARE_FIELDS
+
+
+def load_mapping_rules(  # Read the mapping CSV into MappingRule objects, one per row with a non-blank target field.
+    path: Path,
+    *,
+    target_column: str = "预期字段",
+    source_column: str = "arxiv对应字段",
+    type_column: str = "字段值数据类型",
+) -> List[MappingRule]:
+    rules: List[MappingRule] = []
+    with path.open(encoding="utf-8-sig", newline="") as f:  # utf-8-sig: a leading BOM is dropped
+        reader = csv.DictReader(f)
+        if not reader.fieldnames or target_column not in reader.fieldnames:  # only the target column is mandatory
+            available = ", ".join(reader.fieldnames or [])
+            raise ValueError(
+                f"映射文件 {path} 缺少目标字段列 {target_column!r}"
+                f"（可用列: {available}）"
+            )
+        for row in reader:
+            target = (row.get(target_column) or "").strip()
+            note = (row.get(source_column) or "").strip()  # a missing source/type column simply yields ""
+            value_type = (row.get(type_column) or "").strip()
+            if not target:
+                continue
+            compare_field = canonical_field(target)
+            rules.append(
+                MappingRule(
+                    target_field=target,
+                    source_note=note,
+                    compare_field=compare_field,
+                    value_type=value_type,
+                    compare=should_compare(target, note),
+                )
+            )
+    return rules
+
+
+def compare_fields_from_rules(rules: Sequence[MappingRule]) -> List[str]:
+    """List the distinct compare_field names of comparable rules, in first-seen order."""
+    # A dict keeps insertion order, so it serves as an ordered set here:
+    # setdefault() leaves the position of an already-seen field untouched.
+    ordered: dict = {}
+    for rule in rules:
+        if not rule.compare:
+            continue
+        ordered.setdefault(rule.compare_field, None)
+
+    return list(ordered)
+
+
+def default_empty_field_types_from_rules(rules: Sequence[MappingRule]) -> dict[str, str]:
+    """Fields whose mapping source is an "empty" marker: map each to its declared type for default-empty checks."""
+    fields: dict[str, str] = {}
+    for rule in rules:
+        # Only comparable rules whose source note is exactly one of the markers qualify.
+        if not (rule.compare and rule.source_note in DEFAULT_EMPTY_SOURCE_MARKERS):
+            continue
+        # The first declaration wins when a field appears more than once.
+        fields.setdefault(rule.compare_field, rule.value_type)
+    return fields
+
+
+# ---- osi_verify/transform.py ----
+
+
+import json
+import re
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple
+
+
+ARXIV_ABS_RE = re.compile(  # optional scheme plus optional arxiv.org / export.arxiv.org "abs/" prefix at the start of a string
+    r"^(?:https?://)?(?:arxiv\.org/abs/|export\.arxiv\.org/abs/)?",
+    re.I,
+)
+ARXIV_DOI_PREFIX = "10.48550/arxiv."  # prefix put in front of doc_id when a row has no DOI of its own
+
+# Validation field name -> actual lakehouse table column name (when the two differ)
+DB_COLUMN_ALIASES: Dict[str, str] = {
+    "s2FieldsOfStudy": "s2fieldsofstudy",
+}
+
+
+def strip_arxiv_id(paper_id: Optional[str]) -> Optional[str]:
+    """Drop an optional arxiv abs-URL prefix and surrounding slashes; empty results become None."""
+    if not paper_id:
+        return None
+    return ARXIV_ABS_RE.sub("", str(paper_id).strip()).strip("/") or None
+
+
+def parse_datetime_value(updated: Any) -> Optional[datetime]:  # Best-effort conversion of a date-like value to a datetime.
+    if updated is None:
+        return None
+    if isinstance(updated, datetime):
+        return updated
+    if hasattr(updated, "year") and hasattr(updated, "month") and hasattr(updated, "day"):  # date-like objects become midnight datetimes
+        return datetime(updated.year, updated.month, updated.day)
+    s = str(updated).strip()
+    if not s:
+        return None
+    if re.match(r"^\d{4}-\d{2}-\d{2}", s):  # text starting with YYYY-MM-DD: only the date part is used, any time part is dropped
+        try:
+            return datetime.strptime(s[:10], "%Y-%m-%d")
+        except ValueError:
+            pass
+    for fmt in (
+        "%Y-%m-%d %H:%M:%S",
+        "%Y-%m-%dT%H:%M:%S",
+        "%Y-%m-%d",
+        "%a, %d %b %Y %H:%M:%S %Z",
+    ):
+        try:
+            sample = s[:30] if "GMT" in s else s[:19]  # keep room for the "GMT" suffix, otherwise cut to 19 characters
+            return datetime.strptime(sample, fmt)
+        except ValueError:
+            continue
+    m = re.match(r"(\d{4})", s)  # last resort: leading four digits are taken as a year (January 1st)
+    return datetime(int(m.group(1)), 1, 1) if m else None
+
+
+def parse_date_iso(updated: Any) -> Optional[str]:
+    """Normalize GMT-style and other date strings to YYYY-MM-DD (consistent with the stored value)."""
+    parsed = parse_datetime_value(updated)
+    return None if parsed is None else parsed.strftime("%Y-%m-%d")
+
+
+def parse_year(updated: Any) -> Optional[int]:  # Year of the parsed date, or None when nothing could be parsed.
+    parsed = parse_datetime_value(updated)
+    return None if parsed is None else parsed.year
+
+
+def _normalize_author_name(name: str) -> str:
+    """Remove a leading "and " from an author name (as left behind when ', and Foo' is split on commas)."""
+    cleaned = name.strip()
+    # The prefix test is case-insensitive; the four characters of "and " are then cut off.
+    has_prefix = cleaned.lower().startswith("and ")
+    return cleaned[4:].strip() if has_prefix else cleaned
+
+
+def parse_authors(author: Any) -> List[str]:
+    """Return the cleaned, non-empty author names from a list or from a delimited string."""
+    if author is None:
+        return []
+    if isinstance(author, list):
+        # List input: every element is stringified and treated as one name.
+        pieces = [str(item) for item in author]
+    else:
+        # Text input: split on commas, semicolons, or the word "and" between names,
+        # discarding blank fragments.
+        text = str(author).strip()
+        pieces = [
+            part for part in re.split(r"[,;]\s*|\s+and\s+", text, flags=re.I) if part.strip()
+        ]
+    # Names that are empty after normalization are dropped.
+    names = (_normalize_author_name(piece) for piece in pieces)
+    return [name for name in names if name]
+
+
+# Allowed product values for license_url (as of 2025.09.01)
+LICENSE_ALLOWED: frozenset = frozenset(
+    {
+        "cc-by",
+        "cc-by-nc",
+        "cc-by-sa",
+        "cc-by-nd",
+        "cc-by-nc-sa",
+        "cc-by-nc-nd",
+        "other-oa",
+        "cc0",
+        "",
+        "public-domain",
+        "publisher-specific-oa",
+        "nonexclusive-distrib",
+    }
+)
+
+# S3 license_url / legacy alias -> standard allowed value
+DEFAULT_LICENSE_MAP: Dict[str, str] = {
+    "http://arxiv.org/licenses/nonexclusive-distrib/1.0/": "nonexclusive-distrib",
+    "https://arxiv.org/licenses/nonexclusive-distrib/1.0/": "nonexclusive-distrib",
+    "arxiv-nonexclusive-distrib-1.0": "nonexclusive-distrib",
+    "http://creativecommons.org/licenses/by/4.0/": "cc-by",
+    "https://creativecommons.org/licenses/by/4.0/": "cc-by",
+    "http://creativecommons.org/licenses/by/3.0/": "cc-by",
+    "https://creativecommons.org/licenses/by/3.0/": "cc-by",
+    "CC-BY-4.0": "cc-by",
+    "CC-BY-3.0": "cc-by",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "cc-by-nc",
+    "https://creativecommons.org/licenses/by-nc/4.0/": "cc-by-nc",
+    "http://creativecommons.org/licenses/by-sa/4.0/": "cc-by-sa",
+    "https://creativecommons.org/licenses/by-sa/4.0/": "cc-by-sa",
+    "http://creativecommons.org/licenses/by-nd/4.0/": "cc-by-nd",
+    "https://creativecommons.org/licenses/by-nd/4.0/": "cc-by-nd",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "cc-by-nc-sa",
+    "https://creativecommons.org/licenses/by-nc-sa/4.0/": "cc-by-nc-sa",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "cc-by-nc-nd",
+    "https://creativecommons.org/licenses/by-nc-nd/4.0/": "cc-by-nc-nd",
+    "http://creativecommons.org/publicdomain/zero/1.0/": "cc0",
+    "https://creativecommons.org/publicdomain/zero/1.0/": "cc0",
+    "CC0-1.0": "cc0",
+}
+
+_CC_LICENSE_URL_RULES: List[Tuple[re.Pattern, str]] = [  # (pattern, canonical value) pairs; consumers take the first pattern that matches
+    (re.compile(r"creativecommons\.org/licenses/by-nc-sa", re.I), "cc-by-nc-sa"),
+    (re.compile(r"creativecommons\.org/licenses/by-nc-nd", re.I), "cc-by-nc-nd"),
+    (re.compile(r"creativecommons\.org/licenses/by-nc(?:/|$)", re.I), "cc-by-nc"),  # must end at "/" or end of text
+    (re.compile(r"creativecommons\.org/licenses/by-sa", re.I), "cc-by-sa"),
+    (re.compile(r"creativecommons\.org/licenses/by-nd", re.I), "cc-by-nd"),
+    (re.compile(r"creativecommons\.org/licenses/by(?:/|$)", re.I), "cc-by"),  # must end at "/" or end of text
+    (re.compile(r"creativecommons\.org/publicdomain/zero", re.I), "cc0"),
+    (re.compile(r"arxiv\.org/licenses/nonexclusive-distrib", re.I), "nonexclusive-distrib"),
+]
+
+
+def normalize_license_value(v: Any, license_map: Dict[str, str]) -> str:
+    """Normalize an S3 URL / alias / DB value to one of the license_url choices."""
+    raw = "" if v is None else str(v).strip()
+    if not raw:
+        return ""
+    # 1) Exact lookup in the map, first as given and then without trailing slashes.
+    for candidate in (raw, raw.rstrip("/")):
+        if candidate in license_map:
+            return license_map[candidate]
+    lowered = raw.lower()
+    # 2) Already one of the allowed values (ignoring case).
+    if lowered in LICENSE_ALLOWED:
+        return lowered
+    # 3) URL pattern rules; the first matching rule wins.
+    for pattern, canonical in _CC_LICENSE_URL_RULES:
+        if pattern.search(raw):
+            return canonical
+    # 4) Unknown value: hand back the lower-cased text unchanged otherwise.
+    return lowered
+
+
+def license_out_of_allowed_warning(value: str, *, source: str = "S3") -> Optional[str]:  # Warning text for a non-empty value outside LICENSE_ALLOWED, else None.
+    if value and value not in LICENSE_ALLOWED:
+        allowed = ", ".join(sorted(LICENSE_ALLOWED - {""}))  # the empty choice is left out of the listing
+        return (
+            f"[WARN] license_url {source} 值 '{value}' 不在产品可选值内"
+            f"（{allowed}），属上游数据，不判定为开发缺陷"
+        )
+    return None
+
+
+def map_license_url(url: Any, license_map: Dict[str, str]) -> str:  # Thin alias of normalize_license_value for license URLs.
+    return normalize_license_value(url, license_map)
+
+
+
+
+
+def build_doi(row: Dict[str, Any]) -> Optional[str]:
+    """Consistent with the load step: use the S3 doi when non-blank, else 10.48550/arxiv.{doc_id}."""
+    # Candidates in priority order: (row key, prefix put in front of its stripped text).
+    for key, prefix in (("doi", ""), ("doc_id", ARXIV_DOI_PREFIX)):
+        raw = get_first(row, key)
+        text = "" if raw is None else str(raw).strip()
+        if text:
+            return f"{prefix}{text}"
+    return None
+
+
+def normalize_doi(v: Any) -> Optional[str]:
+    """DOI comparison ignores case: return the normalized value lower-cased, or None when blank."""
+    scalar = normalize_scalar(v)
+    # normalize_scalar may hand back non-text scalars (numbers, bools),
+    # hence the str() conversion before lower-casing.
+    return None if scalar is None else str(scalar).strip().lower()
+
+
+def normalize_indexed_in(v: Any) -> List[str]:
+    """Consistent with the load step: always a list of non-blank strings."""
+    decoded = json_loads_maybe(v)
+    if decoded is None:
+        return []
+    # A non-list value is treated as a single-element list.
+    items = decoded if isinstance(decoded, list) else [decoded]
+    cleaned = (str(item).strip() for item in items)
+    return [text for text in cleaned if text]
+
+
+def build_identifiers(row: Dict[str, Any]) -> Dict[str, str]:
+    """
+    Consistent with the load step: a map with oaiId <- oai_identifier and arxivId <- paper_id (URL prefix removed).
+    """
+    oai = get_first(row, "oai_identifier")
+    arxiv_id = strip_arxiv_id(get_first(row, "paper_id"))
+    # None marks "leave the key out"; a truthy oai value is kept even if it strips to "".
+    candidates = (
+        ("oaiId", str(oai).strip() if oai else None),
+        ("arxivId", arxiv_id),
+    )
+    return {key: value for key, value in candidates if value is not None}
+
+
+def normalize_identifiers(v: Any) -> Dict[str, str]:
+    """For comparison: normalize to an {oaiId, arxivId} map."""
+    v = json_loads_maybe(v)
+    if v is None:
+        return {}
+    if isinstance(v, dict):  # map form: camelCase keys, with snake_case keys as fallback
+        out: Dict[str, str] = {}
+        oai = v.get("oaiId") or v.get("oai_id")
+        if oai:
+            out["oaiId"] = str(oai).strip()
+        arxiv = v.get("arxivId") or v.get("arxiv_id")
+        if arxiv:
+            aid = strip_arxiv_id(arxiv) or str(arxiv).strip()  # fall back to the raw text when stripping leaves nothing
+            if aid:
+                out["arxivId"] = aid
+        return out
+    if isinstance(v, list):  # list form: items shaped like {"type": ..., "value": ...}
+        out = {}
+        for item in v:
+            if not isinstance(item, dict):
+                continue
+            t = str(item.get("type", "")).lower()
+            val = item.get("value")
+            if not val:
+                continue
+            if t in ("oai_identifier", "oaiid", "oai_id"):
+                out["oaiId"] = str(val).strip()
+            elif t in ("arxiv_id", "arxivid"):
+                aid = strip_arxiv_id(val) or str(val).strip()
+                if aid:
+                    out["arxivId"] = aid
+        return out
+    return {}  # any other shape yields an empty map
+
+
+def build_locations(row: Dict[str, Any], license_map: Dict[str, str]) -> List[Dict[str, Any]]:
+    """Build the expected ``locations`` list from an S3 row.
+
+    One entry is emitted for ``pdf_url`` and one for ``source_url`` when the
+    URL is present. ``type`` and ``is_oa`` follow the matching ``get_pdf`` /
+    ``get_source`` flag; both entries share the mapped ``license_url``.
+    """
+    pdf_flag = as_bool_flag(get_first(row, "get_pdf"))
+    source_flag = as_bool_flag(get_first(row, "get_source"))
+    license_value = map_license_url(get_first(row, "license_url"), license_map)
+    locations: List[Dict[str, Any]] = []
+    for url_key, downloadable in (("pdf_url", pdf_flag), ("source_url", source_flag)):
+        url = get_first(row, url_key)
+        if not url:
+            continue
+        locations.append(
+            {
+                "type": "download" if downloadable else "",
+                "url": str(url),
+                "license": license_value,
+                "is_oa": oa_flag_str(downloadable),
+            }
+        )
+    return locations
+
+
+def normalize_locations(v: Any, license_map: Dict[str, str]) -> List[Dict[str, Any]]:
+    """Normalize ``locations`` for comparison.
+
+    Non-dict items are dropped. When present, ``is_oa`` is rewritten through
+    ``oa_flag_str(as_bool_flag(...))`` and ``license`` through
+    ``normalize_license_value``. A non-list input yields an empty list.
+    """
+    parsed = json_loads_maybe(v)
+    if not isinstance(parsed, list):
+        return []
+
+    def normalize_one(entry: Dict[str, Any]) -> Dict[str, Any]:
+        location = dict(entry)  # shallow copy so the input is not mutated
+        if "is_oa" in location:
+            location["is_oa"] = oa_flag_str(as_bool_flag(location["is_oa"]))
+        if "license" in location:
+            location["license"] = normalize_license_value(location["license"], license_map)
+        return location
+
+    return [normalize_one(entry) for entry in parsed if isinstance(entry, dict)]
+
+
+def _classification_field(row: Dict[str, Any], key: str) -> Any:
+    """Read one ``classifications`` sub-field from an S3 row.
+
+    ``arxiv_category`` is read from the row's ``category`` key and always
+    returned as a list of stripped strings (a scalar becomes a one-item list).
+    Every other key is read under its own name and returned unchanged.
+    ``None`` and blank strings yield ``None``.
+    """
+    source_key = "category" if key == "arxiv_category" else key
+    raw = get_first(row, source_key)
+    if raw is None:
+        return None
+    if isinstance(raw, str) and not raw.strip():
+        return None
+    if key != "arxiv_category":
+        return raw
+    if isinstance(raw, list):
+        return [str(x).strip() for x in raw if str(x).strip()]
+    # Stringify before stripping: a non-string scalar (e.g. a number) has no
+    # ``.strip()`` and previously raised AttributeError here.
+    text = str(raw).strip()
+    return [text] if text else None
+
+
+def build_classifications(row: Dict[str, Any]) -> Dict[str, Any]:
+    """Build the expected fixed-shape ``classifications`` object.
+
+    The result always has the keys ``mesh``, ``msc_class``, ``acm_class`` and
+    ``arxiv_category``, in that order.
+    """
+    keys = ("mesh", "msc_class", "acm_class", "arxiv_category")
+    return {key: _classification_field(row, key) for key in keys}
+
+
+def normalize_classifications(v: Any) -> Dict[str, Any]:
+    """Normalize ``classifications`` for comparison.
+
+    The result always carries the four keys ``mesh``, ``msc_class``,
+    ``acm_class`` and ``arxiv_category``. Blank values become ``None``; the
+    ``category`` key is accepted as an alias when ``arxiv_category`` is
+    ``None``. The category is normalized to a list of stripped strings.
+    """
+
+    def blank_to_none(value: Any) -> Any:
+        if isinstance(value, str) and not value.strip():
+            return None
+        return value
+
+    def as_category_list(value: Any) -> Any:
+        if value is None:
+            return None
+        if isinstance(value, list):
+            cleaned = [str(item).strip() for item in value if str(item).strip()]
+        else:
+            text = str(value).strip()
+            cleaned = [text] if text else []
+        return cleaned or None
+
+    data = json_loads_maybe(v)
+    if not isinstance(data, dict):
+        data = {}
+    category = data.get("arxiv_category")
+    if category is None:
+        # A missing alias key also yields None, same as when it is absent.
+        category = data.get("category")
+
+    normalized = {name: blank_to_none(data.get(name)) for name in ("mesh", "msc_class", "acm_class")}
+    normalized["arxiv_category"] = as_category_list(category)
+    return normalized
+
+
+def build_track_id(row: Dict[str, Any]) -> Optional[str]:
+    """Return the expected ``track_id`` (``arxiv:{doc_id}``), or ``None`` when
+    the row has no usable ``doc_id``."""
+    doc_id = get_first(row, "doc_id")
+    text = "" if doc_id is None else str(doc_id).strip()
+    return f"arxiv:{text}" if text else None
+
+
+def arxiv_empty_field_defaults() -> Dict[str, Any]:
+    """Return the default/empty values used for fields the arXiv source lacks.
+
+    Every call builds a fresh dict with fresh list/dict values, so callers may
+    mutate the result freely. Key order is fixed by the table below.
+    """
+    # (field name, factory); a factory of None means the default value is None.
+    spec = (
+        ("language", str),
+        ("type", list),
+        ("keywords", list),
+        ("fieldsOfStudy", list),
+        ("s2FieldsOfStudy", list),
+        ("primary_topic", dict),
+        ("topics", list),
+        ("concepts", list),
+        ("subject", str),
+        ("major", str),
+        ("major_2", str),
+        ("major_3", str),
+        ("category", str),
+        ("area", str),
+        ("grade_class", str),
+        ("grade", str),
+        ("origin_db_source", str),
+        ("reference_count", None),
+        ("citation_count", None),
+        ("influential_citation_count", None),
+        ("fwci", None),
+        ("references", list),
+        ("related_works", list),
+        ("citation_normalized_percentile", dict),
+        ("cited_by_percentile_year", dict),
+        ("cited_by_api_url", str),
+        ("venue_name", str),
+        ("venue_type", str),
+        ("venue_issn", list),
+        ("venue_publisher", list),
+        ("venue.type", str),
+        ("venue.issn", list),
+        ("venue.publisher", list),
+        ("biblio_volume", str),
+        ("biblio_issue", str),
+        ("biblio_pages", str),
+        ("mesh", None),
+        ("msc_class", None),
+        ("acm_class", None),
+        ("arxiv_category", None),
+    )
+    return {name: (None if factory is None else factory()) for name, factory in spec}
+
+
+def transform_arxiv_row(row: Dict[str, Any], license_map: Dict[str, str]) -> Dict[str, Any]:
+    """Turn one arXiv S3 row into the expected field values.
+
+    Starts from ``arxiv_empty_field_defaults()`` and overlays the values
+    derived from the row. Dates and years come from the ``updated`` column;
+    ``access_oa_url`` is the PDF URL only when the ``get_pdf`` flag is set.
+    """
+    updated = get_first(row, "updated")
+    pdf_downloadable = as_bool_flag(get_first(row, "get_pdf"))
+    pdf_url = get_first(row, "pdf_url") or ""
+    derived: Dict[str, Any] = {
+        "track_id": build_track_id(row),
+        "title": get_first(row, "title"),
+        "abstract": get_first(row, "abstract"),
+        "doi": build_doi(row),
+        "author": parse_authors(get_first(row, "authors")),
+        "identifiers": build_identifiers(row),
+        "indexed_in": ["arxiv"],
+        "published_date": parse_date_iso(updated),
+        "published_year": parse_year(updated),
+        "access_is_oa": "true",
+        "access_oa_status": "",
+        "access_oa_url": str(pdf_url) if pdf_downloadable else "",
+        "access_license": map_license_url(get_first(row, "license_url"), license_map),
+        "origin_id": get_first(row, "doc_id"),
+        "origin_osi": "arxiv",
+        "locations": build_locations(row, license_map),
+        "classifications": build_classifications(row),
+        "mesh": _classification_field(row, "mesh"),
+        "msc_class": _classification_field(row, "msc_class"),
+        "acm_class": _classification_field(row, "acm_class"),
+        "arxiv_category": _classification_field(row, "arxiv_category"),
+    }
+    # Defaults first so derived values win; key positions match dict.update().
+    return {**arxiv_empty_field_defaults(), **derived}
+
+
+# ---- osi_verify/transforms/registry.py ----
+
+
+from typing import Any, Callable, Dict
+
+
+# Signature shared by row transforms: (S3 row, license map) -> dict of expected values.
+TransformFn = Callable[[Dict[str, Any], Dict[str, str]], Dict[str, Any]]
+
+# Registry of available transforms, keyed by the name passed to transform_row().
+TRANSFORMS: Dict[str, TransformFn] = {
+    "osi_arxiv": transform_arxiv_row,
+}
+
+
+def transform_row(row: Dict[str, Any], license_map: Dict[str, str], transform: str) -> Dict[str, Any]:
+    """Apply the transform registered under ``transform`` to ``row``.
+
+    Raises:
+        ValueError: if ``transform`` is not a key of ``TRANSFORMS``.
+    """
+    try:
+        handler = TRANSFORMS[transform]
+    except KeyError as missing:
+        raise ValueError(f"不支持的 transform: {transform}") from missing
+    else:
+        return handler(row, license_map)
+
+
+# ---- osi_verify/compare.py ----
+
+
+import json
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+
+
+def mysql_column_for_field(field: str, columns: Sequence[str]) -> Optional[str]:
+    """Resolve a logical field name to a column present in ``columns``.
+
+    Candidates are tried in order: the name itself, its ``DB_COLUMN_ALIASES``
+    alias, each of those with dots replaced by underscores, and finally the
+    alias's first dotted segment. Returns ``None`` when nothing matches.
+    """
+
+    def candidates():
+        # Generated lazily so later candidates are only computed when needed.
+        yield field
+        aliased = DB_COLUMN_ALIASES.get(field, field)
+        yield aliased
+        yield field.replace(".", "_")
+        yield aliased.replace(".", "_")
+        yield aliased.split(".")[0]
+
+    return next((name for name in candidates() if name in columns), None)
+
+
+def get_nested_value(obj: Any, path: str) -> Any:
+    """Follow a dot-separated ``path`` through nested dicts.
+
+    Returns ``None`` as soon as a non-dict is met before the path is
+    exhausted, or when a key is missing.
+    """
+    current = obj
+    for key in path.split("."):
+        if isinstance(current, dict):
+            current = current.get(key)
+        else:
+            return None
+    return current
+
+
+def get_mysql_field_value(mysql_row: Dict[str, Any], field: str, columns: Sequence[str]) -> Any:
+    """Read ``field`` from a database row, JSON-decoding the column if possible.
+
+    For a dotted field whose resolved column is the field's first segment and
+    decodes to a dict, the remaining path is looked up inside that dict.
+    Otherwise the full dotted path is looked up in the decoded dict, or in the
+    row itself when the column did not decode to a dict. Returns ``None``
+    when no column matches.
+    """
+    column = mysql_column_for_field(field, columns)
+    if not column:
+        return None
+    parsed = json_loads_maybe(mysql_row.get(column))
+    if "." not in field:
+        return parsed
+    head, _, tail = field.partition(".")
+    if column == head and isinstance(parsed, dict):
+        return get_nested_value(parsed, tail)
+    container = parsed if isinstance(parsed, dict) else mysql_row
+    return get_nested_value(container, field)
+
+
+def normalize_scalar(v: Any) -> Any:
+    """Normalize a scalar for comparison.
+
+    ``None``, booleans and numbers pass through unchanged; datetimes become
+    ``YYYY-MM-DD HH:MM:SS`` strings; anything else is stringified and
+    stripped, with a blank result mapped to ``None``.
+    """
+    if v is None or isinstance(v, (bool, int, float)):
+        return v
+    if isinstance(v, datetime):
+        return v.isoformat(sep=" ", timespec="seconds")
+    text = str(v).strip()
+    return text or None
+
+
+def normalize_json(v: Any) -> Any:
+    """Recursively normalize a JSON-like value for comparison.
+
+    Dicts are rebuilt with keys in sorted order, lists keep their order, and
+    leaves go through ``normalize_scalar``. The input is first passed through
+    ``json_loads_maybe``.
+    """
+    parsed = json_loads_maybe(v)
+    if isinstance(parsed, list):
+        return [normalize_json(item) for item in parsed]
+    if isinstance(parsed, dict):
+        return {key: normalize_json(parsed[key]) for key in sorted(parsed)}
+    return normalize_scalar(parsed)
+
+
+def is_empty_value(v: Any) -> bool:
+    """Return True when ``v`` normalizes to ``None``, an empty list/dict, or a
+    blank string."""
+    normalized = normalize_json(v)
+    if normalized is None:
+        return True
+    if isinstance(normalized, (list, dict)):
+        return not normalized
+    if isinstance(normalized, str):
+        return not normalized.strip()
+    return False
+
+
+def is_empty_value_for_type(v: Any, value_type: str) -> bool:
+    """Return True when ``v`` is the empty value of the declared ``value_type``.
+
+    ``None`` is empty for every type. ``list*`` types accept only an empty
+    list, ``object``/``dict``/``map`` only an empty dict, and
+    ``string``/``str`` only a blank string. Numeric and boolean types have no
+    empty value other than ``None``. Unknown type names fall back to
+    ``is_empty_value``.
+    """
+    parsed = json_loads_maybe(v)
+    declared = value_type.strip().lower()
+    if parsed is None:
+        return True
+    if declared.startswith("list"):
+        return isinstance(parsed, list) and len(parsed) == 0
+    if declared in {"object", "dict", "map"}:
+        return isinstance(parsed, dict) and len(parsed) == 0
+    if declared in {"string", "str"}:
+        return isinstance(parsed, str) and not parsed.strip()
+    if declared in {"integer", "int", "float", "double", "number", "boolean", "bool"}:
+        # ``parsed`` is known to be non-None at this point.
+        return False
+    return is_empty_value(parsed)
+
+
+def _format_diff_value(v: Any, max_len: int = 500) -> str:
+    """Render a value for a mismatch message, truncated to ``max_len`` chars.
+
+    ``None`` renders as ``null``, dicts and lists as JSON (non-serializable
+    leaves via ``str``), strings via ``repr`` and everything else via ``str``.
+    Over-long text is cut and suffixed with ``...``. When ``max_len`` is too
+    small to hold the ellipsis, the text is hard-cut to ``max_len`` instead,
+    so the result never exceeds the limit.
+    """
+    if v is None:
+        return "null"
+    if isinstance(v, (dict, list)):
+        s = json.dumps(v, ensure_ascii=False, default=str)
+    else:
+        s = repr(v) if isinstance(v, str) else str(v)
+    if len(s) <= max_len:
+        return s
+    if max_len <= 3:
+        # No room for "..."; a zero or negative slice bound would otherwise
+        # produce a result longer than max_len.
+        return s[: max(max_len, 0)]
+    return s[: max_len - 3] + "..."
+
+
+@dataclass
+class FieldMismatch:
+    """A single field difference: ``s3`` is the expected value after the
+    transform, ``db`` is the actual Iceberg/MySQL value."""
+
+    field: str
+    s3: Any
+    db: Any
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the mismatch as a plain dict with keys field, s3 and db."""
+        return {name: getattr(self, name) for name in ("field", "s3", "db")}
+
+    def __str__(self) -> str:
+        expected = _format_diff_value(self.s3)
+        actual = _format_diff_value(self.db)
+        return f"{self.field}: S3={expected} | DB={actual}"
+
+
+def values_equal(
+    s3_val: Any,
+    db_val: Any,
+    field: str,
+    license_map: Optional[Dict[str, str]] = None,
+    empty_value_type: Optional[str] = None,
+) -> Tuple[bool, Optional[FieldMismatch], Optional[str]]:
+    """Compare the expected (S3) and actual (DB) value of one field.
+
+    Args:
+        s3_val: Expected value produced by the transform.
+        db_val: Actual value read from the database row.
+        field: Field name; selects the field-specific normalization below.
+        license_map: License mapping; ``DEFAULT_LICENSE_MAP`` when falsy.
+        empty_value_type: Optional declared type used for the typed-empty
+            check (see ``is_empty_value_for_type``).
+
+    Returns:
+        ``(equal, mismatch, warning)``. ``mismatch`` is set only when ``equal``
+        is False; ``warning`` is set only on the license paths.
+    """
+    license_map = license_map or DEFAULT_LICENSE_MAP
+    s3_n = normalize_json(s3_val)
+    db_n = normalize_json(db_val)
+    if empty_value_type:
+        s3_typed_empty = is_empty_value_for_type(s3_val, empty_value_type)
+        db_typed_empty = is_empty_value_for_type(db_val, empty_value_type)
+        # Both sides hold the declared type's empty value: equal.
+        if s3_typed_empty and db_typed_empty:
+            return True, None, None
+        # Otherwise any generically empty side is reported as a mismatch,
+        # with the raw (un-normalized) values.
+        if is_empty_value(s3_n) or is_empty_value(db_n):
+            return False, FieldMismatch(field, s3_val, db_val), None
+    if field == "doi":
+        s3_n = normalize_doi(s3_val)
+        db_n = normalize_doi(db_val)
+        if s3_n == db_n:
+            return True, None, None
+        return False, FieldMismatch(field, s3_n, db_n), None
+    if field == "identifiers":
+        s3_n = normalize_identifiers(s3_val)
+        db_n = normalize_identifiers(db_val)
+        if s3_n == db_n:
+            return True, None, None
+        return False, FieldMismatch(field, s3_n, db_n), None
+    if field == "indexed_in":
+        s3_n = normalize_indexed_in(s3_val)
+        db_n = normalize_indexed_in(db_val)
+        if s3_n == db_n:
+            return True, None, None
+        return False, FieldMismatch(field, s3_n, db_n), None
+    if field in ("published_date", "publication_published_date"):
+        s3_n = parse_date_iso(s3_val)
+        db_n = parse_date_iso(db_val)
+        if s3_n == db_n:
+            return True, None, None
+        return False, FieldMismatch(field, s3_n, db_n), None
+    if field == "access_license":
+        s3_n = normalize_license_value(s3_val, license_map)
+        db_n = normalize_license_value(db_val, license_map)
+        warn = license_out_of_allowed_warning(s3_n, source="S3")
+        # A warning for the S3 license short-circuits: the field passes with
+        # the warning attached and the DB value is not compared.
+        if warn:
+            return True, None, warn
+        if s3_n == db_n:
+            return True, None, None
+        return False, FieldMismatch(field, s3_n, db_n), None
+    if field == "locations":
+        s3_n = normalize_locations(s3_val, license_map)
+        db_n = normalize_locations(db_val, license_map)
+        for i, loc in enumerate(s3_n):
+            lic = loc.get("license", "")
+            w = license_out_of_allowed_warning(lic, source=f"S3 locations[{i}]")
+            # Same short-circuit as access_license: the first warning wins.
+            if w:
+                return True, None, w
+        if s3_n == db_n:
+            return True, None, None
+        return False, FieldMismatch(field, s3_n, db_n), None
+    if field == "classifications":
+        s3_n = normalize_classifications(s3_val)
+        db_n = normalize_classifications(db_val)
+        if s3_n == db_n:
+            return True, None, None
+        return False, FieldMismatch(field, s3_n, db_n), None
+    if field == "author":
+        # Authors are compared structurally on the normalize_json() output.
+        if s3_n == db_n:
+            return True, None, None
+        return False, FieldMismatch(field, s3_n, db_n), None
+    if field == "access_is_oa":
+        s3_s = oa_flag_str(as_bool_flag(s3_val))
+        db_s = oa_flag_str(as_bool_flag(db_val))
+        if s3_s == db_s:
+            return True, None, None
+        return False, FieldMismatch(field, s3_s, db_s), None
+    if field in ("published_year", "publication_published_year"):
+        try:
+            # Falsy values (None, "", 0) are all treated as year 0 here.
+            if int(s3_n or 0) == int(db_n or 0):
+                return True, None, None
+        except (TypeError, ValueError):
+            # A non-numeric year falls through to the mismatch below.
+            pass
+        return False, FieldMismatch(field, s3_n, db_n), None
+    # Default: compare the scalar-normalized forms.
+    if normalize_scalar(s3_n) == normalize_scalar(db_n):
+        return True, None, None
+    return False, FieldMismatch(field, s3_n, db_n), None
+
+
+def compare_fields_for_table(
+    columns: Sequence[str],
+    mapping_rules: Sequence[Any],
+) -> List[str]:
+    """Return the fields requested by the target's mapping rules, keeping only
+    those that resolve to a column present in ``columns``."""
+    kept: List[str] = []
+    for name in compare_fields_from_rules(mapping_rules):
+        if mysql_column_for_field(name, columns):
+            kept.append(name)
+    return kept
+
+
+def check_track_id(
+    expected_tid: Any,
+    db_tid: Any,
+    origin_id: Any,
+    track_registry: Dict[str, str],
+) -> List[FieldMismatch]:
+    """Validate a row's ``track_id``.
+
+    The database value must be non-empty, equal the expected value when one
+    is given, and not already be registered under a different ``origin_id``
+    in ``track_registry``. The registry is updated in place unless a
+    duplicate is found.
+    """
+    expected_norm = normalize_scalar(expected_tid)
+    actual_norm = normalize_scalar(db_tid)
+    if not actual_norm:
+        # An empty DB value is reported with the raw value and ends the check.
+        return [FieldMismatch("track_id", expected_norm, db_tid)]
+
+    problems: List[FieldMismatch] = []
+    if expected_norm is not None and actual_norm != expected_norm:
+        problems.append(FieldMismatch("track_id", expected_norm, actual_norm))
+
+    track_key = str(actual_norm)
+    owner = "" if origin_id is None else str(origin_id)
+    previous_owner = track_registry.get(track_key)
+    if previous_owner is None or previous_owner == owner:
+        track_registry[track_key] = owner
+    else:
+        problems.append(
+            FieldMismatch(
+                "track_id",
+                expected_norm,
+                f"duplicate: also used by origin_id={previous_owner}",
+            )
+        )
+    return problems
+
+
+@dataclass
+class RowResult:
+    """Outcome of comparing one S3 row with its database row."""
+
+    origin_id: Any
+    ok: bool
+    jsonl_file: str = ""
+    missing_in_mysql: bool = False
+    failures: List[FieldMismatch] = field(default_factory=list)
+    passes: List[str] = field(default_factory=list)
+    warnings: List[str] = field(default_factory=list)
+
+    @property
+    def status(self) -> str:
+        """Return "PASS", "MISSING" (no database row) or "FAIL"."""
+        if not self.ok and self.missing_in_mysql:
+            return "MISSING"
+        return "PASS" if self.ok else "FAIL"
+
+
+def compare_row(
+    s3_row: Dict[str, Any],
+    mysql_row: Optional[Dict[str, Any]],
+    license_map: Dict[str, str],
+    *,
+    track_registry: Optional[Dict[str, str]] = None,
+    compare_fields: Optional[Sequence[str]] = None,
+    default_empty_field_types: Optional[Dict[str, str]] = None,
+    transform: str = "osi_arxiv",
+) -> RowResult:
+    """Compare one S3 row with its database row, field by field.
+
+    The S3 row is transformed into expected values first. A missing database
+    row yields a MISSING result. ``track_id`` is checked through
+    ``check_track_id``; every other field goes through ``values_equal``.
+
+    Raises:
+        ValueError: if a database row exists but ``compare_fields`` is None.
+    """
+    expected = transform_row(s3_row, license_map, transform)
+    origin_id = expected.get("origin_id")
+    if mysql_row is None:
+        return RowResult(origin_id=origin_id, ok=False, missing_in_mysql=True)
+    columns = list(mysql_row.keys())
+    if compare_fields is None:
+        raise ValueError("compare_fields 不能为空；字段校验必须由当前 target 的 mapping CSV 生成")
+
+    remaining = list(compare_fields)
+    type_hints = default_empty_field_types or {}
+    registry = {} if track_registry is None else track_registry
+    failures: List[FieldMismatch] = []
+    passes: List[str] = []
+    warnings: List[str] = []
+
+    if "track_id" in remaining:
+        failures += check_track_id(
+            expected.get("track_id"),
+            get_mysql_field_value(mysql_row, "track_id", columns),
+            origin_id,
+            registry,
+        )
+        remaining = [name for name in remaining if name != "track_id"]
+
+    for name in remaining:
+        is_equal, mismatch, warning = values_equal(
+            expected.get(name),
+            get_mysql_field_value(mysql_row, name, columns),
+            name,
+            license_map,
+            empty_value_type=type_hints.get(name),
+        )
+        if warning:
+            warnings.append(warning)
+        if is_equal:
+            passes.append(name)
+        elif mismatch:
+            failures.append(mismatch)
+
+    return RowResult(
+        origin_id=origin_id,
+        ok=not failures,
+        failures=failures,
+        passes=passes,
+        warnings=warnings,
+    )
+
+
+# ---- osi_verify/mysql_session.py ----
+
+
+from typing import Any, Callable, Dict, Optional, TypeVar
+
+
+try:
+    import pymysql
+    from pymysql.cursors import DictCursor
+except ImportError:
+    pymysql = None  # type: ignore
+    DictCursor = None  # type: ignore
+
+# Return type of the callable handed to MySQLSession.run().
+T = TypeVar("T")
+
+
+class MySQLSession:
+    """MySQL/StarRocks session with retries; reconnects after a dropped connection.
+
+    The connection is opened lazily through ``conn`` or explicitly through
+    ``connect()``, or by using the session as a context manager.
+    """
+
+    def __init__(
+        self,
+        cfg: Dict[str, Any],
+        database: Optional[str],
+        *,
+        catalog: Optional[str] = None,
+        retry_config: Optional[RetryConfig] = None,
+    ):
+        """Store settings; no connection is opened here.
+
+        Args:
+            cfg: Mapping with ``host``, ``port``, ``user`` and ``password``.
+            database: Default database; only passed to the driver when no
+                ``catalog`` is set.
+            catalog: Optional catalog name; when set, a longer read timeout
+                is used and ``database`` is not selected at connect time.
+            retry_config: Retry policy; a default ``RetryConfig()`` if omitted.
+        """
+        self.cfg = cfg
+        self.database = database
+        self.catalog = catalog
+        self.retry_config = retry_config or RetryConfig()
+        self._conn: Any = None
+
+    @property
+    def conn(self) -> Any:
+        """Return the live connection, opening one on first access."""
+        if self._conn is None:
+            self.connect()
+        return self._conn
+
+    def connect(self) -> Any:
+        """Open a connection (with retries) and return it.
+
+        Any connection the session already holds is closed first, so calling
+        ``connect()`` twice, or entering the context manager after lazy use,
+        does not leak the previous connection.
+
+        Raises:
+            RuntimeError: if ``pymysql`` is not installed.
+        """
+        if pymysql is None:
+            raise RuntimeError("请安装 pymysql: pip install pymysql")
+        kwargs: Dict[str, Any] = dict(
+            host=self.cfg["host"],
+            port=self.cfg["port"],
+            user=self.cfg["user"],
+            password=self.cfg["password"],
+            charset="utf8mb4",
+            cursorclass=DictCursor,
+            connect_timeout=30,
+            read_timeout=300 if self.catalog else 60,
+        )
+        if not self.catalog and self.database:
+            kwargs["database"] = self.database
+
+        def _connect():
+            return pymysql.connect(**kwargs)
+
+        # Release the old connection before replacing it.
+        self.close()
+        self._conn = retry_call(
+            _connect,
+            self.retry_config,
+            label="MySQL 连接",
+            retryable=is_mysql_retryable,
+        )
+        return self._conn
+
+    def reconnect(self) -> None:
+        """Drop the current connection and open a new one."""
+        self.close()
+        self.connect()
+
+    def close(self) -> None:
+        """Close the connection if one is open; safe to call repeatedly."""
+        if self._conn is not None:
+            try:
+                self._conn.close()
+            except Exception:
+                # Best effort: the connection may already be broken, and the
+                # reference is discarded either way.
+                pass
+            self._conn = None
+
+    def run(self, fn: Callable[[Any], T], *, label: str) -> T:
+        """Run ``fn(connection)`` under the retry policy.
+
+        On a retryable error the connection is closed before re-raising, so
+        the next attempt reconnects through ``conn``.
+        """
+
+        def attempt() -> T:
+            try:
+                return fn(self.conn)
+            except Exception as exc:
+                if is_mysql_retryable(exc):
+                    self.close()
+                raise
+
+        return retry_call(
+            attempt,
+            self.retry_config,
+            label=label,
+            retryable=is_mysql_retryable,
+        )
+
+    def __enter__(self) -> "MySQLSession":
+        self.connect()
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        self.close()
+
+
+# ---- osi_verify/s3_reader.py ----
+
+
+import json
+import random
+import sys
+from typing import Any, Dict, Generator, List, Optional, Tuple
+
+try:
+    import duckdb
+except ImportError:
+    duckdb = None  # type: ignore
+
+
+# Bytes requested per S3 Range read when sampling JSONL lines (1 MiB).
+RANGE_SAMPLE_CHUNK_BYTES = 1024 * 1024
+# Range sampling stops after sample_size * this factor read attempts.
+RANGE_SAMPLE_MAX_ATTEMPT_FACTOR = 20
+# botocore "max_attempts" used by s3_boto_client when retries are enabled.
+BOTOCORE_RETRY_ATTEMPTS = 2
+
+
+def rows_from_cursor(cur) -> List[Dict[str, Any]]:
+    """Fetch all remaining rows from a DB-API style cursor as dicts keyed by
+    column name (taken from ``cur.description``)."""
+    column_names = tuple(column[0] for column in cur.description)
+    records: List[Dict[str, Any]] = []
+    for values in cur.fetchall():
+        records.append(dict(zip(column_names, values)))
+    return records
+
+
+def configure_duckdb_s3(con: "duckdb.DuckDBPyConnection", s3_cfg: Dict[str, Any]) -> None:
+    """Load DuckDB's httpfs extension and apply the S3 settings from ``s3_cfg``.
+
+    Reads ``endpoint``, ``access_key``, ``secret_key`` and the optional
+    ``use_ssl`` flag (default True). Path-style URLs and region
+    ``us-east-1`` are always set.
+    """
+    con.execute("INSTALL httpfs; LOAD httpfs;")
+    # Values are escaped with sql_literal() before being spliced into SET.
+    quoted_settings = {
+        "s3_endpoint": sql_literal(s3_cfg["endpoint"]),
+        "s3_access_key_id": sql_literal(s3_cfg["access_key"]),
+        "s3_secret_access_key": sql_literal(s3_cfg["secret_key"]),
+    }
+    ssl_flag = "true" if bool(s3_cfg.get("use_ssl", True)) else "false"
+    for option, value in quoted_settings.items():
+        con.execute(f"SET {option}='{value}';")
+    con.execute(f"SET s3_use_ssl={ssl_flag};")
+    con.execute("SET s3_url_style='path';")
+    con.execute("SET s3_region='us-east-1';")
+
+
+def _detect_s3_format(
+    con: "duckdb.DuckDBPyConnection",
+    s3_path: str,
+    *,
+    s3_cfg: Optional[Dict[str, Any]] = None,
+    retry_config: Optional[RetryConfig] = None,
+) -> str:
+    """Detect whether ``s3_path`` holds jsonl or parquet files (jsonl wins).
+
+    With ``s3_cfg`` the prefix is listed through boto3; otherwise DuckDB's
+    ``glob`` is used (``*.jsonl`` at the top level, ``**/*.parquet``
+    recursively).
+
+    Raises:
+        FileNotFoundError: if neither kind of file is found.
+    """
+    if s3_cfg:
+        for suffix in (".jsonl", ".parquet"):
+            if list_s3_files_boto3(s3_path, s3_cfg, suffix, retry_config=retry_config):
+                return suffix[1:]
+        raise FileNotFoundError(f"S3 路径下未找到 .jsonl 或 .parquet 文件: {s3_path}")
+
+    base = sql_literal(s3_path.rstrip("/"))
+
+    def _count(pattern: str) -> int:
+        query = f"SELECT count(*) FROM glob('{base}/{pattern}')"
+        return int(con.execute(query).fetchone()[0])
+
+    for fmt, pattern in (("jsonl", "*.jsonl"), ("parquet", "**/*.parquet")):
+        found = retry_call(
+            lambda p=pattern: _count(p),
+            retry_config,
+            label=f"S3 探测 {fmt}",
+            retryable=is_s3_retryable,
+        )
+        if found:
+            return fmt
+    raise FileNotFoundError(f"S3 路径下未找到 .jsonl 或 .parquet 文件: {s3_path}")
+
+
+def open_duckdb_s3(s3_cfg: Optional[Dict[str, Any]]) -> "duckdb.DuckDBPyConnection":
+    """Open a DuckDB connection, configured for S3 when ``s3_cfg`` is given.
+
+    Raises:
+        RuntimeError: if ``duckdb`` is not installed.
+    """
+    if duckdb is None:
+        raise RuntimeError("请安装 duckdb: pip install duckdb pyarrow")
+    connection = duckdb.connect()
+    if not s3_cfg:
+        return connection
+    configure_duckdb_s3(connection, s3_cfg)
+    return connection
+
+
+def jsonl_basename(s3_uri: str) -> str:
+    """Return the part of ``s3_uri`` after the last ``/`` (the whole string
+    when there is no slash)."""
+    _head, _sep, tail = s3_uri.rpartition("/")
+    return tail
+
+
+def parse_s3_uri(uri: str) -> Tuple[str, str]:
+    """Split an ``s3://bucket/key`` URI into ``(bucket, key)``.
+
+    Backslashes are treated as forward slashes.
+
+    Raises:
+        ValueError: if the URI does not start with ``s3://`` or lacks a
+            bucket or key.
+    """
+    scheme = "s3://"
+    normalized = uri.replace("\\", "/")
+    if not normalized.startswith(scheme):
+        raise ValueError(f"非 S3 URI: {uri}")
+    bucket, _sep, key = normalized[len(scheme):].partition("/")
+    if bucket and key:
+        return bucket, key
+    raise ValueError(f"无法解析 S3 URI: {uri}")
+
+
+def _suppress_insecure_request_warning() -> None:
+    """Silence urllib3's repeated InsecureRequestWarning.
+
+    Intended for intranet Ceph endpoints with self-signed certificates that
+    are accessed with ``verify=False``. Does nothing when urllib3 is not
+    installed.
+    """
+    try:
+        import urllib3
+    except ImportError:
+        return
+    else:
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+
+def s3_boto_client(s3_cfg: Dict[str, Any], *, retry_config: Optional[RetryConfig] = None):
+    """Create a boto3 S3 client for the endpoint described by ``s3_cfg``.
+
+    Reads ``endpoint``, ``access_key``, ``secret_key`` and the optional
+    ``use_ssl`` (default True) and ``verify_ssl`` (default False) flags.
+    Path-style addressing and SigV4 signing are used. botocore retries are
+    limited to ``BOTOCORE_RETRY_ATTEMPTS`` when the retry config is enabled,
+    otherwise to a single attempt.
+
+    Raises:
+        RuntimeError: if boto3/botocore cannot be imported.
+    """
+    try:
+        import boto3
+        from botocore.config import Config
+    except ImportError as e:
+        raise RuntimeError("S3 流式/Range 抽样需要 boto3: pip install boto3") from e
+    retry_settings = retry_config or RetryConfig()
+    use_ssl = bool(s3_cfg.get("use_ssl", True))
+    verify_ssl = bool(s3_cfg.get("verify_ssl", False))
+    if use_ssl and not verify_ssl:
+        _suppress_insecure_request_warning()
+    endpoint_url = f"{'https' if use_ssl else 'http'}://{s3_cfg['endpoint']}"
+    access_key = s3_cfg["access_key"]
+    secret_key = s3_cfg["secret_key"]
+    max_attempts = BOTOCORE_RETRY_ATTEMPTS if retry_settings.enabled else 1
+    client_config = Config(
+        s3={"addressing_style": "path"},
+        signature_version="s3v4",
+        retries={"max_attempts": max_attempts, "mode": "standard"},
+    )
+    return boto3.client(
+        "s3",
+        endpoint_url=endpoint_url,
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        region_name="us-east-1",
+        config=client_config,
+        verify=verify_ssl,
+    )
+
+
+def list_s3_files_boto3(
+    s3_path: str,
+    s3_cfg: Dict[str, Any],
+    suffix: str,
+    *,
+    retry_config: Optional[RetryConfig] = None,
+) -> List[str]:
+    """List ``s3://`` URIs under ``s3_path`` whose key ends with ``suffix``.
+
+    The suffix match is case-insensitive, all result pages are followed, and
+    the URIs are returned sorted. The whole listing runs under ``retry_call``.
+    """
+    bucket, prefix = parse_s3_uri(s3_path.rstrip("/") + "/")
+    wanted_suffix = suffix.lower()
+
+    def _list() -> List[str]:
+        client = s3_boto_client(s3_cfg, retry_config=retry_config)
+        matches: List[str] = []
+        request: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix}
+        while True:
+            page = client.list_objects_v2(**request)
+            for entry in page.get("Contents", []):
+                object_key = entry.get("Key")
+                if object_key and str(object_key).lower().endswith(wanted_suffix):
+                    matches.append(f"s3://{bucket}/{object_key}")
+            next_token = page.get("NextContinuationToken") if page.get("IsTruncated") else None
+            if not next_token:
+                break
+            request = {"Bucket": bucket, "Prefix": prefix, "ContinuationToken": next_token}
+        matches.sort()
+        return matches
+
+    return retry_call(
+        _list,
+        retry_config,
+        label=f"S3 列出 {suffix}",
+        retryable=is_s3_retryable,
+    )
+
+
+def sample_jsonl_rows_sequential_stream(
+    s3_uri: str,
+    s3_cfg: Dict[str, Any],
+    sample_size: int,
+    *,
+    retry_config: Optional[RetryConfig] = None,
+) -> List[Dict[str, Any]]:
+    """Stream the first ``sample_size`` JSONL records of an S3 object.
+
+    Reading stops as soon as enough records are collected, so the file is
+    not scanned in full. Blank lines are skipped. Returns an empty list when
+    ``sample_size`` is not positive.
+    """
+    if sample_size <= 0:
+        return []
+    bucket, key = parse_s3_uri(s3_uri)
+    client = s3_boto_client(s3_cfg, retry_config=retry_config)
+
+    def _read() -> List[Dict[str, Any]]:
+        rows: List[Dict[str, Any]] = []
+        body = client.get_object(Bucket=bucket, Key=key)["Body"]
+        try:
+            for raw in body.iter_lines():
+                if not raw or not raw.strip():
+                    continue
+                rows.append(json.loads(raw))
+                if len(rows) >= sample_size:
+                    break
+        finally:
+            # The stream is usually abandoned part-way through; close it so
+            # the underlying HTTP connection is released rather than left
+            # checked out with unread data.
+            body.close()
+        return rows
+
+    return retry_call(_read, retry_config, label=f"S3 顺序读 {jsonl_basename(s3_uri)}", retryable=is_s3_retryable)
+
+
+def _json_line_from_range(payload: bytes, *, offset: int, object_size: int) -> Optional[bytes]:
+    """Extract one complete line from a byte range of a JSONL object.
+
+    For a range that does not start at byte 0, everything up to and
+    including the first newline is discarded, since the range may begin in
+    the middle of a line. The line must end with a newline inside the
+    payload unless the payload reaches the end of the object. Returns the
+    stripped line, or ``None`` when no complete non-blank line is available.
+    """
+    if not payload:
+        return None
+    if offset > 0:
+        boundary = payload.find(b"\n")
+        if boundary < 0:
+            return None
+        start = boundary + 1
+    else:
+        start = 0
+    stop = payload.find(b"\n", start)
+    if stop < 0:
+        reaches_end_of_object = offset + len(payload) >= object_size
+        if not reaches_end_of_object:
+            return None
+        stop = len(payload)
+    candidate = payload[start:stop].strip()
+    return candidate or None
+
+
+def sample_jsonl_rows_s3_range(
+    s3_uri: str,
+    s3_cfg: Dict[str, Any],
+    sample_size: int,
+    *,
+    retry_config: Optional[RetryConfig] = None,
+) -> List[Dict[str, Any]]:
+    """Approximate random sampling of a large JSONL object via S3 Range reads.
+
+    Each attempt reads ``RANGE_SAMPLE_CHUNK_BYTES`` from a random offset and
+    takes the first complete line found there. Duplicate and undecodable
+    lines are skipped. Sampling stops after ``sample_size`` rows or after
+    ``sample_size * RANGE_SAMPLE_MAX_ATTEMPT_FACTOR`` attempts, so fewer rows
+    than requested may be returned.
+    """
+    if sample_size <= 0:
+        return []
+    bucket, key = parse_s3_uri(s3_uri)
+    client = s3_boto_client(s3_cfg, retry_config=retry_config)
+    file_label = jsonl_basename(s3_uri)
+
+    def _head_size() -> int:
+        head = client.head_object(Bucket=bucket, Key=key)
+        return int(head["ContentLength"])
+
+    object_size = retry_call(_head_size, retry_config, label=f"S3 head {file_label}", retryable=is_s3_retryable)
+    if object_size <= 0:
+        return []
+
+    def _read_range(first_byte: int, last_byte: int) -> bytes:
+        response = client.get_object(
+            Bucket=bucket,
+            Key=key,
+            Range=f"bytes={first_byte}-{last_byte}",
+        )
+        return response["Body"].read()
+
+    sampled: List[Dict[str, Any]] = []
+    seen_lines = set()
+    attempts = 0
+    attempt_budget = max(sample_size * RANGE_SAMPLE_MAX_ATTEMPT_FACTOR, sample_size)
+    while attempts < attempt_budget and len(sampled) < sample_size:
+        attempts += 1
+        offset = random.randint(0, max(0, object_size - 1))
+        last = min(object_size - 1, offset + RANGE_SAMPLE_CHUNK_BYTES - 1)
+        payload = retry_call(
+            # Bind the current bounds so retries reread the same range.
+            lambda first=offset, last_byte=last: _read_range(first, last_byte),
+            retry_config,
+            label=f"S3 Range 读 {file_label}",
+            retryable=is_s3_retryable,
+        )
+        line = _json_line_from_range(payload, offset=offset, object_size=object_size)
+        if line is None or line in seen_lines:
+            continue
+        try:
+            parsed = json.loads(line.decode("utf-8"))
+        except (UnicodeDecodeError, json.JSONDecodeError):
+            continue
+        seen_lines.add(line)
+        sampled.append(parsed)
+    return sampled
+
+
+def list_s3_jsonl_files(
+    con: "duckdb.DuckDBPyConnection",
+    s3_path: str,
+    *,
+    s3_cfg: Optional[Dict[str, Any]] = None,
+    retry_config: Optional[RetryConfig] = None,
+) -> List[str]:
+    """List the ``.jsonl`` files under ``s3_path``.
+
+    Uses boto3 when ``s3_cfg`` is given, otherwise DuckDB's ``glob`` on
+    ``{s3_path}/*.jsonl`` ordered by file name.
+
+    Raises:
+        FileNotFoundError: if no jsonl file is found.
+    """
+    if s3_cfg:
+        files = list_s3_files_boto3(s3_path, s3_cfg, ".jsonl", retry_config=retry_config)
+    else:
+        base = sql_literal(s3_path.rstrip("/"))
+
+        def _list() -> List[str]:
+            query = f"SELECT file FROM glob('{base}/*.jsonl') ORDER BY file"
+            return [record[0] for record in con.execute(query).fetchall()]
+
+        files = retry_call(_list, retry_config, label="S3 列出 jsonl", retryable=is_s3_retryable)
+    if not files:
+        raise FileNotFoundError(f"未找到 jsonl: {s3_path}")
+    return files
+
+
+def sample_jsonl_rows(
+    con: "duckdb.DuckDBPyConnection",
+    fpath: str,
+    sample_size: int,
+    *,
+    sequential: bool = False,
+    s3_cfg: Optional[Dict[str, Any]] = None,
+    retry_config: Optional[RetryConfig] = None,
+) -> List[Dict[str, Any]]:
+    """Take at most ``sample_size`` rows from a single jsonl file.
+
+    For ``s3://`` paths with ``s3_cfg``, rows are read through boto3: the
+    first N lines when ``sequential`` is set, Range-based random sampling
+    otherwise. All other cases go through DuckDB ``read_json_auto``, with
+    ``ORDER BY random()`` for non-sequential sampling.
+    """
+    if sample_size <= 0:
+        return []
+    if s3_cfg and fpath.startswith("s3://"):
+        sampler = sample_jsonl_rows_sequential_stream if sequential else sample_jsonl_rows_s3_range
+        return sampler(fpath, s3_cfg, sample_size, retry_config=retry_config)
+
+    source = f"SELECT * FROM read_json_auto('{sql_literal(fpath)}')"
+    ordering = "" if sequential else " ORDER BY random()"
+    sql = f"SELECT * FROM ({source}){ordering} LIMIT {int(sample_size)}"
+
+    def _sample() -> List[Dict[str, Any]]:
+        return rows_from_cursor(con.execute(sql))
+
+    return retry_call(
+        _sample,
+        retry_config,
+        label=f"S3 DuckDB 抽样 {jsonl_basename(fpath)}",
+        retryable=is_s3_retryable,
+    )
+
+
+def iter_s3_batches(
+    *,
+    parquet_glob: Optional[str],
+    s3_path: Optional[str],
+    s3_cfg: Optional[Dict[str, Any]],
+    s3_format: str,
+    full: bool,
+    limit: int,
+    batch_size: int,
+    sequential: bool = False,
+    retry_config: Optional[RetryConfig] = None,
+) -> Generator[Tuple[str, List[Dict[str, Any]]], None, None]:
+    """Yield ``(source, rows)`` batches: source is the jsonl path, "parquet", or the local glob."""
+    con = open_duckdb_s3(s3_cfg)  # NOTE(review): never closed in this function — confirm who owns it
+
+    if s3_path:  # an S3 path takes precedence over parquet_glob
+        base = s3_path.rstrip("/")
+        fmt = s3_format if s3_format != "auto" else _detect_s3_format(  # "auto" => detect the format
+            con,
+            base,
+            s3_cfg=s3_cfg,
+            retry_config=retry_config,
+        )
+        if fmt == "jsonl":
+            files = list_s3_jsonl_files(con, base, s3_cfg=s3_cfg, retry_config=retry_config)
+            if full:
+                print(
+                    f"S3 数据格式: jsonl，全量 {len(files)} 个文件，batch_size={batch_size}",
+                    file=sys.stderr,
+                )
+            else:
+                mode = f"顺序前 {limit} 条" if sequential else f"随机 {limit} 条"
+                print(
+                    f"S3 数据格式: jsonl，抽样 {len(files)} 个文件，每文件{mode}",
+                    file=sys.stderr,
+                )
+            for fpath in files:
+                if full:
+                    # Full mode: page through the file with LIMIT/OFFSET queries.
+                    offset = 0
+                    while True:
+                        path_lit = sql_literal(fpath)
+                        off = offset  # snapshot of the loop state read by the closure below
+                        bs = batch_size
+
+                        def _read_batch() -> List[Dict[str, Any]]:
+                            cur = con.execute(
+                                f"SELECT * FROM read_json_auto('{path_lit}') "
+                                f"LIMIT {int(bs)} OFFSET {int(off)}"
+                            )
+                            return rows_from_cursor(cur)
+
+                        rows = retry_call(
+                            _read_batch,
+                            retry_config,
+                            label=f"S3 DuckDB 全量批 {jsonl_basename(fpath)}",
+                            retryable=is_s3_retryable,
+                        )
+                        if not rows:
+                            break
+                        yield fpath, rows
+                        offset += len(rows)
+                        if len(rows) < batch_size:  # a short batch means the file is exhausted
+                            break
+                else:
+                    # Sampling mode: at most ``limit`` rows per file, yielded as one batch.
+                    basename = jsonl_basename(fpath)
+                    print(f"  [抽样] {basename} ...", file=sys.stderr, flush=True)
+                    rows = sample_jsonl_rows(
+                        con,
+                        fpath,
+                        limit,
+                        sequential=sequential,
+                        s3_cfg=s3_cfg,
+                        retry_config=retry_config,
+                    )
+                    if rows:  # files that produced no sample are skipped silently
+                        print(
+                            f"  [抽样] {basename}: {len(rows)} 条",
+                            file=sys.stderr,
+                        )
+                        yield fpath, rows
+            return
+
+        # Any non-jsonl format is read as parquet, recursively under the base path.
+        path_expr = f"'{sql_literal(base)}/**/*.parquet'"
+        sql = f"SELECT * FROM read_parquet({path_expr})"
+        if not full and limit > 0:
+            sql += f" ORDER BY random() LIMIT {int(limit)}"
+
+        def _read_parquet() -> List[Dict[str, Any]]:
+            return rows_from_cursor(con.execute(sql))
+
+        rows = retry_call(_read_parquet, retry_config, label="S3 读 parquet", retryable=is_s3_retryable)
+        yield "parquet", rows  # the whole result is a single batch labelled "parquet"
+        return
+
+    if parquet_glob:
+        reader = "read_json_auto" if parquet_glob.endswith(".jsonl") else "read_parquet"
+        sql = f"SELECT * FROM {reader}('{sql_literal(parquet_glob)}')"
+        if not full and limit > 0:
+            sql += f" ORDER BY random() LIMIT {int(limit)}"
+
+        def _read_glob() -> List[Dict[str, Any]]:
+            return rows_from_cursor(con.execute(sql))
+
+        rows = _read_glob()  # executed directly, without retry_call
+        yield parquet_glob, rows
+        return
+
+    raise ValueError("请指定 --parquet-glob、--s3-path，或提供 s3 配置文件")
+
+
+# ---- osi_verify/db.py ----
+
+
+import sys
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+
+
+@dataclass(frozen=True)
+class TableRef:
+    """Table reference; an Iceberg table reached via StarRocks is catalog.schema.table."""
+
+    catalog: Optional[str]
+    schema: str
+    table: str
+
+    @property
+    def sql_name(self) -> str:
+        if not self.catalog:  # no catalog: backtick-quoted schema.table
+            return f"`{self.schema}`.`{self.table}`"
+        return f"{self.catalog}.{self.schema}.{self.table}"
+
+    @property
+    def display_name(self) -> str:
+        return self.sql_name  # the same text is used for SQL and for display
+
+
+def resolve_table_ref(catalog: Optional[str], schema: str, table: str) -> TableRef:
+    """Build a ``TableRef``, normalising a falsy ``catalog`` to ``None``."""
+    normalized = catalog if catalog else None
+    return TableRef(catalog=normalized, schema=schema, table=table)
+
+
+def discover_table(session: MySQLSession, table_ref: TableRef) -> TableRef:
+    """Probe a catalog table, accept a named table, or auto-discover one by its columns."""
+    if table_ref.catalog:
+        def _ping(conn) -> None:
+            with conn.cursor() as cursor:
+                cursor.execute(f"SELECT 1 FROM {table_ref.sql_name} LIMIT 1")
+
+        session.run(_ping, label="MySQL 探活表")
+        return table_ref
+    if table_ref.table:
+        return table_ref
+
+    def _lookup(conn) -> TableRef:
+        with conn.cursor() as cursor:
+            cursor.execute(
+                """
+                SELECT TABLE_NAME, COUNT(*) AS cnt
+                FROM information_schema.COLUMNS
+                WHERE TABLE_SCHEMA = DATABASE()
+                  AND COLUMN_NAME IN ('origin_id', 'origin_osi')
+                GROUP BY TABLE_NAME
+                HAVING cnt >= 2
+                ORDER BY cnt DESC
+                """
+            )
+            candidates = cursor.fetchall()
+        if not candidates:
+            raise RuntimeError("未能自动发现含 origin_id/origin_osi 的表，请用 --table 指定")
+        return TableRef(catalog=None, schema=table_ref.schema, table=candidates[0]["TABLE_NAME"])
+
+    return session.run(_lookup, label="MySQL 发现表")
+
+
+
+def fetch_mysql_rows_by_ids(
+    session: MySQLSession,
+    table_ref: TableRef,
+    origin_ids: Sequence[Any],
+    origin_osi: str = "arxiv",
+    target_dt: Optional[str] = None,
+) -> Dict[str, Dict[str, Any]]:
+    """Fetch the rows matching ``origin_ids``, keyed by ``str(origin_id)``.
+
+    ``None`` ids are skipped; ``target_dt`` adds a ``dt = %s`` filter when set.
+    """
+    wanted = [str(oid) for oid in origin_ids if oid is not None]
+    if not wanted:
+        return {}
+
+    def _query(conn) -> Dict[str, Dict[str, Any]]:
+        marks = ",".join("%s" for _ in wanted)
+        sql = f"SELECT * FROM {table_ref.sql_name} WHERE origin_osi = %s AND origin_id IN ({marks})"
+        params: List[Any] = [origin_osi] + wanted
+        if target_dt:
+            sql += " AND dt = %s"
+            params.append(target_dt)
+        with conn.cursor() as cursor:
+            cursor.execute(sql, params)
+            return {str(rec["origin_id"]): rec for rec in cursor.fetchall()}
+
+    return session.run(_query, label="MySQL 批量查询")
+
+
+def count_s3_jsonl_lines_boto3(
+    s3_uri: str, s3_cfg: Dict[str, Any], *, retry_config: Optional[RetryConfig] = None
+) -> int:
+    """Count the lines of an S3 jsonl by streaming it with boto3 (no JSON parsing).
+
+    A final line without a trailing newline is counted; an empty object is 0.
+    """
+    bucket, key = parse_s3_uri(s3_uri)
+    client = s3_boto_client(s3_cfg, retry_config=retry_config)
+
+    def _count() -> int:
+        newlines, last_byte = 0, b"\n"  # sentinel: an empty object stays at 0
+        body = client.get_object(Bucket=bucket, Key=key)["Body"]
+        for chunk in body.iter_chunks(chunk_size=16 * 1024 * 1024):
+            newlines, last_byte = newlines + chunk.count(b"\n"), chunk[-1:] or last_byte
+        return newlines + (last_byte != b"\n")  # unterminated last line counts too
+
+    return retry_call(_count, retry_config, label=f"S3 计数 {jsonl_basename(s3_uri)}", retryable=is_s3_retryable)
+
+
+def count_s3_partition(
+    con: "duckdb.DuckDBPyConnection",
+    s3_path: str,
+    files: Optional[List[str]] = None,
+    *,
+    s3_cfg: Optional[Dict[str, Any]] = None,
+    retry_config: Optional[RetryConfig] = None,
+) -> Tuple[int, Dict[str, int]]:
+    """Count the rows of every jsonl file in an S3 partition.
+
+    ``files`` is listed with ``list_s3_jsonl_files`` when empty.  With ``s3_cfg``,
+    ``s3://`` files are counted by the boto3 newline counter; any other file is
+    counted with DuckDB ``read_json_auto``.
+
+    Returns:
+        ``(total_rows, {file_basename: rows})``.
+    """
+    if not files:
+        files = list_s3_jsonl_files(con, s3_path, s3_cfg=s3_cfg, retry_config=retry_config)
+    stream_count = bool(s3_cfg)
+    if stream_count:
+        print("  [S3 计数] 使用 boto3 流式按行计数", file=sys.stderr)
+
+    def _count_one(fpath: str) -> int:
+        if stream_count and fpath.startswith("s3://"):
+            return count_s3_jsonl_lines_boto3(fpath, s3_cfg, retry_config=retry_config)
+        # Fallback: let DuckDB parse the file and count its rows.
+        count_sql = f"SELECT count(*) FROM read_json_auto('{sql_literal(fpath)}')"
+        return retry_call(
+            lambda: int(con.execute(count_sql).fetchone()[0]),
+            retry_config,
+            label=f"S3 DuckDB 计数 {jsonl_basename(fpath)}",
+            retryable=is_s3_retryable,
+        )
+
+    per_file: Dict[str, int] = {}
+    total = 0
+    for fpath in files:
+        rows = _count_one(fpath)
+        name = jsonl_basename(fpath)
+        per_file[name] = rows
+        total += rows
+        print(f"  [S3 计数] {name}: {rows:,} 行", file=sys.stderr)
+    return total, per_file
+
+
+def table_columns(session: MySQLSession, table_ref: TableRef) -> List[str]:
+    """List column names via DESCRIBE (catalog tables) or information_schema."""
+    schema_sql = """
+                SELECT COLUMN_NAME FROM information_schema.COLUMNS
+                WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = %s
+                """
+
+    def _read(conn) -> List[str]:
+        with conn.cursor() as cursor:
+            if table_ref.catalog:
+                cursor.execute(f"DESCRIBE {table_ref.sql_name}")
+                return [rec["Field"] for rec in cursor.fetchall()]
+            cursor.execute(schema_sql, (table_ref.table,))
+            return [rec["COLUMN_NAME"] for rec in cursor.fetchall()]
+
+    return session.run(_read, label="MySQL 读取表结构")
+
+
+def count_mysql_origin(
+    session: MySQLSession,
+    table_ref: TableRef,
+    partition_dt: Optional[str],
+    origin_osi: str = "arxiv",
+) -> Tuple[int, str]:
+    """Count the target-table rows of ``origin_osi`` for one partition.
+
+    The partition filter uses the first present column of dt / partition_dt /
+    data_dt / crawl_dt / batch_dt, else ``DATE(updated)``, else
+    ``DATE(published_date)``; with none of them a warning is printed and only
+    ``origin_osi`` is filtered.  Returns ``(row_count, filter_description)``.
+    """
+    cols = table_columns(session, table_ref)
+    where = "origin_osi = %s"
+    desc = f"origin_osi='{origin_osi}'"
+    params: Tuple[Any, ...] = (origin_osi,)
+
+    # Prefer an explicit partition column, then fall back to date columns.
+    if partition_dt:
+        dt_candidates = ("dt", "partition_dt", "data_dt", "crawl_dt", "batch_dt")
+        dt_col = next((c for c in dt_candidates if c in cols), None)
+        date_col = next((c for c in ("updated", "published_date") if c in cols), None)
+        if dt_col:
+            where += f" AND `{dt_col}` = %s"
+            desc += f" AND {dt_col}='{partition_dt}'"
+        elif date_col:
+            where += f" AND DATE(`{date_col}`) = %s"
+            desc += f" AND DATE({date_col})='{partition_dt}'"
+        else:
+            print(
+                f"[warn] 表 {table_ref.display_name} 无 dt/updated 等分区字段，仅按 origin_osi 统计总数",
+                file=sys.stderr,
+            )
+        if dt_col or date_col:
+            params += (partition_dt,)
+
+    count_sql = f"SELECT COUNT(*) AS n FROM {table_ref.sql_name} WHERE {where}"
+
+    def _run(conn) -> int:
+        with conn.cursor() as cursor:
+            cursor.execute(count_sql, params)
+            return int(cursor.fetchone()["n"])
+
+    return session.run(_run, label="MySQL 统计行数"), desc
+
+
+# ---- osi_verify/report.py ----
+
+
+import json
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+
+
+
+@dataclass
+class ReportContext:
+    """Static description of one verification run, shared by the printed and written reports."""
+
+    target: str
+    target_kind: str
+    transform: str
+    table_name: str
+    origin_osi: str
+    s3_path: str
+    mapping_csv: str
+    config_path: str
+
+    def to_dict(self) -> Dict[str, str]:
+        """Return the fields as a plain dict, in declaration order."""
+        names = (
+            "target", "target_kind",
+            "transform", "table_name",
+            "origin_osi", "s3_path",
+            "mapping_csv", "config_path",
+        )
+        return {name: getattr(self, name) for name in names}
+
+
+@dataclass
+class FileStats:  # per-jsonl-file verification counters
+    total: int = 0  # printed under the "校验" (checked) column
+    pass_n: int = 0  # printed under the "通过" (passed) column
+    fail_n: int = 0  # printed under the "失败" (failed) column
+    miss_n: int = 0  # printed under the "缺失" (missing) column
+
+
+@dataclass
+class CountSummary:  # inputs and results of the S3-vs-MySQL row-count comparison
+    context: ReportContext
+    s3_dt: Optional[str]  # S3 partition dt; a falsy value prints as "(未解析)"
+    target_dt: Optional[str]  # target-table dt; a falsy value prints as "(未指定)"
+    s3_path: str
+    s3_total: int  # total S3 row count
+    s3_per_file: Dict[str, int]  # row count per jsonl file name
+    mysql_total: int  # row count reported for the target table
+    mysql_filter: str  # human-readable description of the MySQL filter
+    checked_rows: int = 0  # rows verified in this run; 0 suppresses the sampling note
+
+
+def print_count_summary(cs: CountSummary) -> None:
+    """Print the S3-vs-MySQL row-count comparison to stdout."""
+    delta = cs.s3_total - cs.mysql_total
+    rule = "=" * 60
+    print("\n" + rule)
+    print("数据总量校验")
+    print(rule)
+    print(f"Target            : {cs.context.target} ({cs.context.target_kind})")
+    print(f"目标表            : {cs.context.table_name}")
+    print(f"origin_osi        : {cs.context.origin_osi}")
+    print(f"映射 CSV          : {cs.context.mapping_csv}")
+    print(f"S3 分区 dt        : {cs.s3_dt or '(未解析)'}")
+    print(f"目标表 dt         : {cs.target_dt or '(未指定)'}")
+    print(f"S3 路径           : {cs.s3_path}")
+    print(f"S3 jsonl 文件数   : {len(cs.s3_per_file)}")
+    print(f"S3 总行数         : {cs.s3_total:,}")
+    for file_name, rows in sorted(cs.s3_per_file.items()):
+        print(f"  - {file_name}: {rows:,}")
+    print(f"MySQL 过滤条件    : {cs.mysql_filter}")
+    print(f"MySQL 行数        : {cs.mysql_total:,}")
+    print(f"S3 - MySQL 差异   : {delta:+,}")
+    if delta == 0:
+        print("  >> 总量一致")
+    else:
+        print("  >> 总量不一致，请检查落库任务是否漏跑/重复或分区字段过滤条件")
+    if cs.checked_rows and cs.checked_rows != cs.s3_total:
+        note = f"  >> 本次仅校验抽样 {cs.checked_rows:,} 条，字段级结果不代表全量（加 --full 做全量字段校验）"
+        print(note)
+
+
+def print_anomaly_table(anomalies: List[RowResult], max_show: int = 50) -> None:
+    """Print up to ``max_show`` anomalous rows as a fixed-width table."""
+    print("\n" + "=" * 60)
+    print(f"落库异常明细（共 {len(anomalies)} 条，展示前 {min(len(anomalies), max_show)} 条）")
+    print("=" * 60)
+    print(f"{'jsonl 文件':<32} {'origin_id':<16}  {'状态':<8}  异常摘要")
+    print("-" * 60)
+    for item in anomalies[:max_show]:
+        # First failure (truncated to 60 chars) or the fixed "missing in DB" message.
+        brief = str(item.failures[0])[:60] if item.failures else "数据库无该 origin_id 记录"
+        print(f"{item.jsonl_file:<32} {str(item.origin_id):<16}  {item.status:<8}  {brief}")
+    hidden = len(anomalies) - max_show
+    if hidden > 0:
+        print(f"... 另有 {hidden} 条，见 --report 文件")
+
+
+SummaryKey = Tuple[str, str, str, str]
+
+
+def _summary_value(v: Any, max_len: int = 500) -> str:
+    """Render ``v`` for the summary ("null", JSON, repr or str), capped at ``max_len`` chars."""
+    if v is None:
+        return "null"
+    plain = repr if isinstance(v, str) else str
+    structured = isinstance(v, (dict, list))
+    text = json.dumps(v, ensure_ascii=False, default=str) if structured else plain(v)
+    return text if len(text) <= max_len else text[: max_len - 3] + "..."
+
+
+def _mismatch_key(mismatch: FieldMismatch) -> SummaryKey:
+    """Build the summary grouping key for one field mismatch."""
+    # Values are rendered (and truncated) so equal-looking diffs share a key.
+    s3_text = _summary_value(mismatch.s3)
+    db_text = _summary_value(mismatch.db)
+    field_name = mismatch.field
+    return ("mismatch", field_name, s3_text, db_text)
+
+
+def _print_summary_key(count: int, key: SummaryKey) -> None:
+    """Print one summary row; mismatch keys also show their S3 and DB values."""
+    kind, label, s3_val, db_val = key
+    is_mismatch = kind == "mismatch"
+    print(f"{count:>8}  {label}:" if is_mismatch else f"{count:>8}  {label}")
+    if is_mismatch:
+        print(f"{'':>8}    S3: {s3_val}")
+        print(f"{'':>8}    DB: {db_val}")
+
+
+def print_anomaly_summary(anomalies: List[RowResult], max_examples: int = 3) -> None:
+    """Print anomaly and warning types ordered by frequency.
+
+    Missing rows, warnings and field mismatches are tallied per type; each
+    type lists up to ``max_examples`` example rows.
+    """
+    print("\n" + "=" * 60)
+    print(f"落库异常/Warning 类型汇总（共 {len(anomalies)} 条记录）")
+    print("=" * 60)
+    if not anomalies:
+        print("无异常")
+        return
+
+    type_counts: Counter[SummaryKey] = Counter()
+    examples: Dict[SummaryKey, List[str]] = defaultdict(list)
+
+    def _tally(key, row) -> None:
+        type_counts[key] += 1
+        bucket = examples[key]
+        if len(bucket) < max_examples:
+            bucket.append(f"{row.jsonl_file} origin_id={row.origin_id}")
+
+    for row in anomalies:
+        if row.missing_in_mysql:
+            _tally(("message", "MySQL 缺失: 数据库无该 origin_id 记录", "", ""), row)
+        for warning in row.warnings:
+            _tally(("message", f"Warning: {warning}", "", ""), row)
+        for mismatch in row.failures:
+            _tally(_mismatch_key(mismatch), row)
+
+    print(f"{'次数':>8}  错误类型")
+    print("-" * 60)
+    for key, count in type_counts.most_common():
+        _print_summary_key(count, key)
+        if max_examples > 0 and examples.get(key):
+            print(f"{'':>8}  样例: {', '.join(examples[key])}")
+    print("\n完整逐条明细请查看 --report 输出的 JSONL 文件")
+
+
+def print_file_stats(stats: Dict[str, FileStats]) -> None:
+    """Print the checked / passed / failed / missing counters of each jsonl file."""
+    print("\n" + "=" * 60)
+    print("按 jsonl 文件统计（本次已校验行）")
+    print("=" * 60)
+    print(f"{'jsonl 文件':<36} {'校验':>8} {'通过':>8} {'失败':>8} {'缺失':>8}")
+    print("-" * 60)
+    for name, s in sorted(stats.items(), key=lambda item: item[0]):
+        print(f"{name:<36} {s.total:>8} {s.pass_n:>8} {s.fail_n:>8} {s.miss_n:>8}")
+
+
+def print_run_context(ctx: ReportContext) -> None:
+    """Print the verification context (target, transform, table, paths) to stdout."""
+    rule = "=" * 60
+    print(f"\n{rule}\n校验上下文\n{rule}")
+    print(f"Target      : {ctx.target} ({ctx.target_kind})")
+    print(f"Transform   : {ctx.transform}")
+    print(f"目标表      : {ctx.table_name}")
+    print(f"origin_osi  : {ctx.origin_osi}")
+    print(f"S3 路径     : {ctx.s3_path or '(local/parquet-glob)'}")
+    print(f"映射 CSV    : {ctx.mapping_csv}")
+    print(f"配置文件    : {ctx.config_path}")
+
+
+def safe_filename_token(value: Any) -> str:  # sanitise ``value`` into a [0-9A-Za-z_-] token
+    text = "unknown" if value in (None, "") else str(value)  # None / "" become "unknown"
+    return re.sub(r"[^0-9A-Za-z_-]+", "_", text).strip("_") or "unknown"  # other runs -> "_"; empty -> "unknown"
+
+
+def default_osi_report_path(target: str, s3_dt: Optional[str], target_dt: Optional[str]) -> Path:
+    dt_tag = f"s3_{safe_filename_token(s3_dt)}_target_{safe_filename_token(target_dt)}"  # partition tag
+    report_dir = REPORT_ROOT / f"meta_paper_data_{safe_filename_token(target)}_{dt_tag}"  # one dir per target + partitions
+    return report_dir / "source_field_mismatch.jsonl"  # default mismatch report file
+
+
+def summary_paths(report_path: Path) -> Tuple[Path, Path]:  # (JSON summary, markdown summary) next to the report
+    return report_path.parent / "summary.json", report_path.parent / "readable_summary.md"
+
+
+REPORT_KEY_LABELS = {  # internal report key -> Chinese label applied by localize_report_keys
+    "report": "报告路径",
+    "context": "校验上下文",
+    "target": "目标名称",
+    "target_kind": "目标类型",
+    "transform": "转换逻辑",
+    "table_name": "目标表",
+    "origin_osi": "来源标识",
+    "s3_path": "S3路径",
+    "mapping_csv": "映射文件",
+    "config_path": "配置文件",
+    "s3_dt": "S3分区",
+    "target_dt": "目标表分区",
+    "partition_dt": "目标表分区",  # same label as target_dt: the two keys merge after localisation
+    "checked": "已校验数",
+    "passed": "通过数",
+    "failed": "失败数",
+    "missing": "目标表缺失数",
+    "warnings": "Warning数量",
+    "count_summary": "Count校验",
+    "s3_total": "S3总行数",
+    "mysql_total": "目标表行数",
+    "diff": "数量差异",
+    "mysql_filter": "目标表过滤条件",
+    "checked_rows": "已校验行数",
+    "s3_file_count": "S3文件数",
+    "file_stats": "文件统计",
+    "status_counts": "状态分布",
+    "field_counts": "字段问题分布",
+    "field_samples": "字段问题样例",
+    "warning_counts": "Warning分布",
+    "warning_samples": "Warning样例",
+    "jsonl_file": "JSONL文件",
+    "jsonl_s3_uri": "JSONL S3路径",
+    "origin_id": "来源ID",
+    "status": "状态",
+    "field_diffs": "字段差异",
+    "field": "字段",
+    "s3": "S3值",
+    "db": "目标表值",
+    "expected": "预期值",
+    "actual": "实际值",
+    "missing_in_mysql": "目标表缺失",
+}
+
+
+def localize_report_keys(value: Any) -> Any:
+    """Recursively rename dict keys to their ``REPORT_KEY_LABELS`` label; lists are walked too."""
+    if isinstance(value, list):
+        return [localize_report_keys(element) for element in value]
+    if not isinstance(value, dict):
+        return value
+    # Keys are stringified; keys without a label are kept as that string.
+    relabel = REPORT_KEY_LABELS.get
+    return {relabel(str(k), str(k)): localize_report_keys(v) for k, v in value.items()}
+
+
+TOP_FIELD_LIMIT = 20  # NOTE(review): not referenced in the visible part of this file — confirm usage
+TOP_SAMPLE_FIELD_LIMIT = 5  # how many of the most frequent fields / warnings keep samples in the summary
+SAMPLES_PER_FIELD = 3  # maximum number of samples collected per field or warning
+
+
+def build_osi_report_summary(
+    *,
+    report_path: Optional[Path],
+    context: ReportContext,
+    s3_dt: Optional[str],
+    target_dt: Optional[str],
+    total: int,
+    ok_n: int,
+    fail_n: int,
+    miss_n: int,
+    warn_n: int,
+    count_summary: Optional[CountSummary],
+    per_file: Dict[str, FileStats],
+    notable_results: List[RowResult],
+) -> Dict[str, Any]:
+    """Aggregate row-level results into the run summary dictionary.
+
+    Statuses, per-field mismatches and warnings are counted over
+    ``notable_results``.  Up to ``SAMPLES_PER_FIELD`` samples are collected per
+    field / warning, and only the ``TOP_SAMPLE_FIELD_LIMIT`` most frequent ones
+    have their samples included in the result.
+
+    Returns:
+        A dict with the totals, count comparison, per-file stats and samples.
+    """
+    status_counts = Counter()
+    field_counts = Counter()
+    warning_counts = Counter()
+    field_samples: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
+    warning_samples: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
+
+    def _keep(store: Dict[str, List[Dict[str, Any]]], key: str, sample: Dict[str, Any]) -> None:
+        # Touching store[key] registers the key even when the sample is dropped.
+        bucket = store[key]
+        if len(bucket) < SAMPLES_PER_FIELD:
+            bucket.append(sample)
+
+    # One pass over the rows fills all three counters and both sample stores.
+    for row in notable_results:
+        origin = {"jsonl_file": row.jsonl_file, "origin_id": row.origin_id}
+        status_counts["WARN" if row.ok else row.status] += 1
+        for warning in row.warnings:
+            warning_counts[warning] += 1
+            _keep(warning_samples, warning, dict(origin))
+        if row.missing_in_mysql:
+            field_counts["missing_in_mysql"] += 1
+            _keep(field_samples, "missing_in_mysql", dict(origin))
+        for mismatch in row.failures:
+            field_counts[mismatch.field] += 1
+            _keep(field_samples, mismatch.field, {**origin, "s3": mismatch.s3, "db": mismatch.db})
+
+    # The count comparison is optional; without it the payload stays None.
+    count_payload = None
+    if count_summary:
+        count_payload = {
+            "s3_dt": count_summary.s3_dt,
+            "target_dt": count_summary.target_dt,
+            "s3_total": count_summary.s3_total,
+            "mysql_total": count_summary.mysql_total,
+            "diff": count_summary.s3_total - count_summary.mysql_total,
+            "mysql_filter": count_summary.mysql_filter,
+            "checked_rows": count_summary.checked_rows,
+            "s3_file_count": len(count_summary.s3_per_file),
+        }
+
+    sorted_field_counts = dict(field_counts.most_common())
+    sorted_warning_counts = dict(warning_counts.most_common())
+    # Both dicts are ordered by frequency, so a prefix of their keys is the top-N.
+    sampled_fields = list(sorted_field_counts)[:TOP_SAMPLE_FIELD_LIMIT]
+    sampled_warnings = list(sorted_warning_counts)[:TOP_SAMPLE_FIELD_LIMIT]
+    file_stats = {
+        name: {
+            "total": s.total,
+            "passed": s.pass_n,
+            "failed": s.fail_n,
+            "missing": s.miss_n,
+        }
+        for name, s in sorted(per_file.items())
+    }
+    return {
+        "report": str(report_path) if report_path else None,
+        "context": context.to_dict(),
+        "s3_dt": s3_dt,
+        "target_dt": target_dt,
+        "partition_dt": target_dt,
+        "checked": total,
+        "passed": ok_n,
+        "failed": fail_n,
+        "missing": miss_n,
+        "warnings": warn_n,
+        "count_summary": count_payload,
+        "file_stats": file_stats,
+        "status_counts": dict(status_counts.most_common()),
+        "field_counts": sorted_field_counts,
+        "field_count_total": len(sorted_field_counts),
+        "field_samples": {name: field_samples[name] for name in sampled_fields},
+        "warning_counts": sorted_warning_counts,
+        "warning_type_total": len(sorted_warning_counts),
+        "warning_samples": {text: warning_samples[text] for text in sampled_warnings},
+    }
+
+
+def write_osi_report_summary(report_path: Path, summary: Dict[str, Any]) -> None:  # writes summary.json + readable_summary.md
+    summary_json_path, summary_md_path = summary_paths(report_path)
+    summary_json_path.parent.mkdir(parents=True, exist_ok=True)
+    with summary_json_path.open("w", encoding="utf-8") as f:
+        json.dump(localize_report_keys(summary), f, ensure_ascii=False, indent=2, default=str)  # JSON uses localized keys
+
+    count_summary = summary.get("count_summary") or {}  # None (count step skipped) is treated as empty
+    lines = [  # the markdown report is assembled line by line from the non-localized summary
+        "# S3 数据到论文源数据表校验报告摘要",
+        "",
+        f"- 目标表: `{summary.get('context', {}).get('table_name')}`",
+        f"- 分区: S3=`{summary.get('s3_dt')}`, 目标表=`{summary.get('target_dt')}`",
+        f"- 结果: 已校验 `{summary.get('checked')}`，通过 `{summary.get('passed')}`，失败 `{summary.get('failed')}`，缺失 `{summary.get('missing')}`",
+        f"- Warning: `{summary.get('warnings')}`",
+        f"- 明细报告: `{summary.get('report')}`",
+        f"- 报告目录: `{Path(str(summary.get('report'))).parent if summary.get('report') else None}`",
+        "",
+        "## Count 校验",
+        "",
+    ]
+    if count_summary:
+        lines.extend(
+            [
+                f"- s3_total: `{count_summary.get('s3_total')}`",
+                f"- mysql_total: `{count_summary.get('mysql_total')}`",
+                f"- diff: `{count_summary.get('diff')}`",
+                f"- checked_rows: `{count_summary.get('checked_rows')}`",
+                f"- mysql_filter: `{count_summary.get('mysql_filter')}`",
+            ]
+        )
+    else:
+        lines.append("- 未执行或已跳过")
+    lines.extend(["", "## 状态分布", ""])
+    for status, count in (summary.get("status_counts") or {}).items():
+        lines.append(f"- `{status}`: {count}")
+    if not summary.get("status_counts"):
+        lines.append("- 无")  # placeholder bullet for an empty section
+    lines.extend(["", "## 字段问题分布", ""])
+    for field, count in (summary.get("field_counts") or {}).items():
+        lines.append(f"- `{field}`: {count}")
+    if not summary.get("field_counts"):
+        lines.append("- 无")
+    lines.extend(["", "## 字段问题样例", ""])
+    for field, samples in (summary.get("field_samples") or {}).items():
+        count = (summary.get("field_counts") or {}).get(field, len(samples))  # fall back to the sample count
+        lines.append(f"### {field} ({count})")
+        lines.append("")
+        for sample in samples:
+            lines.append(
+                f"- origin_id `{sample.get('origin_id')}`, jsonl_file=`{sample.get('jsonl_file')}`"
+            )
+            if "s3" in sample or "db" in sample:  # samples without these keys print no values
+                lines.append(f"  - s3: `{json.dumps(sample.get('s3'), ensure_ascii=False, default=str)}`")
+                lines.append(f"  - db: `{json.dumps(sample.get('db'), ensure_ascii=False, default=str)}`")
+            lines.append("")
+
+    if summary.get("warnings"):  # warning sections are written only when the warning total is truthy
+        lines.extend(["", "## Warning 分布", ""])
+        for warning, count in (summary.get("warning_counts") or {}).items():
+            lines.append(f"- `{warning}`: {count}")
+        if not summary.get("warning_counts"):
+            lines.append("- 无")
+        lines.extend(["", "## Warning 样例", ""])
+        for warning, samples in (summary.get("warning_samples") or {}).items():
+            count = (summary.get("warning_counts") or {}).get(warning, len(samples))
+            lines.append(f"### {warning} ({count})")
+            lines.append("")
+            for sample in samples:
+                lines.append(
+                    f"- origin_id `{sample.get('origin_id')}`, jsonl_file=`{sample.get('jsonl_file')}`"
+                )
+            lines.append("")
+    with summary_md_path.open("w", encoding="utf-8") as f:
+        f.write("\n".join(lines).rstrip() + "\n")  # trailing blank lines collapse to one newline
+
+
+# ---- osi_verify/runner.py ----
+
+
+import json
+import sys
+from argparse import Namespace
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Sequence
+
+
+
+def run_verification(
+    *,
+    args: Namespace,
+    target_config: TargetConfig,
+    mysql_settings: Dict[str, Any],
+    s3_settings: Dict[str, Any],
+    mapping_rules: Sequence[MappingRule],
+    requested_compare_fields: Sequence[str],
+    license_map: Dict[str, str],
+    retry_config: Optional[RetryConfig] = None,
+) -> int:
+    s3_cfg, s3_path = None, args.s3_path
+    if args.parquet_glob:
+        s3_path = None
+    elif args.s3_config.exists() or s3_settings:
+        inline_s3 = {
+            k: v
+            for k, v in s3_settings.items()
+            if k not in {"config_file", "subpath", "format", "path"}
+        }
+        if args.s3_path:
+            inline_s3["default_path"] = args.s3_path
+        s3_cfg = load_s3_config(args.s3_config, inline_s3)
+        s3_path = resolve_s3_path(s3_path or s3_cfg["default_path"], args.s3_subpath)
+        s3_path = apply_s3_dt_to_path(s3_path, args.s3_dt or args.partition_dt)
+        print(f"S3: endpoint={s3_cfg['endpoint']} path={s3_path}", file=sys.stderr)
+    elif not s3_path:
+        print("请指定 --parquet-glob、--s3-path，或提供 s3 配置文件", file=sys.stderr)
+        return 2
+    else:
+        s3_path = apply_s3_dt_to_path(s3_path, args.s3_dt or args.partition_dt)
+
+    row_limit = 0 if args.full else (max(args.limit, 1000) if args.origin_id else args.limit)
+    origin_filter = set(args.origin_id) if args.origin_id else None
+    source_id_field = target_config.source_id_field
+
+    def align_filtered(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        if not origin_filter:
+            return rows
+        return [r for r in rows if str(get_first(r, source_id_field)) in origin_filter]
+
+    s3_dt = extract_partition_dt(args.s3_subpath or s3_path, args.s3_dt or args.partition_dt)
+    target_dt = args.target_dt or args.partition_dt or s3_dt
+
+    def batch_kwargs() -> Dict[str, Any]:
+        return {
+            "parquet_glob": args.parquet_glob,
+            "s3_path": s3_path,
+            "s3_cfg": s3_cfg,
+            "s3_format": args.s3_format,
+            "full": args.full,
+            "limit": row_limit,
+            "batch_size": args.batch_size,
+            "sequential": args.sequential,
+            "retry_config": retry_config,
+        }
+
+    dry_run_context = ReportContext(
+        target=target_config.name,
+        target_kind=target_config.kind,
+        transform=target_config.transform,
+        table_name=resolve_table_ref(
+            args.catalog if args.catalog else None,
+            args.database,
+            args.table,
+        ).display_name,
+        origin_osi=target_config.origin_osi,
+        s3_path=s3_path or args.parquet_glob or "",
+        mapping_csv=str(args.mapping_csv),
+        config_path=str(args.config),
+    )
+
+    if args.dry_run:
+        print_run_context(dry_run_context)
+        shown = 0
+        for src, batch in iter_s3_batches(**batch_kwargs()):
+            batch = align_filtered(batch)
+            if not batch:
+                continue
+            for row in batch:
+                exp = transform_row(row, license_map, target_config.transform)
+                print(
+                    f"\n--- [{shown}] {jsonl_basename(src)} "
+                    f"origin_id={exp.get('origin_id')} ---"
+                )
+                print(json.dumps(exp, ensure_ascii=False, indent=2, default=str))
+                shown += 1
+        print(f"共展示 {shown} 条", file=sys.stderr)
+        return 0
+
+    inline_mysql = {
+        k: v
+        for k, v in mysql_settings.items()
+        if k not in {"config_file", "database", "table"}
+    }
+    mysql_cfg = load_mysql_config(args.mysql_config, inline_mysql)
+    catalog = (args.catalog or mysql_cfg.get("catalog") or "").strip() or None
+    table_ref = resolve_table_ref(catalog, args.database, args.table)
+    if retry_config and retry_config.enabled:
+        print(
+            f"[info] 连接重试已启用: max_attempts={retry_config.max_attempts}, "
+            f"initial_delay={retry_config.initial_delay_sec}s, "
+            f"backoff={retry_config.backoff_factor}x",
+            file=sys.stderr,
+        )
+    mysql_session = MySQLSession(
+        mysql_cfg,
+        args.database,
+        catalog=catalog,
+        retry_config=retry_config,
+    )
+    report_context = ReportContext(
+        target=target_config.name,
+        target_kind=target_config.kind,
+        transform=target_config.transform,
+        table_name=table_ref.display_name,
+        origin_osi=target_config.origin_osi,
+        s3_path=s3_path or args.parquet_glob or "",
+        mapping_csv=str(args.mapping_csv),
+        config_path=str(args.config),
+    )
+    report_path = args.report
+    if report_path is None:
+        report_path = default_osi_report_path(target_config.name, s3_dt, target_dt)
+    report_path.parent.mkdir(parents=True, exist_ok=True)
+    warning_report_path = report_path.parent / "source_field_warning.jsonl"
+    report_fp = report_path.open("w", encoding="utf-8")
+    warning_report_fp = warning_report_path.open("w", encoding="utf-8")
+    ok_n = miss_n = fail_n = warn_n = total = 0
+    anomalies: List[RowResult] = []
+    notable_results: List[RowResult] = []
+    per_file: Dict[str, FileStats] = {}
+    count_summary: Optional[CountSummary] = None
+    track_registry: Dict[str, str] = {}
+    active_compare_fields: Optional[List[str]] = None
+    active_default_empty_field_types = default_empty_field_types_from_rules(mapping_rules)
+
+    try:
+        mysql_session.connect()
+        table_ref = discover_table(mysql_session, table_ref)
+        report_context.table_name = table_ref.display_name
+        mode = f"StarRocks Iceberg catalog={catalog}" if catalog else "MySQL"
+        print(f"使用表 ({mode}): {table_ref.display_name}")
+        print_run_context(report_context)
+
+        if s3_path and not args.skip_count:
+            print("\n正在统计 S3 分区行数（可能较慢）...", file=sys.stderr)
+            con = open_duckdb_s3(s3_cfg)
+            jsonl_files = list_s3_jsonl_files(
+                con,
+                s3_path,
+                s3_cfg=s3_cfg,
+                retry_config=retry_config,
+            )
+            s3_total, s3_per_file = count_s3_partition(
+                con,
+                s3_path,
+                jsonl_files,
+                s3_cfg=s3_cfg,
+                retry_config=retry_config,
+            )
+            mysql_total, mysql_filter = count_mysql_origin(
+                mysql_session,
+                table_ref,
+                target_dt,
+                origin_osi=target_config.origin_osi,
+            )
+            count_summary = CountSummary(
+                context=report_context,
+                s3_dt=s3_dt,
+                target_dt=target_dt,
+                s3_path=s3_path,
+                s3_total=s3_total,
+                s3_per_file=s3_per_file,
+                mysql_total=mysql_total,
+                mysql_filter=mysql_filter,
+            )
+            print_count_summary(count_summary)
+
+        if args.count_only:
+            if count_summary:
+                write_osi_report_summary(
+                    report_path,
+                    build_osi_report_summary(
+                        report_path=report_path,
+                        context=report_context,
+                        s3_dt=s3_dt,
+                        target_dt=target_dt,
+                        total=0,
+                        ok_n=0,
+                        fail_n=0,
+                        miss_n=0,
+                        warn_n=0,
+                        count_summary=count_summary,
+                        per_file=per_file,
+                        notable_results=notable_results,
+                    ),
+                )
+                print(f"\n汇总报告: {summary_paths(report_path)[0]}")
+            return 0 if count_summary and count_summary.s3_total == count_summary.mysql_total else 1
+
+        for src, batch in iter_s3_batches(**batch_kwargs()):
+            batch = align_filtered(batch)
+            if not batch:
+                continue
+            fname = jsonl_basename(src)
+            if fname not in per_file:
+                per_file[fname] = FileStats()
+            ids = [get_first(r, source_id_field) for r in batch]
+            mysql_map = fetch_mysql_rows_by_ids(
+                mysql_session,
+                table_ref,
+                ids,
+                origin_osi=target_config.origin_osi,
+                target_dt=target_dt,
+            )
+            if mysql_map and active_compare_fields is None:
+                active_compare_fields = compare_fields_for_table(
+                    list(next(iter(mysql_map.values())).keys()),
+                    mapping_rules,
+                )
+                skipped = [
+                    f for f in requested_compare_fields
+                    if f not in active_compare_fields
+                ]
+                print(
+                    f"[info] 字段比对共 {len(active_compare_fields)} 列"
+                    + (f"，表无列跳过: {', '.join(skipped)}" if skipped else ""),
+                    file=sys.stderr,
+                )
+            for row in batch:
+                oid = str(get_first(row, source_id_field))
+                result = compare_row(
+                    row,
+                    mysql_map.get(oid),
+                    license_map,
+                    track_registry=track_registry,
+                    compare_fields=active_compare_fields,
+                    default_empty_field_types=active_default_empty_field_types,
+                    transform=target_config.transform,
+                )
+                result.jsonl_file = fname
+                total += 1
+                fs = per_file[fname]
+                fs.total += 1
+                if result.warnings:
+                    warn_n += len(result.warnings)
+                    notable_results.append(result)
+                if result.ok:
+                    ok_n += 1
+                    fs.pass_n += 1
+                elif result.missing_in_mysql:
+                    miss_n += 1
+                    fs.miss_n += 1
+                    anomalies.append(result)
+                    if not result.warnings:
+                        notable_results.append(result)
+                else:
+                    fail_n += 1
+                    fs.fail_n += 1
+                    anomalies.append(result)
+                    if not result.warnings:
+                        notable_results.append(result)
+                if report_fp and not result.ok:
+                    payload: Dict[str, Any] = {
+                        "status": result.status,
+                        "context": report_context.to_dict(),
+                        "s3_dt": s3_dt,
+                        "target_dt": target_dt,
+                        "partition_dt": target_dt,
+                        "jsonl_file": fname,
+                        "jsonl_s3_uri": src,
+                        "origin_id": result.origin_id,
+                    }
+                    if result.failures:
+                        payload["field_diffs"] = [m.to_dict() for m in result.failures]
+                    report_fp.write(json.dumps(localize_report_keys(payload), ensure_ascii=False) + "\n")
+                if warning_report_fp and result.warnings:
+                    warning_payload: Dict[str, Any] = {
+                        "status": "warning",
+                        "context": report_context.to_dict(),
+                        "s3_dt": s3_dt,
+                        "target_dt": target_dt,
+                        "partition_dt": target_dt,
+                        "jsonl_file": fname,
+                        "jsonl_s3_uri": src,
+                        "origin_id": result.origin_id,
+                        "warnings": result.warnings,
+                    }
+                    warning_report_fp.write(
+                        json.dumps(localize_report_keys(warning_payload), ensure_ascii=False) + "\n"
+                    )
+                if args.verbose_failures and (not result.ok or result.warnings):
+                    tag = result.status if not result.ok else "WARN"
+                    print(
+                        f"\n[{tag}] {fname} origin_id={result.origin_id}"
+                    )
+                    for w in result.warnings:
+                        print(f"  ! {w}")
+                    for m in result.failures:
+                        print(f"  - {m}")
+            print(
+                f"[进度] 已校验 {total} 条（通过 {ok_n} / 失败 {fail_n} / 缺失 {miss_n}"
+                f" / warning {warn_n}）  当前: {fname}",
+                file=sys.stderr,
+            )
+
+        if count_summary:
+            count_summary.checked_rows = total
+            print_count_summary(count_summary)
+
+        print("\n" + "=" * 60)
+        print("字段校验汇总")
+        print("=" * 60)
+        print(f"已校验行数 : {total:,}")
+        print(f"通过       : {ok_n:,}")
+        print(f"字段不一致 : {fail_n:,}")
+        print(f"MySQL 缺失 : {miss_n:,}")
+        print(f"Warning     : {warn_n:,}（license 超出可选值，不记为缺陷）")
+
+        print_file_stats(per_file)
+        print_anomaly_summary(notable_results, max_examples=args.max_show)
+
+        if report_fp:
+            print(f"\n完整异常报告: {report_path}")
+        write_osi_report_summary(
+            report_path,
+            build_osi_report_summary(
+                report_path=report_path,
+                context=report_context,
+                s3_dt=s3_dt,
+                target_dt=target_dt,
+                total=total,
+                ok_n=ok_n,
+                fail_n=fail_n,
+                miss_n=miss_n,
+                warn_n=warn_n,
+                count_summary=count_summary,
+                per_file=per_file,
+                notable_results=notable_results,
+            ),
+        )
+        print(f"汇总报告: {summary_paths(report_path)[0]}")
+        count_ok = not count_summary or count_summary.s3_total == count_summary.mysql_total
+        return 0 if fail_n == 0 and miss_n == 0 and count_ok else 1
+    finally:
+        mysql_session.close()
+        if report_fp:
+            report_fp.close()
+        if warning_report_fp:
+            warning_report_fp.close()
+
+
+# ---- osi_verify/cli.py ----
+
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any, Dict, Optional, Sequence
+
+
+
+def _nested(settings: Dict[str, Any], *keys: str, default: Any = None) -> Any:
+    cur: Any = settings
+    for key in keys:
+        if not isinstance(cur, dict) or key not in cur:
+            return default
+        cur = cur[key]
+    return cur
+
+
+def _bool_default(settings: Dict[str, Any], *keys: str, default: bool = False) -> bool:
+    return bool(_nested(settings, *keys, default=default))
+
+
+def _merge_target_s3_settings(
+    global_s3_settings: Dict[str, Any],
+    target_s3_settings: Dict[str, Any],
+) -> Dict[str, Any]:
+    merged = dict(global_s3_settings)
+    for key, value in target_s3_settings.items():
+        if value is not None and value != "":
+            merged[key] = value
+    return merged
+
+
+def _section_dict(settings: Dict[str, Any], key: str) -> Dict[str, Any]:
+    value = settings.get(key)
+    return dict(value) if isinstance(value, dict) else {}
+
+
+def _merged_arxiv_options(
+    settings: Dict[str, Any],
+    section: str,
+    flat_keys: Sequence[str],
+) -> Dict[str, Any]:
+    arxiv_settings = _section_dict(settings, "osi_arxiv")
+    merged = _section_dict(settings, section)
+    merged.update(_section_dict(arxiv_settings, section))
+    for key in flat_keys:
+        if key in arxiv_settings and arxiv_settings[key] is not None:
+            merged[key] = arxiv_settings[key]
+    return merged
+
+
+def main(argv: Optional[Sequence[str]] = None) -> int:  # CLI driver; returns a process exit code
+    config_parser = argparse.ArgumentParser(add_help=False)  # pre-parser: only --config, so settings can seed defaults below
+    config_parser.add_argument("--config", type=Path, default=DEFAULT_SETTINGS_JSON)
+    config_args, remaining_argv = config_parser.parse_known_args(argv)  # remaining_argv: everything the pre-parser did not consume
+    settings = load_settings(config_args.config)
+    target_config = load_arxiv_target_config(settings)
+    mysql_settings = settings.get("mysql", {}) if isinstance(settings.get("mysql", {}), dict) else {}  # non-dict sections are treated as empty
+    global_s3_settings = settings.get("s3", {}) if isinstance(settings.get("s3", {}), dict) else {}
+    s3_settings = _merge_target_s3_settings(global_s3_settings, target_config.s3_settings)  # target-level values win unless None/""
+    run_settings = _merged_arxiv_options(
+        settings,
+        "run",
+        (
+            "limit",
+            "sequential",
+            "full",
+            "batch_size",
+            "dry_run",
+            "origin_ids",
+            "partition_dt",
+            "s3_dt",
+            "target_dt",
+            "skip_count",
+            "count_only",
+            "parquet_glob",
+            "s3_path",
+        ),
+    )
+    report_settings = _merged_arxiv_options(
+        settings,
+        "report",
+        ("report_path", "summary_only", "verbose_failures", "max_show"),
+    )
+    retry_config = load_retry_config(settings)
+
+    parser = argparse.ArgumentParser(description="校验 S3 arxiv 数据到论文源数据表的一致性")  # full parser; defaults come from the settings loaded above
+    parser.add_argument("--config", type=Path, default=config_args.config, help="可选自动化配置文件")  # already consumed by the pre-parser; kept so args.config and --help show it
+    parser.add_argument("--mysql-config", type=Path, default=resolve_project_path(mysql_settings.get("config_file")) or PROJECT_ROOT / "mysql")
+    parser.add_argument("--mapping-csv", type=Path, default=target_config.mapping_csv)
+    parser.add_argument(
+        "--database",
+        default=target_config.database,
+        help="库名（Iceberg 模式下为 schema，如 dws）",
+    )
+    parser.add_argument("--table", default=target_config.table)
+    parser.add_argument(
+        "--catalog",
+        default=target_config.catalog if target_config.catalog is not None else mysql_settings.get("catalog", DEFAULT_ICEBERG_CATALOG),
+        help="StarRocks Iceberg catalog（默认 lakehouse_iceberg）；传空字符串则用原生库连接",
+    )
+    parser.add_argument("--s3-config", type=Path, default=resolve_project_path(s3_settings.get("config_file")) or PROJECT_ROOT / "s3")
+    parser.add_argument("--parquet-glob", default=run_settings.get("parquet_glob"))
+    parser.add_argument("--s3-path", default=target_config.s3_path or s3_settings.get("path") or run_settings.get("s3_path"))  # precedence: target config, then s3 section, then run section
+    parser.add_argument("--s3-subpath", default=target_config.s3_subpath or s3_settings.get("subpath"))
+    parser.add_argument("--s3-format", choices=("auto", "jsonl", "parquet"), default=target_config.s3_format or s3_settings.get("format", "auto"))
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=int(run_settings.get("limit", 200)),  # NOTE(review): an explicit null in the "run" section would make int(None) raise — confirm
+        help="抽样模式：每个 jsonl 随机抽查条数（默认 200）；与 --full 互斥",
+    )
+    parser.add_argument(
+        "--sequential",
+        action="store_true",
+        default=bool(run_settings.get("sequential", False)),
+        help="顺序抽取：每个 jsonl 取文件开头前 N 条（配合 --limit，比随机抽样快）",
+    )
+    parser.add_argument(
+        "--full",
+        action="store_true",
+        default=bool(run_settings.get("full", False)),
+        help="全量：读取分区内全部 jsonl 文件、全部行（分批处理）",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=int(run_settings.get("batch_size", 500)),
+        help="全量模式每批处理条数（默认 500）",
+    )
+    parser.add_argument(
+        "--report",
+        type=Path,
+        default=resolve_project_path(report_settings.get("path") or report_settings.get("report_path")) if (report_settings.get("path") or report_settings.get("report_path")) else None,  # "path" wins over "report_path"; None when neither is set
+        help="将 FAIL/MISSING 记录写入 JSONL 报告文件",
+    )
+    parser.add_argument(
+        "--summary-only",
+        action="store_true",
+        default=bool(report_settings.get("summary_only", False)),
+        help="兼容旧参数：当前默认仅打印汇总，不逐条打印失败详情",
+    )
+    parser.add_argument(
+        "--verbose-failures",
+        action="store_true",
+        default=bool(report_settings.get("verbose_failures", False)),
+        help="逐条打印 FAIL/WARN 明细；默认只打印错误类型汇总",
+    )
+    parser.add_argument("--license-map", type=Path, default=resolve_project_path(settings.get("license_map")) if settings.get("license_map") else None)
+    parser.add_argument("--dry-run", action="store_true", default=bool(run_settings.get("dry_run", False)))
+    parser.add_argument("--origin-id", action="append", default=run_settings.get("origin_ids"))  # NOTE(review): "append" adds CLI values to a list default instead of replacing it — verify this is intended
+    parser.add_argument(
+        "--partition-dt",
+        default=run_settings.get("partition_dt"),
+        help="兼容旧参数：同时作为 S3 分区和目标表分区默认值；建议改用 --s3-dt / --target-dt",
+    )
+    parser.add_argument(
+        "--s3-dt",
+        default=run_settings.get("s3_dt"),
+        help="S3 数据分区日期；默认从 S3 path 中的 dt= 解析",
+    )
+    parser.add_argument(
+        "--target-dt",
+        default=run_settings.get("target_dt"),
+        help="论文源数据表 dt；默认沿用 --partition-dt，未指定时再沿用 S3 dt",
+    )
+    parser.add_argument(
+        "--skip-count",
+        action="store_true",
+        default=bool(run_settings.get("skip_count", False)),
+        help="跳过 S3/MySQL 总量统计（大分区计数较慢）",
+    )
+    parser.add_argument(
+        "--count-only",
+        action="store_true",
+        default=bool(run_settings.get("count_only", False)),
+        help="仅做总量统计，不做字段级校验",
+    )
+    parser.add_argument(
+        "--max-show",
+        type=int,
+        default=int(report_settings.get("max_show", 3)),
+        help="错误类型汇总中每类最多展示多少个样例 origin_id",
+    )
+    parser.add_argument(
+        "--retry-max-attempts",
+        type=int,
+        default=retry_config.max_attempts,
+        help="连接/查询失败时的最大重试次数（含首次，默认 3）",
+    )
+    parser.add_argument(
+        "--retry-initial-delay",
+        type=float,
+        default=retry_config.initial_delay_sec,
+        help="重试初始等待秒数（默认 1.0）",
+    )
+    parser.add_argument(
+        "--no-retry",
+        action="store_true",
+        help="禁用数据库与 S3 连接重试",
+    )
+    args = parser.parse_args(remaining_argv)  # --config itself was already consumed by the pre-parser
+    if args.summary_only and "--verbose-failures" not in remaining_argv:  # summary-only wins unless the literal flag was typed on the CLI
+        args.verbose_failures = False
+    if args.full:
+        print("[info] 全量模式：读取分区内全部 jsonl，忽略 --limit", file=sys.stderr)
+    if args.full and args.dry_run:
+        print("[warn] 全量 dry-run 可能极慢，建议加 --summary-only", file=sys.stderr)
+
+    if not args.mapping_csv.exists():
+        print(f"映射文件不存在: {args.mapping_csv}", file=sys.stderr)
+        return 2  # exit code 2: mapping CSV is missing
+
+    try:
+        mapping_rules = load_mapping_rules(
+            args.mapping_csv,
+            target_column=target_config.mapping_target_column,
+            source_column=target_config.mapping_source_column,
+        )
+    except ValueError as e:  # ValueError from the loader: print the message and exit with code 2
+        print(str(e), file=sys.stderr)
+        return 2
+    requested_compare_fields = compare_fields_from_rules(mapping_rules)
+    print(
+        f"目标 target={target_config.name} kind={target_config.kind} "
+        f"transform={target_config.transform}"
+    )
+    print(
+        f"已加载映射规则 {len(mapping_rules)} 条，启用字段校验 {len(requested_compare_fields)} 列"
+    )
+    license_map = dict(DEFAULT_LICENSE_MAP)  # copy so the module-level default mapping is not mutated
+    if args.license_map:
+        license_map.update(json.loads(args.license_map.read_text(encoding="utf-8")))  # entries from the JSON file override defaults
+    effective_retry = RetryConfig(
+        enabled=not args.no_retry and retry_config.enabled,  # on only if neither --no-retry nor the config disabled it
+        max_attempts=args.retry_max_attempts,
+        initial_delay_sec=args.retry_initial_delay,
+        backoff_factor=retry_config.backoff_factor,
+        max_delay_sec=retry_config.max_delay_sec,
+    )
+    try:
+        return run_verification(
+            args=args,
+            target_config=target_config,
+            mysql_settings=mysql_settings,
+            s3_settings=s3_settings,
+            mapping_rules=mapping_rules,
+            requested_compare_fields=requested_compare_fields,
+            license_map=license_map,
+            retry_config=effective_retry,
+        )
+    except Exception as exc:  # top-level boundary: only errors is_s3_retryable() accepts become exit code 2
+        if is_s3_retryable(exc):
+            print(
+                f"\n[S3 ERROR] {type(exc).__name__}: {exc}\n"
+                "S3 连接重试已耗尽。若是内网 Ceph HTTPS 偶发断连，可重跑；"
+                "抽样校验建议加 --sequential --skip-count 减少 HEAD/Range 请求。"
+                "如果 endpoint 支持 HTTP，可在 evaluator parameters 中设置 use_ssl=false。",
+                file=sys.stderr,
+            )
+            return 2
+        raise  # every other exception propagates unchanged
+
+
+
+def init_config(path: Path) -> int:
+    target = path.expanduser()
+    if not target.is_absolute():
+        target = Path.cwd() / target
+    target.parent.mkdir(parents=True, exist_ok=True)
+    source = PROJECT_ROOT / "config" / "settings.template.json"
+    target.write_text(source.read_text(encoding="utf-8"), encoding="utf-8")
+    print(f"created config template: {target}")
+    return 0
+
+
+def arxiv_entry(argv: Optional[Sequence[str]] = None) -> int:
+    args = list(sys.argv[1:] if argv is None else argv)
+    if "--init-config" in args:
+        idx = args.index("--init-config")
+        if idx + 1 < len(args) and not args[idx + 1].startswith("-"):
+            return init_config(Path(args[idx + 1]))
+        return init_config(DEFAULT_SETTINGS_JSON)
+    return main(args)
+
+
+from dingo.config.input_args import EvaluatorRuleArgs
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail, QualityLabel
+from dingo.model.model import Model
+from dingo.model.rule.base import BaseRule
+from dingo.model.rule.scibase.report_utils import (
+    bool_param,
+    int_param,
+    s3_path_from_dingo,
+    write_temp_settings,
+)
+
+
+def _dingo_append_cli_option(argv: list[str], flag: str, value: Any) -> None:
+    if value is not None and value != "":
+        argv.extend([flag, str(value)])
+
+
+def _dingo_append_cli_flag(argv: list[str], flag: str, enabled: bool) -> None:
+    if enabled:
+        argv.append(flag)
+
+
+def _dingo_append_origin_ids(argv: list[str], value: Any) -> None:
+    if value is None or value == "":
+        return
+    if isinstance(value, (list, tuple, set)):
+        for item in value:
+            _dingo_append_cli_option(argv, "--origin-id", item)
+        return
+    _dingo_append_cli_option(argv, "--origin-id", value)
+
+
+@Model.rule_register(
+    "QUALITY_BAD_EFFECTIVENESS",
+    ["sci_base_qa_test", "meta_paper_data"],
+)
+class RuleSciBaseMetaPaperDataReport(BaseRule):  # dingo rule that runs this module's CLI main() in-process and reports via EvalDetail
+    _metric_info = {
+        "category": "Rule-Based Metadata Quality Metrics",
+        "quality_dimension": "EFFECTIVENESS",
+        "metric_name": "RuleSciBaseMetaPaperDataReport",
+        "description": "Run SciBase S3 paper source-data validation and write reports.",
+        "paper_title": "",
+        "paper_url": "",
+        "paper_authors": "",
+        "evaluation_results": "",
+    }
+
+    _required_fields = [RequiredField.METADATA]
+    dynamic_config = EvaluatorRuleArgs(parameters={})  # eval() reads all of its options from dynamic_config.parameters
+
+    @classmethod
+    def eval(cls, input_data: Data) -> EvalDetail:  # builds an argv from parameters, runs main(), maps the exit code to an EvalDetail
+        del input_data  # the sample itself is unused; everything comes from the parameters
+        params = cls.dynamic_config.parameters or {}
+        config_path = write_temp_settings(params, include_s3=True)  # NOTE(review): presumably a temp file; nothing here removes it — confirm in report_utils
+        report_path = Path(params["report_path"]) if params.get("report_path") else None
+        if report_path is None and params.get("output_dir"):  # fall back to <output_dir>/source_field_mismatch.jsonl
+            report_path = Path(str(params["output_dir"])) / "source_field_mismatch.jsonl"
+
+        s3_path = s3_path_from_dingo(params)
+        parquet_glob = params.get("parquet_glob")
+        if not s3_path and not parquet_glob:  # at least one data source is required
+            raise RuntimeError(
+                "S3 path is required for RuleSciBaseMetaPaperDataReport. "
+                "Set evaluator config parameters.s3_path, or run with dataset.source=s3 "
+                "so input_path and dataset.s3_config.s3_bucket can be combined."
+            )
+
+        argv = [  # options that are always passed, with hard-coded fallbacks
+            "--config",
+            str(config_path),
+            "--mapping-csv",
+            str(params.get("mapping_csv") or ASSETS_DIR / "osi_arxiv_mapping.csv"),
+            "--database",
+            str(params.get("database") or "dws"),
+            "--table",
+            str(params.get("target_table") or params.get("table") or "dws_meta_paper_data_acc_d"),
+        ]
+        catalog = params.get("catalog", DEFAULT_ICEBERG_CATALOG)
+        _dingo_append_cli_option(argv, "--catalog", catalog)  # the helper drops None/"", so an empty catalog is never forwarded
+        _dingo_append_cli_option(argv, "--s3-path", s3_path)
+        _dingo_append_cli_option(argv, "--s3-subpath", params.get("s3_subpath"))
+        _dingo_append_cli_option(argv, "--s3-format", params.get("s3_format"))
+        _dingo_append_cli_option(argv, "--parquet-glob", parquet_glob)
+        _dingo_append_cli_option(argv, "--partition-dt", params.get("partition_dt"))
+        _dingo_append_cli_option(argv, "--s3-dt", params.get("s3_dt"))
+        _dingo_append_cli_option(argv, "--target-dt", params.get("target_dt"))
+        _dingo_append_cli_option(argv, "--limit", int_param(params, "limit", 200))
+        _dingo_append_cli_option(argv, "--batch-size", int_param(params, "batch_size", 500))
+        _dingo_append_cli_option(argv, "--max-show", int_param(params, "max_show", 3))
+        _dingo_append_cli_option(argv, "--report", report_path)
+        _dingo_append_cli_option(argv, "--license-map", params.get("license_map"))
+        _dingo_append_cli_option(argv, "--retry-max-attempts", params.get("retry_max_attempts"))
+        _dingo_append_cli_option(argv, "--retry-initial-delay", params.get("retry_initial_delay"))
+        _dingo_append_origin_ids(argv, params.get("origin_id") or params.get("origin_ids"))  # "origin_id" takes precedence over "origin_ids"
+
+        _dingo_append_cli_flag(argv, "--sequential", bool_param(params, "sequential", False))
+        _dingo_append_cli_flag(argv, "--full", bool_param(params, "full", False))
+        _dingo_append_cli_flag(argv, "--dry-run", bool_param(params, "dry_run", False))
+        _dingo_append_cli_flag(argv, "--skip-count", bool_param(params, "skip_count", False))
+        _dingo_append_cli_flag(argv, "--count-only", bool_param(params, "count_only", False))
+        _dingo_append_cli_flag(argv, "--summary-only", bool_param(params, "summary_only", False))
+        _dingo_append_cli_flag(argv, "--verbose-failures", bool_param(params, "verbose_failures", False))
+        _dingo_append_cli_flag(argv, "--no-retry", bool_param(params, "no_retry", False))
+
+        exit_code = main(argv)  # run the CLI in-process with the assembled arguments
+        reason = [
+            f"exit_code={exit_code}",
+            str(report_path.parent if report_path else REPORT_ROOT),  # directory where the reports are expected
+        ]
+        if exit_code != 0:  # any non-zero exit code is reported with status=True and this rule's label
+            return EvalDetail(
+                metric=cls.__name__,
+                status=True,
+                label=[f"{cls.metric_type}.{cls.__name__}"],
+                reason=reason,
+            )
+        return EvalDetail(metric=cls.__name__, label=[QualityLabel.QUALITY_GOOD], reason=reason)  # exit code 0
+
+
+if __name__ == "__main__":  # run as a script: exit with arxiv_entry()'s return code
+    raise SystemExit(arxiv_entry())
diff --git a/dingo/model/rule/scibase/meta_paper_unique.py b/dingo/model/rule/scibase/meta_paper_unique.py
new file mode 100644
index 00000000..90ae8795
--- /dev/null
+++ b/dingo/model/rule/scibase/meta_paper_unique.py
@@ -0,0 +1,2278 @@
+#!/usr/bin/env python3
+"""Self-contained meta_paper unique DB validator.
+
+Field aggregation rules are driven by assets/paper_unique_mapping.csv (see DEFAULT_MAPPING_CSV).
+"""
+from __future__ import annotations
+
+import csv
+import argparse
+import html
+import json
+import re
+import sys
+import time
+from collections import Counter
+from dataclasses import dataclass
+from datetime import date, datetime
+from decimal import Decimal, InvalidOperation
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+
+# Strategies whose fields are left out of the list returned by
+# output_fields_from_rules().
+SKIP_COMPARE_STRATEGIES = frozenset({"random_pick_cls"})
+# Strategies whose fields are returned by
+# order_insensitive_fields_from_rules().
+ORDER_INSENSITIVE_COMPARE_STRATEGIES = frozenset(
+    {"dedup_array", "dedup_map", "dedup_struct", "dedup_locations"}
+)
+# Aggregation handler signature, as invoked by aggregate_by_rules():
+# (source records, field rule, fields aggregated so far) -> field value.
+StrategyHandler = Callable[[List[Dict[str, Any]], "FieldRule", Dict[str, Any]], Any]
+
+
+@dataclass
+class FieldRule:
+    """Aggregation rule for one output field, as loaded by load_field_rules().
+
+    Every attribute is taken from one column of a mapping-CSV row; ``params``
+    is the already-parsed form of the strategy-parameter column.
+    """
+
+    # Name of the output field this rule produces.
+    field_name: str
+    # Declared data type text from the mapping file.
+    data_type: str
+    # Name of the aggregation strategy (key into the handler table).
+    strategy: str
+    # Parsed strategy parameters.
+    params: Dict[str, Any]
+    # Optional source column name; empty when it equals ``field_name``.
+    source_field: str
+    # Free-text description of the aggregation logic.
+    description: str
+
+    @property
+    def effective_source(self) -> str:
+        """Return ``source_field`` when it is set, otherwise ``field_name``."""
+        if self.source_field:
+            return self.source_field
+        return self.field_name
+
+
+def _parse_params(raw: str) -> Dict[str, Any]:
+    """Parse a ``key=value;key=value`` parameter string into a dict.
+
+    Pairs are separated by ``;`` and split on the first ``=``; pairs without
+    ``=`` are ignored. Keys and values are stripped of surrounding whitespace.
+    Values are converted as follows:
+
+    * ``true`` / ``false`` (any case) become ``bool``;
+    * digit strings with optional leading ``-`` become ``int``;
+    * everything else is kept as ``str``.
+
+    Returns an empty dict for an empty ``raw``.
+    """
+    params: Dict[str, Any] = {}
+    if not raw:
+        return params
+    for pair in raw.split(";"):
+        key, sep, val = pair.partition("=")
+        if not sep:
+            continue
+        key, val = key.strip(), val.strip()
+        lowered = val.lower()
+        if lowered == "true":
+            params[key] = True
+        elif lowered == "false":
+            params[key] = False
+        elif val.lstrip("-").isdigit():
+            # isdigit() also accepts inputs int() rejects (e.g. "--5" after
+            # the lstrip, or superscript digits); keep those as plain text
+            # instead of failing the whole mapping load.
+            try:
+                params[key] = int(val)
+            except ValueError:
+                params[key] = val
+        else:
+            params[key] = val
+    return params
+
+
+def load_field_rules(
+    path: Path,
+    *,
+    field_column: str = "字段名",
+    type_column: str = "数据类型",
+    strategy_column: str = "聚合策略",
+    params_column: str = "策略参数",
+    source_column: str = "源字段名",
+    desc_column: str = "去重 / 聚合处理逻辑",
+) -> List[FieldRule]:
+    """Load field aggregation rules from a mapping CSV.
+
+    The file is read as UTF-8 (a BOM is tolerated) with a header row. The
+    keyword-only ``*_column`` arguments name the header columns to read.
+    Rows whose field-name cell is empty are skipped; every other cell is
+    stripped, and missing cells are treated as empty strings.
+
+    Raises:
+        ValueError: if the header is missing or lacks ``field_column``.
+    """
+
+    def cell(row: Dict[str, Any], column: str) -> str:
+        # Missing/None cells collapse to "" before stripping.
+        return (row.get(column) or "").strip()
+
+    with path.open(encoding="utf-8-sig", newline="") as f:
+        reader = csv.DictReader(f)
+        header = reader.fieldnames or []
+        if field_column not in header:
+            available = ", ".join(fn for fn in header if fn.strip())
+            raise ValueError(
+                f"映射文件 {path} 缺少字段列 {field_column!r}（可用列: {available}）"
+            )
+        return [
+            FieldRule(
+                field_name=cell(row, field_column),
+                data_type=cell(row, type_column),
+                strategy=cell(row, strategy_column),
+                params=_parse_params(cell(row, params_column)),
+                source_field=cell(row, source_column),
+                description=cell(row, desc_column),
+            )
+            for row in reader
+            if cell(row, field_column)
+        ]
+
+
+def output_fields_from_rules(rules: Sequence[FieldRule]) -> List[str]:
+    """Return rule field names in order, omitting SKIP_COMPARE_STRATEGIES rules."""
+    fields: List[str] = []
+    for rule in rules:
+        if rule.strategy in SKIP_COMPARE_STRATEGIES:
+            continue
+        fields.append(rule.field_name)
+    return fields
+
+
+def order_insensitive_fields_from_rules(rules: Sequence[FieldRule]) -> set:
+    """Return the set of field names whose strategy is order-insensitive.
+
+    A field qualifies when its rule's strategy is a member of
+    ORDER_INSENSITIVE_COMPARE_STRATEGIES.
+    """
+    names = set()
+    for rule in rules:
+        if rule.strategy in ORDER_INSENSITIVE_COMPARE_STRATEGIES:
+            names.add(rule.field_name)
+    return names
+
+
+def aggregate_by_rules(
+    records: List[Dict[str, Any]],
+    rules: Sequence[FieldRule],
+    handlers: Dict[str, StrategyHandler],
+) -> Dict[str, Any]:
+    """Aggregate ``records`` into one dict by applying each rule's handler.
+
+    Rules are processed in order. Each handler receives the records, the
+    rule, and the dict of fields aggregated so far, so later rules can read
+    the results of earlier ones.
+
+    Raises:
+        ValueError: if a rule names a strategy with no handler.
+    """
+    aggregated: Dict[str, Any] = {}
+    for rule in rules:
+        handler = handlers.get(rule.strategy)
+        if handler is not None:
+            aggregated[rule.field_name] = handler(records, rule, aggregated)
+            continue
+        raise ValueError(
+            f"Unknown aggregation strategy {rule.strategy!r} for field {rule.field_name!r}"
+        )
+    return aggregated
+
+try:
+    import pymysql
+except ImportError:  # pragma: no cover - runtime dependency check
+    pymysql = None  # type: ignore
+
+
+# Calendar year captured once at import time; used as the upper bound when
+# validating years (see year_from_date_str and the CURRENT_YEAR max_val param).
+CURRENT_YEAR = datetime.now().year
+# Directory containing this module; bundled files live in its "assets" folder.
+PROJECT_ROOT = Path(__file__).resolve().parent
+ASSETS_DIR = PROJECT_ROOT / "assets"
+# Relative path: resolved against the process working directory when opened.
+DEFAULT_CONFIG_PATH = Path("sci_base_qa_test_config.json")
+TEMPLATE_CONFIG_PATH = ASSETS_DIR / "settings.template.json"
+# Mapping CSV that drives the field aggregation rules.
+DEFAULT_MAPPING_CSV = ASSETS_DIR / "paper_unique_mapping.csv"
+# Relative root directory under which report folders are created.
+REPORT_ROOT = Path("report")
+DEFAULT_SOURCE_TABLE = "dws_meta_paper_data_acc_d"
+DEFAULT_TARGET_TABLE = "dws_meta_paper_doi_unique_acc_d"
+# Regex fragment using POSIX character classes, for use in SQL.
+# NOTE(review): appears to mirror the delimiter set in normalize_doi();
+# confirm against the queries that embed it.
+DOI_KEY_SQL_PATTERN = r'(10\.[^[:space:]<>"&;]+|[^[:space:]<>"&;]+)'
+
+
+def safe_filename_token(value: Optional[Any]) -> str:
+    """Turn ``value`` into a token that is safe to embed in a file name.
+
+    ``None`` and ``""`` become ``"all"``. Otherwise every run of characters
+    outside ``[0-9A-Za-z_-]`` is replaced by one underscore and leading and
+    trailing underscores are removed; an empty result also becomes ``"all"``.
+    """
+    if value in (None, ""):
+        text = "all"
+    else:
+        text = str(value)
+    token = re.sub(r"[^0-9A-Za-z_-]+", "_", text).strip("_")
+    return token if token else "all"
+
+
+def default_report_path(dt: Optional[str], sample_mode: str, full: bool) -> Path:
+    """Build the default path of the mismatch detail report.
+
+    The report lives under REPORT_ROOT in a directory named after the
+    partition ``dt`` and the mode; the mode is ``"full"`` when ``full`` is
+    true and ``sample_mode`` otherwise. Both parts are sanitized with
+    safe_filename_token().
+    """
+    mode = sample_mode
+    if full:
+        mode = "full"
+    dt_token = safe_filename_token(dt)
+    mode_token = safe_filename_token(mode)
+    report_dir = REPORT_ROOT / f"meta_paper_unique_dt_{dt_token}_{mode_token}"
+    return report_dir / "source_field_mismatch.jsonl"
+
+
+def _json_inline(value: Any) -> str:
+    """Serialize ``value`` as single-line JSON for embedding in the summary.
+
+    Non-ASCII characters are kept as-is, and JsonEncoder handles the types
+    the stock encoder rejects.
+    """
+    encoder = JsonEncoder(ensure_ascii=False)
+    return encoder.encode(value)
+
+
+def summary_paths(report_path: Path) -> Tuple[Path, Path]:
+    """Return the (summary.json, readable_summary.md) paths beside ``report_path``."""
+    report_dir = report_path.parent
+    json_path = report_dir / "summary.json"
+    markdown_path = report_dir / "readable_summary.md"
+    return json_path, markdown_path
+
+
+# Display labels (Chinese) for the internal keys of the summary structure.
+# localize_report_keys() swaps keys for these labels before summary.json is
+# written; keys without an entry here are emitted unchanged.
+REPORT_KEY_LABELS = {
+    "report": "报告路径",
+    "total_problem_rows": "问题记录数",
+    "result": "校验结果",
+    "status_counts": "状态分布",
+    "field_counts": "字段问题分布",
+    "field_samples": "字段问题样例",
+    "key": "键值",
+    "dt": "分区日期",
+    "source_count": "源表记录数",
+    "status": "状态",
+    "expected": "预期值",
+    "actual": "实际值",
+    "kind": "校验类型",
+    "source_table": "源表",
+    "target_table": "目标表",
+    "key_field": "去重键字段",
+    "validated_partitions": "已校验分区",
+    "sample_mode": "抽样模式",
+    "sample_size": "抽样数量",
+    "dt_check": "分区检查",
+    "checked": "已校验数",
+    "passed": "通过数",
+    "failed": "失败数",
+    "missing_source": "源表缺失数",
+    "missing_target": "目标表缺失数",
+    "source_count_buckets": "源表记录数分桶",
+    "missing_samples": "缺失样例",
+    "source_records": "源表记录",
+    "target_records": "目标表记录",
+    "report_path": "报告路径",
+    "sample_mismatches": "问题样例",
+    "mismatches": "字段差异",
+    "source_count_mode": "源表计数模式",
+    "source_failed_buckets": "源表计数失败分桶",
+    "count_mismatches": "数量不一致明细",
+    "count_check": "数量校验",
+    "mismatch_count": "数量不一致数",
+    "failed_bucket_count": "计数失败分桶数",
+    "difference": "目标表多出记录数",
+    "source_dt_count": "源表分区数",
+    "target_dt_count": "目标表分区数",
+    "missing_in_target": "目标表缺失分区",
+    "extra_in_target": "目标表多余分区",
+    "source_distinct_skipped": "源表去重计数已跳过",
+    "matched_key_count": "源表目标表共同 DOI 数",
+    "source_missing_in_target_key_count": "元数据有目标无",
+    "target_extra_key_count": "目标有元数据无",
+    "key_gap_failed": "key 覆盖统计失败",
+}
+
+
+def localize_report_keys(value: Any) -> Any:
+    """Recursively replace dict keys with their REPORT_KEY_LABELS label.
+
+    Dicts and lists are rebuilt (nested values are localized too); keys are
+    stringified and left as-is when no label exists. Any other value is
+    returned unchanged.
+    """
+    if isinstance(value, list):
+        return [localize_report_keys(item) for item in value]
+    if not isinstance(value, dict):
+        return value
+    localized: Dict[str, Any] = {}
+    for key, val in value.items():
+        name = str(key)
+        localized[REPORT_KEY_LABELS.get(name, name)] = localize_report_keys(val)
+    return localized
+
+
+# NOTE(review): not referenced in the report helpers below; confirm it is
+# used elsewhere in the file.
+TOP_FIELD_LIMIT = 20
+# Only this many of the most frequently mismatching fields keep samples in
+# the summary (see build_report_summary).
+TOP_SAMPLE_FIELD_LIMIT = 5
+# Maximum number of samples collected per field, and of missing-row samples.
+SAMPLES_PER_FIELD = 3
+
+
+def compact_record_for_report(record: Dict[str, Any]) -> Dict[str, Any]:
+    """Reduce a record to a fixed set of identifying fields for the report.
+
+    Only the listed keys are considered, in this order, and a key is kept
+    only when its value is not ``None``, ``""``, ``[]`` or ``{}``.
+    """
+    compact: Dict[str, Any] = {}
+    for key in (
+        "track_id",
+        "origin_osi",
+        "origin_id",
+        "title",
+        "published_year",
+        "published_date",
+        "venue_name",
+    ):
+        value = record.get(key)
+        if value in (None, "", [], {}):
+            continue
+        compact[key] = value
+    return compact
+
+
+def compact_records_for_report(records: Any) -> Any:
+    """Compact and de-duplicate a list of records for the report.
+
+    A non-list input is returned unchanged. Non-dict entries are dropped;
+    each dict is reduced with compact_record_for_report() and only the first
+    occurrence of each distinct compact record is kept, in input order.
+    """
+    if not isinstance(records, list):
+        return records
+    # Insertion-ordered dict keyed by the sorted-key JSON form of the record.
+    unique: Dict[str, Dict[str, Any]] = {}
+    for record in records:
+        if isinstance(record, dict):
+            compact = compact_record_for_report(record)
+            marker = json.dumps(compact, ensure_ascii=False, sort_keys=True, cls=JsonEncoder)
+            unique.setdefault(marker, compact)
+    return list(unique.values())
+
+
+def compact_dt_check(dt_check: Optional[Dict[str, Any]]) -> Dict[str, Any]:
+    """Condense the partition/count check result for the summary.
+
+    Each ``count_mismatches`` entry is reduced to its partition, the two
+    counts, and ``difference`` (target minus source, or ``None`` when either
+    count is missing). The failed-bucket list is replaced by its length, and
+    the optional key-coverage counters are copied only when present.
+    ``None`` is treated like an empty dict.
+    """
+    check = dt_check or {}
+
+    def _mismatch_row(item: Dict[str, Any]) -> Dict[str, Any]:
+        src = item.get("source_key_count")
+        tgt = item.get("target_row_count")
+        if src is None or tgt is None:
+            diff = None
+        else:
+            diff = int(tgt) - int(src)
+        return {
+            "dt": item.get("dt"),
+            "source_key_count": src,
+            "target_row_count": tgt,
+            "difference": diff,
+        }
+
+    mismatches = [_mismatch_row(item) for item in check.get("count_mismatches") or []]
+    compact = {
+        "source_count_mode": check.get("source_count_mode"),
+        "source_distinct_skipped": check.get("source_distinct_skipped"),
+        "failed_bucket_count": len(check.get("source_failed_buckets") or []),
+        "mismatch_count": len(mismatches),
+        "count_mismatches": mismatches,
+        "missing_in_target": check.get("missing_in_target") or [],
+        "extra_in_target": check.get("extra_in_target") or [],
+    }
+    optional_keys = (
+        "matched_key_count",
+        "source_missing_in_target_key_count",
+        "target_extra_key_count",
+        "key_gap_failed",
+    )
+    compact.update((key, check.get(key)) for key in optional_keys if key in check)
+    return compact
+
+
+def build_report_summary(
+    report_path: Path,
+    result: Dict[str, Any],
+    mismatch_rows: Sequence[Dict[str, Any]],
+) -> Dict[str, Any]:
+    """Build the summary structure for a validation run.
+
+    The summary contains the report path, the number of problem rows, the
+    run ``result`` (without ``sample_mismatches`` and ``dt_check``), the
+    condensed count check, per-status and per-field problem counts ordered
+    by frequency, up to SAMPLES_PER_FIELD samples for each of the
+    TOP_SAMPLE_FIELD_LIMIT most frequent fields, and up to SAMPLES_PER_FIELD
+    samples of rows missing on either side.
+    """
+
+    def _row_header(row: Dict[str, Any]) -> Dict[str, Any]:
+        # Identifying columns shared by both kinds of sample.
+        return {name: row.get(name) for name in ("key", "dt", "source_count", "status")}
+
+    status_counts: Counter = Counter()
+    field_counts: Counter = Counter()
+    field_samples: Dict[str, List[Dict[str, Any]]] = {}
+    missing_samples: List[Dict[str, Any]] = []
+    for row in mismatch_rows:
+        status_counts[str(row.get("status") or "unknown")] += 1
+        is_missing = row.get("status") in ("missing_target", "missing_source")
+        if is_missing and len(missing_samples) < SAMPLES_PER_FIELD:
+            sample = _row_header(row)
+            sample["source_records"] = compact_records_for_report(row.get("source_records"))
+            sample["target_records"] = compact_records_for_report(row.get("target_records"))
+            missing_samples.append(sample)
+        for field, diff in (row.get("mismatches") or {}).items():
+            field_counts[field] += 1
+            bucket = field_samples.setdefault(field, [])
+            if len(bucket) >= SAMPLES_PER_FIELD:
+                continue
+            detail = diff if isinstance(diff, dict) else {}
+            sample = _row_header(row)
+            sample["expected"] = detail.get("expected")
+            sample["actual"] = detail.get("actual")
+            bucket.append(sample)
+
+    ranked_fields = dict(field_counts.most_common())
+    sampled_fields = list(ranked_fields)[:TOP_SAMPLE_FIELD_LIMIT]
+    excluded = ("sample_mismatches", "dt_check")
+    return {
+        "report": str(report_path),
+        "total_problem_rows": len(mismatch_rows),
+        "result": {k: v for k, v in result.items() if k not in excluded},
+        "count_check": compact_dt_check(result.get("dt_check")),
+        "status_counts": dict(status_counts.most_common()),
+        "field_counts": ranked_fields,
+        "field_count_total": len(ranked_fields),
+        "field_samples": {
+            field: field_samples[field]
+            for field in sampled_fields
+            if field in field_samples
+        },
+        "missing_samples": missing_samples,
+    }
+
+
+def write_report_summary(report_path: Path, result: Dict[str, Any], mismatch_rows: Sequence[Dict[str, Any]]) -> None:
+    """Write ``summary.json`` and ``readable_summary.md`` next to ``report_path``.
+
+    The JSON file holds build_report_summary() output with keys replaced by
+    their display labels; the Markdown file is a human-readable digest of the
+    same data. Both files are opened directly for writing, so the parent
+    directory of ``report_path`` is expected to exist already (this function
+    does not create it).
+    """
+    summary_json_path, summary_md_path = summary_paths(report_path)
+    summary = build_report_summary(report_path, result, mismatch_rows)
+    with summary_json_path.open("w", encoding="utf-8") as f:
+        json.dump(localize_report_keys(summary), f, ensure_ascii=False, indent=2, cls=JsonEncoder)
+
+    # Markdown header: run parameters and overall counts, then count check.
+    lines = [
+        "# Paper 去重校验报告摘要",
+        "",
+        f"- 分区: `{result.get('dt')}`",
+        f"- 抽样: `{result.get('sample_mode')}`, 数量 `{result.get('sample_size')}`",
+        f"- 结果: 已校验 `{result.get('checked')}`，通过 `{result.get('passed')}`，失败 `{result.get('failed')}`",
+        f"- 缺失: 源表 `{result.get('missing_source')}`，目标表 `{result.get('missing_target')}`",
+        f"- 明细报告: `{report_path}`",
+        f"- 报告目录: `{report_path.parent}`",
+        f"- 源表记录数分桶: `{_json_inline(result.get('source_count_buckets'))}`",
+        "",
+        "## 数量校验",
+        "",
+        f"- 源表计数模式: `{summary['count_check'].get('source_count_mode')}`",
+        f"- 计数失败分桶数: `{summary['count_check'].get('failed_bucket_count')}`",
+        f"- 数量不一致数: `{summary['count_check'].get('mismatch_count')}`",
+    ]
+    # The key-coverage counters are optional; print them only when present.
+    if "source_missing_in_target_key_count" in summary["count_check"]:
+        lines.append(
+            f"- 元数据有目标无: `{summary['count_check'].get('source_missing_in_target_key_count')}`"
+        )
+    if "target_extra_key_count" in summary["count_check"]:
+        lines.append(
+            f"- 目标有元数据无: `{summary['count_check'].get('target_extra_key_count')}`"
+        )
+    # One bullet per partition whose source/target counts disagree.
+    for item in summary["count_check"].get("count_mismatches") or []:
+        lines.append(
+            "- 分区 `{}`: source_key_count `{}`，target_row_count `{}`，difference `{}`".format(
+                item.get("dt"),
+                item.get("source_key_count"),
+                item.get("target_row_count"),
+                item.get("difference"),
+            ),
+        )
+    # Status distribution; a placeholder bullet is written when it is empty.
+    lines.extend(["", "## 状态分布", ""])
+    for status, count in summary["status_counts"].items():
+        lines.append(f"- `{status}`: {count}")
+    if not summary["status_counts"]:
+        lines.append("- 无")
+    # Per-field problem counts, same placeholder convention.
+    lines.extend(["", "## 字段问题分布", ""])
+    for field, count in summary["field_counts"].items():
+        lines.append(f"- `{field}`: {count}")
+    if not summary["field_counts"]:
+        lines.append("- 无")
+    # Samples of rows missing on one side; the section is omitted when empty.
+    if summary.get("missing_samples"):
+        lines.extend(["", "## 缺失样例", ""])
+        for sample in summary["missing_samples"]:
+            lines.append(
+                f"- DOI `{sample.get('key')}`, source_count={sample.get('source_count')}, "
+                f"status=`{sample.get('status')}`"
+            )
+            source_records = sample.get("source_records")
+            target_records = sample.get("target_records")
+            if source_records is not None:
+                lines.append(f"  - source_records: `{_json_inline(source_records)}`")
+            if target_records is not None:
+                lines.append(f"  - target_records: `{_json_inline(target_records)}`")
+    # Expected/actual samples for the most frequently mismatching fields.
+    lines.extend(["", "## 字段问题样例", ""])
+    for field, samples in summary["field_samples"].items():
+        lines.append(f"### {field} ({summary['field_counts'].get(field)})")
+        lines.append("")
+        for sample in samples:
+            lines.append(
+                f"- DOI `{sample.get('key')}`, source_count={sample.get('source_count')}, "
+                f"status=`{sample.get('status')}`"
+            )
+            lines.append(f"  - expected: `{_json_inline(sample.get('expected'))}`")
+            lines.append(f"  - actual: `{_json_inline(sample.get('actual'))}`")
+            lines.append("")
+    # Trim trailing blank lines and end the file with exactly one newline.
+    with summary_md_path.open("w", encoding="utf-8") as f:
+        f.write("\n".join(lines).rstrip() + "\n")
+
+
+class JsonEncoder(json.JSONEncoder):
+    """JSON encoder that also handles ``Decimal``, ``date`` and ``datetime``.
+
+    A ``Decimal`` with an integral value is emitted as ``int`` and any other
+    ``Decimal`` as ``float``; dates and datetimes are emitted as ISO-8601
+    strings. Every other unsupported type falls through to the base class,
+    which raises ``TypeError``.
+    """
+
+    def default(self, obj: Any) -> Any:
+        if isinstance(obj, (date, datetime)):
+            return obj.isoformat()
+        if not isinstance(obj, Decimal):
+            return super().default(obj)
+        is_whole = obj == obj.to_integral_value()
+        return int(obj) if is_whole else float(obj)
+
+
+# ---- common scalar/array helpers ----
+
+
+def is_non_empty(value: Any) -> bool:
+    """Report whether ``value`` carries data.
+
+    ``None``, the strings ``""`` and ``"{}"``, and empty lists and dicts
+    count as empty; every other value (including ``0`` and ``False``) counts
+    as non-empty.
+    """
+    if value is None:
+        return False
+    if isinstance(value, (list, dict)):
+        return len(value) != 0
+    if isinstance(value, str):
+        return value != "" and value != "{}"
+    return True
+
+
+def choose_freq_then_lex_max(values: Iterable[str]) -> str:
+    """Pick the most frequent string, breaking ties by the largest string.
+
+    The placeholders ``""`` and ``"{}"`` are ignored. Returns ``""`` when
+    nothing remains.
+    """
+    tally = Counter(v for v in values if v not in ("", "{}"))
+    if not tally:
+        return ""
+    top = max(tally.values())
+    return max(text for text, seen in tally.items() if seen == top)
+
+
+def choose_freq_then_max_int(values: Iterable[int]) -> Optional[int]:
+    """Pick the most frequent integer, breaking ties by the largest value.
+
+    Entries that are not ``int`` instances are ignored. Returns ``None``
+    when no integer is present.
+    """
+    tally = Counter(v for v in values if isinstance(v, int))
+    if not tally:
+        return None
+    ranked = tally.most_common()
+    best_freq = ranked[0][1]
+    return max(number for number, seen in ranked if seen == best_freq)
+
+
+def choose_freq_then_max_decimal(values: Iterable[Decimal]) -> Optional[Decimal]:
+    """Pick the most frequent ``Decimal``, breaking ties by the largest value.
+
+    Entries that are not ``Decimal`` instances are ignored. Returns ``None``
+    when no ``Decimal`` is present.
+    """
+    decimals = [v for v in values if isinstance(v, Decimal)]
+    if not decimals:
+        return None
+    tally = Counter(decimals)
+    ceiling = max(tally.values())
+    winner: Optional[Decimal] = None
+    for candidate, seen in tally.items():
+        if seen != ceiling:
+            continue
+        if winner is None or candidate > winner:
+            winner = candidate
+    return winner
+
+
+def normalize_doi(doi: Any) -> str:
+    """Normalize a raw DOI value to its comparison key.
+
+    The value is stringified, stripped, lower-cased and HTML-unescaped.
+    Anything before the first ``"10."`` is dropped, and the text is cut at
+    the first whitespace or ``<``, ``>``, ``"``, ``&``, ``;`` character.
+    Returns ``""`` for ``None`` and for the placeholders ``""`` and ``"{}"``.
+    """
+    if doi is None:
+        return ""
+    text = html.unescape(str(doi).strip().lower())
+    if text in ("", "{}"):
+        return ""
+    prefix_at = text.find("10.")
+    if prefix_at >= 0:
+        text = text[prefix_at:]
+    delimiter = re.search(r"[\s<>\"&;]", text)
+    if delimiter is not None:
+        text = text[: delimiter.start()]
+    text = text.strip()
+    return "" if text in ("", "{}") else text
+
+
+def parse_int(value: Any) -> Optional[int]:
+    """Convert ``value`` to ``int``, or return ``None`` when it is not one.
+
+    ``None``, booleans and the placeholder strings ``""`` / ``"{}"`` yield
+    ``None``. Integers are returned unchanged. Anything else is parsed from
+    its string form, with ``None`` returned on failure.
+    """
+    if value is None or isinstance(value, bool):
+        return None
+    if isinstance(value, int):
+        return value
+    if isinstance(value, str) and value in ("", "{}"):
+        return None
+    try:
+        parsed = int(str(value))
+    except ValueError:
+        return None
+    return parsed
+
+
+def parse_decimal(value: Any) -> Optional[Decimal]:
+    """Convert ``value`` to ``Decimal`` via its string form.
+
+    Returns ``None`` for ``None``, for the placeholder strings ``""`` /
+    ``"{}"``, and for anything ``Decimal`` cannot parse.
+    """
+    if value is None:
+        return None
+    if isinstance(value, str) and value in ("", "{}"):
+        return None
+    try:
+        parsed = Decimal(str(value))
+    except (InvalidOperation, ValueError):
+        return None
+    return parsed
+
+
+def decimal_to_json_number(value: Decimal) -> Union[int, float]:
+    """Return ``value`` as ``int`` when it is integral, otherwise as ``float``."""
+    whole = value.to_integral_value()
+    return int(value) if value == whole else float(value)
+
+
+def year_from_date_str(s: str) -> Optional[int]:
+    """Extract a plausible year from the first four characters of ``s``.
+
+    Returns ``None`` when ``s`` is shorter than four characters, when the
+    prefix is not made of decimal digits, or when the year falls outside
+    ``1000..CURRENT_YEAR`` (inclusive).
+    """
+    if len(s) < 4:
+        return None
+    year_txt = s[:4]
+    # isdecimal(), not isdigit(): isdigit() is also true for characters such
+    # as superscript digits, which int() rejects with ValueError.
+    if not year_txt.isdecimal():
+        return None
+    year = int(year_txt)
+    if year < 1000 or year > CURRENT_YEAR:
+        return None
+    return year
+
+
+def canonical_json(value: Any) -> str:
+    """Serialize ``value`` to a canonical JSON string.
+
+    Keys are sorted, separators carry no whitespace, and non-ASCII
+    characters are kept, so equal structures yield equal strings.
+    """
+    encoder = json.JSONEncoder(
+        ensure_ascii=False,
+        sort_keys=True,
+        separators=(",", ":"),
+    )
+    return encoder.encode(value)
+
+
+def dedup_str_array(values: Iterable[Any], lower: bool = False) -> List[str]:
+    """Flatten ``values`` into a sorted list of distinct strings.
+
+    Each item may be a scalar or a list of scalars. A string that is itself
+    a JSON array is expanded into its elements. Every collected value is
+    stringified; ``None`` and the placeholders ``""``, ``"{}"`` and ``"[]"``
+    are dropped. With ``lower=True`` values are lower-cased first.
+    """
+    collected = set()
+
+    def keep(raw: Any) -> None:
+        if raw is None:
+            return
+        text = str(raw)
+        if text in ("", "{}", "[]"):
+            return
+        collected.add(text.lower() if lower else text)
+
+    def ingest(raw: Any) -> None:
+        # A string holding a JSON array contributes its elements instead.
+        if isinstance(raw, str):
+            try:
+                decoded = json.loads(raw)
+            except json.JSONDecodeError:
+                decoded = None
+            if isinstance(decoded, list):
+                for elem in decoded:
+                    keep(elem)
+                return
+        keep(raw)
+
+    for item in values:
+        if isinstance(item, list):
+            for member in item:
+                ingest(member)
+        else:
+            ingest(item)
+    return sorted(collected)
+
+
+# ---- paper-specific complex value helpers ----
+
+
+def dedup_complex_to_string(values: Iterable[Any]) -> List[str]:
+    """Flatten ``values`` into a sorted list of distinct string forms.
+
+    Each item may be a single value or a list of values. Strings are kept
+    as-is; any other value is rendered with canonical_json(). ``None`` and
+    the placeholders ``""`` / ``"{}"`` never appear in the result.
+    """
+    unique = set()
+    for item in values:
+        members = item if isinstance(item, list) else [item]
+        for member in members:
+            if member is None:
+                continue
+            if isinstance(member, str):
+                if member not in ("", "{}"):
+                    unique.add(member)
+            else:
+                unique.add(canonical_json(member))
+    # An empty dict serializes to "{}", which counts as a placeholder too.
+    unique.discard("")
+    unique.discard("{}")
+    return sorted(unique)
+
+
+def dedup_locations_struct(values: Iterable[Any]) -> List[Dict[str, str]]:
+    """Normalize and de-duplicate location structs.
+
+    Each item may be one candidate or a list of candidates; a candidate is a
+    dict or a JSON string that decodes to a dict (anything else is skipped).
+    Every location is reduced to the string fields ``type``, ``url``,
+    ``license`` and ``is_oa``, where a missing value or one that stringifies
+    to ``"{}"`` becomes ``""``. The result holds distinct locations, ordered
+    by their canonical JSON form.
+    """
+
+    def as_text(raw: Any) -> str:
+        if raw is None or str(raw) == "{}":
+            return ""
+        return str(raw)
+
+    unique: Dict[str, Dict[str, str]] = {}
+    for item in values:
+        for candidate in (item if isinstance(item, list) else [item]):
+            if isinstance(candidate, str):
+                if candidate in ("", "{}"):
+                    continue
+                try:
+                    candidate = json.loads(candidate)
+                except json.JSONDecodeError:
+                    continue
+            if not isinstance(candidate, dict):
+                continue
+            location = {
+                name: as_text(candidate.get(name))
+                for name in ("type", "url", "license", "is_oa")
+            }
+            unique[canonical_json(location)] = location
+
+    return [unique[marker] for marker in sorted(unique)]
+
+
+def dedup_map_array(values: Iterable[Any]) -> List[Dict[str, str]]:
+    """Normalize and de-duplicate string maps.
+
+    Each item may be one candidate or a list of candidates. A candidate is a
+    dict, a list of dicts, or a JSON string that decodes to either; non-dict
+    entries are skipped. Keys are stringified and values are rendered with
+    stringify_map_value(), except that ``None`` and ``"{}"`` become ``""``.
+    The result holds distinct maps, ordered by their canonical JSON form.
+    """
+
+    def as_dicts(candidate: Any) -> List[Dict[str, Any]]:
+        if isinstance(candidate, str):
+            if candidate in ("", "{}"):
+                return []
+            try:
+                candidate = json.loads(candidate)
+            except json.JSONDecodeError:
+                return []
+        if isinstance(candidate, dict):
+            return [candidate]
+        if isinstance(candidate, list):
+            return [entry for entry in candidate if isinstance(entry, dict)]
+        return []
+
+    unique: Dict[str, Dict[str, str]] = {}
+    for item in values:
+        for candidate in (item if isinstance(item, list) else [item]):
+            for obj in as_dicts(candidate):
+                flat: Dict[str, str] = {}
+                for k, v in obj.items():
+                    if v is None or (isinstance(v, str) and v == "{}"):
+                        flat[str(k)] = ""
+                    else:
+                        flat[str(k)] = stringify_map_value(v)
+                unique[canonical_json(flat)] = flat
+
+    return [unique[marker] for marker in sorted(unique)]
+
+
+def choose_freq_then_lex_max_struct(values: Iterable[Any]) -> Dict[str, Any]:
+    """Pick the most frequent non-empty struct among ``values``.
+
+    Candidates are non-empty dicts, or JSON strings that decode to one. They
+    are compared by canonical JSON form: the most frequent form wins, with
+    ties going to the lexicographically largest. Returns ``{}`` when there
+    is no candidate.
+    """
+    encoded: List[str] = []
+    for value in values:
+        if isinstance(value, str):
+            if value in ("", "{}"):
+                continue
+            try:
+                value = json.loads(value)
+            except json.JSONDecodeError:
+                continue
+        if isinstance(value, dict) and value:
+            encoded.append(canonical_json(value))
+
+    if not encoded:
+        return {}
+
+    winner = choose_freq_then_lex_max(encoded)
+    try:
+        decoded = json.loads(winner)
+    except json.JSONDecodeError:
+        return {}
+    if isinstance(decoded, dict):
+        return decoded
+    return {}
+
+
+def _parse_struct_obj(value: Any) -> Optional[Dict[str, Any]]:
+    """Return ``value`` as a dict, decoding it from JSON text if needed.
+
+    Dicts are returned unchanged. Strings are decoded as JSON and returned
+    only when they decode to a dict. The placeholders ``""`` / ``"{}"``,
+    undecodable text and every other type yield ``None``.
+    """
+    if isinstance(value, dict):
+        return value
+    if not isinstance(value, str) or value in ("", "{}"):
+        return None
+    try:
+        decoded = json.loads(value)
+    except json.JSONDecodeError:
+        return None
+    return decoded if isinstance(decoded, dict) else None
+
+
+def _normalize_topic_node(value: Any) -> Optional[Dict[str, Any]]:
+    """Reduce a topic node to its canonicalized ``id`` and ``display_name``.
+
+    Returns ``None`` when ``value`` is not a struct or when both fields are
+    empty according to is_non_empty().
+    """
+    obj = _parse_struct_obj(value)
+    if obj is None:
+        return None
+    node = {key: canonicalize(obj.get(key)) for key in ("id", "display_name")}
+    for part in node.values():
+        if is_non_empty(part):
+            return node
+    return None
+
+
+def empty_primary_topic_struct() -> Dict[str, Any]:
+    """Return a fresh primary-topic struct with every field set to ``None``."""
+    field_names = ("id", "display_name", "score", "subfield", "field", "domain")
+    return dict.fromkeys(field_names)
+
+
+def normalize_primary_topic_struct(value: Any) -> Dict[str, Any]:
+    """Normalize a primary-topic struct to its fixed six-field shape.
+
+    ``id`` and ``display_name`` are canonicalized, ``score`` becomes a JSON
+    number (or ``None`` when it cannot be parsed), and ``subfield``,
+    ``field`` and ``domain`` are reduced with _normalize_topic_node().
+    Input that is not a struct yields the all-``None`` struct.
+    """
+    topic = empty_primary_topic_struct()
+    obj = _parse_struct_obj(value)
+    if obj is None:
+        return topic
+    score = parse_decimal(obj.get("score"))
+    topic["id"] = canonicalize(obj.get("id"))
+    topic["display_name"] = canonicalize(obj.get("display_name"))
+    if score is not None:
+        topic["score"] = decimal_to_json_number(score)
+    for level in ("subfield", "field", "domain"):
+        topic[level] = _normalize_topic_node(obj.get(level))
+    return topic
+
+
+def choose_freq_then_lex_max_primary_topic(values: Iterable[Any]) -> Dict[str, Any]:
+    """Pick the most frequent normalized primary topic among ``values``.
+
+    Each non-empty struct is normalized and kept when at least one field is
+    non-empty. Candidates are compared by canonical JSON form: the most
+    frequent form wins, with ties going to the lexicographically largest.
+    Returns the all-``None`` struct when there is no candidate.
+    """
+
+    def encoded_candidates() -> Iterable[str]:
+        for value in values:
+            obj = _parse_struct_obj(value)
+            if not obj:
+                continue
+            topic = normalize_primary_topic_struct(obj)
+            if any(is_non_empty(part) for part in topic.values()):
+                yield canonical_json(topic)
+
+    candidates = list(encoded_candidates())
+    if candidates:
+        winner = json.loads(choose_freq_then_lex_max(candidates))
+        if isinstance(winner, dict):
+            return winner
+    return empty_primary_topic_struct()
+
+
+def dedup_struct_array(values: Iterable[Any]) -> List[Dict[str, Any]]:
+    """Collect, de-duplicate and order structs found in ``values``.
+
+    Each item may be a dict, a list, or a JSON string decoding to either.
+    Inside a list, dict elements are taken directly and string elements are
+    decoded (a decoded list is expanded recursively); any other element is
+    ignored. Empty dicts are dropped. The result holds distinct structs,
+    ordered by their canonical JSON form.
+    """
+
+    def decode(raw: str) -> Any:
+        if raw in ("", "{}", "[]"):
+            return None
+        try:
+            return json.loads(raw)
+        except json.JSONDecodeError:
+            return None
+
+    def from_list(items: List[Any]) -> Iterable[Dict[str, Any]]:
+        for elem in items:
+            if isinstance(elem, str):
+                elem = decode(elem)
+                # Only lists that arrived as JSON text are expanded.
+                if isinstance(elem, list):
+                    yield from from_list(elem)
+                    continue
+            if isinstance(elem, dict) and elem:
+                yield elem
+
+    def expand(value: Any) -> List[Dict[str, Any]]:
+        if isinstance(value, str):
+            value = decode(value)
+        if isinstance(value, dict):
+            return [value] if value else []
+        if isinstance(value, list):
+            return list(from_list(value))
+        return []
+
+    unique: Dict[str, Dict[str, Any]] = {}
+    for item in values:
+        for struct in expand(item):
+            unique[canonical_json(struct)] = struct
+
+    return [unique[marker] for marker in sorted(unique)]
+
+
+def normalize_origin_osi(value: Any) -> str:
+    """Normalize an origin name: stripped, lower-cased text.
+
+    Names starting with ``"semantic"`` collapse to ``"semantic"``. ``None``,
+    blank text and the placeholder ``"{}"`` yield ``""``.
+    """
+    if value is None:
+        return ""
+    origin = str(value).strip().lower()
+    if origin == "{}":
+        return ""
+    return "semantic" if origin.startswith("semantic") else origin
+
+
+def stringify_map_value(value: Any) -> str:
+    """Render a map value as text, using compact JSON for dicts and lists."""
+    use_compact = True
+    return stringify_map_value_with_style(value, use_compact)
+
+
+def stringify_map_value_with_style(value: Any, compact: Optional[bool]) -> str:
+    """Render a map value as text.
+
+    ``None`` becomes ``""``, booleans become ``"true"`` / ``"false"``, and
+    dicts and lists become JSON with non-ASCII characters kept. The JSON has
+    no whitespace after separators only when ``compact`` is exactly ``True``.
+    Any other value is passed through ``str()``.
+    """
+    if value is None:
+        return ""
+    if isinstance(value, bool):
+        return ("false", "true")[value]
+    if not isinstance(value, (dict, list)):
+        return str(value)
+    dump_options: Dict[str, Any] = {"ensure_ascii": False}
+    if compact is True:
+        dump_options["separators"] = (",", ":")
+    return json.dumps(value, **dump_options)
+
+
+def detect_json_compact_style(raw: str) -> bool:
+    """Report whether JSON text ``raw`` is written without separator spacing.
+
+    Returns ``False`` as soon as a ``:`` or ``,`` outside a string literal
+    is directly followed by whitespace, and ``True`` otherwise. Backslash
+    escapes inside string literals are honoured.
+    """
+    inside_string = False
+    skip_next = False
+
+    for pos, ch in enumerate(raw):
+        if not inside_string:
+            if ch == '"':
+                inside_string = True
+            elif ch in (":", ",") and raw[pos + 1:pos + 2].isspace():
+                # The one-character slice is "" at the end of the text, and
+                # "".isspace() is False, so no bounds check is needed.
+                return False
+            continue
+
+        if skip_next:
+            skip_next = False
+        elif ch == "\\":
+            skip_next = True
+        elif ch == '"':
+            inside_string = False
+
+    return True
+
+
+def merge_string_map(values: Iterable[Any]) -> Dict[str, str]:
+    """Merge maps into one, keeping the largest string value per key.
+
+    Each item is a dict or a JSON string decoding to a dict; anything else
+    is skipped. Entries with a ``None`` key or value, or whose key
+    stringifies to ``""`` / ``"{}"``, are ignored. Values are rendered with
+    stringify_map_value(), with ``"{}"`` treated as ``""``. When a key
+    occurs more than once the lexicographically largest value wins.
+    """
+    merged: Dict[str, str] = {}
+    for item in values:
+        if isinstance(item, str):
+            try:
+                item = json.loads(item)
+            except json.JSONDecodeError:
+                continue
+        if not isinstance(item, dict):
+            continue
+        for raw_key, raw_val in item.items():
+            if raw_key is None or raw_val is None:
+                continue
+            key = str(raw_key)
+            val = stringify_map_value(raw_val)
+            if key in ("", "{}"):
+                continue
+            if val == "{}":
+                val = ""
+            current = merged.get(key)
+            if current is None or val > current:
+                merged[key] = val
+    return merged
+
+
+def merge_identifiers(values: Iterable[Any], origin_osi_values: Iterable[Any]) -> Dict[str, str]:
+    """Merge identifier maps, prefixing ``doi`` / ``mag`` keys with the origin.
+
+    ``values`` and ``origin_osi_values`` are paired positionally with
+    ``zip``, so surplus elements of the longer one are ignored. Each value
+    is a dict or a JSON string decoding to a dict; anything else is skipped.
+    Keys equal to ``doi`` or ``mag`` (any case) are renamed to
+    ``<origin>_<key>`` when the normalized origin is non-empty. Values are
+    stringified, with ``"{}"`` treated as ``""``, and the lexicographically
+    largest value wins when a key occurs more than once.
+    """
+    merged: Dict[str, str] = {}
+    for item, origin_osi in zip(values, origin_osi_values):
+        if isinstance(item, str):
+            try:
+                item = json.loads(item)
+            except json.JSONDecodeError:
+                continue
+        if not isinstance(item, dict):
+            continue
+        origin = normalize_origin_osi(origin_osi)
+        for raw_key, raw_val in item.items():
+            if raw_key is None or raw_val is None:
+                continue
+            key = str(raw_key)
+            if key in ("", "{}"):
+                continue
+            folded = key.lower()
+            if origin and folded in ("doi", "mag"):
+                key = f"{origin}_{folded}"
+            text = str(raw_val)
+            if text == "{}":
+                text = ""
+            current = merged.get(key)
+            if current is None or text > current:
+                merged[key] = text
+    return merged
+
+
+# ---- strategy handlers ----
+
+
+def _handle_key_lower(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> str:
+    """Return the first non-empty normalized key found in ``records``.
+
+    Reads ``rule.effective_source`` from every record (missing fields count
+    as ``""``), normalizes it with ``normalize_doi`` and returns the first
+    truthy result, or ``""`` when no record yields one. ``result`` is unused.
+
+    Each record is normalized exactly once and the scan stops at the first
+    hit, instead of normalizing every value twice and materializing a list
+    just to read its first element.
+    """
+    src = rule.effective_source
+    for record in records:
+        normalized = normalize_doi(record.get(src, ""))
+        if normalized:
+            return normalized
+    return ""
+
+
+def _handle_freq_lex_max(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> str:
+    """Pick a string value for the rule's field via ``choose_freq_then_lex_max``.
+
+    Candidate values are the non-empty (per ``is_non_empty``) values of
+    ``rule.effective_source``, converted with ``str``. Candidates shorter than
+    ``rule.params["min_len"]`` or longer than ``rule.params["max_len"]`` are
+    dropped when those params are set, and for the ``access_is_oa`` field the
+    value ``"unknown"`` (case-insensitive) is dropped too. ``result`` is
+    unused.
+    """
+    source_field = rule.effective_source
+    shortest = rule.params.get("min_len")
+    longest = rule.params.get("max_len")
+    skip_unknown = rule.field_name == "access_is_oa"
+
+    def _acceptable(text: str) -> bool:
+        if shortest is not None and len(text) < shortest:
+            return False
+        if longest is not None and len(text) > longest:
+            return False
+        return not (skip_unknown and text.lower() == "unknown")
+
+    raw_values = (record.get(source_field) for record in records)
+    texts = (str(raw) for raw in raw_values if is_non_empty(raw))
+    return choose_freq_then_lex_max([text for text in texts if _acceptable(text)])
+
+
+def _handle_freq_int_max(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> Optional[int]:
+    """Pick an integer value for the rule's field via ``choose_freq_then_max_int``.
+
+    Values of ``rule.effective_source`` are parsed with ``parse_int``;
+    unparsable values (``None``) are dropped, as are values below
+    ``rule.params["min_val"]`` or above ``rule.params["max_val"]`` when those
+    bounds are set. The string ``"CURRENT_YEAR"`` as ``max_val`` is replaced
+    by the module constant ``CURRENT_YEAR``. ``result`` is unused.
+    """
+    source_field = rule.effective_source
+    lower = rule.params.get("min_val")
+    upper = rule.params.get("max_val")
+    if isinstance(upper, str) and upper == "CURRENT_YEAR":
+        upper = CURRENT_YEAR
+
+    parsed = (parse_int(record.get(source_field)) for record in records)
+    in_range = [
+        number
+        for number in parsed
+        if number is not None
+        and (lower is None or not (number < lower))
+        and (upper is None or not (number > upper))
+    ]
+    return choose_freq_then_max_int(in_range)
+
+
+def _handle_freq_decimal_max(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> Any:
+    """Pick a decimal value for the rule's field and return it as a JSON number.
+
+    Values of ``rule.effective_source`` are parsed with ``parse_decimal``;
+    ``None`` results are discarded. The winner is chosen by
+    ``choose_freq_then_max_decimal`` and converted with
+    ``decimal_to_json_number``; ``None`` is returned when there is no winner.
+    ``result`` is unused.
+    """
+    src = rule.effective_source
+    decimals = []
+    for record in records:
+        parsed = parse_decimal(record.get(src))
+        if parsed is not None:
+            decimals.append(parsed)
+    winner = choose_freq_then_max_decimal(decimals)
+    if winner is None:
+        return None
+    return decimal_to_json_number(winner)
+
+
+def _handle_freq_date(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> str:
+    """Pick a date string for the rule's field via ``choose_freq_then_lex_max``.
+
+    Only non-empty ``str`` values of ``rule.effective_source`` for which
+    ``year_from_date_str`` returns something other than ``None`` are
+    considered. ``result`` is unused.
+    """
+    src = rule.effective_source
+    raw_values = (record.get(src) for record in records)
+    usable_dates = [
+        raw
+        for raw in raw_values
+        if isinstance(raw, str) and raw and year_from_date_str(raw) is not None
+    ]
+    return choose_freq_then_lex_max(usable_dates)
+
+
+def _handle_freq_struct(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Pick a struct value for the rule's field.
+
+    Collects the raw ``rule.effective_source`` value of every record and
+    delegates to ``choose_freq_then_lex_max_primary_topic`` for the
+    ``primary_topic`` field, or ``choose_freq_then_lex_max_struct`` for any
+    other field. ``result`` is unused.
+    """
+    if rule.field_name == "primary_topic":
+        chooser = choose_freq_then_lex_max_primary_topic
+    else:
+        chooser = choose_freq_then_lex_max_struct
+    raw_values = [record.get(rule.effective_source) for record in records]
+    return chooser(raw_values)
+
+
+def _handle_dedup_array(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> List[str]:
+    """Deduplicate the string arrays stored under the rule's source field.
+
+    Passes one value per record (``[]`` when the field is missing) to
+    ``dedup_str_array``, forwarding ``rule.params["lower"]`` (default
+    ``False``) as its ``lower`` flag. ``result`` is unused.
+    """
+    per_record = [record.get(rule.effective_source, []) for record in records]
+    fold_case = rule.params.get("lower", False)
+    return dedup_str_array(per_record, lower=fold_case)
+
+
+def _handle_dedup_map(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> List[Dict[str, str]]:
+    """Deduplicate the map arrays stored under the rule's source field.
+
+    Passes one value per record (``[]`` when the field is missing) to
+    ``dedup_map_array``. ``result`` is unused.
+    """
+    per_record = []
+    for record in records:
+        per_record.append(record.get(rule.effective_source, []))
+    return dedup_map_array(per_record)
+
+
+def _handle_dedup_struct(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> List[Dict[str, Any]]:
+    """Deduplicate the struct arrays stored under the rule's source field.
+
+    Passes one value per record (``[]`` when the field is missing) to
+    ``dedup_struct_array``. ``result`` is unused.
+    """
+    per_record = []
+    for record in records:
+        per_record.append(record.get(rule.effective_source, []))
+    return dedup_struct_array(per_record)
+
+
+def _handle_dedup_locations(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> List[Dict[str, str]]:
+    """Deduplicate the location structs stored under the rule's source field.
+
+    Passes one value per record (``[]`` when the field is missing) to
+    ``dedup_locations_struct``. ``result`` is unused.
+    """
+    per_record = []
+    for record in records:
+        per_record.append(record.get(rule.effective_source, []))
+    return dedup_locations_struct(per_record)
+
+
+def _handle_merge_map(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> Dict[str, str]:
+    """Merge the maps stored under the rule's source field.
+
+    Passes one value per record (``None`` when the field is missing) to
+    ``merge_string_map``. ``result`` is unused.
+    """
+    per_record = []
+    for record in records:
+        per_record.append(record.get(rule.effective_source))
+    return merge_string_map(per_record)
+
+
+def _handle_merge_identifiers(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> Dict[str, str]:
+    """Merge identifier maps together with each record's ``origin_osi``.
+
+    Builds two parallel lists, the rule's source field and the
+    ``origin_osi`` field of every record, and hands them to
+    ``merge_identifiers``. ``result`` is unused.
+    """
+    src = rule.effective_source
+    identifier_values = []
+    origin_values = []
+    for record in records:
+        identifier_values.append(record.get(src))
+    for record in records:
+        origin_values.append(record.get("origin_osi"))
+    return merge_identifiers(identifier_values, origin_values)
+
+
+def _handle_latest_dt(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> str:
+    """Return the greatest stringified value of the rule's source field.
+
+    Values that are not ``is_non_empty`` are ignored; the remaining values
+    are converted with ``str`` and compared as strings. Returns ``""`` when
+    nothing qualifies. ``result`` is unused.
+    """
+    src = rule.effective_source
+    raw_values = (record.get(src, "") for record in records)
+    stamps = [str(raw) for raw in raw_values if is_non_empty(raw)]
+    return max(stamps, default="")
+
+
+def _handle_random_pick_cls(
+    records: List[Dict[str, Any]], rule: FieldRule, result: Dict[str, Any],
+) -> None:
+    """Placeholder handler for the ``random_pick_cls`` strategy.
+
+    Ignores all of its arguments and always returns ``None``.
+
+    NOTE(review): presumably fields using this strategy are checked by
+    ``validate_classifications`` instead, since a random pick has no single
+    expected value; confirm against the aggregation caller.
+    """
+    return None
+
+
+# Dispatch table from strategy name to the handler that implements it.
+# Every handler takes ``(records, rule, result)``; ``aggregate_group`` passes
+# this table to ``aggregate_by_rules``.
+# NOTE(review): the strategy names presumably come from the rule definitions
+# (``FieldRule``); confirm where they are loaded.
+STRATEGY_HANDLERS: Dict[str, StrategyHandler] = {
+    "key_lower": _handle_key_lower,
+    "freq_lex_max": _handle_freq_lex_max,
+    "freq_int_max": _handle_freq_int_max,
+    "freq_decimal_max": _handle_freq_decimal_max,
+    "freq_date": _handle_freq_date,
+    "freq_struct": _handle_freq_struct,
+    "dedup_array": _handle_dedup_array,
+    "dedup_map": _handle_dedup_map,
+    "dedup_struct": _handle_dedup_struct,
+    "dedup_locations": _handle_dedup_locations,
+    "merge_map": _handle_merge_map,
+    "merge_identifiers": _handle_merge_identifiers,
+    "latest_dt": _handle_latest_dt,
+    # Always yields None; see _handle_random_pick_cls.
+    "random_pick_cls": _handle_random_pick_cls,
+}
+
+
+# ---- aggregation ----
+
+
+def aggregate_group(records: List[Dict[str, Any]], rules: Sequence[FieldRule]) -> Dict[str, Any]:
+    """Aggregate one group of source records into a single expected record.
+
+    Thin wrapper that calls ``aggregate_by_rules`` with this module's
+    ``STRATEGY_HANDLERS`` dispatch table.
+    """
+    aggregated = aggregate_by_rules(records, rules, STRATEGY_HANDLERS)
+    return aggregated
+
+
+# ---- DB validation helpers ----
+
+
+def _log(message: str) -> None:
+    """Write ``message`` plus a newline to stderr and flush immediately."""
+    stream = sys.stderr
+    print(message, file=stream, flush=True)
+
+
+def load_config(path: Path) -> Dict[str, Any]:
+    """Load and return the JSON document stored at ``path``.
+
+    Raises:
+        FileNotFoundError: if ``path`` does not exist; the message tells the
+            user to copy ``TEMPLATE_CONFIG_PATH`` and fill in credentials.
+    """
+    if path.exists():
+        with path.open("r", encoding="utf-8") as handle:
+            return json.load(handle)
+    hint = (
+        f"Config file not found: {path}\n"
+        f"Copy the template and fill in credentials:\n"
+        f"  cp {TEMPLATE_CONFIG_PATH} {path}"
+    )
+    raise FileNotFoundError(hint)
+
+
+def connect_starrocks(config_path: Path):
+    """Open a pymysql connection using the JSON config at ``config_path``.
+
+    The config must contain a ``mysql`` object with ``host``, ``port``,
+    ``user`` and ``password``; ``charset`` (default ``utf8mb4``) and
+    ``read_timeout_sec`` (default 600) are optional. An optional ``retry``
+    object may set ``max_attempts`` (default 3, at least 1),
+    ``initial_delay_sec`` (default 2.0, at least 0) and ``backoff_factor``
+    (default 2.0, at least 1). A non-dict ``retry`` value is ignored.
+
+    Failed attempts that look transient are retried after a delay that is
+    multiplied by the backoff factor each time. Retry notices go through
+    ``_log`` so they land on stderr with the rest of this module's
+    diagnostics instead of being mixed into stdout.
+
+    Raises:
+        RuntimeError: if pymysql could not be imported.
+        FileNotFoundError: propagated from ``load_config``.
+        Exception: the connection error itself when it is not retryable or
+            when the last attempt fails.
+    """
+    if pymysql is None:
+        raise RuntimeError("pymysql is required. Install pymysql before running DB validation.")
+    cfg = load_config(config_path)
+    mysql_cfg = cfg["mysql"]
+    retry_cfg = cfg.get("retry", {}) if isinstance(cfg.get("retry"), dict) else {}
+    max_attempts = max(1, int(retry_cfg.get("max_attempts", 3)))
+    delay = max(0.0, float(retry_cfg.get("initial_delay_sec", 2.0)))
+    backoff = max(1.0, float(retry_cfg.get("backoff_factor", 2.0)))
+    read_timeout = int(mysql_cfg.get("read_timeout_sec", 600))
+
+    def _is_retryable_connect_error(exc: Exception) -> bool:
+        # pymysql is known to be importable here: the guard above already raised otherwise.
+        if isinstance(exc, pymysql.err.OperationalError):
+            code = exc.args[0] if exc.args else None
+            # Client error codes: 2003 cannot connect, 2006 server gone away, 2013 lost connection.
+            if code in (2003, 2006, 2013):
+                return True
+        # Fall back to message matching for errors raised without one of those codes.
+        msg = str(exc).lower()
+        return any(token in msg for token in ("lost connection", "can't connect", "timed out", "timeout"))
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            # Do not pass database= on connect: this StarRocks endpoint drops
+            # auth when a default schema is selected; use fully-qualified table names in SQL.
+            return pymysql.connect(
+                host=mysql_cfg["host"],
+                port=int(mysql_cfg["port"]),
+                user=mysql_cfg["user"],
+                password=mysql_cfg["password"],
+                charset=mysql_cfg.get("charset", "utf8mb4"),
+                connect_timeout=30,
+                read_timeout=read_timeout,
+            )
+        except Exception as exc:
+            # Top-level retry boundary: anything non-retryable is re-raised unchanged.
+            if attempt >= max_attempts or not _is_retryable_connect_error(exc):
+                raise
+            _log(
+                f"[retry] MySQL 连接失败 ({type(exc).__name__}: {exc})，"
+                f"{delay:.1f}s 后重试 ({attempt}/{max_attempts})"
+            )
+            time.sleep(delay)
+            delay *= backoff
+
+    # Defensive guard: the loop above always returns or raises on the last attempt.
+    raise RuntimeError("MySQL connection retry exhausted unexpectedly")
+
+
+def qualify_table_name(
+    table: str,
+    catalog: Optional[str],
+    database: str = "dws",
+) -> str:
+    """Resolve table to catalog.database.table for StarRocks Iceberg queries.
+
+    ``table`` is split on ``.`` and blank segments are dropped:
+
+    * one segment: prefixed with ``database`` and, when ``catalog`` is
+      truthy, with ``catalog`` as well;
+    * two segments: prefixed with ``catalog`` when it is truthy;
+    * anything else (no segments, two segments without a catalog, three or
+      more segments): ``table`` is returned unchanged.
+    """
+    segments = [segment.strip() for segment in table.split(".") if segment.strip()]
+    count = len(segments)
+    if count == 1:
+        scope = f"{catalog}.{database}" if catalog else database
+        return f"{scope}.{segments[0]}"
+    if count == 2 and catalog:
+        return f"{catalog}.{segments[0]}.{segments[1]}"
+    return table
+
+
+def quote_identifier(identifier: str) -> str:
+    """Backtick-quote each dot-separated part of ``identifier``.
+
+    Parts are stripped of surrounding whitespace, blank parts are dropped and
+    embedded backticks are doubled.
+
+    Raises:
+        ValueError: if no non-blank part remains.
+    """
+    quoted = []
+    for piece in identifier.split("."):
+        piece = piece.strip()
+        if piece:
+            quoted.append("`" + piece.replace("`", "``") + "`")
+    if not quoted:
+        raise ValueError(f"Invalid identifier: {identifier!r}")
+    return ".".join(quoted)
+
+
+def fetch_records(conn: Any, sql: str, params: Sequence[Any] = ()) -> List[Dict[str, Any]]:
+    """Execute ``sql`` with ``params`` and return all rows as dicts.
+
+    Column names are taken from the first element of each entry in
+    ``cursor.description``. Returns ``[]`` when the statement produced no
+    result set (``cursor.description`` is ``None``).
+    """
+    with conn.cursor() as cur:
+        cur.execute(sql, params)
+        description = cur.description
+        if description is None:
+            return []
+        names = [column[0] for column in description]
+        records = []
+        for row in cur.fetchall():
+            records.append(dict(zip(names, row)))
+        return records
+
+
+def normalize_json_like(value: Any) -> Any:
+    """Decode values that look like serialized JSON containers.
+
+    ``bytes``/``bytearray`` input is first decoded as UTF-8 (invalid bytes
+    are replaced). A string whose stripped form starts with ``[`` or ``{`` is
+    parsed with ``json.loads``; if parsing fails the (decoded, unstripped)
+    string is returned. Every other value is returned as is.
+    """
+    text = value.decode("utf-8", errors="replace") if isinstance(value, (bytes, bytearray)) else value
+    if not isinstance(text, str):
+        return text
+    candidate = text.strip()
+    if candidate.startswith(("[", "{")):
+        try:
+            return json.loads(candidate)
+        except json.JSONDecodeError:
+            pass
+    return text
+
+
+def canonicalize(value: Any) -> Any:
+    """Convert ``value`` into a plain, comparison-friendly structure.
+
+    The value first goes through ``normalize_json_like``. After that:
+
+    * ``Decimal`` becomes ``int`` when it has no fractional part, else
+      ``float``;
+    * ``date``/``datetime`` become their ISO-8601 string;
+    * dicts are rebuilt with string keys, visited in order of the keys'
+      string form, with values canonicalized recursively;
+    * lists are canonicalized element by element;
+    * everything else is returned unchanged.
+    """
+    value = normalize_json_like(value)
+    if isinstance(value, Decimal):
+        is_whole = value == value.to_integral_value()
+        return int(value) if is_whole else float(value)
+    if isinstance(value, (date, datetime)):
+        return value.isoformat()
+    if isinstance(value, dict):
+        ordered = sorted(value.items(), key=lambda pair: str(pair[0]))
+        return {str(name): canonicalize(item) for name, item in ordered}
+    if isinstance(value, list):
+        return list(map(canonicalize, value))
+    return value
+
+
+def comparable_record(record: Dict[str, Any], fields: Iterable[str]) -> Dict[str, Any]:
+    """Project ``record`` onto ``fields`` with every value canonicalized.
+
+    Fields missing from ``record`` are canonicalized from ``None``.
+    """
+    projected: Dict[str, Any] = {}
+    for name in fields:
+        projected[name] = canonicalize(record.get(name))
+    return projected
+
+
+def _dt_clause(dt: Optional[str], params: List[Any]) -> str:
+    """Return the ``dt`` equality filter and register its bind value.
+
+    When ``dt`` is not ``None`` it is appended to ``params`` (mutated in
+    place) and the SQL fragment with one ``%s`` placeholder is returned;
+    otherwise ``params`` is left alone and ``""`` is returned.
+    """
+    if dt is None:
+        return ""
+    params.append(dt)
+    return " AND `dt` = %s"
+
+
+def _limit_clause(limit: Optional[int]) -> str:
+    """Return `` LIMIT <n>`` for an integer-coercible limit, or ``""`` for ``None``."""
+    if limit is None:
+        return ""
+    return " LIMIT " + str(int(limit))
+
+
+def _doi_not_null_clause() -> str:
+    """Return the SQL fragment requiring a non-NULL, non-empty ``doi`` column."""
+    conditions = ("`doi` IS NOT NULL", "`doi` != ''")
+    return "".join(f" AND {condition}" for condition in conditions)
+
+
+def doi_key_expr(alias: Optional[str] = None) -> str:
+    """Return the SQL expression that derives the cleaned DOI key.
+
+    The expression lower-cases and trims the ``doi`` column (qualified with
+    ``alias`` when one is given) and extracts group 1 of
+    ``DOI_KEY_SQL_PATTERN`` with ``REGEXP_EXTRACT``.
+    """
+    column = f"{alias}.`doi`" if alias else "`doi`"
+    return f"REGEXP_EXTRACT(LOWER(TRIM({column})), '{DOI_KEY_SQL_PATTERN}', 1)"
+
+
+def _doi_key_not_null_clause(alias: Optional[str] = None) -> str:
+    """Return the SQL fragment that rejects unusable DOI keys.
+
+    The key expression from ``doi_key_expr(alias)`` must be non-NULL and
+    different from both the empty string and the literal ``{}``.
+    """
+    key = doi_key_expr(alias)
+    checks = ("IS NOT NULL", "!= ''", "!= '{}'")
+    return "".join(f" AND {key} {check}" for check in checks)
+
+
+def _hash_sample_predicate(mod_base: Optional[int], mod_max: Optional[int]) -> str:
+    """Narrow scan on Iceberg dt partitions by cleaned DOI key.
+
+    Returns a fragment keeping rows whose ``ABS(CRC32(key)) MOD mod_base`` is
+    below ``mod_max``. Returns ``""`` (no sampling) when either argument is
+    falsy or ``mod_max`` is not positive.
+    """
+    sampling_enabled = bool(mod_base) and bool(mod_max) and not (mod_max <= 0)
+    if not sampling_enabled:
+        return ""
+    hashed_key = f"ABS(CRC32({doi_key_expr()}))"
+    return f" AND ({hashed_key} MOD {int(mod_base)}) < {int(mod_max)}"
+
+
+def _sample_order_clause(*, high_first: bool = False) -> str:
+    """Return the ORDER BY expression list used by the sampling queries.
+
+    Rows are ordered by the CRC32 of the cleaned DOI key; with
+    ``high_first`` the ``source_count`` column (descending) takes precedence.
+    """
+    hash_order = f"CRC32({doi_key_expr()})"
+    return f"source_count DESC, {hash_order}" if high_first else hash_order
+
+
+def build_target_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    *,
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> Tuple[str, List[Any]]:
+    """Build the query that samples cleaned DOI keys from ``table``.
+
+    Selects the key as ``sample_key``, keeps only usable keys, optionally
+    filters on ``dt`` and on the hash sampling predicate, orders by the key's
+    CRC32 and applies ``limit``. Returns ``(sql, params)`` where ``params``
+    holds ``dt`` when it is set.
+    """
+    bound: List[Any] = []
+    key_sql = doi_key_expr()
+    pieces = [
+        f"SELECT {key_sql} AS sample_key FROM {quote_identifier(table)} ",
+        "WHERE 1=1",
+        _doi_key_not_null_clause(),
+        _dt_clause(dt, bound),
+        _hash_sample_predicate(hash_mod_base, hash_mod_max),
+        f" ORDER BY {_sample_order_clause()}",
+        _limit_clause(limit),
+    ]
+    return "".join(pieces), bound
+
+
+def build_target_first_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+) -> Tuple[str, List[Any]]:
+    """Build the unordered query that takes the first DOI keys from ``table``.
+
+    Same filters as ``build_target_key_query`` minus hash sampling and
+    without an ORDER BY, so the rows returned are whatever the engine yields
+    first. Returns ``(sql, params)`` where ``params`` holds ``dt`` when set.
+    """
+    bound: List[Any] = []
+    select_part = f"SELECT {doi_key_expr()} AS sample_key FROM {quote_identifier(table)} WHERE 1=1"
+    filters = _doi_key_not_null_clause() + _dt_clause(dt, bound)
+    return select_part + filters + _limit_clause(limit), bound
+
+
+def build_random_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    *,
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> Tuple[str, List[Any]]:
+    """Alias of ``build_target_key_query``; all arguments are forwarded unchanged."""
+    hash_options = {"hash_mod_base": hash_mod_base, "hash_mod_max": hash_mod_max}
+    return build_target_key_query(table, dt, limit, **hash_options)
+
+
+def build_duplicate_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    *,
+    high_first: bool,
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> Tuple[str, List[Any]]:
+    """Build the query that samples DOI keys occurring more than once.
+
+    Groups rows of ``table`` by cleaned DOI key, keeps groups with
+    ``COUNT(*) > 1`` and returns ``sample_key`` plus ``source_count``.
+    ``high_first`` puts the largest groups first. Returns ``(sql, params)``
+    where ``params`` holds ``dt`` when it is set.
+    """
+    bound: List[Any] = []
+    key_sql = doi_key_expr()
+    sql = f"SELECT {key_sql} AS sample_key, COUNT(*) AS source_count FROM {quote_identifier(table)} "
+    sql += "WHERE 1=1"
+    sql += _doi_key_not_null_clause()
+    sql += _dt_clause(dt, bound)
+    sql += _hash_sample_predicate(hash_mod_base, hash_mod_max)
+    sql += f" GROUP BY {key_sql} HAVING COUNT(*) > 1 "
+    sql += f"ORDER BY {_sample_order_clause(high_first=high_first)}"
+    sql += _limit_clause(limit)
+    return sql, bound
+
+
+def build_field_conflict_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    *,
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> Tuple[str, List[Any]]:
+    """Build the query that samples DOI keys whose duplicates disagree.
+
+    Groups rows of ``table`` by cleaned DOI key and keeps groups with more
+    than one row in which at least one of the compared columns has more than
+    one distinct value. Largest groups come first. Returns ``(sql, params)``
+    where ``params`` holds ``dt`` when it is set.
+    """
+    bound: List[Any] = []
+    compared_columns = (
+        "title",
+        "abstract",
+        "language",
+        "published_year",
+        "published_date",
+        "venue_name",
+        "venue_type",
+        "access_is_oa",
+        "access_oa_status",
+        "citation_count",
+        "reference_count",
+        "fwci",
+    )
+    any_conflict = " OR ".join(f"COUNT(DISTINCT `{column}`) > 1" for column in compared_columns)
+    key_sql = doi_key_expr()
+    pieces = [
+        f"SELECT {key_sql} AS sample_key, COUNT(*) AS source_count FROM {quote_identifier(table)} ",
+        "WHERE 1=1",
+        _doi_key_not_null_clause(),
+        _dt_clause(dt, bound),
+        _hash_sample_predicate(hash_mod_base, hash_mod_max),
+        f" GROUP BY {key_sql} HAVING COUNT(*) > 1 AND ",
+        f"({any_conflict}) ",
+        f"ORDER BY {_sample_order_clause(high_first=True)}",
+        _limit_clause(limit),
+    ]
+    return "".join(pieces), bound
+
+
+def build_count_bucket_key_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    *,
+    bucket: str,
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> Tuple[str, List[Any]]:
+    """Build the query that samples DOI keys by number of source rows.
+
+    ``bucket`` selects the group size: ``"one"`` (exactly 1 row), ``"two"``
+    (exactly 2) or ``"multi"`` (more than 2). Returns ``(sql, params)`` where
+    ``params`` holds ``dt`` when it is set.
+
+    Raises:
+        ValueError: for any other ``bucket`` value.
+    """
+    having_by_bucket = {
+        "one": "COUNT(*) = 1",
+        "two": "COUNT(*) = 2",
+        "multi": "COUNT(*) > 2",
+    }
+    having = having_by_bucket.get(bucket)
+    if having is None:
+        raise ValueError(f"Unsupported count bucket: {bucket}")
+
+    bound: List[Any] = []
+    key_sql = doi_key_expr()
+    pieces = [
+        f"SELECT {key_sql} AS sample_key, COUNT(*) AS source_count FROM {quote_identifier(table)} ",
+        "WHERE 1=1",
+        _doi_key_not_null_clause(),
+        _dt_clause(dt, bound),
+        _hash_sample_predicate(hash_mod_base, hash_mod_max),
+        f" GROUP BY {key_sql} HAVING {having} ",
+        f"ORDER BY {_sample_order_clause()}",
+        _limit_clause(limit),
+    ]
+    return "".join(pieces), bound
+
+
+def _append_sample_key(
+    keys: List[str],
+    seen: set,
+    key: str,
+    *,
+    sample_size: Optional[int],
+) -> bool:
+    """Record ``key`` if it is new and report whether the sample is full.
+
+    Falsy keys and keys already in ``seen`` are ignored (returns ``False``).
+    Otherwise the key is added to ``seen`` and appended to ``keys`` (both
+    mutated in place). Returns ``True`` only when ``sample_size`` is set and
+    ``keys`` has reached it.
+    """
+    is_new = bool(key) and key not in seen
+    if not is_new:
+        return False
+    seen.add(key)
+    keys.append(key)
+    if sample_size is None:
+        return False
+    return len(keys) >= sample_size
+
+
+def fetch_sample_keys(
+    conn: Any,
+    *,
+    source_table: str,
+    target_table: str,
+    dt: Optional[str],
+    sample_mode: str,
+    sample_size: Optional[int],
+    hash_mod_base: Optional[int] = None,
+    hash_mod_max: Optional[int] = None,
+) -> List[str]:
+    """Collect distinct normalized DOI keys to validate, per ``sample_mode``.
+
+    Modes:
+
+    * ``target-first``: first keys of the target table, unordered;
+    * ``target-random``: hash-ordered keys of the target table;
+    * ``count-buckets``: source keys with 1, 2 and more than 2 rows, each
+      query limited to ``max(1, sample_size // 3)``;
+    * ``mixed``: the three count buckets plus field-conflict and
+      high-duplicate source keys and hash-ordered target keys, each query
+      limited to ``max(1, sample_size // 6)``.
+
+    Queries run in order; keys are normalized with ``normalize_doi`` and
+    de-duplicated across queries. Collection stops as soon as
+    ``sample_size`` keys are gathered (never, when it is ``None``). Progress
+    and timing are reported through ``_log``.
+
+    Raises:
+        ValueError: for an unknown ``sample_mode``.
+    """
+    hashing = {"hash_mod_base": hash_mod_base, "hash_mod_max": hash_mod_max}
+
+    def _share(parts: int) -> Optional[int]:
+        # Per-query limit when the requested sample is split across `parts` queries.
+        return None if sample_size is None else max(1, sample_size // parts)
+
+    def _count_bucket_plan(per_bucket: Optional[int]) -> List[Tuple[str, Tuple[str, List[Any]]]]:
+        labelled = (("count=1", "one"), ("count=2", "two"), ("count>2", "multi"))
+        return [
+            (label, build_count_bucket_key_query(source_table, dt, per_bucket, bucket=bucket, **hashing))
+            for label, bucket in labelled
+        ]
+
+    query_plan: List[Tuple[str, Tuple[str, List[Any]]]]
+    if sample_mode == "target-first":
+        query_plan = [("target-first", build_target_first_key_query(target_table, dt, sample_size))]
+    elif sample_mode == "target-random":
+        query_plan = [
+            ("target-random", build_target_key_query(target_table, dt, sample_size, **hashing)),
+        ]
+    elif sample_mode == "count-buckets":
+        query_plan = _count_bucket_plan(_share(3))
+    elif sample_mode == "mixed":
+        per_bucket = _share(6)
+        query_plan = _count_bucket_plan(per_bucket) + [
+            ("field-conflict", build_field_conflict_key_query(source_table, dt, per_bucket, **hashing)),
+            (
+                "high-duplicate",
+                build_duplicate_key_query(source_table, dt, per_bucket, high_first=True, **hashing),
+            ),
+            ("target-random", build_random_key_query(target_table, dt, per_bucket, **hashing)),
+        ]
+    else:
+        raise ValueError(f"Unsupported sample_mode: {sample_mode}")
+
+    collected: List[str] = []
+    seen: set = set()
+    total = len(query_plan)
+
+    for position, (label, (sql, params)) in enumerate(query_plan, start=1):
+        tag = f"[info] 抽样 SQL {position}/{total} [{label}]"
+        _log(f"{tag} 开始执行（dt={dt!r}, mode={sample_mode}）…")
+        started = time.monotonic()
+        reached_target = False
+        for row in fetch_records(conn, sql, params):
+            candidate = normalize_doi(row.get("sample_key"))
+            if _append_sample_key(collected, seen, candidate, sample_size=sample_size):
+                reached_target = True
+                break
+        elapsed = time.monotonic() - started
+        if reached_target:
+            _log(f"{tag} 完成，耗时 {elapsed:.1f}s，已收集 {len(collected)} 个 key")
+            return collected
+        _log(f"{tag} 完成，耗时 {elapsed:.1f}s，当前共 {len(collected)} 个 key")
+    return collected
+
+
+def build_target_record_query(table: str, doi: Any, dt: Optional[str]) -> Tuple[str, List[Any]]:
+    """Build the query fetching one target row for a DOI key.
+
+    Matches rows whose cleaned DOI key equals ``normalize_doi(doi)``,
+    optionally restricted to ``dt``, with ``LIMIT 1``. Returns
+    ``(sql, params)`` with ``dt`` (when set) preceding the key in ``params``.
+    """
+    key = normalize_doi(doi)
+    bound: List[Any] = [key] if dt is None else [dt, key]
+    dt_filter = "" if dt is None else " AND `dt` = %s"
+    sql = (
+        f"SELECT * FROM {quote_identifier(table)} WHERE 1=1"
+        f"{dt_filter} AND {doi_key_expr()} = %s LIMIT 1"
+    )
+    return sql, bound
+
+
+def build_source_query(table: str, doi: Any, dt: Optional[str]) -> Tuple[str, List[Any]]:
+    """Build the query fetching every source row for a DOI key.
+
+    Matches rows whose cleaned DOI key equals ``normalize_doi(doi)``,
+    optionally restricted to ``dt``. Returns ``(sql, params)`` with ``dt``
+    (when set) preceding the key in ``params``.
+    """
+    bound: List[Any] = []
+    dt_filter = ""
+    if dt is not None:
+        bound.append(dt)
+        dt_filter = " AND `dt` = %s"
+    bound.append(normalize_doi(doi))
+    sql = f"SELECT * FROM {quote_identifier(table)} WHERE 1=1{dt_filter} AND {doi_key_expr()} = %s"
+    return sql, bound
+
+
+def build_source_batch_query(
+    table: str,
+    sample_keys: Sequence[str],
+    dt: Optional[str],
+) -> Tuple[str, List[Any]]:
+    """Build one query fetching the source rows of all ``sample_keys``.
+
+    The keys are bound into a ``sample_keys`` CTE (one ``SELECT %s`` per key,
+    joined with ``UNION ALL``) that is joined to ``table`` (alias ``s``) on
+    the cleaned DOI key, optionally restricted to ``dt``. Returns
+    ``(sql, params)``: the normalized keys followed by ``dt`` when it is set.
+
+    Raises:
+        ValueError: if ``sample_keys`` is empty.
+    """
+    if not sample_keys:
+        raise ValueError("sample_keys must not be empty")
+
+    cte_body = " UNION ALL ".join(["SELECT %s AS sample_key"] * len(sample_keys))
+    bound: List[Any] = list(map(normalize_doi, sample_keys))
+    dt_filter = ""
+    if dt is not None:
+        bound.append(dt)
+        dt_filter = " AND s.`dt` = %s"
+
+    sql = (
+        f"WITH sample_keys AS ({cte_body}) "
+        f"SELECT s.* FROM {quote_identifier(table)} s "
+        f"JOIN sample_keys k ON {doi_key_expr('s')} = k.sample_key "
+        f"WHERE 1=1{dt_filter}"
+    )
+    return sql, bound
+
+
+def build_target_batch_query(
+    table: str,
+    sample_keys: Sequence[str],
+    dt: Optional[str],
+) -> Tuple[str, List[Any]]:
+    """Build one query fetching the target rows of all ``sample_keys``.
+
+    The keys are bound into a ``sample_keys`` CTE (one ``SELECT %s`` per key,
+    joined with ``UNION ALL``) that is joined to ``table`` (alias ``t``) on
+    the cleaned DOI key, optionally restricted to ``dt``. Returns
+    ``(sql, params)``: the normalized keys followed by ``dt`` when it is set.
+
+    Raises:
+        ValueError: if ``sample_keys`` is empty.
+    """
+    if not sample_keys:
+        raise ValueError("sample_keys must not be empty")
+
+    key_rows = ["SELECT %s AS sample_key" for _ in sample_keys]
+    bound: List[Any] = [normalize_doi(key) for key in sample_keys]
+    if dt is None:
+        dt_filter = ""
+    else:
+        bound.append(dt)
+        dt_filter = " AND t.`dt` = %s"
+
+    cte = f"WITH sample_keys AS ({' UNION ALL '.join(key_rows)}) "
+    select = f"SELECT t.* FROM {quote_identifier(table)} t "
+    join = f"JOIN sample_keys k ON {doi_key_expr('t')} = k.sample_key "
+    return cte + select + join + f"WHERE 1=1{dt_filter}", bound
+
+
+def group_rows_by_doi(rows: Sequence[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
+    """Group ``rows`` by the normalized value of their ``doi`` field.
+
+    Rows whose ``doi`` normalizes (via ``normalize_doi``) to a falsy value
+    are dropped. Groups keep the rows in input order.
+    """
+    buckets: Dict[str, List[Dict[str, Any]]] = {}
+    for row in rows:
+        doi_key = normalize_doi(row.get("doi"))
+        if doi_key:
+            if doi_key not in buckets:
+                buckets[doi_key] = []
+            buckets[doi_key].append(row)
+    return buckets
+
+
+def _parse_classifications(raw: Any) -> Optional[Dict[str, Any]]:
+    """Return ``raw`` as a dict, decoding it from JSON text if necessary.
+
+    Strings are parsed with ``json.loads``; undecodable strings yield
+    ``None``. Anything that is not a dict (after decoding) yields ``None``.
+    """
+    candidate = raw
+    if isinstance(candidate, str):
+        try:
+            candidate = json.loads(candidate)
+        except json.JSONDecodeError:
+            return None
+    return candidate if isinstance(candidate, dict) else None
+
+
+def validate_classifications(
+    source_records: List[Dict[str, Any]],
+    target_row: Dict[str, Any],
+) -> Dict[str, Dict[str, Any]]:
+    """Validate classifications field with random-pick semantics.
+
+    CSV rules:
+    - classifications.mesh: randomly pick one non-empty classifications.mesh value; null if all empty
+    - msc_class, acm_class, arxiv_category: pick from arxiv records only
+
+    Because the pick is random, the target value is accepted when its
+    canonical JSON form equals that of any candidate found in the source
+    records. Returns a dict of mismatches keyed by ``classifications`` or
+    ``classifications.<sub-field>``, each holding ``expected`` and ``actual``;
+    an empty dict means the target is consistent with the sources.
+    """
+    arxiv_fields = ("msc_class", "acm_class", "arxiv_category")
+    actual_cls = _parse_classifications(normalize_json_like(target_row.get("classifications")))
+    mismatches: Dict[str, Dict[str, Any]] = {}
+
+    mesh_candidates: List[str] = []
+    arxiv_sub_candidates: Dict[str, List[str]] = {name: [] for name in arxiv_fields}
+
+    def _serialize(raw: Any) -> str:
+        # Canonical JSON text, so that candidates can be compared and de-duplicated.
+        return canonical_json(canonicalize(raw))
+
+    def _expectation(candidates: Iterable[str]) -> Any:
+        # Human-readable expected value: the single candidate, or {"any_of": [...]}.
+        decoded: List[Any] = []
+        for text in sorted(set(candidates)):
+            try:
+                decoded.append(json.loads(text))
+            except json.JSONDecodeError:
+                decoded.append(text)
+        return decoded[0] if len(decoded) == 1 else {"any_of": decoded}
+
+    for record in source_records:
+        parsed = _parse_classifications(record.get("classifications"))
+        if parsed is not None and is_non_empty(parsed.get("mesh")):
+            mesh_candidates.append(_serialize(parsed["mesh"]))
+        origin = normalize_origin_osi(record.get("origin_osi"))
+        if origin != "arxiv" or parsed is None:
+            continue
+        for name, bucket in arxiv_sub_candidates.items():
+            if is_non_empty(parsed.get(name)):
+                bucket.append(_serialize(parsed[name]))
+
+    if actual_cls is None:
+        # A missing target struct is only wrong when some source record had one.
+        source_has_struct = any(
+            _parse_classifications(record.get("classifications")) is not None
+            for record in source_records
+        )
+        if source_has_struct:
+            mismatches["classifications"] = {"expected": "non-null struct", "actual": None}
+        return mismatches
+
+    actual_mesh = actual_cls.get("mesh")
+    if mesh_candidates:
+        expected_mesh = _expectation(mesh_candidates)
+        if not is_non_empty(actual_mesh) or _serialize(actual_mesh) not in set(mesh_candidates):
+            mismatches["classifications.mesh"] = {
+                "expected": expected_mesh,
+                "actual": actual_mesh,
+            }
+    elif is_non_empty(actual_mesh):
+        mismatches["classifications.mesh"] = {"expected": None, "actual": actual_mesh}
+
+    for name, candidates in arxiv_sub_candidates.items():
+        actual_sub = actual_cls.get(name)
+        unique_candidates = set(candidates)
+        if unique_candidates:
+            expected_sub = _expectation(unique_candidates)
+            if not is_non_empty(actual_sub) or _serialize(actual_sub) not in unique_candidates:
+                mismatches[f"classifications.{name}"] = {
+                    "expected": expected_sub,
+                    "actual": actual_sub,
+                }
+        elif is_non_empty(actual_sub):
+            mismatches[f"classifications.{name}"] = {
+                "expected": "empty (no arxiv source)",
+                "actual": actual_sub,
+            }
+
+    return mismatches
+
+
+def normalize_order_insensitive_value(value: Any) -> Any:
+    """Canonicalize ``value`` and, if it is a list, sort it by canonical JSON.
+
+    Sorting makes two lists with the same elements compare equal regardless
+    of their original order. Non-list values are only canonicalized.
+    """
+    canonical = canonicalize(value)
+    if not isinstance(canonical, list):
+        return canonical
+    return sorted(canonical, key=canonical_json)
+
+
+def normalize_mesh_empty_values(value: Any) -> Any:
+    """Canonicalize ``value`` and replace empty strings with ``None``.
+
+    Applied recursively through dict values and list elements, so ``""`` and
+    ``None`` compare equal anywhere inside the structure.
+    """
+    value = canonicalize(value)
+    if isinstance(value, dict):
+        cleaned = {}
+        for name, item in value.items():
+            cleaned[name] = normalize_mesh_empty_values(None if item == "" else item)
+        return cleaned
+    if isinstance(value, list):
+        return list(map(normalize_mesh_empty_values, value))
+    if value == "":
+        return None
+    return value
+
+
+def normalize_empty_for_compare(value: Any, data_type: str) -> Any:
+    """Map type-specific "empty" values to ``None`` before comparison.
+
+    ``data_type`` is matched case-insensitively after stripping:
+
+    * ``string``/``varchar``/``char``/``text``: blank strings become ``None``;
+    * types starting with ``array``: ``[]`` and the strings ``""``/``"[]"``
+      (ignoring surrounding whitespace) become ``None``.
+
+    Any other value, and any other type, is returned unchanged.
+    """
+    kind = (data_type or "").strip().lower()
+    if value is None:
+        return None
+    if kind in ("string", "varchar", "char", "text"):
+        is_blank = isinstance(value, str) and value.strip() == ""
+        return None if is_blank else value
+    if kind.startswith("array"):
+        if value == [] or (isinstance(value, str) and value.strip() in ("", "[]")):
+            return None
+    return value
+
+
+def compare_records(
+    expected: Dict[str, Any],
+    actual: Dict[str, Any],
+    order_insensitive_fields: Optional[set] = None,
+    field_types: Optional[Dict[str, str]] = None,
+) -> Dict[str, Dict[str, Any]]:
+    """Compare ``actual`` against ``expected`` field by field.
+
+    Only fields present in ``expected`` are checked. For fields listed in
+    ``order_insensitive_fields`` both sides are canonicalized and sorted;
+    otherwise only the actual value is canonicalized. Both sides then go
+    through ``normalize_empty_for_compare`` using the type from
+    ``field_types`` (``""`` when unknown), and the ``mesh`` field
+    additionally through ``normalize_mesh_empty_values``.
+
+    Returns a dict mapping each differing field to its normalized
+    ``expected`` and ``actual`` values; empty when everything matches.
+    """
+    unordered = order_insensitive_fields or set()
+    types = field_types or {}
+    diffs: Dict[str, Dict[str, Any]] = {}
+    for name, want in expected.items():
+        if name in unordered:
+            want = normalize_order_insensitive_value(want)
+            got = normalize_order_insensitive_value(actual.get(name))
+        else:
+            got = canonicalize(actual.get(name))
+        declared = types.get(name, "")
+        want = normalize_empty_for_compare(want, declared)
+        got = normalize_empty_for_compare(got, declared)
+        if name == "mesh":
+            want = normalize_mesh_empty_values(want)
+            got = normalize_mesh_empty_values(got)
+        if want != got:
+            diffs[name] = {"expected": want, "actual": got}
+    return diffs
+
+
+def validate_dt_partitions(
+    conn: Any,
+    source_table: str,
+    target_table: str,
+    dt: Optional[str],
+    *,
+    skip_source_distinct: bool = False,
+    count_mode: str = "hash-buckets",
+    count_buckets: int = 100,
+) -> Dict[str, Any]:
+    """Check dt partition coverage and key counts between source and target."""
+    params: List[Any] = []
+    dt_filter = _dt_clause(dt, params)
+
+    src_map: Dict[str, int] = {}
+    bucket_counts: Dict[str, List[Dict[str, Any]]] = {}
+    failed_buckets: List[Dict[str, Any]] = []
+    matched_key_count: Optional[int] = None
+    key_gap_failed = False
+    if skip_source_distinct:
+        count_mode = "skip"
+
+    if count_mode == "exact":
+        key_expr = doi_key_expr()
+        src_sql = (
+            f"SELECT `dt`, COUNT(DISTINCT {key_expr}) AS key_count"
+            f" FROM {quote_identifier(source_table)}"
+            f" WHERE 1=1{_doi_key_not_null_clause()}{dt_filter} GROUP BY `dt` ORDER BY `dt`"
+        )
+        src_rows = fetch_records(conn, src_sql, params)
+        src_map = {str(r["dt"]): int(r["key_count"]) for r in src_rows}
+    elif count_mode == "hash-buckets":
+        if dt is None:
+            raise ValueError("--count-mode hash-buckets requires --dt")
+        if count_buckets <= 0:
+            raise ValueError("--count-buckets must be positive")
+        src_map[str(dt)] = 0
+        matched_key_count = 0
+        bucket_counts[str(dt)] = []
+        source_key_expr = doi_key_expr()
+        target_key_expr_t = doi_key_expr("t")
+        for bucket in range(count_buckets):
+            bucket_params: List[Any] = [dt, bucket]
+            bucket_sql = (
+                f"SELECT COUNT(DISTINCT {source_key_expr}) AS key_count"
+                f" FROM {quote_identifier(source_table)}"
+                " WHERE 1=1"
+                f"{_doi_key_not_null_clause()}"
+                " AND `dt` = %s"
+                f" AND (ABS(CRC32({source_key_expr})) MOD {int(count_buckets)}) = %s"
+            )
+            _log(
+                f"[info] source distinct hash bucket {bucket + 1}/{count_buckets} "
+                f"开始执行（dt={dt!r}）…"
+            )
+            t0 = time.monotonic()
+            try:
+                rows = fetch_records(conn, bucket_sql, bucket_params)
+                row = rows[0] if rows else None
+                key_count = int(row.get("key_count") or 0) if row else 0
+                src_map[str(dt)] += key_count
+                bucket_counts[str(dt)].append({"bucket": bucket, "key_count": key_count})
+                _log(
+                    f"[info] source distinct hash bucket {bucket + 1}/{count_buckets} "
+                    f"完成，耗时 {time.monotonic() - t0:.1f}s，key_count={key_count}"
+                )
+                join_sql = (
+                    "SELECT COUNT(*) AS key_count"
+                    " FROM ("
+                    f" SELECT DISTINCT {source_key_expr} AS doi_key"
+                    f" FROM {quote_identifier(source_table)}"
+                    " WHERE 1=1"
+                    f"{_doi_key_not_null_clause()}"
+                    " AND `dt` = %s"
+                    f" AND (ABS(CRC32({source_key_expr})) MOD {int(count_buckets)}) = %s"
+                    " ) s"
+                    f" JOIN {quote_identifier(target_table)} t"
+                    f" ON t.`dt` = %s AND {target_key_expr_t} = s.doi_key"
+                )
+                join_t0 = time.monotonic()
+                join_rows = fetch_records(conn, join_sql, [dt, bucket, dt])
+                join_row = join_rows[0] if join_rows else None
+                joined_count = int(join_row.get("key_count") or 0) if join_row else 0
+                matched_key_count += joined_count
+                _log(
+                    f"[info] matched key hash bucket {bucket + 1}/{count_buckets} "
+                    f"完成，耗时 {time.monotonic() - join_t0:.1f}s，key_count={joined_count}"
+                )
+            except Exception as exc:
+                key_gap_failed = True
+                failed_buckets.append({"dt": str(dt), "bucket": bucket, "error": str(exc)})
+                _log(
+                    f"[warn] source distinct hash bucket {bucket + 1}/{count_buckets} "
+                    f"失败，耗时 {time.monotonic() - t0:.1f}s：{exc}"
+                )
+    elif count_mode == "skip":
+        pass
+    else:
+        raise ValueError(f"Unsupported count_mode: {count_mode}")
+
+    tgt_sql = (
+        f"SELECT `dt`, COUNT(*) AS row_count"
+        f" FROM {quote_identifier(target_table)}"
+        f" WHERE 1=1{dt_filter} GROUP BY `dt` ORDER BY `dt`"
+    )
+    tgt_rows = fetch_records(conn, tgt_sql, params)
+    tgt_map = {str(r["dt"]): int(r["row_count"]) for r in tgt_rows}
+    all_dts = sorted(set(src_map) | set(tgt_map))
+
+    mismatches: List[Dict[str, Any]] = []
+    for d in all_dts:
+        src_cnt = src_map.get(d)
+        tgt_cnt = tgt_map.get(d)
+        if src_cnt != tgt_cnt:
+            mismatches.append({
+                "dt": d,
+                "source_key_count": src_cnt,
+                "target_row_count": tgt_cnt,
+            })
+
+    result = {
+        "source_dt_count": len(src_map),
+        "target_dt_count": len(tgt_map),
+        "missing_in_target": sorted(set(src_map) - set(tgt_map)),
+        "extra_in_target": sorted(set(tgt_map) - set(src_map)),
+        "count_mismatches": mismatches,
+        "source_distinct_skipped": count_mode == "skip",
+        "source_count_mode": count_mode,
+        "source_count_buckets": count_buckets if count_mode == "hash-buckets" else None,
+        "source_bucket_counts": bucket_counts,
+        "source_failed_buckets": failed_buckets,
+    }
+    if count_mode == "hash-buckets" and dt is not None and matched_key_count is not None:
+        target_count = tgt_map.get(str(dt))
+        source_count = src_map.get(str(dt))
+        result["matched_key_count"] = matched_key_count
+        result["key_gap_failed"] = key_gap_failed
+        if not key_gap_failed and source_count is not None and target_count is not None:
+            result["source_missing_in_target_key_count"] = max(source_count - matched_key_count, 0)
+            result["target_extra_key_count"] = max(target_count - matched_key_count, 0)
+    return result
+
+
+def discover_dt_values(conn: Any, table: str) -> List[str]:
+    # Distinct non-NULL, non-empty `dt` values of `table`, ascending, each coerced to str.
+    quoted = quote_identifier(table)
+    condition = "WHERE `dt` IS NOT NULL AND `dt` != '' ORDER BY `dt`"
+    records = fetch_records(conn, f"SELECT DISTINCT `dt` FROM {quoted} {condition}")
+    return [str(record["dt"]) for record in records]
+
+
+def validate_db(
+    *,
+    config_path: Path,
+    source_table: str,
+    target_table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    sample_mode: str,
+    report_path: Optional[Path],
+    mapping_csv: Path = DEFAULT_MAPPING_CSV,
+    skip_dt_check: bool = False,
+    skip_source_distinct: bool = False,
+    count_mode: str = "hash-buckets",
+    count_buckets: int = 100,
+    hash_mod_base: Optional[int] = 100,
+    hash_mod_max: Optional[int] = 2,
+) -> Dict[str, Any]:
+    """Compare sampled DOIs between the source and target tables, write the reports, return the summary dict."""
+    rules = load_field_rules(mapping_csv)
+    output_fields = output_fields_from_rules(rules)
+    order_insensitive_fields = order_insensitive_fields_from_rules(rules)
+    field_types = {rule.field_name: rule.data_type for rule in rules}
+    has_cls = any(r.strategy == "random_pick_cls" for r in rules)
+    cfg = load_config(config_path)
+    mysql_cfg = cfg.get("mysql", {}) if isinstance(cfg.get("mysql"), dict) else {}
+    catalog = mysql_cfg.get("catalog")
+    database = str(mysql_cfg.get("database") or "dws")
+    source_table = qualify_table_name(source_table, catalog, database)
+    target_table = qualify_table_name(target_table, catalog, database)
+    hash_enabled = bool(hash_mod_base and hash_mod_max and hash_mod_max > 0)  # None/0 on either side disables it
+    _log(
+        f"[info] 论文去重校验开始：dt={dt!r}, limit={limit}, sample_mode={sample_mode}, "
+        f"hash_sample={'on' if hash_enabled else 'off'}, "
+        f"skip_dt_check={skip_dt_check}, count_mode={count_mode}, "
+        f"source={source_table}, target={target_table}"
+    )
+    with connect_starrocks(config_path) as conn:
+        _log("[info] StarRocks 连接成功")
+        if dt is not None:
+            dt_list = [dt]
+        else:
+            _log("[info] 正在发现源表 dt 分区…")
+            dt_list = discover_dt_values(conn, source_table)  # no dt given: validate every source partition
+            _log(f"[info] 自动发现 {len(dt_list)} 个 dt 分区，逐分区验证")
+
+        if skip_dt_check:
+            dt_check = {"skipped": True}
+            _log("[info] 跳过分区行数统计（--skip-dt-check）")
+        else:
+            _log("[info] 正在统计目标分区行数（源表 DISTINCT 可较慢，可用 --skip-source-distinct 跳过）…")
+            t0 = time.monotonic()
+            dt_check = validate_dt_partitions(  # NOTE(review): dt=None with count_mode="hash-buckets" raises ValueError in there
+                conn,
+                source_table,
+                target_table,
+                dt,
+                skip_source_distinct=skip_source_distinct,
+                count_mode=count_mode,
+                count_buckets=count_buckets,
+            )
+            _log(f"[info] 分区统计完成，耗时 {time.monotonic() - t0:.1f}s")
+
+        checked = passed = failed = missing_source = missing_target = 0  # totals across all partitions
+        source_count_buckets = {"one": 0, "two": 0, "multi": 0}  # sampled keys by number of source rows
+        mismatch_rows: List[Dict[str, Any]] = []
+
+        for partition_dt in dt_list:
+            _log(f"[info] 分区 {partition_dt}：开始抽样 key…")
+            sample_keys = fetch_sample_keys(
+                conn,
+                source_table=source_table,
+                target_table=target_table,
+                dt=partition_dt,
+                sample_mode=sample_mode,
+                sample_size=limit,
+                hash_mod_base=hash_mod_base if hash_enabled else None,
+                hash_mod_max=hash_mod_max if hash_enabled else None,
+            )
+            _log(f"[info] 分区 {partition_dt}：抽到 {len(sample_keys)} 个 DOI，开始批量拉取源/目标记录…")
+            t0 = time.monotonic()
+            source_rows_by_key: Dict[str, List[Dict[str, Any]]] = {}
+            target_rows_by_key: Dict[str, List[Dict[str, Any]]] = {}
+            if sample_keys:  # skip both batch queries when nothing was sampled
+                source_sql, source_params = build_source_batch_query(source_table, sample_keys, partition_dt)
+                source_rows_by_key = group_rows_by_doi(fetch_records(conn, source_sql, source_params))
+                target_sql, target_params = build_target_batch_query(target_table, sample_keys, partition_dt)
+                target_rows_by_key = group_rows_by_doi(fetch_records(conn, target_sql, target_params))
+            _log(
+                f"[info] 分区 {partition_dt}：批量拉取完成，耗时 {time.monotonic() - t0:.1f}s，"
+                f"源命中 {len(source_rows_by_key)}/{len(sample_keys)}，"
+                f"目标命中 {len(target_rows_by_key)}/{len(sample_keys)}"
+            )
+            _log(f"[info] 分区 {partition_dt}：开始逐条比对…")
+
+            for position, doi in enumerate(sample_keys, start=1):
+                sample_key = normalize_doi(doi)
+                target_rows = target_rows_by_key.get(sample_key, [])
+                source_rows = source_rows_by_key.get(sample_key, [])
+                checked += 1
+                # Progress uses the per-partition position; `checked` keeps growing across partitions.
+                if position == 1 or position % 20 == 0:
+                    _log(f"[info] 分区 {partition_dt}：已比对 {position}/{len(sample_keys)} 条")
+
+                if len(source_rows) == 1:
+                    source_count_buckets["one"] += 1
+                elif len(source_rows) == 2:
+                    source_count_buckets["two"] += 1
+                elif len(source_rows) > 2:
+                    source_count_buckets["multi"] += 1
+
+                if not target_rows:  # checked first, so a key absent on both sides counts as missing_target
+                    missing_target += 1
+                    mismatch_rows.append({
+                        "key": doi,
+                        "dt": partition_dt,
+                        "status": "missing_target",
+                        "source_count": len(source_rows),
+                        "source_records": [
+                            {key: normalize_json_like(value) for key, value in row.items()}
+                            for row in source_rows
+                        ],
+                        "mismatches": {},
+                    })
+                    continue
+                if not source_rows:
+                    missing_source += 1
+                    mismatch_rows.append({
+                        "key": doi,
+                        "dt": partition_dt,
+                        "status": "missing_source",
+                        "source_count": 0,
+                        "target_records": [
+                            {key: normalize_json_like(value) for key, value in row.items()}
+                            for row in target_rows
+                        ],
+                        "mismatches": {},
+                    })
+                    continue
+
+                target_row = target_rows[0]  # only the first target row for the key is compared
+                normalized_source = [{key: normalize_json_like(value) for key, value in row.items()} for row in source_rows]
+                aggregated = aggregate_group(normalized_source, rules)
+                expected = comparable_record(aggregated, output_fields)
+                actual = comparable_record(target_row, output_fields)
+                mismatches = compare_records(expected, actual, order_insensitive_fields, field_types)
+                if has_cls:
+                    cls_mismatches = validate_classifications(normalized_source, target_row)
+                    mismatches.update(cls_mismatches)
+                if mismatches:
+                    failed += 1
+                    mismatch_rows.append({
+                        "key": doi,
+                        "dt": partition_dt,
+                        "status": "field_mismatch",
+                        "source_count": len(source_rows),
+                        "mismatches": mismatches,
+                    })
+                else:
+                    passed += 1
+
+    if report_path is not None:  # the DB connection is already closed here; only files are written
+        report_path.parent.mkdir(parents=True, exist_ok=True)
+        with report_path.open("w", encoding="utf-8") as f:
+            for row in mismatch_rows:
+                f.write(json.dumps(localize_report_keys(row), ensure_ascii=False, cls=JsonEncoder) + "\n")
+        (report_path.parent / "source_field_warning.jsonl").write_text("", encoding="utf-8")  # always rewritten empty
+
+    result = {
+        "status": "ok",
+        "kind": "paper",
+        "source_table": source_table,
+        "target_table": target_table,
+        "key_field": "doi",
+        "dt": dt,
+        "validated_partitions": dt_list,
+        "sample_mode": sample_mode,
+        "sample_size": limit,
+        "dt_check": dt_check,
+        "checked": checked,
+        "passed": passed,
+        "failed": failed,
+        "missing_source": missing_source,
+        "missing_target": missing_target,
+        "source_count_buckets": source_count_buckets,
+        "report_path": str(report_path) if report_path is not None else None,
+        "sample_mismatches": mismatch_rows[:5],  # only the first five are echoed in the summary
+    }
+    if report_path is not None:
+        write_report_summary(report_path, result, mismatch_rows)
+    print(json.dumps(result, ensure_ascii=False, cls=JsonEncoder))  # summary JSON goes to stdout
+    return result
+
+
+# ---- CLI ----
+
+
+def cli() -> None:  # command-line entry point: config-file values become argparse defaults, flags override them
+    config_parser = argparse.ArgumentParser(add_help=False)  # pre-parser that only looks for --config
+    config_parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH)
+    config_args, _ = config_parser.parse_known_args()
+    cfg = load_config(config_args.config) if config_args.config.exists() else {}  # missing file: built-in defaults only
+    paper_cfg = cfg.get("unique_paper", {})  # config section that supplies the defaults below
+
+    default_csv = paper_cfg.get("mapping_csv")
+    if default_csv:
+        default_csv = PROJECT_ROOT / default_csv  # configured path is joined onto PROJECT_ROOT
+    else:
+        default_csv = DEFAULT_MAPPING_CSV
+
+    parser = argparse.ArgumentParser(description="Validate meta_paper unique DB table by DOI.")
+    parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH, help="shared settings JSON path")
+    parser.add_argument("--mapping-csv", type=Path, default=default_csv, help="field mapping CSV")
+    parser.add_argument("--source-table", default=paper_cfg.get("source_table", DEFAULT_SOURCE_TABLE))
+    parser.add_argument("--target-table", default=paper_cfg.get("target_table", DEFAULT_TARGET_TABLE))
+    parser.add_argument("--dt", default=paper_cfg.get("dt"), help="dt partition filter")
+    parser.add_argument("--limit", type=int, default=int(paper_cfg.get("limit", 600)))
+    parser.add_argument(
+        "--sample-mode",
+        choices=("count-buckets", "mixed", "target-random", "target-first"),
+        default=paper_cfg.get("sample_mode", "count-buckets"),
+        help="count-buckets: 1/2/N 源行分桶；mixed: 加深抽样；target-random: 目标表稳定排序抽样；target-first: 目标表 LIMIT 抽样（smoke 最快）",
+    )
+    parser.add_argument("--full", action="store_true", help="validate all target rows")
+    parser.add_argument("--skip-dt-check", action="store_true", default=bool(paper_cfg.get("skip_dt_check")))
+    parser.add_argument(
+        "--skip-source-distinct",
+        action="store_true",
+        default=bool(paper_cfg.get("skip_source_distinct")),
+        help="dt 统计时跳过源表 COUNT(DISTINCT doi)，等价于 --count-mode skip",
+    )
+    parser.add_argument(
+        "--count-mode",
+        choices=("exact", "skip", "hash-buckets"),
+        default=paper_cfg.get("count_mode", "hash-buckets"),
+        help="源表 distinct DOI 计数模式：exact 单条 COUNT(DISTINCT)，hash-buckets 分桶精确统计，skip 跳过",
+    )
+    parser.add_argument(
+        "--count-buckets",
+        type=int,
+        default=int(paper_cfg.get("count_buckets", 100)),
+        help="--count-mode hash-buckets 时的 hash 分桶数",
+    )
+    parser.add_argument(
+        "--no-sample-hash",
+        action="store_true",
+        help="关闭 CRC32 哈希预过滤（默认 mod 100 取 2，约 2%% 子集）",
+    )
+    parser.add_argument(
+        "--sample-hash-mod-base",
+        type=int,
+        default=int(paper_cfg.get("sample_hash_mod_base", 100)),
+    )
+    parser.add_argument(
+        "--sample-hash-mod-max",
+        type=int,
+        default=int(paper_cfg.get("sample_hash_mod_max", 2)),
+    )
+    parser.add_argument("--report", type=Path, default=paper_cfg.get("report_path"), help="JSONL report path")
+    args = parser.parse_args()
+
+    hash_mod_base = None if args.no_sample_hash else args.sample_hash_mod_base  # None turns hash sampling off
+    hash_mod_max = None if args.no_sample_hash else args.sample_hash_mod_max
+    count_mode = "skip" if args.skip_source_distinct else args.count_mode  # --skip-source-distinct wins over --count-mode
+    report_path = Path(args.report) if args.report else default_report_path(
+        args.dt,
+        "count-buckets" if args.full else args.sample_mode,  # --full always reports as count-buckets
+        args.full,
+    )
+
+    validate_db(
+        config_path=args.config,
+        source_table=args.source_table,
+        target_table=args.target_table,
+        dt=args.dt,
+        limit=None if args.full else args.limit,  # --full removes the sample limit
+        sample_mode="count-buckets" if args.full else args.sample_mode,
+        report_path=report_path,
+        mapping_csv=args.mapping_csv,
+        skip_dt_check=args.skip_dt_check,
+        skip_source_distinct=args.skip_source_distinct,
+        count_mode=count_mode,
+        count_buckets=args.count_buckets,
+        hash_mod_base=hash_mod_base,
+        hash_mod_max=hash_mod_max,
+    )
+
+
+from dingo.config.input_args import EvaluatorRuleArgs
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail, QualityLabel
+from dingo.model.model import Model
+from dingo.model.rule.base import BaseRule
+from dingo.model.rule.scibase.report_utils import bool_param, int_param, write_temp_settings
+
+
+@Model.rule_register(
+    "QUALITY_BAD_EFFECTIVENESS",
+    ["sci_base_qa_test", "meta_paper_unique"],
+)
+class RuleSciBaseMetaPaperUniqueReport(BaseRule):  # dingo rule wrapper around validate_db for the paper DOI-unique table
+    _metric_info = {
+        "category": "Rule-Based Metadata Quality Metrics",
+        "quality_dimension": "EFFECTIVENESS",
+        "metric_name": "RuleSciBaseMetaPaperUniqueReport",
+        "description": "Run SciBase paper DOI unique DB validation and write reports.",
+        "paper_title": "",
+        "paper_url": "",
+        "paper_authors": "",
+        "evaluation_results": "",
+    }
+
+    _required_fields = [RequiredField.METADATA]
+    dynamic_config = EvaluatorRuleArgs(parameters={})  # all run settings are read from .parameters in eval()
+
+    @classmethod
+    def eval(cls, input_data: Data) -> EvalDetail:  # the input record is ignored; each call runs the whole DB validation
+        del input_data
+        params = cls.dynamic_config.parameters or {}
+        full = bool_param(params, "full", False)
+        sample_mode = str(params.get("sample_mode") or "count-buckets")
+        dt = params.get("dt")
+        report_path = Path(params["report_path"]) if params.get("report_path") else None  # 1st choice: explicit path
+        if report_path is None and params.get("output_dir"):
+            report_path = Path(str(params["output_dir"])) / "source_field_mismatch.jsonl"  # 2nd choice: output_dir
+        if report_path is None:
+            report_path = default_report_path(dt, "count-buckets" if full else sample_mode, full)  # fallback default
+
+        config_path = write_temp_settings(params)  # presumably writes params to a settings file for validate_db — confirm in report_utils
+        count_mode = "skip" if bool_param(params, "skip_source_distinct", False) else str(params.get("count_mode") or "hash-buckets")
+        result = validate_db(
+            config_path=config_path,
+            source_table=str(params.get("source_table") or DEFAULT_SOURCE_TABLE),
+            target_table=str(params.get("target_table") or DEFAULT_TARGET_TABLE),
+            dt=dt,
+            limit=None if full else int_param(params, "limit", 600),  # full runs have no sample limit
+            sample_mode="count-buckets" if full else sample_mode,
+            report_path=report_path,
+            mapping_csv=Path(str(params.get("mapping_csv") or DEFAULT_MAPPING_CSV)),
+            skip_dt_check=bool_param(params, "skip_dt_check", False),
+            skip_source_distinct=bool_param(params, "skip_source_distinct", False),
+            count_mode=count_mode,
+            count_buckets=int_param(params, "count_buckets", 100),
+            hash_mod_base=None if bool_param(params, "no_sample_hash", False) else int_param(params, "sample_hash_mod_base", 100),
+            hash_mod_max=None if bool_param(params, "no_sample_hash", False) else int_param(params, "sample_hash_mod_max", 2),
+        )
+        bad = any(  # any failed or missing sample key makes the run bad
+            int(result.get(key) or 0) > 0
+            for key in ("failed", "missing_source", "missing_target")
+        )
+        count_mismatches = ((result.get("dt_check") or {}).get("count_mismatches") or [])  # absent when the dt check was skipped
+        bad = bad or bool(count_mismatches)
+        reason = [str(report_path.parent), f"checked={result.get('checked')}", f"failed={result.get('failed')}"]
+        if bad:
+            return EvalDetail(
+                metric=cls.__name__,
+                status=True,  # NOTE(review): status=True is only set on the bad branch — confirm it means "issue found" in EvalDetail
+                label=[f"{cls.metric_type}.{cls.__name__}"],
+                reason=reason,
+            )
+        return EvalDetail(metric=cls.__name__, label=[QualityLabel.QUALITY_GOOD], reason=reason)
+
+
+if __name__ == "__main__":
+    cli()  # runs only when executed as a script; importing the module does not start the CLI
diff --git a/dingo/model/rule/scibase/meta_patent_parsed_info.py b/dingo/model/rule/scibase/meta_patent_parsed_info.py
new file mode 100644
index 00000000..76f3b00f
--- /dev/null
+++ b/dingo/model/rule/scibase/meta_patent_parsed_info.py
@@ -0,0 +1,1720 @@
+#!/usr/bin/env python3
+"""Validate parsed patent fields against the raw XML column (default `xml_content`).
+
+Field extraction rules are driven by assets/patent_mapping.csv.  The script is
+intentionally conservative: fields with a confident XML extractor are compared;
+metadata/library fields and unsupported free-form rules are reported as skipped.
+"""
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import re
+import sys
+import time
+import xml.etree.ElementTree as ET
+from collections import Counter
+from dataclasses import dataclass
+from datetime import date, datetime
+from decimal import Decimal
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
+
+try:
+    import pymysql
+except ImportError:  # pragma: no cover - runtime dependency check
+    pymysql = None  # type: ignore
+
+
+PROJECT_ROOT = Path(__file__).resolve().parent  # directory that contains this module
+ASSETS_DIR = PROJECT_ROOT / "assets"
+DEFAULT_CONFIG_PATH = Path("sci_base_qa_test_config.json")  # relative, so resolved against the working directory
+TEMPLATE_CONFIG_PATH = ASSETS_DIR / "settings.template.json"  # named in load_config's "file not found" hint
+DEFAULT_MAPPING_CSV = ASSETS_DIR / "patent_mapping.csv"
+REPORT_ROOT = Path("report")  # relative base directory used by default_report_path
+DEFAULT_TABLE = "test.iceberg_test_patent_parsed_info_acc_d"
+DEFAULT_XML_FIELD = "xml_content"
+
+LIBRARY_MODULE = "库信息"  # CSV module label ("library info"); rules carrying it are left out of the XML name helpers
+SKIP_FIELDS = {
+    "content",  # table content is processed full text; raw XML lives in xml_content for this table.
+}
+FIELD_ALIASES = {  # presumably maps prefixed field names to their short form — TODO confirm at the use site
+    "patent_national_classifications": "national_classifications",
+    "patent_domestic_classifications": "domestic_classifications",
+    "patent_fi_classifications": "fi_classifications",
+    "patent_cpc_classifications": "cpc_classifications",
+    "patent_locarno_classes": "locarno_classes",
+}
+ORDER_INSENSITIVE_TYPES = ("list", "array")
+ELEMENT_COVERAGE_SAMPLE_LIMIT = 80
+SAMPLE_MODE_RANDOM = "random"
+SAMPLE_MODE_BRANCH_COVERAGE = "branch-coverage"  # fallback used by normalize_sample_mode for falsy input
+SAMPLE_MODE_ALIASES = {  # accepted spelling -> canonical mode, looked up by normalize_sample_mode
+    "random": SAMPLE_MODE_RANDOM,
+    "branch-coverage": SAMPLE_MODE_BRANCH_COVERAGE,
+}
+BRANCH_COVERAGE_CANDIDATE_MULTIPLIER = 20
+
+
+@dataclass(frozen=True)
+class PatentRule:  # one immutable row of the patent mapping CSV, as built by load_patent_rules
+    field_name: str  # CSV column "预期字段名" (expected field name)
+    xml_mapping: str  # CSV column "xml映射字段" (XML mapping expression)
+    data_type: str  # CSV column "数据类型" (data type)
+    description: str  # CSV column "字段描述" (field description)
+    validation_rule: str  # CSV column "有效性规则" (validity rule)
+    nullable: str  # CSV column "可空" (nullable), kept as the cleaned raw string
+    module: str  # CSV column "模块" (module); compared with LIBRARY_MODULE by the rule filters
+
+
+@dataclass
+class ExtractResult:  # return type of Extractor callables
+    value: Any  # extracted value; no default, so it must always be supplied
+    status: str = "ok"
+    reason: str = ""  # presumably explains a non-"ok" status — TODO confirm with the extractors
+    branch: str = ""  # presumably names the extraction branch that produced the value — TODO confirm
+
+
+Extractor = Callable[[ET.Element, PatentRule], ExtractResult]  # (XML element, rule) -> ExtractResult
+
+
+class JsonEncoder(json.JSONEncoder):
+    """JSON encoder that also serializes Decimal (as int/float) and date/datetime (ISO 8601)."""
+    def default(self, obj: Any) -> Any:
+        if isinstance(obj, (date, datetime)):
+            return obj.isoformat()
+        if not isinstance(obj, Decimal):
+            return super().default(obj)  # unsupported types raise TypeError as usual
+        # Integral finite decimals become ints; NaN/Infinity cannot be ints, so they fall back to float.
+        return int(obj) if obj.is_finite() and obj == obj.to_integral_value() else float(obj)
+
+
+def _log(message: str) -> None:
+    print(message, file=sys.stderr, flush=True)  # diagnostics go to stderr and are flushed immediately
+
+
+def load_config(path: Path) -> Dict[str, Any]:
+    """Parse the JSON settings file at `path`; a missing file raises FileNotFoundError with a copy hint."""
+    if path.exists():
+        return json.loads(path.read_text(encoding="utf-8"))
+    raise FileNotFoundError(
+        f"Config file not found: {path}\n"
+        f"Copy the template and fill in credentials:\n"
+        f"  cp {TEMPLATE_CONFIG_PATH} {path}"
+    )
+
+
+def connect_starrocks(config_path: Path):  # open a pymysql connection from the config file, retrying transient failures
+    if pymysql is None:
+        raise RuntimeError("pymysql is required. Install pymysql before running DB validation.")
+    cfg = load_config(config_path)
+    mysql_cfg = cfg["mysql"]  # required section; a missing key raises KeyError
+    retry_cfg = cfg.get("retry", {}) if isinstance(cfg.get("retry"), dict) else {}
+    max_attempts = max(1, int(retry_cfg.get("max_attempts", 3)))  # at least one attempt
+    delay = max(0.0, float(retry_cfg.get("initial_delay_sec", 2.0)))
+    backoff = max(1.0, float(retry_cfg.get("backoff_factor", 2.0)))  # clamped so the delay never shrinks
+    read_timeout = int(mysql_cfg.get("read_timeout_sec", 600))
+
+    def is_retryable(exc: Exception) -> bool:
+        if pymysql is not None and isinstance(exc, pymysql.err.OperationalError):
+            code = exc.args[0] if exc.args else None
+            if code in (2003, 2006, 2013):  # MySQL client errors: can't connect / server gone away / lost connection
+                return True
+        return any(token in str(exc).lower() for token in ("lost connection", "can't connect", "timeout"))
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            return pymysql.connect(
+                host=mysql_cfg["host"],
+                port=int(mysql_cfg["port"]),
+                user=mysql_cfg["user"],
+                password=mysql_cfg["password"],
+                charset=mysql_cfg.get("charset", "utf8mb4"),
+                connect_timeout=30,
+                read_timeout=read_timeout,
+            )
+        except Exception as exc:
+            if attempt >= max_attempts or not is_retryable(exc):  # last attempt or non-transient error: re-raise
+                raise
+            _log(f"[retry] MySQL 连接失败 ({type(exc).__name__}: {exc})，{delay:.1f}s 后重试")
+            time.sleep(delay)
+            delay *= backoff  # the wait grows by the backoff factor after each failure
+    raise RuntimeError("MySQL connection retry exhausted unexpectedly")  # defensive: the loop always returns or raises
+
+
+def qualify_table_name(table: str, catalog: Optional[str], database: str = "dws") -> str:
+    """Prefix `table` with the database/catalog it lacks; empty or 3+ part names are returned as-is."""
+    depth = len([piece for piece in table.split(".") if piece.strip()])
+    if depth == 0 or depth >= 3:
+        return table
+    qualified = table if depth == 2 else f"{database}.{table}"
+    if catalog:
+        return f"{catalog}.{qualified}"
+    return qualified
+
+
+def quote_identifier(identifier: str) -> str:
+    pieces = [piece.strip() for piece in identifier.split(".")]  # blank pieces are dropped when joining below
+    if not any(pieces):
+        raise ValueError(f"Invalid identifier: {identifier!r}")
+    return ".".join("`" + piece.replace("`", "``") + "`" for piece in pieces if piece)
+
+
+def fetch_records(conn: Any, sql: str, params: Sequence[Any] = ()) -> List[Dict[str, Any]]:
+    """Run `sql` with `params`; return rows as column-name -> value dicts ([] when there is no result set)."""
+    with conn.cursor() as cur:
+        cur.execute(sql, params)
+        meta = cur.description
+        names = None if meta is None else [column[0] for column in meta]
+        return [] if names is None else [dict(zip(names, record)) for record in cur.fetchall()]
+
+
+def load_patent_rules(path: Path) -> List[PatentRule]:
+    """Read the mapping CSV at `path` into PatentRule rows, skipping rows without a field name.
+
+    Raises ValueError when any required column header is absent.
+    """
+    # CSV header for each PatentRule attribute.
+    columns = {
+        "field_name": "预期字段名",
+        "xml_mapping": "xml映射字段",
+        "data_type": "数据类型",
+        "description": "字段描述",
+        "validation_rule": "有效性规则",
+        "nullable": "可空",
+        "module": "模块",
+    }
+    # utf-8-sig drops a leading BOM if the CSV has one.
+    with path.open(encoding="utf-8-sig", newline="") as f:
+        reader = csv.DictReader(f)
+        missing = set(columns.values()) - set(reader.fieldnames or [])
+        if missing:
+            raise ValueError(f"映射文件 {path} 缺少列: {', '.join(sorted(missing))}")
+        # Every cell is cleaned first; rows whose cleaned field name is empty are dropped.
+        cleaned = ({attr: clean_header_value(row.get(header)) for attr, header in columns.items()} for row in reader)
+        return [PatentRule(**values) for values in cleaned if values["field_name"]]
+
+
+def clean_header_value(value: Any) -> str:
+    # None -> ""; otherwise trim whitespace, then wrapping double quotes, then whitespace again.
+    text = "" if value is None else str(value)
+    return text.strip().strip('"').strip()
+
+
+def safe_filename_token(value: Optional[Any]) -> str:
+    token = re.sub(r"[^0-9A-Za-z_-]+", "_", "all" if value in (None, "") else str(value))
+    return token.strip("_") or "all"  # never hand back an empty token
+
+
+def normalize_sample_mode(value: Any) -> str:
+    """Map `value` to a canonical sample mode (branch-coverage when falsy); ValueError if unknown."""
+    text = str(value or SAMPLE_MODE_BRANCH_COVERAGE).strip()
+    mode = SAMPLE_MODE_ALIASES.get(text.lower()) or SAMPLE_MODE_ALIASES.get(text)
+    if mode is not None:
+        return mode
+    raise ValueError(
+        f"Unsupported sample_mode: {value!r}. Use {SAMPLE_MODE_RANDOM!r} or {SAMPLE_MODE_BRANCH_COVERAGE!r}."
+    )
+
+
+def default_report_path(dt: Optional[str], sample_mode: str, full: bool) -> Path:
+    # REPORT_ROOT/meta_patent_parsed_info_dt_<dt>_<mode>/xml_field_mismatch.jsonl; mode is "full" when `full`.
+    tokens = (safe_filename_token(dt), safe_filename_token("full" if full else sample_mode))
+    return REPORT_ROOT / "meta_patent_parsed_info_dt_{}_{}".format(*tokens) / "xml_field_mismatch.jsonl"
+
+
+def summary_paths(report_path: Path) -> Tuple[Path, Path]:
+    return report_path.parent / "summary.json", report_path.parent / "readable_summary.md"  # both sit beside the report file
+
+
+def local_name(tag: str) -> str:
+    return tag.rsplit("}", 1)[-1] if "}" in tag else tag.split(":", 1)[-1]  # drop a "{namespace}" part, else a "prefix:" part
+
+
+def norm_name(name: str) -> str:
+    return re.sub(r"[^a-z0-9]", "", name.lower())  # lower-case, then keep only a-z and 0-9
+
+
+def node_name(node: ET.Element) -> str:
+    return norm_name(local_name(node.tag))  # namespace-free, normalized tag name used for all name matching
+
+
+def text_content(node: Optional[ET.Element]) -> str:
+    # Join every non-blank text fragment under `node` with spaces, then collapse whitespace; None -> "".
+    fragments = () if node is None else (piece for piece in node.itertext() if piece and piece.strip())
+    return normalize_space(" ".join(fragments))
+
+
+def normalize_space(value: Any) -> str:
+    # None becomes ""; each run of whitespace collapses to one space and both ends are trimmed.
+    text = "" if value is None else str(value)
+    return re.sub(r"\s+", " ", text).strip()
+
+
+def attr_value(node: Optional[ET.Element], name: str) -> str:
+    """Return the whitespace-normalized value of the first attribute matching `name`, else ""."""
+    if node is None:
+        return ""
+    target = norm_name(name)
+    # Attribute keys are matched on their normalized local name.
+    hits = (raw for key, raw in node.attrib.items() if norm_name(local_name(key)) == target)
+    return normalize_space(next(hits, ""))
+
+
+def children(node: ET.Element, *names: str) -> List[ET.Element]:
+    targets = set(map(norm_name, names))  # direct children only, matched on normalized local name
+    return [kid for kid in node if node_name(kid) in targets]
+
+
+def descendants(node: ET.Element, *names: str) -> List[ET.Element]:
+    targets = set(map(norm_name, names))  # matched at any depth; `node` itself is excluded
+    return [found for found in node.iter() if found is not node and node_name(found) in targets]
+
+
+def first_descendant(node: ET.Element, *names: str) -> Optional[ET.Element]:
+    # First entry of descendants(node, *names), or None when nothing matches.
+    return next(iter(descendants(node, *names)), None)
+
+
+def child_text(node: ET.Element, *names: str) -> str:
+    """Return the text of the first matching direct child that has any text, else ""."""
+    for candidate in map(text_content, children(node, *names)):
+        if candidate:
+            return candidate
+    return ""
+
+
+def first_descendant_text(node: ET.Element, *names: str) -> str:
+    # Text of the first matching descendant only, even if it is empty; "" when none matches.
+    return text_content(first_descendant(node, *names))
+
+
+def has_ancestor(node: ET.Element, parent_map: Dict[ET.Element, ET.Element], *names: str) -> bool:
+    """Return True when some ancestor of `node`, found via `parent_map`, is named one of `names`."""
+    targets = {norm_name(name) for name in names}
+    ancestor = parent_map.get(node)
+    # Climb until a matching ancestor is found or the chain of parents runs out.
+    while ancestor is not None and node_name(ancestor) not in targets:
+        ancestor = parent_map.get(ancestor)
+    return ancestor is not None
+
+
+def parent_map(root: ET.Element) -> Dict[ET.Element, ET.Element]:
+    return {child: parent for parent in root.iter() for child in list(parent)}  # child -> direct parent; root gets no entry
+
+
+def xml_element_path(node: ET.Element, parents: Dict[ET.Element, ET.Element]) -> str:
+    # "/"-joined local names from the outermost ancestor recorded in `parents` down to `node`.
+    names, current = [local_name(node.tag)], node
+    while current in parents:
+        current = parents[current]
+        names.insert(0, local_name(current.tag))
+    return "/".join(names)
+
+
+def collect_xml_elements(root: ET.Element) -> List[Dict[str, Any]]:
+    """Summarize each distinct element path under `root` (root included), sorted by path."""
+    parents = parent_map(root)
+    by_path: Dict[str, Dict[str, Any]] = {}
+    for elem in root.iter():
+        path = xml_element_path(elem, parents)
+        # First sighting of this path: start its summary entry.
+        if path not in by_path:
+            by_path[path] = {
+                "path": path,
+                "name": local_name(elem.tag),
+                "occurrences": 0,
+                "has_text": False,
+                "has_attrs": False,
+            }
+        entry = by_path[path]
+        entry["occurrences"] += 1
+        # The flags are sticky: one occurrence with text/attributes marks the whole path.
+        entry["has_text"] = entry["has_text"] or bool(text_content(elem))
+        entry["has_attrs"] = entry["has_attrs"] or bool(elem.attrib)
+    return [by_path[path] for path in sorted(by_path)]
+
+
+def mapping_element_names(mapping: str) -> set:
+    """Extract the XML element names mentioned in a free-text mapping description.
+
+    ``@attribute`` references (optionally with ``='value'``) and a few marker
+    words are removed first. Of the remaining identifier-like tokens, only
+    those that start with an upper-case letter or belong to a small allow-list
+    are kept, after ``local_name`` has been applied.
+    """
+    if not mapping:
+        return set()
+    # Drop attribute references so attribute names are not mistaken for elements.
+    stripped = re.sub(r"[@][A-Za-z0-9_:-]+(?:='[^']*')?", "", mapping)
+    stripped = re.sub(r"\bdataFormat\b|\boriginal\b|\bstandard\b|\broot\b|根", " ", stripped, flags=re.I)
+    skipped = {
+        "business",
+        "base",
+        "xml",
+    }
+    lowercase_allowed = {"lang", "status", "country", "docNumber", "kind", "datePublication"}
+    found = set()
+    for token in re.findall(r"(?:[A-Za-z_][A-Za-z0-9_-]*:)?[A-Za-z_][A-Za-z0-9_-]*", stripped):
+        name = local_name(token)
+        if not name or name in skipped:
+            continue
+        # Lower-case tokens are usually prose, so only allow-listed ones survive.
+        if name[0].isupper() or name in lowercase_allowed:
+            found.add(name)
+    return found
+
+
+def mapped_xml_element_names(rules: Sequence[PatentRule]) -> set:
+    """Union the element names named by every rule's ``xml_mapping``.
+
+    Rules in ``LIBRARY_MODULE`` or whose field is in ``SKIP_FIELDS`` are ignored.
+    """
+    collected = set()
+    for rule in rules:
+        if rule.module != LIBRARY_MODULE and rule.field_name not in SKIP_FIELDS:
+            collected |= mapping_element_names(rule.xml_mapping)
+    return collected
+
+
+def non_library_rule_fields(rules: Sequence[PatentRule]) -> set:
+    """Return the field names of rules outside ``LIBRARY_MODULE`` and not in ``SKIP_FIELDS``."""
+    fields = set()
+    for rule in rules:
+        if rule.module == LIBRARY_MODULE:
+            continue
+        if rule.field_name in SKIP_FIELDS:
+            continue
+        fields.add(rule.field_name)
+    return fields
+
+
+def actual_parsed_fields(row: Dict[str, Any], rules: Sequence[PatentRule]) -> List[str]:
+    """List, sorted and de-duplicated, the rule fields that hold a non-empty value in ``row``.
+
+    Rules in ``LIBRARY_MODULE`` or listed in ``SKIP_FIELDS`` are not considered.
+    """
+    present = {
+        rule.field_name
+        for rule in rules
+        if not (rule.module == LIBRARY_MODULE or rule.field_name in SKIP_FIELDS)
+        if is_non_empty(actual_field_value(row, rule.field_name))
+    }
+    return sorted(present)
+
+
+def build_element_coverage(
+    row: Dict[str, Any],
+    root: ET.Element,
+    rules: Sequence[PatentRule],
+    *,
+    key: Any,
+    dt: Optional[str],
+) -> Dict[str, Any]:
+    """Build a per-record coverage report relating XML elements to parsed fields.
+
+    Args:
+        row: Parsed record whose field values are inspected.
+        root: Root of the source XML document.
+        rules: Field rules; library-module and skipped fields are ignored.
+        key: Record identifier, copied into the report.
+        dt: Partition/date label, copied into the report.
+
+    Returns:
+        A dict with element statistics, the (sampled) list of significant XML
+        elements that no rule mapping mentions, and the parsed fields whose
+        rule carries no XML mapping text.
+    """
+    xml_elements = collect_xml_elements(root)
+    mapped_names = mapped_xml_element_names(rules)
+    parsed_fields = actual_parsed_fields(row, rules)
+    # Previously this compared parsed fields with the very rule set they were
+    # derived from, so the result could never be non-empty. A field now counts
+    # as mapped only when at least one of its rules has non-blank mapping text.
+    fields_with_mapping = {
+        rule.field_name
+        for rule in rules
+        if rule.module != LIBRARY_MODULE
+        and rule.field_name not in SKIP_FIELDS
+        and str(rule.xml_mapping or "").strip()
+    }
+    # An element is "significant" when it carries text or attributes.
+    xml_significant = [
+        elem
+        for elem in xml_elements
+        if elem.get("has_text") or elem.get("has_attrs")
+    ]
+    unmapped = [
+        elem
+        for elem in xml_significant
+        if elem["name"] not in mapped_names
+    ]
+    parsed_without_mapping = [
+        field
+        for field in parsed_fields
+        if field not in fields_with_mapping
+    ]
+    return {
+        "key": key,
+        "dt": dt,
+        "parsed_field_count": len(parsed_fields),
+        "parsed_fields": parsed_fields,
+        "xml_element_count": len(xml_elements),
+        "xml_significant_element_count": len(xml_significant),
+        "xml_elements": xml_elements,
+        "mapped_xml_element_name_count": len(mapped_names),
+        "unmapped_xml_element_count": len(unmapped),
+        "unmapped_xml_elements": unmapped[:ELEMENT_COVERAGE_SAMPLE_LIMIT],
+        "unmapped_xml_elements_truncated": max(0, len(unmapped) - ELEMENT_COVERAGE_SAMPLE_LIMIT),
+        "parsed_fields_without_xml_mapping": parsed_without_mapping,
+    }
+
+
+def first_by_path(root: ET.Element, path_names: Sequence[str], attrs: Optional[Dict[str, str]] = None) -> Optional[ET.Element]:
+    """Resolve a loose descendant path and return one matching element.
+
+    Each entry of ``path_names`` is matched against *any* descendant (not only
+    direct children) of the elements found for the previous entry. Returns
+    ``None`` as soon as a step matches nothing. Among the final candidates the
+    first one whose attributes equal ``attrs`` wins; if none qualifies, the
+    first candidate is returned regardless.
+    """
+    frontier: List[ET.Element] = [root]
+    for raw_name in path_names:
+        wanted = norm_name(raw_name)
+        frontier = [
+            candidate
+            for anchor in frontier
+            for candidate in anchor.iter()
+            if candidate is not anchor and node_name(candidate) == wanted
+        ]
+        if not frontier:
+            return None
+    required = attrs or {}
+    for node in frontier:
+        if all(attr_value(node, key) == value for key, value in required.items()):
+            return node
+    # The frontier is never empty here, so fall back to its first element.
+    return frontier[0]
+
+
+def publication_document_ids(root: ET.Element) -> List[ET.Element]:
+    """Collect ``DocumentID`` elements under every ``PublicationReference``.
+
+    Both the references and the ids inside each reference are stably ordered
+    by ``data_format_rank`` of their ``dataFormat`` attribute.
+    """
+
+    def rank(elem: ET.Element) -> int:
+        return data_format_rank(attr_value(elem, "dataFormat"))
+
+    ordered: List[ET.Element] = []
+    for pub in sorted(descendants(root, "PublicationReference"), key=rank):
+        ordered.extend(sorted(descendants(pub, "DocumentID"), key=rank))
+    return ordered
+
+
+def publication_refs(root: ET.Element, data_format: Optional[str] = None) -> List[ET.Element]:
+    """Return ``PublicationReference`` elements ordered by ``data_format_rank``.
+
+    When ``data_format`` is given, only references whose ``dataFormat``
+    attribute equals it (case-insensitively) are kept.
+    """
+    candidates = descendants(root, "PublicationReference")
+    if data_format is not None:
+        wanted = data_format.lower()
+        candidates = [ref for ref in candidates if attr_value(ref, "dataFormat").lower() == wanted]
+    return sorted(candidates, key=lambda ref: data_format_rank(attr_value(ref, "dataFormat")))
+
+
+def document_ids_from_refs(refs: Sequence[ET.Element], data_format: Optional[str] = None) -> List[ET.Element]:
+    """Collect ``DocumentID`` descendants of each reference in ``refs``.
+
+    With ``data_format`` set, an id is kept when its ``dataFormat`` attribute
+    is either empty or equal to ``data_format`` (case-insensitive). Ids are
+    ordered by ``data_format_rank`` within each reference.
+    """
+    accepted = None if data_format is None else {"", data_format.lower()}
+    collected: List[ET.Element] = []
+    for ref in refs:
+        doc_ids = descendants(ref, "DocumentID")
+        if accepted is not None:
+            doc_ids = [doc_id for doc_id in doc_ids if attr_value(doc_id, "dataFormat").lower() in accepted]
+        collected.extend(sorted(doc_ids, key=lambda elem: data_format_rank(attr_value(elem, "dataFormat"))))
+    return collected
+
+
+def application_document_ids(root: ET.Element) -> List[ET.Element]:
+    """Collect ``DocumentID`` elements under every ``ApplicationReference``.
+
+    References, and the ids within each, are stably ordered by
+    ``data_format_rank`` of their ``dataFormat`` attribute.
+    """
+
+    def rank(elem: ET.Element) -> int:
+        return data_format_rank(attr_value(elem, "dataFormat"))
+
+    ordered: List[ET.Element] = []
+    for app in sorted(descendants(root, "ApplicationReference"), key=rank):
+        ordered.extend(sorted(descendants(app, "DocumentID"), key=rank))
+    return ordered
+
+
+def data_format_rank(value: str) -> int:
+    """Map a ``dataFormat`` value to a sort rank: original=0, standard=1, anything else=2.
+
+    The comparison is case-insensitive.
+    """
+    ranks = {"original": 0, "standard": 1}
+    return ranks.get(value.lower(), 2)
+
+
+def choose_doc_id(nodes: Sequence[ET.Element]) -> Optional[ET.Element]:
+    """Pick the preferred document id from ``nodes``.
+
+    Preference order: the first node that carries ``dataFormat="original"``
+    on itself or any descendant, then the first with ``"standard"``, then
+    simply the first node. Returns ``None`` for an empty sequence.
+    """
+    if not nodes:
+        return None
+    for wanted in ("original", "standard"):
+        for node in nodes:
+            # ``iter()`` yields the node itself followed by its descendants.
+            if any(attr_value(elem, "dataFormat").lower() == wanted for elem in node.iter()):
+                return node
+    return nodes[0]
+
+
+def document_number_from_doc_id(doc_id: Optional[ET.Element]) -> str:
+    """Concatenate office/country code, document number and kind from ``doc_id``.
+
+    Missing parts are skipped; ``None`` yields ``""``.
+    """
+    if doc_id is None:
+        return ""
+    office = child_text(doc_id, "WIPOST3Code", "CountryCode", "OfficeCode")
+    number = child_text(doc_id, "DocNumber", "DocumentNumber")
+    kind = child_text(doc_id, "Kind")
+    return "".join(filter(None, (office, number, kind)))
+
+
+def preferred_by_data_format(nodes: Sequence[ET.Element]) -> List[ET.Element]:
+    """Narrow ``nodes`` to the best available ``dataFormat`` group.
+
+    Returns all "original" nodes if any exist, otherwise all "standard" nodes,
+    otherwise a list copy of every node.
+    """
+    for wanted in ("original", "standard"):
+        group = [node for node in nodes if attr_value(node, "dataFormat").lower() == wanted]
+        if group:
+            return group
+    return list(nodes)
+
+
+def date_from_doc_id(doc_id: Optional[ET.Element]) -> str:
+    """Return the text of the ``Date`` child of ``doc_id``, or ``""`` when ``doc_id`` is ``None``."""
+    if doc_id is None:
+        return ""
+    return child_text(doc_id, "Date")
+
+
+def country_from_doc_id(doc_id: Optional[ET.Element]) -> str:
+    """Return the office/country code child text of ``doc_id`` (``""`` for ``None``)."""
+    return "" if doc_id is None else child_text(doc_id, "WIPOST3Code", "CountryCode", "OfficeCode")
+
+
+def kind_from_doc_id(doc_id: Optional[ET.Element]) -> str:
+    """Return the text of the ``Kind`` child of ``doc_id``, or ``""`` when ``doc_id`` is ``None``."""
+    if doc_id is None:
+        return ""
+    return child_text(doc_id, "Kind")
+
+
+def root_attr(root: ET.Element, name: str) -> str:
+    """Read attribute ``name`` from the root element via ``attr_value``."""
+    value = attr_value(root, name)
+    return value
+
+
+def unique_nonempty(values: Iterable[Any]) -> List[str]:
+    """Normalise each value with ``normalize_space`` and keep first occurrences only.
+
+    Values that normalise to an empty string are dropped; order is preserved.
+    """
+    cleaned = (normalize_space(value) for value in values)
+    # dict keys give ordered de-duplication.
+    return list(dict.fromkeys(text for text in cleaned if text))
+
+
+def branch_result(value: Any, branch: str) -> ExtractResult:
+    """Wrap ``value`` in an ``ExtractResult``, labelling it ``"empty"`` when it holds nothing."""
+    label = branch if is_non_empty(value) else "empty"
+    return ExtractResult(value, branch=label)
+
+
+def result_branch(extracted: ExtractResult) -> str:
+    """Return the label describing how ``extracted`` was produced.
+
+    A non-"ok" status wins; otherwise the recorded branch; otherwise
+    ``"xml_value"`` or ``"empty"`` depending on whether a value is present.
+    """
+    if extracted.status == "ok":
+        fallback = "xml_value" if is_non_empty(extracted.value) else "empty"
+        return extracted.branch if extracted.branch else fallback
+    return extracted.status
+
+
+def extract_document_number(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Build the document number, trying progressively weaker sources.
+
+    Order: ``DocumentID`` under ``dataFormat="original"`` publication
+    references, then any publication ``DocumentID``, then the root
+    ``country``/``docNumber``/``kind`` attributes joined. ``rule`` is unused.
+    """
+    original_ids = document_ids_from_refs(publication_refs(root, "original"), "original")
+    number = document_number_from_doc_id(choose_doc_id(original_ids))
+    if number:
+        return branch_result(number, "pub_original")
+
+    number = document_number_from_doc_id(choose_doc_id(publication_document_ids(root)))
+    if number:
+        return branch_result(number, "pub_fallback")
+
+    root_parts = (root_attr(root, attr) for attr in ("country", "docNumber", "kind"))
+    return branch_result("".join(filter(None, root_parts)), "root_attrs")
+
+
+def extract_document_kind_code(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Read the kind code from the original publication ``DocumentID``, else the root ``kind`` attribute."""
+    original_ids = document_ids_from_refs(publication_refs(root, "original"), "original")
+    kind = kind_from_doc_id(choose_doc_id(original_ids))
+    if not kind:
+        return branch_result(root_attr(root, "kind"), "root_kind")
+    return branch_result(kind, "pub_original")
+
+
+def extract_document_kind_text(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return the text of ``OriginalKindCode`` found below ``SpecificBibliographicData``."""
+    kind_node = first_by_path(root, ["SpecificBibliographicData", "OriginalKindCode"])
+    kind_text = text_content(kind_node)
+    return branch_result(kind_text, "specific_bibliographic_data")
+
+
+def extract_document_status_code(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return the first non-empty ``status`` attribute of an ``Abstract``, else the root ``status``."""
+    statuses = (attr_value(abstract, "status") for abstract in descendants(root, "Abstract"))
+    found = next((status for status in statuses if status), "")
+    if found:
+        return branch_result(found, "abstract_status")
+    return branch_result(root_attr(root, "status"), "root_status")
+
+
+def extract_document_wipo_country_code(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Read the country code from the original publication ``DocumentID``, else the root ``country`` attribute."""
+    original_ids = document_ids_from_refs(publication_refs(root, "original"), "original")
+    country = country_from_doc_id(choose_doc_id(original_ids))
+    if not country:
+        return branch_result(root_attr(root, "country"), "root_country")
+    return branch_result(country, "pub_original")
+
+
+def extract_publication_date(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Read the date of the preferred publication ``DocumentID``, else the root ``datePublication`` attribute."""
+    preferred_id = choose_doc_id(publication_document_ids(root))
+    published = date_from_doc_id(preferred_id)
+    if not published:
+        return branch_result(root_attr(root, "datePublication"), "root_date_publication")
+    return branch_result(published, "publication_document_id")
+
+
+def extract_publication_language(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return the root element's ``lang`` attribute."""
+    language = root_attr(root, "lang")
+    return branch_result(language, "root_lang")
+
+
+def extract_publication_office_code(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Determine the publication office code.
+
+    Sources in order: the first non-empty ``sourceDB`` attribute on a
+    ``PublicationReference``, the country of the preferred publication
+    ``DocumentID``, then the root ``country`` attribute.
+    """
+    source_dbs = (attr_value(pub, "sourceDB") for pub in descendants(root, "PublicationReference"))
+    source_db = next((value for value in source_dbs if value), "")
+    if source_db:
+        return branch_result(source_db, "publication_source_db")
+
+    country = country_from_doc_id(choose_doc_id(publication_document_ids(root)))
+    if country:
+        return branch_result(country, "publication_document_id")
+    return branch_result(root_attr(root, "country"), "root_country")
+
+
+def extract_invention_title(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return the first ``InventionTitle`` whose ``lang`` equals the root ``lang`` (case-insensitive).
+
+    Yields an empty result when the root has no ``lang`` or no title matches.
+    """
+    lang = root_attr(root, "lang")
+    if not lang:
+        return ExtractResult("", branch="empty")
+    wanted = lang.lower()
+    for title in descendants(root, "InventionTitle"):
+        if attr_value(title, "lang").lower() == wanted:
+            return branch_result(text_content(title), "lang_match")
+    return ExtractResult("", branch="empty")
+
+
+def extract_ipc(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Collect IPC symbols from every ``ClassificationIPC`` container.
+
+    Within a container, main classifications are listed before further ones
+    and then narrowed with ``preferred_by_data_format``. The branch label is
+    derived from the ``dataFormat`` of the first node that is used.
+    """
+    symbols: List[str] = []
+    branch = "empty"
+    for ipc_node in descendants(root, "ClassificationIPC"):
+        candidates = descendants(ipc_node, "MainClassification") + descendants(ipc_node, "FurtherClassification")
+        for node in preferred_by_data_format(candidates):
+            if branch == "empty":
+                fmt = attr_value(node, "dataFormat").lower()
+                branch = f"classification_ipc_{fmt}" if fmt else "classification_ipc"
+            symbols.append(text_content(node))
+    return branch_result(unique_nonempty(symbols), branch)
+
+
+def extract_ipc_text(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Collect the stripped lines of every ``Text`` node found inside ``ClassificationIPC`` containers."""
+    lines = [
+        part.strip()
+        for ipc_node in descendants(root, "ClassificationIPC")
+        for text_node in descendants(ipc_node, "Text")
+        for part in text_content(text_node).splitlines()
+    ]
+    return branch_result(unique_nonempty(lines), "classification_ipc_text")
+
+
+def extract_ipc_edition_statement(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return the first non-empty ``EditionStatement`` text found under a ``ClassificationIPC``."""
+    statements = (
+        first_descendant_text(ipc_node, "EditionStatement")
+        for ipc_node in descendants(root, "ClassificationIPC")
+    )
+    statement = next((text for text in statements if text), "")
+    if statement:
+        return branch_result(statement, "classification_ipc_edition_statement")
+    return ExtractResult("", branch="empty")
+
+
+def extract_classification_objects(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Collect classification texts for the classification family selected by ``rule.field_name``.
+
+    The field name picks the container element names; within each container
+    the texts of main/further classification and ``Text`` nodes are gathered.
+    The branch label is the local tag name of the first container found.
+    """
+    containers_by_field = {
+        "ipcr_classifications": ("ClassificationIPCR", "ClassificationIPCRDetails"),
+        "patent_national_classifications": ("ClassificationNational",),
+        "patent_domestic_classifications": ("ClassificationDomestic", "DomesticClassification", "DomesticPatentClassification"),
+        "patent_fi_classifications": ("ClassificationFI", "FIClassification", "ClassificationFIData"),
+        "patent_locarno_classes": ("ClassificationLocarno",),
+    }
+    leaf_names = {"mainclassification", "furtherclassification", "text"}
+    container_names = containers_by_field.get(rule.field_name, ())
+    collected: List[Any] = []
+    branch = "empty"
+    for container in descendants(root, *container_names):
+        if branch == "empty":
+            branch = local_name(container.tag)
+        leaf_texts = [
+            text_content(node)
+            for node in container.iter()
+            if node is not container and node_name(node) in leaf_names
+        ]
+        collected.extend(unique_nonempty(leaf_texts))
+    return branch_result(unique_nonempty(collected), branch)
+
+
+def extract_cpc(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Collect CPC symbols from ``PatentClassification`` elements.
+
+    An element is skipped only when it has a ``ClassificationScheme`` whose
+    ``scheme`` attribute is not "CPC"; elements without a scheme are kept.
+    The symbol falls back to the element's whole text content.
+    """
+    symbols: List[str] = []
+    for pat_cls in descendants(root, "PatentClassification"):
+        scheme = first_descendant(pat_cls, "ClassificationScheme")
+        is_cpc = scheme is None or attr_value(scheme, "scheme").upper() == "CPC"
+        if is_cpc:
+            symbols.append(first_descendant_text(pat_cls, "ClassificationSymbol") or text_content(pat_cls))
+    return branch_result(unique_nonempty(symbols), "patent_classification_cpc")
+
+
+def extract_abstract(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Join the distinct, non-empty texts of all ``Abstract`` elements with newlines."""
+    abstract_texts = unique_nonempty(text_content(node) for node in descendants(root, "Abstract"))
+    return branch_result("\n".join(abstract_texts), "abstract")
+
+
+def extract_description(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return ``{"seq", "text"}`` items for each ``Description`` element that has text.
+
+    ``seq`` is the 1-based position among all ``Description`` elements, so
+    gaps appear where an element had no text.
+    """
+    sections = []
+    for seq, node in enumerate(descendants(root, "Description"), start=1):
+        body = text_content(node)
+        if not body:
+            continue
+        sections.append({"seq": seq, "text": body})
+    return branch_result(sections, "description")
+
+
+def extract_claims(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return one record per ``Claim`` element that has text.
+
+    ``claim_id`` prefers the ``id`` attribute, then ``num``, then the 1-based
+    position; ``claim_num`` prefers ``num``, then the position.
+    """
+    records = []
+    for position, claim in enumerate(descendants(root, "Claim"), start=1):
+        body = text_content(claim)
+        if body:
+            fallback = str(position)
+            records.append(
+                {
+                    "claim_id": attr_value(claim, "id") or attr_value(claim, "num") or fallback,
+                    "claim_num": attr_value(claim, "num") or fallback,
+                    "claim_text": body,
+                }
+            )
+    return branch_result(records, "claims")
+
+
+def extract_drawings(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return one record per ``Figure`` element that contains an ``Image``.
+
+    ``figure_id`` is the ``id`` attribute or the 1-based figure position;
+    ``image_file`` is the first non-empty of ``file``/``filename``/``href``.
+    """
+    records = []
+    for position, figure in enumerate(descendants(root, "Figure"), start=1):
+        image = first_descendant(figure, "Image")
+        if image is not None:
+            records.append(
+                {
+                    "figure_id": attr_value(figure, "id") or str(position),
+                    "image_file": attr_value(image, "file") or attr_value(image, "filename") or attr_value(image, "href"),
+                }
+            )
+    return branch_result(records, "drawings")
+
+
+def extract_parties(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Extract party records (applicants, inventors, agents, ...) for ``rule.field_name``.
+
+    The field name selects which element names count as a party. For each
+    party the ``AddressBook`` descendant (or the party element itself when
+    there is none) supplies the name and, when present, the country. The
+    branch label is the local tag name of the first party element found.
+
+    Returns:
+        ``ExtractResult`` holding the de-duplicated list of
+        ``{"name": ..., "country": ...}`` dicts.
+    """
+    field_to_names = {
+        "applicants": ("Applicant",),
+        "assignees": ("Assignee",),
+        "inventors": ("Inventor",),
+        "designers": ("Designer",),
+        "patent_agents": ("Agent", "Agency"),
+        "patent_agency": ("PatentAgency",),
+    }
+    names = field_to_names.get(rule.field_name, ())
+    people = []
+    branch = "empty"
+    for node in descendants(root, *names):
+        if branch == "empty":
+            branch = local_name(node.tag)
+        # Compare against None explicitly: an Element without children is
+        # falsy, so ``found or node`` would discard a childless AddressBook
+        # (and Element truth-testing is deprecated).
+        address_book = first_descendant(node, "AddressBook")
+        if address_book is None:
+            address_book = node
+        name = first_descendant_text(address_book, "Name") or first_descendant_text(address_book, "LastName")
+        country = first_descendant_text(address_book, "CountryCode") or first_descendant_text(address_book, "WIPOST3Code")
+        text = text_content(address_book)
+        if name or text:
+            # Fall back to the whole text when no dedicated name node exists.
+            item = {"name": name or text}
+            if country:
+                item["country"] = country
+            people.append(item)
+    return branch_result(dedup_dicts(people), branch)
+
+
+def extract_priority_numbers(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Collect the document numbers of every ``DocumentID`` under ``PriorityClaim`` elements."""
+    numbers = [
+        child_text(doc_id, "DocNumber", "DocumentNumber")
+        for claim in descendants(root, "PriorityClaim")
+        for doc_id in descendants(claim, "DocumentID")
+    ]
+    return branch_result(unique_nonempty(numbers), "priority_claim_document_id")
+
+
+def extract_priority_filing_dates(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Collect the ``Date`` of every ``DocumentID`` under ``PriorityClaim`` elements."""
+    dates = [
+        child_text(doc_id, "Date")
+        for claim in descendants(root, "PriorityClaim")
+        for doc_id in descendants(claim, "DocumentID")
+    ]
+    return branch_result(unique_nonempty(dates), "priority_claim_document_id")
+
+
+def extract_priority_office_codes(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Collect office/country codes from every ``PriorityClaim``.
+
+    Per claim the candidates are, in order: ``OfficeCode``,
+    ``GeneratingOffice``, then the country of each ``DocumentID``. The branch
+    label records which kind of source produced the first non-empty value.
+    """
+    codes = []
+    branch = "empty"
+    for claim in descendants(root, "PriorityClaim"):
+        labelled = [
+            (first_descendant_text(claim, "OfficeCode"), "priority_office_code"),
+            (first_descendant_text(claim, "GeneratingOffice"), "priority_generating_office"),
+        ]
+        labelled.extend(
+            (country_from_doc_id(doc_id), "priority_document_id_country")
+            for doc_id in descendants(claim, "DocumentID")
+        )
+        for code, label in labelled:
+            if code and branch == "empty":
+                branch = label
+            codes.append(code)
+    return branch_result(unique_nonempty(codes), branch)
+
+
+def extract_public_availability_date(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Find a public-availability date for the field named by ``rule.field_name``.
+
+    Each supported field maps to lower-case tokens that must all occur in the
+    ``node_name`` of an element inside a ``PublicAvailabilityDate`` container.
+    The first such element with a non-empty ``Date`` descendant wins, and its
+    local tag name becomes the branch label.
+
+    Returns:
+        The matching date, or an empty result when the field is unknown or
+        nothing matches.
+    """
+    token_map = {
+        "public_availability_unexamined_view_date": ("unexamined", "view"),
+        "public_availability_examined_view_date": ("examined", "view"),
+        "public_availability_unexamined_print_date": ("unexamined", "print"),
+        "public_availability_examined_print_date": ("examined", "print"),
+        "claims_only_public_date": ("claimsonly",),
+        "granted_view_date": ("granted", "view"),
+        "corrected_document_issue_date": ("corrected",),
+    }
+    tokens = token_map.get(rule.field_name, ())
+    if not tokens:
+        return ExtractResult("", branch="empty")
+    # "examined" is a substring of "unexamined": without this exclusion the
+    # examined-* fields would also match unexamined-* elements and could
+    # report the wrong date.
+    excluded = ("unexamined",) if "examined" in tokens else ()
+    for container in descendants(root, "PublicAvailabilityDate"):
+        for node in container.iter():
+            name = node_name(node)
+            if not all(token in name for token in tokens):
+                continue
+            if any(token in name for token in excluded):
+                continue
+            date_text = first_descendant_text(node, "Date")
+            if date_text:
+                return branch_result(date_text, local_name(node.tag))
+    return ExtractResult("", branch="empty")
+
+
+def extract_grant_publication_date(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return the first ``Date`` found under a ``PublicAvailabilityDate`` element whose name contains "grant"."""
+    for container in descendants(root, "PublicAvailabilityDate"):
+        grant_nodes = (node for node in container.iter() if "grant" in node_name(node))
+        for node in grant_nodes:
+            granted_on = first_descendant_text(node, "Date")
+            if not granted_on:
+                continue
+            return branch_result(granted_on, local_name(node.tag))
+    return ExtractResult("", branch="empty")
+
+
+def extract_application_numbers(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Collect application numbers from ``ApplicationReference`` elements with ``dataFormat="original"``.
+
+    For each such reference the preferred ``DocumentID`` supplies the full
+    document number, falling back to its bare ``DocNumber`` child.
+    """
+    numbers: List[str] = []
+    for ref in descendants(root, "ApplicationReference"):
+        if attr_value(ref, "dataFormat").lower() != "original":
+            continue
+        doc_id = choose_doc_id(descendants(ref, "DocumentID"))
+        if doc_id is None:
+            numbers.append("")
+        else:
+            numbers.append(document_number_from_doc_id(doc_id) or child_text(doc_id, "DocNumber"))
+    return branch_result(unique_nonempty(numbers), "application_original")
+
+
+def extract_filing_dates(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Collect the distinct dates of all application ``DocumentID`` elements."""
+    filing_dates = unique_nonempty(map(date_from_doc_id, application_document_ids(root)))
+    return branch_result(filing_dates, "application_document_id")
+
+
+def extract_original_filing_language(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return the first non-empty ``lang`` attribute of an ``ApplicationReference``, else the root ``lang``."""
+    langs = (attr_value(app, "lang") for app in descendants(root, "ApplicationReference"))
+    filing_lang = next((lang for lang in langs if lang), "")
+    if filing_lang:
+        return branch_result(filing_lang, "application_lang")
+    return branch_result(root_attr(root, "lang"), "root_lang")
+
+
+def extract_effective_rights_date(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return the first non-empty date among the application ``DocumentID`` elements."""
+    candidates = unique_nonempty(map(date_from_doc_id, application_document_ids(root)))
+    earliest_listed = candidates[0] if candidates else ""
+    return branch_result(earliest_listed, "application_document_id")
+
+
+def extract_designated_states(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Collect designated-state texts from the PCT or regional filing container.
+
+    ``pct_designated_states`` reads ``PctOrRegionalFilingData``; any other
+    field reads ``RegionalFilingData``. The branch label is the lower-cased
+    container name.
+    """
+    if rule.field_name == "pct_designated_states":
+        container_names = ("PctOrRegionalFilingData",)
+    else:
+        container_names = ("RegionalFilingData",)
+    states = [
+        text_content(node)
+        for container in descendants(root, *container_names)
+        for node in descendants(container, "DesignatedState", "WIPOST3Code", "CountryCode")
+    ]
+    return branch_result(unique_nonempty(states), "_".join(container_names).lower())
+
+
+def extract_date_by_container(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Return the first date found inside the container elements mapped to ``rule.field_name``.
+
+    Within a container a ``Date`` descendant is preferred over ``DepositDate``.
+    The branch label is the local tag name of the container that supplied it.
+    """
+    containers_by_field = {
+        "pct_national_phase_date": ("PctNationalPhaseEntry", "NationalPhaseEntry"),
+        "pct_effect_ceased_date": ("PctRefiledRevised", "RefiledRevisedApplication"),
+        "search_report_deferred_publication_date": ("SearchReportDifferentPublication",),
+        "spc_application_date": ("SPC",),
+        "microorganism_deposit_date": ("BiologicalDeposit", "MicroorganismDeposit", "MicroorganismDepositDetails", "DepositInstitution"),
+    }
+    container_names = containers_by_field.get(rule.field_name, ())
+    for container in descendants(root, *container_names):
+        found = first_descendant_text(container, "Date") or first_descendant_text(container, "DepositDate")
+        if not found:
+            continue
+        return branch_result(found, local_name(container.tag))
+    return ExtractResult("", branch="empty")
+
+
+def extract_generic_object_by_tokens(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Heuristically gather ``{"node", "text"}`` objects for a field without a dedicated extractor.
+
+    The field name is split on underscores/whitespace (dropping "patent",
+    "data", "info"); every element whose ``node_name`` contains any token and
+    that has text contributes an object. List-typed rules get the
+    de-duplicated list, others the first object (or ``{}``).
+    """
+    noise = {"patent", "data", "info"}
+    tokens = [
+        token
+        for token in re.split(r"[_\s]+", rule.field_name.lower())
+        if token and token not in noise
+    ]
+    matches = []
+    if tokens:
+        for node in root.iter():
+            name = node_name(node)
+            if not any(token in name for token in tokens):
+                continue
+            body = text_content(node)
+            if body:
+                matches.append({"node": local_name(node.tag), "text": body})
+    if rule.data_type.lower().startswith("list"):
+        return branch_result(dedup_dicts(matches), "token_match")
+    first_match = matches[0] if matches else {}
+    return branch_result(first_match, "token_match")
+
+
+def extract_generic_text_by_tokens(root: ET.Element, rule: PatentRule) -> ExtractResult:
+    """Heuristically return the text of the first element whose name contains a token of the field name.
+
+    Tokens come from splitting ``rule.field_name`` on underscores/whitespace;
+    elements without text are skipped.
+    """
+    tokens = [token for token in re.split(r"[_\s]+", rule.field_name.lower()) if token]
+    if not tokens:
+        return ExtractResult("", branch="empty")
+    for node in root.iter():
+        name = node_name(node)
+        if not any(token in name for token in tokens):
+            continue
+        body = text_content(node)
+        if body:
+            return branch_result(body, local_name(node.tag))
+    return ExtractResult("", branch="empty")
+
+
+def dedup_dicts(items: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Drop empty values from each dict, then de-duplicate by canonical JSON.
+
+    Dicts left with no entries are discarded. The result is ordered by the
+    canonical JSON string of each dict; for duplicates the last one seen is kept.
+    """
+    by_signature: Dict[str, Dict[str, Any]] = {}
+    for item in items:
+        compact = {name: value for name, value in item.items() if is_non_empty(value)}
+        if compact:
+            by_signature[canonical_json(compact)] = compact
+    ordered = sorted(by_signature.items(), key=lambda pair: pair[0])
+    return [compact for _, compact in ordered]
+
+
+# Dispatch table from rule field name to the extractor that derives the
+# expected value from the XML tree. Several fields share one extractor, which
+# then branches on ``rule.field_name``. Looked up by ``get_extractor``.
+FIELD_EXTRACTORS: Dict[str, Extractor] = {
+    # Document identity and publication data.
+    "document_number": extract_document_number,
+    "document_kind_text": extract_document_kind_text,
+    "document_kind_code": extract_document_kind_code,
+    "document_status_code": extract_document_status_code,
+    "document_wipo_country_code": extract_document_wipo_country_code,
+    "publication_date": extract_publication_date,
+    "publication_language": extract_publication_language,
+    "publication_office_code": extract_publication_office_code,
+    "invention_title": extract_invention_title,
+    # Classifications.
+    "ipc": extract_ipc,
+    "ipc_text": extract_ipc_text,
+    "ipc_edition_statement": extract_ipc_edition_statement,
+    "ipcr_classifications": extract_classification_objects,
+    "patent_national_classifications": extract_classification_objects,
+    "patent_domestic_classifications": extract_classification_objects,
+    "patent_fi_classifications": extract_classification_objects,
+    "patent_cpc_classifications": extract_cpc,
+    "patent_locarno_classes": extract_classification_objects,
+    # Text sections and drawings.
+    "abstract": extract_abstract,
+    "description": extract_description,
+    "claims": extract_claims,
+    "drawings": extract_drawings,
+    # Parties.
+    "applicants": extract_parties,
+    "assignees": extract_parties,
+    "inventors": extract_parties,
+    "designers": extract_parties,
+    "patent_agents": extract_parties,
+    "patent_agency": extract_parties,
+    # Priority claims (office and country codes share one extractor).
+    "priority_numbers": extract_priority_numbers,
+    "priority_filing_dates": extract_priority_filing_dates,
+    "priority_office_codes": extract_priority_office_codes,
+    "priority_country_codes": extract_priority_office_codes,
+    # Public availability dates.
+    "public_availability_unexamined_view_date": extract_public_availability_date,
+    "public_availability_examined_view_date": extract_public_availability_date,
+    "public_availability_unexamined_print_date": extract_public_availability_date,
+    "public_availability_examined_print_date": extract_public_availability_date,
+    "grant_publication_date": extract_grant_publication_date,
+    "claims_only_public_date": extract_public_availability_date,
+    "granted_view_date": extract_public_availability_date,
+    "corrected_document_issue_date": extract_public_availability_date,
+    # Application data.
+    "application_numbers": extract_application_numbers,
+    "filing_dates": extract_filing_dates,
+    "original_filing_language": extract_original_filing_language,
+    "effective_rights_date": extract_effective_rights_date,
+    # Designated states and container-scoped dates.
+    "pct_designated_states": extract_designated_states,
+    "regional_designated_states": extract_designated_states,
+    "pct_national_phase_date": extract_date_by_container,
+    "pct_effect_ceased_date": extract_date_by_container,
+    "search_report_deferred_publication_date": extract_date_by_container,
+    "spc_application_date": extract_date_by_container,
+    "microorganism_deposit_date": extract_date_by_container,
+}
+
+
+def get_extractor(rule: PatentRule) -> Optional[Extractor]:
+    """Return the extractor registered for ``rule.field_name`` in ``FIELD_EXTRACTORS``, or ``None``."""
+    return FIELD_EXTRACTORS.get(rule.field_name)
+
+
+def parse_xml(raw: Any) -> ET.Element:
+    """Parse raw XML content into an element tree root.
+
+    Bytes are decoded as UTF-8 (undecodable bytes replaced); ``None`` and
+    other falsy inputs count as empty. A leading XML declaration is removed
+    before parsing.
+
+    Raises:
+        ValueError: If the content is empty after stripping whitespace.
+        xml.etree.ElementTree.ParseError: If the content is not well-formed.
+    """
+    if isinstance(raw, (bytes, bytearray)):
+        raw = raw.decode("utf-8", errors="replace")
+    document = str(raw or "").strip()
+    if document == "":
+        raise ValueError("empty XML content")
+    without_declaration = re.sub(r"^\s*<\?xml[^>]*\?>", "", document, count=1)
+    return ET.fromstring(without_declaration.lstrip())
+
+
+def json_loads_maybe(value: Any) -> Any:
+    """Decode ``value`` as JSON when it looks like a JSON object or array.
+
+    Bytes are first decoded as UTF-8. Only strings whose first non-blank
+    character is ``[`` or ``{`` are parsed; on a decode error, or for any
+    other input, the (possibly byte-decoded) value is returned unchanged.
+    """
+    if isinstance(value, (bytes, bytearray)):
+        value = value.decode("utf-8", errors="replace")
+    if not isinstance(value, str):
+        return value
+    candidate = value.strip()
+    if not candidate.startswith(("[", "{")):
+        return value
+    try:
+        return json.loads(candidate)
+    except json.JSONDecodeError:
+        return value
+
+
+def canonicalize(value: Any) -> Any:
+    """Convert ``value`` into a normalised, JSON-friendly structure for comparison.
+
+    JSON-looking strings are decoded first. Decimals become ``int`` when
+    integral and ``float`` otherwise, dates become ISO strings, dict keys are
+    stringified and sorted, lists are processed element-wise, and strings go
+    through ``normalize_space``. Anything else is returned unchanged.
+    """
+    value = json_loads_maybe(value)
+    if isinstance(value, Decimal):
+        # Infinity cannot be converted to int and signaling NaN raises in
+        # to_integral_value()/float(); map non-finite values to floats instead.
+        if not value.is_finite():
+            return float("nan") if value.is_nan() else float(value)
+        return int(value) if value == value.to_integral_value() else float(value)
+    if isinstance(value, (date, datetime)):
+        return value.isoformat()
+    if isinstance(value, dict):
+        return {str(k): canonicalize(v) for k, v in sorted(value.items(), key=lambda item: str(item[0]))}
+    if isinstance(value, list):
+        return [canonicalize(v) for v in value]
+    if isinstance(value, str):
+        return normalize_space(value)
+    return value
+
+
+def canonical_json(value: Any) -> str:
+    """Serialise the canonicalised ``value`` as compact JSON with sorted keys and raw non-ASCII text."""
+    normalized = canonicalize(value)
+    return json.dumps(
+        normalized,
+        ensure_ascii=False,
+        sort_keys=True,
+        separators=(",", ":"),
+        cls=JsonEncoder,
+    )
+
+
+def is_non_empty(value: Any) -> bool:
+    """Tell whether ``value`` carries any content.
+
+    JSON-looking strings are decoded first. ``None``, blank strings, the
+    literal strings ``{}``/``[]`` and empty lists/dicts are empty; every
+    other value (including ``0`` and ``False``) counts as non-empty.
+    """
+    decoded = json_loads_maybe(value)
+    if decoded is None:
+        return False
+    if isinstance(decoded, str):
+        return decoded.strip() not in {"", "{}", "[]"}
+    if isinstance(decoded, (list, dict)):
+        return bool(decoded)
+    return True
+
+
+def normalize_dateish(value: Any) -> Any:
+    """Reduce a date-like value to its 8 digits when it contains exactly 8 ASCII digits.
+
+    Otherwise the ``normalize_space`` form of the value is returned as is.
+    """
+    text = normalize_space(value)
+    only_digits = re.sub(r"[^0-9]", "", text)
+    return only_digits if len(only_digits) == 8 else text
+
+
+def flatten_strings(value: Any) -> List[str]:
+    """Flatten nested dicts/lists into a list of normalised, non-empty leaf strings.
+
+    JSON-looking strings are decoded before descending. Dict keys are
+    ignored; only values contribute. ``None`` yields an empty list.
+    """
+    value = json_loads_maybe(value)
+    if value is None:
+        return []
+    if isinstance(value, dict):
+        members = value.values()
+    elif isinstance(value, list):
+        members = value
+    else:
+        leaf = normalize_space(value)
+        return [leaf] if leaf else []
+    flattened: List[str] = []
+    for member in members:
+        flattened.extend(flatten_strings(member))
+    return flattened
+
+
+def compact_text_for_compare(value: Any) -> str:
+    """Collapse ``value`` into a lower-cased string for loose text comparison.
+
+    Only ASCII digits, ``a``-``z`` and characters in the ranges U+3040-U+30FF
+    and U+3400-U+9FFF survive; every other character is removed.
+    """
+    joined = " ".join(flatten_strings(value))
+    lowered = normalize_space(joined).lower()
+    return re.sub(r"[^0-9a-z\u3040-\u30ff\u3400-\u9fff]+", "", lowered)
+
+
+def text_equivalent(expected: Any, actual: Any, field_name: str) -> bool:
+    """Compare two values loosely via ``compact_text_for_compare``.
+
+    Two empty texts are equivalent; one empty and one non-empty are not. For
+    the long-text fields ``abstract``, ``description`` and ``claims`` it is
+    enough that the shorter compact text is contained in the longer one.
+    """
+    left = compact_text_for_compare(expected)
+    right = compact_text_for_compare(actual)
+    if not left or not right:
+        # Equivalent only when both sides are empty.
+        return not left and not right
+    if left == right:
+        return True
+    if field_name not in {"abstract", "description", "claims"}:
+        return False
+    shorter, longer = sorted((left, right), key=len)
+    return shorter in longer
+
+
+def compare_values(expected: Any, actual: Any, data_type: str, field_name: str = "") -> Optional[Dict[str, Any]]:  # Return None when the values agree, else a dict describing the mismatch.
+    expected = canonicalize(expected)
+    actual = canonicalize(actual)
+    type_text = data_type.lower()
+    if not is_non_empty(expected) and not is_non_empty(actual):  # both sides empty: nothing to compare
+        return None
+    if not is_non_empty(expected) and is_non_empty(actual):
+        return {"expected": expected, "actual": actual, "reason": "xml_empty_but_field_nonempty"}
+    if is_non_empty(expected) and not is_non_empty(actual):
+        return {"expected": expected, "actual": actual, "reason": "xml_nonempty_but_field_empty"}
+    if "date" in type_text:  # date-like types are compared after normalize_dateish
+        if normalize_dateish(expected) != normalize_dateish(actual):
+            return {"expected": expected, "actual": actual}
+        return None
+    if field_name in {"abstract", "description", "claims"} and text_equivalent(expected, actual, field_name):  # long-text fields pass on equal or contained compacted text
+        return None
+    if type_text.startswith(ORDER_INSENSITIVE_TYPES):  # every expected leaf string must appear in actual; extra actual items are accepted
+        expected_set = set(flatten_strings(expected))
+        actual_set = set(flatten_strings(actual))
+        if expected_set and not expected_set.issubset(actual_set):
+            return {"expected": sorted(expected_set), "actual": sorted(actual_set)}
+        return None
+    if type_text == "object":  # objects pass as soon as one leaf string is shared by both sides
+        expected_tokens = set(flatten_strings(expected))
+        actual_tokens = set(flatten_strings(actual))
+        if expected_tokens and not expected_tokens.intersection(actual_tokens):
+            return {"expected": expected, "actual": actual}
+        return None
+    expected_text = normalize_space(expected)  # fallback for all remaining types: normalized text equality, then text_equivalent
+    actual_text = normalize_space(actual)
+    if expected_text != actual_text and not text_equivalent(expected, actual, field_name):
+        return {"expected": expected, "actual": actual}
+    return None
+
+
+def compact_record_for_report(record: Dict[str, Any], xml_field: str) -> Dict[str, Any]:
+    """Pick the non-empty identifying columns of a row (canonicalized) for the report, never the XML column itself."""
+    report_keys = (
+        "document_number", "document_kind_code", "publication_date",
+        "invention_title", "sha256", "origin_url",
+        "origin_path", "dt", "patent_source",
+    )
+    compacted: Dict[str, Any] = {}
+    for name in report_keys:
+        cell = record.get(name)
+        if is_non_empty(cell) and name != xml_field:
+            compacted[name] = canonicalize(cell)
+    return compacted
+
+
+def actual_field_value(row: Dict[str, Any], field_name: str) -> Any:
+    """Return the row's value for field_name, falling back to its FIELD_ALIASES column (None if neither exists)."""
+    if field_name in row:
+        return row[field_name]
+    # The column is absent under its own name; try the alias configured for it, if any.
+    alias = FIELD_ALIASES.get(field_name)
+    return row.get(alias) if alias else None
+
+
+def build_sample_query(
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    *,
+    key_field: str,
+    xml_field: str,
+    sample_mode: str,
+) -> Tuple[str, List[Any]]:
+    """Build the sampling SELECT and its bind parameters for rows whose XML column is non-empty.
+
+    Column names cannot be bound as parameters, so embedded backticks are doubled before quoting.
+    """
+    sample_mode = normalize_sample_mode(sample_mode)
+    xml_col, key_col = (f"`{str(name).replace('`', '``')}`" for name in (xml_field, key_field))
+    params: List[Any] = []
+    where = [f"{xml_col} IS NOT NULL", f"{xml_col} != ''"]
+    if dt is not None:
+        where.append("`dt` = %s")
+        params.append(dt)
+    order = f"CRC32(COALESCE(CAST({key_col} AS STRING), CAST({xml_col} AS STRING)))"
+    if sample_mode == SAMPLE_MODE_RANDOM:
+        order = "RAND()"
+    limit_sql = "" if limit is None else f" LIMIT {int(limit)}"
+    return f"SELECT * FROM {quote_identifier(table)} WHERE {' AND '.join(where)} ORDER BY {order}{limit_sql}", params
+
+
+def discover_dt_values(conn: Any, table: str) -> List[str]:
+    """List the distinct non-NULL, non-empty dt values of the table as strings, ordered by dt."""
+    target = quote_identifier(table)
+    sql = f"SELECT DISTINCT `dt` FROM {target} WHERE `dt` IS NOT NULL AND `dt` != '' ORDER BY `dt`"
+    partitions = fetch_records(conn, sql)
+    return [str(record["dt"]) for record in partitions]
+
+
+def validate_row(  # Re-extract each mapped field from the row's XML and compare it with the stored column value.
+    row: Dict[str, Any],
+    rules: Sequence[PatentRule],
+    *,
+    xml_field: str,
+    include_xml_field: bool,
+    selected_fields: Optional[set],
+) -> Tuple[Dict[str, Dict[str, Any]], List[Dict[str, Any]], Dict[str, str]]:  # returns (mismatches by field, warnings, branch by field)
+    mismatches: Dict[str, Dict[str, Any]] = {}
+    warnings: List[Dict[str, Any]] = []
+    branches: Dict[str, str] = {}
+    try:
+        root = parse_xml(row.get(xml_field))
+    except Exception as exc:  # any parse error is reported as a single mismatch keyed by the XML column itself
+        branches[xml_field] = "xml_parse_failed"
+        return {xml_field: {"expected": "valid XML", "actual": type(exc).__name__, "reason": str(exc)}}, warnings, branches
+
+    for rule in rules:
+        if selected_fields is not None and rule.field_name not in selected_fields:  # allowlist active and this field is not on it: skip silently
+            continue
+        if rule.field_name in SKIP_FIELDS:
+            warnings.append({"field": rule.field_name, "status": "skipped", "reason": "processed_fulltext_field"})
+            continue
+        if rule.field_name == xml_field and not include_xml_field:
+            warnings.append({"field": rule.field_name, "status": "skipped", "reason": "raw_xml_field"})
+            continue
+        if rule.module == LIBRARY_MODULE:  # rules of LIBRARY_MODULE are skipped without a warning
+            continue
+        extractor = get_extractor(rule)
+        if extractor is None:
+            warnings.append({"field": rule.field_name, "status": "skipped", "reason": "unsupported_mapping"})
+            continue
+        try:
+            extracted = extractor(root, rule)
+        except Exception as exc:  # an extractor failure is downgraded to a warning so the other fields are still checked
+            warnings.append({"field": rule.field_name, "status": "extract_error", "reason": str(exc)})
+            continue
+        branches[rule.field_name] = result_branch(extracted)  # recorded even when the extraction status is not "ok"
+        if extracted.status != "ok":
+            warnings.append({"field": rule.field_name, "status": extracted.status, "reason": extracted.reason})
+            continue
+        diff = compare_values(
+            extracted.value,
+            actual_field_value(row, rule.field_name),
+            rule.data_type,
+            rule.field_name,
+        )
+        if diff is not None:
+            mismatches[rule.field_name] = diff
+    return mismatches, warnings, branches
+
+
+def extract_row_branches(
+    row: Dict[str, Any],
+    rules: Sequence[PatentRule],
+    *,
+    xml_field: str,
+    include_xml_field: bool,
+    selected_fields: Optional[set],
+) -> Dict[str, str]:
+    """Return only the field -> branch map that validate_row computes for this row.
+
+    The mismatches and warnings produced by the same validate_row call are discarded.
+    """
+    options = {"xml_field": xml_field, "include_xml_field": include_xml_field, "selected_fields": selected_fields}
+    outcome = validate_row(row, rules, **options)
+    # validate_row returns (mismatches, warnings, branches); the branch map is the last element.
+    return outcome[2]
+
+
+def select_branch_coverage_rows(
+    rows: Sequence[Dict[str, Any]],
+    rules: Sequence[PatentRule],
+    *,
+    limit: Optional[int],
+    xml_field: str,
+    include_xml_field: bool,
+    selected_fields: Optional[set],
+) -> List[Dict[str, Any]]:
+    """Greedily keep up to `limit` rows, preferring rows that exercise a new (field, branch) pair.
+
+    Rows are scanned in order; a row adding no unseen non-"empty" branch is deferred and only
+    used, in scan order, to fill the quota afterwards. The scan stops once the quota is reached.
+    When limit is None or not exceeded, all rows are returned as a new list.
+    """
+    if limit is None or len(rows) <= limit:
+        return list(rows)
+    kept: List[Dict[str, Any]] = []
+    fillers: List[Dict[str, Any]] = []
+    seen: set = set()
+    for candidate in rows:
+        branch_map = extract_row_branches(
+            candidate, rules, xml_field=xml_field, include_xml_field=include_xml_field, selected_fields=selected_fields
+        )
+        # Pairs with a falsy or "empty" branch never count as coverage.
+        fresh = {pair for pair in branch_map.items() if pair[1] and pair[1] != "empty"} - seen
+        if not fresh:
+            fillers.append(candidate)
+            continue
+        kept.append(candidate)
+        seen |= fresh
+        if len(kept) >= limit:
+            break
+    # Quota not reached by coverage alone: top up with deferred rows in their original order.
+    if len(kept) < limit:
+        kept.extend(fillers[: limit - len(kept)])
+    return kept
+
+
+def summarize_branch_coverage(field_branch_counts: Dict[str, Counter]) -> Dict[str, Any]:
+    """Count the distinct branches seen per field (fields sorted by name, empty counters dropped) plus totals."""
+    by_field: Dict[str, int] = {}
+    total = 0
+    for field in sorted(field_branch_counts):
+        distinct = len(field_branch_counts[field])
+        if distinct:
+            by_field[field] = distinct
+            total += distinct
+    # field_count is the number of fields with at least one branch; total_branch_count sums their distinct branches.
+    return {"field_count": len(by_field), "total_branch_count": total, "by_field": by_field}
+
+
+def build_report_summary(
+    report_path: Path,
+    result: Dict[str, Any],
+    mismatch_rows: Sequence[Dict[str, Any]],
+    warning_rows: Sequence[Dict[str, Any]],
+) -> Dict[str, Any]:
+    """Aggregate mismatch and warning rows into the summary dict for a report.
+
+    Mismatches are counted per field, keeping up to 3 truncated samples per field; samples are
+    emitted only for the 8 most frequent fields. Warnings are counted per field as well.
+    """
+    field_counts: Counter = Counter()
+    field_samples: Dict[str, List[Dict[str, Any]]] = {}
+    for row in mismatch_rows:
+        for field, diff in (row.get("mismatches") or {}).items():
+            field_counts[field] += 1
+            bucket = field_samples.setdefault(field, [])
+            if len(bucket) >= 3:
+                continue
+            detail = diff if isinstance(diff, dict) else None  # a non-dict diff yields a sample with None values
+            expected = None if detail is None else truncate_value(detail.get("expected"), max_chars=600)
+            actual = None if detail is None else truncate_value(detail.get("actual"), max_chars=600)
+            reason = None if detail is None else detail.get("reason")
+            bucket.append({"key": row.get("key"), "dt": row.get("dt"), "expected": expected, "actual": actual, "reason": reason})
+    warning_counts = Counter(item.get("field") for row in warning_rows for item in row.get("warnings", []))
+    summary: Dict[str, Any] = {"report": str(report_path), "total_problem_rows": len(mismatch_rows)}
+    summary["result"] = {k: v for k, v in result.items() if k != "sample_mismatches"}
+    summary["field_counts"] = dict(field_counts.most_common())
+    summary["field_samples"] = {field: field_samples[field] for field, _ in field_counts.most_common(8)}
+    summary["warning_field_counts"] = dict(warning_counts.most_common())
+    summary["warning_rows"] = len(warning_rows)
+    return summary
+
+
+def truncate_value(value: Any, max_chars: int = 600) -> Any:
+    """Canonicalize a value and shrink it for reports: lists keep 5 items, strings keep max_chars characters."""
+    canon = canonicalize(value)
+    if isinstance(canon, str):
+        overflow = len(canon) - max_chars
+        return canon[:max_chars] + f"... ({overflow} more chars)" if overflow > 0 else canon
+    if isinstance(canon, dict):
+        return {key: truncate_value(item, max_chars=max_chars) for key, item in canon.items()}
+    if not isinstance(canon, list):
+        return canon
+    hidden = len(canon) - 5  # number of list items beyond the first five
+    return [truncate_value(item, max_chars=max_chars) for item in canon[:5]] + ([f"... ({hidden} more)"] if hidden > 0 else [])
+
+
+def compact_mismatch_rows(rows: Sequence[Dict[str, Any]], limit: int = 5) -> List[Dict[str, Any]]:
+    """Return a trimmed preview of the first `limit` mismatch rows.
+
+    Keeps key/dt/status plus truncate_value-shortened record (240 chars) and mismatches (360 chars).
+    """
+    return [
+        {
+            **{name: entry.get(name) for name in ("key", "dt", "status")},
+            "record": truncate_value(entry.get("record"), max_chars=240),
+            "mismatches": truncate_value(entry.get("mismatches"), max_chars=360),
+        }
+        for entry in rows[:limit]
+    ]
+
+
+def write_report_summary(  # Write the JSON and Markdown summaries of a validation run to the paths given by summary_paths(report_path).
+    report_path: Path,
+    result: Dict[str, Any],
+    mismatch_rows: Sequence[Dict[str, Any]],
+    warning_rows: Sequence[Dict[str, Any]],
+) -> None:
+    summary_json_path, summary_md_path = summary_paths(report_path)
+    summary = build_report_summary(report_path, result, mismatch_rows, warning_rows)
+    with summary_json_path.open("w", encoding="utf-8") as f:  # machine-readable summary
+        json.dump(summary, f, ensure_ascii=False, indent=2, cls=JsonEncoder)
+
+    lines = [  # Markdown header: partition, sampling, pass/fail counts and report locations
+        "# Patent XML 字段校验报告摘要",
+        "",
+        f"- 分区: `{result.get('dt')}`",
+        f"- 抽样: `{result.get('sample_mode')}`, 数量 `{result.get('sample_size')}`",
+        f"- 结果: 已校验 `{result.get('checked')}`，通过 `{result.get('passed')}`，失败 `{result.get('failed')}`",
+        f"- XML 解析失败: `{result.get('xml_parse_failed')}`",
+        f"- 明细报告: `{report_path}`",
+        f"- 报告目录: `{report_path.parent}`",
+        "",
+        "## 字段问题分布",
+        "",
+    ]
+    for field, count in summary["field_counts"].items():  # mismatch count per field
+        lines.append(f"- `{field}`: {count}")
+    if not summary["field_counts"]:
+        lines.append("- 无")
+    lines.extend(["", "## 字段问题样例", ""])
+    for field, samples in summary["field_samples"].items():  # sample mismatches for the fields kept by build_report_summary
+        lines.append(f"### {field} ({summary['field_counts'].get(field)})")
+        lines.append("")
+        for sample in samples:
+            lines.append(f"- key `{sample.get('key')}`, dt `{sample.get('dt')}`, reason `{sample.get('reason')}`")
+            lines.append(f"  - expected: `{json.dumps(sample.get('expected'), ensure_ascii=False, cls=JsonEncoder)}`")
+            lines.append(f"  - actual: `{json.dumps(sample.get('actual'), ensure_ascii=False, cls=JsonEncoder)}`")
+            lines.append("")
+    lines.extend(["", "## 跳过/告警字段", ""])
+    for field, count in summary["warning_field_counts"].items():  # warning count per field
+        lines.append(f"- `{field}`: {count}")
+    if not summary["warning_field_counts"]:
+        lines.append("- 无")
+    branch_coverage = summary.get("result", {}).get("branch_coverage", {})
+    lines.extend(["", "## 字段 Branch 覆盖", ""])
+    lines.append(f"- 覆盖字段数: `{branch_coverage.get('field_count', 0)}`")
+    lines.append(f"- 覆盖 branch 总数: `{branch_coverage.get('total_branch_count', 0)}`")
+    for field, count in (branch_coverage.get("by_field") or {}).items():
+        lines.append(f"- `{field}`: {count}")
+    with summary_md_path.open("w", encoding="utf-8") as f:
+        f.write("\n".join(lines).rstrip() + "\n")  # trailing blank lines are stripped; the file ends with a single newline
+
+
+def validate_db(
+    *,
+    config_path: Path,
+    table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    sample_mode: str,
+    report_path: Optional[Path],
+    mapping_csv: Path = DEFAULT_MAPPING_CSV,
+    xml_field: str = DEFAULT_XML_FIELD,
+    key_field: str = "document_number",
+    include_xml_field: bool = False,
+    fields: Optional[Sequence[str]] = None,
+) -> Dict[str, Any]:
+    """Sample rows per dt partition, compare parsed columns with values re-extracted from the XML, and report.
+
+    Writes the mismatch/warning JSONL files and summaries when report_path is given, prints the
+    result as one JSON line, and returns it (counts, branch coverage, sample mismatches).
+    """
+    sample_mode = normalize_sample_mode(sample_mode)
+    rules = load_patent_rules(mapping_csv)
+    selected_fields = set(fields) if fields else None
+    cfg = load_config(config_path)
+    mysql_cfg = cfg.get("mysql", {}) if isinstance(cfg.get("mysql"), dict) else {}
+    catalog = mysql_cfg.get("catalog")
+    database = str(mysql_cfg.get("database") or "dws")
+    table = qualify_table_name(table, catalog, database)
+    _log(
+        f"[info] 专利 XML 字段校验开始：dt={dt!r}, limit={limit}, sample_mode={sample_mode}, "
+        f"table={table}, xml_field={xml_field}"
+    )
+
+    checked = passed = failed = xml_parse_failed = 0
+    mismatch_rows: List[Dict[str, Any]] = []
+    warning_rows: List[Dict[str, Any]] = []
+    field_branch_counts: Dict[str, Counter] = {}
+
+    with connect_starrocks(config_path) as conn:
+        _log("[info] StarRocks 连接成功")
+        dt_list = [dt] if dt is not None else discover_dt_values(conn, table)
+        if dt is None:
+            _log(f"[info] 自动发现 {len(dt_list)} 个 dt 分区，逐分区验证")
+
+        for partition_dt in dt_list:
+            _log(f"[info] 分区 {partition_dt}：开始抽样记录…")
+            query_limit = limit
+            if sample_mode == SAMPLE_MODE_BRANCH_COVERAGE and limit is not None:
+                query_limit = max(int(limit), int(limit) * BRANCH_COVERAGE_CANDIDATE_MULTIPLIER)
+            sql, params = build_sample_query(
+                table, partition_dt, query_limit, key_field=key_field, xml_field=xml_field, sample_mode=sample_mode
+            )
+            t0 = time.monotonic()
+            rows = fetch_records(conn, sql, params)
+            if sample_mode == SAMPLE_MODE_BRANCH_COVERAGE:
+                candidate_count = len(rows)
+                rows = select_branch_coverage_rows(
+                    rows,
+                    rules,
+                    limit=limit,
+                    xml_field=xml_field,
+                    include_xml_field=include_xml_field,
+                    selected_fields=selected_fields,
+                )
+                _log(
+                    f"[info] 分区 {partition_dt}：branch 候选 {candidate_count} 条，"
+                    f"保留 {len(rows)} 条"
+                )
+            _log(f"[info] 分区 {partition_dt}：抽到 {len(rows)} 条，耗时 {time.monotonic() - t0:.1f}s，开始解析 XML…")
+            for idx, row in enumerate(rows, start=1):
+                checked += 1
+                if idx == 1 or idx % 20 == 0:
+                    _log(f"[info] 分区 {partition_dt}：已比对 {idx}/{len(rows)} 条")
+                key = row.get(key_field) or row.get("sha256") or f"{partition_dt}:{idx}"
+                mismatches, warnings, branches = validate_row(
+                    row,
+                    rules,
+                    xml_field=xml_field,
+                    include_xml_field=include_xml_field,
+                    selected_fields=selected_fields,
+                )
+                for field, branch in branches.items():
+                    if not branch:
+                        continue
+                    field_branch_counts.setdefault(field, Counter())[branch] += 1
+                if branches.get(xml_field) == "xml_parse_failed":  # not `xml_field in mismatches`: that also fires on a plain XML-column mismatch
+                    xml_parse_failed += 1
+                if warnings:
+                    warning_rows.append({"key": key, "dt": partition_dt, "warnings": warnings})
+                if mismatches:
+                    failed += 1
+                    mismatch_rows.append(
+                        {
+                            "key": key,
+                            "dt": partition_dt,
+                            "status": "field_mismatch",
+                            "record": compact_record_for_report(row, xml_field),
+                            "mismatches": mismatches,
+                        }
+                    )
+                else:
+                    passed += 1
+
+    if report_path is not None:
+        report_path.parent.mkdir(parents=True, exist_ok=True)
+        with report_path.open("w", encoding="utf-8") as f:
+            for row in mismatch_rows:
+                f.write(json.dumps(row, ensure_ascii=False, cls=JsonEncoder) + "\n")
+        warning_path = report_path.parent / "xml_field_warning.jsonl"
+        with warning_path.open("w", encoding="utf-8") as f:
+            for row in warning_rows:
+                f.write(json.dumps(row, ensure_ascii=False, cls=JsonEncoder) + "\n")
+
+    result = {
+        "status": "ok",
+        "kind": "patent_xml",
+        "table": table,
+        "key_field": key_field,
+        "xml_field": xml_field,
+        "dt": dt,
+        "sample_mode": sample_mode,
+        "sample_size": limit,
+        "checked": checked,
+        "passed": passed,
+        "failed": failed,
+        "xml_parse_failed": xml_parse_failed,
+        "warning_rows": len(warning_rows),
+        "branch_coverage": summarize_branch_coverage(field_branch_counts),
+        "report_path": str(report_path) if report_path is not None else None,
+        "sample_mismatches": compact_mismatch_rows(mismatch_rows),
+    }
+    if report_path is not None:
+        write_report_summary(report_path, result, mismatch_rows, warning_rows)
+    print(json.dumps(result, ensure_ascii=False, cls=JsonEncoder))
+    return result
+
+
+def cli() -> None:
+    """Command-line entry point: parse options (defaults from the config's patent_parsed_info section) and run validate_db.
+
+    --config is pre-parsed first so that the remaining options can take their defaults from that file.
+    """
+    config_parser = argparse.ArgumentParser(add_help=False)
+    config_parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH)
+    config_args, _ = config_parser.parse_known_args()
+    cfg = load_config(config_args.config) if config_args.config.exists() else {}
+    patent_cfg = cfg.get("patent_parsed_info", {}) if isinstance(cfg.get("patent_parsed_info"), dict) else {}
+
+    default_csv = patent_cfg.get("mapping_csv")
+    default_csv_path = PROJECT_ROOT / default_csv if default_csv else DEFAULT_MAPPING_CSV
+
+    parser = argparse.ArgumentParser(description="Validate parsed patent DB fields against raw XML content.")
+    parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH, help="shared settings JSON path")
+    parser.add_argument("--mapping-csv", type=Path, default=default_csv_path, help="patent field mapping CSV")
+    parser.add_argument("--table", default=patent_cfg.get("table", DEFAULT_TABLE))
+    parser.add_argument("--dt", default=patent_cfg.get("dt"), help="dt partition filter")
+    parser.add_argument("--limit", type=int, default=200 if patent_cfg.get("limit") is None else int(patent_cfg["limit"]))  # a null limit in the config means "use the default", not a TypeError
+    parser.add_argument("--full", action="store_true", help="validate all sampled partition rows without LIMIT")
+    parser.add_argument("--xml-field", default=patent_cfg.get("xml_field", DEFAULT_XML_FIELD))
+    parser.add_argument("--key-field", default=patent_cfg.get("key_field", "document_number"))
+    parser.add_argument(
+        "--sample-mode",
+        choices=(SAMPLE_MODE_RANDOM, SAMPLE_MODE_BRANCH_COVERAGE),
+        default=normalize_sample_mode(patent_cfg.get("sample_mode", SAMPLE_MODE_BRANCH_COVERAGE)),
+        help="random: 随机抽样；branch-coverage: 覆盖所有 branch 抽样",
+    )
+    parser.add_argument(
+        "--fields",
+        default=patent_cfg.get("fields"),
+        help="comma separated field allowlist, e.g. document_number,publication_date",
+    )
+    parser.add_argument("--include-xml-field", action="store_true", help="also compare the XML field itself")
+    parser.add_argument("--report", type=Path, default=patent_cfg.get("report_path"), help="JSONL report path")
+    args = parser.parse_args()
+
+    fields = None
+    if args.fields:
+        fields = [field.strip() for field in str(args.fields).split(",") if field.strip()]
+    report_path = Path(args.report) if args.report else default_report_path(args.dt, args.sample_mode, args.full)
+    validate_db(
+        config_path=args.config,
+        table=args.table,
+        dt=args.dt,
+        limit=None if args.full else args.limit,
+        sample_mode=args.sample_mode,
+        report_path=report_path,
+        mapping_csv=args.mapping_csv,
+        xml_field=args.xml_field,
+        key_field=args.key_field,
+        include_xml_field=args.include_xml_field,
+        fields=fields,
+    )
+
+
+from dingo.config.input_args import EvaluatorRuleArgs
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail
+from dingo.model.model import Model
+from dingo.model.rule.base import BaseRule
+from dingo.model.rule.scibase.report_utils import bool_param, int_param, write_temp_settings
+
+
+def _fields_param(value: Any) -> Optional[List[str]]:
+    """Normalize the `fields` parameter (comma-separated string, collection or scalar) to a list of names, or None."""
+    if value is None or value == "":
+        return None
+    if not isinstance(value, (str, list, tuple, set)):
+        return [str(value).strip()]  # a lone scalar becomes a one-element list, even if it is blank
+    pieces = value.split(",") if isinstance(value, str) else [str(item) for item in value]
+    return [piece.strip() for piece in pieces if piece.strip()]
+
+
+@Model.rule_register("QUALITY_BAD_EFFECTIVENESS", ["sci_base_qa_test", "meta_patent_parsed_info"])
+class RuleSciBaseMetaPatentParsedInfoReport(BaseRule):
+    """Dingo rule that runs the patent XML parsed-field validation (validate_db) on every eval call."""
+
+    _metric_info = {
+        "category": "Rule-Based Metadata Quality Metrics",
+        "quality_dimension": "EFFECTIVENESS",
+        "metric_name": "RuleSciBaseMetaPatentParsedInfoReport",
+        "description": "Run SciBase patent XML parsed-field validation with branch coverage sampling.",
+        "paper_title": "",
+        "paper_url": "",
+        "paper_authors": "",
+        "evaluation_results": "",
+    }
+
+    _required_fields = [RequiredField.METADATA]
+    dynamic_config = EvaluatorRuleArgs(parameters={})
+
+    @classmethod
+    def eval(cls, input_data: Data) -> EvalDetail:
+        """Run validate_db with dynamic_config.parameters (input_data is ignored); bad when any row failed or any XML was unparsable."""
+        del input_data
+        params = cls.dynamic_config.parameters or {}
+        full = bool_param(params, "full", False)
+        sample_mode = normalize_sample_mode(params.get("sample_mode", SAMPLE_MODE_BRANCH_COVERAGE))
+        report_path = Path(params["report_path"]) if params.get("report_path") else None
+        if report_path is None and params.get("output_dir"):
+            report_path = Path(str(params["output_dir"])) / "xml_field_mismatch.jsonl"
+        if report_path is None:
+            report_path = default_report_path(params.get("dt"), sample_mode, full)
+
+        config_path = write_temp_settings(params)
+        try:
+            result = validate_db(
+                config_path=config_path,
+                table=str(params.get("target_table") or params.get("table") or DEFAULT_TABLE),
+                dt=params.get("dt"),
+                limit=None if full else int_param(params, "limit", 200),
+                sample_mode=sample_mode,
+                report_path=report_path,
+                mapping_csv=Path(str(params.get("mapping_csv") or DEFAULT_MAPPING_CSV)),
+                xml_field=str(params.get("xml_field") or DEFAULT_XML_FIELD),
+                key_field=str(params.get("key_field") or "document_number"),
+                include_xml_field=bool_param(params, "include_xml_field", False),
+                fields=_fields_param(params.get("fields")),
+            )
+        finally:
+            # The temporary settings file embeds the SQL password and is created with delete=False,
+            # so remove it on success and on failure instead of leaving one behind per eval call.
+            if config_path.exists():
+                config_path.unlink()
+        branch_coverage = result.get("branch_coverage") or {}
+        is_bad = bool(result.get("failed") or result.get("xml_parse_failed"))
+        return EvalDetail(
+            metric=cls.__name__,
+            status=is_bad,
+            label=[f"{cls.metric_type}.{cls.__name__}" if is_bad else "QUALITY_GOOD"],
+            reason=[
+                str(report_path.parent),
+                f"checked={result.get('checked')}",
+                f"failed={result.get('failed')}",
+                f"branch_fields={branch_coverage.get('field_count', 0)}",
+                f"branch_total={branch_coverage.get('total_branch_count', 0)}",
+            ],
+        )
+
+
+if __name__ == "__main__":  # Run the command-line interface when this module is executed as a script.
+    cli()
diff --git a/dingo/model/rule/scibase/report_utils.py b/dingo/model/rule/scibase/report_utils.py
new file mode 100644
index 00000000..6bc7ef3c
--- /dev/null
+++ b/dingo/model/rule/scibase/report_utils.py
@@ -0,0 +1,163 @@
+import json
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, Optional
+from urllib.parse import parse_qsl
+
+
+def load_scibase_parameters(params: Optional[Dict[str, Any]]) -> Dict[str, Any]:  # Return a shallow copy of params; None (or any falsy value) becomes {}.
+    return dict(params or {})
+
+
+def datasource_note() -> Dict[str, str]:
+    """Return a fixed dict of descriptive strings (source, connection_config, input_config) about the Dingo datasource."""
+    source = "dingo dataset datasource"
+    connection = "dataset.sql_config or dataset.s3_config"
+    input_key = "input_path"
+    return {"source": source, "connection_config": connection, "input_config": input_key}
+
+
+def table_params(params: Dict[str, Any], defaults: Dict[str, str]) -> Dict[str, Any]:
+    """Overlay the table/partition overrides found in params on a copy of defaults.
+
+    Only the keys listed below are considered, and a key whose value is None (or missing)
+    leaves the default untouched. Neither input mapping is modified.
+    """
+    override_keys = (
+        "dt", "s3_dt",
+        "target_dt", "paper_dt", "ebook_dt",
+        "source_table", "target_table",
+        "paper_table", "ebook_table", "xinghe_table",
+    )
+    merged = dict(defaults)
+    overrides = {name: params[name] for name in override_keys if params.get(name) is not None}
+    # Existing keys keep their position; keys new to the defaults are appended in override_keys order.
+    merged.update(overrides)
+    return merged
+
+
+def dingo_sql_config(params: Dict[str, Any]) -> Dict[str, Any]:
+    """Return a copy of the SQL config from params; raise RuntimeError when host or username is missing or empty."""
+    raw = params.get("_dingo_dataset_sql_config") or params.get("sql_config") or {}
+    config = dict(raw)
+    if config.get("host") and config.get("username"):
+        return config
+    # Both connection essentials are mandatory; fail early with a pointer to the config key to set.
+    raise RuntimeError("SQL config is required for this SciBase validator. Set dataset.sql_config in the Dingo input config.")
+
+
+def dingo_s3_config(params: Dict[str, Any]) -> Dict[str, Any]:
+    """Return a copy of the S3 config from params (`_dingo_dataset_s3_config` first, then `s3_config`), or {} if absent."""
+    return dict(params.get("_dingo_dataset_s3_config") or params.get("s3_config") or {})
+
+
+def s3_path_from_dingo(params: Dict[str, Any]) -> Optional[str]:
+    """Resolve the S3 location to read, or None when it cannot be derived.
+
+    Precedence: explicit s3_path; an s3:// input path; the input path under the configured
+    bucket when the dataset source is "s3"; the bare bucket root when s3_subpath is set.
+    """
+    if params.get("s3_path"):
+        return str(params["s3_path"])
+    bucket = str(dingo_s3_config(params).get("s3_bucket") or "").strip().strip("/")
+    raw_input = params.get("_dingo_input_path")
+    if raw_input:
+        location = str(raw_input).strip()
+        if location.startswith("s3://"):
+            return location
+        if bucket and params.get("_dingo_dataset_source") == "s3":
+            return f"s3://{bucket}/{location.lstrip('/')}"
+    # NOTE(review): only the bucket root is returned here; the subpath itself is not appended.
+    if bucket and params.get("s3_subpath"):
+        return f"s3://{bucket}/"
+    return None
+
+
+def _connect_args_dict(raw: Any) -> Dict[str, str]:
+    """Parse a query-string style connect_args value (one optional leading "?") into a dict; falsy input gives {}."""
+    if not raw:
+        return {}
+    query = str(raw)
+    pairs = parse_qsl(query[1:] if query.startswith("?") else query, keep_blank_values=True)
+    return dict(pairs)  # for a repeated key the last occurrence wins
+
+
+def mysql_settings_from_dingo(params: Dict[str, Any]) -> Dict[str, Any]:
+    """Translate the Dingo SQL config into the `mysql` settings block of the validator settings file.
+
+    charset comes from connect_args (default utf8mb4); catalog and the timeout keys are copied
+    from params when not None. dingo_sql_config raises RuntimeError without host/username.
+    """
+    sql_config = dingo_sql_config(params)
+    charset = _connect_args_dict(sql_config.get("connect_args")).get("charset", "utf8mb4")
+    settings: Dict[str, Any] = {"host": sql_config.get("host")}
+    settings["port"] = int(sql_config.get("port") or 0)  # a missing or falsy port becomes 0
+    settings.update(user=sql_config.get("username"), password=sql_config.get("password"))
+    settings.update(database=sql_config.get("database") or "dws", charset=charset)
+    extras = ("catalog", "connect_timeout", "read_timeout", "read_timeout_sec")
+    settings.update({key: params[key] for key in extras if params.get(key) is not None})
+    return settings
+
+
+def s3_settings_from_dingo(params: Dict[str, Any]) -> Dict[str, Any]:
+    """Translate the Dingo S3 config into the `s3` settings block (endpoint without scheme, keys, path, format).
+
+    use_ssl / verify_ssl are taken from params first, then from the S3 config, and omitted when
+    neither sets them. NOTE(review): the URL scheme is dropped without deriving use_ssl from it.
+    """
+    s3_config = dingo_s3_config(params)
+    endpoint = str(s3_config.get("s3_endpoint_url") or "").rstrip("/")
+    for scheme in ("https://", "http://"):
+        if endpoint.startswith(scheme):
+            endpoint = endpoint[len(scheme):]
+            break
+    settings: Dict[str, Any] = {"endpoint": endpoint}
+    settings.update(access_key=s3_config.get("s3_ak"), secret_key=s3_config.get("s3_sk"))
+    settings.update(path=s3_path_from_dingo(params), format=params.get("s3_format", "auto"))
+    if params.get("s3_subpath") is not None:
+        settings["subpath"] = params["s3_subpath"]
+    for key in ("use_ssl", "verify_ssl"):
+        chosen = params.get(key) if params.get(key) is not None else s3_config.get(key)
+        if chosen is not None:
+            settings[key] = chosen
+    return settings
+
+
+def write_temp_settings(params: Dict[str, Any], *, include_s3: bool = False) -> Path:
+    """Write the validator settings derived from params to a new temp JSON file and return its path.
+
+    The file (prefix dingo_scibase_, suffix .json) is created with delete=False, so it stays on
+    disk after this call; it contains the SQL password, so the caller should remove it when done.
+    """
+    payload: Dict[str, Any] = {"mysql": mysql_settings_from_dingo(params), "retry": params.get("retry", {})}
+    if include_s3:
+        s3_settings = s3_settings_from_dingo(params)
+        payload["s3"] = s3_settings
+        payload["osi_arxiv"] = {
+            "s3": s3_settings,
+            "mapping_csv": params.get("mapping_csv"),
+            "target_table": params.get("target_table"),
+            "database": params.get("database"),
+            "catalog": params.get("catalog"),
+        }
+    handle = tempfile.NamedTemporaryFile(
+        "w", encoding="utf-8", suffix=".json", prefix="dingo_scibase_", delete=False
+    )
+    with handle:
+        json.dump(payload, handle, ensure_ascii=False, indent=2)
+    # The name remains valid after closing because the file was created with delete=False.
+    return Path(handle.name)
+
+
+def int_param(params: Dict[str, Any], key: str, default: int) -> int:  # Read params[key] as an int, falling back to default when the key is missing.
+    value = params.get(key, default)
+    return default if value is None else int(value)  # an explicit None also yields default; anything else goes through int(), which may raise
+
+
+def bool_param(params: Dict[str, Any], key: str, default: bool = False) -> bool:
+    """Read params[key] (or default) as a bool; strings are true only for 1/true/yes/y, case-insensitively."""
+    raw = params.get(key, default)
+    if not isinstance(raw, str):
+        return bool(raw)  # covers real booleans, numbers, None and containers via truthiness
+    truthy_words = ("1", "true", "yes", "y")
+    return raw.strip().lower() in truthy_words
diff --git a/dingo/model/rule/scibase/rule_quanliang.py b/dingo/model/rule/scibase/rule_quanliang.py
deleted file mode 100644
index 601fb7be..00000000
--- a/dingo/model/rule/scibase/rule_quanliang.py
+++ /dev/null
@@ -1,655 +0,0 @@
-import json
-import re
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Dict, List
-
-from dingo.config.input_args import EvaluatorRuleArgs
-from dingo.io.input import Data, RequiredField
-from dingo.io.output.eval_detail import EvalDetail, QualityLabel
-from dingo.model.model import Model
-from dingo.model.rule.base import BaseRule
-
-URL_RE = re.compile(r"^[Hh][Tt][Tt][Pp][Ss]?://[^/$.?#][\s\S]*$")
-DOI_RE = re.compile(r"^10\.\d{4,9}/([^A-Z\s\|]*)$")
-INVISIBLE_RE = re.compile(r"[\u2000-\u200F\u202F\u205F\u3000\uFEFF\u00A0\u2060-\u206F\xa0]")
-PAGE_RANGE_RE = re.compile(r"^\d+-\d+$")
-ISSN_RE = re.compile(r"^\d{4}-\d{3}[\dX]$")
-AUTHOR_SEP_RE = re.compile(r"[|;；]")
-
-OA_BOOL_VALUES = {"true", "false", "unknown"}
-METADATA_TYPE_VALUES = {"paper", "ebook"}
-OA_STATUS_VALUES = {"diamond", "gold", "green", "hybrid", "bronze", "closed", ""}
-LOC_TYPE_VALUES = {"download", "reader", "display", ""}
-JSON_LIST_FIELDS = {
-    "isbns",
-    "author",
-    "contributors",
-    "locations",
-    "access_oa_url",
-    "publication_venue_issn",
-    "references",
-    "related_works",
-}
-LICENSE_VALUES = {
-    "cc-by",
-    "cc-by-nc",
-    "cc-by-sa",
-    "cc-by-nd",
-    "cc-by-nc-sa",
-    "cc-by-nc-nd",
-    "other-oa",
-    "cc0",
-    "",
-    "public-domain",
-    "publisher-specific-oa",
-    "publisher-specific",
-    "wiley-specific",
-    "elsevier-specific",
-    "oup-specific",
-    "acs-specific",
-    "rsc-specific",
-    "iop-specific",
-    "unspecified-oa",
-    "implied-oa",
-    "nonexclusive-distrib",
-    "gpl-v1",
-    "gpl-v2",
-    "gpl-v3",
-    "mit",
-    "ogl-c",
-    "pd",
-}
-ACCESS_LICENSE_VALUES = set(LICENSE_VALUES)
-GRADE_CLASS_VALUES = {"k12", "higher-edu", "vocational-edu", "other", ""}
-GRADE_VALUES = {"小学", "初中", "高中", ""}
-
-_DEFAULT_LANGUAGE_VALUES = {"zh", "en", "ja", "de", "fr", "es", "ru", "ko", "ar"}
-ASSETS_DIR = Path(__file__).resolve().parent / "assets"
-
-
-def _load_language_allowed_values() -> set[str]:
-    base = ASSETS_DIR / "to_iso-639.json"
-    if not base.exists():
-        return set(_DEFAULT_LANGUAGE_VALUES)
-    try:
-        with base.open("r", encoding="utf-8") as f:
-            values = json.load(f)
-        if isinstance(values, dict):
-            return set(str(v) for v in values.values() if isinstance(v, str))
-    except (TypeError, ValueError, json.JSONDecodeError):
-        return set(_DEFAULT_LANGUAGE_VALUES)
-    return set(_DEFAULT_LANGUAGE_VALUES)
-
-
-def _load_journal_mapping() -> Dict[str, str]:
-    csv_path = ASSETS_DIR / "journal_name_mapping_execute_20260512.csv"
-    if not csv_path.exists():
-        return {}
-    # Lazy import to avoid top-level optional dependency / heavier import.
-    import csv
-
-    mapping: Dict[str, str] = {}
-    with csv_path.open("r", encoding="utf-8", newline="") as f:
-        for row in csv.DictReader(f):
-            source_name = row.get("source_journal_name")
-            target_name = row.get("target_journal_name")
-            if source_name and target_name:
-                mapping[source_name] = target_name
-    return mapping
-
-
-LANGUAGE_ALLOWED_VALUES = _load_language_allowed_values()
-JOURNAL_NAME_MAPPING = _load_journal_mapping()
-
-
-def _valid_isbn10(code: str) -> bool:
-    if not re.fullmatch(r"\d{9}[\dXx]", code):
-        return False
-    total = sum((10 - idx) * int(ch) for idx, ch in enumerate(code[:9]))
-    check = code[9].upper()
-    check_value = 10 if check == "X" else int(check)
-    total += check_value
-    return total % 11 == 0
-
-
-def _valid_isbn13(code: str) -> bool:
-    if not re.fullmatch(r"\d{13}", code):
-        return False
-    if not (code.startswith("978") or code.startswith("979")):
-        return False
-    total = sum(int(ch) * (1 if idx % 2 == 0 else 3) for idx, ch in enumerate(code))
-    return total % 10 == 0
-
-
-def _valid_issn(code: str) -> bool:
-    if not ISSN_RE.fullmatch(code):
-        return False
-    digits = code.replace("-", "")
-    total = sum(int(ch) * (8 - idx) for idx, ch in enumerate(digits[:7]))
-    calculated = (11 - (total % 11)) % 11
-    expected = "X" if calculated == 10 else str(calculated)
-    return digits[7].upper() == expected
-
-
-def check_metadata_type(metadata_type: Any) -> bool:
-    if metadata_type is None:
-        return True
-    if not isinstance(metadata_type, str):
-        return True
-    if metadata_type.strip() == "":
-        return True
-    return metadata_type not in METADATA_TYPE_VALUES
-
-
-def check_doi(doi: Any, metadata_type: Any) -> bool:
-    if metadata_type not in METADATA_TYPE_VALUES:
-        return False
-    required = metadata_type == "paper"
-    if doi is None:
-        return required
-    if not isinstance(doi, str):
-        return True
-    if doi == "":
-        return required
-    if doi != doi.lower():
-        return True
-    if "https://doi.org/" in doi.lower():
-        return True
-    return not bool(DOI_RE.fullmatch(doi))
-
-
-def check_isbns(isbns: Any, metadata_type: Any) -> bool:
-    if metadata_type not in METADATA_TYPE_VALUES:
-        return False
-    required = metadata_type == "ebook"
-    if isbns is None:
-        return required
-    if not (isinstance(isbns, list) and all(isinstance(x, str) for x in isbns)):
-        return True
-    if len(isbns) == 0:
-        return required
-    for item in isbns:
-        if not (_valid_isbn10(item) or _valid_isbn13(item)):
-            return True
-    return False
-
-
-def check_isbn13(isbn13: Any, metadata_type: Any) -> bool:
-    if metadata_type not in METADATA_TYPE_VALUES:
-        return False
-    required = metadata_type == "ebook"
-    if isbn13 is None:
-        return required
-    if not isinstance(isbn13, str):
-        return True
-    if isbn13 == "":
-        return required
-    return not _valid_isbn13(isbn13)
-
-
-def check_title(title: Any) -> bool:
-    if title is None:
-        return True
-    if not isinstance(title, str):
-        return True
-    if title == "":
-        return False
-    return bool(INVISIBLE_RE.search(title))
-
-
-def check_abstract(abstract: Any) -> bool:
-    if abstract is None:
-        return True
-    if not isinstance(abstract, str):
-        return True
-    if abstract == "":
-        return False
-    return bool(INVISIBLE_RE.search(abstract))
-
-
-def check_language(language: Any) -> bool:
-    if language is None:
-        return True
-    if not isinstance(language, str):
-        return True
-    if language == "":
-        return False
-    if not LANGUAGE_ALLOWED_VALUES:
-        return False
-    return language not in LANGUAGE_ALLOWED_VALUES
-
-
-def check_author(author: Any) -> bool:
-    if author is None:
-        return True
-    if not (isinstance(author, list) and all(isinstance(x, str) for x in author)):
-        return True
-    if len(author) == 0:
-        return False
-    for item in author:
-        if AUTHOR_SEP_RE.search(item):
-            return True
-    return False
-
-
-def check_contributors(contributors: Any) -> bool:
-    if contributors is None:
-        return True
-    if not (isinstance(contributors, list) and all(isinstance(x, str) for x in contributors)):
-        return True
-    if len(contributors) == 0:
-        return False
-    for item in contributors:
-        if AUTHOR_SEP_RE.search(item):
-            return True
-    return False
-
-
-def check_locations(locations: Any) -> bool:
-    if locations is None:
-        return True
-    if not isinstance(locations, list):
-        return True
-    if len(locations) == 0:
-        return False
-    for item in locations:
-        if not isinstance(item, dict):
-            return True
-        for key in ("type", "url", "license", "is_oa"):
-            if key not in item:
-                return True
-        if item["type"] not in LOC_TYPE_VALUES:
-            return True
-        if not (isinstance(item["url"], str) and URL_RE.fullmatch(item["url"])):
-            return True
-        if item["license"] not in LICENSE_VALUES:
-            return True
-        if item["is_oa"] not in OA_BOOL_VALUES:
-            return True
-    return False
-
-
-def check_access_is_oa(access_is_oa: Any, metadata_type: Any) -> bool:
-    if metadata_type not in METADATA_TYPE_VALUES:
-        return False
-    required = metadata_type == "paper"
-    if access_is_oa is None:
-        return required
-    if not isinstance(access_is_oa, str):
-        return True
-    if access_is_oa == "":
-        return required
-    return access_is_oa not in OA_BOOL_VALUES
-
-
-def check_access_oa_status(access_oa_status: Any) -> bool:
-    if access_oa_status is None:
-        return True
-    if not isinstance(access_oa_status, str):
-        return True
-    return access_oa_status not in OA_STATUS_VALUES
-
-
-def check_access_oa_url(access_oa_url: Any) -> bool:
-    if access_oa_url is None:
-        return True
-    if not (isinstance(access_oa_url, list) and all(isinstance(x, str) for x in access_oa_url)):
-        return True
-    if len(access_oa_url) == 0:
-        return False
-    return any(not bool(URL_RE.fullmatch(item)) for item in access_oa_url)
-
-
-def check_access_license(access_license: Any) -> bool:
-    if access_license is None:
-        return True
-    if not isinstance(access_license, str):
-        return True
-    if access_license == "":
-        return False
-    return access_license not in ACCESS_LICENSE_VALUES
-
-
-def check_publication_published_date(publication_published_date: Any) -> bool:
-    if publication_published_date is None:
-        return True
-    if not isinstance(publication_published_date, str):
-        return True
-    if publication_published_date == "":
-        return False
-    if not bool(re.fullmatch(r"\d{4}-\d{2}-\d{2}", publication_published_date)):
-        return True
-    try:
-        datetime.strptime(publication_published_date, "%Y-%m-%d")
-        return False
-    except ValueError:
-        return True
-
-
-def check_publication_published_year(publication_published_year: Any) -> bool:
-    if publication_published_year is None:
-        return False
-    if not isinstance(publication_published_year, int) or isinstance(publication_published_year, bool):
-        return True
-    return not (0 < publication_published_year < 2100)
-
-
-def check_publication_venue_issn(publication_venue_issn: Any) -> bool:
-    if publication_venue_issn is None:
-        return True
-    if not (isinstance(publication_venue_issn, list) and all(isinstance(x, str) for x in publication_venue_issn)):
-        return True
-    if len(publication_venue_issn) == 0:
-        return False
-    for item in publication_venue_issn:
-        if not _valid_issn(item):
-            return True
-    return False
-
-
-def check_publication_venue_biblio_volume(publication_venue_biblio_volume: Any) -> bool:
-    if publication_venue_biblio_volume is None:
-        return True
-    if not isinstance(publication_venue_biblio_volume, str):
-        return True
-    if publication_venue_biblio_volume == "":
-        return False
-    try:
-        int(publication_venue_biblio_volume)
-        return False
-    except (TypeError, ValueError):
-        return True
-
-
-def check_publication_venue_biblio_issue(publication_venue_biblio_issue: Any) -> bool:
-    if publication_venue_biblio_issue is None:
-        return True
-    if not isinstance(publication_venue_biblio_issue, str):
-        return True
-    if publication_venue_biblio_issue == "":
-        return False
-    try:
-        int(publication_venue_biblio_issue)
-        return False
-    except (TypeError, ValueError):
-        return True
-
-
-def check_publication_venue_biblio_pages(publication_venue_biblio_pages: Any) -> bool:
-    if publication_venue_biblio_pages is None:
-        return True
-    if not isinstance(publication_venue_biblio_pages, str):
-        return True
-    if publication_venue_biblio_pages == "":
-        return False
-    if not PAGE_RANGE_RE.fullmatch(publication_venue_biblio_pages):
-        return True
-    start, end = [int(x.strip()) for x in publication_venue_biblio_pages.split("-")]
-    return start <= 0 or end <= 0 or start > end
-
-
-def check_publication_pages(publication_pages: Any) -> bool:
-    if publication_pages is None:
-        return False
-    if not isinstance(publication_pages, int) or isinstance(publication_pages, bool):
-        return True
-    return publication_pages <= 0
-
-
-def check_publication_venue_name_unified(
-    publication_venue_name_unified: Any, publication_venue_name: Any
-) -> bool:
-    if publication_venue_name_unified is None:
-        return True
-    if not isinstance(publication_venue_name_unified, str):
-        return True
-    if publication_venue_name is not None and not isinstance(publication_venue_name, str):
-        return True
-    expected_target = None
-    if isinstance(publication_venue_name, str) and publication_venue_name != "":
-        expected_target = JOURNAL_NAME_MAPPING.get(publication_venue_name, publication_venue_name)
-    if publication_venue_name_unified == "":
-        return False
-    if expected_target is None:
-        return True
-    return publication_venue_name_unified != expected_target
-
-
-def check_grade_class(grade_class: Any) -> bool:
-    if grade_class is None:
-        return True
-    if not isinstance(grade_class, str):
-        return True
-    if grade_class == "":
-        return False
-    return grade_class not in GRADE_CLASS_VALUES
-
-
-def check_grade(grade: Any, grade_class: Any) -> bool:
-    if grade is None:
-        return True
-    if not isinstance(grade, str):
-        return True
-    if grade_class is not None and not isinstance(grade_class, str):
-        return True
-    if grade == "":
-        return False
-    if grade not in GRADE_VALUES:
-        return True
-    if grade_class != "k12" and grade != "":
-        return True
-    return False
-
-
-def check_references(references: Any) -> bool:
-    if references is None:
-        return True
-    if not (isinstance(references, list) and all(isinstance(x, str) for x in references)):
-        return True
-    if len(references) == 0:
-        return False
-    return any(not URL_RE.fullmatch(item) for item in references)
-
-
-def check_related_works(related_works: Any) -> bool:
-    if related_works is None:
-        return True
-    if not (isinstance(related_works, list) and all(isinstance(x, str) for x in related_works)):
-        return True
-    if len(related_works) == 0:
-        return False
-    return any(not URL_RE.fullmatch(item) for item in related_works)
-
-
-def check_cited_by_api_url(cited_by_api_url: Any) -> bool:
-    if cited_by_api_url is None:
-        return True
-    if not isinstance(cited_by_api_url, str):
-        return True
-    if cited_by_api_url == "":
-        return False
-    return not bool(URL_RE.fullmatch(cited_by_api_url))
-
-
-def check_access_xinghe_repository_sha256(
-    access_xinghe_repository_sha256: Any, access_xinghe_repository_has_fulltext: Any
-) -> bool:
-    if access_xinghe_repository_sha256 is None:
-        return True
-    if not isinstance(access_xinghe_repository_has_fulltext, bool):
-        return True
-    has_fulltext = access_xinghe_repository_has_fulltext
-    if isinstance(access_xinghe_repository_sha256, str):
-        if not has_fulltext:
-            return False
-        return access_xinghe_repository_sha256 == ""
-    if not (
-        isinstance(access_xinghe_repository_sha256, list)
-        and all(isinstance(x, str) for x in access_xinghe_repository_sha256)
-    ):
-        return True
-    if not has_fulltext:
-        return False
-    return len(access_xinghe_repository_sha256) == 0
-
-
-def check_access_xinghe_repository_origin_path(
-    access_xinghe_repository_origin_path: Any, access_xinghe_repository_has_fulltext: Any
-) -> bool:
-    if not isinstance(access_xinghe_repository_origin_path, str):
-        return True
-    if not isinstance(access_xinghe_repository_has_fulltext, bool):
-        return True
-    if not access_xinghe_repository_has_fulltext:
-        return False
-    return access_xinghe_repository_origin_path.strip() == ""
-
-
-def _normalize_json_like_field(value: Any) -> Any:
-    if not isinstance(value, str):
-        return value
-    stripped = value.strip()
-    if not stripped:
-        return value
-    if stripped[0] not in ("[", "{"):
-        return value
-    try:
-        return json.loads(stripped)
-    except (TypeError, ValueError, json.JSONDecodeError):
-        cleaned = stripped.replace("\r", " ").replace("\n", " ").replace("\t", " ")
-        cleaned = "".join(ch if ord(ch) >= 32 else " " for ch in cleaned)
-        invalid_escape_re = re.compile(r'\\(?!["\\/bfnrtu])')
-        for _ in range(10):
-            next_cleaned = invalid_escape_re.sub(r"\\\\", cleaned)
-            if next_cleaned == cleaned:
-                break
-            cleaned = next_cleaned
-        try:
-            return json.loads(cleaned)
-        except (TypeError, ValueError, json.JSONDecodeError):
-            return value
-
-
-def _normalize_bool_field(value: Any) -> Any:
-    if isinstance(value, bool):
-        return value
-    if isinstance(value, int):
-        if value in (0, 1):
-            return bool(value)
-        return value
-    if isinstance(value, str):
-        lowered = value.strip().lower()
-        if lowered in ("1", "true"):
-            return True
-        if lowered in ("0", "false"):
-            return False
-    return value
-
-
-def normalize_record(record: Dict[str, Any]) -> Dict[str, Any]:
-    normalized = dict(record)
-    for field in JSON_LIST_FIELDS:
-        if field in normalized:
-            normalized[field] = _normalize_json_like_field(normalized.get(field))
-    normalized["access_xinghe_repository_has_fulltext"] = _normalize_bool_field(
-        normalized.get("access_xinghe_repository_has_fulltext")
-    )
-    return normalized
-
-
-FIELD_VALIDATORS = {
-    "metadata_type": lambda record: check_metadata_type(record.get("metadata_type")),
-    "doi": lambda record: check_doi(record.get("doi"), record.get("metadata_type")),
-    "isbns": lambda record: check_isbns(record.get("isbns"), record.get("metadata_type")),
-    "isbn13": lambda record: check_isbn13(record.get("isbn13"), record.get("metadata_type")),
-    "title": lambda record: check_title(record.get("title")),
-    "abstract": lambda record: check_abstract(record.get("abstract")),
-    "language": lambda record: check_language(record.get("language")),
-    "author": lambda record: check_author(record.get("author")),
-    "contributors": lambda record: check_contributors(record.get("contributors")),
-    "locations": lambda record: check_locations(record.get("locations")),
-    "access_is_oa": lambda record: check_access_is_oa(record.get("access_is_oa"), record.get("metadata_type")),
-    "access_oa_status": lambda record: check_access_oa_status(record.get("access_oa_status")),
-    "access_oa_url": lambda record: check_access_oa_url(record.get("access_oa_url")),
-    "access_license": lambda record: check_access_license(record.get("access_license")),
-    "publication_published_date": lambda record: check_publication_published_date(
-        record.get("publication_published_date")
-    ),
-    "publication_published_year": lambda record: check_publication_published_year(
-        record.get("publication_published_year")
-    ),
-    "publication_venue_issn": lambda record: check_publication_venue_issn(record.get("publication_venue_issn")),
-    "publication_venue_biblio_volume": lambda record: check_publication_venue_biblio_volume(
-        record.get("publication_venue_biblio_volume")
-    ),
-    "publication_venue_biblio_issue": lambda record: check_publication_venue_biblio_issue(
-        record.get("publication_venue_biblio_issue")
-    ),
-    "publication_venue_biblio_pages": lambda record: check_publication_venue_biblio_pages(
-        record.get("publication_venue_biblio_pages")
-    ),
-    "publication_pages": lambda record: check_publication_pages(record.get("publication_pages")),
-    "publication_venue_name_unified": lambda record: check_publication_venue_name_unified(
-        record.get("publication_venue_name_unified"),
-        record.get("publication_venue_name"),
-    ),
-    "grade_class": lambda record: check_grade_class(record.get("grade_class")),
-    "grade": lambda record: check_grade(record.get("grade"), record.get("grade_class")),
-    "references": lambda record: check_references(record.get("references")),
-    "related_works": lambda record: check_related_works(record.get("related_works")),
-    "cited_by_api_url": lambda record: check_cited_by_api_url(record.get("cited_by_api_url")),
-    "access_xinghe_repository_sha256": lambda record: check_access_xinghe_repository_sha256(
-        record.get("access_xinghe_repository_sha256"),
-        record.get("access_xinghe_repository_has_fulltext"),
-    ),
-    "access_xinghe_repository_origin_path": lambda record: check_access_xinghe_repository_origin_path(
-        record.get("access_xinghe_repository_origin_path"),
-        record.get("access_xinghe_repository_has_fulltext"),
-    ),
-}
-
-
-@Model.rule_register("QUALITY_BAD_EFFECTIVENESS", ["xinghe", "quanliang"])
-class RuleQuanliangFieldValidation(BaseRule):
-    _metric_info = {
-        "category": "Rule-Based Metadata Quality Metrics",
-        "quality_dimension": "EFFECTIVENESS",
-        "metric_name": "RuleQuanliangFieldValidation",
-        "description": "Validate Quanliang metadata fields and report invalid fields",
-        "paper_title": "",
-        "paper_url": "",
-        "paper_authors": "",
-        "evaluation_results": "",
-    }
-
-    _required_fields = [RequiredField.METADATA]
-    dynamic_config = EvaluatorRuleArgs(key_list=list(FIELD_VALIDATORS.keys()))
-
-    @classmethod
-    def eval(cls, input_data: Data) -> EvalDetail:
-        res = EvalDetail(metric=cls.__name__)
-        normalized = normalize_record(input_data.to_dict())
-        selected_fields = cls.dynamic_config.key_list or []
-        bad_fields: List[str] = []
-        reasons: List[str] = []
-        for field in selected_fields:
-            if field not in FIELD_VALIDATORS:
-                bad_fields.append(field)
-                reasons.append("unsupported field")
-                continue
-            if field not in normalized:
-                bad_fields.append(field)
-                reasons.append("missing field")
-                continue
-            if FIELD_VALIDATORS[field](normalized):
-                bad_fields.append(field)
-                reasons.append(f"{field} invalid")
-
-        if bad_fields:
-            res.status = True
-            res.label = bad_fields
-            res.reason = reasons
-        else:
-            res.label = [QualityLabel.QUALITY_GOOD]
-        return res
diff --git a/dingo/model/rule/scibase/union_unique_meta_data.py b/dingo/model/rule/scibase/union_unique_meta_data.py
new file mode 100644
index 00000000..3e222483
--- /dev/null
+++ b/dingo/model/rule/scibase/union_unique_meta_data.py
@@ -0,0 +1,2548 @@
+#!/usr/bin/env python3
+"""DB validator for unified metadata and Xinghe fulltext union table.
+
+The validator is read-only. It compares the unified target table with three
+source tables (paper unique, ebook unique, Xinghe fulltext), validates target
+field values, and reports target field NULL / empty rates.
+"""
+from __future__ import annotations
+
+import argparse
+import csv
+import html
+import json
+import re
+import sys
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from datetime import date, datetime
+from decimal import Decimal
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
+
+try:
+    import pymysql
+except ImportError:  # pragma: no cover - runtime dependency check
+    pymysql = None  # type: ignore
+
+
+PROJECT_ROOT = Path(__file__).resolve().parent  # directory that contains this module
+ASSETS_DIR = PROJECT_ROOT / "assets"
+DEFAULT_CONFIG_PATH = Path("sci_base_qa_test_config.json")  # relative path (resolved against the CWD)
+TEMPLATE_CONFIG_PATH = ASSETS_DIR / "settings.template.json"
+DEFAULT_MAPPING_CSV = ASSETS_DIR / "union_unique_data_mapping.csv"
+DEFAULT_JOURNAL_MAPPING_CSV = ASSETS_DIR / "journal_name_mapping_execute_20260512.csv"
+REPORT_ROOT = Path("report")  # relative path (resolved against the CWD)
+DEFAULT_PAPER_TABLE = "dws_meta_paper_doi_unique_acc_d"
+DEFAULT_EBOOK_TABLE = "dws_meta_ebook_isbn_unique_acc_d"
+DEFAULT_XINGHE_TABLE = "ads_xinghe_library_acc"
+DEFAULT_TARGET_TABLE = "ads_meta_unified_unique_meta_data_acc_d"
+XINGHE_SUPPLEMENT_FIELDS = {  # NOTE(review): presumably fields Xinghe may supplement - confirm at use site
+    "doi",
+    "title",
+    "abstract",
+    "language",
+    "author",
+    "grade_class",
+    "grade",
+    "supplementary_material",
+}
+IGNORED_TARGET_EXTRA_FIELDS = {"dt", "mesh"}  # NOTE(review): presumably target-only columns to skip - confirm
+LICENSE_ALLOWED: Set[str] = {  # canonical license tokens; the empty string is a member too
+    "cc-by",
+    "cc-by-nc",
+    "cc-by-sa",
+    "cc-by-nd",
+    "cc-by-nc-sa",
+    "cc-by-nc-nd",
+    "other-oa",
+    "cc0",
+    "",
+    "public-domain",
+    "publisher-specific-oa",
+    "publisher-specific",
+    "wiley-specific",
+    "elsevier-specific",
+    "oup-specific",
+    "acs-specific",
+    "rsc-specific",
+    "iop-specific",
+    "unspecified-oa",
+    "implied-oa",
+    "nonexclusive-distrib",
+    "gpl-v1",
+    "gpl-v2",
+    "gpl-v3",
+    "mit",
+    "ogl-c",
+    "pd",
+}
+DEFAULT_LICENSE_MAP: Dict[str, str] = {  # raw license URL / identifier -> canonical license token
+    "http://arxiv.org/licenses/nonexclusive-distrib/1.0/": "nonexclusive-distrib",
+    "https://arxiv.org/licenses/nonexclusive-distrib/1.0/": "nonexclusive-distrib",
+    "arxiv-nonexclusive-distrib-1.0": "nonexclusive-distrib",
+    "http://creativecommons.org/licenses/by/4.0/": "cc-by",
+    "https://creativecommons.org/licenses/by/4.0/": "cc-by",
+    "http://creativecommons.org/licenses/by/3.0/": "cc-by",
+    "https://creativecommons.org/licenses/by/3.0/": "cc-by",
+    "CC-BY-4.0": "cc-by",
+    "CC-BY-3.0": "cc-by",
+    "CCBY": "cc-by",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "cc-by-nc",
+    "https://creativecommons.org/licenses/by-nc/4.0/": "cc-by-nc",
+    "CCBYNC": "cc-by-nc",
+    "http://creativecommons.org/licenses/by-sa/4.0/": "cc-by-sa",
+    "https://creativecommons.org/licenses/by-sa/4.0/": "cc-by-sa",
+    "CCBYSA": "cc-by-sa",
+    "http://creativecommons.org/licenses/by-nd/4.0/": "cc-by-nd",
+    "https://creativecommons.org/licenses/by-nd/4.0/": "cc-by-nd",
+    "CCBYND": "cc-by-nd",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "cc-by-nc-sa",
+    "https://creativecommons.org/licenses/by-nc-sa/4.0/": "cc-by-nc-sa",
+    "CCBYNCSA": "cc-by-nc-sa",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "cc-by-nc-nd",
+    "https://creativecommons.org/licenses/by-nc-nd/4.0/": "cc-by-nc-nd",
+    "CCBYNCND": "cc-by-nc-nd",
+    "http://creativecommons.org/publicdomain/zero/1.0/": "cc0",
+    "https://creativecommons.org/publicdomain/zero/1.0/": "cc0",
+    "CC0-1.0": "cc0",
+    "CC0": "cc0",
+}
+CC_LICENSE_URL_RULES: List[Tuple[re.Pattern, str]] = [  # (case-insensitive pattern, license token) pairs
+    (re.compile(r"creativecommons\.org/licenses/by-nc-sa", re.I), "cc-by-nc-sa"),  # NOTE(review): longer
+    (re.compile(r"creativecommons\.org/licenses/by-nc-nd", re.I), "cc-by-nc-nd"),  # by-nc-* rules precede
+    (re.compile(r"creativecommons\.org/licenses/by-nc(?:/|$)", re.I), "cc-by-nc"),  # by-nc / by; order looks
+    (re.compile(r"creativecommons\.org/licenses/by-sa", re.I), "cc-by-sa"),  # significant - confirm at use site
+    (re.compile(r"creativecommons\.org/licenses/by-nd", re.I), "cc-by-nd"),
+    (re.compile(r"creativecommons\.org/licenses/by(?:/|$)", re.I), "cc-by"),
+    (re.compile(r"creativecommons\.org/publicdomain/zero", re.I), "cc0"),
+    (re.compile(r"arxiv\.org/licenses/nonexclusive-distrib", re.I), "nonexclusive-distrib"),
+]
+
+
+def log_step(message: str) -> None:  # Emit one "[info] ..." progress line on stderr.
+    print(f"[info] {message}", file=sys.stderr, flush=True)  # flushed on every call
+
+
+def timed_step(name: str):
+    """Return a context manager that logs the start, outcome and duration of ``name``."""
+    class _Timer:
+        def __enter__(self):
+            self.start = time.monotonic()  # monotonic: immune to wall-clock adjustments
+            log_step(f"{name} 开始")
+            return self
+
+        def __exit__(self, exc_type, exc, tb):
+            status = "失败" if exc_type else "完成"
+            log_step(f"{name} {status}，耗时 {time.monotonic() - self.start:.1f}s")
+            return False  # never suppress an exception raised inside the block
+
+    return _Timer()
+
+
+def safe_filename_token(value: Optional[Any]) -> str:
+    raw = "all" if value in (None, "") else str(value)  # chars outside [alnum-_] become "_"
+    return "".join(c if (c.isalnum() or c in "-_") else "_" for c in raw).strip("_") or "all"
+
+
+def default_output_dir(
+    target_dt: Optional[str],
+    paper_dt: Optional[str],
+    ebook_dt: Optional[str],
+    limit: Optional[int],
+    full: bool,
+) -> Path:
+    """Return the next sequence-numbered report directory path for ``target_dt``.
+
+    Only ``target_dt`` shapes the name; the other arguments are accepted and ignored.
+    """
+    stem = f"union_unique_meta_data_{safe_filename_token(target_dt)}_"
+    used = [0]  # sequence numbers already taken by existing report directories
+    if REPORT_ROOT.exists():
+        for candidate in REPORT_ROOT.glob(f"{stem}[0-9][0-9][0-9][0-9]"):
+            suffix = candidate.name.rsplit("_", 1)[-1]
+            if candidate.is_dir() and suffix.isdigit():
+                used.append(int(suffix))
+    return REPORT_ROOT / f"{stem}{max(used) + 1:04d}"
+
+
+@dataclass(frozen=True)
+class UnionFieldSpec:  # One mapping-CSV row: a unified field plus its per-source mapping cells.
+    field_name: str  # unified field name
+    data_type: str  # type name; load_union_specs passes it through normalize_data_type
+    paper_source: str  # raw paper-column cell; may be a placeholder rather than a field name
+    ebook_source: str  # raw ebook-column cell, same conventions
+    xinghe_source: str  # raw Xinghe-column cell, same conventions
+
+
+class JsonEncoder(json.JSONEncoder):
+    """JSON encoder that also accepts ``Decimal`` and ``date``/``datetime`` values."""
+    def default(self, obj: Any) -> Any:
+        if isinstance(obj, (date, datetime)):
+            return obj.isoformat()
+        if not isinstance(obj, Decimal):
+            return super().default(obj)
+        # Whole-valued decimals are emitted as ints, everything else as floats.
+        return int(obj) if obj == obj.to_integral_value() else float(obj)
+
+
+def normalize_data_type(data_type: str) -> str:
+    """Map a mapping-CSV type name to its normalized lower-case type name.
+
+    ``list[x]`` becomes ``array<x>``, ``object`` becomes ``map``, scalar aliases are
+    renamed (``integer`` -> ``int``, ``long`` -> ``bigint``), any ``timestamp...``
+    becomes ``bigint``; every other name is returned stripped and lower-cased.
+    """
+    lower = (data_type or "").strip().lower()
+    aliases = {"object": "map", "integer": "int", "long": "bigint"}
+    aliases.update({same: same for same in ("string", "float", "boolean")})
+    if lower.startswith("list["):
+        # Drop the closing bracket only when present so "list[x" keeps its full name.
+        inner = lower[5:-1] if lower.endswith("]") else lower[5:]
+        return f"array<{inner.strip()}>"
+    if lower in aliases:
+        return aliases[lower]
+    if lower.startswith("timestamp"):
+        return "bigint"
+    return lower
+
+
+def _is_field_ref(value: str) -> bool:
+    """Tell whether a mapping cell names a source column rather than a note.
+
+    Empty cells, the "-" and "/" placeholders, text containing CJK ideographs
+    (U+4E00-U+9FFF) and text containing a single quote are not field references.
+    """
+    has_cjk = any("\u4e00" <= ch <= "\u9fff" for ch in value or "")
+    return bool(value) and value not in ("-", "/") and not has_cjk and "'" not in value
+
+
+def load_union_specs(
+    path: Path,
+    *,
+    field_col: str = "",
+    type_col: str = "",
+    paper_col: str = "",
+    ebook_col: str = "",
+    xinghe_col: str = "",
+) -> List[UnionFieldSpec]:
+    """Read the union mapping CSV into one ``UnionFieldSpec`` per named row.
+
+    Column names left empty are auto-detected from the two known header variants.
+    Raises ``ValueError`` when the field-name column is absent from the header.
+    """
+
+    def pick(explicit: str, preferred: str, fallback: str) -> str:
+        return explicit or (preferred if preferred in header else fallback)
+
+    def cell(row: Dict[str, Any], column: str) -> str:
+        return (row.get(column) or "").strip()
+
+    with path.open(encoding="utf-8-sig", newline="") as f:
+        reader = csv.DictReader(f)
+        header = reader.fieldnames or []
+        field_col = pick(field_col, "统一字段名", "字段名")
+        type_col = pick(type_col, "字段值数据类型", "数据类型")
+        paper_col = pick(paper_col, "源字段映射(论文)", "论文表对应字段")
+        ebook_col = pick(ebook_col, "源字段映射(图书)", "图书表对应字段")
+        xinghe_col = pick(xinghe_col, "源字段映射(星河)", "星河全文表对应字段")
+        if field_col not in header:
+            available = ", ".join(fn for fn in header if fn.strip())
+            raise ValueError(
+                f"映射文件 {path} 缺少字段列 {field_col!r}（可用列: {available}）"
+            )
+        return [
+            UnionFieldSpec(
+                cell(row, field_col), normalize_data_type(cell(row, type_col)),
+                cell(row, paper_col), cell(row, ebook_col), cell(row, xinghe_col),
+            )
+            for row in reader
+            if cell(row, field_col)
+        ]
+
+
+def build_field_maps(
+    specs: Sequence[UnionFieldSpec],
+    metadata_type: str,
+) -> Tuple[Dict[str, str], Dict[str, str]]:
+    """Build ``source field -> unified field`` maps for metadata and xinghe.
+
+    The metadata map reads ``paper_source`` for ``"paper"`` and
+    ``ebook_source`` otherwise; cells that are not field references are skipped.
+    """
+    attr = "paper_source" if metadata_type == "paper" else "ebook_source"
+    pairs = [(getattr(s, attr), s.xinghe_source, s.field_name) for s in specs]
+    metadata_map = {src: dst for src, _, dst in pairs if _is_field_ref(src)}
+    return metadata_map, {src: dst for _, src, dst in pairs if _is_field_ref(src)}
+
+
+def build_empty_output(specs: Sequence[UnionFieldSpec], metadata_type: str) -> Dict[str, Any]:
+    """Return a record with every spec field blank plus ``metadata_type``."""
+    # Boolean columns start as False; every other column starts as None.
+    blank: Dict[str, Any] = {s.field_name: (False if s.data_type == "boolean" else None) for s in specs}
+    blank["metadata_type"] = metadata_type
+    return blank
+
+
+def raw_key(value: Any) -> str:
+    """Stringify ``value`` unchanged; ``None`` becomes the empty string."""
+    # No trimming or unescaping happens here, unlike normalize_key_text.
+    return "" if value is None else str(value)
+
+
+def normalize_key_text(value: Any) -> str:
+    """Return ``value`` as HTML-unescaped, whitespace-trimmed text."""
+    text = "" if value is None else html.unescape(str(value))
+    return text.strip()
+
+
+def key_from_unique_id(unique_id: Any, metadata_type: str) -> str:
+    """Extract the key part of a ``"<metadata_type>:<key>"`` unique id.
+
+    Returns "" for a ``None``/empty id or one lacking the expected prefix.
+    """
+    marker = f"{metadata_type}:"
+    has_prefix = unique_id not in (None, "") and str(unique_id).startswith(marker)
+    return normalize_key_text(str(unique_id)[len(marker):]) if has_prefix else ""
+
+
+def target_key_for_row(row: Dict[str, Any], metadata_type: str) -> str:
+    """Return the row's key: ``doi``/``isbn13`` first, else from ``unique_id``."""
+    column = "doi" if metadata_type == "paper" else "isbn13"
+    direct = normalize_key_text(row.get(column))
+    # An empty direct key falls back to the "<metadata_type>:<key>" unique id.
+    return direct or key_from_unique_id(row.get("unique_id"), metadata_type)
+
+
+def normalize_lookup_key(key: Any, metadata_type: str) -> str:
+    """Normalise a key for dict lookups; paper keys are also lower-cased."""
+    if key is None or key == "":
+        return ""
+    return normalize_key_text(key).lower() if metadata_type == "paper" else normalize_key_text(key)
+
+
+def get_source_value(record: Dict[str, Any], source: str, source_kind: str = "") -> Any:
+    """Look up ``source`` in ``record``: literal key first, then as a dotted path."""
+    if source in record:
+        return record[source]
+    if "." not in source:
+        return None
+    node: Any = record
+    for part in source.split("."):
+        node = normalize_json_like(node)  # string levels may hold JSON text
+        if not isinstance(node, dict):
+            return None
+        node = node.get(part)
+    return node
+
+
+def apply_field_map(
+    output: Dict[str, Any],
+    record: Optional[Dict[str, Any]],
+    field_map: Dict[str, str],
+    *,
+    source_kind: str = "",
+    overwrite: bool = True,
+    fallback_only_fields: Optional[Set[str]] = None,
+) -> None:
+    """Copy mapped, non-None values from ``record`` into ``output`` in place.
+
+    With ``overwrite=False``, or for destinations in ``fallback_only_fields``,
+    a value is written only when the current output value is deep-empty.
+    """
+    if record is None:
+        return
+    protected = fallback_only_fields or set()
+    for src, dst in field_map.items():
+        value = get_source_value(record, src, source_kind)
+        guarded = not overwrite or dst in protected
+        if value is not None and (not guarded or is_deep_empty(output.get(dst))):
+            output[dst] = value
+
+
+def apply_xinghe_only_metadata_fallback(
+    output: Dict[str, Any],
+    record: Optional[Dict[str, Any]],
+    *,
+    metadata_type: str,
+    specs: Sequence[UnionFieldSpec],
+) -> None:
+    """Fill still-None output fields from a xinghe-only record, in place.
+
+    Tries the metadata source name (if a field ref), then the unified name.
+    """
+    if record is None:
+        return
+    attr = "paper_source" if metadata_type == "paper" else "ebook_source"
+    for spec in specs:
+        if output.get(spec.field_name) is not None:
+            continue
+        names = [n for n in (getattr(spec, attr),) if _is_field_ref(n)] + [spec.field_name]
+        found = (get_source_value(record, name, "xinghe") for name in names)
+        value = next((v for v in found if v is not None), None)
+        if value is not None:
+            output[spec.field_name] = value
+
+
+def normalize_journal_lookup_key(value: Any) -> str:
+    """Return a case-folded journal name with whitespace runs collapsed."""
+    words = [] if value is None else str(value).split()
+    return " ".join(words).casefold()
+
+
+def load_journal_name_mapping(
+    path: Path = DEFAULT_JOURNAL_MAPPING_CSV,
+) -> Tuple[Dict[str, str], Dict[str, str]]:
+    """Load ``(exact, normalized)`` journal-name maps; a missing file gives empty maps."""
+    exact_map: Dict[str, str] = {}
+    normalized_map: Dict[str, str] = {}
+    if path.exists():
+        with path.open("r", encoding="utf-8-sig", newline="") as f:
+            for row in csv.DictReader(f):
+                src = (row.get("source_journal_name") or "").strip()
+                dst = (row.get("target_journal_name") or "").strip()
+                if src and dst:
+                    # setdefault keeps the first row seen for a duplicate name.
+                    exact_map.setdefault(src, dst)
+                    folded = normalize_journal_lookup_key(src)
+                    if folded:
+                        normalized_map.setdefault(folded, dst)
+    return exact_map, normalized_map
+
+
+def lookup_journal_name_unified(value: Any) -> Any:
+    global JOURNAL_NAME_MAPPING_CACHE  # module-level cache of the two mapping dicts
+    if is_deep_empty(value):
+        return value  # nothing to map
+    if JOURNAL_NAME_MAPPING_CACHE is None:
+        JOURNAL_NAME_MAPPING_CACHE = load_journal_name_mapping()  # loaded on first use
+    exact, folded = JOURNAL_NAME_MAPPING_CACHE
+    name = " ".join(str(value).split())  # collapse whitespace runs before the lookup
+    return exact.get(name) or folded.get(normalize_journal_lookup_key(name)) or value
+
+
+def apply_derived_fields(output: Dict[str, Any]) -> None:
+    """Derive ``publication_venue_name_unified`` from the venue name when unset."""
+    target = "publication_venue_name_unified"
+    if is_deep_empty(output.get(target)):
+        output[target] = lookup_journal_name_unified(output.get("publication_venue_name"))
+
+
+def merge_one(
+    metadata_record: Optional[Dict[str, Any]],
+    xinghe_record: Optional[Dict[str, Any]],
+    *,
+    metadata_type: str,
+    specs: Sequence[UnionFieldSpec],
+    metadata_map: Dict[str, str],
+    xinghe_map: Dict[str, str],
+    fallback_key: Optional[Any] = None,
+) -> Dict[str, Any]:
+    """Build the expected unified record from a metadata/xinghe record pair.
+
+    Metadata fields are mapped first. A xinghe record then sets the full-text
+    flag and applies its own map (XINGHE_SUPPLEMENT_FIELDS only fill gaps).
+    ``unique_id`` is ``"<metadata_type>:<key>"`` or ``None`` without a key;
+    ``fallback_key`` is consulted only when both records are ``None``.
+    """
+    uid_field, xinghe_key = ("doi", "doi") if metadata_type == "paper" else ("isbn13", "isbn")
+    output = build_empty_output(specs, metadata_type)
+    apply_field_map(output, metadata_record, metadata_map, source_kind=metadata_type)
+    if xinghe_record is not None:
+        # A non-empty sha256 marks the record as having full text in xinghe.
+        has_file = xinghe_record.get("sha256") not in (None, "", [], {})
+        output["access_xinghe_repository_has_fulltext"] = has_file
+        apply_field_map(
+            output, xinghe_record, xinghe_map,
+            source_kind="xinghe", fallback_only_fields=XINGHE_SUPPLEMENT_FIELDS,
+        )
+        if metadata_record is None:
+            apply_xinghe_only_metadata_fallback(
+                output, xinghe_record, metadata_type=metadata_type, specs=specs
+            )
+    key_val = raw_key(output.get(uid_field))
+    if not key_val and metadata_record is None:
+        # No key from the merged fields: take the xinghe key, or the caller's fallback.
+        substitute = fallback_key if xinghe_record is None else xinghe_record.get(xinghe_key)
+        if raw_key(substitute):
+            key_val = raw_key(substitute)
+            output[uid_field] = substitute if xinghe_record is not None else key_val
+    output["unique_id"] = f"{metadata_type}:{key_val}" if key_val else None
+    apply_derived_fields(output)
+    return output
+
+
+def load_config(path: Path) -> Dict[str, Any]:
+    """Parse the JSON config at ``path``; raise FileNotFoundError if absent."""
+    if not path.exists():
+        raise FileNotFoundError(
+            f"Config file not found: {path}\n"
+            f"Copy the template and fill in credentials:\n"
+            f"  cp {TEMPLATE_CONFIG_PATH} {path}"
+        )
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def connect_starrocks(config_path: Path):
+    """Open a pymysql connection from the ``mysql`` section of the config file.
+
+    Raises RuntimeError when ``pymysql`` is ``None``.
+    """
+    if pymysql is None:
+        raise RuntimeError("pymysql is required. Install pymysql before running DB validation.")
+    mysql_cfg = load_config(config_path)["mysql"]
+    options = {"host": mysql_cfg["host"], "port": int(mysql_cfg["port"])}
+    options.update(user=mysql_cfg["user"], password=mysql_cfg["password"])
+    options["charset"] = mysql_cfg.get("charset", "utf8mb4")
+    options["connect_timeout"] = int(mysql_cfg.get("connect_timeout", 30))
+    options["read_timeout"] = int(mysql_cfg.get("read_timeout", 180))
+    return pymysql.connect(**options)
+
+
+def qualify_table_name(
+    table: str,
+    catalog: Optional[str],
+    database: str = "dws",
+) -> str:
+    """Resolve ``table`` to ``catalog.database.table`` for StarRocks Iceberg queries.
+
+    One-part names gain ``database`` (and ``catalog`` if given); two-part
+    names gain ``catalog`` if given. Names with three or more parts, or with
+    no non-blank part, are returned unchanged.
+    """
+    pieces = [piece.strip() for piece in table.split(".") if piece.strip()]
+    if len(pieces) == 1:
+        # Bare table name: prepend the default database.
+        pieces.insert(0, database)
+    elif len(pieces) != 2 or not catalog:
+        # Already fully qualified, blank, or no catalog to add.
+        return table
+    return ".".join(([catalog] if catalog else []) + pieces)
+
+
+def quote_identifier(identifier: str) -> str:
+    pieces = [p.strip() for p in identifier.split(".")]  # blank parts are dropped below
+    if not any(pieces):
+        raise ValueError(f"Invalid identifier: {identifier!r}")
+    return ".".join("`" + p.replace("`", "``") + "`" for p in pieces if p)  # `` escapes a backtick
+
+
+def fetch_records(conn: Any, sql: str, params: Sequence[Any] = ()) -> List[Dict[str, Any]]:
+    """Run ``sql`` and return all rows as dicts keyed by column name."""
+    with conn.cursor() as cur:
+        cur.execute(sql, params)
+        meta = cur.description  # None when the statement produced no result set
+        names = [] if meta is None else [col[0] for col in meta]
+        return [] if meta is None else [dict(zip(names, values)) for values in cur.fetchall()]
+
+
+def fetch_one(conn: Any, sql: str, params: Sequence[Any] = ()) -> Optional[Dict[str, Any]]:
+    # First row of the result, or None when the query returned nothing.
+    return next(iter(fetch_records(conn, sql, params)), None)
+
+
+def normalize_json_like(value: Any) -> Any:
+    """Decode JSON-looking text (leading ``[`` or ``{``); other values pass through."""
+    # Bytes are decoded as UTF-8 with replacement before inspection.
+    text = value.decode("utf-8", errors="replace") if isinstance(value, (bytes, bytearray)) else value
+    if not isinstance(text, str) or not text.strip().startswith(("[", "{")):
+        return text
+    try:
+        return json.loads(text.strip())
+    except json.JSONDecodeError:
+        # Not valid JSON after all: keep the (decoded) text unchanged.
+        return text
+
+
+def canonicalize(value: Any) -> Any:
+    """Recursively convert Decimal/date/dict/list values into plain comparable ones."""
+    value = normalize_json_like(value)
+    if isinstance(value, Decimal):
+        integral = value.is_finite() and value == value.to_integral_value()  # Infinity/NaN -> float
+        return int(value) if integral else float(value)
+    if isinstance(value, (date, datetime)):
+        return value.isoformat()
+    if isinstance(value, dict):
+        return {str(k): canonicalize(v) for k, v in sorted(value.items(), key=lambda kv: str(kv[0]))}
+    if isinstance(value, list):
+        return [canonicalize(v) for v in value]
+    return value
+
+
+def comparable_record(record: Dict[str, Any], fields: Iterable[str]) -> Dict[str, Any]:
+    return dict((name, canonicalize(record.get(name))) for name in fields)  # canonical view of the chosen fields
+
+
+HTML_UNESCAPE_COMPARE_FIELDS = {"unique_id", "doi", "isbn13"}  # string values of these fields are HTML-unescaped and stripped before comparison
+
+
+def normalize_author_for_compare(value: Any) -> Any:
+    """Reduce an author value to a list of whitespace-collapsed names.
+
+    Accepts a string, a dict with a ``name`` key, or a list of either (JSON
+    text is decoded first). Lists are de-duplicated and sorted. Returns
+    ``None`` when no name remains; other value types pass through unchanged.
+    """
+    def clean(raw: Any) -> str:
+        return " ".join(str(raw).split())
+
+    value = normalize_json_like(value)
+    if value is None:
+        return None
+    if isinstance(value, str):
+        text = clean(value)
+        return None if text in ("", "[]", "{}") else [text]
+    if isinstance(value, dict):
+        name = value.get("name")
+        return [clean(name)] if name is not None and clean(name) else None
+    if not isinstance(value, list):
+        return value
+    names: List[str] = []
+    for item in map(normalize_json_like, value):
+        name = item.get("name") if isinstance(item, dict) else item
+        if name is not None and clean(name):
+            names.append(clean(name))
+    # dict.fromkeys drops duplicates; sorting makes the result order-independent.
+    return sorted(dict.fromkeys(names)) or None
+
+
+def normalize_license_value(value: Any) -> str:
+    """Map a license string or URL to its canonical identifier.
+
+    Lookup order: DEFAULT_LICENSE_MAP by exact text, by text without trailing
+    slashes, then by upper-cased alphanumerics only; LICENSE_ALLOWED by
+    lower-cased text; CC_LICENSE_URL_RULES by regex search. Unmatched text is
+    returned lower-cased; ``None``/blank gives "".
+    """
+    text = "" if value is None else str(value).strip()
+    if not text:
+        return ""
+    compact = re.sub(r"[^A-Za-z0-9]", "", text).upper()
+    for candidate in (text, text.rstrip("/"), compact):
+        if candidate in DEFAULT_LICENSE_MAP:
+            return DEFAULT_LICENSE_MAP[candidate]
+    lowered = text.lower()
+    if lowered not in LICENSE_ALLOWED:
+        # The first matching URL rule wins; otherwise fall back to the lower-cased text.
+        hits = (canonical for pattern, canonical in CC_LICENSE_URL_RULES if pattern.search(text))
+        return next(hits, lowered)
+    return lowered
+
+
+def normalize_locations_for_compare(value: Any) -> Any:
+    """Normalise a ``locations`` value for comparison.
+
+    List items that are dicts are canonicalised (sorted keys, normalised
+    ``license``, lower-cased ``is_oa``); other items are dropped. Empty
+    results and blank/"[]" text become ``None``.
+    """
+    value = normalize_json_like(value)
+    if isinstance(value, str):
+        return None if value.strip() in ("", "[]") else value
+    if value is None or not isinstance(value, list):
+        return value
+    result: List[Dict[str, Any]] = []
+    for entry in map(normalize_json_like, value):
+        if isinstance(entry, dict):
+            loc = {str(k): canonicalize(v) for k, v in entry.items()}
+            if "license" in loc:
+                loc["license"] = normalize_license_value(loc["license"])
+            if loc.get("is_oa") is not None:
+                loc["is_oa"] = str(loc["is_oa"]).lower()
+            result.append({k: loc[k] for k in sorted(loc)})
+    return result or None
+
+
+def normalize_empty_for_compare(value: Any, data_type: str, field: str = "") -> Any:
+    """Collapse the various "empty" encodings of a field value to ``None``.
+
+    ``author``, ``access_license`` and ``locations`` use dedicated
+    normalisers. Otherwise blank strings (string types) and deep-empty
+    values (lists, array/list/struct/map types) become ``None``; fields in
+    HTML_UNESCAPE_COMPARE_FIELDS are HTML-unescaped and trimmed first.
+    """
+    if value is None:
+        return None
+    if field == "author":
+        return normalize_author_for_compare(value)
+    if field == "access_license":
+        return normalize_license_value(value) or None
+    if field == "locations":
+        return normalize_locations_for_compare(value)
+    kind = (data_type or "").strip().lower()
+    if field in HTML_UNESCAPE_COMPARE_FIELDS and isinstance(value, str):
+        value = html.unescape(value).strip()
+    if isinstance(value, list) and is_deep_empty(value):
+        return None
+    if kind in ("string", "varchar", "char", "text"):
+        return None if isinstance(value, str) and not value.strip() else value
+    container = kind.startswith(("array", "list", "struct", "map"))
+    return None if container and is_deep_empty(value) else value
+
+
+def is_deep_empty(value: Any) -> bool:
+    """Return True when ``value`` holds no data at any nesting level.
+
+    ``None``, blank/"[]"/"{}" text, and dicts or lists whose members are all
+    deep-empty count as empty (JSON text is decoded first).
+    """
+    value = normalize_json_like(value)
+    if isinstance(value, dict):
+        value = list(value.values())
+    if isinstance(value, list):
+        return all(map(is_deep_empty, value))
+    return value is None or (isinstance(value, str) and value.strip() in ("", "[]", "{}"))
+
+
+def compare_records(
+    expected: Dict[str, Any],
+    actual: Dict[str, Any],
+    field_types: Optional[Dict[str, str]] = None,
+) -> Dict[str, Dict[str, Any]]:
+    """Return ``{field: {"expected": ..., "actual": ...}}`` for differing fields.
+
+    Iterates ``expected`` only; both sides are empty-normalised first.
+    """
+    types = field_types or {}
+    diffs: Dict[str, Dict[str, Any]] = {}
+    for name in expected:
+        want = normalize_empty_for_compare(expected[name], types.get(name, ""), name)
+        got = normalize_empty_for_compare(actual.get(name), types.get(name, ""), name)
+        if want != got:
+            diffs[name] = {"expected": want, "actual": got}
+    return diffs
+
+
+def _dt_clause(dt: Optional[str], params: List[Any], alias: Optional[str] = None) -> str:
+    """Return an ``AND dt = %s`` fragment and append ``dt`` to ``params``; "" for None."""
+    if dt is None:
+        return ""
+    params.append(dt)
+    return " AND " + (quote_identifier(alias) + "." if alias else "") + "`dt` = %s"
+
+
+def _limit_clause(limit: Optional[int]) -> str:
+    return f" LIMIT {int(limit)}" if limit is not None else ""  # int() keeps the SQL fragment numeric
+
+
+def split_limit(limit: Optional[int], parts: int) -> List[Optional[int]]:
+    """Split ``limit`` into ``parts`` near-equal shares; ``None`` stays ``None``."""
+    if limit is None:
+        return [None] * parts
+    share, extra = divmod(max(0, int(limit)), parts)
+    return [share + (index < extra) for index in range(parts)]
+
+
+def show_columns(conn: Any, table: str) -> List[str]:
+    """List the column names of ``table`` in ``SHOW COLUMNS`` row order."""
+    rows = fetch_records(conn, f"SHOW COLUMNS FROM {quote_identifier(table)}")
+    return [
+        str(row.get("Field") or row.get("field") or next(iter(row.values())))
+        for row in rows
+    ]
+
+
+def show_column_types(conn: Any, table: str) -> Dict[str, str]:
+    """Map each column name of ``table`` to its ``SHOW COLUMNS`` type text."""
+    rows = fetch_records(conn, f"SHOW COLUMNS FROM {quote_identifier(table)}")
+    return {
+        str(row.get("Field") or row.get("field") or next(iter(row.values()))):
+            str(row.get("Type") or row.get("type") or "")
+        for row in rows
+    }
+
+
+def validate_schema(
+    conn: Any,
+    *,
+    target_table: str,
+    specs: Sequence[UnionFieldSpec],
+) -> Dict[str, Any]:
+    """Diff the target table's columns against the spec field names.
+
+    Columns listed in IGNORED_TARGET_EXTRA_FIELDS are never reported as extra.
+    """
+    expected_fields = [spec.field_name for spec in specs]
+    actual_fields = show_columns(conn, target_table)
+    wanted, present = set(expected_fields), set(actual_fields)
+    ignored = IGNORED_TARGET_EXTRA_FIELDS
+    return {
+        "missing_fields": [name for name in expected_fields if name not in present],
+        "extra_fields": [name for name in actual_fields if name not in wanted and name not in ignored],
+        "expected_count": len(expected_fields),
+        "actual_count": len(actual_fields),
+    }
+
+
+def count_table(conn: Any, table: str, dt: Optional[str]) -> int:
+    """Count rows of ``table``, restricted to rows with the given ``dt`` if any."""
+    params: List[Any] = []
+    row = fetch_one(conn, f"SELECT COUNT(*) AS cnt FROM {quote_identifier(table)} WHERE 1=1{_dt_clause(dt, params)}", params)
+    return int(row["cnt"]) if row else 0
+
+
+def count_xinghe_only_distinct_key(
+    conn: Any,
+    *,
+    xinghe_table: str,
+    metadata_table: str,
+    xinghe_key_field: str,
+    metadata_key_field: str,
+    metadata_dt: Optional[str],
+) -> int:
+    """Count distinct non-empty xinghe keys with no matching metadata row."""
+    # The LEFT JOIN leaves the metadata key NULL for xinghe rows without a match.
+    params: List[Any] = [] if metadata_dt is None else [metadata_dt]
+    dt_join = "" if metadata_dt is None else "AND m.`dt` = %s"
+    x_key = f"x.`{xinghe_key_field}`"
+    m_key = f"m.`{metadata_key_field}`"
+    sql = (
+        f"SELECT COUNT(DISTINCT {x_key}) AS cnt "
+        f"FROM {quote_identifier(xinghe_table)} x "
+        f"LEFT JOIN {quote_identifier(metadata_table)} m "
+        f"ON {m_key} = {x_key} {dt_join} "
+        f"WHERE {x_key} IS NOT NULL AND {x_key} != '' "
+        f"AND {m_key} IS NULL"
+    )
+    row = fetch_one(conn, sql, params)
+    return int(row["cnt"]) if row else 0
+
+
+def source_coverage_counts(
+    conn: Any,
+    *,
+    paper_table: str,
+    ebook_table: str,
+    xinghe_table: str,
+    target_table: str,
+    target_dt: Optional[str],
+    paper_dt: Optional[str],
+    ebook_dt: Optional[str],
+) -> Dict[str, Any]:
+    """Compute source/target row counts and the expected target row count.
+
+    Expected = paper rows + ebook rows
+    + distinct xinghe ``doi`` values with no paper row
+    + distinct xinghe ``isbn`` values with no ebook row (matched on ``isbn13``).
+    ``target_count_diff`` is the actual target count minus the expected one.
+    """
+    paper_source = count_table(conn, paper_table, paper_dt)
+    ebook_source = count_table(conn, ebook_table, ebook_dt)
+    target = count_table(conn, target_table, target_dt)
+    # Keys found only in xinghe each add one row to the expected target count.
+    xinghe_only = {
+        kind: count_xinghe_only_distinct_key(
+            conn,
+            xinghe_table=xinghe_table,
+            metadata_table=table,
+            xinghe_key_field=x_key,
+            metadata_key_field=m_key,
+            metadata_dt=dt,
+        )
+        for kind, table, x_key, m_key, dt in (
+            ("paper", paper_table, "doi", "doi", paper_dt),
+            ("ebook", ebook_table, "isbn", "isbn13", ebook_dt),
+        )
+    }
+    expected_target_count = paper_source + ebook_source + sum(xinghe_only.values())
+    return {
+        "paper_source": paper_source,
+        "ebook_source": ebook_source,
+        "xinghe_only_paper_count": xinghe_only["paper"],
+        "xinghe_only_ebook_count": xinghe_only["ebook"],
+        "expected_target_count": expected_target_count,
+        "actual_target_count": target,
+        "target_count_diff": target - expected_target_count,
+    }
+
+
+def count_xinghe_only_missing_target(
+    conn: Any,
+    *,
+    xinghe_table: str,
+    metadata_table: str,
+    target_table: str,
+    metadata_type: str,
+    xinghe_key_field: str,
+    metadata_key_field: str,
+    target_dt: Optional[str],
+    metadata_dt: Optional[str],
+) -> int:
+    """Count xinghe rows whose key is in neither the metadata nor the target table."""
+    # Placeholder order follows the SQL text: metadata join first, then target join.
+    params: List[Any] = [value for value in (metadata_dt, target_dt) if value is not None]
+    metadata_dt_join = "" if metadata_dt is None else "AND m.`dt` = %s"
+    target_dt_join = "" if target_dt is None else "AND t.`dt` = %s"
+    x_key = f"x.`{xinghe_key_field}`"
+    m_key = f"m.`{metadata_key_field}`"
+    sql = (
+        "SELECT COUNT(*) AS cnt "
+        f"FROM {quote_identifier(xinghe_table)} x "
+        f"LEFT JOIN {quote_identifier(metadata_table)} m "
+        f"ON {m_key} = {x_key} {metadata_dt_join} "
+        f"LEFT JOIN {quote_identifier(target_table)} t "
+        f"ON t.`unique_id` = CONCAT('{metadata_type}:', {x_key}) {target_dt_join} "
+        f"WHERE {x_key} IS NOT NULL AND {x_key} != '' "
+        f"AND {m_key} IS NULL AND t.`unique_id` IS NULL"
+    )
+    row = fetch_one(conn, sql, params)
+    return int(row["cnt"]) if row else 0
+
+
+def skipped_coverage_counts(reason: str) -> Dict[str, Any]:
+    return dict(skipped=True, reason=reason)  # result placeholder carrying the skip reason
+
+
+def failed_coverage_counts(exc: Exception) -> Dict[str, Any]:
+    """Describe a failed coverage count as a result dict (also marked skipped)."""
+    # Only the exception's class name and message are kept.
+    report: Dict[str, Any] = {"skipped": True, "status": "failed"}
+    report["reason"] = "coverage_count_failed"
+    report["error_type"] = type(exc).__name__
+    report["error"] = str(exc)
+    return report
+
+
+def build_target_sample_query(
+    target_table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    metadata_type: Optional[str] = None,
+    sample_mode: str = "natural",
+) -> Tuple[str, List[Any]]:
+    """Build the SELECT (and its parameters) that samples rows from the target table."""
+    params: List[Any] = []
+    clauses = ["`unique_id` IS NOT NULL", "`metadata_type` IN ('paper', 'ebook')"]
+    if metadata_type is not None:
+        clauses.append("`metadata_type` = %s")
+        params.append(metadata_type)
+    # The type placeholder precedes the dt placeholder, matching the SQL text.
+    where = " AND ".join(clauses) + _dt_clause(dt, params)
+    if sample_mode == "hash":
+        # Keep only rows whose CRC32(unique_id) is a multiple of 100.
+        where += " AND MOD(CRC32(`unique_id`), 100) = 0"
+    sql = f"SELECT * FROM {quote_identifier(target_table)} WHERE {where}{_limit_clause(limit)}"
+    return sql, params
+
+
+def fetch_target_samples(
+    conn: Any,
+    *,
+    target_table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+    sample_mode: str = "natural",
+) -> List[Dict[str, Any]]:
+    """Fetch target rows; a ``limit`` is split between paper and ebook rows."""
+    if limit is None:
+        # NOTE(review): sample_mode is not forwarded on this path - confirm intended.
+        return fetch_records(conn, *build_target_sample_query(target_table, dt, None))
+    rows: List[Dict[str, Any]] = []
+    for metadata_type, share in zip(("paper", "ebook"), split_limit(limit, 2)):
+        # A zero share is skipped, so no query is issued for that type.
+        if share:
+            query = build_target_sample_query(target_table, dt, share, metadata_type, sample_mode)
+            rows.extend(fetch_records(conn, *query))
+    return rows
+
+
+def build_missing_target_sample_query(
+    source_table: str,
+    target_table: str,
+    *,
+    metadata_type: str,
+    key_field: str,
+    source_dt: Optional[str],
+    target_dt: Optional[str],
+    limit: Optional[int],
+) -> Tuple[str, List[Any]]:
+    """Build a query listing source keys that have no row in the target table."""
+    key = f"s.`{key_field}`"
+    # Without target_dt, target rows are matched on the source row's own dt.
+    params: List[Any] = [] if target_dt is None else [target_dt]
+    target_dt_join = "AND t.`dt` = s.`dt`" if target_dt is None else "AND t.`dt` = %s"
+    # Appended after target_dt so the placeholders follow the SQL text order.
+    source_filter = _dt_clause(source_dt, params, "s")
+    sql = (
+        f"SELECT {key} AS sample_key, s.`dt` AS dt "
+        f"FROM {quote_identifier(source_table)} s "
+        f"LEFT JOIN {quote_identifier(target_table)} t "
+        f"ON t.`unique_id` = CONCAT('{metadata_type}:', {key}) {target_dt_join} "
+        f"WHERE {key} IS NOT NULL AND {key} != ''{source_filter} AND t.`unique_id` IS NULL "
+        f"ORDER BY {key}{_limit_clause(limit)}"
+    )
+    return sql, params
+
+
+def build_xinghe_missing_target_sample_query(
+    xinghe_table: str,
+    target_table: str,
+    *,
+    metadata_type: str,
+    xinghe_key_field: str,
+    dt: Optional[str],
+    limit: Optional[int],
+) -> Tuple[str, List[Any]]:
+    """Build a query listing xinghe rows whose key has no row in the target table."""
+    # ``dt`` restricts the target side of the join only.
+    key = f"x.`{xinghe_key_field}`"
+    params: List[Any] = [] if dt is None else [dt]
+    target_dt_join = "" if dt is None else "AND t.`dt` = %s"
+    # Rows are ordered by key, then sha256, then origin_path.
+    sql = (
+        f"SELECT {key} AS sample_key, "
+        "x.`data_date` AS data_date, x.`sha256` AS sha256, x.`origin_path` AS origin_path "
+        f"FROM {quote_identifier(xinghe_table)} x "
+        f"LEFT JOIN {quote_identifier(target_table)} t "
+        f"ON t.`unique_id` = CONCAT('{metadata_type}:', {key}) {target_dt_join} "
+        f"WHERE {key} IS NOT NULL AND {key} != '' AND t.`unique_id` IS NULL "
+        f"ORDER BY {key}, x.`sha256`, x.`origin_path`{_limit_clause(limit)}"
+    )
+    return sql, params
+
+
+def build_xinghe_only_missing_target_sample_query(
+    xinghe_table: str,
+    metadata_table: str,
+    target_table: str,
+    *,
+    metadata_type: str,
+    xinghe_key_field: str,
+    metadata_key_field: str,
+    metadata_dt: Optional[str],
+    target_dt: Optional[str],
+    limit: Optional[int],
+) -> Tuple[str, List[Any]]:
+    """Build a query listing xinghe rows absent from both metadata and target tables."""
+    x_key = f"x.`{xinghe_key_field}`"
+    m_key = f"m.`{metadata_key_field}`"
+    # Placeholder order follows the SQL text: metadata join, then target join.
+    params: List[Any] = [value for value in (metadata_dt, target_dt) if value is not None]
+    metadata_dt_join = "" if metadata_dt is None else "AND m.`dt` = %s"
+    target_dt_join = "" if target_dt is None else "AND t.`dt` = %s"
+    sql = (
+        f"SELECT {x_key} AS sample_key, "
+        "x.`data_date` AS data_date, x.`sha256` AS sha256, x.`origin_path` AS origin_path "
+        f"FROM {quote_identifier(xinghe_table)} x "
+        f"LEFT JOIN {quote_identifier(metadata_table)} m "
+        f"ON {m_key} = {x_key} {metadata_dt_join} "
+        f"LEFT JOIN {quote_identifier(target_table)} t "
+        f"ON t.`unique_id` = CONCAT('{metadata_type}:', {x_key}) {target_dt_join} "
+        f"WHERE {x_key} IS NOT NULL AND {x_key} != '' "
+        f"AND {m_key} IS NULL AND t.`unique_id` IS NULL "
+        f"ORDER BY {x_key}, x.`sha256`, x.`origin_path`{_limit_clause(limit)}"
+    )
+    return sql, params
+
+
+def fetch_metadata_record(
+    conn: Any,
+    *,
+    table: str,
+    metadata_type: str,
+    key: Any,
+    dt: Optional[str],
+) -> Optional[Dict[str, Any]]:
+    """Fetch one metadata row by key; paper DOIs are matched case-insensitively."""
+    if metadata_type == "paper":
+        key_field, predicate, params = "doi", "LOWER(`doi`) = %s", [str(key).lower()]
+    else:
+        key_field, predicate, params = "isbn13", "`isbn13` = %s", [key]
+    sql = f"SELECT * FROM {quote_identifier(table)} WHERE {predicate}"
+    sql += f"{_dt_clause(dt, params)} ORDER BY `{key_field}` LIMIT 2"
+    # At most two rows are requested; only the first is returned.
+    return next(iter(fetch_records(conn, sql, params)), None)
+
+
+def chunked(values: Sequence[Any], size: int) -> Iterable[Sequence[Any]]:
+    # Yield consecutive slices of at most ``size`` items; the last may be shorter.
+    yield from (values[start : start + size] for start in range(0, len(values), size))
+
+
+def fetch_metadata_records_batch(
+    conn: Any,
+    *,
+    table: str,
+    metadata_type: str,
+    keys: Sequence[Any],
+    dt: Optional[str],
+    batch_size: int = 500,
+) -> Dict[str, Dict[str, Any]]:
+    """Fetch metadata rows for many keys, keyed by normalised lookup key.
+
+    Keys are de-duplicated (paper keys lower-cased) and queried in sorted
+    batches of ``batch_size``; the first row returned for a key is kept and
+    rows with an empty key column are ignored.
+    """
+    is_paper = metadata_type == "paper"
+    key_field = "doi" if is_paper else "isbn13"
+    column = f"LOWER(`{key_field}`)" if is_paper else f"`{key_field}`"
+    wanted = sorted({
+        str(key).lower() if is_paper else str(key)
+        for key in keys
+        if key not in (None, "")
+    })
+    result: Dict[str, Dict[str, Any]] = {}
+    for batch in chunked(wanted, batch_size):
+        params: List[Any] = list(batch)
+        placeholders = ",".join(["%s"] * len(batch))
+        sql = (
+            f"SELECT * FROM {quote_identifier(table)} WHERE {column} IN ({placeholders})"
+            f"{_dt_clause(dt, params)} ORDER BY `{key_field}`"
+        )
+        for row in fetch_records(conn, sql, params):
+            if row.get(key_field) not in (None, ""):
+                result.setdefault(normalize_lookup_key(row[key_field], metadata_type), row)
+    return result
+
+
+def embedded_key_like_patterns(key: Any) -> List[str]:
+    """Return ``%...%`` LIKE patterns for a key containing ``<`` or ``>``, else []."""
+    text = normalize_key_text(key).lower()
+    # Empty keys and keys without angle brackets get no patterns.
+    if not any(mark in text for mark in "<>"):
+        return []
+    # Cover both the plain and the HTML-escaped spelling of the key.
+    return [f"%{form}%" for form in sorted({text, html.escape(text, quote=False).lower()})]
+
+
+def fetch_paper_metadata_records_by_embedded_key(
+    conn: Any,
+    *,
+    table: str,
+    key: Any,
+    dt: Optional[str],
+    limit: int = 20,
+) -> List[Dict[str, Any]]:
+    """Find paper rows whose ``doi`` contains ``key`` (LIKE search, capped by ``limit``)."""
+    patterns = embedded_key_like_patterns(key)
+    # No patterns means the key does not qualify for an embedded search.
+    if not patterns:
+        return []
+    params: List[Any] = [*patterns]
+    # One LIKE test against the lower-cased doi per pattern, OR-ed together.
+    likes = " OR ".join("LOWER(`doi`) LIKE %s" for _ in patterns)
+    sql = f"SELECT * FROM {quote_identifier(table)} WHERE ({likes})"
+    sql += f"{_dt_clause(dt, params)} ORDER BY `doi` LIMIT {int(limit)}"
+    return fetch_records(conn, sql, params)
+
+
+def score_metadata_candidate(
+    target_row: Dict[str, Any],
+    candidate: Dict[str, Any],
+    *,
+    specs: Sequence[UnionFieldSpec],
+    metadata_type: str,
+) -> int:
+    """Count mapped fields on which ``candidate`` agrees with ``target_row``.
+
+    Fields without a source field reference, or whose target value
+    normalises to ``None``, are not counted.
+    """
+    attr = "paper_source" if metadata_type == "paper" else "ebook_source"
+    def normalized(raw: Any, spec: UnionFieldSpec) -> Any:
+        return normalize_empty_for_compare(canonicalize(raw), spec.data_type, spec.field_name)
+
+    score = 0
+    for spec in specs:
+        source = getattr(spec, attr)
+        if not _is_field_ref(source):
+            continue
+        actual_value = normalized(target_row.get(spec.field_name), spec)
+        if actual_value is None:
+            continue
+        if normalized(get_source_value(candidate, source, metadata_type), spec) == actual_value:
+            score += 1
+    return score
+
+
+def choose_metadata_record_for_target(
+    target_row: Dict[str, Any],
+    candidates: Sequence[Dict[str, Any]],
+    *,
+    specs: Sequence[UnionFieldSpec],
+    metadata_type: str,
+) -> Optional[Dict[str, Any]]:
+    """Pick the candidate that best matches ``target_row``.
+
+    Candidates are scored with ``score_metadata_candidate``; the earliest
+    candidate with the highest score wins. When several candidates all score
+    zero none is chosen; a sole candidate is returned even with a zero score.
+    """
+    if not candidates:
+        return None
+    scores = [
+        score_metadata_candidate(target_row, candidate, specs=specs, metadata_type=metadata_type)
+        for candidate in candidates
+    ]
+    # max() returns the first maximal index, matching a stable descending sort.
+    best = max(range(len(scores)), key=scores.__getitem__)
+    top_score = scores[best]
+    if top_score > 0 or len(candidates) == 1:
+        return candidates[best]
+    return None
+
+
+def fetch_xinghe_records(
+    conn: Any,
+    *,
+    table: str,
+    metadata_type: str,
+    key: Any,
+    dt: Optional[str],
+    limit: int = 100,
+) -> List[Dict[str, Any]]:
+    """Fetch xinghe rows for one key, ordered by ``sha256`` then ``origin_path``.
+
+    Papers are matched case-insensitively on ``doi``; any other type is
+    matched exactly on ``isbn``. ``dt`` is accepted but not used in the query.
+    """
+    if metadata_type == "paper":
+        key_field = "doi"
+        bind_values: List[Any] = [str(key).lower()]
+        predicate = f"LOWER(`{key_field}`) = %s"
+    else:
+        key_field = "isbn"
+        bind_values = [key]
+        predicate = f"`{key_field}` = %s"
+    query = (
+        f"SELECT * FROM {quote_identifier(table)} WHERE {predicate}"
+        f" ORDER BY `sha256`, `origin_path` LIMIT {int(limit)}"
+    )
+    return fetch_records(conn, query, bind_values)
+
+
+def fetch_xinghe_records_batch(
+    conn: Any,
+    *,
+    table: str,
+    metadata_type: str,
+    keys: Sequence[Any],
+    batch_size: int = 500,
+) -> Dict[str, List[Dict[str, Any]]]:
+    """Fetch xinghe rows for many keys and group them by normalized lookup key.
+
+    ``None`` and empty keys are dropped, the rest are stringified (lower-cased
+    for papers), de-duplicated, sorted and queried in ``IN (...)`` batches.
+    Returned rows with an empty key column are ignored.
+    """
+    is_paper = metadata_type == "paper"
+    key_field = "doi" if is_paper else "isbn"
+    distinct_keys = set()
+    for key in keys:
+        if key in (None, ""):
+            continue
+        key_text = str(key)
+        distinct_keys.add(key_text.lower() if is_paper else key_text)
+
+    grouped: Dict[str, List[Dict[str, Any]]] = {}
+    for batch in chunked(sorted(distinct_keys), batch_size):
+        bind_values: List[Any] = list(batch)
+        placeholders = ",".join(["%s"] * len(batch))
+        column_sql = f"LOWER(`{key_field}`)" if is_paper else f"`{key_field}`"
+        query = (
+            f"SELECT * FROM {quote_identifier(table)} "
+            f"WHERE {column_sql} IN ({placeholders})"
+            " ORDER BY `sha256`, `origin_path`"
+        )
+        for record in fetch_records(conn, query, bind_values):
+            record_key = record.get(key_field)
+            if record_key in (None, ""):
+                continue
+            lookup_key = normalize_lookup_key(record_key, metadata_type)
+            grouped.setdefault(lookup_key, []).append(record)
+    return grouped
+
+
+def fetch_xinghe_records_by_sha_batch(
+    conn: Any,
+    *,
+    table: str,
+    sha_values: Sequence[Any],
+    batch_size: int = 500,
+) -> Dict[str, List[Dict[str, Any]]]:
+    """Fetch xinghe rows by ``sha256`` in batches and group them by that value.
+
+    ``None`` and empty inputs are dropped; the rest are stringified,
+    de-duplicated and sorted before being queried in ``IN (...)`` batches.
+    """
+    distinct_shas = {str(value) for value in sha_values if value not in (None, "")}
+    grouped: Dict[str, List[Dict[str, Any]]] = {}
+    for batch in chunked(sorted(distinct_shas), batch_size):
+        bind_values: List[Any] = list(batch)
+        placeholders = ",".join(["%s"] * len(batch))
+        query = (
+            f"SELECT * FROM {quote_identifier(table)} "
+            f"WHERE `sha256` IN ({placeholders}) "
+            "ORDER BY `sha256`, `origin_path`"
+        )
+        for record in fetch_records(conn, query, bind_values):
+            record_sha = record.get("sha256")
+            if record_sha in (None, ""):
+                continue
+            grouped.setdefault(str(record_sha), []).append(record)
+    return grouped
+
+
+def fetch_paper_xinghe_records_by_embedded_key(
+    conn: Any,
+    *,
+    table: str,
+    key: Any,
+    limit: int = 100,
+) -> List[Dict[str, Any]]:
+    """Fetch xinghe rows whose lower-cased ``doi`` matches a LIKE pattern of ``key``.
+
+    Returns an empty list without querying when ``embedded_key_like_patterns``
+    produces no pattern. Rows are ordered by ``sha256`` then ``origin_path``
+    and capped at ``limit``.
+    """
+    like_patterns = embedded_key_like_patterns(key)
+    if not like_patterns:
+        return []
+    bind_values: List[Any] = [*like_patterns]
+    doi_filter = " OR ".join("LOWER(`doi`) LIKE %s" for _ in like_patterns)
+    query = (
+        f"SELECT * FROM {quote_identifier(table)} "
+        f"WHERE ({doi_filter}) "
+        f"ORDER BY `sha256`, `origin_path` LIMIT {int(limit)}"
+    )
+    return fetch_records(conn, query, bind_values)
+
+
+def fetch_xinghe_records_by_target_repository_fields(
+    conn: Any,
+    *,
+    table: str,
+    target_row: Dict[str, Any],
+    limit: int = 20,
+) -> List[Dict[str, Any]]:
+    """Fetch xinghe rows whose ``sha256`` equals the target row's repository sha256.
+
+    The value is read from ``access_xinghe_repository_sha256``; when it is
+    ``None`` or empty no query is issued and an empty list is returned.
+    """
+    repository_sha = target_row.get("access_xinghe_repository_sha256")
+    if repository_sha in (None, ""):
+        return []
+    table_sql = quote_identifier(table)
+    row_cap = int(limit)
+    query = (
+        f"SELECT * FROM {table_sql} "
+        "WHERE `sha256` = %s "
+        f"ORDER BY `sha256`, `origin_path` LIMIT {row_cap}"
+    )
+    return fetch_records(conn, query, [repository_sha])
+
+
+# (xinghe column, target-row column) pairs that choose_xinghe_record_for_target
+# compares, in this order, to pick the xinghe row belonging to a target row.
+XINGHE_TARGET_MATCH_FIELDS = (
+    ("sha256", "access_xinghe_repository_sha256"),
+    ("origin_path", "access_xinghe_repository_origin_path"),
+    ("processed_path", "access_xinghe_repository_processed_path"),
+    ("origin_url", "access_xinghe_repository_origin_url"),
+)
+# Module-level cache slot holding a pair of str->str dicts once populated;
+# starts as None. NOTE(review): presumably filled lazily by the journal-name
+# mapping loader defined elsewhere in this module -- confirm.
+JOURNAL_NAME_MAPPING_CACHE: Optional[Tuple[Dict[str, str], Dict[str, str]]] = None
+
+
+def choose_xinghe_record_for_target(
+    target_row: Dict[str, Any],
+    xinghe_rows: Sequence[Dict[str, Any]],
+) -> Optional[Dict[str, Any]]:
+    """Select the xinghe row that corresponds to ``target_row``.
+
+    The field pairs in ``XINGHE_TARGET_MATCH_FIELDS`` are tried in order; the
+    first row whose stripped string value equals the target's stripped value
+    wins. Without any match, a lone candidate is returned as-is and several
+    candidates yield ``None``.
+    """
+    if not xinghe_rows:
+        return None
+
+    def _matches(candidate: Dict[str, Any], source_field: str, wanted: str) -> bool:
+        found = candidate.get(source_field)
+        return found not in (None, "") and str(found).strip() == wanted
+
+    for source_field, target_field in XINGHE_TARGET_MATCH_FIELDS:
+        wanted_raw = target_row.get(target_field)
+        if wanted_raw in (None, ""):
+            continue
+        wanted = str(wanted_raw).strip()
+        for candidate in xinghe_rows:
+            if _matches(candidate, source_field, wanted):
+                return candidate
+
+    return xinghe_rows[0] if len(xinghe_rows) == 1 else None
+
+
+def expected_for_target_row(
+    conn: Any,
+    *,
+    row: Dict[str, Any],
+    specs: Sequence[UnionFieldSpec],
+    paper_table: str,
+    ebook_table: str,
+    xinghe_table: str,
+    target_dt: Optional[str],
+    paper_dt: Optional[str],
+    ebook_dt: Optional[str],
+) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any]]:
+    """Rebuild the expected target record for ``row`` by querying its sources.
+
+    Returns ``(expected, warnings)``. ``expected`` is ``None`` (with a reason
+    dict in place of warnings) when the metadata type is not paper/ebook or
+    the row has no target key. Otherwise the metadata record and the chosen
+    xinghe record are merged with ``merge_one`` and stamped with the row's dt.
+    """
+    metadata_type = row.get("metadata_type")
+    if metadata_type not in ("paper", "ebook"):
+        return None, {"reason": "unsupported_metadata_type", "metadata_type": metadata_type}
+    is_paper = metadata_type == "paper"
+
+    key = target_key_for_row(row, metadata_type)
+    if not key:
+        return None, {
+            "reason": "missing_target_key",
+            "key_field": "doi" if is_paper else "isbn13",
+        }
+
+    if is_paper:
+        source_table, source_dt = paper_table, paper_dt
+    else:
+        source_table, source_dt = ebook_table, ebook_dt
+    metadata_map, xinghe_map = build_field_maps(specs, metadata_type)
+    # An explicit target_dt wins over the dt stored on the row.
+    row_dt = row.get("dt") if target_dt is None else target_dt
+    # Without a source-specific dt, the source is read at the row's dt.
+    if source_dt is None:
+        source_dt = row_dt
+
+    metadata_record = fetch_metadata_record(
+        conn,
+        table=source_table,
+        metadata_type=metadata_type,
+        key=key,
+        dt=source_dt,
+    )
+    xinghe_rows = fetch_xinghe_records(
+        conn,
+        table=xinghe_table,
+        metadata_type=metadata_type,
+        key=key,
+        dt=row_dt,
+    )
+    xinghe_record = choose_xinghe_record_for_target(row, xinghe_rows)
+
+    warnings: Dict[str, Any] = {}
+    candidate_count = len(xinghe_rows)
+    if candidate_count > 1:
+        warnings["xinghe_duplicate_candidates"] = candidate_count
+        if xinghe_record is None:
+            warnings["xinghe_match"] = "ambiguous_no_repository_field_match"
+
+    expected = merge_one(
+        metadata_record,
+        xinghe_record,
+        metadata_type=metadata_type,
+        specs=specs,
+        metadata_map=metadata_map,
+        xinghe_map=xinghe_map,
+        fallback_key=key,
+    )
+    if row_dt is not None:
+        expected["dt"] = row_dt
+    return expected, warnings
+
+
+def expected_for_target_row_from_sources(
+    *,
+    row: Dict[str, Any],
+    specs: Sequence[UnionFieldSpec],
+    metadata_record: Optional[Dict[str, Any]],
+    xinghe_rows: Sequence[Dict[str, Any]],
+    target_dt: Optional[str],
+) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any]]:
+    """Rebuild the expected target record for ``row`` from pre-fetched sources.
+
+    Returns ``(expected, warnings)``. ``expected`` is ``None`` (with a reason
+    dict in place of warnings) when the metadata type is not paper/ebook or
+    the row has no target key. No database access happens here.
+    """
+    metadata_type = row.get("metadata_type")
+    if metadata_type not in ("paper", "ebook"):
+        return None, {"reason": "unsupported_metadata_type", "metadata_type": metadata_type}
+
+    key = target_key_for_row(row, metadata_type)
+    if not key:
+        missing_key_field = "doi" if metadata_type == "paper" else "isbn13"
+        return None, {"reason": "missing_target_key", "key_field": missing_key_field}
+
+    metadata_map, xinghe_map = build_field_maps(specs, metadata_type)
+    xinghe_record = choose_xinghe_record_for_target(row, xinghe_rows)
+
+    warnings: Dict[str, Any] = {}
+    candidate_count = len(xinghe_rows)
+    if candidate_count > 1:
+        warnings["xinghe_duplicate_candidates"] = candidate_count
+        if xinghe_record is None:
+            warnings["xinghe_match"] = "ambiguous_no_repository_field_match"
+
+    expected = merge_one(
+        metadata_record,
+        xinghe_record,
+        metadata_type=metadata_type,
+        specs=specs,
+        metadata_map=metadata_map,
+        xinghe_map=xinghe_map,
+        fallback_key=key,
+    )
+    # An explicit target_dt wins over the dt stored on the row.
+    row_dt = row.get("dt") if target_dt is None else target_dt
+    if row_dt is not None:
+        expected["dt"] = row_dt
+    return expected, warnings
+
+
+def validate_source_field_mapping(
+    conn: Any,
+    *,
+    specs: Sequence[UnionFieldSpec],
+    paper_table: str,
+    ebook_table: str,
+    xinghe_table: str,
+    target_table: str,
+    target_dt: Optional[str],
+    paper_dt: Optional[str],
+    ebook_dt: Optional[str],
+    limit: Optional[int],
+    target_sample_mode: str = "natural",
+) -> Dict[str, Any]:
+    """Check sampled target rows against records rebuilt from the source tables.
+
+    Target rows are sampled with ``fetch_target_samples``; their metadata and
+    xinghe source rows are loaded in batches, an expected record is built for
+    each target row, and the two are compared on every spec field.
+
+    Returns a dict with ``checked``/``passed``/``failed``/``skipped`` counters,
+    at most 100 ``warnings`` and the full ``mismatches`` list (skipped rows
+    are also recorded there with ``status="skipped"``).
+    """
+    target_rows = fetch_target_samples(
+        conn,
+        target_table=target_table,
+        dt=target_dt,
+        limit=limit,
+        sample_mode=target_sample_mode,
+    )
+    log_step(f"source field mapping 抽到目标样本 {len(target_rows)} 条")
+    # Collect the lookup keys per metadata type and the repository sha256
+    # values so the sources can be loaded with a few batch queries.
+    keys_by_type: Dict[str, List[Any]] = {"paper": [], "ebook": []}
+    repository_sha_values: List[Any] = []
+    for target_row in target_rows:
+        metadata_type = target_row.get("metadata_type")
+        if metadata_type == "paper":
+            keys_by_type["paper"].append(target_key_for_row(target_row, "paper"))
+        elif metadata_type == "ebook":
+            keys_by_type["ebook"].append(target_key_for_row(target_row, "ebook"))
+        sha256 = target_row.get("access_xinghe_repository_sha256")
+        if sha256 not in (None, ""):
+            repository_sha_values.append(sha256)
+    # Source partitions default to target_dt when no source-specific dt is given.
+    metadata_records = {
+        "paper": fetch_metadata_records_batch(
+            conn,
+            table=paper_table,
+            metadata_type="paper",
+            keys=keys_by_type["paper"],
+            dt=paper_dt if paper_dt is not None else target_dt,
+        ),
+        "ebook": fetch_metadata_records_batch(
+            conn,
+            table=ebook_table,
+            metadata_type="ebook",
+            keys=keys_by_type["ebook"],
+            dt=ebook_dt if ebook_dt is not None else target_dt,
+        ),
+    }
+    xinghe_records = {
+        "paper": fetch_xinghe_records_batch(
+            conn,
+            table=xinghe_table,
+            metadata_type="paper",
+            keys=keys_by_type["paper"],
+        ),
+        "ebook": fetch_xinghe_records_batch(
+            conn,
+            table=xinghe_table,
+            metadata_type="ebook",
+            keys=keys_by_type["ebook"],
+        ),
+    }
+    xinghe_records_by_sha = fetch_xinghe_records_by_sha_batch(
+        conn,
+        table=xinghe_table,
+        sha_values=repository_sha_values,
+    )
+    log_step(
+        "source batch 查询完成："
+        f"paper metadata={len(metadata_records['paper'])}, "
+        f"ebook metadata={len(metadata_records['ebook'])}, "
+        f"paper xinghe={len(xinghe_records['paper'])}, "
+        f"ebook xinghe={len(xinghe_records['ebook'])}, "
+        f"sha xinghe={len(xinghe_records_by_sha)}"
+    )
+    compare_fields = [spec.field_name for spec in specs]
+    field_types = {spec.field_name: spec.data_type for spec in specs}
+    checked = passed = failed = skipped = 0
+    mismatches: List[Dict[str, Any]] = []
+    warnings: List[Dict[str, Any]] = []
+    # Per-key caches so the LIKE fallback queries run once per lookup key.
+    paper_metadata_embedded_cache: Dict[str, List[Dict[str, Any]]] = {}
+    paper_xinghe_embedded_cache: Dict[str, List[Dict[str, Any]]] = {}
+
+    for target_row in target_rows:
+        # `checked` counts every sampled row, including rows skipped below.
+        checked += 1
+        metadata_type = target_row.get("metadata_type")
+        lookup_key = normalize_lookup_key(target_key_for_row(target_row, str(metadata_type)), str(metadata_type))
+        metadata_record = metadata_records.get(str(metadata_type), {}).get(lookup_key)
+        xinghe_rows = xinghe_records.get(str(metadata_type), {}).get(lookup_key, [])
+        # Paper-only fallback: when the exact-key batch lookup found nothing,
+        # retry with a LIKE search on the doi column.
+        if metadata_type == "paper" and lookup_key:
+            if metadata_record is None:
+                if lookup_key not in paper_metadata_embedded_cache:
+                    paper_metadata_embedded_cache[lookup_key] = fetch_paper_metadata_records_by_embedded_key(
+                        conn,
+                        table=paper_table,
+                        key=lookup_key,
+                        dt=paper_dt if paper_dt is not None else target_dt,
+                    )
+                metadata_record = choose_metadata_record_for_target(
+                    target_row,
+                    paper_metadata_embedded_cache[lookup_key],
+                    specs=specs,
+                    metadata_type="paper",
+                )
+            if not xinghe_rows:
+                if lookup_key not in paper_xinghe_embedded_cache:
+                    paper_xinghe_embedded_cache[lookup_key] = fetch_paper_xinghe_records_by_embedded_key(
+                        conn,
+                        table=xinghe_table,
+                        key=lookup_key,
+                    )
+                xinghe_rows = paper_xinghe_embedded_cache[lookup_key]
+        # Last resort for any type: xinghe rows sharing the repository sha256.
+        if not xinghe_rows:
+            sha256 = target_row.get("access_xinghe_repository_sha256")
+            if sha256 not in (None, ""):
+                xinghe_rows = xinghe_records_by_sha.get(str(sha256), [])
+        expected, row_warnings = expected_for_target_row_from_sources(
+            row=target_row,
+            specs=specs,
+            metadata_record=metadata_record,
+            xinghe_rows=xinghe_rows,
+            target_dt=target_dt,
+        )
+        unique_id = target_row.get("unique_id")
+        if row_warnings:
+            warnings.append({"unique_id": unique_id, **row_warnings})
+        if expected is None:
+            # No expected record could be built; row_warnings carries the reason.
+            skipped += 1
+            mismatches.append({"unique_id": unique_id, "status": "skipped", **row_warnings})
+            continue
+        expected_cmp = comparable_record(expected, compare_fields)
+        actual_cmp = comparable_record(target_row, compare_fields)
+        row_mismatches = compare_records(expected_cmp, actual_cmp, field_types)
+        if row_mismatches:
+            failed += 1
+            mismatches.append(
+                {
+                    "unique_id": unique_id,
+                    "dt": target_row.get("dt"),
+                    "metadata_type": target_row.get("metadata_type"),
+                    "status": "field_mismatch",
+                    "mismatches": row_mismatches,
+                }
+            )
+        else:
+            passed += 1
+
+    return {
+        "checked": checked,
+        "passed": passed,
+        "failed": failed,
+        "skipped": skipped,
+        # Only the first 100 warnings are returned; mismatches are not truncated.
+        "warnings": warnings[:100],
+        "mismatches": mismatches,
+    }
+
+
+def validate_missing_target_samples(
+    conn: Any,
+    *,
+    paper_table: str,
+    ebook_table: str,
+    target_table: str,
+    xinghe_table: str,
+    target_dt: Optional[str],
+    paper_dt: Optional[str],
+    ebook_dt: Optional[str],
+    limit: int,
+) -> Dict[str, Any]:
+    """Run the six missing-target sample queries and return their rows by name.
+
+    ``limit`` is split across the six queries (at least one row each). Keys of
+    the result: ``{type}_source``, ``xinghe_{type}_source`` and
+    ``xinghe_only_{type}_source`` for ``type`` in paper/ebook.
+    """
+    per_query_limit = max(1, limit // 6)
+    samples: Dict[str, Any] = {}
+
+    # 1) Sample queries built from each metadata source table.
+    metadata_sources = (
+        ("paper", paper_table, "doi", paper_dt),
+        ("ebook", ebook_table, "isbn13", ebook_dt),
+    )
+    for kind, source_table, source_key, source_dt in metadata_sources:
+        query, bind_values = build_missing_target_sample_query(
+            source_table,
+            target_table,
+            metadata_type=kind,
+            key_field=source_key,
+            source_dt=source_dt,
+            target_dt=target_dt,
+            limit=per_query_limit,
+        )
+        samples[f"{kind}_source"] = fetch_records(conn, query, bind_values)
+
+    # 2) Sample queries built from the xinghe table.
+    for kind, xinghe_key in (("paper", "doi"), ("ebook", "isbn")):
+        query, bind_values = build_xinghe_missing_target_sample_query(
+            xinghe_table,
+            target_table,
+            metadata_type=kind,
+            xinghe_key_field=xinghe_key,
+            dt=target_dt,
+            limit=per_query_limit,
+        )
+        samples[f"xinghe_{kind}_source"] = fetch_records(conn, query, bind_values)
+
+    # 3) "xinghe only" sample queries, which also take the metadata table.
+    xinghe_only_sources = (
+        ("paper", "doi", paper_table, "doi", paper_dt),
+        ("ebook", "isbn", ebook_table, "isbn13", ebook_dt),
+    )
+    for kind, xinghe_key, source_table, source_key, source_dt in xinghe_only_sources:
+        query, bind_values = build_xinghe_only_missing_target_sample_query(
+            xinghe_table,
+            source_table,
+            target_table,
+            metadata_type=kind,
+            xinghe_key_field=xinghe_key,
+            metadata_key_field=source_key,
+            metadata_dt=source_dt,
+            target_dt=target_dt,
+            limit=per_query_limit,
+        )
+        samples[f"xinghe_only_{kind}_source"] = fetch_records(conn, query, bind_values)
+    return samples
+
+
+def null_empty_rate_for_field(
+    conn: Any,
+    *,
+    table: str,
+    field: str,
+    dt: Optional[str],
+) -> Dict[str, Any]:
+    """Count NULL and empty values of one column and derive their rates.
+
+    A value counts as empty when it is not NULL and its trimmed VARCHAR cast
+    is ``''``, ``'[]'`` or ``'{}'``.
+
+    Returns a dict with ``field``, ``total``, ``null_count``, ``empty_count``,
+    ``null_rate`` and ``empty_rate``. The same keys are returned when the
+    query yields no row (all counts 0, both rates 0.0), so callers never have
+    to special-case a missing rate.
+    """
+    params: List[Any] = []
+    # Backticks inside the column name are doubled before quoting.
+    quoted = f"`{field.replace('`', '``')}`"
+    sql = (
+        "SELECT "
+        "COUNT(*) AS total, "
+        f"SUM(CASE WHEN {quoted} IS NULL THEN 1 ELSE 0 END) AS null_count, "
+        f"SUM(CASE WHEN {quoted} IS NOT NULL "
+        f"AND TRIM(CAST({quoted} AS VARCHAR)) IN ('', '[]', '{{}}') "
+        "THEN 1 ELSE 0 END) AS empty_count "
+        f"FROM {quote_identifier(table)} WHERE 1=1{_dt_clause(dt, params)}"
+    )
+    # A missing row is treated as all-zero counts so the result shape is uniform.
+    row = fetch_one(conn, sql, params) or {}
+    total = int(row.get("total") or 0)
+    null_count = int(row.get("null_count") or 0)
+    empty_count = int(row.get("empty_count") or 0)
+    return {
+        "field": field,
+        "total": total,
+        "null_count": null_count,
+        "empty_count": empty_count,
+        "null_rate": null_count / total if total else 0.0,
+        "empty_rate": empty_count / total if total else 0.0,
+    }
+
+
+def empty_condition_sql(quoted_field: str, data_type: str) -> Optional[str]:
+    """Return the SQL predicate that tests ``quoted_field`` for emptiness.
+
+    String-like types (``string``, ``text``, ``varchar*``, ``char*``) are
+    compared against ``''`` after trimming; ``array*``/``list*`` types use
+    ``CARDINALITY(...) = 0``. Any other type, or no type at all, yields ``None``.
+    """
+    normalized_type = (data_type or "").strip().lower()
+    is_text_type = normalized_type in ("string", "text") or normalized_type.startswith(
+        ("varchar", "char")
+    )
+    if is_text_type:
+        return f"TRIM(CAST({quoted_field} AS VARCHAR)) = ''"
+    if normalized_type.startswith(("array", "list")):
+        return f"CARDINALITY({quoted_field}) = 0"
+    return None
+
+
+def build_null_empty_rates(
+    conn: Any,
+    *,
+    target_table: str,
+    specs: Sequence[UnionFieldSpec],
+    dt: Optional[str],
+) -> List[Dict[str, Any]]:
+    """Compute NULL/empty counts and rates for every spec field of the target table.
+
+    Column types come from ``show_column_types``; the counts are gathered by
+    ``fetch_null_empty_rate_row`` with no extra filter.
+    """
+    column_types = show_column_types(conn, target_table)
+    counts_row = fetch_null_empty_rate_row(
+        conn,
+        target_table=target_table,
+        specs=specs,
+        dt=dt,
+        extra_where="",
+        extra_params=[],
+        target_field_types=column_types,
+    )
+    rates = null_empty_rates_from_row(counts_row, specs)
+    return rates
+
+
+def fetch_null_empty_rate_row(
+    conn: Any,
+    *,
+    target_table: str,
+    specs: Sequence[UnionFieldSpec],
+    dt: Optional[str],
+    extra_where: str,
+    extra_params: Sequence[Any],
+    target_field_types: Optional[Dict[str, str]] = None,
+) -> Dict[str, Any]:
+    """Run one aggregate query that counts NULL and empty values per spec field.
+
+    The result row holds ``total`` plus ``n_<idx>`` (NULL count) and
+    ``e_<idx>`` (empty count) for the spec at position ``idx``. A column's
+    type is taken from ``target_field_types`` when present, else from the
+    spec; types without an emptiness predicate get a constant ``0``.
+    Returns ``{}`` when the query yields no row.
+    """
+    bind_values: List[Any] = []
+    known_types = target_field_types or {}
+    columns: List[str] = ["COUNT(*) AS `total`"]
+    for position, spec in enumerate(specs):
+        column = f"`{spec.field_name.replace('`', '``')}`"
+        columns.append(
+            f"SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS `n_{position}`"
+        )
+        column_type = known_types.get(spec.field_name) or spec.data_type
+        emptiness = empty_condition_sql(column, column_type)
+        if emptiness is not None:
+            columns.append(
+                f"SUM(CASE WHEN {column} IS NOT NULL AND {emptiness} "
+                f"THEN 1 ELSE 0 END) AS `e_{position}`"
+            )
+        else:
+            columns.append(f"0 AS `e_{position}`")
+    select_list = ", ".join(columns)
+    table_sql = quote_identifier(target_table)
+    dt_sql = _dt_clause(dt, bind_values)
+    query = f"SELECT {select_list} FROM {table_sql} WHERE 1=1{dt_sql}{extra_where}"
+    # extra_params follow whatever _dt_clause put into the bind list,
+    # matching the position of extra_where in the SQL text.
+    bind_values.extend(extra_params)
+    return fetch_one(conn, query, bind_values) or {}
+
+
+def null_empty_rates_from_row(row: Dict[str, Any], specs: Sequence[UnionFieldSpec]) -> List[Dict[str, Any]]:
+    """Turn an aggregate count row into one rate dict per spec field.
+
+    Reads ``total`` and the positional ``n_<idx>`` / ``e_<idx>`` counters from
+    ``row``; missing or falsy counters count as 0 and a zero total gives 0.0 rates.
+    """
+    total = int(row.get("total") or 0)
+
+    def _rate(count: int) -> float:
+        return count / total if total else 0.0
+
+    def _entry(position: int, spec: UnionFieldSpec) -> Dict[str, Any]:
+        nulls = int(row.get(f"n_{position}") or 0)
+        empties = int(row.get(f"e_{position}") or 0)
+        return {
+            "field": spec.field_name,
+            "total": total,
+            "null_count": nulls,
+            "empty_count": empties,
+            "null_rate": _rate(nulls),
+            "empty_rate": _rate(empties),
+        }
+
+    return [_entry(position, spec) for position, spec in enumerate(specs)]
+
+
+def skipped_null_empty_rates(reason: str) -> List[Dict[str, Any]]:
+    """Return the single-entry placeholder used when the rate statistics are skipped."""
+    placeholder: Dict[str, Any] = {"skipped": True, "reason": reason}
+    return [placeholder]
+
+
+def failed_null_empty_rates(exc: Exception) -> List[Dict[str, Any]]:
+    """Return the single-entry placeholder describing a failed rate computation.
+
+    The entry is flagged ``skipped`` with ``status="failed"`` and carries the
+    exception's class name and message.
+    """
+    failure: Dict[str, Any] = {
+        "skipped": True,
+        "status": "failed",
+        "reason": "null_empty_count_failed",
+    }
+    failure["error_type"] = type(exc).__name__
+    failure["error"] = str(exc)
+    return [failure]
+
+
+def validate_target_field_values(
+    conn: Any,
+    *,
+    target_table: str,
+    dt: Optional[str],
+    limit: Optional[int],
+) -> Dict[str, Any]:
+    """Return a fixed "skipped" result; no arguments are read and no query runs.
+
+    The result keeps the counter/summary keys, all zeroed or empty, together
+    with ``skipped=True`` and the reason text.
+    """
+    outcome: Dict[str, Any] = dict.fromkeys(("checked", "passed", "failed"), 0)
+    outcome["fail_rate"] = 0.0
+    outcome["field_error_summary"] = {}
+    outcome["issues"] = []
+    outcome["examples"] = {}
+    outcome["skipped"] = True
+    outcome["reason"] = (
+        "field validator removed; union validation uses schema, coverage, "
+        "null/empty rates, and source field mapping"
+    )
+    return outcome
+
+
+# Maps internal report keys to the Chinese labels written to report files by
+# localize_report_keys. Keys missing from this table are emitted unchanged.
+# NOTE: two pairs of keys share a label ("dt"/"target_dt" and
+# "output_dir"/"report"), so a dict holding both keys of a pair collides
+# after localization.
+REPORT_KEY_LABELS = {
+    "status": "状态",
+    "config_path": "配置文件",
+    "mapping_csv": "映射文件",
+    "paper_table": "论文源表",
+    "ebook_table": "图书源表",
+    "xinghe_table": "星河全文表",
+    "target_table": "目标表",
+    "dt": "目标表分区",
+    "target_dt": "目标表分区",
+    "paper_dt": "论文源表分区",
+    "ebook_dt": "图书源表分区",
+    "sample_size": "抽样数量",
+    "coverage_mode": "覆盖统计模式",
+    "null_empty_mode": "空值率统计模式",
+    "missing_sample_mode": "缺失样例模式",
+    "target_sample_mode": "目标表抽样模式",
+    "schema_check": "Schema检查",
+    "missing_fields": "缺失字段",
+    "extra_fields": "多余字段",
+    "expected_count": "预期字段数",
+    "actual_count": "实际字段数",
+    "coverage_counts": "覆盖统计",
+    "paper_source": "论文去重表记录数",
+    "ebook_source": "图书去重表记录数",
+    "xinghe_only_paper_count": "星河表去重论文兜底数",
+    "xinghe_only_ebook_count": "星河表去重图书兜底数",
+    "expected_target_count": "理论全量表记录数",
+    "actual_target_count": "实际全量表记录数",
+    "target_count_diff": "全量表记录数差异",
+    "source_field_mapping": "源字段映射校验",
+    "checked": "已校验数",
+    "passed": "通过数",
+    "failed": "失败数",
+    "skipped": "跳过数",
+    "warning_count": "Warning数量",
+    "field_quality": "字段质量校验",
+    "fail_rate": "失败率",
+    "field_error_summary": "字段错误汇总",
+    "reason": "原因",
+    "output_dir": "报告目录",
+    "details": "明细",
+    "null_empty_rates": "空值率统计",
+    "top_null_empty_rates": "Top空值率统计",
+    "field": "字段",
+    "total": "总数",
+    "null_count": "NULL数量",
+    "empty_count": "空字符串/空集合数量",
+    "null_rate": "NULL比例",
+    "empty_rate": "空值比例",
+    "null_empty_rate": "NULL和空值合计比例",
+    "missing_target_samples": "缺失目标样例",
+    "mismatches": "字段差异",
+    "warnings": "Warning明细",
+    "unique_id": "唯一ID",
+    "metadata_type": "元数据类型",
+    "expected": "预期值",
+    "actual": "实际值",
+    "report": "报告目录",
+    "total_problem_rows": "问题记录数",
+    "status_counts": "状态分布",
+    "field_counts": "字段问题分布",
+    "field_samples": "字段问题样例",
+    "warning_samples": "Warning样例",
+    "missing_target_sample_counts": "缺失目标样例数量",
+    "sample_key": "样例key",
+    "data_date": "数据日期",
+    "sha256": "sha256",
+    "origin_path": "原始路径",
+    "error_type": "错误类型",
+    "error": "错误信息",
+}
+
+
+# Parent keys (raw and localized spellings) under which localize_report_keys
+# leaves dict keys untranslated. NOTE(review): presumably because those dicts
+# are keyed by data field names rather than report keys -- confirm.
+FIELD_NAME_KEY_CONTAINERS = {
+    "field_counts",
+    "字段问题分布",
+    "field_samples",
+    "字段问题样例",
+    "mismatches",
+    "字段差异",
+}
+
+
+def localize_report_keys(value: Any, parent_key: Optional[str] = None) -> Any:
+    """Recursively replace report dict keys with their ``REPORT_KEY_LABELS`` label.
+
+    Args:
+        value: Any JSON-like structure; dicts and lists are walked, every
+            other value is returned unchanged.
+        parent_key: Raw key under which ``value`` was found. Dicts found
+            under a key listed in ``FIELD_NAME_KEY_CONTAINERS`` keep their
+            keys untranslated (only stringified).
+
+    Returns:
+        A new structure with translated keys. Several raw keys share one
+        label (e.g. ``dt`` and ``target_dt``); when a label is already taken
+        inside the same dict, the later key keeps its raw name instead of
+        silently overwriting the earlier value.
+    """
+    if isinstance(value, dict):
+        keep_raw_keys = parent_key in FIELD_NAME_KEY_CONTAINERS
+        localized: Dict[str, Any] = {}
+        for key, val in value.items():
+            raw_key = str(key)
+            label = raw_key if keep_raw_keys else REPORT_KEY_LABELS.get(raw_key, raw_key)
+            # Label collision: fall back to the raw key so no entry is lost.
+            if label in localized and raw_key not in localized:
+                label = raw_key
+            # Children are localized relative to the raw (untranslated) key.
+            localized[label] = localize_report_keys(val, raw_key)
+        return localized
+    if isinstance(value, list):
+        # List items inherit the parent key of the list itself.
+        return [localize_report_keys(item, parent_key) for item in value]
+    return value
+
+
+def write_json(path: Path, payload: Any) -> None:
+    """Write ``payload`` to ``path`` as indented UTF-8 JSON with localized keys.
+
+    Missing parent directories are created first.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as handle:
+        localized_payload = localize_report_keys(payload)
+        json.dump(
+            localized_payload,
+            handle,
+            ensure_ascii=False,
+            indent=2,
+            cls=JsonEncoder,
+        )
+
+
+def write_jsonl(path: Path, rows: Iterable[Dict[str, Any]]) -> None:
+    """Write ``rows`` to ``path`` as UTF-8 JSON Lines with localized keys.
+
+    Missing parent directories are created first; each row becomes one line.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as handle:
+        for record in rows:
+            encoded = json.dumps(
+                localize_report_keys(record),
+                ensure_ascii=False,
+                cls=JsonEncoder,
+            )
+            handle.write(encoded + "\n")
+
+
+def _json_inline(value: Any) -> str:
+    """Serialize ``value`` to single-line JSON, keeping non-ASCII characters as-is."""
+    encoded = json.dumps(value, cls=JsonEncoder, ensure_ascii=False)
+    return encoded
+
+
+# Maximum number of example mismatches kept per field in the readable summary.
+SAMPLES_PER_FIELD = 3
+
+
+def top_null_empty_rates(rates: Sequence[Dict[str, Any]], limit: int = 10) -> List[Dict[str, Any]]:
+    """Return the ``limit`` rate entries with the highest combined NULL+empty rate.
+
+    Entries flagged ``skipped`` or carrying an ``error`` are ignored. Each
+    kept entry is copied and extended with ``null_empty_rate``. Ordering is
+    descending by rate, then by combined count, then by field name.
+    """
+
+    def _count(entry: Dict[str, Any], name: str) -> int:
+        return int(entry.get(name) or 0)
+
+    def _rank(entry: Dict[str, Any]) -> Any:
+        return (
+            float(entry.get("null_empty_rate") or 0),
+            _count(entry, "null_count") + _count(entry, "empty_count"),
+            str(entry.get("field") or ""),
+        )
+
+    enriched: List[Dict[str, Any]] = []
+    for entry in rates:
+        if entry.get("skipped") or entry.get("error"):
+            continue
+        total = _count(entry, "total")
+        blanks = _count(entry, "null_count") + _count(entry, "empty_count")
+        combined_rate = blanks / total if total else 0.0
+        enriched.append({**entry, "null_empty_rate": combined_rate})
+    ranked = sorted(enriched, key=_rank, reverse=True)
+    return ranked[:limit]
+
+
+def build_readable_report_summary(result: Dict[str, Any]) -> Dict[str, Any]:
+    """Condense a full validation ``result`` into the readable summary dict.
+
+    Mismatch rows from ``details.source_field_mapping`` are aggregated into
+    per-status and per-field counts (most frequent first, ties by name), with
+    up to ``SAMPLES_PER_FIELD`` examples per field. Run settings are copied
+    from ``result``, and the null/empty rates and missing-sample counts come
+    from ``details``.
+    """
+    details = result["details"]
+    mismatch_rows = details["source_field_mapping"]["mismatches"]
+    warnings = details["source_field_mapping"]["warnings"]
+
+    per_status: Dict[str, int] = {}
+    per_field: Dict[str, int] = {}
+    samples_by_field: Dict[str, List[Dict[str, Any]]] = {}
+    for problem in mismatch_rows:
+        status = str(problem.get("status") or "unknown")
+        per_status[status] = per_status.get(status, 0) + 1
+        for field, diff in (problem.get("mismatches") or {}).items():
+            per_field[field] = per_field.get(field, 0) + 1
+            bucket = samples_by_field.setdefault(field, [])
+            if len(bucket) < SAMPLES_PER_FIELD:
+                diff_is_dict = isinstance(diff, dict)
+                bucket.append(
+                    {
+                        # Rows may carry raw or already localized key names.
+                        "unique_id": problem.get("unique_id") or problem.get("唯一ID"),
+                        "metadata_type": problem.get("metadata_type") or problem.get("元数据类型"),
+                        "dt": problem.get("dt") or problem.get("目标表分区"),
+                        "status": status,
+                        "expected": diff.get("expected") if diff_is_dict else None,
+                        "actual": diff.get("actual") if diff_is_dict else None,
+                    }
+                )
+
+    def _most_frequent_first(item: Any) -> Any:
+        return (-item[1], item[0])
+
+    sorted_field_counts = dict(sorted(per_field.items(), key=_most_frequent_first))
+    sorted_status_counts = dict(sorted(per_status.items(), key=_most_frequent_first))
+
+    missing_samples = details.get("missing_target_samples") or {}
+    if isinstance(missing_samples, dict) and missing_samples.get("skipped"):
+        missing_sample_counts = {"skipped": 1}
+    else:
+        missing_sample_counts = {}
+        for name, rows in missing_samples.items():
+            missing_sample_counts[name] = len(rows) if isinstance(rows, list) else 0
+    null_empty_rates = details.get("null_empty_rates") or []
+
+    # Assembled step by step; insertion order fixes the key order of the summary.
+    summary: Dict[str, Any] = {
+        "status": result.get("status"),
+        "report": result.get("output_dir"),
+    }
+    for name in ("mapping_csv", "paper_table", "ebook_table", "xinghe_table", "target_table"):
+        summary[name] = result.get(name)
+    summary["target_dt"] = result.get("target_dt") or result.get("dt")
+    for name in (
+        "paper_dt",
+        "ebook_dt",
+        "sample_size",
+        "coverage_mode",
+        "null_empty_mode",
+        "missing_sample_mode",
+        "target_sample_mode",
+        "schema_check",
+        "coverage_counts",
+        "source_field_mapping",
+    ):
+        summary[name] = result.get(name)
+    summary["total_problem_rows"] = len(mismatch_rows)
+    summary["status_counts"] = sorted_status_counts
+    summary["field_counts"] = sorted_field_counts
+    summary["field_count_total"] = len(sorted_field_counts)
+    summary["field_samples"] = {
+        field: samples_by_field[field]
+        for field in sorted_field_counts
+        if field in samples_by_field
+    }
+    summary["warning_count"] = len(warnings)
+    summary["warning_samples"] = warnings[:5]
+    summary["null_empty_rates"] = null_empty_rates
+    summary["top_null_empty_rates"] = top_null_empty_rates(null_empty_rates)
+    summary["missing_target_sample_counts"] = missing_sample_counts
+    summary["field_quality"] = result.get("field_quality")
+    return summary
+
+
+def _pct(value: Any) -> str:
+    """Format a ratio as a percentage with two decimals, or ``"N/A"`` if not numeric."""
+    try:
+        ratio = float(value)
+    except (TypeError, ValueError):
+        return "N/A"
+    return f"{ratio * 100:.2f}%"
+
+
+def _first_present(row: Dict[str, Any], *keys: str) -> Any:
+    """Return the value of the first of ``keys`` that exists in ``row``.
+
+    Presence is decided by key membership, so a stored ``None`` is returned
+    as-is. ``None`` is also the result when no key is present.
+    """
+    return next((row.get(name) for name in keys if name in row), None)
+
+
+def build_readable_report_markdown(summary: Dict[str, Any]) -> str:
+    """Render the readable summary dict as a Markdown report.
+
+    Sections are emitted in a fixed order: header bullets, key findings, schema
+    comparison, coverage counts, NULL/empty rates, status distribution,
+    per-field problem counts, per-field samples, warning samples (only when
+    ``warning_count`` is truthy) and missing-target sample counts.
+
+    Args:
+        summary: The same mapping that ``write_report`` stores as
+            ``summary.json``. Every section is read with ``.get`` so an absent
+            section renders as empty or as "无".
+
+    Returns:
+        The Markdown text, terminated by exactly one newline.
+    """
+    lines: List[str] = ["# 全量元数据 Union 校验报告摘要", ""]
+    schema_check = summary.get("schema_check") or {}
+    coverage_counts = summary.get("coverage_counts") or {}
+    # Read once; both mappings are used by more than one section below.
+    status_counts = summary.get("status_counts") or {}
+    field_counts = summary.get("field_counts") or {}
+    lines.extend(
+        [
+            f"- 目标分区: `{summary.get('target_dt')}`",
+            f"- 源分区: paper=`{summary.get('paper_dt')}`, ebook=`{summary.get('ebook_dt')}`",
+            f"- 抽样数量: `{summary.get('sample_size')}`",
+            f"- 空值率统计: mode=`{summary.get('null_empty_mode')}`",
+            f"- 字段不一致记录数: `{summary.get('total_problem_rows')}`",
+            f"- 报告目录: `{summary.get('report')}`",
+            "",
+        ]
+    )
+
+    # --- Key findings -----------------------------------------------------
+    lines.append("## 重点结论")
+    lines.append("")
+    # Schema keys may be spelled in English or Chinese; accept either.
+    missing_fields = _first_present(schema_check, "missing_fields", "缺失字段") or []
+    extra_fields = _first_present(schema_check, "extra_fields", "多余字段") or []
+    lines.append(
+        f"- Schema: 缺失 `{len(missing_fields)}` 个字段，"
+        f"多余 `{len(extra_fields)}` 个字段"
+    )
+    expected_target_count = coverage_counts.get("expected_target_count")
+    # Compare with None explicitly: a real count of 0 (empty target table) is
+    # exactly what the report must show, so it must not trigger the fallback.
+    actual_target_count = coverage_counts.get("actual_target_count")
+    if actual_target_count is None:
+        actual_target_count = coverage_counts.get("target")
+    target_count_diff = coverage_counts.get("target_count_diff")
+    if expected_target_count is not None and actual_target_count is not None:
+        lines.append(
+            f"- 目标表数量: 理论 `{expected_target_count}`，"
+            f"实际 `{actual_target_count}`，差异 `{target_count_diff}`"
+        )
+    top_fields = list(field_counts.items())[:5]
+    if top_fields:
+        lines.append(
+            "- Top字段问题: "
+            + "；".join(f"`{field}`={count}" for field, count in top_fields)
+        )
+    else:
+        lines.append("- Top字段问题: 无")
+    lines.append("")
+
+    # --- Schema comparison ------------------------------------------------
+    lines.append("## Schema 对比")
+    lines.append("")
+    lines.append(f"- 预期字段数: `{_first_present(schema_check, 'expected_count', '预期字段数')}`")
+    lines.append(f"- 实际字段数: `{_first_present(schema_check, 'actual_count', '实际字段数')}`")
+    for field in missing_fields[:10]:
+        lines.append(f"- missing: `{field}`")
+    # Say so when the list was truncated, as is done for extra fields below.
+    if len(missing_fields) > 10:
+        lines.append(f"- missing 其余 `{len(missing_fields) - 10}` 个见 summary.json")
+    for field in extra_fields[:10]:
+        lines.append(f"- extra: `{field}`")
+    if len(extra_fields) > 10:
+        lines.append(f"- extra 其余 `{len(extra_fields) - 10}` 个见 summary.json")
+    lines.append("")
+
+    # --- Coverage counts --------------------------------------------------
+    lines.append("## 覆盖率统计")
+    lines.append("")
+    for key, value in coverage_counts.items():
+        lines.append(f"- `{key}`: {value}")
+    lines.append("")
+
+    # --- NULL / empty rates -----------------------------------------------
+    lines.append("## NULL/空值率统计")
+    lines.append("")
+    null_empty_rates = summary.get("null_empty_rates") or []
+    first_rate = null_empty_rates[0] if null_empty_rates else None
+    if isinstance(first_rate, dict) and first_rate.get("skipped"):
+        # A leading placeholder row marks a run that was skipped or that failed.
+        if first_rate.get("status") == "failed":
+            lines.append(f"- 统计失败: `{first_rate.get('error_type')}`")
+            lines.append(f"- 原因: `{first_rate.get('error')}`")
+        else:
+            lines.append(f"- 未统计: `{first_rate.get('reason')}`")
+            lines.append("- 如需输出实际比例，运行时加 `--null-empty-mode exact`")
+    else:
+        rate_rows = []
+        for row in null_empty_rates:
+            # Rows that carry an error have no usable counts.
+            if row.get("error") or row.get("错误"):
+                continue
+            total = int(_first_present(row, "total", "总数") or 0)
+            null_count = int(_first_present(row, "null_count", "NULL数量") or 0)
+            empty_count = int(_first_present(row, "empty_count", "空字符串/空集合数量") or 0)
+            # Derive any rate the row does not provide, so a field with known
+            # counts is not printed as "N/A". With total == 0 the separate
+            # rates stay None and the combined rate is 0.0.
+            null_rate = _first_present(row, "null_rate", "NULL比例")
+            if null_rate is None and total:
+                null_rate = null_count / total
+            empty_rate = _first_present(row, "empty_rate", "空值比例")
+            if empty_rate is None and total:
+                empty_rate = empty_count / total
+            null_empty_rate = _first_present(row, "null_empty_rate", "NULL和空值合计比例")
+            if null_empty_rate is None:
+                null_empty_rate = (null_count + empty_count) / total if total else 0.0
+            rate_rows.append(
+                {
+                    **row,
+                    "field": _first_present(row, "field", "字段"),
+                    "total": total,
+                    "null_count": null_count,
+                    "empty_count": empty_count,
+                    "null_rate": null_rate,
+                    "empty_rate": empty_rate,
+                    "null_empty_rate": null_empty_rate,
+                }
+            )
+        # Worst fields first: by combined rate, then absolute count, then name.
+        rate_rows.sort(
+            key=lambda row: (
+                float(row.get("null_empty_rate") or 0),
+                int(row.get("null_count") or 0) + int(row.get("empty_count") or 0),
+                str(row.get("field") or ""),
+            ),
+            reverse=True,
+        )
+        for row in rate_rows:
+            lines.append(
+                f"- `{row.get('field')}`: NULL `{row.get('null_count')}` "
+                f"({_pct(row.get('null_rate'))})，空值 `{row.get('empty_count')}` "
+                f"({_pct(row.get('empty_rate'))})，合计 `{_pct(row.get('null_empty_rate'))}`"
+            )
+        if not rate_rows:
+            lines.append("- 无或未统计")
+    lines.append("")
+
+    # --- Status distribution ----------------------------------------------
+    lines.append("## 状态分布")
+    lines.append("")
+    for status, count in status_counts.items():
+        lines.append(f"- `{status}`: {count}")
+    if not status_counts:
+        lines.append("- 无")
+    lines.append("")
+
+    # --- Problem count per field ------------------------------------------
+    lines.append("## 字段问题分布")
+    lines.append("")
+    for field, count in field_counts.items():
+        lines.append(f"- `{field}`: {count}")
+    if not field_counts:
+        lines.append("- 无")
+    lines.append("")
+
+    # --- Samples per field ------------------------------------------------
+    lines.append("## 字段问题样例")
+    lines.append("")
+    for field, samples in (summary.get("field_samples") or {}).items():
+        # Prefer the full count; fall back to the number of samples kept.
+        count = field_counts.get(field, len(samples))
+        lines.append(f"### {field} ({count})")
+        lines.append("")
+        for sample in samples:
+            lines.append(
+                f"- unique_id `{sample.get('unique_id')}`, metadata_type=`{sample.get('metadata_type')}`, "
+                f"dt=`{sample.get('dt')}`, status=`{sample.get('status')}`"
+            )
+            lines.append(f"  - expected: `{_json_inline(sample.get('expected'))}`")
+            lines.append(f"  - actual: `{_json_inline(sample.get('actual'))}`")
+            lines.append("")
+
+    # --- Warnings (section only present when there are any) ---------------
+    if summary.get("warning_count"):
+        lines.append("## Warning 样例")
+        lines.append("")
+        lines.append(f"- warning_count: `{summary.get('warning_count')}`")
+        for warning in summary.get("warning_samples") or []:
+            lines.append(f"- `{_json_inline(warning)}`")
+        lines.append("")
+
+    # --- Missing-target sample counts -------------------------------------
+    lines.append("## 缺失样例数量")
+    lines.append("")
+    for key, count in (summary.get("missing_target_sample_counts") or {}).items():
+        lines.append(f"- `{key}`: {count}")
+    lines.append("")
+    # Collapse trailing blank lines into a single final newline.
+    return "\n".join(lines).rstrip() + "\n"
+
+
+def write_report(output_dir: Path, result: Dict[str, Any]) -> None:
+    """Write the validation report files into ``output_dir``.
+
+    Files written, in this order:
+
+    * ``source_field_mismatch.jsonl`` and ``source_field_warning.jsonl`` from
+      ``result["details"]["source_field_mapping"]``
+    * ``summary.json`` with the readable summary built from ``result``
+    * ``readable_summary.md`` with the Markdown rendering of that summary
+
+    Args:
+        output_dir: Directory that receives the files.
+        result: Validation result; must contain ``details.source_field_mapping``
+            with ``mismatches`` and ``warnings``.
+    """
+    mapping_details = result["details"]["source_field_mapping"]
+    jsonl_outputs = (
+        ("source_field_mismatch.jsonl", "mismatches"),
+        ("source_field_warning.jsonl", "warnings"),
+    )
+    for file_name, detail_key in jsonl_outputs:
+        write_jsonl(output_dir / file_name, mapping_details[detail_key])
+
+    summary = build_readable_report_summary(result)
+    write_json(output_dir / "summary.json", summary)
+
+    markdown_path = output_dir / "readable_summary.md"
+    with markdown_path.open("w", encoding="utf-8") as handle:
+        handle.write(build_readable_report_markdown(summary))
+
+
+def validate_db(
+    *,
+    config_path: Path,
+    paper_table: str,
+    ebook_table: str,
+    xinghe_table: str,
+    target_table: str,
+    dt: Optional[str],
+    paper_dt: Optional[str],
+    ebook_dt: Optional[str],
+    limit: Optional[int],
+    output_dir: Optional[Path],
+    mapping_csv: Path = DEFAULT_MAPPING_CSV,
+    coverage_mode: str = "exact",
+    null_empty_mode: str = "exact",
+    missing_sample_mode: str = "skip",
+    target_sample_mode: str = "natural",
+) -> Dict[str, Any]:
+    """Validate the unified metadata target table against its source tables.
+
+    Steps, all on one StarRocks connection: schema check, optional coverage
+    counts, sampled source-to-target field mapping check, target field value
+    check, optional NULL/empty rates and optional missing-target samples.
+    The result without its ``details`` entry is printed to stdout as JSON.
+
+    Args:
+        config_path: Settings file, used for the connection and for the
+            ``mysql.catalog`` value that qualifies the table names.
+        paper_table: Paper source table; qualified with the ``dws`` schema.
+        ebook_table: Ebook source table; qualified with the ``dws`` schema.
+        xinghe_table: Xinghe source table; qualified with the ``ads`` schema.
+        target_table: Table under validation; qualified with the ``ads`` schema.
+        dt: Partition filter for the target table.
+        paper_dt: Partition filter for the paper source.
+        ebook_dt: Partition filter for the ebook source.
+        limit: Sample size passed to the sampled checks; ``None`` is passed
+            through unchanged. Missing-target sampling uses 200 when this is
+            falsy.
+        output_dir: Report directory; nothing is written when ``None``.
+        mapping_csv: Field mapping CSV the union specs are loaded from.
+        coverage_mode: ``"exact"`` runs the coverage counts; any other value
+            skips them.
+        null_empty_mode: ``"exact"`` computes NULL/empty rates; any other
+            value skips them.
+        missing_sample_mode: ``"sample"`` collects missing-target samples; any
+            other value skips them.
+        target_sample_mode: Forwarded to the field mapping check.
+
+    Returns:
+        The result dict: inputs, per-check summaries, and the full check
+        outputs under ``details``.
+
+    Raises:
+        Exception: Whatever the helpers raise. Only the coverage and NULL/empty
+            rate steps are best-effort: their failures are logged and stored
+            in the result instead of being propagated.
+    """
+    specs = load_union_specs(mapping_csv)
+    cfg = load_config(config_path)
+    mysql_cfg = cfg.get("mysql", {}) if isinstance(cfg.get("mysql"), dict) else {}
+    catalog = mysql_cfg.get("catalog")
+    paper_table = qualify_table_name(paper_table, catalog, "dws")
+    ebook_table = qualify_table_name(ebook_table, catalog, "dws")
+    xinghe_table = qualify_table_name(xinghe_table, catalog, "ads")
+    target_table = qualify_table_name(target_table, catalog, "ads")
+    # The connection's own context manager is the only cleanup needed here.
+    with connect_starrocks(config_path) as conn:
+        with timed_step("schema 校验"):
+            schema_check = validate_schema(conn, target_table=target_table, specs=specs)
+        if coverage_mode == "exact":
+            try:
+                with timed_step("coverage 总量统计"):
+                    coverage_counts = source_coverage_counts(
+                        conn,
+                        paper_table=paper_table,
+                        ebook_table=ebook_table,
+                        xinghe_table=xinghe_table,
+                        target_table=target_table,
+                        target_dt=dt,
+                        paper_dt=paper_dt,
+                        ebook_dt=ebook_dt,
+                    )
+            except Exception as exc:
+                # Best effort: a failed full count must not block the sampled report.
+                coverage_counts = failed_coverage_counts(exc)
+                log_step(
+                    "coverage 总量统计失败，继续生成抽样报告："
+                    f"{type(exc).__name__}: {exc}"
+                )
+        else:
+            coverage_counts = skipped_coverage_counts("coverage_mode=skip")
+            log_step("coverage 总量统计已跳过（使用 --coverage-mode exact 开启）")
+        with timed_step("source 字段映射抽样校验"):
+            source_field_mapping = validate_source_field_mapping(
+                conn,
+                specs=specs,
+                paper_table=paper_table,
+                ebook_table=ebook_table,
+                xinghe_table=xinghe_table,
+                target_table=target_table,
+                target_dt=dt,
+                paper_dt=paper_dt,
+                ebook_dt=ebook_dt,
+                limit=limit,
+                target_sample_mode=target_sample_mode,
+            )
+        field_quality = validate_target_field_values(
+            conn,
+            target_table=target_table,
+            dt=dt,
+            limit=limit,
+        )
+        if null_empty_mode == "exact":
+            try:
+                with timed_step("null/empty rates 统计"):
+                    null_empty_rates = build_null_empty_rates(
+                        conn,
+                        target_table=target_table,
+                        specs=specs,
+                        dt=dt,
+                    )
+            except Exception as exc:
+                # Best effort: record the failure and still produce the report.
+                null_empty_rates = failed_null_empty_rates(exc)
+                log_step(
+                    "null/empty rates 统计失败，继续生成报告："
+                    f"{type(exc).__name__}: {exc}"
+                )
+        else:
+            null_empty_rates = skipped_null_empty_rates("null_empty_mode=skip")
+            log_step("null/empty rates 统计已跳过（使用 --null-empty-mode exact 开启）")
+        if missing_sample_mode == "sample":
+            with timed_step("missing target 样例抽取"):
+                missing_target_samples = validate_missing_target_samples(
+                    conn,
+                    paper_table=paper_table,
+                    ebook_table=ebook_table,
+                    xinghe_table=xinghe_table,
+                    target_table=target_table,
+                    target_dt=dt,
+                    paper_dt=paper_dt,
+                    ebook_dt=ebook_dt,
+                    limit=limit or 200,
+                )
+        else:
+            missing_target_samples = {"skipped": True, "reason": "missing_sample_mode=skip"}
+            log_step("missing target 样例抽取已跳过")
+
+    result = {
+        "status": "ok",
+        "config_path": str(config_path),
+        "mapping_csv": str(mapping_csv),
+        "paper_table": paper_table,
+        "ebook_table": ebook_table,
+        "xinghe_table": xinghe_table,
+        "target_table": target_table,
+        "dt": dt,
+        "target_dt": dt,
+        "paper_dt": paper_dt,
+        "ebook_dt": ebook_dt,
+        "sample_size": limit,
+        "coverage_mode": coverage_mode,
+        "null_empty_mode": null_empty_mode,
+        "missing_sample_mode": missing_sample_mode,
+        "target_sample_mode": target_sample_mode,
+        "schema_check": schema_check,
+        "coverage_counts": coverage_counts,
+        # Top level keeps only the counters; full outputs live under "details".
+        "source_field_mapping": {
+            "checked": source_field_mapping["checked"],
+            "passed": source_field_mapping["passed"],
+            "failed": source_field_mapping["failed"],
+            "skipped": source_field_mapping["skipped"],
+            "warning_count": len(source_field_mapping["warnings"]),
+        },
+        "field_quality": {
+            "checked": field_quality["checked"],
+            "passed": field_quality["passed"],
+            "failed": field_quality["failed"],
+            "fail_rate": field_quality["fail_rate"],
+            "field_error_summary": field_quality["field_error_summary"],
+            "skipped": field_quality.get("skipped", False),
+            "reason": field_quality.get("reason"),
+        },
+        "output_dir": str(output_dir) if output_dir else None,
+        "details": {
+            "source_field_mapping": source_field_mapping,
+            "field_quality": field_quality,
+            "null_empty_rates": null_empty_rates,
+            "missing_target_samples": missing_target_samples,
+        },
+    }
+    if output_dir is not None:
+        write_report(output_dir, result)
+    # "details" can be large, so the console output leaves it out.
+    print(json.dumps({k: v for k, v in result.items() if k != "details"}, ensure_ascii=False, cls=JsonEncoder))
+    return result
+
+
+def cli() -> None:
+    """Parse command-line options and run ``validate_db``.
+
+    Parsing happens in two passes. A minimal parser first extracts only
+    ``--config`` so the settings file (when it exists) can be loaded and its
+    ``union_unique_meta_data`` section used as the defaults of the full
+    parser. ``--paper-dt`` and ``--ebook-dt`` fall back to ``--dt``, ``--full``
+    replaces the sample limit with ``None``, and without ``--output-dir`` the
+    directory comes from ``default_output_dir``.
+    """
+    # Pass 1: locate the settings file before the real defaults are known.
+    bootstrap = argparse.ArgumentParser(add_help=False)
+    bootstrap.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH)
+    known, _ = bootstrap.parse_known_args()
+    settings = load_config(known.config) if known.config.exists() else {}
+    defaults = settings.get("union_unique_meta_data", {})
+
+    # A mapping CSV named in the settings is relative to the project root.
+    configured_csv = defaults.get("mapping_csv")
+    default_csv = PROJECT_ROOT / configured_csv if configured_csv else DEFAULT_MAPPING_CSV
+
+    # Pass 2: the full parser, with defaults taken from the settings section.
+    parser = argparse.ArgumentParser(
+        description="Validate unified metadata target table against DB sources."
+    )
+    parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH, help="shared settings JSON path")
+    parser.add_argument("--mapping-csv", type=Path, default=default_csv, help="field mapping CSV")
+    table_options = (
+        ("--paper-table", "paper_table", DEFAULT_PAPER_TABLE),
+        ("--ebook-table", "ebook_table", DEFAULT_EBOOK_TABLE),
+        ("--xinghe-table", "xinghe_table", DEFAULT_XINGHE_TABLE),
+        ("--target-table", "target_table", DEFAULT_TARGET_TABLE),
+    )
+    for flag, settings_key, fallback in table_options:
+        parser.add_argument(flag, default=defaults.get(settings_key, fallback))
+    parser.add_argument("--dt", default=defaults.get("dt"), help="target table dt partition filter")
+    parser.add_argument(
+        "--paper-dt",
+        default=defaults.get("paper_dt"),
+        help="paper unique source dt partition filter; defaults to --dt when omitted",
+    )
+    parser.add_argument(
+        "--ebook-dt",
+        default=defaults.get("ebook_dt"),
+        help="ebook unique source dt partition filter; defaults to --dt when omitted",
+    )
+    parser.add_argument("--limit", type=int, default=int(defaults.get("limit", 3000)), help="sample size")
+    parser.add_argument("--full", action="store_true", help="validate all target rows for sampled checks")
+    parser.add_argument("--output-dir", type=Path, default=defaults.get("output_dir"), help="report directory")
+    parser.add_argument(
+        "--coverage-mode",
+        choices=("skip", "exact"),
+        default=defaults.get("coverage_mode", "exact"),
+        help="coverage count mode; exact runs full count and missing-target count SQL, then continues on timeout/error",
+    )
+    parser.add_argument(
+        "--null-empty-mode",
+        choices=("skip", "exact"),
+        default=defaults.get("null_empty_mode", "exact"),
+        help="null/empty rate mode; exact scans target fields",
+    )
+    parser.add_argument(
+        "--missing-sample-mode",
+        choices=("sample", "skip"),
+        default=defaults.get("missing_sample_mode", "skip"),
+        help="whether to collect source-has-target-missing samples",
+    )
+    parser.add_argument(
+        "--target-sample-mode",
+        choices=("natural", "hash"),
+        default=defaults.get("target_sample_mode", "natural"),
+        help="target sample mode; natural is fastest, hash adds CRC32 filter",
+    )
+    args = parser.parse_args()
+
+    # Source partitions follow the target partition unless given explicitly.
+    paper_dt = args.paper_dt or args.dt
+    ebook_dt = args.ebook_dt or args.dt
+    if args.output_dir:
+        output_dir = Path(args.output_dir)
+    else:
+        output_dir = default_output_dir(args.dt, paper_dt, ebook_dt, args.limit, args.full)
+    # --full disables sampling altogether.
+    sample_limit = None if args.full else args.limit
+
+    validate_db(
+        config_path=args.config,
+        paper_table=args.paper_table,
+        ebook_table=args.ebook_table,
+        xinghe_table=args.xinghe_table,
+        target_table=args.target_table,
+        dt=args.dt,
+        paper_dt=paper_dt,
+        ebook_dt=ebook_dt,
+        limit=sample_limit,
+        output_dir=output_dir,
+        mapping_csv=args.mapping_csv,
+        coverage_mode=args.coverage_mode,
+        null_empty_mode=args.null_empty_mode,
+        missing_sample_mode=args.missing_sample_mode,
+        target_sample_mode=args.target_sample_mode,
+    )
+
+
+from dingo.config.input_args import EvaluatorRuleArgs
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail, QualityLabel
+from dingo.model.model import Model
+from dingo.model.rule.base import BaseRule
+from dingo.model.rule.scibase.report_utils import bool_param, int_param, write_temp_settings
+
+
+@Model.rule_register(
+    "QUALITY_BAD_EFFECTIVENESS",
+    ["sci_base_qa_test", "union_unique_meta_data"],
+)
+class RuleSciBaseUnionUniqueMetaDataReport(BaseRule):
+    """Dingo rule that runs the unified metadata DB validation and writes reports.
+
+    All options come from ``dynamic_config.parameters``; the evaluated record
+    itself is not inspected.
+    """
+
+    _metric_info = {
+        "category": "Rule-Based Metadata Quality Metrics",
+        "quality_dimension": "EFFECTIVENESS",
+        "metric_name": "RuleSciBaseUnionUniqueMetaDataReport",
+        "description": "Run SciBase unified metadata DB validation and write reports.",
+        "paper_title": "",
+        "paper_url": "",
+        "paper_authors": "",
+        "evaluation_results": "",
+    }
+
+    _required_fields = [RequiredField.METADATA]
+    dynamic_config = EvaluatorRuleArgs(parameters={})
+
+    @classmethod
+    def eval(cls, input_data: Data) -> EvalDetail:
+        """Run ``validate_db`` with the configured parameters and grade the outcome.
+
+        The run is graded bad when the schema check reports missing fields or
+        type mismatches, or when the field mapping check or the field quality
+        check has at least one failure.
+
+        Args:
+            input_data: Ignored; the validation runs against the database.
+
+        Returns:
+            An ``EvalDetail`` whose ``reason`` holds the report directory and
+            both failure counts. A bad run sets ``status=True`` with this
+            rule's label; otherwise the label is ``QUALITY_GOOD``.
+        """
+        del input_data
+        params = cls.dynamic_config.parameters or {}
+        full = bool_param(params, "full", False)
+        dt = params.get("dt")
+        # Source partitions follow the target partition unless set explicitly.
+        paper_dt = params.get("paper_dt") or dt
+        ebook_dt = params.get("ebook_dt") or dt
+        output_dir = Path(str(params["output_dir"])) if params.get("output_dir") else default_output_dir(
+            dt,
+            paper_dt,
+            ebook_dt,
+            int_param(params, "limit", 3000),
+            full,
+        )
+
+        # NOTE(review): the settings file written here is never removed in this
+        # method; confirm whether write_temp_settings cleans up after itself.
+        config_path = write_temp_settings(params)
+        result = validate_db(
+            config_path=config_path,
+            paper_table=str(params.get("paper_table") or DEFAULT_PAPER_TABLE),
+            ebook_table=str(params.get("ebook_table") or DEFAULT_EBOOK_TABLE),
+            xinghe_table=str(params.get("xinghe_table") or DEFAULT_XINGHE_TABLE),
+            target_table=str(params.get("target_table") or DEFAULT_TARGET_TABLE),
+            dt=dt,
+            paper_dt=paper_dt,
+            ebook_dt=ebook_dt,
+            limit=None if full else int_param(params, "limit", 3000),
+            output_dir=output_dir,
+            mapping_csv=Path(str(params.get("mapping_csv") or DEFAULT_MAPPING_CSV)),
+            coverage_mode=str(params.get("coverage_mode") or "exact"),
+            null_empty_mode=str(params.get("null_empty_mode") or "exact"),
+            missing_sample_mode=str(params.get("missing_sample_mode") or "skip"),
+            target_sample_mode=str(params.get("target_sample_mode") or "natural"),
+        )
+        mapping_summary = result.get("source_field_mapping") or {}
+        field_quality = result.get("field_quality") or {}
+        schema_check = result.get("schema_check") or {}
+        # build_readable_report_markdown in this module accepts the missing
+        # fields under either "missing_fields" or "缺失字段"; use the same
+        # lookup so the verdict cannot disagree with the written report.
+        missing_fields = _first_present(schema_check, "missing_fields", "缺失字段")
+        bad = bool(missing_fields or schema_check.get("type_mismatches"))
+        bad = bad or int(mapping_summary.get("failed") or 0) > 0
+        bad = bad or int(field_quality.get("failed") or 0) > 0
+        reason = [
+            str(output_dir),
+            f"mapping_failed={mapping_summary.get('failed')}",
+            f"field_failed={field_quality.get('failed')}",
+        ]
+        if bad:
+            return EvalDetail(
+                metric=cls.__name__,
+                status=True,
+                label=[f"{cls.metric_type}.{cls.__name__}"],
+                reason=reason,
+            )
+        return EvalDetail(metric=cls.__name__, label=[QualityLabel.QUALITY_GOOD], reason=reason)
+
+
+# Script entry point: run the command-line interface when executed directly.
+if __name__ == "__main__":
+    cli()
diff --git a/docs/metrics.md b/docs/metrics.md
index 3acdf41d..a97b0a0c 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -127,6 +127,7 @@ This document provides comprehensive information about all quality metrics used
 | Type | Metric | Description | Paper Source | Evaluation Results | Examples |
 |------|--------|-------------|--------------|-------------------|----------|
 | `QUALITY_BAD_EFFECTIVENESS` | RuleMetadataSimilarity | 检查元数据字段与基准数据的相似度匹配，阈值默认为0.6 | Internal Implementation | N/A | N/A |
+| `QUALITY_BAD_EFFECTIVENESS` | RuleSciBaseMetaPaperUniqueReport, RuleSciBaseMetaEbookUniqueReport, RuleSciBaseUnionUniqueMetaDataReport, RuleSciBaseMetaPaperDataReport, RuleSciBaseMetaPatentParsedInfoReport | Validate SciBase paper unique, ebook unique, unified metadata, S3 paper-source, and patent XML parsed-field records while writing per-record reports | Internal Implementation | N/A | N/A |
 
 ### Rule-Based RESUME Quality Metrics
 
@@ -159,4 +160,3 @@ This document provides comprehensive information about all quality metrics used
 | `AgentFactCheck` | AgentFactCheck | Agent-based hallucination detection with autonomous web search | Internal Implementation | N/A | N/A |
 | `ArticleFactChecker` | ArticleFactChecker | Article-level fact checking with autonomous claims extraction and verification | Internal Implementation | N/A | N/A |
 | `LLMCustomMetric` | LLMCustomMetric | Unified metric for user customization | Internal Implementation | N/A | N/A |
-
diff --git a/setup.py b/setup.py
index 357285fc..01e747cc 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,12 @@ def _read_requirements(path):
     url="https://github.com/MigoXLab/dingo",
     packages=find_packages(),
     include_package_data=True,
+    package_data={
+        "dingo": [
+            "model/rule/scibase/assets/*.csv",
+            "model/rule/scibase/assets/*.json",
+        ],
+    },
     classifiers=[
         "Programming Language :: Python :: 3",
         "Operating System :: OS Independent",