diff --git a/.gitignore b/.gitignore index 5db5909..1a04d97 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,6 @@ docs/.doctrees/ .DS_Store Thumbs.db +# Claude Code (personal/local config) +.claude/settings.local.json + diff --git a/CHANGELOG.md b/CHANGELOG.md index e0143af..081cb3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,6 +58,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/), and this in the `README.md` Roadmap section and the `flake8 onecite tests` validation check. +### Removed +- `onecite process` no longer accepts `--google-scholar`, and + `process_references()` no longer accepts the `use_google_scholar` + parameter. Google Scholar was never consulted from the authoritative + `process` path, so the flag and parameter were no-ops there. Google + Scholar remains available as an opt-in, best-effort fallback on + `onecite suggest --google-scholar` / + `suggest_references(use_google_scholar=True)`. + ### Fixed - Corrected the benchmark Nature DQN DOI fixture from `10.1038/nature14539` to `10.1038/nature14236`, and added regression @@ -87,6 +96,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/), and this - Clarified that `onecite benchmark --json` is the deterministic offline health check, while `onecite process ...` may contact upstream APIs unless fixtures or mocks are explicitly configured. +- DOI-backed BibTeX input now keeps the canonical CrossRef/DataCite field + values instead of letting the original entry override them; original + fields still fill gaps the API leaves empty, and the existing citation + key is still preserved. +- A CrossRef 404 now always falls back to DataCite instead of only doing so + for a short hardcoded prefix list, so dataset/software/thesis DOIs + registered under other DataCite prefixes resolve. 
+- `suggest` no longer routes queries containing words such as "synthesis", + "hypothesis", or "parenthesis" to the thesis search (whole-word match for + "thesis"/"dissertation"). +- GitHub clone URLs ending in `.git` now resolve to the correct repository. +- Plain-text entry ids stay contiguous when entries are separated by more + than one blank line, and a dead PLOS article-id branch was removed from + the text parser. ## [0.1.1] - 2026-04-17 diff --git a/README.md b/README.md index 157b007..a9253c1 100644 --- a/README.md +++ b/README.md @@ -36,13 +36,13 @@ ---

- OneCite is a command-line tool and Python library for citation management. It accepts DOIs, paper titles, arXiv IDs, and mixed inputs, and outputs formatted bibliographic entries. + OneCite is a command-line tool and Python library for citation management. It resolves strong identifiers such as DOIs, PMIDs, arXiv IDs, ISBNs, GitHub URLs, and data DOIs into formatted bibliographic entries, while plain-text title searches are handled by the separate, candidate-only `onecite suggest` command.

--- - Researchers frequently accumulate reference lists in ad-hoc formats—DOIs copied from browser tabs, arXiv IDs from paper PDFs, titles typed by hand, and BibTeX fragments from various sources. Cleaning these into consistent BibTeX output is tedious and error-prone. OneCite parses raw reference text and attempts metadata lookup against configured sources such as CrossRef, PubMed, arXiv, and Semantic Scholar. The result is a reproducible processing layer that reports unresolved entries and produces auditable BibTeX where metadata can be found. + Researchers frequently accumulate reference lists in ad-hoc formats—DOIs copied from browser tabs, arXiv IDs from paper PDFs, PMIDs, ISBNs, software URLs, data DOIs, and BibTeX fragments from various sources. Cleaning these into consistent BibTeX output is tedious and error-prone. OneCite parses raw reference text and resolves strong identifiers against configured sources such as CrossRef, PubMed, arXiv, DataCite, GitHub, and Google Books. Plain-text title searches are exposed through `onecite suggest` so candidates can be reviewed without being mistaken for verified BibTeX. The result is a reproducible processing layer that reports unresolved entries and produces auditable BibTeX where metadata can be found. @@ -54,14 +54,13 @@ | Feature | Description | | ----------------------- | ------------------------------------------------------------------------------------------------------- | -| **Fuzzy Matching** | Attempt to match incomplete references against configured academic metadata sources. | +| **Candidate Suggestions** | Search incomplete plain-text references with `onecite suggest` without resolving them to BibTeX. | | **Multiple Formats** | Input `.txt`/`.bib` → Output **BibTeX**. | | **4-stage Pipeline** | A 4-stage process (clean → query → validate → format) to produce consistent output. 
| | **Field Completion** | Fill available fields returned by metadata sources, such as journal, volume, pages, authors, and abstract. | | 🎓 **7+ Citation Types** | Handles journal articles, conference papers, books, software, datasets, theses, and preprints. | | **Multi-Source Lookup** | Uses source-specific routes for CrossRef, arXiv, PubMed, Semantic Scholar, Google Books, and others. | -| **Many Identifier Types** | Accepts DOI, PMID, arXiv ID, ISBN, GitHub URL, Zenodo DOI, or plain text queries. | -| 🎛️ **Interactive Mode** | Manually select the correct entry when multiple potential matches are found. | +| **Many Identifier Types** | Resolves DOI, PMID, arXiv ID, ISBN, GitHub URL, Zenodo DOI, and DataCite DOI inputs. | | **Custom Templates** | YAML-based presets that provide a fallback BibTeX entry type when auto-detection is inconclusive. | @@ -97,9 +96,9 @@ Create a file named `references.txt` with your mixed-format references: 10.1038/nature14539 -Attention is all you need, Vaswani et al., NIPS 2017 +arXiv:1706.03762 -Goodfellow, I., Bengio, Y., & Courville, A. (2016). Deep Learning. MIT Press. +ISBN:9780262035613 https://github.com/tensorflow/tensorflow @@ -157,7 +156,7 @@ Your `results.bib` file now contains entries of different types. ```bash onecite process "10.1038/nature14539" -onecite process "Attention is all you need, Vaswani et al., NIPS 2017" +onecite suggest "Attention is all you need, Vaswani et al., NIPS 2017" echo "10.1038/nature14539" | onecite process - ``` @@ -198,16 +197,12 @@ Use OneCite directly in your Python scripts. 
```python from onecite import process_references -# A callback can be used for non-interactive selection (e.g., always choose the best match) -def auto_select_callback(candidates): - return 0 # Index of the best candidate - result = process_references( - input_content="Deep learning review\nLeCun, Bengio, Hinton\nNature 2015", + input_content="10.1038/nature14539", input_type="txt", template_name="journal_article_full", output_format="bibtex", - interactive_callback=auto_select_callback + interactive_callback=lambda candidates: -1 ) print('\n\n'.join(result['results'])) @@ -229,7 +224,7 @@ onecite process [OPTIONS] ``` **Arguments:** -- `input_file` - Input file path, `-` for stdin, or a reference string (e.g., DOI, title) +- `input_file` - Input file path, `-` for stdin, or a strong identifier/reference string **Options:** | Option | Short | Description | Default | @@ -243,7 +238,6 @@ onecite process [OPTIONS] | `--json` | | Print a stable JSON envelope instead of BibTeX text | `False` | | `--ndjson` | | Print newline-delimited JSON events for streaming automation workflows | `False` | | `--fail-on-unresolved` | | Return exit code `2` when any entry cannot be resolved | `False` | -| `--google-scholar` | | Enable Google Scholar as an additional data source (requires scholarly package) | `False` | **Examples:** ```bash @@ -253,9 +247,6 @@ onecite process references.txt -o results.bib # Process a BibTeX file with auto-detection onecite process references.bib -# Process with interactive mode -onecite process ambiguous.txt --interactive - # Use stdin echo "10.1038/nature14539" | onecite process - @@ -265,9 +256,6 @@ onecite process "10.1038/nature14539" # Process with custom template onecite process references.txt --template conference_paper -# Enable Google Scholar (requires scholarly package) -onecite process references.txt --google-scholar - # Quiet mode for scripts onecite process references.txt -o results.bib --quiet @@ -278,6 +266,28 @@ onecite process 
references.txt --json --fail-on-unresolved onecite process references.txt --ndjson ``` +### `onecite suggest` + +Search for candidate matches without producing BibTeX or returning a +validation `passed` status. + +```bash +onecite suggest "Attention is all you need, Vaswani et al., NIPS 2017" --json +``` + +**Optional Google Scholar fallback.** `suggest` accepts `--google-scholar` +(requires the optional `scholarly` package: `pip install onecite[scholar]`). +It is consulted only as a best-effort fallback when CrossRef and Semantic +Scholar return nothing. Because it scrapes a service with no public API, it +is **off by default, may be rate-limited or blocked by a CAPTCHA, and is not +guaranteed to be reproducible** — it is exposed only on `suggest` (candidates +for human review), never on `process` (authoritative output). + +```bash +pip install onecite[scholar] +onecite suggest "some obscure title" --google-scholar +``` + ### `onecite --version` Display the installed OneCite version. diff --git a/docs/api/core.rst b/docs/api/core.rst index 035dc57..e1ff083 100644 --- a/docs/api/core.rst +++ b/docs/api/core.rst @@ -19,7 +19,6 @@ The primary function for processing citations. template_name: str, output_format: str, interactive_callback: Callable[[List[Dict]], int], - use_google_scholar: bool = False, ) -> Dict[str, Any] **Parameters:** @@ -28,8 +27,7 @@ The primary function for processing citations. - ``input_type`` (str): Type of input - ``"txt"`` or ``"bib"`` (required) - ``template_name`` (str): Template name to use (e.g., ``"journal_article_full"``) (required) - ``output_format`` (str): Output format - currently only ``"bibtex"`` is supported (required) -- ``interactive_callback`` (Callable): Function to handle ambiguous matches. Takes a list of candidate dicts and returns the selected index (0-based), or -1 to skip (required) -- ``use_google_scholar`` (bool): Enable Google Scholar as an additional data source. Requires the optional ``scholarly`` package. 
Default is ``False``. +- ``interactive_callback`` (Callable): Compatibility callback; plain-text candidate search is handled by ``suggest_references`` (required) **Returns:** @@ -53,7 +51,7 @@ A dictionary with keys: input_type="txt", template_name="journal_article_full", output_format="bibtex", - interactive_callback=lambda candidates: 0 # Auto-select first match + interactive_callback=lambda candidates: -1 ) # Access results @@ -216,7 +214,7 @@ For typical usage, ``process_references()`` is simpler. PipelineController expos input_type="txt", template_name="journal_article_full", output_format="bibtex", - interactive_callback=lambda candidates: 0 + interactive_callback=lambda candidates: -1 ) print(result['results']) diff --git a/docs/api/pipeline.rst b/docs/api/pipeline.rst index 5434906..6b4e90a 100644 --- a/docs/api/pipeline.rst +++ b/docs/api/pipeline.rst @@ -72,48 +72,43 @@ parsing fails. Stage 2: Identify (``IdentifierModule``) ---------------------------------------- -**Purpose:** resolve each ``RawEntry`` against academic data sources and -produce an ``IdentifiedEntry`` with a DOI (when possible) plus basic -metadata. +**Purpose:** resolve each ``RawEntry`` with strong identifiers into an +``IdentifiedEntry`` with a DOI / arXiv ID / URL plus basic metadata. +Plain-text title searches are not resolved by the processing pipeline; use +the suggestion workflow for candidate search. -**Input:** ``List[RawEntry]`` and an ``interactive_callback`` that picks -from candidate lists when confidence is medium. +**Input:** ``List[RawEntry]`` and an ``interactive_callback`` kept for API +compatibility. **Output:** ``List[IdentifiedEntry]``. 
**Data sources actually queried by the code:** -- CrossRef (DOI-based and fuzzy search) -- Semantic Scholar (keyword search) +- CrossRef (DOI-based lookup; candidate search in suggest mode) +- Semantic Scholar (candidate search in suggest mode) - arXiv (via feedparser) - PubMed (biomedical, queried when strong cues are present) - DataCite / Zenodo (datasets) - Google Books (books — triggered by ISBN or publisher cues) - external providerRE / BASE (theses) - GitHub (software repositories) -- Google Scholar (optional, disabled by default; opt-in via - ``--google-scholar`` or ``use_google_scholar=True`` and requires the - ``scholarly`` package) +- Google Scholar (optional, ``suggest``-only best-effort fallback, disabled by + default; opt-in via ``suggest --google-scholar`` or + ``suggest_references(use_google_scholar=True)`` and requires the + ``scholarly`` package; never used by ``process``) There is **no runtime routing based on filename** and no fixed priority -for "medical", "CS" or "general" queries. Signal-based heuristics -inside ``_fuzzy_search`` decide when to *additionally* query PubMed, -Google Books, external providerRE/BASE, etc., but CrossRef and Semantic Scholar are -always consulted for text queries. +for "medical", "CS" or "general" queries. Signal-based heuristics in +suggestion mode decide when to *additionally* query PubMed, Google Books, +external providerRE/BASE, etc. Text-only entries in process mode are +reported as unresolved instead of being guessed. **Confidence model:** -After all sources have returned candidates, ``_score_candidates`` assigns -each candidate a ``match_score`` (0–100) based on title / author / -year / venue similarity to the query. 
The decision logic in -``_fuzzy_search`` then chooses one of three paths: - -- ``match_score >= 80`` and a clear best candidate → auto-adopt -- ``70 <= match_score < 80`` → call the ``interactive_callback`` with up - to 5 candidates; fall back to the top candidate if the user skips and - the score is still ≥ 75 -- ``match_score >= 50`` and a title is present → adopt cautiously -- otherwise → mark the entry as ``identification_failed`` +After all suggestion sources have returned candidates, ``_score_candidates`` +assigns each candidate a ``match_score`` (0–100) based on title / author / +year / venue similarity to the query. Scores are returned for human or +downstream review; they are not treated as validation proof. Fallback paths never fabricate data: an entry that cannot be resolved is marked ``identification_failed`` rather than filled with invented @@ -219,7 +214,7 @@ high-level ``process_references`` function: input_type="txt", template_name="journal_article_full", output_format="bibtex", - interactive_callback=lambda candidates: 0, # auto-pick first + interactive_callback=lambda candidates: -1 ) print('\n\n'.join(result['results'])) diff --git a/docs/basic_usage.rst b/docs/basic_usage.rst index db35886..47dcc05 100644 --- a/docs/basic_usage.rst +++ b/docs/basic_usage.rst @@ -17,9 +17,9 @@ A text file where each reference is separated by a **blank line**:: 10.1038/nature14539 - Vaswani et al., 2017, Attention is all you need + arXiv:1706.03762 - Smith (2020) Neural Architecture Search + ISBN:9780262035613 .. note:: @@ -115,18 +115,23 @@ line followed by result and failure events:: onecite process input.txt --ndjson -**Google Scholar (--google-scholar)** +**Google Scholar (suggest only, --google-scholar)** -Enable Google Scholar as an additional data source (requires the optional ``scholarly`` package):: +A best-effort fallback for the ``suggest`` command only (requires the optional +``scholarly`` package: ``pip install onecite[scholar]``). 
It is consulted only +when CrossRef and Semantic Scholar return nothing. Because it scrapes a service +with no public API, it is off by default, may be blocked by a CAPTCHA, and is +not guaranteed to be reproducible. It is never used by ``process``, whose output +is authoritative:: - onecite process input.txt --google-scholar + onecite suggest input.txt --google-scholar **Direct String Input** Pass a reference string directly instead of a file:: onecite process "10.1038/nature14539" - onecite process "Attention is all you need, Vaswani et al., NIPS 2017" + onecite suggest "Attention is all you need, Vaswani et al., NIPS 2017" **Stdin Input** diff --git a/docs/changelog.rst b/docs/changelog.rst index 969bc2b..c0d22c7 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -40,6 +40,15 @@ Changed live checks are explicitly marked with ``pytest.mark.live`` so the default suite is deterministic and offline. +Removed +~~~~~~~ + +- ``onecite process`` no longer accepts ``--google-scholar``, and + ``process_references()`` no longer accepts the ``use_google_scholar`` + parameter (both were no-ops on the authoritative ``process`` path). + Google Scholar remains an opt-in, best-effort fallback on + ``onecite suggest --google-scholar``. + Fixed ~~~~~ @@ -60,6 +69,16 @@ Fixed distribution artifacts. - Added benchmark and doctor checks to the GitHub Actions test workflow. +- DOI-backed BibTeX input keeps canonical CrossRef/DataCite fields + instead of letting the original entry override them; original fields + still fill gaps and the existing citation key is preserved. +- A CrossRef 404 always falls back to DataCite instead of only doing so + for a short hardcoded prefix list. +- ``suggest`` no longer routes queries containing words such as + "synthesis" or "hypothesis" to the thesis search. +- GitHub clone URLs ending in ``.git`` resolve to the correct repository. 
+- Plain-text entry ids stay contiguous across multi-blank-line gaps, and + a dead PLOS article-id branch was removed from the text parser. [0.1.1] - 2026-04-17 --------------------- diff --git a/docs/cli_contracts.rst b/docs/cli_contracts.rst index e70ea9d..cbcbbba 100644 --- a/docs/cli_contracts.rst +++ b/docs/cli_contracts.rst @@ -49,6 +49,31 @@ Hard processing errors in ``--ndjson`` mode emit a ``summary`` event with ``status: "failed"`` followed by one ``failure`` event, then exit with code ``1``. +Suggest JSON +------------ + +``onecite suggest INPUT --json`` writes one JSON object to stdout unless +``--output`` is used. This command searches candidate metadata sources but +does not resolve candidates into BibTeX. Its successful status is +``"completed"``, not ``"passed"``, so suggestion output is not confused with +validated citation output. + +The envelope contains: + +- ``schema_version``: currently ``"1.0"``. +- ``tool`` and ``command``: ``"onecite"`` and ``"suggest"``. +- ``status``: ``"completed"`` when candidate search ran, ``"failed"`` on a + hard command error. +- ``summary``: total entries, entries with candidates, and entries without + candidates. +- ``options``: input type, per-entry limit, and whether Google Scholar was + enabled. +- ``suggestions``: one item per input entry with raw text, query string, + status, and a candidate list. + +The current top-level contract is exactly ``schema_version``, ``tool``, +``command``, ``status``, ``summary``, ``options``, and ``suggestions``. + Benchmark JSON -------------- diff --git a/docs/examples/references.txt b/docs/examples/references.txt index 5210c17..888af24 100644 --- a/docs/examples/references.txt +++ b/docs/examples/references.txt @@ -1,8 +1,8 @@ 10.1038/nature14539 -Attention is all you need, Vaswani et al., NIPS 2017 +arXiv:1706.03762 -Goodfellow, I., Bengio, Y., & Courville, A. (2016). Deep Learning. MIT Press. 
+ISBN:9780262035613 https://github.com/tensorflow/tensorflow diff --git a/docs/faq.rst b/docs/faq.rst index 1c3766d..4796ffb 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -67,7 +67,8 @@ OneCite accepts: - **Plain text** (`.txt`) - One reference per line or separated by blank lines - **BibTeX** (`.bib`) - Standard BibTeX format -- **Direct identifiers** - DOI, arXiv ID, PMID, ISBN, GitHub URLs, or plain text queries +- **Direct identifiers** - DOI, arXiv ID, PMID, ISBN, GitHub URLs, Zenodo DOI, or DataCite DOI +- **Candidate suggestions** - Use ``onecite suggest`` for plain-text title queries Can I use OneCite with Overleaf? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -127,7 +128,7 @@ OneCite integrates with: - Google Books (book metadata) - external providerRE / BASE (theses & grey literature) - GitHub (software repositories) -- Google Scholar (optional, off by default) +- Google Scholar (optional ``suggest``-only best-effort fallback, off by default) Which data source is best for my field? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/index.rst b/docs/index.rst index ecdeaf9..bfa7b75 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,7 +13,7 @@ OneCite - Citation & Academic Reference Toolkit :target: https://github.com/HzaCode/OneCite/blob/main/LICENSE :alt: License -**OneCite** is a command-line tool and Python library for citation management. It accepts DOIs, paper titles, arXiv IDs, and mixed inputs, and outputs formatted bibliographic entries. +**OneCite** is a command-line tool and Python library for citation management. It resolves strong identifiers such as DOIs, PMIDs, arXiv IDs, ISBNs, GitHub URLs, and data DOIs into formatted bibliographic entries. Plain-text title searches are handled by ``onecite suggest`` as candidate suggestions. .. 
toctree:: :maxdepth: 2 @@ -54,12 +54,12 @@ OneCite - Citation & Academic Reference Toolkit Key Features ============ -- **Fuzzy Matching** - Match references against multiple academic databases +- **Candidate Suggestions** - Search incomplete references with ``onecite suggest`` without resolving them to BibTeX - **BibTeX Output** - Standards-compliant ``.bib`` files rendered with ``bibtexparser`` - **4-stage Pipeline** - 4-stage process for consistent output - **Field Completion** - Enrich entries with missing metadata - 🎓 **7+ Citation Types** - Handles journal articles, conference papers, books, software, datasets, theses, and preprints -- **Many Identifier Types** - DOI, PMID, arXiv ID, ISBN, GitHub URLs, Zenodo DOI, or plain text queries +- **Many Identifier Types** - DOI, PMID, arXiv ID, ISBN, GitHub URLs, Zenodo DOI, and DataCite DOI Data Sources ============ @@ -75,7 +75,7 @@ OneCite integrates with multiple authoritative academic data sources: - `Google Books `_ - Book metadata - `external providerRE `_ / `BASE `_ - Theses & grey literature - `GitHub `_ - Software repositories -- Google Scholar (optional, via the ``scholarly`` package) +- Google Scholar (optional ``suggest``-only best-effort fallback, via the ``scholarly`` package) Quick Start =========== @@ -88,9 +88,9 @@ Create a ``references.txt`` file:: 10.1038/nature14539 - Attention is all you need, Vaswani et al., NIPS 2017 + arXiv:1706.03762 - Goodfellow, I., Bengio, Y., & Courville, A. (2016). Deep Learning. MIT Press. 
+ ISBN:9780262035613 Run OneCite:: diff --git a/docs/python_api.rst b/docs/python_api.rst index 334f51d..4eec861 100644 --- a/docs/python_api.rst +++ b/docs/python_api.rst @@ -19,7 +19,7 @@ Simple Citation Processing input_type="txt", template_name="journal_article_full", output_format="bibtex", - interactive_callback=lambda candidates: 0 # Auto-select first match + interactive_callback=lambda candidates: -1 ) # Print results @@ -65,9 +65,9 @@ Plain Text Input txt_content = """ 10.1038/nature14539 - Vaswani et al., 2017, Attention is all you need + arXiv:1706.03762 - Smith (2020) Neural Architecture Search + ISBN:9780262035613 """ result = process_references( @@ -128,22 +128,17 @@ BibTeX-to-APA/MLA tools like pandoc or citeproc-py. Interactive Selection with Callbacks ------------------------------------- -For handling ambiguous references programmatically, use a callback function: +For plain-text title searches, use the suggestion API instead of resolving +directly to BibTeX: :: - from onecite import process_references - - def pick_first(candidates): - """Select the first candidate.""" - return 0 + from onecite import suggest_references - result = process_references( + result = suggest_references( input_content="Deep learning Hinton", input_type="txt", - template_name="journal_article_full", - output_format="bibtex", - interactive_callback=pick_first + limit=5, ) - print('\n\n'.join(result['results'])) + print(result['suggestions']) @@ -341,7 +336,7 @@ Complete Example input_type="txt", template_name="journal_article_full", output_format="bibtex", - interactive_callback=lambda candidates: 0 # Auto-select first match + interactive_callback=lambda candidates: -1 ) # Check results diff --git a/docs/quick_start.rst b/docs/quick_start.rst index 4b078eb..f286143 100644 --- a/docs/quick_start.rst +++ b/docs/quick_start.rst @@ -21,9 +21,9 @@ Example ``references.txt``:: 10.1038/nature14539 - Attention is all you need, Vaswani et al., NIPS 2017 + arXiv:1706.03762 - Goodfellow, I., Bengio, Y., & Courville, 
A. (2016). Deep Learning. MIT Press. + ISBN:9780262035613 https://github.com/tensorflow/tensorflow @@ -99,11 +99,11 @@ You can also use OneCite directly in your Python scripts:: from onecite import process_references result = process_references( - input_content="Deep learning review\nLeCun, Bengio, Hinton\nNature 2015", + input_content="10.1038/nature14539", input_type="txt", template_name="journal_article_full", output_format="bibtex", - interactive_callback=lambda candidates: 0 # Auto-select first match + interactive_callback=lambda candidates: -1 ) # Print formatted citations diff --git a/onecite/__init__.py b/onecite/__init__.py index e378b7a..f8cff3f 100644 --- a/onecite/__init__.py +++ b/onecite/__init__.py @@ -20,6 +20,7 @@ TemplateLoader, PipelineController, process_references, + suggest_references, ) from .benchmark import ( @@ -45,6 +46,7 @@ "PipelineController", # Main API "process_references", + "suggest_references", # Benchmarking "format_benchmark_text", "load_benchmark_suite", diff --git a/onecite/benchmark.py b/onecite/benchmark.py index 4cc1552..33b83fa 100644 --- a/onecite/benchmark.py +++ b/onecite/benchmark.py @@ -198,7 +198,6 @@ def _run_case(case: Dict[str, Any], process_fn: BenchmarkProcess) -> Dict[str, A template_name=case["template"], output_format=case["output_format"], interactive_callback=lambda _candidates: -1, - use_google_scholar=False, ) output = "\n\n".join(result.get("results", [])) _check_report_expectations(case, result.get("report", {}), failures) diff --git a/onecite/cli.py b/onecite/cli.py index ffc8d61..3a59585 100644 --- a/onecite/cli.py +++ b/onecite/cli.py @@ -18,7 +18,7 @@ from .benchmark import format_benchmark_text, load_benchmark_suite, run_benchmark from .benchmarks.offline import offline_requests_get -from .core import process_references, TemplateLoader +from .core import process_references, suggest_references, TemplateLoader from .exceptions import OneCiteError from . 
import __version__ @@ -31,6 +31,8 @@ def main() -> int: try: if args.command == "process": return process_command(args) + elif args.command in ("suggest", "sugget"): + return suggest_command(args) elif args.command == "benchmark": return benchmark_command(args) elif args.command == "doctor": @@ -65,6 +67,7 @@ def create_parser() -> argparse.ArgumentParser: onecite process references.txt --interactive --output results.bib onecite process "10.1038/nature14539" onecite process "attention is all you need, Vaswani et al., NIPS 2017" + onecite suggest "attention is all you need, Vaswani et al., NIPS 2017" --json onecite benchmark --json onecite doctor --json onecite templates @@ -127,11 +130,39 @@ def create_parser() -> argparse.ArgumentParser: action="store_true", help="Return exit code 2 when one or more entries could not be resolved", ) - process_parser.add_argument( + + suggest_parser = subparsers.add_parser( + "suggest", + aliases=["sugget"], + help="Suggest candidate matches without resolving them to BibTeX", + ) + suggest_parser.add_argument( + "input_file", help='Input file, "-" for stdin, or a reference string to search' + ) + suggest_parser.add_argument( + "--input-type", choices=["txt", "bib"], default="txt", help="Input type (default: txt)" + ) + suggest_parser.add_argument( + "--limit", + type=int, + default=5, + help="Maximum candidates per entry (default: 5)", + ) + suggest_parser.add_argument("--output", "-o", help="Output file (default: stdout)") + suggest_parser.add_argument( + "--quiet", "-q", action="store_true", help="Suppress saved-file status output" + ) + suggest_parser.add_argument( + "--json", + action="store_true", + dest="as_json", + help="Print a stable machine-readable JSON envelope", + ) + suggest_parser.add_argument( "--google-scholar", action="store_true", default=False, - help="Enable Google Scholar as an additional data source (requires scholarly package)", + help="Enable Google Scholar as an additional suggestion source", ) 
benchmark_parser = subparsers.add_parser( @@ -210,7 +241,6 @@ def _build_process_report(args: "argparse.Namespace", result: Dict[str, Any]) -> "template": args.template, "output_format": args.output_format, "interactive": bool(args.interactive), - "google_scholar": bool(args.google_scholar), "fail_on_unresolved": bool(args.fail_on_unresolved), }, "failed_entries": failed_entries, @@ -236,7 +266,6 @@ def _build_process_error_report(args: "argparse.Namespace", error: Exception) -> "template": args.template, "output_format": args.output_format, "interactive": bool(args.interactive), - "google_scholar": bool(args.google_scholar), "fail_on_unresolved": bool(args.fail_on_unresolved), }, "failed_entries": [{"id": None, "error": str(error)}], @@ -244,6 +273,79 @@ def _build_process_error_report(args: "argparse.Namespace", error: Exception) -> } +def _build_suggest_report(args: "argparse.Namespace", result: Dict[str, Any]) -> Dict[str, Any]: + """Build the stable suggest report used by JSON mode.""" + report = result.get("report", {}) + return { + "schema_version": "1.0", + "tool": "onecite", + "command": "suggest", + "status": "completed", + "summary": { + "total": int(report.get("total", 0)), + "with_candidates": int(report.get("with_candidates", 0)), + "without_candidates": int(report.get("without_candidates", 0)), + }, + "options": { + "input_type": args.input_type, + "limit": int(args.limit), + "google_scholar": bool(args.google_scholar), + }, + "suggestions": list(result.get("suggestions", [])), + } + + +def _build_suggest_error_report(args: "argparse.Namespace", error: Exception) -> Dict[str, Any]: + """Build a machine-readable suggest report for hard failures.""" + return { + "schema_version": "1.0", + "tool": "onecite", + "command": "suggest", + "status": "failed", + "summary": { + "total": 1, + "with_candidates": 0, + "without_candidates": 1, + }, + "options": { + "input_type": args.input_type, + "limit": int(args.limit), + "google_scholar": 
bool(args.google_scholar), + }, + "suggestions": [ + { + "id": None, + "raw_text": "", + "query_string": "", + "status": "error", + "error": str(error), + "candidates": [], + } + ], + } + + +def _format_suggest_text(report: Dict[str, Any]) -> str: + """Format a suggest report for humans.""" + lines = [] + for suggestion in report["suggestions"]: + heading = suggestion.get("query_string") or suggestion.get("raw_text") or "" + lines.append(f"Entry {suggestion.get('id')}: {heading}") + candidates = suggestion.get("candidates", []) + if not candidates: + lines.append(" No candidates found.") + continue + for index, candidate in enumerate(candidates, start=1): + score = candidate.get("match_score", "n/a") + source = candidate.get("source", "unknown") + title = candidate.get("title", "Untitled") + identifier = candidate.get("doi") or candidate.get("arxiv_id") or candidate.get("url") or "" + suffix = f" [{identifier}]" if identifier else "" + lines.append(f" {index}. {title}{suffix}") + lines.append(f" source={source} score={score}") + return "\n".join(lines) + + def _format_process_ndjson(report: Dict[str, Any]) -> str: """Format a process report as newline-delimited JSON events.""" events = [ @@ -459,6 +561,54 @@ def templates_command(args: "argparse.Namespace") -> int: return 0 +def suggest_command(args: "argparse.Namespace") -> int: + """Run the ``onecite suggest`` subcommand.""" + try: + input_content = _read_input_content(args) + + if args.quiet or args.as_json: + import logging + + logging.basicConfig(level=logging.CRITICAL) + for logger_name in ["onecite", "scholarly", "httpx", "fake_useragent"]: + logging.getLogger(logger_name).setLevel(logging.CRITICAL) + + with _offline_source_context(): + result = suggest_references( + input_content=input_content, + input_type=args.input_type, + limit=args.limit, + use_google_scholar=args.google_scholar, + ) + + suggest_report = _build_suggest_report(args, result) + output_content = ( + json.dumps(suggest_report, indent=2) + 
if args.as_json + else _format_suggest_text(suggest_report) + ) + + if args.output: + with open(args.output, "w", encoding="utf-8") as f: + f.write(output_content) + if args.as_json: + f.write("\n") + if not args.quiet: + status_stream = sys.stderr if args.as_json else sys.stdout + print(f"Suggestions saved to: {args.output}", file=status_stream) + else: + print(output_content) + + return 0 + + except Exception as e: + if args.as_json: + print(json.dumps(_build_suggest_error_report(args, e), indent=2)) + else: + print(f"Suggestion failed: {e}", file=sys.stderr) + return 1 + + def process_command(args: "argparse.Namespace") -> int: """Run the ``onecite process`` subcommand. @@ -522,7 +672,6 @@ def interactive_callback(candidates: List[Dict]) -> int: template_name=args.template, output_format=args.output_format, interactive_callback=interactive_callback, - use_google_scholar=args.google_scholar, ) process_report = _build_process_report(args, result) diff --git a/onecite/core.py b/onecite/core.py index 68e6263..6c49581 100644 --- a/onecite/core.py +++ b/onecite/core.py @@ -233,6 +233,21 @@ def process( self.logger.error(f"Processing pipeline execution failed: {str(e)}") raise + def suggest(self, input_content: str, input_type: str, limit: int = 5) -> Dict[str, Any]: + """Return candidate matches without producing resolved BibTeX.""" + self.logger.info("Starting OneCite suggestion pipeline") + raw_entries = self.parser.parse(input_content, input_type) + suggestions = [self.identifier.suggest(entry, limit=limit) for entry in raw_entries] + with_candidates = sum(1 for item in suggestions if item["candidates"]) + return { + "suggestions": suggestions, + "report": { + "total": len(suggestions), + "with_candidates": with_candidates, + "without_candidates": len(suggestions) - with_candidates, + }, + } + def process_references( input_content: str, @@ -240,7 +255,6 @@ def process_references( template_name: str, output_format: str, interactive_callback: Callable[[List[Dict]], 
int], - use_google_scholar: bool = False, ) -> Dict[str, Any]: """Process references and return formatted citations with a report. @@ -272,7 +286,20 @@ def process_references( """ if not input_content or not input_content.strip(): raise ValidationError("input_content must not be empty.") - pipeline = PipelineController(use_google_scholar=use_google_scholar) + pipeline = PipelineController() return pipeline.process( input_content, input_type, template_name, output_format, interactive_callback ) + + +def suggest_references( + input_content: str, + input_type: str = "txt", + limit: int = 5, + use_google_scholar: bool = False, +) -> Dict[str, Any]: + """Return candidate citation matches without resolving to BibTeX.""" + if not input_content or not input_content.strip(): + raise ValidationError("input_content must not be empty.") + pipeline = PipelineController(use_google_scholar=use_google_scholar) + return pipeline.suggest(input_content, input_type, limit=limit) diff --git a/onecite/pipeline/enricher.py b/onecite/pipeline/enricher.py index 3d75f02..0b64b10 100644 --- a/onecite/pipeline/enricher.py +++ b/onecite/pipeline/enricher.py @@ -147,8 +147,16 @@ def _enrich_single_entry( "bib_data": {}, } - # Generate BibTeX key - bib_key = self._generate_bibtex_key(base_record) + original_id = ( + raw_entry.get("original_entry", {}).get("ID") + if raw_entry and raw_entry.get("original_entry") + else None + ) + bib_key = ( + self._reserve_bibtex_key(str(original_id).strip()) + if original_id and str(original_id).strip() + else self._generate_bibtex_key(base_record) + ) # Fill in missing fields per template. 
We only attempt the # abstract fallback cascade (Semantic Scholar by DOI, then PubMed @@ -192,14 +200,27 @@ def _enrich_single_entry( api_value = completed_data.get(field) original_value = original[field] - # Preserve original if it exists and is not empty - if original_value and str(original_value).strip(): - # Log when we're overriding API data - if api_value and api_value != original_value: + if not (original_value and str(original_value).strip()): + continue + + # When the entry is DOI-backed, the resolved + # CrossRef/DataCite metadata is authoritative: never let + # an original field overwrite a canonical value, only + # fall back to the original to fill a gap the API left + # empty. Without a DOI we have no authority to override + # the user's own text, so the original still wins. + if raw_has_doi and api_value and str(api_value).strip(): + if api_value != original_value: self.logger.info( - f"Entry {identified_entry['id']}: Preserving original {field}='{original_value}' instead of API value '{api_value}'" + f"Entry {identified_entry['id']}: Using canonical {field}='{api_value}' over original '{original_value}' (DOI-backed)" ) - completed_data[field] = original_value + continue + + if api_value and api_value != original_value: + self.logger.info( + f"Entry {identified_entry['id']}: Preserving original {field}='{original_value}' instead of API value '{api_value}'" + ) + completed_data[field] = original_value else: self.logger.warning( f"Entry {identified_entry['id']}: No original_entry available in raw_entry - raw_entry={raw_entry is not None}" @@ -246,10 +267,15 @@ def _enrich_single_entry( elif is_book_type: completed_data["ENTRYTYPE"] = "book" elif ( - metadata.get("type") == "conference" + metadata.get("type") in ("conference", "proceedings-article") + or base_record.get("type") in ("conference", "proceedings-article") + or completed_data.get("type") in ("conference", "proceedings-article") + or completed_data.get("booktitle") or "conference" in 
completed_data.get("journal", "").lower() ): completed_data["ENTRYTYPE"] = "inproceedings" + if not completed_data.get("booktitle") and completed_data.get("journal"): + completed_data["booktitle"] = completed_data.pop("journal") else: completed_data["ENTRYTYPE"] = template.get("entry_type", "@article").lstrip("@") @@ -506,7 +532,7 @@ def _convert_search_metadata(self, metadata: Dict) -> Optional[Dict]: result = { "title": metadata.get("title", ""), "author": formatted_authors, - "booktitle": journal, + "booktitle": metadata.get("booktitle") or journal, "year": str(metadata.get("year", "")), } else: @@ -596,7 +622,10 @@ def _generate_bibtex_key(self, metadata: Dict) -> str: first_word = re.sub(r"[^\w]", "", title_words[0]) key_parts.append(first_word) - base_key = "".join(key_parts) or "unknown" + return self._reserve_bibtex_key("".join(key_parts) or "unknown") + + def _reserve_bibtex_key(self, base_key: str) -> str: + """Reserve a BibTeX key, adding a suffix if it was already used.""" key = base_key suffix = ord("a") while key in self._used_keys: diff --git a/onecite/pipeline/identifier.py b/onecite/pipeline/identifier.py index e229fbb..e982287 100644 --- a/onecite/pipeline/identifier.py +++ b/onecite/pipeline/identifier.py @@ -41,6 +41,11 @@ def __init__(self, use_google_scholar: bool = False): self.semantic_scholar_base = "https://api.semanticscholar.org/graph/v1" self.pubmed_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" self.datacite_base = "https://api.datacite.org" + self._crossref_headers = { + "Accept": "application/json", + "User-Agent": "OneCite/0.1.1 (https://github.com/HzaCode/OneCite; mailto:onecite@users.noreply.github.com)", + } + self._crossref_mailto = "onecite@users.noreply.github.com" def identify( self, raw_entries: List[RawEntry], interactive_callback: Callable[[List[Dict]], int] @@ -189,7 +194,42 @@ def _identify_single_entry( # Fuzzy search if raw_entry.get("query_string"): - return self._fuzzy_search(raw_entry, interactive_callback) 
+ query_string = raw_entry["query_string"].strip() + + pmid_match = re.match(r"^(PMID:?\s*)?(\d{7,8})$", query_string, re.IGNORECASE) + if pmid_match: + pmid = pmid_match.group(2) + pubmed_result = self._search_pubmed_by_id(pmid) + if pubmed_result: + return { + "id": raw_entry["id"], + "raw_text": raw_entry["raw_text"], + "doi": pubmed_result.get("doi"), + "arxiv_id": None, + "url": pubmed_result.get("url"), + "metadata": pubmed_result, + "status": "identified", + } + + if re.search(r"isbn[:\s]*[\d\-xX]{10,17}", query_string, re.IGNORECASE): + books_results = self._search_google_books(query_string) + if books_results: + book_result = books_results[0] + return { + "id": raw_entry["id"], + "raw_text": raw_entry["raw_text"], + "doi": book_result.get("doi"), + "arxiv_id": None, + "url": book_result.get("url"), + "metadata": book_result, + "status": "identified", + } + + self.logger.info( + "Entry %s has no strong identifier; use `onecite suggest` for candidates.", + raw_entry["id"], + ) + return identified_entry self.logger.warning(f"Entry {raw_entry['id']} identification failed") return identified_entry @@ -199,31 +239,13 @@ def _validate_doi(self, doi: str) -> bool: doi_pattern = r"^10\.\d{4,}/.+" return bool(re.match(doi_pattern, doi)) - def _is_datacite_doi(self, doi: str) -> bool: - """Check if DOI is registered with DataCite (not CrossRef).""" - datacite_prefixes = [ - "10.5281/", # Zenodo - "10.6084/", # Figshare - "10.5061/", # Dryad - "10.6078/", # DataONE - "10.7910/", # DVN/Dataverse - "10.13003/", # RePEc - "10.14291/", # UBC Dataverse - "10.5683/", # Scholars Portal - "10.20382/", # University of Manitoba Dataverse - "10.5680/", # University of Sheffield - "10.25739/", # Griffith University - ] - return any(doi.startswith(prefix) for prefix in datacite_prefixes) - def _verify_doi_and_get_metadata(self, doi: str) -> Optional[Dict]: """Verify DOI exists in Crossref or DataCite and get real metadata for comparison.""" # Try CrossRef first (covers most 
academic journals/papers) try: url = f"{self.crossref_base_url}/{doi}" - headers = {"Accept": "application/json"} - - response = requests.get(url, headers=headers, timeout=10) + params = {"mailto": self._crossref_mailto} + response = requests.get(url, headers=self._crossref_headers, params=params, timeout=10) response.raise_for_status() data = response.json() @@ -255,12 +277,15 @@ def _verify_doi_and_get_metadata(self, doi: str) -> Optional[Dict]: except requests.exceptions.HTTPError as e: if e.response.status_code == 404: self.logger.warning(f"DOI {doi} not found in CrossRef (404)") - # Try DataCite for dataset/software DOIs - if self._is_datacite_doi(doi): - self.logger.info(f"DOI {doi} appears to be DataCite, trying DataCite API...") - datacite_result = self._query_datacite(doi) - if datacite_result: - return datacite_result + # A CrossRef 404 only means the DOI is not registered with + # CrossRef. Many valid DOIs (datasets, software, theses, + # preprints) are DataCite-registered under prefixes far beyond + # any short hardcoded list, so always fall back to DataCite + # rather than guessing eligibility from the prefix. + self.logger.info(f"DOI {doi} not in CrossRef, trying DataCite API...") + datacite_result = self._query_datacite(doi) + if datacite_result: + return datacite_result return None else: self.logger.error(f"HTTP error verifying DOI {doi}: {str(e)}") @@ -279,7 +304,10 @@ def _extract_github_info(self, text: str) -> Optional[Dict]: if match: owner = match.group(1) repo = match.group(2) - # Remove any trailing punctuation or special chars + # Drop a trailing ".git" (clone URLs) and any trailing + # punctuation so the API call targets the real repo + # (e.g. ".../repo.git" -> "repo"). 
+ repo = re.sub(r"\.git$", "", repo) repo = re.sub(r"[^a-zA-Z0-9_.-].*$", "", repo) url = f"{self.github_api_base}/repos/{owner}/{repo}" @@ -1218,91 +1246,7 @@ def _fuzzy_search( ) -> IdentifiedEntry: """Perform fuzzy search using simplified routing: always query core sources, conditionally append specialized sources.""" query_string = raw_entry["query_string"] - query_lower = query_string.lower() - - candidates = [] - - # === Core sources: always query === - # CrossRef covers most academic papers; Semantic Scholar adds citation metadata - self.logger.info("Querying core sources: CrossRef + Semantic Scholar") - crossref_results = self._search_crossref(query_string) - candidates.extend(crossref_results) - - semantic_results = self._search_semantic_scholar(query_string) - candidates.extend(semantic_results) - - # === Conditional specialized sources === - - # 1. PMID pattern detected - pmid_match = re.match(r"^(PMID:?\s*)?(\d{7,8})$", query_string.strip()) - if pmid_match: - pmid = pmid_match.group(2) - self.logger.info(f"Detected PubMed ID pattern: {pmid}, querying PubMed") - pubmed_result = self._search_pubmed_by_id(pmid) - if pubmed_result: - # For PMID-only queries (no other text), return directly - # This handles cases where the query is just "PMID:12345678" - text_without_pmid = ( - query_string.replace(f"PMID:{pmid}", "").replace(pmid, "").strip() - ) - if len(text_without_pmid) < 3: # Essentially just the PMID - return { - "id": raw_entry["id"], - "raw_text": raw_entry["raw_text"], - "doi": pubmed_result.get("doi"), - "arxiv_id": None, - "url": pubmed_result.get("url"), - "metadata": pubmed_result, - "status": "identified", - } - # Otherwise, add to candidates for scoring alongside other sources - candidates.append(pubmed_result) - - # 2. 
Strong biomedical cues → also query PubMed (as additive source) - strong_medical_cues = ["pubmed", "pmid", "clinical trial", "randomized controlled"] - if any(cue in query_lower for cue in strong_medical_cues): - self.logger.info("Strong medical cues detected, querying PubMed as additive source") - pubmed_results = self._search_pubmed(query_string) - candidates.extend(pubmed_results) - - # 3. Book indicators → query Google Books - # Simplified detection: only check strongest signals - has_isbn = bool(re.search(r"isbn[:\s]*[\d\-xX]{10,17}", query_lower, re.IGNORECASE)) - has_edition = bool(re.search(r"\b\d+(?:st|nd|rd|th)?\s+ed\.?\b", query_lower)) - has_book_publisher = any( - pub in query_lower - for pub in ["wiley", "o'reilly", "springer", "cambridge press", "mit press"] - ) - - if has_isbn or has_edition or has_book_publisher: - self.logger.info( - f"Book indicators detected (ISBN={has_isbn}, edition={has_edition}, publisher={has_book_publisher}), querying Google Books" - ) - books_results = self._search_google_books(query_string) - candidates.extend(books_results) - - # 4. Thesis indicators → query external providerRE/BASE - thesis_keywords = [ - "dissertation", - "phd thesis", - "master thesis", - "doctoral thesis", - "thesis", - ] - if any(kw in query_lower for kw in thesis_keywords): - self.logger.info("Thesis indicators detected, querying external providerRE/BASE") - thesis_results = self._search_openaire_for_thesis( - query_string - ) or self._search_base_for_thesis(query_string) - if thesis_results: - candidates.append(thesis_results) - - # 5. 
Google Scholar as optional fallback (if enabled and core sources returned little) - if self.use_google_scholar: - if len(crossref_results) == 0 and len(semantic_results) == 0: - self.logger.info("Core sources returned no results, trying Google Scholar") - scholar_results = self._search_google_scholar(query_string) - candidates.extend(scholar_results) + candidates = self._collect_suggestion_candidates(query_string) if not candidates: self.logger.warning(f"Entry {raw_entry['id']}: no candidate results found") @@ -1419,25 +1363,6 @@ def _fuzzy_search( "status": "identified", } - # Low confidence fallback: unified threshold - # With two-layer scoring, match_score is purely about query relevance - LOW_CONFIDENCE_THRESHOLD = 50 - if best_candidate["match_score"] >= LOW_CONFIDENCE_THRESHOLD and best_candidate.get( - "title" - ): - self.logger.info( - f"Entry {raw_entry['id']} adopting best candidate with score {best_candidate['match_score']}" - ) - return { - "id": raw_entry["id"], - "raw_text": raw_entry["raw_text"], - "doi": best_candidate.get("doi"), - "arxiv_id": best_candidate.get("arxiv_id"), - "url": best_candidate.get("url"), - "metadata": best_candidate, - "status": "identified", - } - # Debug: Log the best candidate score for analysis self.logger.info( f"Entry {raw_entry['id']} best candidate score: {best_candidate.get('match_score', 0)}" @@ -1459,6 +1384,78 @@ def _fuzzy_search( "status": "identification_failed", } + def suggest(self, raw_entry: RawEntry, limit: int = 5) -> Dict: + """Return candidate matches for a raw entry without resolving it.""" + query_string = (raw_entry.get("query_string") or raw_entry.get("raw_text") or "").strip() + suggestion = { + "id": raw_entry["id"], + "raw_text": raw_entry.get("raw_text", ""), + "query_string": query_string, + "status": "no_candidates", + "candidates": [], + } + if not query_string: + return suggestion + + candidates = self._collect_suggestion_candidates(query_string) + scored_candidates = 
self._score_candidates(candidates, query_string) if candidates else [] + suggestion["candidates"] = [ + self._public_candidate(candidate) for candidate in scored_candidates[: max(limit, 0)] + ] + if suggestion["candidates"]: + suggestion["status"] = "candidates_found" + return suggestion + + def _public_candidate(self, candidate: Dict) -> Dict: + """Remove private scorer fields from suggestion output.""" + return {key: value for key, value in candidate.items() if not key.startswith("_")} + + def _collect_suggestion_candidates(self, query_string: str) -> List[Dict]: + """Collect possible matches for suggestion-only workflows.""" + query_lower = query_string.lower() + candidates = [] + + self.logger.info("Querying suggestion sources: CrossRef + Semantic Scholar") + crossref_results = self._search_crossref(query_string) + candidates.extend(crossref_results) + + semantic_results = self._search_semantic_scholar(query_string) + candidates.extend(semantic_results) + + pmid_match = re.match(r"^(PMID:?\s*)?(\d{7,8})$", query_string.strip(), re.IGNORECASE) + if pmid_match: + pubmed_result = self._search_pubmed_by_id(pmid_match.group(2)) + if pubmed_result: + candidates.append(pubmed_result) + + strong_medical_cues = ["pubmed", "pmid", "clinical trial", "randomized controlled"] + if any(cue in query_lower for cue in strong_medical_cues): + candidates.extend(self._search_pubmed(query_string)) + + has_isbn = bool(re.search(r"isbn[:\s]*[\d\-xX]{10,17}", query_lower, re.IGNORECASE)) + has_edition = bool(re.search(r"\b\d+(?:st|nd|rd|th)?\s+ed\.?\b", query_lower)) + has_book_publisher = any( + pub in query_lower + for pub in ["wiley", "o'reilly", "springer", "cambridge press", "mit press"] + ) + if has_isbn or has_edition or has_book_publisher: + candidates.extend(self._search_google_books(query_string)) + + # Match whole words only: a bare "thesis"/"dissertation" substring + # check also fires on "hypothesis", "synthesis", "parenthesis", etc., + # routing unrelated queries to 
thesis search. + if re.search(r"\b(thesis|dissertation)\b", query_lower): + thesis_results = self._search_openaire_for_thesis( + query_string + ) or self._search_base_for_thesis(query_string) + if thesis_results: + candidates.append(thesis_results) + + if self.use_google_scholar and not crossref_results and not semantic_results: + candidates.extend(self._search_google_scholar(query_string)) + + return candidates + def _resolve_doi_via_crossref_title( self, candidate_title: str, original_query: str ) -> Optional[Dict]: diff --git a/onecite/pipeline/parser.py b/onecite/pipeline/parser.py index a5b381d..192833a 100644 --- a/onecite/pipeline/parser.py +++ b/onecite/pipeline/parser.py @@ -86,13 +86,16 @@ def _parse_text(self, text_content: str) -> List[RawEntry]: # Split text blocks using double newlines text_blocks = text_content.split("\n\n") - for i, block in enumerate(text_blocks): + for block in text_blocks: block = block.strip() if not block: continue raw_entry: RawEntry = { - "id": i, + # Running index over non-empty blocks so ids stay contiguous + # even when entries are separated by more than one blank line + # (which yields empty splits that are skipped above). + "id": len(entries), "raw_text": block, "doi": None, "url": None, @@ -102,19 +105,6 @@ def _parse_text(self, text_content: str) -> List[RawEntry]: doi_match = re.search(r"10\.\d{4,}/[^\s,}]+", block) if doi_match: raw_entry["doi"] = doi_match.group().rstrip(".,;:)]") - else: - # Try to find article ID patterns that might be convertible to DOI - # Common patterns: e0000429, PMC123456, etc. 
- article_id_match = re.search(r"\b[eE]\d{7}\b", block) # PLOS style: e0000429 - if article_id_match: - article_id = article_id_match.group() - # Note potential PLOS article ID but don't assume specific journal - # Let Cross resolve the actual DOI during identification - self.logger.info( - f"Entry {i} found potential PLOS article ID {article_id}, will attempt resolution via CrossRef" - ) - if not raw_entry["query_string"]: - raw_entry["query_string"] = block url_match = re.search(r"https?://[^\s]+", block) if url_match: diff --git a/tests/test_cli.py b/tests/test_cli.py index 3ff30b7..5e65df2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -47,6 +47,7 @@ def test_help(self): assert "process" in out assert "benchmark" in out assert "doctor" in out + assert "suggest" in out assert "templates" in out def test_version(self): @@ -77,6 +78,15 @@ def test_process_help_lists_all_options(self): ): assert opt in out, f"{opt} missing from process --help" + def test_suggest_help_lists_options(self): + code, out, err = self._run(["suggest", "--help"]) + assert code == 0 + assert err == "" + assert "--input-type" in out + assert "--limit" in out + assert "--json" in out + assert "--google-scholar" in out + def test_input_type_and_output_format_choices(self): """Verify the argparse ``choices`` show up.""" _, out, _ = self._run(["process", "--help"]) @@ -265,6 +275,24 @@ def test_process_ndjson_hard_failure_is_machine_readable(self, tmp_path): assert lines[0]["status"] == "failed" assert lines[1]["entry"]["error"] + def test_suggest_json_subprocess_envelope_has_no_passed_status(self): + code, out, err = self._run( + [ + "suggest", + "Attention is all you need, Vaswani et al., NIPS 2017", + "--json", + ] + ) + + data = json.loads(out) + assert code == 0 + assert err == "" + assert data["command"] == "suggest" + assert data["status"] == "completed" + assert "passed" not in out + assert data["summary"]["total"] == 1 + assert data["suggestions"][0]["candidates"] + # 
--------------------------------------------------------------------------- # Unit-level (no subprocess, just call process_command / main directly) @@ -310,19 +338,6 @@ def _fake(*, input_type, **kw): assert captured["input_type"] == "bib" - def test_google_scholar_flag_passed_through(self, capsys): - """fix #10: --google-scholar flag must be forwarded to process_references.""" - captured = {} - - def _fake(*, use_google_scholar, **kw): - captured["gs"] = use_google_scholar - return {"results": ["OK"], "report": {"total": 1, "succeeded": 1, "failed_entries": []}} - - with patch("onecite.cli.process_references", side_effect=_fake): - cli.process_command(self._ns(input_file="10.1/x", quiet=True, google_scholar=True)) - - assert captured["gs"] is True - def test_string_input_passed_directly(self, capsys): """fix #36: non-file argument is treated as inline reference content.""" captured = {} @@ -538,7 +553,6 @@ def _fake( "template", "output_format", "interactive", - "google_scholar", "fail_on_unresolved", }, ) @@ -605,6 +619,63 @@ def _fake(**_kw): assert code == 2 assert json.loads(capsys.readouterr().out)["summary"]["failed"] == 1 + def test_suggest_json_output(self, tmp_path, capsys): + inf = tmp_path / "in.txt" + inf.write_text("query", encoding="utf-8") + + def _fake(**_kw): + return { + "suggestions": [ + { + "id": 0, + "raw_text": "query", + "query_string": "query", + "status": "candidates_found", + "candidates": [ + { + "source": "crossref", + "title": "Candidate", + "doi": "10.1/candidate", + "match_score": 91, + } + ], + } + ], + "report": { + "total": 1, + "with_candidates": 1, + "without_candidates": 0, + }, + } + + with patch("onecite.cli.suggest_references", side_effect=_fake): + code = cli.suggest_command( + self._ns( + command="suggest", + input_file=str(inf), + as_json=True, + limit=5, + ) + ) + + data = json.loads(capsys.readouterr().out) + assert code == 0 + _assert_keys( + data, + { + "schema_version", + "tool", + "command", + "status", + 
"summary", + "options", + "suggestions", + }, + ) + assert data["status"] == "completed" + assert data["summary"]["with_candidates"] == 1 + assert data["suggestions"][0]["candidates"][0]["doi"] == "10.1/candidate" + def test_process_json_output_file_status_uses_stderr(self, tmp_path, capsys): inf = tmp_path / "in.txt" inf.write_text("query", encoding="utf-8") diff --git a/tests/test_integration.py b/tests/test_integration.py index 466d11b..de78f24 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -51,15 +51,17 @@ def test_bib_to_bibtex(self, run_onecite_process): assert code == 0, err self._check(result) - def test_conference_paper(self, run_onecite_process): + def test_title_only_conference_paper_is_unresolved_by_process(self, run_onecite_process): code, out, err, result = run_onecite_process( "Attention is all you need\nVaswani et al.\nNIPS 2017", template="conference_paper", ) assert code == 0, err self._check(result) - assert "Attention" in out or "attention" in out, "title must appear in output" - assert "Vaswani" in out or "vaswani" in out.lower(), "author must appear in output" + assert out == "" + assert result["report"]["total"] == 1 + assert result["report"]["succeeded"] == 0 + assert result["report"]["failed_entries"] def test_arxiv(self, run_onecite_process): code, _, err, result = run_onecite_process("1706.03762\n\narxiv:1512.03385") diff --git a/tests/test_onecite_basic.py b/tests/test_onecite_basic.py index 540e444..770ee6e 100644 --- a/tests/test_onecite_basic.py +++ b/tests/test_onecite_basic.py @@ -12,14 +12,14 @@ def _auto_pick(candidates): - """Always pick the first candidate – good enough for smoke tests.""" - return 0 if candidates else -1 + """Process smoke tests should not rely on candidate guessing.""" + return -1 def test_readme_example(): """Make sure the snippet we show in README.md keeps working.""" result = process_references( - input_content="10.1038/nature14539\n\nAttention is all you need\nVaswani et 
al.\nNIPS 2017", + input_content="10.1038/nature14539\n\narXiv:1706.03762", input_type="txt", template_name="journal_article_full", output_format="bibtex", diff --git a/tests/test_pipeline_unit.py b/tests/test_pipeline_unit.py index 047afdb..ed82ad1 100644 --- a/tests/test_pipeline_unit.py +++ b/tests/test_pipeline_unit.py @@ -619,8 +619,8 @@ def test_no_hardcoded_well_known_papers(self): ident, "well_known_papers" ), "well_known_papers shortcut should have been removed (#19)" - def test_attention_query_goes_through_normal_search(self): - """fix #19: 'attention is all you need' must go through normal multi-source search.""" + def test_attention_query_returns_suggestions(self): + """Title-only queries produce candidates, not resolved citations.""" ident = IdentifierModule() entry = { "id": 1, @@ -635,10 +635,11 @@ def test_attention_query_goes_through_normal_search(self): "url": "https://arxiv.org/abs/1706.03762", } with patch.object(ident, "_search_crossref", return_value=[arxiv_result]): - r = ident._fuzzy_search(entry, lambda _: -1) - assert r["status"] == "identified" + r = ident.suggest(entry) + assert r["status"] == "candidates_found" + assert r["candidates"][0]["title"] == "Attention Is All You Need" - def test_pmid_shortcut(self): + def test_pmid_shortcut_is_strong_identifier(self): ident = IdentifierModule() entry = {"id": 2, "raw_text": "PMID:12345678", "query_string": "PMID:12345678"} with patch.object( @@ -646,11 +647,11 @@ def test_pmid_shortcut(self): "_search_pubmed_by_id", return_value={"source": "pubmed", "doi": "10.1234/pmid", "url": "https://example.com"}, ): - r = ident._fuzzy_search(entry, lambda _: -1) + r = ident._identify_single_entry(entry, lambda _: -1) assert r["status"] == "identified" assert r["doi"] == "10.1234/pmid" - def test_book_prefers_google_books(self): + def test_book_query_returns_google_books_suggestion(self): ident = IdentifierModule() entry = { "id": 3, @@ -686,10 +687,10 @@ def test_book_prefers_google_books(self): 
patch.object(ident, "_search_google_books", return_value=[gb]), patch.object(ident, "_search_crossref", return_value=[cr]), ): - r = ident._fuzzy_search(entry, lambda _: -1) + r = ident.suggest(entry) - assert r["status"] == "identified" - assert r["metadata"]["source"] == "google_books" + assert r["status"] == "candidates_found" + assert any(candidate["source"] == "google_books" for candidate in r["candidates"]) def test_interactive_user_picks_second(self): ident = IdentifierModule() diff --git a/tests/test_randomized_regression.py b/tests/test_randomized_regression.py new file mode 100644 index 0000000..e227bce --- /dev/null +++ b/tests/test_randomized_regression.py @@ -0,0 +1,459 @@ +import random +import string +from unittest.mock import patch + +import bibtexparser + +from onecite.pipeline import EnricherModule, FormatterModule, ParserModule + +SEED = 20260603 + + +def _word(rng, min_len=4, max_len=10): + letters = string.ascii_lowercase + return "".join(rng.choice(letters) for _ in range(rng.randint(min_len, max_len))) + + +def _title(rng, i): + words = [_word(rng).capitalize() for _ in range(rng.randint(4, 8))] + return f"{' '.join(words)} Study {i}" + + +def _author(rng, i): + first = _word(rng).capitalize() + last = f"{_word(rng).capitalize()}{i}" + return f"{last}, {first}" + + +def _doi(rng, i): + suffix = f"{_word(rng, 5, 8)}.{rng.randint(1000, 9999)}.{i}" + return f"10.{rng.randint(1000, 9999)}/{suffix}" + + +def test_randomized_text_parser_extracts_expected_identifiers(): + rng = random.Random(SEED) + parser = ParserModule() + blocks = [] + cases = [] + + for i in range(64): + title = _title(rng, i) + author = _author(rng, i) + year = str(rng.randint(1990, 2026)) + kind = rng.choice(("doi", "url", "plain", "pmid")) + + if kind == "doi": + doi = _doi(rng, i) + trailing = rng.choice((".", ";", ")", "]", "")) + block = f"{title}\n{author}\nPublished {year}. 
doi: {doi}{trailing}" + cases.append({"kind": kind, "doi": doi, "url": None, "query": None}) + elif kind == "url": + url = f"https://example.org/{_word(rng)}/{i}" + block = f"{title}\n{author}\nAvailable at {url}" + cases.append({"kind": kind, "doi": None, "url": url, "query": None}) + elif kind == "pmid": + pmid = str(rng.randint(1_000_000, 99_999_999)) + block = f"PMID: {pmid}" + cases.append({"kind": kind, "doi": None, "url": None, "query": block}) + else: + block = f"{title}\n{author}\nJournal of {_word(rng).capitalize()}, {year}" + cases.append( + { + "kind": kind, + "doi": None, + "url": None, + "query": f"{title} {author} {year}", + } + ) + + blocks.append(block) + + entries = parser.parse("\n\n".join(blocks), "txt") + + assert len(entries) == len(cases) + for entry, expected in zip(entries, cases): + assert entry["doi"] == expected["doi"] + assert entry["url"] == expected["url"] + assert entry.get("query_string") == expected["query"] + + +def test_randomized_metadata_enrichment_stays_complete_without_network(): + rng = random.Random(SEED + 1) + template = {"entry_type": "@article", "fields": []} + identified_entries = [] + raw_entries = [] + kinds = [] + + for i in range(48): + title = _title(rng, i) + year = str(rng.randint(1990, 2026)) + kind = rng.choice(("article", "conference", "book", "dataset")) + kinds.append(kind) + metadata = { + "title": title, + "authors": [_author(rng, i), _author(rng, i + 100)], + "year": year, + "doi": _doi(rng, i), + "url": f"https://example.org/work/{i}", + "pages": f"{rng.randint(1, 200)}--{rng.randint(201, 400)}", + } + + if kind == "article": + metadata.update({"type": "journal-article", "journal": "Journal of Random Tests"}) + elif kind == "conference": + metadata.update({"type": "proceedings-article", "journal": "Random Test Conference"}) + elif kind == "book": + metadata.update({"type": "book", "publisher": "Example Press"}) + else: + metadata.update({"type": "dataset", "publisher": "Example Repository"}) + + 
identified_entries.append( + { + "id": i, + "raw_text": title, + "doi": None, + "arxiv_id": None, + "url": None, + "metadata": metadata, + "status": "identified", + } + ) + raw_entries.append({"id": i, "raw_text": title, "doi": None, "url": None}) + + with patch( + "onecite.pipeline.requests.get", + side_effect=AssertionError("unexpected network call in randomized enrichment test"), + ) as mock_get: + completed = EnricherModule().enrich(identified_entries, template, raw_entries) + mock_get.assert_not_called() + + assert len(completed) == len(identified_entries) + assert all(entry["status"] == "completed" for entry in completed) + assert len({entry["bib_key"] for entry in completed}) == len(completed) + for entry, kind in zip(completed, kinds): + bib_data = entry["bib_data"] + assert bib_data["title"] + assert bib_data["author"] + assert bib_data["year"] + assert bib_data["doi"].startswith("10.") + if kind == "article": + assert bib_data["ENTRYTYPE"] == "article" + assert bib_data["journal"] + elif kind == "conference": + assert bib_data["ENTRYTYPE"] == "inproceedings" + assert bib_data["booktitle"] + assert "journal" not in bib_data + elif kind == "book": + assert bib_data["ENTRYTYPE"] == "book" + assert bib_data["publisher"] + else: + assert bib_data["ENTRYTYPE"] == "misc" + assert bib_data["howpublished"] + + +def test_entry_type_edge_case_matrix_without_network(): + cases = [ + { + "name": "journal article", + "metadata": { + "title": "Journal matrix case", + "authors": ["Doe, Jane"], + "year": "2024", + "type": "journal-article", + "journal": "Journal of Matrix Tests", + }, + "raw_entry": {}, + "entry_type": "article", + "journal": "Journal of Matrix Tests", + "booktitle": None, + }, + { + "name": "metadata conference type", + "metadata": { + "title": "Conference matrix case", + "authors": ["Doe, Jane"], + "year": "2024", + "type": "conference", + "journal": "International Matrix Conference", + }, + "raw_entry": {}, + "entry_type": "inproceedings", + "journal": 
None, + "booktitle": "International Matrix Conference", + }, + { + "name": "metadata proceedings article type", + "metadata": { + "title": "Proceedings matrix case", + "authors": ["Doe, Jane"], + "year": "2024", + "type": "proceedings-article", + "journal": "Proceedings of Matrix Tests", + }, + "raw_entry": {}, + "entry_type": "inproceedings", + "journal": None, + "booktitle": "Proceedings of Matrix Tests", + }, + { + "name": "booktitle already present", + "metadata": { + "title": "Booktitle matrix case", + "authors": ["Doe, Jane"], + "year": "2024", + "booktitle": "Workshop on Matrix Tests", + }, + "raw_entry": {}, + "entry_type": "inproceedings", + "journal": None, + "booktitle": "Workshop on Matrix Tests", + }, + { + "name": "raw base record proceedings type", + "metadata": { + "title": "Base record matrix case", + "authors": ["Doe, Jane"], + "year": "2024", + "journal": "Base Record Proceedings", + }, + "raw_entry": {"type": "proceedings-article"}, + "entry_type": "inproceedings", + "journal": None, + "booktitle": "Base Record Proceedings", + }, + { + "name": "dataset", + "metadata": { + "title": "Dataset matrix case", + "authors": ["Doe, Jane"], + "year": "2024", + "type": "dataset", + "url": "https://example.org/dataset", + }, + "raw_entry": {}, + "entry_type": "misc", + "journal": None, + "booktitle": None, + }, + { + "name": "book", + "metadata": { + "title": "Book matrix case", + "authors": ["Doe, Jane"], + "year": "2024", + "type": "book", + "publisher": "Matrix Press", + }, + "raw_entry": {}, + "entry_type": "book", + "journal": None, + "booktitle": None, + }, + ] + identified_entries = [] + raw_entries = [] + + for i, case in enumerate(cases): + identified_entries.append( + { + "id": i, + "raw_text": case["name"], + "doi": None, + "arxiv_id": None, + "url": None, + "metadata": case["metadata"], + "status": "identified", + } + ) + raw_entries.append( + { + "id": i, + "raw_text": case["name"], + "doi": None, + "url": None, + **case["raw_entry"], + } + ) + 
+ with patch( + "onecite.pipeline.requests.get", + side_effect=AssertionError("unexpected network call in entry type matrix test"), + ) as mock_get: + completed = EnricherModule().enrich( + identified_entries, + {"entry_type": "@article", "fields": []}, + raw_entries, + ) + mock_get.assert_not_called() + + assert len(completed) == len(cases) + for entry, case in zip(completed, cases): + bib_data = entry["bib_data"] + assert bib_data["ENTRYTYPE"] == case["entry_type"], case["name"] + if case["journal"] is None: + assert "journal" not in bib_data, case["name"] + else: + assert bib_data["journal"] == case["journal"] + if case["booktitle"] is None: + assert "booktitle" not in bib_data, case["name"] + else: + assert bib_data["booktitle"] == case["booktitle"] + + +def test_crossref_proceedings_article_becomes_inproceedings_without_network(): + enricher = EnricherModule() + identified_entries = [ + { + "id": 1, + "raw_text": "Proceedings paper", + "doi": "10.1234/example.1", + "arxiv_id": None, + "url": None, + "metadata": {}, + "status": "identified", + } + ] + raw_entries = [ + { + "id": 1, + "raw_text": "Proceedings paper", + "doi": "10.1234/example.1", + "url": None, + } + ] + crossref_record = { + "title": "Proceedings paper", + "author": "Doe, Jane", + "year": "2024", + "doi": "10.1234/example.1", + "journal": "ACM SenSys", + "type": "proceedings-article", + "abstract": "Already present, so no abstract fallback is needed.", + } + + with ( + patch.object(enricher, "_get_crossref_metadata", return_value=crossref_record), + patch( + "onecite.pipeline.requests.get", + side_effect=AssertionError("unexpected network call"), + ) as mock_get, + ): + completed = enricher.enrich( + identified_entries, + {"entry_type": "@article", "fields": []}, + raw_entries, + ) + mock_get.assert_not_called() + + bib_data = completed[0]["bib_data"] + assert bib_data["ENTRYTYPE"] == "inproceedings" + assert bib_data["booktitle"] == "ACM SenSys" + assert "journal" not in bib_data + + +def 
test_randomized_bibtex_formatter_outputs_parseable_records(): + rng = random.Random(SEED + 2) + formatter = FormatterModule() + completed_entries = [] + + for i in range(52): + entry_type = rng.choice(("article", "inproceedings", "book", "misc")) + bib_key = f"Rand{rng.randint(1990, 2026)}{i}" + bib_data = { + "ENTRYTYPE": entry_type, + "ID": bib_key, + "title": _title(rng, i), + "author": f"{_author(rng, i)} and {_author(rng, i + 100)}", + "year": str(rng.randint(1990, 2026)), + "doi": _doi(rng, i), + "pages": f"{rng.randint(1, 200)}--{rng.randint(201, 400)}", + } + if entry_type == "article": + bib_data["journal"] = "Journal of Random Tests" + elif entry_type == "inproceedings": + bib_data["booktitle"] = "Proceedings of Random Tests" + elif entry_type == "book": + bib_data["publisher"] = "Example Press" + else: + bib_data["howpublished"] = "Online" + + completed_entries.append( + { + "id": i, + "doi": bib_data["doi"], + "status": "completed", + "bib_key": bib_key, + "bib_data": bib_data, + } + ) + + result = formatter.format(completed_entries, "bibtex") + parsed = bibtexparser.loads("\n\n".join(result["results"])) + + assert result["report"] == { + "total": len(completed_entries), + "succeeded": len(completed_entries), + "failed_entries": [], + } + assert len(parsed.entries) == len(completed_entries) + assert {entry["ID"] for entry in parsed.entries} == { + entry["bib_key"] for entry in completed_entries + } + assert all(entry["doi"].startswith("10.") for entry in parsed.entries) + + +def test_parser_bibtex_round_trip_integrity(): + rng = random.Random(SEED + 3) + formatter = FormatterModule() + expected_by_id = {} + completed_entries = [] + + for i in range(32): + entry_type = ("article", "inproceedings", "book", "misc")[i % 4] + bib_key = f"RoundTrip{rng.randint(1990, 2026)}{i}" + bib_data = { + "ENTRYTYPE": entry_type, + "ID": bib_key, + "title": _title(rng, i), + "author": f"{_author(rng, i)} and {_author(rng, i + 100)}", + "year": str(rng.randint(1990, 
2026)), + "doi": _doi(rng, i), + } + + if entry_type == "article": + bib_data["journal"] = "Journal of Round Trip Tests" + elif entry_type == "inproceedings": + bib_data["booktitle"] = "Proceedings of Round Trip Tests" + elif entry_type == "book": + bib_data["publisher"] = "Round Trip Press" + else: + bib_data["howpublished"] = "Online" + + expected_by_id[bib_key] = bib_data + completed_entries.append( + { + "id": i, + "doi": bib_data["doi"], + "status": "completed", + "bib_key": bib_key, + "bib_data": bib_data, + } + ) + + formatted = formatter.format(completed_entries, "bibtex") + parsed_entries = ParserModule().parse("\n\n".join(formatted["results"]), "bib") + + assert formatted["report"]["failed_entries"] == [] + assert len(parsed_entries) == len(completed_entries) + for parsed_entry in parsed_entries: + original_entry = parsed_entry["original_entry"] + expected = expected_by_id[original_entry["ID"]] + for field in ("ENTRYTYPE", "ID", "title", "author", "year", "doi"): + assert original_entry[field] == expected[field] + for field in ("journal", "booktitle", "publisher", "howpublished"): + if field in expected: + assert original_entry[field] == expected[field] + else: + assert field not in original_entry + assert parsed_entry["doi"] == expected["doi"] + assert parsed_entry["query_string"] is None diff --git a/tests/test_real_world_risks.py b/tests/test_real_world_risks.py new file mode 100644 index 0000000..aef54f3 --- /dev/null +++ b/tests/test_real_world_risks.py @@ -0,0 +1,202 @@ +"""Regression tests for real-world citation-cleanup risks. + +These tests exercise behavior that matters when OneCite is used on an +existing manuscript bibliography rather than on isolated demo inputs. 
+""" + +from unittest.mock import patch + +from onecite.pipeline import EnricherModule, IdentifierModule +from tests.test_pipeline_unit import DummyResponse + + +def test_bibtex_input_preserves_existing_citation_key_when_enriched(): + enricher = EnricherModule(use_google_scholar=False) + identified = { + "id": 0, + "status": "identified", + "doi": "10.1234/example", + "metadata": {}, + } + raw = { + "id": 0, + "raw_text": "@article{localKey2026,...}", + "doi": "10.1234/example", + "original_entry": { + "ENTRYTYPE": "article", + "ID": "localKey2026", + "title": "Local Manuscript Citation", + "author": "Doe, Jane", + "journal": "Local Journal", + "year": "2026", + "doi": "10.1234/example", + }, + } + crossref_record = { + "title": "Local Manuscript Citation", + "author": "Doe, Jane", + "journal": "Local Journal", + "year": "2026", + "doi": "10.1234/example", + } + + with patch.object(enricher, "_get_crossref_metadata", return_value=crossref_record): + result = enricher._enrich_single_entry( + identified, + {"entry_type": "@article", "fields": []}, + raw, + ) + + assert result["status"] == "completed" + assert result["bib_key"] == "localKey2026" + assert result["bib_data"]["ID"] == "localKey2026" + + +def test_doi_backed_bibtex_conflict_does_not_keep_wrong_core_fields(): + enricher = EnricherModule(use_google_scholar=False) + identified = { + "id": 0, + "status": "identified", + "doi": "10.1038/nature14539", + "metadata": {}, + } + raw = { + "id": 0, + "raw_text": "@article{badkey,...}", + "doi": "10.1038/nature14539", + "original_entry": { + "ENTRYTYPE": "article", + "ID": "badkey", + "title": "Totally Wrong Local Title", + "author": "Someone, Alice", + "journal": "Imaginary Journal", + "year": "1900", + "doi": "10.1038/nature14539", + }, + } + canonical_record = { + "title": "Deep learning", + "author": "LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey", + "journal": "Nature", + "year": "2015", + "volume": "521", + "number": "7553", + "pages": "436--444", + 
"doi": "10.1038/nature14539", + } + + with patch.object(enricher, "_get_crossref_metadata", return_value=canonical_record): + result = enricher._enrich_single_entry( + identified, + {"entry_type": "@article", "fields": []}, + raw, + ) + + bib_data = result["bib_data"] + assert bib_data["title"] == canonical_record["title"] + assert bib_data["author"] == canonical_record["author"] + assert bib_data["journal"] == canonical_record["journal"] + assert bib_data["year"] == canonical_record["year"] + + +def test_verify_doi_crossref_request_uses_polite_headers_and_mailto(): + identifier = IdentifierModule() + captured = {} + + def fake_get(url, *args, **kwargs): + captured["url"] = url + captured["headers"] = kwargs.get("headers", {}) + captured["params"] = kwargs.get("params", {}) + return DummyResponse( + json_data={ + "message": { + "DOI": "10.1234/example", + "title": ["Example Paper"], + "author": [{"given": "Jane", "family": "Doe"}], + "published-print": {"date-parts": [[2026]]}, + } + } + ) + + with patch("onecite.pipeline.requests.get", side_effect=fake_get): + metadata = identifier._verify_doi_and_get_metadata("10.1234/example") + + assert metadata["doi"] == "10.1234/example" + assert "api.crossref.org/works/10.1234/example" in captured["url"] + assert "OneCite" in captured["headers"].get("User-Agent", "") + assert captured["params"].get("mailto") + + +def test_noninteractive_low_confidence_candidate_stays_unresolved(): + identifier = IdentifierModule() + raw_entry = { + "id": 0, + "raw_text": "ambiguous citation", + "query_string": "ambiguous citation", + } + low_confidence_candidate = { + "source": "crossref", + "doi": "10.1234/weak", + "title": "Weakly Related Paper", + "authors": ["Doe, Jane"], + "year": "2026", + "match_score": 55, + } + + with ( + patch.object(identifier, "_search_crossref", return_value=[low_confidence_candidate]), + patch.object(identifier, "_search_semantic_scholar", return_value=[]), + patch.object(identifier, "_score_candidates", 
return_value=[low_confidence_candidate]), + ): + result = identifier._fuzzy_search(raw_entry, interactive_callback=lambda _candidates: -1) + + assert result["status"] == "identification_failed" + assert result["doi"] is None + + +def test_plain_text_query_is_unresolved_in_process_identifier_path(): + identifier = IdentifierModule() + raw_entry = { + "id": 0, + "raw_text": "Attention is all you need, Vaswani et al., NIPS 2017", + "query_string": "Attention is all you need, Vaswani et al., NIPS 2017", + } + + with patch.object(identifier, "_fuzzy_search") as fuzzy_search: + result = identifier._identify_single_entry(raw_entry, interactive_callback=lambda _c: 0) + + fuzzy_search.assert_not_called() + assert result["status"] == "identification_failed" + assert result["doi"] is None + + +def test_plain_text_query_can_return_suggestions_without_identifying(): + identifier = IdentifierModule() + raw_entry = { + "id": 0, + "raw_text": "Attention is all you need, Vaswani et al., NIPS 2017", + "query_string": "Attention is all you need, Vaswani et al., NIPS 2017", + } + candidate = { + "source": "crossref", + "doi": "10.5555/3295222.3295349", + "title": "Attention Is All You Need", + "authors": ["Ashish Vaswani"], + "year": "2017", + "journal": "Advances in Neural Information Processing Systems", + "match_score": 92, + "_weights": {"title": 1.0}, + } + + with ( + patch.object(identifier, "_search_crossref", return_value=[candidate]), + patch.object(identifier, "_search_semantic_scholar", return_value=[]), + patch.object(identifier, "_score_candidates", return_value=[candidate]), + ): + suggestion = identifier.suggest(raw_entry, limit=3) + + assert suggestion["id"] == 0 + assert suggestion["status"] == "candidates_found" + assert suggestion["query_string"] == raw_entry["query_string"] + assert suggestion["candidates"][0]["doi"] == candidate["doi"] + assert "_weights" not in suggestion["candidates"][0]