From 689009fe4a1a9f7048532d8f440e75a584dceb49 Mon Sep 17 00:00:00 2001 From: Ehsan ESTAJI <71376358+ehsanestaji@users.noreply.github.com> Date: Thu, 11 Jun 2026 09:56:58 +0200 Subject: [PATCH 01/13] docs: plan v0.5 submission readiness --- AGENTS.md | 21 +- docs/roadmap.md | 28 +- ...aguard-v0.5-submission-readiness-design.md | 477 ++++++++++++++++++ docs/tool-landscape.md | 14 + docs/vision-plan.md | 43 +- 5 files changed, 566 insertions(+), 17 deletions(-) create mode 100644 docs/superpowers/specs/2026-06-11-fastaguard-v0.5-submission-readiness-design.md diff --git a/AGENTS.md b/AGENTS.md index 9b69d86..8686b2f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -104,9 +104,10 @@ The release strategy is evidence before expansion: ```text v0.3: evidence pack + assembly gate + provenance checksums v0.4: compare mode for many FASTA files -v0.5: transcriptome profile -v0.6: protein profile -v0.7: reference-panel profile +v0.5: submission readiness gate +v0.6: transcriptome profile +v0.7: protein profile +v0.8: reference-panel profile later: MCP/tool-agent interface and optional local summaries ``` @@ -122,9 +123,21 @@ Default product boundaries: Recommended next big release: ```text -v0.3 should make FastaGuard credible as the default assembly gate before adding broad new biological profiles. +v0.5 should make submission readiness concrete before adding broad new biological profiles. ``` +The next planned feature direction is: + +```text +Submission Readiness Gate: --gate submission with --submission-target generic|ncbi. +``` + +This should stay FASTA-level and database-free. It should check identifier +safety, duplicate first-token IDs, unsafe characters, long identifiers, gap-like +N runs, high ambiguity, and tiny-record advisories. It must not claim repository +acceptance, biological completeness, annotation correctness, or contamination +confirmation. 
+ ## Collaboration Preference When moving the project forward, provide a clear recommendation first, then proceed when the user approves or explicitly asks to continue. The default recommendation should favor boring, stable contracts over flashy AI features. diff --git a/docs/roadmap.md b/docs/roadmap.md index 6d8ea28..4c99125 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -94,7 +94,27 @@ Development scope: CheckM, official validators, annotation, or other downstream tools; it does not replace them -## v0.5: Transcriptome Profile +## v0.5: Submission Readiness Gate + +Goal: + +```text +Make assembly FASTA files safer to hand to official validators, annotation, and downstream QC. +``` + +Development scope: + +- `--gate submission` for stricter assembly FASTA preflight +- `--submission-target <generic|ncbi>` for target-aware submission advisories +- stricter identifier and first-token ID safety checks +- gap-like `N` run summaries for submission review +- high ambiguity and tiny-record submission advisories +- submission readiness fields in JSON, TSV, HTML, and MultiQC outputs +- compare-mode aggregation of submission readiness across many FASTA files +- clear scope boundaries: FastaGuard does not replace NCBI, ENA, DDBJ, FCS, + QUAST, BUSCO, BlobToolKit, CheckM, or annotation validation + +## v0.6: Transcriptome Profile Potential additions: @@ -104,7 +124,7 @@ Potential additions: - extreme GC outliers - isoform-heavy warning heuristics -## v0.6: Protein Profile +## v0.7: Protein Profile Potential additions: @@ -114,7 +134,7 @@ Potential additions: - low-complexity regions - suspicious nucleotide-looking proteins -## v0.7: Reference Panel Profile +## v0.8: Reference Panel Profile Potential additions: @@ -166,6 +186,6 @@ Completed foundation: Recommended next sequence: -- extend evidence tables across future transcriptome, protein, reference, and compare modes +- extend evidence tables across submission, transcriptome, protein, reference, and compare modes - keep the v0.3 
gate contract stable through workflow adoption examples - explore an MCP or tool-server interface after the CLI schema is stable diff --git a/docs/superpowers/specs/2026-06-11-fastaguard-v0.5-submission-readiness-design.md b/docs/superpowers/specs/2026-06-11-fastaguard-v0.5-submission-readiness-design.md new file mode 100644 index 0000000..edd705d --- /dev/null +++ b/docs/superpowers/specs/2026-06-11-fastaguard-v0.5-submission-readiness-design.md @@ -0,0 +1,477 @@ +# FastaGuard v0.5 Design: Submission Readiness Gate + +## Summary + +FastaGuard v0.5 should make the product's preflight position more concrete: + +```text +FastaGuard catches FASTA problems that break pipelines and delay submissions. +``` + +v0.4 added readiness categories and compare mode. v0.5 should turn the +submission part of readiness into a deliberate gate, focused on FASTA-level +issues that users can fix before they spend time on official validators, +annotation, QUAST, BUSCO, BlobToolKit, CheckM, NCBI FCS, or submission portals. + +Release theme: + +```text +FastaGuard v0.5: Submission Readiness Gate +``` + +Product promise: + +```text +Check whether an assembly FASTA is structurally safe, identifier-safe, and +submission-ready enough to continue into official validation and downstream QC. +``` + +This release should remain assembly-first and database-free by default. It +should not add transcriptome, protein, or reference-panel profiles yet. + +## Why This Matters + +The current bioinformatics landscape already has strong tools, but their roles +start later or solve broader problems: + +- SeqKit, SeqFu, pyfastx, and BBTools provide fast FASTA/FASTQ manipulation or + statistics. +- QUAST evaluates assembly quality and can compare assemblies. +- BUSCO estimates biological completeness. +- BlobToolKit and NCBI FCS help investigate contamination or foreign sequence + signals with supporting data or databases. 
+- MultiQC aggregates outputs, but custom content remains more limited than a + native module. +- NCBI, ENA, and DDBJ submission systems have their own validation rules and + submission workflows. + +FastaGuard's useful gap is the layer before those tools: + +```text +Is this FASTA safe to hand to other tools and validators? +``` + +v0.5 should turn that into a practical user workflow. + +## Goals + +- Add a `submission` gate preset for assembly FASTA preflight. +- Add a `--submission-target` option with `generic` and `ncbi` as the first + supported targets. +- Add stricter identifier and definition-line checks without changing default + `--gate pipeline` behavior. +- Add structured submission-readiness findings with stable IDs, evidence, + thresholds, and recommended next steps. +- Add report fields that tell pipelines and agents whether the FASTA is ready + for official validation, annotation, and downstream QC. +- Preserve v0.4 compare mode and single-file behavior for users who do not opt + into the submission gate. +- Update docs so users understand FastaGuard is a pre-submission preflight, not + an official substitute for NCBI, ENA, DDBJ, FCS, QUAST, BUSCO, or BlobToolKit. + +## Non-Goals + +- Do not implement official NCBI, ENA, or DDBJ validation. +- Do not claim that passing FastaGuard guarantees repository acceptance. +- Do not add taxonomy databases, marker databases, aligners, read mapping, or + internet requirements. +- Do not run NCBI FCS, BlobToolKit, QUAST, BUSCO, CheckM, or annotation tools. +- Do not infer biological completeness or confirm contamination. +- Do not add transcriptome, protein, or reference-panel profiles in v0.5. +- Do not add an LLM-facing chat feature. + +## Product Position + +Recommended public message: + +```text +Preflight your FASTA before official validators and expensive QC. +``` + +Short slogan: + +```text +Validate the FASTA before the pipeline pays for it. 
+``` + +Avoid: + +```text +FastaGuard replaces submission validators. +``` + +The correct boundary is: + +```text +FastaGuard finds FASTA-level risks early. Official validators and downstream +tools still decide biological, taxonomic, annotation, and submission acceptance. +``` + +## User Workflows + +### Generic Submission Readiness + +```bash +fastaguard sample.fa \ + --profile assembly \ + --gate submission \ + --submission-target generic \ + --out fastaguard_report.html \ + --json fastaguard.json \ + --tsv fastaguard.tsv \ + --multiqc fastaguard_mqc.json +``` + +This mode should answer: + +- Are FASTA records valid and non-empty? +- Are identifiers safe for common parsers and indexes? +- Are sequence characters valid for assembly FASTA? +- Are ambiguity and gap signals likely to need explanation before submission? +- Should the user fix the FASTA before running official validation? + +### NCBI-Oriented Preflight + +```bash +fastaguard sample.fa \ + --profile assembly \ + --gate submission \ + --submission-target ncbi +``` + +This mode should be stricter about SeqID-like concerns and gap reporting, while +remaining honest that it is not an official NCBI validator. + +### Compare Mode With Submission Gate + +```bash +fastaguard compare assemblies/*.fa \ + --profile assembly \ + --gate submission \ + --submission-target ncbi \ + --json submission_cohort.json \ + --tsv submission_cohort.tsv \ + --out submission_cohort.html +``` + +This should make a cohort-level table of which FASTA files are ready for +official validation and which should be fixed first. + +## CLI Design + +Extend the existing gate enum: + +```text +--gate <none|pipeline|submission> +``` + +Add: + +```text +--submission-target <generic|ncbi> +``` + +Default behavior: + +- `--gate none`: no blocking gate, same as existing behavior. +- `--gate pipeline`: existing v0.3/v0.4 behavior. +- `--gate submission`: stricter FASTA-level blocking for submission readiness. 
+- If `--gate submission` is used without `--submission-target`, default to + `generic`. +- If `--submission-target` is provided without `--gate submission`, include + target-aware advisories in readiness output but do not change exit behavior. + +Exit codes remain unchanged: + +```text +0 = pass +1 = warnings above configured threshold +2 = hard QC failure +3 = invalid input / tool error +``` + +## Submission Targets + +### Generic + +The generic target should encode broad, conservative FASTA hygiene: + +- no empty identifiers +- no duplicate IDs +- no duplicate first-token IDs +- no unsafe whitespace ambiguity in identifiers +- no control characters +- no invalid nucleotide/IUPAC symbols +- no empty records +- bounded identifier length advisories +- gap-run and ambiguity summaries + +### NCBI + +The NCBI target should add stricter SeqID-oriented checks inspired by public +NCBI submission guidance: + +- warn or fail on SeqID characters that are risky for submission and downstream + tools +- flag identifiers longer than the configured SeqID threshold +- flag spaces and pipe characters in first-token IDs +- flag definition lines with missing first-token IDs +- report long `N` runs as gap-like evidence +- route users to NCBI FCS when FASTA-level signals suggest contamination + follow-up, while making clear that FastaGuard does not run FCS + +The exact thresholds should be documented in provenance and schema. + +## Finding Scope + +Add or promote stable finding IDs for submission readiness: + +```text +unsafe_identifier_chars +long_identifier +duplicate_first_token_ids +empty_identifier +control_characters +gap_run_summary +submission_gap_like_ns +submission_high_ambiguity +submission_tiny_records +submission_target_scope +``` + +Some of these may reuse existing low-level evidence if a finding already exists. +Do not create duplicate concepts if a current finding ID can be extended safely. 
+ +## Gate Behavior + +`--gate submission` should fail on problems that make the FASTA structurally +unsafe for common tooling: + +```text +invalid_fasta_structure +empty_records +empty_identifier +duplicate_ids +duplicate_first_token_ids +invalid_chars +control_characters +unsafe_identifier_chars +``` + +It should warn, not fail by default, on issues that may be legitimate but need +review or explanation: + +```text +long_identifier +submission_gap_like_ns +submission_high_ambiguity +submission_tiny_records +gap_run_summary +gc_outliers +length_outliers +``` + +Users can still make advisory findings blocking with `--fail-on`. + +## JSON Contract + +Extend the existing `gate` and `readiness` fields rather than inventing a second +submission report type. + +Recommended shape: + +```json +{ + "gate": { + "mode": "submission", + "submission_target": "ncbi", + "status": "FAIL", + "blocking_findings": ["duplicate_first_token_ids", "unsafe_identifier_chars"], + "advisory_findings": ["submission_gap_like_ns", "long_identifier"], + "fail_on": [ + "duplicate_ids", + "duplicate_first_token_ids", + "empty_identifier", + "empty_records", + "invalid_chars", + "invalid_fasta_structure", + "unsafe_identifier_chars" + ] + }, + "readiness": { + "overall": { + "status": "FAIL", + "summary": "FASTA should be fixed before official submission validation." 
+ }, + "categories": [ + { + "id": "submission", + "status": "FAIL", + "target": "ncbi", + "blocking_findings": ["unsafe_identifier_chars"], + "advisory_findings": ["submission_gap_like_ns"] + } + ] + }, + "scope": { + "can_conclude": [ + "FASTA parse validity", + "identifier safety", + "assembly alphabet validity", + "FASTA-level submission-readiness risks" + ], + "cannot_conclude": [ + "repository acceptance", + "taxonomic contamination", + "biological completeness", + "annotation correctness" + ] + } +} +``` + +Compare reports should surface the same gate status per sample and add cohort +counts: + +```text +submission_ready_count +submission_warn_count +submission_fail_count +``` + +## HTML Report + +Add a concise "Submission Readiness" section near the top when submission +signals are present. + +The section should show: + +- target: `generic` or `ncbi` +- status: PASS / WARN / FAIL +- blocking problems +- advisory risks +- recommended next step +- scope note that official validators are still required + +Avoid long prose. The HTML should explain the result, not become a submission +manual. + +## TSV And MultiQC + +Add stable summary columns: + +```text +submission_target +submission_status +submission_blocking_findings +submission_advisory_findings +unsafe_identifier_count +long_identifier_count +duplicate_first_token_id_count +gap_like_n_run_count +``` + +The MultiQC custom-content output should include the same high-level fields so +workflow reports can sort samples by submission status. + +## Evidence And Documentation + +Add a small committed evidence page for v0.5: + +```text +docs/evidence/fastaguard-v0.5-submission-readiness.md +``` + +It should include tiny synthetic cases: + +- clean assembly FASTA +- duplicate first-token IDs +- unsafe identifier characters +- long identifier +- long N runs +- invalid sequence character + +Do not commit large public FASTA files. 
+ +Docs to update: + +- `README.md` +- `docs/roadmap.md` +- `docs/vision-plan.md` +- `docs/tool-landscape.md` +- `docs/output-contract.md` +- `docs/packaging.md` +- `docs/releases/v0.5.0.md` +- `examples/nf-core/README.md` +- `examples/snakemake/README.md` + +## Tests + +Add focused tests for: + +- CLI accepts `--gate submission` +- CLI accepts `--submission-target generic` +- CLI accepts `--submission-target ncbi` +- unknown submission target exits with code `3` +- submission gate fails duplicate first-token IDs +- submission gate fails unsafe identifier characters +- submission gate warns on long identifiers +- submission gate warns on long N runs +- `--fail-on long_identifier` can make long identifiers blocking +- JSON schema validates new fields +- golden JSON fixtures include submission pass/warn/fail cases +- HTML contains "Submission Readiness" +- TSV includes submission columns +- MultiQC output includes submission fields +- compare mode aggregates submission status deterministically + +Run the usual gates: + +```bash +python3 -m unittest discover tests/python -v +cargo fmt --check +cargo test --locked +cargo clippy --locked --all-targets --all-features -- -D warnings +git diff --check +git ls-files | xargs perl -ne 'print "$ARGV:$.:$_" if /[ \t]$/' +``` + +## Release Strategy + +Before tagging v0.5.0: + +1. Merge v0.4.0 into Bioconda or document clearly that Bioconda remains behind. +2. Implement submission readiness behind explicit `--gate submission`. +3. Preserve `--gate pipeline` behavior and add tests that prove an + unchanged report contract. +4. Regenerate schema, examples, and golden reports. +5. Add v0.5 release notes with clear boundaries. +6. Tag and publish GitHub release. +7. Update Bioconda after the public source archive exists. 
+ +## Success Criteria + +v0.5 is successful if: + +- a user can run one command before official validation and see fixable FASTA + risks immediately +- pipeline authors can route on `gate.mode = submission` and + `readiness.categories[id=submission]` +- no downstream tool claims are overstated +- the report tells users when to continue to NCBI FCS, QUAST, BUSCO, + BlobToolKit, CheckM, annotation, or official validators +- all outputs remain deterministic and schema-validated + +## Recommended Implementation Order + +1. Add CLI enums and no-op serialization support for `submission_target`. +2. Add tests for target parsing and unchanged default behavior. +3. Add identifier-safety analyzer functions with focused unit tests. +4. Add submission gate failure/advisory mapping. +5. Extend JSON schema and golden fixtures. +6. Extend TSV, MultiQC, and HTML outputs. +7. Add compare-mode aggregation fields. +8. Update docs, examples, and release notes. +9. Run full verification gates. + +This order keeps the contract clear before touching report presentation. diff --git a/docs/tool-landscape.md b/docs/tool-landscape.md index 0fb2188..19ea41c 100644 --- a/docs/tool-landscape.md +++ b/docs/tool-landscape.md @@ -29,6 +29,12 @@ v0.4 positioning: Preflight readiness and starter cohort triage before interpretive QC. ``` +v0.5 positioning: + +```text +Submission-readiness preflight before official validators and expensive QC. +``` + ## Where FastaGuard Fits | Tool | Primary role | When it runs | What FastaGuard adds before it | @@ -61,6 +67,12 @@ machine concerns. Compare mode gives many FASTA files one starter cohort triage table before teams spend time in QUAST, BUSCO, BlobToolKit, CheckM, official validators, annotation, or other interpretive tools. +v0.5 should make the submission part of readiness explicit. The useful product +move is not to replace NCBI, ENA, DDBJ, FCS, or annotation validators. 
It is to +catch FASTA-level submission hazards first: unsafe identifiers, duplicate +first-token IDs, invalid characters, gap-like `N` runs, high ambiguity, and +tiny-record advisories. + ## Product Evidence We Have Current product evidence: @@ -95,6 +107,8 @@ Evidence still needed: - user feedback from real pipeline authors - broader public assembly evidence runs - real cohort compare-mode examples from public assemblies +- submission-readiness examples that show fixable FASTA hazards before official + validators - official MultiQC module or packaged plugin - comparison examples showing what FastaGuard catches before QUAST/BUSCO/BlobToolKit diff --git a/docs/vision-plan.md b/docs/vision-plan.md index ae62e39..5229fc4 100644 --- a/docs/vision-plan.md +++ b/docs/vision-plan.md @@ -29,8 +29,9 @@ preflight contract is trusted. The product should earn adoption in this order: 1. **Trust:** reproducible evidence, stable schemas, clear exit codes, installable packages. 2. **Integration:** Bioconda, BioContainers, MultiQC, Nextflow, Snakemake, Galaxy. 3. **Scale:** compare mode for many FASTA files and batch pipeline reports. -4. **Breadth:** transcriptome, protein, and reference-panel profiles. -5. **Intelligence:** optional local-metrics-only summaries, MCP/tool-agent interfaces, and workflow routing. +4. **Readiness depth:** submission-oriented preflight checks before official validators. +5. **Breadth:** transcriptome, protein, and reference-panel profiles. +6. **Intelligence:** optional local-metrics-only summaries, MCP/tool-agent interfaces, and workflow routing. This keeps the project from becoming a bag of heuristics. Each release should make the contract more useful, more trusted, or more integrated. @@ -80,7 +81,29 @@ Compare mode should support: This is more strategically important than adding many profile-specific checks too early, because pipeline authors often need to triage batches, not one file. 
-### v0.5: Transcriptome Profile +### v0.5: Submission Readiness Gate + +Goal: + +```text +Make assembly FASTA files safer to hand to official validators, annotation, and downstream QC. +``` + +Submission readiness should stay FASTA-level and database-free: + +- stricter identifier and first-token ID safety checks +- target-aware advisories with `--submission-target generic|ncbi` +- gap-like `N` run summaries +- high ambiguity and tiny-record submission advisories +- JSON, TSV, HTML, and MultiQC fields that pipelines can route on +- clear recommendations to continue with official validators, NCBI FCS, QUAST, + BUSCO, BlobToolKit, CheckM, or annotation tools when appropriate + +FastaGuard should not claim repository acceptance, biological completeness, +annotation correctness, or contamination confirmation. It should help users fix +FASTA-level blockers before those later checks. + +### v0.6: Transcriptome Profile Goal: @@ -100,7 +123,7 @@ FastaGuard should not claim transcriptome biological completeness. It should route users to transcriptome-specific completeness and annotation tools when needed. -### v0.6: Protein Profile +### v0.7: Protein Profile Goal: @@ -119,7 +142,7 @@ Initial protein checks: Protein mode should be strict about alphabet validity and careful about biology: it should flag preflight problems, not infer functional correctness. 
-### v0.7: Reference-Panel Profile +### v0.8: Reference-Panel Profile Goal: @@ -209,11 +232,13 @@ Recommended sequence: ```text v0.3: evidence pack + assembly gate + provenance checksums v0.4: compare mode for many FASTA files -v0.5: transcriptome profile -v0.6: protein profile -v0.7: reference-panel profile +v0.5: submission readiness gate +v0.6: transcriptome profile +v0.7: protein profile +v0.8: reference-panel profile later: MCP/tool-agent interface and optional local summaries ``` This path gives FastaGuard the best chance to become a default tool: prove the -assembly gate first, then scale to batches, then expand profiles. +assembly gate first, scale to batches, make submission readiness concrete, then +expand profiles. From 124166bfed84ba49d025cc5bcc028239549eabca Mon Sep 17 00:00:00 2001 From: Ehsan ESTAJI <71376358+ehsanestaji@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:10:32 +0200 Subject: [PATCH 02/13] docs: plan v0.5 submission implementation --- ...11-fastaguard-v0.5-submission-readiness.md | 1562 +++++++++++++++++ 1 file changed, 1562 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-11-fastaguard-v0.5-submission-readiness.md diff --git a/docs/superpowers/plans/2026-06-11-fastaguard-v0.5-submission-readiness.md b/docs/superpowers/plans/2026-06-11-fastaguard-v0.5-submission-readiness.md new file mode 100644 index 0000000..1bf1bd1 --- /dev/null +++ b/docs/superpowers/plans/2026-06-11-fastaguard-v0.5-submission-readiness.md @@ -0,0 +1,1562 @@ +# FastaGuard v0.5 Submission Readiness Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build `--gate submission` and `--submission-target generic|ncbi` so FastaGuard can report FASTA-level submission readiness without claiming to replace official validators. 
+ +**Architecture:** Reuse the existing v0.4 analyzer and finding IDs wherever possible. Add one small `submission` module for target parsing and target-specific fail sets, then thread submission target metadata through CLI config, gate decisions, readiness, reports, schema, compare mode, and docs. + +**Tech Stack:** Rust 2021, clap, serde/serde_json, assert_cmd integration tests, jsonschema contract tests, existing static HTML/TSV/MultiQC writers. + +--- + +## File Structure + +Create: + +- `src/submission.rs`: `SubmissionTarget`, display helpers, and submission gate constants. +- `testdata/submission_ids.fa`: FASTA with unsafe IDs, reserved header characters, and duplicate first-token IDs. +- `testdata/submission_warnings.fa`: FASTA with long headers and gap-like N runs. +- `docs/evidence/fastaguard-v0.5-submission-readiness.md`: tiny evidence examples and command transcript. +- `docs/releases/v0.5.0.md`: release notes drafted before tagging. + +Modify: + +- `src/lib.rs`: export `submission`. +- `src/cli.rs`: parse `--submission-target`, carry it into run and compare configs. +- `src/gate.rs`: add `GateMode::Submission` and target-aware fail rules. +- `src/readiness.rs`: add optional target metadata to readiness categories and map existing findings to submission readiness. +- `src/models.rs`: add submission target fields to gate/provenance/compare summaries and bump schema version. +- `src/findings.rs`: tune text/actions for existing submission-relevant findings; avoid renaming v0.4 IDs. +- `src/contract.rs` and `schema/finding-catalog.json`: keep bundled catalog and runtime actions aligned. +- `schema/fastaguard.schema.json`: update schema version and new fields. +- `src/report/html.rs`, `src/report/tsv.rs`, `src/report/multiqc.rs`: add single-report submission output. +- `src/compare.rs`, `src/report/compare_html.rs`, `src/report/compare_tsv.rs`, `src/report/compare_multiqc.rs`: aggregate and render submission status. 
+- `tests/cli.rs`, `tests/schema_contract.rs`: add CLI, golden, report, and schema coverage. +- `tests/golden/*.json`, `examples/reports/**`: regenerate committed reports after schema changes. +- `README.md`, `docs/roadmap.md`, `docs/vision-plan.md`, `docs/tool-landscape.md`, `docs/output-contract.md`, `docs/packaging.md`, `examples/nf-core/README.md`, `examples/snakemake/wrapper/README.md`: document v0.5 behavior and boundaries. + +Important design choice: + +- Keep current finding IDs: `unsafe_ids`, `long_headers`, `reserved_header_chars`, `duplicate_first_token_ids`, `terminal_ns`, `gap_pattern_warnings`, `gap_runs`, `high_n_rate`, and `tiny_contigs`. +- Do not rename them to `unsafe_identifier_chars` or `submission_gap_like_ns` in v0.5. The v0.5 behavior is to promote existing evidence into a stricter submission gate and clearer submission-readiness fields. + +## Task 1: CLI Plumbing For Submission Target + +**Files:** +- Create: `src/submission.rs` +- Modify: `src/lib.rs` +- Modify: `src/cli.rs` +- Test: `tests/cli.rs` + +- [ ] **Step 1: Add failing CLI tests** + +Append these tests to `tests/cli.rs`: + +```rust +#[test] +fn submission_gate_defaults_to_generic_target() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "submission_default"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/valid_assembly.fa", + "--min-contig-length", + "1", + "--gate", + "submission", + "--json", + ]) + .arg(&outputs.json) + .arg("--out") + .arg(&outputs.html) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(1) + .stderr(predicate::str::is_empty()); + + let report = read_json(&outputs.json); + assert_eq!(report["gate"]["mode"], json!("submission")); + assert_eq!(report["gate"]["submission_target"], json!("generic")); + assert_eq!(report["provenance"]["submission_target"], json!("generic")); +} + +#[test] +fn 
submission_target_ncbi_is_serialized_when_requested() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "submission_ncbi"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/valid_assembly.fa", + "--min-contig-length", + "1", + "--gate", + "submission", + "--submission-target", + "ncbi", + "--json", + ]) + .arg(&outputs.json) + .arg("--out") + .arg(&outputs.html) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(1) + .stderr(predicate::str::is_empty()); + + let report = read_json(&outputs.json); + assert_eq!(report["gate"]["submission_target"], json!("ncbi")); + assert_eq!(report["provenance"]["submission_target"], json!("ncbi")); +} + +#[test] +fn unknown_submission_target_is_cli_error() { + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/valid_assembly.fa", + "--gate", + "submission", + "--submission-target", + "ena", + ]) + .assert() + .code(2) + .stderr(predicate::str::contains("invalid value 'ena'")); +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: + +```bash +cargo test --locked --test cli submission_ +``` + +Expected: the first two tests fail because `--gate submission`, `--submission-target`, and serialized fields do not exist. The unknown-target test may fail with a different clap error until the flag exists. 
+ +- [ ] **Step 3: Create `src/submission.rs`** + +Create: + +```rust +use clap::ValueEnum; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +#[value(rename_all = "snake_case")] +pub enum SubmissionTarget { + Generic, + Ncbi, +} + +impl SubmissionTarget { + pub fn as_str(self) -> &'static str { + match self { + SubmissionTarget::Generic => "generic", + SubmissionTarget::Ncbi => "ncbi", + } + } +} +``` + +- [ ] **Step 4: Export the module** + +Modify `src/lib.rs` and add: + +```rust +pub mod submission; +``` + +- [ ] **Step 5: Add CLI fields and config plumbing** + +In `src/cli.rs`, import: + +```rust +use crate::submission::SubmissionTarget; +``` + +Add to `AnalysisArgs`: + +```rust + /// Submission-readiness target used by --gate submission. + #[arg(long, value_enum)] + pub submission_target: Option<SubmissionTarget>, +``` + +Add to `RunConfig`, `CompareConfig`, and `ValidatedAnalysis`: + +```rust + pub submission_target: Option<SubmissionTarget>, +``` + +In the analysis validation function that constructs `ValidatedAnalysis`, set: + +```rust +let submission_target = match (analysis.gate, analysis.submission_target) { + (GateMode::Submission, None) => Some(SubmissionTarget::Generic), + (_, target) => target, +}; +``` + +Carry `submission_target` into both `RunConfig` and `CompareConfig`. + +- [ ] **Step 6: Run focused tests** + +Run: + +```bash +cargo test --locked --test cli submission_ +``` + +Expected: tests still fail until gate/model serialization exists, but clap should now accept `--submission-target generic|ncbi` and reject `ena`. 
+ +- [ ] **Step 7: Commit** + +```bash +git add src/submission.rs src/lib.rs src/cli.rs tests/cli.rs +git commit -m "feat: add submission target CLI" +``` + +## Task 2: Submission Gate Semantics + +**Files:** +- Modify: `src/gate.rs` +- Modify: `src/models.rs` +- Modify: `src/compare.rs` +- Test: `tests/cli.rs` + +- [ ] **Step 1: Add failing tests for blocking behavior** + +Add fixtures: + +`testdata/submission_ids.fa` + +```text +>seq/one +ACGTACGT +>seq two +ACGTACGT +>seq two duplicate-description +ACGTACGA +>pipe|id +ACGTACGT +``` + +Add to `tests/cli.rs`: + +```rust +#[test] +fn submission_gate_fails_identifier_hazards() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "submission_ids"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/submission_ids.fa", + "--gate", + "submission", + "--submission-target", + "ncbi", + "--json", + ]) + .arg(&outputs.json) + .arg("--out") + .arg(&outputs.html) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2) + .stderr(predicate::str::contains("fastaguard error:").not()); + + let report = read_json(&outputs.json); + assert_eq!(report["gate"]["mode"], json!("submission")); + assert_eq!(report["gate"]["status"], json!("FAIL")); + assert!(array_contains_string( + &report["gate"]["blocking_findings"], + "duplicate_first_token_ids" + )); + assert!(array_contains_string( + &report["gate"]["blocking_findings"], + "unsafe_ids" + )); + assert!(array_contains_string( + &report["gate"]["blocking_findings"], + "reserved_header_chars" + )); +} +``` + +- [ ] **Step 2: Run failing test** + +Run: + +```bash +cargo test --locked --test cli submission_gate_fails_identifier_hazards +``` + +Expected: FAIL because `GateMode::Submission` and the submission fail set do not exist. 
+ +- [ ] **Step 3: Add `Submission` gate mode and fail set** + +Modify `src/gate.rs`: + +```rust +use crate::submission::SubmissionTarget; + +pub const SUBMISSION_FAIL_ON_GENERIC: &[&str] = &[ + "duplicate_first_token_ids", + "duplicate_ids", + "invalid_chars", + "invalid_fasta_structure", + "unsafe_ids", +]; + +pub const SUBMISSION_FAIL_ON_NCBI: &[&str] = &[ + "duplicate_first_token_ids", + "duplicate_ids", + "invalid_chars", + "invalid_fasta_structure", + "reserved_header_chars", + "unsafe_ids", +]; +``` + +Extend `GateMode`: + +```rust +pub enum GateMode { + None, + Pipeline, + Submission, +} +``` + +Extend `as_str`: + +```rust +GateMode::Submission => "submission", +``` + +Change `final_fail_on` signature: + +```rust +pub fn final_fail_on( + mode: GateMode, + submission_target: Option<SubmissionTarget>, + explicit_rules: &[String], +) -> BTreeSet<String> +``` + +Inside it: + +```rust +match mode { + GateMode::Pipeline => { + fail_on.extend(PIPELINE_FAIL_ON.iter().map(|id| (*id).to_string())); + } + GateMode::Submission => { + let rules = match submission_target.unwrap_or(SubmissionTarget::Generic) { + SubmissionTarget::Generic => SUBMISSION_FAIL_ON_GENERIC, + SubmissionTarget::Ncbi => SUBMISSION_FAIL_ON_NCBI, + }; + fail_on.extend(rules.iter().map(|id| (*id).to_string())); + } + GateMode::None => {} +} +``` + +- [ ] **Step 4: Add target to gate decision** + +Modify `GateDecision` in `src/models.rs`: + +```rust + #[serde(skip_serializing_if = "Option::is_none")] + pub submission_target: Option<String>, +``` + +Change `gate::decision` signature: + +```rust +pub fn decision( + mode: GateMode, + submission_target: Option<SubmissionTarget>, + status: VerdictStatus, + findings: &[Finding], + fail_on: &BTreeSet<String>, +) -> GateDecision +``` + +Set: + +```rust +submission_target: submission_target.map(|target| target.as_str().to_string()), +``` + +Update all call sites in `src/models.rs` to pass `config.submission_target`. 
+ +- [ ] **Step 5: Pass target through compare sample runs** + +In `src/compare.rs`, add: + +```rust +submission_target: config.submission_target, +``` + +to the `RunConfig` built inside `run_one_sample`. + +- [ ] **Step 6: Run focused gate tests** + +Run: + +```bash +cargo test --locked --test cli submission_ +``` + +Expected: PASS. + +- [ ] **Step 7: Commit** + +```bash +git add src/gate.rs src/models.rs src/compare.rs tests/cli.rs testdata/submission_ids.fa +git commit -m "feat: add submission gate semantics" +``` + +## Task 3: Submission Readiness Metadata + +**Files:** +- Modify: `src/readiness.rs` +- Modify: `src/models.rs` +- Test: `src/readiness.rs` +- Test: `tests/cli.rs` + +- [ ] **Step 1: Add failing readiness tests** + +Add to `src/readiness.rs` tests: + +```rust +#[test] +fn submission_target_is_attached_to_submission_category() { + let readiness = build_readiness( + VerdictStatus::Fail, + &["reserved_header_chars".to_string()], + &[finding("reserved_header_chars", Severity::Minor)], + ReadinessScope::Single, + Some(crate::submission::SubmissionTarget::Ncbi), + ); + + let submission = readiness.category("submission").unwrap(); + assert_eq!(submission.target.as_deref(), Some("ncbi")); + assert_eq!(submission.status, ReadinessStatus::Fail); + assert_eq!(submission.findings, ["reserved_header_chars"]); +} + +#[test] +fn submission_findings_warn_when_not_blocking() { + let readiness = build_readiness( + VerdictStatus::Warn, + &[], + &[finding("long_headers", Severity::Minor)], + ReadinessScope::Single, + Some(crate::submission::SubmissionTarget::Generic), + ); + + let submission = readiness.category("submission").unwrap(); + assert_eq!(submission.target.as_deref(), Some("generic")); + assert_eq!(submission.status, ReadinessStatus::Warn); + assert!(readiness.overall.blockers.is_empty()); +} +``` + +- [ ] **Step 2: Run failing readiness tests** + +Run: + +```bash +cargo test --locked -- readiness::tests::submission_target_is_attached_to_submission_category 
readiness::tests::submission_findings_warn_when_not_blocking +``` + +Expected: FAIL because `ReadinessCategory.target` and the new function signature do not exist. + +- [ ] **Step 3: Extend readiness category** + +Modify `ReadinessCategory`: + +```rust +#[serde(skip_serializing_if = "Option::is_none")] +pub target: Option, +``` + +In `base_categories`, set `target: None`. + +Change `build_readiness` signature: + +```rust +pub fn build_readiness( + verdict: VerdictStatus, + blocking_findings: &[String], + findings: &[Finding], + scope: ReadinessScope, + submission_target: Option, +) -> ReadinessReport +``` + +After `base_categories(scope)`, attach the target: + +```rust +if let Some(target) = submission_target { + if let Some(category) = categories + .iter_mut() + .find(|category| category.id == "submission") + { + category.target = Some(target.as_str().to_string()); + } +} +``` + +Update all call sites. Use `None` in tests that do not care about target. + +- [ ] **Step 4: Promote existing findings into submission readiness** + +Update `category_ids_for_finding`: + +```rust +"duplicate_ids" | "duplicate_first_token_ids" => &["index", "submission"], +"unsafe_ids" | "long_headers" | "reserved_header_chars" => &["index", "submission"], +"terminal_ns" | "gap_pattern_warnings" | "gap_runs" => &["assembly", "submission"], +"high_n_rate" | "tiny_contigs" => &["assembly", "submission"], +``` + +This makes the submission category show every identifier issue that can block +the submission gate, while preserving index readiness for parser/index users. + +- [ ] **Step 5: Update model call sites** + +In `FastaguardReport::from_analysis` and `FastaguardReport::from_invalid_fasta`, pass `config.submission_target` to `build_readiness`. + +In tests that construct readiness manually, pass `None` unless target behavior is under test. + +- [ ] **Step 6: Run readiness tests** + +Run: + +```bash +cargo test --locked readiness +``` + +Expected: PASS. 
+ +- [ ] **Step 7: Commit** + +```bash +git add src/readiness.rs src/models.rs +git commit -m "feat: add submission readiness metadata" +``` + +## Task 4: Provenance, Scope, And Machine Routing + +**Files:** +- Modify: `src/models.rs` +- Modify: `src/findings.rs` +- Modify: `schema/finding-catalog.json` +- Test: `src/contract.rs` +- Test: `tests/cli.rs` + +- [ ] **Step 1: Add failing routing/scope assertions** + +In `tests/cli.rs`, add to `submission_target_ncbi_is_serialized_when_requested`: + +```rust +assert!(array_contains_string( + &report["scope"]["can_conclude"], + "FASTA-level submission readiness" +)); +assert!(array_contains_string( + &report["scope"]["cannot_conclude"], + "repository acceptance" +)); +``` + +Add a separate test: + +```rust +#[test] +fn submission_hazards_route_to_official_validators_and_fcs_without_claiming_results() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "submission_routes"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/submission_ids.fa", + "--gate", + "submission", + "--submission-target", + "ncbi", + "--json", + ]) + .arg(&outputs.json) + .arg("--out") + .arg(&outputs.html) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2); + + let report = read_json(&outputs.json); + assert_routing_hint( + &report, + "submission_readiness_failure", + "fix_fasta_before_official_validation", + false, + ); + assert!(array_contains_tool( + &report["machine_summary"]["recommended_next_tools"], + "official submission validator" + )); +} +``` + +- [ ] **Step 2: Run failing test** + +Run: + +```bash +cargo test --locked --test cli submission_hazards_route_to_official_validators_and_fcs_without_claiming_results +``` + +Expected: FAIL because routing and scope text have not been extended. 
+ +- [ ] **Step 3: Extend scope** + +In `fasta_preflight_scope()` in `src/models.rs`, include: + +```rust +"FASTA-level submission readiness".to_string(), +``` + +in `can_conclude`, and include: + +```rust +"repository acceptance".to_string(), +"official validator acceptance".to_string(), +"annotation correctness".to_string(), +``` + +in `cannot_conclude`. + +- [ ] **Step 4: Extend recommended tools** + +In `recommended_next_tools`, route submission-relevant findings: + +```rust +if has_any_finding( + findings, + &[ + "unsafe_ids", + "long_headers", + "reserved_header_chars", + "duplicate_first_token_ids", + "terminal_ns", + "gap_pattern_warnings", + ], +) { + tools.push(recommended_tool( + "official submission validator", + "Use the target repository validator after FASTA-level issues are fixed; FastaGuard is not an official validator.", + )); +} + +if has_any_finding(findings, &["high_n_rate", "gap_runs"]) { + tools.push(recommended_tool( + "NCBI FCS", + "Run database-backed contamination/adaptor screening when submission-oriented ambiguity or gap signals need follow-up.", + )); +} +``` + +Add these helper functions near the existing recommendation helpers: + +```rust +fn has_any_finding(findings: &[Finding], ids: &[&str]) -> bool { + findings + .iter() + .any(|finding| ids.iter().any(|id| *id == finding.id)) +} + +fn recommended_tool(tool: &str, reason: &str) -> RecommendedTool { + RecommendedTool { + tool: tool.to_string(), + reason: reason.to_string(), + } +} +``` + +- [ ] **Step 5: Extend routing hints** + +In `routing_hints`, add: + +```rust +"unsafe_ids" | "long_headers" | "reserved_header_chars" | "duplicate_first_token_ids" => { + push_routing_hint( + &mut hints, + "submission_readiness_failure", + "fix_fasta_before_official_validation", + false, + ) +} +``` + +Keep existing `index_readiness_failure` if present by using two calls for `duplicate_first_token_ids`. 
+ +- [ ] **Step 6: Update catalog text and runtime action alignment** + +In `schema/finding-catalog.json`, keep IDs unchanged and add wording that these findings affect submission readiness. Do not add suggested actions that are missing from `finding_actions`; `src/contract.rs::bundled_catalog_actions_match_runtime_actions` must remain green. + +- [ ] **Step 7: Run tests** + +Run: + +```bash +cargo test --locked --test cli submission_hazards_route_to_official_validators_and_fcs_without_claiming_results +cargo test --locked contract +``` + +Expected: PASS. + +- [ ] **Step 8: Commit** + +```bash +git add src/models.rs src/findings.rs schema/finding-catalog.json tests/cli.rs +git commit -m "feat: route submission readiness findings" +``` + +## Task 5: Single-Report Output Fields + +**Files:** +- Modify: `src/report/tsv.rs` +- Modify: `src/report/multiqc.rs` +- Modify: `src/report/html.rs` +- Test: `src/report/tsv.rs` +- Test: `src/report/multiqc.rs` +- Test: `tests/cli.rs` + +- [ ] **Step 1: Add failing output assertions** + +In `tests/cli.rs`, add: + +```rust +#[test] +fn submission_gate_outputs_tsv_multiqc_and_html_fields() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "submission_outputs"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/submission_ids.fa", + "--gate", + "submission", + "--submission-target", + "ncbi", + "--json", + ]) + .arg(&outputs.json) + .arg("--out") + .arg(&outputs.html) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2); + + let tsv = std::fs::read_to_string(&outputs.tsv).unwrap(); + assert!(tsv.contains("submission_target\tncbi\n"), "{tsv}"); + assert!(tsv.contains("submission_status\tFAIL\n"), "{tsv}"); + assert!(tsv.contains("unsafe_identifier_count\t"), "{tsv}"); + + let multiqc = read_json(&outputs.multiqc); + assert_eq!(multiqc["data"]["submission_ids"]["submission_target"], json!("ncbi")); + 
assert_eq!(multiqc["data"]["submission_ids"]["submission_status"], json!("FAIL")); + + let html = std::fs::read_to_string(&outputs.html).unwrap(); + assert!(html.contains("Submission Readiness"), "{html}"); + assert!(html.contains("Official validators are still required"), "{html}"); +} +``` + +- [ ] **Step 2: Run failing output test** + +Run: + +```bash +cargo test --locked --test cli submission_gate_outputs_tsv_multiqc_and_html_fields +``` + +Expected: FAIL because the output fields and HTML section are missing. + +- [ ] **Step 3: Add TSV metrics** + +In `src/report/tsv.rs`, after readiness rows, write: + +```rust +write_metric( + &mut writer, + "submission_target", + report + .gate + .submission_target + .as_deref() + .unwrap_or("."), +)?; +write_metric( + &mut writer, + "submission_status", + submission_status(report), +)?; +write_metric( + &mut writer, + "submission_blocking_findings", + report.gate.blocking_findings.join(","), +)?; +write_metric( + &mut writer, + "submission_advisory_findings", + report.gate.advisory_findings.join(","), +)?; +write_metric( + &mut writer, + "unsafe_identifier_count", + report.summary.unsafe_id_count, +)?; +write_metric( + &mut writer, + "long_identifier_count", + report.summary.long_header_count, +)?; +write_metric( + &mut writer, + "duplicate_first_token_id_count", + report.summary.duplicate_first_token_id_count, +)?; +write_metric( + &mut writer, + "gap_like_n_run_count", + report.summary.repeated_gap_pattern_sequence_count, +)?; +``` + +Add: + +```rust +fn submission_status(report: &FastaguardReport) -> &'static str { + report + .readiness + .category("submission") + .map(|category| readiness_status(category.status)) + .unwrap_or("PASS") +} +``` + +- [ ] **Step 4: Add MultiQC fields** + +Add to `MultiqcSummaryRow`: + +```rust +submission_target: String, +submission_status: String, +unsafe_identifier_count: u64, +long_identifier_count: u64, +duplicate_first_token_id_count: u64, +gap_like_n_run_count: u64, +``` + +Populate in 
`summary_row` using the same fields as TSV. + +Add headers: + +```rust +("submission_target", "Submission Target"), +("submission_status", "Submission Status"), +("unsafe_identifier_count", "Unsafe IDs"), +("long_identifier_count", "Long Headers"), +("duplicate_first_token_id_count", "Duplicate First-Token IDs"), +("gap_like_n_run_count", "Gap-Like N Runs"), +``` + +- [ ] **Step 5: Add HTML section** + +In `src/report/html.rs`, add `let submission = render_submission_readiness(report);` and place this after the Gate Decision section: + +```html +

Submission Readiness

+{submission} +``` + +Add: + +```rust +fn render_submission_readiness(report: &FastaguardReport) -> String { + let target = report + .gate + .submission_target + .as_deref() + .unwrap_or("generic"); + let category = report.readiness.category("submission"); + let status = category + .map(|category| readiness_status(category.status)) + .unwrap_or("PASS"); + let findings = category + .map(|category| render_string_list_or_none(&category.findings)) + .unwrap_or_else(|| "None".to_string()); + + format!( + r#"
+
+

Target

+

{target}

+
+
+

Status

+

{status}

+
+
+

Findings

+{findings} +
+
+

Official validators are still required. FastaGuard reports FASTA-level preflight risks only.

"#, + target = escape_html(target), + status = escape_html(status), + findings = findings, + ) +} +``` + +- [ ] **Step 6: Run focused output tests** + +Run: + +```bash +cargo test --locked --test cli submission_gate_outputs_tsv_multiqc_and_html_fields +cargo test --locked report::tsv report::multiqc +``` + +Expected: PASS. + +- [ ] **Step 7: Commit** + +```bash +git add src/report/tsv.rs src/report/multiqc.rs src/report/html.rs tests/cli.rs +git commit -m "feat: render submission readiness outputs" +``` + +## Task 6: Compare Mode Submission Aggregation + +**Files:** +- Modify: `src/models.rs` +- Modify: `src/compare.rs` +- Modify: `src/report/compare_tsv.rs` +- Modify: `src/report/compare_multiqc.rs` +- Modify: `src/report/compare_html.rs` +- Test: `tests/cli.rs` +- Test: `src/report/compare_tsv.rs` +- Test: `src/report/compare_multiqc.rs` + +- [ ] **Step 1: Add failing compare test** + +Add to `tests/cli.rs`: + +```rust +#[test] +fn compare_submission_gate_aggregates_submission_status() { + let temp_dir = TempDir::new().unwrap(); + let clean = temp_dir.path().join("clean.fa"); + std::fs::write(&clean, ">clean\nACGTACGT\n").unwrap(); + let outputs = output_paths(&temp_dir, "submission_compare"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.arg("compare") + .arg(&clean) + .arg("testdata/submission_ids.fa") + .args([ + "--gate", + "submission", + "--submission-target", + "ncbi", + "--json", + ]) + .arg(&outputs.json) + .arg("--out") + .arg(&outputs.html) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2); + + let report = read_json(&outputs.json); + assert_eq!(report["summary"]["submission_fail_count"], json!(1)); + assert_eq!(report["summary"]["submission_ready_count"], json!(1)); + let failing = report["samples"] + .as_array() + .unwrap() + .iter() + .find(|sample| sample["sample_id"] == "submission_ids") + .unwrap(); + assert_eq!(failing["submission_target"], json!("ncbi")); + 
assert_eq!(failing["submission_status"], json!("FAIL")); + + let tsv = std::fs::read_to_string(&outputs.tsv).unwrap(); + assert!(tsv.lines().next().unwrap().contains("submission_status"), "{tsv}"); + + let multiqc = read_json(&outputs.multiqc); + assert_eq!( + multiqc["data"]["submission_ids"]["submission_status"], + json!("FAIL") + ); +} +``` + +- [ ] **Step 2: Run failing compare test** + +Run: + +```bash +cargo test --locked --test cli compare_submission_gate_aggregates_submission_status +``` + +Expected: FAIL because compare summary/sample fields do not exist. + +- [ ] **Step 3: Extend compare models** + +In `CompareSummary`, add: + +```rust +pub submission_ready_count: u64, +pub submission_warn_count: u64, +pub submission_fail_count: u64, +``` + +In `CompareSample`, add: + +```rust +pub submission_target: Option, +pub submission_status: crate::readiness::ReadinessStatus, +``` + +- [ ] **Step 4: Populate compare fields** + +In `compare_sample`: + +```rust +let submission_status = report + .readiness + .category("submission") + .map(|category| category.status) + .unwrap_or(crate::readiness::ReadinessStatus::Pass); +``` + +Set: + +```rust +submission_target: report.gate.submission_target.clone(), +submission_status, +``` + +In `compare_summary`, count submission statuses: + +```rust +submission_ready_count: count_readiness_status(samples, crate::readiness::ReadinessStatus::Pass), +submission_warn_count: count_readiness_status(samples, crate::readiness::ReadinessStatus::Warn), +submission_fail_count: count_readiness_status(samples, crate::readiness::ReadinessStatus::Fail), +``` + +Add: + +```rust +fn count_readiness_status( + samples: &[CompareSample], + status: crate::readiness::ReadinessStatus, +) -> u64 { + usize_to_u64( + samples + .iter() + .filter(|sample| sample.submission_status == status) + .count(), + ) +} +``` + +- [ ] **Step 5: Extend compare TSV and MultiQC** + +In `src/report/compare_tsv.rs`, add `submission_target` and `submission_status` columns 
after `readiness_status`. + +In `src/report/compare_multiqc.rs`, add the same fields and headers: + +```rust +submission_target: String, +submission_status: &'static str, +``` + +- [ ] **Step 6: Extend compare HTML** + +In `src/report/compare_html.rs`, add submission status to the summary cards or table near readiness. Use existing HTML escaping helpers and the same uppercase status labels. + +- [ ] **Step 7: Run compare tests** + +Run: + +```bash +cargo test --locked --test cli compare_submission_gate_aggregates_submission_status +cargo test --locked compare +cargo test --locked -- report::compare_tsv report::compare_multiqc +``` + +Expected: PASS. + +- [ ] **Step 8: Commit** + +```bash +git add src/models.rs src/compare.rs src/report/compare_tsv.rs src/report/compare_multiqc.rs src/report/compare_html.rs tests/cli.rs +git commit -m "feat: aggregate submission readiness in compare mode" +``` + +## Task 7: Schema Version, Goldens, And Contract Fixtures + +**Files:** +- Modify: `Cargo.toml` +- Modify: `Cargo.lock` +- Modify: `src/models.rs` +- Modify: `schema/fastaguard.schema.json` +- Modify: `schema/finding-catalog.json` +- Modify: `tests/schema_contract.rs` +- Modify: `tests/golden/*.json` +- Modify: `examples/reports/**` +- Test: `tests/schema_contract.rs` +- Test: `tests/cli.rs` + +- [ ] **Step 1: Add failing schema assertions** + +In `tests/schema_contract.rs`, update schema version assertions: + +```rust +assert_eq!( + single_report["properties"]["schema_version"]["const"], + "0.5.0" +); +``` + +Add: + +```rust +#[test] +fn schema_supports_submission_gate_fields() { + let schema: serde_json::Value = + serde_json::from_str(fastaguard::contract::schema_json()).unwrap(); + let gate = &schema["$defs"]["single_report"]["properties"]["gate"]; + let provenance = &schema["$defs"]["single_report"]["properties"]["provenance"]; + let compare_summary = &schema["$defs"]["compare_summary"]; + let compare_sample = &schema["$defs"]["compare_sample"]; + + 
assert!(gate["required"] + .as_array() + .unwrap() + .iter() + .any(|value| value == "submission_target")); + assert_eq!( + gate["properties"]["mode"]["enum"], + serde_json::json!(["none", "pipeline", "submission"]) + ); + assert_eq!( + gate["properties"]["submission_target"]["enum"], + serde_json::json!(["generic", "ncbi"]) + ); + assert!(provenance["required"] + .as_array() + .unwrap() + .iter() + .any(|value| value == "submission_target")); + assert!(compare_summary["required"] + .as_array() + .unwrap() + .iter() + .any(|value| value == "submission_fail_count")); + assert!(compare_sample["required"] + .as_array() + .unwrap() + .iter() + .any(|value| value == "submission_status")); +} +``` + +- [ ] **Step 2: Run failing schema tests** + +Run: + +```bash +cargo test --locked --test schema_contract schema_supports_submission_gate_fields +``` + +Expected: FAIL because schema is still v0.4.0 and fields are missing. + +- [ ] **Step 3: Bump crate and schema versions** + +In `Cargo.toml`: + +```toml +version = "0.5.0" +``` + +In `src/models.rs`: + +```rust +pub const SCHEMA_VERSION: &str = "0.5.0"; +``` + +Run: + +```bash +cargo check --locked +``` + +If `Cargo.lock` still records `fastaguard 0.4.0`, run: + +```bash +cargo check +``` + +and verify the lockfile updates only the local package version. + +- [ ] **Step 4: Update schema** + +Update `schema/fastaguard.schema.json`: + +- `schema_version.const` from `0.4.0` to `0.5.0` for single and compare reports. +- `gate.mode.enum` to include `"submission"`. +- `gate.required` to include `"submission_target"`. +- `gate.properties.submission_target`: + +```json +{ + "type": "string", + "enum": ["generic", "ncbi"] +} +``` + +- `provenance.required` to include `"submission_target"`. +- `provenance.properties.submission_target` with the same enum. +- `readiness_category.properties.target` with the same enum. 
+- `compare_summary.required` and properties for `submission_ready_count`, `submission_warn_count`, `submission_fail_count`. +- `compare_sample.required` and properties for `submission_target` and `submission_status`. + +- [ ] **Step 5: Update finding catalog version** + +In `schema/finding-catalog.json`: + +```json +"schema_version": "0.5.0", +"catalog_version": "0.5.0" +``` + +Ensure every catalog `suggested_actions` still equals `finding_actions(id)`. + +- [ ] **Step 6: Regenerate goldens** + +Run the existing golden tests once to produce current temp outputs if helpers write to `target`; if they do not overwrite goldens, generate with the same commands used in `tests/cli.rs` and copy the regenerated outputs (JSON, TSV, MultiQC, and HTML) into: + +```text +tests/golden/valid_assembly.json +tests/golden/problem_assembly.json +tests/golden/invalid_empty_record.json +tests/golden/compare_mixed_status.json +tests/golden/compare_all_pass.json +examples/reports/assembly_pass/fastaguard.json +examples/reports/assembly_fail/fastaguard.json +examples/reports/assembly_pass/fastaguard.tsv +examples/reports/assembly_fail/fastaguard.tsv +examples/reports/assembly_pass/fastaguard_mqc.json +examples/reports/assembly_fail/fastaguard_mqc.json +examples/reports/assembly_pass/fastaguard_report.html +examples/reports/assembly_fail/fastaguard_report.html +``` + +Use deterministic provenance environment variables already present in `tests/cli.rs` when regenerating golden JSON. + +- [ ] **Step 7: Run contract tests** + +Run: + +```bash +cargo test --locked --test schema_contract +cargo test --locked --test cli golden +cargo test --locked contract +``` + +Expected: PASS. 
+ +- [ ] **Step 8: Commit** + +```bash +git add Cargo.toml Cargo.lock src/models.rs schema/fastaguard.schema.json schema/finding-catalog.json tests/schema_contract.rs tests/golden examples/reports +git commit -m "chore: update v0.5 output contract" +``` + +## Task 8: Documentation, Evidence, And Release Notes + +**Files:** +- Create: `docs/evidence/fastaguard-v0.5-submission-readiness.md` +- Create: `docs/releases/v0.5.0.md` +- Modify: `README.md` +- Modify: `docs/roadmap.md` +- Modify: `docs/vision-plan.md` +- Modify: `docs/tool-landscape.md` +- Modify: `docs/output-contract.md` +- Modify: `docs/packaging.md` +- Modify: `examples/nf-core/README.md` +- Modify: `examples/snakemake/wrapper/README.md` +- Test: `tests/python/test_adoption_assets.py` +- Test: `tests/python/test_release_metadata.py` + +- [ ] **Step 1: Add failing Python docs tests** + +In `tests/python/test_adoption_assets.py`, add: + +```python + def test_v0_5_submission_readiness_docs_are_present(self): + readme = self.read("README.md") + roadmap = self.read("docs/roadmap.md") + evidence = self.read("docs/evidence/fastaguard-v0.5-submission-readiness.md") + release = self.read("docs/releases/v0.5.0.md") + + for text in [readme, roadmap, evidence, release]: + self.assertIn("--gate submission", text) + self.assertIn("--submission-target", text) + self.assertIn("official validators", text) + + self.assertIn("FastaGuard does not replace NCBI, ENA, DDBJ", roadmap) + self.assertIn("repository acceptance", evidence) +``` + +In `tests/python/test_release_metadata.py`, update `test_package_targets_v0_4_0` to `test_package_targets_v0_5_0` and assert: + +```python +self.assertIn('version = "0.5.0"', cargo_toml) +``` + +- [ ] **Step 2: Run failing docs tests** + +Run: + +```bash +python3 -m unittest tests.python.test_adoption_assets tests.python.test_release_metadata -v +``` + +Expected: FAIL because v0.5 docs and release notes are missing or stale. 
+ +- [ ] **Step 3: Update README** + +Add a quickstart block: + +````markdown +Submission-readiness preflight: + +```bash +fastaguard sample.fa \ + --profile assembly \ + --gate submission \ + --submission-target ncbi \ + --json fastaguard.json \ + --out fastaguard_report.html +``` + +FastaGuard reports FASTA-level risks before official validators. It does not +guarantee NCBI, ENA, or DDBJ acceptance and does not replace NCBI FCS, +annotation validation, QUAST, BUSCO, BlobToolKit, or CheckM. +```` + +- [ ] **Step 4: Add evidence page** + +Create `docs/evidence/fastaguard-v0.5-submission-readiness.md`: + +````markdown +# FastaGuard v0.5 Submission Readiness Evidence + +This page records tiny local evidence cases for the v0.5 submission-readiness +gate. The goal is to show FASTA-level hazards before official validators and +expensive QC. + +## Commands + +```bash +fastaguard testdata/submission_ids.fa \ + --gate submission \ + --submission-target ncbi \ + --json target/evidence/v0.5/submission_ids.json + +fastaguard testdata/submission_warnings.fa \ + --gate submission \ + --submission-target generic \ + --json target/evidence/v0.5/submission_warnings.json +``` + +## Scope + +FastaGuard can report parse validity, identifier safety, duplicate first-token +IDs, invalid sequence symbols, gap-like N runs, high ambiguity, and tiny-record +advisories. It cannot guarantee repository acceptance, biological completeness, +annotation correctness, or contamination status. + +## Expected Follow-Up + +After FASTA-level blockers are fixed, users should continue to official +validators, NCBI FCS, QUAST, BUSCO, BlobToolKit, CheckM, annotation, or +the next workflow step named in the report. +```` + +- [ ] **Step 5: Add release notes** + +Create `docs/releases/v0.5.0.md` with: + +```markdown +# FastaGuard v0.5.0 + +FastaGuard v0.5.0 is the Submission Readiness Gate release. + +## Highlights + +- Adds `--gate submission`. +- Adds `--submission-target generic|ncbi`. 
+- Adds submission-readiness fields to JSON, TSV, HTML, MultiQC, and compare outputs. +- Promotes existing identifier, header, gap, ambiguity, and tiny-record findings into a clearer submission-readiness view. + +## Boundary + +FastaGuard is a FASTA-level preflight tool. It does not replace NCBI, ENA, DDBJ, +NCBI FCS, QUAST, BUSCO, BlobToolKit, CheckM, annotation validation, or official +repository acceptance checks. +``` + +- [ ] **Step 6: Update workflow docs** + +In nf-core and Snakemake docs, add the command pattern: + +```bash +fastaguard {input.fasta} --gate submission --submission-target ncbi +``` + +State that pipeline authors should route on: + +```text +gate.mode +gate.status +gate.blocking_findings +readiness.categories[id=submission] +``` + +- [ ] **Step 7: Run docs tests** + +Run: + +```bash +python3 -m unittest discover tests/python -v +``` + +Expected: PASS. + +- [ ] **Step 8: Commit** + +```bash +git add README.md docs/roadmap.md docs/vision-plan.md docs/tool-landscape.md docs/output-contract.md docs/packaging.md docs/evidence/fastaguard-v0.5-submission-readiness.md docs/releases/v0.5.0.md examples/nf-core/README.md examples/snakemake/wrapper/README.md tests/python +git commit -m "docs: document v0.5 submission readiness" +``` + +## Task 9: Full Verification And Release Preparation + +**Files:** +- Modify only files needed for failures found by verification. + +- [ ] **Step 1: Run full Rust and Python gates** + +Run: + +```bash +python3 -m unittest discover tests/python -v +cargo fmt --check +cargo test --locked +cargo clippy --locked --all-targets --all-features -- -D warnings +git diff --check +git ls-files | xargs perl -ne 'print "$ARGV:$.:$_" if /[ \t]$/' +``` + +Expected: all commands exit 0 and trailing whitespace scan prints nothing. 
+ +- [ ] **Step 2: Run smoke commands** + +Run: + +```bash +cargo run --locked -- --schema >/tmp/fastaguard-v0.5-schema.json +cargo run --locked -- testdata/submission_ids.fa --gate submission --submission-target ncbi --json /tmp/submission.json --out /tmp/submission.html --tsv /tmp/submission.tsv --multiqc /tmp/submission_mqc.json +cargo run --locked -- compare testdata/valid_assembly.fa testdata/submission_ids.fa --gate submission --submission-target ncbi --json /tmp/submission_compare.json --out /tmp/submission_compare.html --tsv /tmp/submission_compare.tsv --multiqc /tmp/submission_compare_mqc.json +``` + +Expected: + +- `--schema` exits 0. +- single submission run exits 2 because `testdata/submission_ids.fa` contains blocking identifier hazards. +- compare submission run exits 2 because one sample fails. + +- [ ] **Step 3: Inspect final diff** + +Run: + +```bash +git status --short +git diff --stat origin/main..HEAD +git log --oneline --decorate --max-count=12 +``` + +Expected: the branch contains the v0.5 submission-readiness commits and no unrelated file changes. + +- [ ] **Step 4: Commit verification fixes if needed** + +If verification required edits: + +```bash +git status --short +git add src tests schema docs examples Cargo.toml Cargo.lock testdata +git commit -m "fix: stabilize v0.5 submission readiness" +``` + +If verification required no edits, do not create an empty commit. + +## Self-Review Checklist + +- Spec coverage: tasks cover CLI, gate behavior, readiness, JSON/provenance/scope, HTML, TSV, MultiQC, compare mode, schema, goldens, docs, evidence, and verification. +- Scope boundary: the plan stays assembly-first and database-free; it routes to official validators and NCBI FCS without claiming to run or replace them. +- Finding IDs: the plan preserves existing v0.4 IDs for identifier/header/gap findings to avoid breaking report consumers. 
+- Type consistency: `SubmissionTarget` flows from CLI config into gate/provenance/readiness/compare reports as `Option` or serialized strings. +- Test strategy: every behavior change starts with a failing test, then implementation, then focused verification and commit. + +## Execution Choice + +Plan complete and saved to `docs/superpowers/plans/2026-06-11-fastaguard-v0.5-submission-readiness.md`. Two execution options: + +1. **Subagent-Driven (recommended)** - dispatch a fresh subagent per task, review between tasks, fast iteration. +2. **Inline Execution** - execute tasks in this session using executing-plans, batch execution with checkpoints. + +Recommended choice: **Subagent-Driven**, because the feature touches independent surfaces: CLI/gate, readiness/schema, report writers, compare mode, and docs. From 606fa8f3ba11756035f5a1b7c122392f96974b00 Mon Sep 17 00:00:00 2001 From: Ehsan ESTAJI <71376358+ehsanestaji@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:17:57 +0200 Subject: [PATCH 03/13] feat: add submission target CLI --- src/cli.rs | 18 ++++++++++ src/compare.rs | 1 + src/gate.rs | 3 ++ src/lib.rs | 2 ++ src/models.rs | 13 ++++++-- src/report/html.rs | 2 ++ src/report/mod.rs | 2 ++ src/report/multiqc.rs | 2 ++ src/report/tsv.rs | 2 ++ src/submission.rs | 19 +++++++++++ tests/cli.rs | 78 +++++++++++++++++++++++++++++++++++++++++++ 11 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 src/submission.rs diff --git a/src/cli.rs b/src/cli.rs index 0ea2acf..6aee0ec 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -10,6 +10,7 @@ use std::path::{Path, PathBuf}; use crate::gate::{self, GateMode}; use crate::profile::ThresholdOverrides; +use crate::submission::SubmissionTarget; #[derive(Debug, Parser)] #[command(name = "fastaguard")] @@ -86,6 +87,10 @@ pub struct AnalysisArgs { #[arg(long, value_enum, default_value_t = GateMode::None)] pub gate: GateMode, + /// Submission-readiness target to record; defaults to generic with --gate submission. 
+ #[arg(long, value_enum)] + pub submission_target: Option, + /// Comma-separated rule IDs that should fail the run when triggered. #[arg(long, value_delimiter = ',')] pub fail_on: Vec, @@ -161,6 +166,7 @@ pub struct RunConfig { pub input: PathBuf, pub profile: String, pub gate_mode: GateMode, + pub submission_target: Option, pub outputs: OutputPaths, pub rules: RuleConfig, pub thresholds: ThresholdOverrides, @@ -175,6 +181,7 @@ pub struct CompareConfig { pub inputs: Vec, pub profile: String, pub gate_mode: GateMode, + pub submission_target: Option, pub outputs: OutputPaths, pub rules: RuleConfig, pub thresholds: ThresholdOverrides, @@ -201,6 +208,7 @@ pub struct RuleConfig { struct ValidatedAnalysis { profile: String, gate_mode: GateMode, + submission_target: Option, rules: RuleConfig, thresholds: ThresholdOverrides, threads: usize, @@ -280,6 +288,7 @@ const ROOT_RUN_ARG_IDS: &[&str] = &[ "input", "profile", "gate", + "submission_target", "fail_on", "max_n_rate", "min_contig_length", @@ -338,6 +347,7 @@ impl RunArgs { input, profile: analysis.profile, gate_mode: analysis.gate_mode, + submission_target: analysis.submission_target, outputs: self.outputs.output_paths(), rules: analysis.rules, thresholds: analysis.thresholds, @@ -364,6 +374,7 @@ impl CompareArgs { inputs: self.inputs.clone(), profile: analysis.profile, gate_mode: analysis.gate_mode, + submission_target: analysis.submission_target, outputs: self.outputs.output_paths(), rules: analysis.rules, thresholds: analysis.thresholds, @@ -381,6 +392,7 @@ impl AnalysisArgs { fn has_overrides(&self) -> bool { self.profile != "assembly" || self.gate != GateMode::None + || self.submission_target.is_some() || !self.fail_on.is_empty() || self.max_n_rate.is_some() || self.min_contig_length.is_some() @@ -416,10 +428,15 @@ impl AnalysisArgs { .as_deref() .map(parse_expected_size) .transpose()?; + let submission_target = match (self.gate, self.submission_target) { + (GateMode::Submission, None) => 
Some(SubmissionTarget::Generic), + (_, target) => target, + }; Ok(ValidatedAnalysis { profile: self.profile.clone(), gate_mode: self.gate, + submission_target, rules: RuleConfig { fail_on: gate::final_fail_on(self.gate, &self.fail_on), }, @@ -540,6 +557,7 @@ mod tests { analysis: AnalysisArgs { profile: "assembly".to_string(), gate: GateMode::None, + submission_target: None, fail_on: Vec::new(), max_n_rate, min_contig_length: None, diff --git a/src/compare.rs b/src/compare.rs index ea07731..a56dcec 100644 --- a/src/compare.rs +++ b/src/compare.rs @@ -59,6 +59,7 @@ fn run_one_sample(config: &CompareConfig, input: &Path) -> Result "none", GateMode::Pipeline => "pipeline", + GateMode::Submission => "submission", } } } @@ -64,6 +66,7 @@ pub fn decision( GateDecision { mode: mode.as_str().to_string(), + submission_target: None, status, blocking_findings, advisory_findings, diff --git a/src/lib.rs b/src/lib.rs index 721e493..9b19c79 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,6 +10,7 @@ pub mod profile; pub mod readiness; pub mod report; pub mod stats; +pub mod submission; use anyhow::Result; use cli::{Cli, CommandConfig}; @@ -116,6 +117,7 @@ mod tests { input: PathBuf::from("input.fa"), profile: "assembly".to_string(), gate_mode: GateMode::None, + submission_target: None, outputs: OutputPaths { html: PathBuf::from("fastaguard_report.html"), json: PathBuf::from("fastaguard.json"), diff --git a/src/models.rs b/src/models.rs index 1c45ceb..e3afef6 100644 --- a/src/models.rs +++ b/src/models.rs @@ -14,6 +14,7 @@ use crate::metrics::AssemblyMetrics; use crate::profile::ProfileConfig; use crate::readiness::{self, ReadinessReport, ReadinessScope}; use crate::stats::composition::percent; +use crate::submission::SubmissionTarget; pub const SCHEMA_VERSION: &str = "0.4.0"; pub const TOOL_NAME: &str = "FastaGuard"; @@ -121,6 +122,8 @@ pub struct Verdict { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct GateDecision { pub mode: String, + #[serde(skip_serializing_if = 
"Option::is_none")] + pub submission_target: Option, pub status: VerdictStatus, pub blocking_findings: Vec, pub advisory_findings: Vec, @@ -159,6 +162,8 @@ pub struct Scope { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Provenance { pub profile: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub submission_target: Option, pub threads: usize, pub fail_on: Vec, pub thresholds: ProvenanceThresholds, @@ -341,12 +346,13 @@ impl FastaguardReport { let findings = analysis.findings; let plots = build_plots(&metrics, profile); let provenance = build_provenance(&config, profile, duration_ms)?; - let gate = gate::decision( + let mut gate = gate::decision( config.gate_mode, analysis.status, &findings, &config.rules.fail_on, ); + gate.submission_target = config.submission_target; let readiness = readiness::build_readiness( analysis.status, &gate.blocking_findings, @@ -438,12 +444,13 @@ impl FastaguardReport { actions: finding_actions("invalid_fasta_structure"), }]; let provenance = build_provenance(&config, profile, duration_ms)?; - let gate = gate::decision( + let mut gate = gate::decision( config.gate_mode, VerdictStatus::Fail, &findings, &config.rules.fail_on, ); + gate.submission_target = config.submission_target; let readiness = readiness::build_readiness( VerdictStatus::Fail, &gate.blocking_findings, @@ -873,6 +880,7 @@ fn build_provenance( Ok(Provenance { profile: profile.name.clone(), + submission_target: config.submission_target, threads: config.threads, fail_on: config.rules.fail_on.iter().cloned().collect(), thresholds: ProvenanceThresholds { @@ -1061,6 +1069,7 @@ mod tests { input, profile: "assembly".to_string(), gate_mode: GateMode::None, + submission_target: None, outputs: OutputPaths { html: PathBuf::from("fastaguard_report.html"), json: PathBuf::from("fastaguard.json"), diff --git a/src/report/html.rs b/src/report/html.rs index 6ed0926..fc93a0c 100644 --- a/src/report/html.rs +++ b/src/report/html.rs @@ -718,6 +718,7 @@ mod tests { 
}, gate: GateDecision { mode: "none".to_string(), + submission_target: None, status: VerdictStatus::Pass, blocking_findings: Vec::new(), advisory_findings: Vec::new(), @@ -743,6 +744,7 @@ mod tests { }, provenance: Provenance { profile: "assembly".to_string(), + submission_target: None, threads: 1, fail_on: Vec::new(), thresholds: ProvenanceThresholds { diff --git a/src/report/mod.rs b/src/report/mod.rs index 037249b..e335952 100644 --- a/src/report/mod.rs +++ b/src/report/mod.rs @@ -287,6 +287,7 @@ mod tests { }, gate: GateDecision { mode: "none".to_string(), + submission_target: None, status: VerdictStatus::Pass, blocking_findings: Vec::new(), advisory_findings: Vec::new(), @@ -312,6 +313,7 @@ mod tests { }, provenance: Provenance { profile: "assembly".to_string(), + submission_target: None, threads: 1, fail_on: Vec::new(), thresholds: ProvenanceThresholds { diff --git a/src/report/multiqc.rs b/src/report/multiqc.rs index 0b0a811..938e47c 100644 --- a/src/report/multiqc.rs +++ b/src/report/multiqc.rs @@ -225,6 +225,7 @@ mod tests { }, gate: GateDecision { mode: "none".to_string(), + submission_target: None, status: VerdictStatus::Pass, blocking_findings: Vec::new(), advisory_findings: Vec::new(), @@ -250,6 +251,7 @@ mod tests { }, provenance: Provenance { profile: "assembly".to_string(), + submission_target: None, threads: 1, fail_on: Vec::new(), thresholds: ProvenanceThresholds { diff --git a/src/report/tsv.rs b/src/report/tsv.rs index 175928a..f7416dd 100644 --- a/src/report/tsv.rs +++ b/src/report/tsv.rs @@ -351,6 +351,7 @@ mod tests { }, gate: GateDecision { mode: "none".to_string(), + submission_target: None, status, blocking_findings: Vec::new(), advisory_findings: Vec::new(), @@ -376,6 +377,7 @@ mod tests { }, provenance: Provenance { profile: "assembly".to_string(), + submission_target: None, threads: 1, fail_on: Vec::new(), thresholds: ProvenanceThresholds { diff --git a/src/submission.rs b/src/submission.rs new file mode 100644 index 0000000..f852d6c 
--- /dev/null +++ b/src/submission.rs @@ -0,0 +1,19 @@ +use clap::ValueEnum; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +#[value(rename_all = "snake_case")] +pub enum SubmissionTarget { + Generic, + Ncbi, +} + +impl SubmissionTarget { + pub fn as_str(self) -> &'static str { + match self { + SubmissionTarget::Generic => "generic", + SubmissionTarget::Ncbi => "ncbi", + } + } +} diff --git a/tests/cli.rs b/tests/cli.rs index 5d33dad..7f68dee 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1187,6 +1187,84 @@ fn invalid_provenance_timestamp_override_is_tool_error() { )); } +#[test] +fn submission_gate_defaults_to_generic_target() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "submission_default"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/valid_assembly.fa", + "--min-contig-length", + "1", + "--gate", + "submission", + "--json", + ]) + .arg(&outputs.json) + .arg("--out") + .arg(&outputs.html) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(1) + .stderr(predicate::str::is_empty()); + + let report = read_json(&outputs.json); + assert_eq!(report["gate"]["mode"], json!("submission")); + assert_eq!(report["gate"]["submission_target"], json!("generic")); + assert_eq!(report["provenance"]["submission_target"], json!("generic")); +} + +#[test] +fn submission_target_ncbi_is_serialized_when_requested() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "submission_ncbi"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/valid_assembly.fa", + "--min-contig-length", + "1", + "--gate", + "submission", + "--submission-target", + "ncbi", + "--json", + ]) + .arg(&outputs.json) + .arg("--out") + .arg(&outputs.html) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") 
+ .arg(&outputs.multiqc) + .assert() + .code(1) + .stderr(predicate::str::is_empty()); + + let report = read_json(&outputs.json); + assert_eq!(report["gate"]["submission_target"], json!("ncbi")); + assert_eq!(report["provenance"]["submission_target"], json!("ncbi")); +} + +#[test] +fn unknown_submission_target_is_cli_error() { + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/valid_assembly.fa", + "--gate", + "submission", + "--submission-target", + "ena", + ]) + .assert() + .code(2) + .stderr(predicate::str::contains("invalid value 'ena'")); +} + struct OutputPaths { html: std::path::PathBuf, json: std::path::PathBuf, From c5374d388c0935a01936e0666f4746a98f86ec58 Mon Sep 17 00:00:00 2001 From: Ehsan ESTAJI <71376358+ehsanestaji@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:30:15 +0200 Subject: [PATCH 04/13] feat: add submission gate semantics --- src/cli.rs | 2 +- src/gate.rs | 39 +++++++++++++++++++++++++++++++---- src/models.rs | 8 ++++---- testdata/submission_ids.fa | 8 ++++++++ tests/cli.rs | 42 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 90 insertions(+), 9 deletions(-) create mode 100644 testdata/submission_ids.fa diff --git a/src/cli.rs b/src/cli.rs index 6aee0ec..8eaca73 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -438,7 +438,7 @@ impl AnalysisArgs { gate_mode: self.gate, submission_target, rules: RuleConfig { - fail_on: gate::final_fail_on(self.gate, &self.fail_on), + fail_on: gate::final_fail_on(self.gate, submission_target, &self.fail_on), }, thresholds: ThresholdOverrides { max_n_rate: self.max_n_rate, diff --git a/src/gate.rs b/src/gate.rs index 027c739..6f0d22a 100644 --- a/src/gate.rs +++ b/src/gate.rs @@ -3,6 +3,7 @@ use serde::{Deserialize, Serialize}; use std::collections::BTreeSet; use crate::models::{Finding, GateDecision, Severity, VerdictStatus}; +use crate::submission::SubmissionTarget; pub const PIPELINE_FAIL_ON: &[&str] = &[ "duplicate_first_token_ids", @@ -12,6 +13,23 @@ pub const 
PIPELINE_FAIL_ON: &[&str] = &[ "invalid_fasta_structure", ]; +pub const SUBMISSION_FAIL_ON_GENERIC: &[&str] = &[ + "duplicate_first_token_ids", + "duplicate_ids", + "invalid_chars", + "invalid_fasta_structure", + "unsafe_ids", +]; + +pub const SUBMISSION_FAIL_ON_NCBI: &[&str] = &[ + "duplicate_first_token_ids", + "duplicate_ids", + "invalid_chars", + "invalid_fasta_structure", + "reserved_header_chars", + "unsafe_ids", +]; + #[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] #[value(rename_all = "snake_case")] @@ -31,7 +49,11 @@ impl GateMode { } } -pub fn final_fail_on(mode: GateMode, explicit_rules: &[String]) -> BTreeSet { +pub fn final_fail_on( + mode: GateMode, + submission_target: Option, + explicit_rules: &[String], +) -> BTreeSet { let mut fail_on = explicit_rules .iter() .flat_map(|value| value.split(',')) @@ -40,8 +62,16 @@ pub fn final_fail_on(mode: GateMode, explicit_rules: &[String]) -> BTreeSet>(); - if mode == GateMode::Pipeline { - fail_on.extend(PIPELINE_FAIL_ON.iter().map(|id| (*id).to_string())); + match mode { + GateMode::Pipeline => fail_on.extend(PIPELINE_FAIL_ON.iter().map(|id| (*id).to_string())), + GateMode::Submission => { + let preset = match submission_target.unwrap_or(SubmissionTarget::Generic) { + SubmissionTarget::Generic => SUBMISSION_FAIL_ON_GENERIC, + SubmissionTarget::Ncbi => SUBMISSION_FAIL_ON_NCBI, + }; + fail_on.extend(preset.iter().map(|id| (*id).to_string())); + } + GateMode::None => {} } fail_on @@ -49,6 +79,7 @@ pub fn final_fail_on(mode: GateMode, explicit_rules: &[String]) -> BTreeSet, status: VerdictStatus, findings: &[Finding], fail_on: &BTreeSet, @@ -66,7 +97,7 @@ pub fn decision( GateDecision { mode: mode.as_str().to_string(), - submission_target: None, + submission_target, status, blocking_findings, advisory_findings, diff --git a/src/models.rs b/src/models.rs index e3afef6..0252c45 100644 --- a/src/models.rs +++ b/src/models.rs @@ -346,13 +346,13 @@ 
impl FastaguardReport { let findings = analysis.findings; let plots = build_plots(&metrics, profile); let provenance = build_provenance(&config, profile, duration_ms)?; - let mut gate = gate::decision( + let gate = gate::decision( config.gate_mode, + config.submission_target, analysis.status, &findings, &config.rules.fail_on, ); - gate.submission_target = config.submission_target; let readiness = readiness::build_readiness( analysis.status, &gate.blocking_findings, @@ -444,13 +444,13 @@ impl FastaguardReport { actions: finding_actions("invalid_fasta_structure"), }]; let provenance = build_provenance(&config, profile, duration_ms)?; - let mut gate = gate::decision( + let gate = gate::decision( config.gate_mode, + config.submission_target, VerdictStatus::Fail, &findings, &config.rules.fail_on, ); - gate.submission_target = config.submission_target; let readiness = readiness::build_readiness( VerdictStatus::Fail, &gate.blocking_findings, diff --git a/testdata/submission_ids.fa b/testdata/submission_ids.fa new file mode 100644 index 0000000..d05baca --- /dev/null +++ b/testdata/submission_ids.fa @@ -0,0 +1,8 @@ +>seq/one +ACGTACGT +>seq two +ACGTACGT +>seq two duplicate-description +ACGTACGA +>pipe|id +ACGTACGT diff --git a/tests/cli.rs b/tests/cli.rs index 7f68dee..3e14c47 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1250,6 +1250,48 @@ fn submission_target_ncbi_is_serialized_when_requested() { assert_eq!(report["provenance"]["submission_target"], json!("ncbi")); } +#[test] +fn submission_gate_fails_identifier_hazards() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "submission_ids"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/submission_ids.fa", + "--gate", + "submission", + "--submission-target", + "ncbi", + "--json", + ]) + .arg(&outputs.json) + .arg("--out") + .arg(&outputs.html) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2) 
+ .stderr(predicate::str::contains("fastaguard error:").not()); + + let report = read_json(&outputs.json); + assert_eq!(report["gate"]["mode"], json!("submission")); + assert_eq!(report["gate"]["status"], json!("FAIL")); + assert!(array_contains_string( + &report["gate"]["blocking_findings"], + "duplicate_first_token_ids", + )); + assert!(array_contains_string( + &report["gate"]["blocking_findings"], + "unsafe_ids", + )); + assert!(array_contains_string( + &report["gate"]["blocking_findings"], + "reserved_header_chars", + )); +} + #[test] fn unknown_submission_target_is_cli_error() { let mut cmd = Command::cargo_bin("fastaguard").unwrap(); From 9c68f24e443f9a19bac33d76e12b21891b92722d Mon Sep 17 00:00:00 2001 From: Ehsan ESTAJI <71376358+ehsanestaji@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:40:39 +0200 Subject: [PATCH 05/13] feat: add submission readiness metadata --- src/compare.rs | 1 + src/models.rs | 2 + src/readiness.rs | 79 +++++++++++++++++++++----- src/report/compare_html.rs | 1 + src/report/compare_multiqc.rs | 1 + src/report/compare_tsv.rs | 1 + src/report/html.rs | 1 + src/report/mod.rs | 1 + src/report/multiqc.rs | 1 + src/report/tsv.rs | 1 + tests/cli.rs | 8 +++ tests/golden/compare_mixed_status.json | 13 ++++- tests/golden/problem_assembly.json | 9 ++- 13 files changed, 102 insertions(+), 17 deletions(-) diff --git a/src/compare.rs b/src/compare.rs index a56dcec..69f6e15 100644 --- a/src/compare.rs +++ b/src/compare.rs @@ -312,6 +312,7 @@ mod tests { &[], &[], crate::readiness::ReadinessScope::Single, + None, ) .categories, sequence_count, diff --git a/src/models.rs b/src/models.rs index 0252c45..704d2a8 100644 --- a/src/models.rs +++ b/src/models.rs @@ -358,6 +358,7 @@ impl FastaguardReport { &gate.blocking_findings, &findings, ReadinessScope::Single, + config.submission_target, ); Ok(Self { @@ -456,6 +457,7 @@ impl FastaguardReport { &gate.blocking_findings, &findings, ReadinessScope::Single, + config.submission_target, ); Ok(Self { diff 
--git a/src/readiness.rs b/src/readiness.rs index 769afd0..e41f4a2 100644 --- a/src/readiness.rs +++ b/src/readiness.rs @@ -30,7 +30,7 @@ impl ReadinessReport { impl Default for ReadinessReport { fn default() -> Self { - build_readiness(VerdictStatus::Pass, &[], &[], ReadinessScope::Single) + build_readiness(VerdictStatus::Pass, &[], &[], ReadinessScope::Single, None) } } @@ -44,6 +44,8 @@ pub struct ReadinessOverall { pub struct ReadinessCategory { pub id: String, pub label: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub target: Option, pub status: ReadinessStatus, pub findings: Vec, } @@ -53,8 +55,18 @@ pub fn build_readiness( blocking_findings: &[String], findings: &[Finding], scope: ReadinessScope, + submission_target: Option, ) -> ReadinessReport { let mut categories = base_categories(scope); + if let Some(target) = submission_target { + if let Some(category) = categories + .iter_mut() + .find(|category| category.id == "submission") + { + category.target = Some(target.as_str().to_string()); + } + } + let mut blockers = Vec::new(); for finding in findings { for category_id in category_ids_for_finding(&finding.id) { @@ -114,6 +126,7 @@ fn base_categories(scope: ReadinessScope) -> Vec { .map(|(id, label)| ReadinessCategory { id: id.to_string(), label: label.to_string(), + target: None, status: ReadinessStatus::Pass, findings: Vec::new(), }) @@ -124,17 +137,13 @@ fn category_ids_for_finding(id: &str) -> &'static [&'static str] { match id { "invalid_fasta_structure" => &["file", "structure"], "invalid_chars" => &["alphabet"], - "duplicate_ids" | "duplicate_first_token_ids" => &["index"], + "duplicate_ids" | "duplicate_first_token_ids" => &["index", "submission"], "unsafe_ids" | "long_headers" | "reserved_header_chars" => &["index", "submission"], - "high_n_rate" - | "gap_runs" - | "tiny_contigs" - | "gc_outliers" - | "length_outliers" - | "composite_anomalies" - | "gap_pattern_warnings" - | "expected_size_outlier" => &["assembly"], - 
"terminal_ns" => &["assembly", "submission"], + "terminal_ns" | "gap_pattern_warnings" | "gap_runs" => &["assembly", "submission"], + "high_n_rate" | "tiny_contigs" => &["assembly", "submission"], + "gc_outliers" | "length_outliers" | "composite_anomalies" | "expected_size_outlier" => { + &["assembly"] + } "cohort_total_length_outliers" | "cohort_gc_outliers" | "cohort_n_percent_outliers" @@ -182,12 +191,16 @@ mod tests { &["duplicate_first_token_ids".to_string()], &[finding("duplicate_first_token_ids", Severity::Critical)], ReadinessScope::Single, + None, ); assert_eq!(readiness.overall.status, ReadinessStatus::Fail); assert_eq!( readiness.overall.blockers, - ["index.duplicate_first_token_ids"] + [ + "index.duplicate_first_token_ids", + "submission.duplicate_first_token_ids" + ] ); let index = readiness.category("index").unwrap(); assert_eq!(index.status, ReadinessStatus::Fail); @@ -204,10 +217,14 @@ mod tests { finding("unsafe_ids", Severity::Major), ], ReadinessScope::Single, + None, ); assert_eq!(readiness.overall.status, ReadinessStatus::Fail); - assert_eq!(readiness.overall.blockers, ["index.duplicate_ids"]); + assert_eq!( + readiness.overall.blockers, + ["index.duplicate_ids", "submission.duplicate_ids"] + ); let index = readiness.category("index").unwrap(); assert_eq!(index.status, ReadinessStatus::Fail); assert_eq!(index.findings, ["duplicate_ids", "unsafe_ids"]); @@ -220,6 +237,7 @@ mod tests { &[], &[finding("terminal_ns", Severity::Major)], ReadinessScope::Single, + None, ); assert_eq!(readiness.overall.status, ReadinessStatus::Warn); @@ -230,9 +248,42 @@ mod tests { ); } + #[test] + fn submission_target_is_attached_to_submission_category() { + let readiness = build_readiness( + VerdictStatus::Fail, + &["reserved_header_chars".to_string()], + &[finding("reserved_header_chars", Severity::Minor)], + ReadinessScope::Single, + Some(crate::submission::SubmissionTarget::Ncbi), + ); + + let submission = readiness.category("submission").unwrap(); + 
assert_eq!(submission.target.as_deref(), Some("ncbi")); + assert_eq!(submission.status, ReadinessStatus::Fail); + assert_eq!(submission.findings, ["reserved_header_chars"]); + } + + #[test] + fn submission_findings_warn_when_not_blocking() { + let readiness = build_readiness( + VerdictStatus::Warn, + &[], + &[finding("long_headers", Severity::Minor)], + ReadinessScope::Single, + Some(crate::submission::SubmissionTarget::Generic), + ); + + let submission = readiness.category("submission").unwrap(); + assert_eq!(submission.target.as_deref(), Some("generic")); + assert_eq!(submission.status, ReadinessStatus::Warn); + assert!(readiness.overall.blockers.is_empty()); + } + #[test] fn clean_report_has_machine_and_core_categories_pass() { - let readiness = build_readiness(VerdictStatus::Pass, &[], &[], ReadinessScope::Single); + let readiness = + build_readiness(VerdictStatus::Pass, &[], &[], ReadinessScope::Single, None); assert_eq!(readiness.overall.status, ReadinessStatus::Pass); for id in [ diff --git a/src/report/compare_html.rs b/src/report/compare_html.rs index 02054f4..1922545 100644 --- a/src/report/compare_html.rs +++ b/src/report/compare_html.rs @@ -443,6 +443,7 @@ mod tests { &[], &[], crate::readiness::ReadinessScope::Single, + None, ) .categories, sequence_count: 2, diff --git a/src/report/compare_multiqc.rs b/src/report/compare_multiqc.rs index be0452c..6fe1a9a 100644 --- a/src/report/compare_multiqc.rs +++ b/src/report/compare_multiqc.rs @@ -219,6 +219,7 @@ mod tests { &[], &[], crate::readiness::ReadinessScope::Single, + None, ) .categories, sequence_count: 2, diff --git a/src/report/compare_tsv.rs b/src/report/compare_tsv.rs index 4c378ed..4a6e68a 100644 --- a/src/report/compare_tsv.rs +++ b/src/report/compare_tsv.rs @@ -163,6 +163,7 @@ mod tests { &[], &[], crate::readiness::ReadinessScope::Single, + None, ) .categories, sequence_count: 2, diff --git a/src/report/html.rs b/src/report/html.rs index fc93a0c..a2e0d80 100644 --- a/src/report/html.rs +++ 
b/src/report/html.rs @@ -729,6 +729,7 @@ mod tests { &[], &[], crate::readiness::ReadinessScope::Single, + None, ), machine_summary: MachineSummary { verdict: VerdictStatus::Pass, diff --git a/src/report/mod.rs b/src/report/mod.rs index e335952..c1656b5 100644 --- a/src/report/mod.rs +++ b/src/report/mod.rs @@ -298,6 +298,7 @@ mod tests { &[], &[], crate::readiness::ReadinessScope::Single, + None, ), machine_summary: MachineSummary { verdict: VerdictStatus::Pass, diff --git a/src/report/multiqc.rs b/src/report/multiqc.rs index 938e47c..1fe97e4 100644 --- a/src/report/multiqc.rs +++ b/src/report/multiqc.rs @@ -236,6 +236,7 @@ mod tests { &[], &[], crate::readiness::ReadinessScope::Single, + None, ), machine_summary: MachineSummary { verdict: VerdictStatus::Pass, diff --git a/src/report/tsv.rs b/src/report/tsv.rs index f7416dd..7c7dac2 100644 --- a/src/report/tsv.rs +++ b/src/report/tsv.rs @@ -362,6 +362,7 @@ mod tests { &[], &[], crate::readiness::ReadinessScope::Single, + None, ), machine_summary: MachineSummary { verdict: status, diff --git a/tests/cli.rs b/tests/cli.rs index 3e14c47..db40ff5 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1290,6 +1290,14 @@ fn submission_gate_fails_identifier_hazards() { &report["gate"]["blocking_findings"], "reserved_header_chars", )); + let submission_readiness = report["readiness"]["categories"] + .as_array() + .unwrap() + .iter() + .find(|category| category["id"] == json!("submission")) + .unwrap(); + assert_eq!(submission_readiness["target"], json!("ncbi")); + assert_eq!(submission_readiness["status"], json!("FAIL")); } #[test] diff --git a/tests/golden/compare_mixed_status.json b/tests/golden/compare_mixed_status.json index 6cd191d..2622b7a 100644 --- a/tests/golden/compare_mixed_status.json +++ b/tests/golden/compare_mixed_status.json @@ -61,6 +61,7 @@ "label": "Submission readiness", "status": "WARN", "findings": [ + "tiny_contigs", "terminal_ns" ] }, @@ -147,8 +148,13 @@ { "id": "submission", "label": "Submission 
readiness", - "status": "WARN", + "status": "FAIL", "findings": [ + "duplicate_ids", + "duplicate_first_token_ids", + "high_n_rate", + "tiny_contigs", + "gap_runs", "terminal_ns" ] }, @@ -186,9 +192,12 @@ ], "readiness_blockers": [ "index.duplicate_ids", + "submission.duplicate_ids", "index.duplicate_first_token_ids", + "submission.duplicate_first_token_ids", "alphabet.invalid_chars", - "assembly.high_n_rate" + "assembly.high_n_rate", + "submission.high_n_rate" ], "recommended_next_tools": [ "seqkit", diff --git a/tests/golden/problem_assembly.json b/tests/golden/problem_assembly.json index 0aee5c7..1da560f 100644 --- a/tests/golden/problem_assembly.json +++ b/tests/golden/problem_assembly.json @@ -40,7 +40,9 @@ "status": "FAIL", "blockers": [ "index.duplicate_ids", + "submission.duplicate_ids", "index.duplicate_first_token_ids", + "submission.duplicate_first_token_ids", "alphabet.invalid_chars" ] }, @@ -90,8 +92,13 @@ { "id": "submission", "label": "Submission readiness", - "status": "WARN", + "status": "FAIL", "findings": [ + "duplicate_ids", + "duplicate_first_token_ids", + "high_n_rate", + "tiny_contigs", + "gap_runs", "terminal_ns" ] }, From 3074e9df151de670807941ef41f6f429f2b026dd Mon Sep 17 00:00:00 2001 From: Ehsan ESTAJI <71376358+ehsanestaji@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:53:47 +0200 Subject: [PATCH 06/13] feat: route submission readiness findings --- schema/finding-catalog.json | 60 +++++++++++++++----- src/models.rs | 76 ++++++++++++++++++++++---- tests/cli.rs | 45 +++++++++++++++ tests/golden/compare_mixed_status.json | 7 ++- tests/golden/invalid_empty_record.json | 8 ++- tests/golden/problem_assembly.json | 21 ++++++- tests/golden/valid_assembly.json | 12 +++- 7 files changed, 195 insertions(+), 34 deletions(-) diff --git a/schema/finding-catalog.json b/schema/finding-catalog.json index 738abc6..f2f3c87 100644 --- a/schema/finding-catalog.json +++ b/schema/finding-catalog.json @@ -45,7 +45,7 @@ "default_verdict_effect": "fail", 
"description": "First whitespace-delimited FASTA identifiers collide.", "meaning": "First whitespace-delimited FASTA identifiers collide.", - "why_it_matters": "Many downstream tools index records by the first header token.", + "why_it_matters": "Many downstream tools and submission validators index records by the first header token; collisions can affect submission readiness.", "evidence_fields": [ "summary.duplicate_first_token_id_count", "findings[].affected_count" @@ -63,9 +63,13 @@ { "tool": "seqkit", "role": "inspect and rename records with colliding first-token identifiers" + }, + { + "tool": "official submission validator", + "role": "validate repository-specific rules after FASTA-level identifier collisions are fixed" } ], - "cannot_conclude": ["biological duplication", "assembly correctness"] + "cannot_conclude": ["biological duplication", "assembly correctness", "repository acceptance"] }, { "id": "unsafe_ids", @@ -78,7 +82,7 @@ "default_verdict_effect": "warn", "description": "One or more FASTA identifiers contain characters that may be unsafe for downstream tools.", "meaning": "One or more FASTA identifiers contain characters that may be unsafe for downstream tools.", - "why_it_matters": "Some indexing, database, submission, and workflow tools assume simple portable record identifiers.", + "why_it_matters": "Some indexing, database, submission, and workflow tools assume simple portable record identifiers; unsafe characters can affect submission readiness.", "evidence_fields": ["summary.unsafe_id_count", "findings[].affected_count"], "suggested_actions": [ { @@ -93,9 +97,13 @@ { "tool": "seqkit", "role": "normalize FASTA identifiers before indexing or submission" + }, + { + "tool": "official submission validator", + "role": "validate repository-specific rules after FASTA-level identifier issues are fixed" } ], - "cannot_conclude": ["sequence quality", "assembly correctness"] + "cannot_conclude": ["sequence quality", "assembly correctness", "repository 
acceptance"] }, { "id": "long_headers", @@ -108,7 +116,7 @@ "default_verdict_effect": "warn", "description": "One or more FASTA headers exceed the compatibility length limit.", "meaning": "One or more FASTA headers exceed the compatibility length limit.", - "why_it_matters": "Long headers can be truncated or rejected by some database, indexing, and submission workflows.", + "why_it_matters": "Long headers can be truncated or rejected by some database, indexing, and submission workflows, so they can affect submission readiness.", "evidence_fields": ["summary.long_header_count", "findings[].affected_count"], "suggested_actions": [ { @@ -123,9 +131,13 @@ { "tool": "seqkit", "role": "shorten headers or move descriptive metadata into sidecar tables" + }, + { + "tool": "official submission validator", + "role": "validate repository-specific header limits after FASTA-level issues are fixed" } ], - "cannot_conclude": ["sequence quality", "assembly correctness"] + "cannot_conclude": ["sequence quality", "assembly correctness", "repository acceptance"] }, { "id": "reserved_header_chars", @@ -138,7 +150,7 @@ "default_verdict_effect": "warn", "description": "One or more FASTA headers contain reserved compatibility characters.", "meaning": "One or more FASTA headers contain reserved compatibility characters.", - "why_it_matters": "Reserved header characters can confuse parsers, database builders, or submission validators.", + "why_it_matters": "Reserved header characters can confuse parsers, database builders, or submission validators and can affect submission readiness.", "evidence_fields": [ "summary.reserved_header_char_count", "findings[].affected_count" @@ -156,9 +168,13 @@ { "tool": "seqkit", "role": "normalize headers before database construction or submission" + }, + { + "tool": "official submission validator", + "role": "validate repository-specific header character rules after FASTA-level issues are fixed" } ], - "cannot_conclude": ["sequence quality", "assembly 
correctness"] + "cannot_conclude": ["sequence quality", "assembly correctness", "repository acceptance"] }, { "id": "invalid_chars", @@ -197,7 +213,7 @@ "default_severity": "major", "default_verdict_effect": "warn", "meaning": "Global or per-sequence N content exceeds the assembly profile threshold.", - "why_it_matters": "High N content can reduce mapping confidence and fragment annotation or polishing steps.", + "why_it_matters": "High N content can reduce mapping confidence, fragment annotation or polishing steps, and trigger submission-oriented ambiguity follow-up.", "evidence_fields": [ "summary.n_percent", "summary.high_n_sequence_count", @@ -228,9 +244,13 @@ { "tool": "BUSCO", "role": "biological completeness after structural preflight checks" + }, + { + "tool": "NCBI FCS", + "role": "database-backed contamination or adaptor screening when submission-oriented ambiguity signals need follow-up" } ], - "cannot_conclude": ["biological completeness", "misassembly status", "taxonomic contamination"] + "cannot_conclude": ["biological completeness", "misassembly status", "taxonomic contamination", "repository acceptance"] }, { "id": "tiny_contigs", @@ -269,7 +289,7 @@ "default_severity": "major", "default_verdict_effect": "warn", "meaning": "The longest contiguous N gap run exceeds the assembly profile limit.", - "why_it_matters": "Long gap runs can indicate unresolved assembly regions and may disrupt mapping or annotation.", + "why_it_matters": "Long gap runs can indicate unresolved assembly regions, disrupt mapping or annotation, and trigger submission-oriented gap follow-up.", "evidence_fields": ["summary.max_gap_run", "findings[].affected_count"], "suggested_actions": [ { @@ -284,9 +304,13 @@ { "tool": "QUAST", "role": "assembly-level gap and scaffold evaluation" + }, + { + "tool": "NCBI FCS", + "role": "database-backed contamination or adaptor screening when submission-oriented gap signals need follow-up" } ], - "cannot_conclude": ["gap correctness", 
"misassembly status without alignment evidence"] + "cannot_conclude": ["gap correctness", "misassembly status without alignment evidence", "repository acceptance"] }, { "id": "terminal_ns", @@ -318,9 +342,13 @@ { "tool": "seqkit", "role": "inspect records with leading or trailing N bases" + }, + { + "tool": "official submission validator", + "role": "validate repository-specific rules after terminal N issues are reviewed or fixed" } ], - "cannot_conclude": ["gap correctness", "submission acceptance"] + "cannot_conclude": ["gap correctness", "submission acceptance", "repository acceptance"] }, { "id": "gap_pattern_warnings", @@ -352,9 +380,13 @@ { "tool": "QUAST", "role": "evaluate scaffold gap patterns after FASTA preflight" + }, + { + "tool": "official submission validator", + "role": "validate repository-specific gap pattern rules after FASTA-level issues are reviewed or fixed" } ], - "cannot_conclude": ["gap correctness", "misassembly status without alignment evidence"] + "cannot_conclude": ["gap correctness", "misassembly status without alignment evidence", "repository acceptance"] }, { "id": "expected_size_outlier", diff --git a/src/models.rs b/src/models.rs index 704d2a8..c62c6d0 100644 --- a/src/models.rs +++ b/src/models.rs @@ -723,18 +723,34 @@ fn routing_hints(findings: &[Finding]) -> Vec { "deduplicate_or_rename_records", false, ), - "duplicate_first_token_ids" => push_routing_hint( - &mut hints, - "index_readiness_failure", - "rename_records_before_indexing", - false, - ), - "unsafe_ids" | "long_headers" | "reserved_header_chars" => push_routing_hint( - &mut hints, - "header_compatibility_warning", - "review_headers_before_database_or_submission", - false, - ), + "duplicate_first_token_ids" => { + push_routing_hint( + &mut hints, + "index_readiness_failure", + "rename_records_before_indexing", + false, + ); + push_routing_hint( + &mut hints, + "submission_readiness_failure", + "fix_fasta_before_official_validation", + false, + ); + } + "unsafe_ids" | 
"long_headers" | "reserved_header_chars" => { + push_routing_hint( + &mut hints, + "header_compatibility_warning", + "review_headers_before_database_or_submission", + false, + ); + push_routing_hint( + &mut hints, + "submission_readiness_failure", + "fix_fasta_before_official_validation", + false, + ); + } "invalid_chars" | "invalid_fasta_structure" => push_routing_hint( &mut hints, "validity_failure", @@ -828,9 +844,41 @@ fn recommended_next_tools(status: VerdictStatus, findings: &[Finding]) -> Vec bool { + findings + .iter() + .any(|finding| ids.contains(&finding.id.as_str())) +} + fn push_tool(tools: &mut Vec, tool: &str, reason: &str) { if tools.iter().any(|existing| existing.tool == tool) { return; @@ -851,12 +899,16 @@ fn fasta_preflight_scope() -> Scope { "invalid sequence symbols".to_string(), "basic structural statistics".to_string(), "sequence composition red flags".to_string(), + "FASTA-level submission readiness".to_string(), ], cannot_conclude: vec![ "biological completeness".to_string(), "taxonomic contamination".to_string(), "whole-assembly accuracy".to_string(), "misassembly status without alignment evidence".to_string(), + "repository acceptance".to_string(), + "official validator acceptance".to_string(), + "annotation correctness".to_string(), ], } } diff --git a/tests/cli.rs b/tests/cli.rs index db40ff5..5f260ed 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1248,6 +1248,14 @@ fn submission_target_ncbi_is_serialized_when_requested() { let report = read_json(&outputs.json); assert_eq!(report["gate"]["submission_target"], json!("ncbi")); assert_eq!(report["provenance"]["submission_target"], json!("ncbi")); + assert!(array_contains_string( + &report["scope"]["can_conclude"], + "FASTA-level submission readiness" + )); + assert!(array_contains_string( + &report["scope"]["cannot_conclude"], + "repository acceptance" + )); } #[test] @@ -1300,6 +1308,43 @@ fn submission_gate_fails_identifier_hazards() { assert_eq!(submission_readiness["status"], 
json!("FAIL")); } +#[test] +fn submission_identifier_hazards_route_to_official_validators_without_claiming_results() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "submission_routes"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/submission_ids.fa", + "--gate", + "submission", + "--submission-target", + "ncbi", + "--json", + ]) + .arg(&outputs.json) + .arg("--out") + .arg(&outputs.html) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2); + + let report = read_json(&outputs.json); + assert_routing_hint( + &report, + "submission_readiness_failure", + "fix_fasta_before_official_validation", + false, + ); + assert!(array_contains_tool( + &report["machine_summary"]["recommended_next_tools"], + "official submission validator" + )); +} + #[test] fn unknown_submission_target_is_cli_error() { let mut cmd = Command::cargo_bin("fastaguard").unwrap(); diff --git a/tests/golden/compare_mixed_status.json b/tests/golden/compare_mixed_status.json index 2622b7a..c5e57e5 100644 --- a/tests/golden/compare_mixed_status.json +++ b/tests/golden/compare_mixed_status.json @@ -92,7 +92,8 @@ ], "readiness_blockers": [], "recommended_next_tools": [ - "seqkit" + "seqkit", + "official submission validator" ], "input_sha256": "373699c0422b364607fc6879c46c053c105413c81df9403998d3d26a7122d2e8" }, @@ -202,7 +203,9 @@ "recommended_next_tools": [ "seqkit", "QUAST", - "BlobToolKit" + "BlobToolKit", + "official submission validator", + "NCBI FCS" ], "input_sha256": "4b8551daeda739b62c8e7aaa1ebf300e4118167ca582c51eeccfa1549c96f0a5" } diff --git a/tests/golden/invalid_empty_record.json b/tests/golden/invalid_empty_record.json index a665011..7322085 100644 --- a/tests/golden/invalid_empty_record.json +++ b/tests/golden/invalid_empty_record.json @@ -108,13 +108,17 @@ "duplicate identifiers", "invalid sequence symbols", "basic structural statistics", - "sequence composition 
red flags" + "sequence composition red flags", + "FASTA-level submission readiness" ], "cannot_conclude": [ "biological completeness", "taxonomic contamination", "whole-assembly accuracy", - "misassembly status without alignment evidence" + "misassembly status without alignment evidence", + "repository acceptance", + "official validator acceptance", + "annotation correctness" ] }, "provenance": { diff --git a/tests/golden/problem_assembly.json b/tests/golden/problem_assembly.json index 1da560f..b2095c5 100644 --- a/tests/golden/problem_assembly.json +++ b/tests/golden/problem_assembly.json @@ -136,6 +136,14 @@ { "tool": "BlobToolKit", "reason": "Records with multiple FASTA-level anomaly signals should be prioritized for composition and coverage review." + }, + { + "tool": "official submission validator", + "reason": "Use the target repository validator after FASTA-level issues are fixed; FastaGuard is not an official validator." + }, + { + "tool": "NCBI FCS", + "reason": "Run database-backed contamination/adaptor screening when submission-oriented ambiguity or gap signals need follow-up." 
} ], "routing_hints": [ @@ -149,6 +157,11 @@ "suggested_route": "rename_records_before_indexing", "requires_external_database": false }, + { + "condition": "submission_readiness_failure", + "suggested_route": "fix_fasta_before_official_validation", + "requires_external_database": false + }, { "condition": "validity_failure", "suggested_route": "repair_fasta_before_downstream_qc", @@ -188,13 +201,17 @@ "duplicate identifiers", "invalid sequence symbols", "basic structural statistics", - "sequence composition red flags" + "sequence composition red flags", + "FASTA-level submission readiness" ], "cannot_conclude": [ "biological completeness", "taxonomic contamination", "whole-assembly accuracy", - "misassembly status without alignment evidence" + "misassembly status without alignment evidence", + "repository acceptance", + "official validator acceptance", + "annotation correctness" ] }, "provenance": { diff --git a/tests/golden/valid_assembly.json b/tests/golden/valid_assembly.json index 4cb3a36..1fde7f3 100644 --- a/tests/golden/valid_assembly.json +++ b/tests/golden/valid_assembly.json @@ -88,6 +88,10 @@ { "tool": "seqkit", "reason": "Terminal Ns can trigger submission warnings and may indicate records that need trimming or scaffold-boundary review." + }, + { + "tool": "official submission validator", + "reason": "Use the target repository validator after FASTA-level issues are fixed; FastaGuard is not an official validator." 
} ], "routing_hints": [ @@ -105,13 +109,17 @@ "duplicate identifiers", "invalid sequence symbols", "basic structural statistics", - "sequence composition red flags" + "sequence composition red flags", + "FASTA-level submission readiness" ], "cannot_conclude": [ "biological completeness", "taxonomic contamination", "whole-assembly accuracy", - "misassembly status without alignment evidence" + "misassembly status without alignment evidence", + "repository acceptance", + "official validator acceptance", + "annotation correctness" ] }, "provenance": { From 2a0e1f0905224af864a5ff1c9fa5cdf67bf67102 Mon Sep 17 00:00:00 2001 From: Ehsan ESTAJI <71376358+ehsanestaji@users.noreply.github.com> Date: Thu, 11 Jun 2026 11:07:36 +0200 Subject: [PATCH 07/13] feat: render submission readiness outputs --- src/report/html.rs | 81 +++++++++++++++++++ src/report/multiqc.rs | 55 +++++++++++++ src/report/tsv.rs | 179 ++++++++++++++++++++++++++++++++++++++++++ tests/cli.rs | 47 +++++++++++ 4 files changed, 362 insertions(+) diff --git a/src/report/html.rs b/src/report/html.rs index a2e0d80..82e8fc2 100644 --- a/src/report/html.rs +++ b/src/report/html.rs @@ -12,6 +12,7 @@ pub fn write(report: &FastaguardReport, path: &Path) -> Result<()> { fn render(report: &FastaguardReport) -> Result { let summary = &report.summary; let gate = render_gate(report); + let submission = render_submission_readiness(report); let readiness = render_readiness(report); let machine_summary = render_machine_summary(report); let scope = render_scope(report); @@ -69,6 +70,8 @@ pre {{ overflow-x: auto; background: #202124; color: #f7f7f4; padding: 16px; }}

Before QUAST. Before BUSCO. Before BlobToolKit. Run FastaGuard first.

Gate Decision

{gate} +

Submission Readiness

+{submission}

Readiness

{readiness} @@ -108,6 +111,7 @@ pre {{ overflow-x: auto; background: #202124; color: #f7f7f4; padding: 16px; }} gc_percent = summary.gc_percent, n_percent = summary.n_percent, gate = gate, + submission = submission, readiness = readiness, scope = scope, plots = plots, @@ -140,6 +144,42 @@ fn render_gate(report: &FastaguardReport) -> String { ) } +fn render_submission_readiness(report: &FastaguardReport) -> String { + let target = report + .gate + .submission_target + .map(crate::submission::SubmissionTarget::as_str) + .unwrap_or("."); + let category = report.readiness.category("submission"); + let status = category + .map(|category| readiness_status(category.status)) + .unwrap_or("PASS"); + let findings = category + .map(|category| render_string_list_or_none(&category.findings)) + .unwrap_or_else(|| "None".to_string()); + + format!( + r#"
+
+

Target

+

{target}

+
+
+

Status

+

{status}

+
+
+

Findings

+{findings} +
+
+

Official validators are still required. FastaGuard reports FASTA-level preflight risks only.

"#, + target = escape_html(target), + status = escape_html(status), + findings = findings, + ) +} + fn render_readiness(report: &FastaguardReport) -> String { let rows = report .readiness @@ -700,6 +740,47 @@ mod tests { assert!(output.contains("seqkit")); } + #[test] + fn renders_submission_readiness_section() { + let mut report = test_report(); + report.gate.mode = "submission".to_string(); + report.gate.submission_target = Some(crate::submission::SubmissionTarget::Ncbi); + report.gate.status = VerdictStatus::Fail; + report.gate.blocking_findings = vec!["unsafe_ids".to_string()]; + report.readiness = crate::readiness::build_readiness( + VerdictStatus::Fail, + &report.gate.blocking_findings, + &[Finding { + id: "unsafe_ids".to_string(), + category: FindingCategory::Validity, + severity: Severity::Major, + confidence: FindingConfidence::High, + requires_followup_tool: false, + profile: "assembly".to_string(), + affected_count: 1, + affected_fraction: 0.5, + message: "unsafe identifier".to_string(), + why_it_matters: "may fail submission".to_string(), + suggested_next_step: "rename identifiers".to_string(), + evidence: empty_evidence(), + actions: Vec::new(), + }], + crate::readiness::ReadinessScope::Single, + report.gate.submission_target, + ); + let file = NamedTempFile::new().unwrap(); + + write(&report, file.path()).unwrap(); + + let output = fs::read_to_string(file.path()).unwrap(); + assert!(output.contains("Submission Readiness")); + assert!(output.contains("

Target

")); + assert!(output.contains("

ncbi

")); + assert!(output.contains("

FAIL

")); + assert!(output.contains("unsafe_ids")); + assert!(output.contains("Official validators are still required")); + } + fn test_report() -> FastaguardReport { FastaguardReport { schema_version: "0.1.0".to_string(), diff --git a/src/report/multiqc.rs b/src/report/multiqc.rs index 1fe97e4..6b7ab0b 100644 --- a/src/report/multiqc.rs +++ b/src/report/multiqc.rs @@ -36,6 +36,12 @@ struct MultiqcSummaryRow { gate_blocking_findings: String, readiness_status: String, readiness_blockers: String, + submission_target: String, + submission_status: String, + unsafe_identifier_count: u64, + long_identifier_count: u64, + duplicate_first_token_id_count: u64, + gap_like_n_run_count: u64, sequence_count: u64, total_length: u64, n50: u64, @@ -97,6 +103,17 @@ fn summary_row(report: &FastaguardReport) -> MultiqcSummaryRow { gate_blocking_findings: report.gate.blocking_findings.join(","), readiness_status: readiness_status(report.readiness.overall.status).to_string(), readiness_blockers: report.readiness.overall.blockers.join(","), + submission_target: report + .gate + .submission_target + .map(crate::submission::SubmissionTarget::as_str) + .unwrap_or(".") + .to_string(), + submission_status: submission_status(report).to_string(), + unsafe_identifier_count: report.summary.unsafe_id_count, + long_identifier_count: report.summary.long_header_count, + duplicate_first_token_id_count: report.summary.duplicate_first_token_id_count, + gap_like_n_run_count: report.summary.repeated_gap_pattern_sequence_count, sequence_count: report.summary.sequence_count, total_length: report.summary.total_length, n50: report.summary.n50, @@ -119,6 +136,15 @@ fn summary_headers() -> BTreeMap<&'static str, MultiqcHeader> { [ ("readiness_status", "Readiness"), ("readiness_blockers", "Readiness blockers"), + ("submission_target", "Submission Target"), + ("submission_status", "Submission Status"), + ("unsafe_identifier_count", "Unsafe IDs"), + ("long_identifier_count", "Long Headers"), + ( + 
"duplicate_first_token_id_count", + "Duplicate First-Token IDs", + ), + ("gap_like_n_run_count", "Gap-Like N Runs"), ] .into_iter() .map(|(id, title)| (id, MultiqcHeader { title })) @@ -150,6 +176,14 @@ fn readiness_status(status: crate::readiness::ReadinessStatus) -> &'static str { } } +fn submission_status(report: &FastaguardReport) -> &'static str { + report + .readiness + .category("submission") + .map(|category| readiness_status(category.status)) + .unwrap_or("PASS") +} + #[cfg(test)] mod tests { use std::fs; @@ -183,12 +217,33 @@ mod tests { output["pconfig"]["headers"]["readiness_blockers"]["title"], "Readiness blockers" ); + assert_eq!( + output["pconfig"]["headers"]["submission_target"]["title"], + "Submission Target" + ); + assert_eq!( + output["pconfig"]["headers"]["submission_status"]["title"], + "Submission Status" + ); + assert_eq!( + output["pconfig"]["headers"]["unsafe_identifier_count"]["title"], + "Unsafe IDs" + ); assert_eq!(output["data"]["sample"]["verdict"], "PASS"); assert_eq!(output["data"]["sample"]["gate_mode"], "none"); assert_eq!(output["data"]["sample"]["gate_status"], "PASS"); assert_eq!(output["data"]["sample"]["gate_blocking_findings"], ""); assert_eq!(output["data"]["sample"]["readiness_status"], "PASS"); assert_eq!(output["data"]["sample"]["readiness_blockers"], ""); + assert_eq!(output["data"]["sample"]["submission_target"], "."); + assert_eq!(output["data"]["sample"]["submission_status"], "PASS"); + assert_eq!(output["data"]["sample"]["unsafe_identifier_count"], 0); + assert_eq!(output["data"]["sample"]["long_identifier_count"], 0); + assert_eq!( + output["data"]["sample"]["duplicate_first_token_id_count"], + 0 + ); + assert_eq!(output["data"]["sample"]["gap_like_n_run_count"], 0); assert_eq!(output["data"]["sample"]["sequence_count"], 2); assert_eq!(output["data"]["sample"]["duplicate_id_count"], 0); assert_eq!(output["data"]["sample"]["invalid_sequence_count"], 0); diff --git a/src/report/tsv.rs b/src/report/tsv.rs index 
7c7dac2..e1d8a22 100644 --- a/src/report/tsv.rs +++ b/src/report/tsv.rs @@ -51,6 +51,48 @@ pub fn write(report: &FastaguardReport, path: &Path) -> Result<()> { readiness_status(category.status), )?; } + write_metric( + &mut writer, + "submission_target", + report + .gate + .submission_target + .map(crate::submission::SubmissionTarget::as_str) + .unwrap_or("."), + )?; + write_metric(&mut writer, "submission_status", submission_status(report))?; + let (submission_blocking_findings, submission_advisory_findings) = + submission_finding_partitions(report); + write_metric( + &mut writer, + "submission_blocking_findings", + submission_blocking_findings.join(","), + )?; + write_metric( + &mut writer, + "submission_advisory_findings", + submission_advisory_findings.join(","), + )?; + write_metric( + &mut writer, + "unsafe_identifier_count", + report.summary.unsafe_id_count, + )?; + write_metric( + &mut writer, + "long_identifier_count", + report.summary.long_header_count, + )?; + write_metric( + &mut writer, + "submission_duplicate_first_token_id_count", + report.summary.duplicate_first_token_id_count, + )?; + write_metric( + &mut writer, + "gap_like_n_run_count", + report.summary.repeated_gap_pattern_sequence_count, + )?; write_metric(&mut writer, "input_sha256", &report.provenance.input_sha256)?; write_metric(&mut writer, "sequence_count", report.summary.sequence_count)?; @@ -193,6 +235,42 @@ fn readiness_status(status: crate::readiness::ReadinessStatus) -> &'static str { } } +fn submission_status(report: &FastaguardReport) -> &'static str { + report + .readiness + .category("submission") + .map(|category| readiness_status(category.status)) + .unwrap_or("PASS") +} + +fn submission_finding_partitions(report: &FastaguardReport) -> (Vec, Vec) { + let Some(category) = report.readiness.category("submission") else { + return (Vec::new(), Vec::new()); + }; + let submission_blockers: Vec<&str> = report + .readiness + .overall + .blockers + .iter() + .filter_map(|blocker| 
blocker.strip_prefix("submission.")) + .collect(); + + let mut blocking = Vec::new(); + let mut advisory = Vec::new(); + for finding_id in &category.findings { + if submission_blockers + .iter() + .any(|blocker| *blocker == finding_id.as_str()) + { + blocking.push(finding_id.clone()); + } else { + advisory.push(finding_id.clone()); + } + } + + (blocking, advisory) +} + #[cfg(test)] mod tests { use std::fs; @@ -315,6 +393,107 @@ mod tests { assert!(output.contains("ungapped_total_length\t94\n"), "{output}"); } + #[test] + fn writes_submission_output_rows() { + let mut report = test_report(VerdictStatus::Fail); + report.gate.mode = "submission".to_string(); + report.gate.submission_target = Some(crate::submission::SubmissionTarget::Ncbi); + report.gate.status = VerdictStatus::Fail; + report.gate.blocking_findings = vec!["unsafe_ids".to_string()]; + report.gate.advisory_findings = vec!["tiny_contigs".to_string()]; + report.summary.unsafe_id_count = 2; + report.summary.long_header_count = 3; + report.summary.duplicate_first_token_id_count = 4; + report.summary.repeated_gap_pattern_sequence_count = 5; + report.readiness = crate::readiness::build_readiness( + VerdictStatus::Fail, + &report.gate.blocking_findings, + &[ + test_finding("unsafe_ids", 2), + test_finding("tiny_contigs", 1), + ], + crate::readiness::ReadinessScope::Single, + report.gate.submission_target, + ); + let file = NamedTempFile::new().unwrap(); + + write(&report, file.path()).unwrap(); + + let output = fs::read_to_string(file.path()).unwrap(); + assert!(output.contains("submission_target\tncbi\n"), "{output}"); + assert!(output.contains("submission_status\tFAIL\n"), "{output}"); + assert!( + output.contains("submission_blocking_findings\tunsafe_ids\n"), + "{output}" + ); + assert!( + output.contains("submission_advisory_findings\ttiny_contigs\n"), + "{output}" + ); + assert!(output.contains("unsafe_identifier_count\t2\n"), "{output}"); + assert!(output.contains("long_identifier_count\t3\n"), 
"{output}"); + assert!( + output.contains("submission_duplicate_first_token_id_count\t4\n"), + "{output}" + ); + assert!(output.contains("gap_like_n_run_count\t5\n"), "{output}"); + } + + #[test] + fn submission_findings_are_partitioned_from_readiness_category() { + let mut report = test_report(VerdictStatus::Fail); + report.gate.mode = "pipeline".to_string(); + report.gate.status = VerdictStatus::Fail; + report.gate.blocking_findings = vec!["invalid_chars".to_string()]; + report.gate.advisory_findings = vec!["terminal_ns".to_string()]; + report.readiness = crate::readiness::build_readiness( + VerdictStatus::Warn, + &[], + &[test_finding("terminal_ns", 1)], + crate::readiness::ReadinessScope::Single, + None, + ); + let file = NamedTempFile::new().unwrap(); + + write(&report, file.path()).unwrap(); + + let output = fs::read_to_string(file.path()).unwrap(); + assert!( + !output.contains("submission_blocking_findings\tinvalid_chars\n"), + "{output}" + ); + assert!( + output.contains("submission_blocking_findings\t.\n"), + "{output}" + ); + assert!( + output.contains("submission_advisory_findings\tterminal_ns\n"), + "{output}" + ); + } + + #[test] + fn submission_findings_are_empty_without_submission_category() { + let mut report = test_report(VerdictStatus::Fail); + report + .readiness + .categories + .retain(|category| category.id != "submission"); + let file = NamedTempFile::new().unwrap(); + + write(&report, file.path()).unwrap(); + + let output = fs::read_to_string(file.path()).unwrap(); + assert!( + output.contains("submission_blocking_findings\t.\n"), + "{output}" + ); + assert!( + output.contains("submission_advisory_findings\t.\n"), + "{output}" + ); + } + #[test] fn writes_empty_metric_values_as_explicit_marker_without_trailing_whitespace() { let report = test_report(VerdictStatus::Pass); diff --git a/tests/cli.rs b/tests/cli.rs index 5f260ed..fd249d2 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1345,6 +1345,53 @@ fn 
submission_identifier_hazards_route_to_official_validators_without_claiming_r )); } +#[test] +fn submission_gate_outputs_tsv_multiqc_and_html_fields() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "submission_outputs"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/submission_ids.fa", + "--gate", + "submission", + "--submission-target", + "ncbi", + "--json", + ]) + .arg(&outputs.json) + .arg("--out") + .arg(&outputs.html) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2); + + let tsv = std::fs::read_to_string(&outputs.tsv).unwrap(); + assert!(tsv.contains("submission_target\tncbi\n"), "{tsv}"); + assert!(tsv.contains("submission_status\tFAIL\n"), "{tsv}"); + assert!(tsv.contains("unsafe_identifier_count\t"), "{tsv}"); + + let multiqc = read_json(&outputs.multiqc); + assert_eq!( + multiqc["data"]["submission_ids"]["submission_target"], + json!("ncbi") + ); + assert_eq!( + multiqc["data"]["submission_ids"]["submission_status"], + json!("FAIL") + ); + + let html = std::fs::read_to_string(&outputs.html).unwrap(); + assert!(html.contains("Submission Readiness"), "{html}"); + assert!( + html.contains("Official validators are still required"), + "{html}" + ); +} + #[test] fn unknown_submission_target_is_cli_error() { let mut cmd = Command::cargo_bin("fastaguard").unwrap(); From 85c483875ec8974dcc3a9c05dfac8733c48c9e53 Mon Sep 17 00:00:00 2001 From: Ehsan ESTAJI <71376358+ehsanestaji@users.noreply.github.com> Date: Thu, 11 Jun 2026 11:29:49 +0200 Subject: [PATCH 08/13] feat: aggregate submission readiness in compare mode --- src/compare.rs | 38 ++++++++++++++ src/models.rs | 5 ++ src/report/compare_html.rs | 34 +++++++++++-- src/report/compare_multiqc.rs | 52 ++++++++++++++++++- src/report/compare_tsv.rs | 33 +++++++++--- tests/cli.rs | 95 +++++++++++++++++++++++++++++++++++ 6 files changed, 245 insertions(+), 12 deletions(-) diff --git 
a/src/compare.rs b/src/compare.rs index 69f6e15..7de89f0 100644 --- a/src/compare.rs +++ b/src/compare.rs @@ -73,12 +73,24 @@ fn run_one_sample(config: &CompareConfig, input: &Path) -> Result CompareSample { + let submission_status = report + .readiness + .category("submission") + .map(|category| category.status) + .unwrap_or(crate::readiness::ReadinessStatus::Pass); + CompareSample { sample_id: sample_id(input), input_path: report.input.path.clone(), verdict: report.verdict.status, gate_status: report.gate.status, readiness_status: report.readiness.overall.status, + submission_target: report + .gate + .submission_target + .map(crate::submission::SubmissionTarget::as_str) + .map(ToOwned::to_owned), + submission_status, readiness_categories: report.readiness.categories.clone(), sequence_count: report.summary.sequence_count, total_length: report.summary.total_length, @@ -116,6 +128,18 @@ fn compare_summary(samples: &[CompareSample]) -> CompareSummary { pass_count: count_status(samples, VerdictStatus::Pass), warn_count: count_status(samples, VerdictStatus::Warn), fail_count: count_status(samples, VerdictStatus::Fail), + submission_ready_count: count_readiness_status( + samples, + crate::readiness::ReadinessStatus::Pass, + ), + submission_warn_count: count_readiness_status( + samples, + crate::readiness::ReadinessStatus::Warn, + ), + submission_fail_count: count_readiness_status( + samples, + crate::readiness::ReadinessStatus::Fail, + ), } } @@ -186,6 +210,18 @@ fn count_status(samples: &[CompareSample], status: VerdictStatus) -> u64 { ) } +fn count_readiness_status( + samples: &[CompareSample], + status: crate::readiness::ReadinessStatus, +) -> u64 { + usize_to_u64( + samples + .iter() + .filter(|sample| sample.submission_status == status) + .count(), + ) +} + fn affected_record_count(report: &FastaguardReport, finding_id: &str) -> u64 { report .findings @@ -307,6 +343,8 @@ mod tests { verdict: VerdictStatus::Pass, gate_status: VerdictStatus::Pass, readiness_status: 
crate::readiness::ReadinessStatus::Pass, + submission_target: None, + submission_status: crate::readiness::ReadinessStatus::Pass, readiness_categories: crate::readiness::build_readiness( VerdictStatus::Pass, &[], diff --git a/src/models.rs b/src/models.rs index c62c6d0..f4f0546 100644 --- a/src/models.rs +++ b/src/models.rs @@ -62,6 +62,9 @@ pub struct CompareSummary { pub pass_count: u64, pub warn_count: u64, pub fail_count: u64, + pub submission_ready_count: u64, + pub submission_warn_count: u64, + pub submission_fail_count: u64, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -71,6 +74,8 @@ pub struct CompareSample { pub verdict: VerdictStatus, pub gate_status: VerdictStatus, pub readiness_status: crate::readiness::ReadinessStatus, + pub submission_target: Option, + pub submission_status: crate::readiness::ReadinessStatus, pub readiness_categories: Vec, pub sequence_count: u64, pub total_length: u64, diff --git a/src/report/compare_html.rs b/src/report/compare_html.rs index 1922545..21ce728 100644 --- a/src/report/compare_html.rs +++ b/src/report/compare_html.rs @@ -73,6 +73,9 @@ pre {{ overflow-x: auto; background: #202124; color: #f7f7f4; padding: 16px; }}

PASS

{pass_count}

WARN

{warn_count}

FAIL

{fail_count}

+

Submission ready

{submission_ready_count}

+

Submission warn

{submission_warn_count}

+

Submission fail

{submission_fail_count}

Readiness Matrix

{readiness_matrix} @@ -92,6 +95,9 @@ pre {{ overflow-x: auto; background: #202124; color: #f7f7f4; padding: 16px; }} pass_count = report.summary.pass_count, warn_count = report.summary.warn_count, fail_count = report.summary.fail_count, + submission_ready_count = report.summary.submission_ready_count, + submission_warn_count = report.summary.submission_warn_count, + submission_fail_count = report.summary.submission_fail_count, readiness_matrix = readiness_matrix, charts = charts, cohort_findings = cohort_findings, @@ -107,7 +113,9 @@ fn render_readiness_matrix(report: &CompareReport) -> String { .map(|sample| { let verdict = verdict_status(sample.verdict); let gate_status = verdict_status(sample.gate_status); - let readiness_status = readiness_status(sample.readiness_status); + let overall_readiness_status = readiness_status(sample.readiness_status); + let submission_target = sample.submission_target.as_deref().unwrap_or("."); + let submission_status = readiness_status(sample.submission_status); let category_cells = render_readiness_category_cells(sample); format!( r#" @@ -116,6 +124,8 @@ fn render_readiness_matrix(report: &CompareReport) -> String { {verdict} {gate_status} {readiness_status} +{submission_target} +{submission_status} {category_cells} {sequence_count} {total_length} @@ -132,8 +142,11 @@ fn render_readiness_matrix(report: &CompareReport) -> String { verdict_class = verdict.to_ascii_lowercase(), gate_status = gate_status, gate_class = gate_status.to_ascii_lowercase(), - readiness_status = readiness_status, - readiness_class = readiness_status.to_ascii_lowercase(), + readiness_status = overall_readiness_status, + readiness_class = overall_readiness_status.to_ascii_lowercase(), + submission_target = escape_html(submission_target), + submission_status = submission_status, + submission_class = submission_status.to_ascii_lowercase(), category_cells = category_cells, sequence_count = sample.sequence_count, total_length = sample.total_length, @@ -156,7 
+169,7 @@ fn render_readiness_matrix(report: &CompareReport) -> String { format!( r#"
-{category_headers} +{category_headers}{rows}
SampleInputVerdictGateReadinessSequencesTotal lengthN50N90GC%N%FindingsBlockers
SampleInputVerdictGateReadinessSubmission targetSubmission statusSequencesTotal lengthN50N90GC%N%FindingsBlockers
"# @@ -394,6 +407,12 @@ mod tests { assert!(output.contains("File readiness"), "{output}"); assert!(output.contains("Index readiness"), "{output}"); assert!(output.contains("Machine readiness"), "{output}"); + assert!(output.contains("Submission status"), "{output}"); + assert!(output.contains("Submission warn"), "{output}"); + assert!( + output.contains(r#"WARN"#), + "{output}" + ); assert!(output.contains("Cohort Findings"), "{output}"); assert!(output.contains("Suggested Next Tools"), "{output}"); assert!(output.matches("= 5, "{output}"); @@ -404,12 +423,14 @@ mod tests { let mut report = test_report(); report.samples[0].sample_id = "sample_