Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -423,3 +423,22 @@
-
url: https://raw.githubusercontent.com/biolink/kgx/master/docs/kgx_format.md
local_name: kgx-format.md


#
# **** Selective stub-import ontologies (NCIT, MESH) ****
#
# KG-Microbe does NOT load the full NCIT or MESH ontologies — those belong to
# kg-microbe-biomedical. But the chemical-mapping consolidator and BacDive
# isolation-source mapper reference ~150 NCIT/MESH IDs as canonical xrefs for
# ingredients (e.g. NCIT:C29298 'Oatmeal', mesh:D011136 'Tween'). The
# OntologiesStubsTransform queries these SemSQL DBs to harvest just the
# referenced IDs (label + synonyms + xrefs), emitting one labelled stub node
# each. The DBs themselves are never loaded into the merged KG.
#
-
url: https://s3.amazonaws.com/bbop-sqlite/ncit.db.gz
local_name: ncit.db.gz
-
url: https://s3.amazonaws.com/bbop-sqlite/mesh.db.gz
local_name: mesh.db.gz
8 changes: 8 additions & 0 deletions kg_microbe/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
METATRAITS,
METATRAITS_GTDB,
ONTOLOGIES,
ONTOLOGIES_STUBS,
RHEAMAPPINGS,
)
from kg_microbe.transform_utils.gtdb.gtdb import GTDBTransform
Expand All @@ -32,6 +33,9 @@
ONTOLOGIES_MAP,
OntologiesTransform,
)
from kg_microbe.transform_utils.ontologies_stubs.ontologies_stubs_transform import (
OntologiesStubsTransform,
)
from kg_microbe.transform_utils.rhea_mappings.rhea_mappings import RheaMappingsTransform

DATA_SOURCES = {
Expand All @@ -44,6 +48,10 @@
# "ProteinAtlasTransform": ProteinAtlasTransform,
# "STRINGTransform": STRINGTransform,
ONTOLOGIES: OntologiesTransform,
# Run ontologies_stubs after ontologies so the SemSQL DBs are present and
# so the stub-node TSVs land in data/transformed/ontologies_stubs/ before
# the merge step picks them up.
ONTOLOGIES_STUBS: OntologiesStubsTransform,
BACDIVE: BacDiveTransform,
BAKTA: BaktaTransform,
COG: COGTransform,
Expand Down
16 changes: 15 additions & 1 deletion kg_microbe/transform_utils/bacdive/bacdive.py
Original file line number Diff line number Diff line change
Expand Up @@ -2987,8 +2987,22 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu
# emit a thin node row here instead of pulling in the full
# ontology. Loaded-ontology targets (UBERON, ENVO, ...) get
# their canonical node from the ontologies transform.
#
# NCIT and mesh stub nodes are NOT emitted here — the
# OntologiesStubsTransform (kg_microbe/transform_utils/
# ontologies_stubs/) writes label+synonym+xref-enriched
# stubs from the SemSQL DBs, which is strictly richer
# than the label-only fallback below. Emitting both
# here and there would produce duplicate node rows
# that the merge would have to dedupe. The PRIDE/PCO/
# GENEPIO/FAO/BTO/SNOMED prefixes stay on the inline
# path because each has 1-3 IDs in the whole repo —
# not worth a SemSQL fetch.
stub_prefix = subject_id.split(":", 1)[0] if ":" in subject_id else ""
if stub_prefix in STUB_ONTOLOGY_PREFIXES:
if stub_prefix in STUB_ONTOLOGY_PREFIXES and stub_prefix not in {
"NCIT",
"mesh",
}:
node_writer.writerow(
self._create_node_row(
subject_id,
Expand Down
1 change: 1 addition & 0 deletions kg_microbe/transform_utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
KEGG = "kegg"
RHEAMAPPINGS = "rhea_mappings"
ONTOLOGIES = "ontologies"
ONTOLOGIES_STUBS = "ontologies_stubs"
WALLEN_ETAL = "wallen_etal"
CTD = "ctd"
DISBIOME = "disbiome"
Expand Down
7 changes: 7 additions & 0 deletions kg_microbe/transform_utils/ontologies_stubs/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Ontologies-stubs transform package."""

from kg_microbe.transform_utils.ontologies_stubs.ontologies_stubs_transform import (
OntologiesStubsTransform,
)

__all__ = ["OntologiesStubsTransform"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
"""
Ontologies-stubs transform.

KG-Microbe deliberately does NOT load the full NCIT or MESH ontologies — those
belong to the sibling ``kg-microbe-biomedical`` pipeline. But the
chemical-mapping consolidator and the BacDive isolation-source mapper reference
~150 NCIT and MESH IDs as canonical xrefs for ingredients (e.g.
``NCIT:C29298 'Oatmeal'``, ``mesh:D011136 'Tween'``). Without this transform
those CURIEs would appear as dangling node ids in the merged KG: edges point at
them but no node row carries the label.

This transform:

1. Calls :func:`~kg_microbe.utils.stub_curie_collection.collect_stub_curies` to
discover every NCIT and MESH CURIE referenced anywhere under ``mappings/``.
2. For each CURIE, queries the local SemSQL DB (``data/raw/ncit.db``,
``data/raw/mesh.db``) via OAK to fetch its ``rdfs:label``, exact synonyms,
and dbxrefs. The same pattern is used by the chemical-mapping consolidator
for ChEBI in ``scripts/consolidate_chemical_mappings.py``.
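3. Writes one KGX node TSV per stub ontology to
``data/transformed/ontologies_stubs/{ncit,mesh}_nodes.tsv`` using the
nine-column node header ``id, category, name, description, xref,
provided_by, synonym, deprecated, same_as`` (``description``,
``deprecated`` and ``same_as`` are left empty). The per-ontology
``knowledge_source`` string is reported in the progress message only; it
is not written as a column.
No edges file — stubs are isolated nodes; edges arrive from the source
transforms (BacDive, MediaDive ingredients via the chemical-mapping path,
etc.).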

Note for downstream consumers: if a KG built with this transform is ever
merged with a kg-microbe-biomedical KG that loads NCIT/MESH fully, biolink
merge semantics will union nodes — the stub node here is a strict subset of
what the full ontology would emit (label/synonym/xref only; no edges, no
deprecated flag, no parent classes), so the union will simply pick the
fuller record.
"""

from __future__ import annotations

import csv
import gzip
import shutil
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set

from kg_microbe.transform_utils.constants import (
CATEGORY_COLUMN,
DEPRECATED_COLUMN,
DESCRIPTION_COLUMN,
ID_COLUMN,
NAME_COLUMN,
PROVIDED_BY_COLUMN,
SAME_AS_COLUMN,
SYNONYM_COLUMN,
XREF_COLUMN,
)
from kg_microbe.transform_utils.transform import Transform
from kg_microbe.utils.isolation_source_mapping_utils import STUB_ONTOLOGY_CATEGORY
from kg_microbe.utils.stub_curie_collection import collect_stub_curies

# Stub ontologies handled by this transform. Each entry maps the canonical
# CURIE prefix (case-sensitive — must match how the prefix appears in
# existing mapping rows) to the local SemSQL DB and the InforES knowledge
# source string.
#
# Keys of each entry:
#   "db_filename"      - file name of the SemSQL DB; the transform joins it
#                        onto its input directory to locate the DB.
#   "knowledge_source" - infores identifier for the ontology; the transform
#                        currently only echoes it in its progress message.
#
# The prefixes differ in case ("NCIT" vs "mesh"); output file names are built
# from the lower-cased prefix.
STUB_ONTOLOGY_SOURCES: Dict[str, Dict[str, str]] = {
"NCIT": {
"db_filename": "ncit.db",
"knowledge_source": "infores:ncit",
},
"mesh": {
"db_filename": "mesh.db",
"knowledge_source": "infores:mesh",
},
}

# Source name handed to the Transform base class; also written into the
# provided_by column of every stub row.
ONTOLOGIES_STUBS_SOURCE_NAME = "ontologies_stubs"


class OntologiesStubsTransform(Transform):

    """Emit one labelled stub node per referenced NCIT / MESH CURIE."""

    def __init__(
        self,
        input_dir: Optional[Path] = None,
        output_dir: Optional[Path] = None,
    ):
        """
        Instantiate transform.

        :param input_dir: Where the SemSQL DBs live (defaults to ``data/raw/``).
        :param output_dir: Where ``ontologies_stubs/{ncit,mesh}_nodes.tsv`` are
            written (defaults to ``data/transformed/``).
        """
        super().__init__(ONTOLOGIES_STUBS_SOURCE_NAME, input_dir, output_dir)

    def run(self, data_file=None) -> None:  # noqa: D401 — base class signature
        """
        Collect stub CURIEs, fetch metadata via OAK, write per-ontology node TSVs.

        One node file is written for every prefix configured in
        ``STUB_ONTOLOGY_SOURCES``, including prefixes for which no CURIE was
        collected (those get a header-only file).

        :param data_file: Unused (kept for the base-class signature). The
            transform discovers its inputs from the mapping TSVs and the
            SemSQL DBs in ``input_base_dir``.
        :raises FileNotFoundError: If CURIEs were collected for a prefix but
            its SemSQL DB (or the gzipped form) is not present.
        """
        prefixes = list(STUB_ONTOLOGY_SOURCES.keys())
        curies_by_prefix = collect_stub_curies(prefixes)

        # Drive the loop from the configured prefixes, not from the collector's
        # keys: if the collector omits a prefix that has no references, the
        # header-only placeholder file must still be produced for the merge.
        for prefix, cfg in STUB_ONTOLOGY_SOURCES.items():
            curies = curies_by_prefix.get(prefix) or ()
            db_path = self.input_base_dir / cfg["db_filename"]
            output_file = self.output_dir / f"{prefix.lower()}_nodes.tsv"
            self._write_stub_nodes(
                prefix=prefix,
                curies=sorted(curies),
                db_path=db_path,
                knowledge_source=cfg["knowledge_source"],
                output_file=output_file,
            )

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------

    def _write_stub_nodes(
        self,
        prefix: str,
        curies: List[str],
        db_path: Path,
        knowledge_source: str,
        output_file: Path,
    ) -> None:
        """
        Fetch label/synonyms/xrefs per CURIE and write the node TSV.

        :param prefix: CURIE prefix being processed (used in messages only).
        :param curies: CURIEs to emit, one row each, in the given order.
        :param db_path: Expected location of the unzipped SemSQL DB.
        :param knowledge_source: Reported in the progress message; it is not
            written into the node rows.
        :param output_file: Destination node TSV.
        :raises FileNotFoundError: If ``curies`` is non-empty and no adapter
            could be opened for ``db_path``.
        """
        if not curies:
            print(f" [{prefix}] no CURIEs to import; skipping {output_file.name}")
            # Write an empty file with header so the merge step doesn't fail
            # on a missing file declared in merge.yaml.
            self._write_node_file(output_file, [])
            return

        adapter = self._open_adapter(prefix, db_path)
        if adapter is None:
            raise FileNotFoundError(
                f"OAK adapter for {prefix} could not be opened (expected SemSQL DB at "
                f"{db_path}). Run `poetry run kg download` to fetch it. The stub "
                f"transform refuses to silently emit unlabelled nodes — that would "
                f"reintroduce the dangling-xref hazard this transform exists to fix."
            )

        rows: List[List[Optional[str]]] = []
        missing: List[str] = []
        for curie in curies:
            label, synonyms, xrefs = self._fetch_metadata(adapter, curie)
            if not label:
                # Last-resort fallback: use the CURIE as the name. Log it so
                # curators can chase down obsolete or missing entries upstream.
                missing.append(curie)
                label = curie
            # Cell order must match the header built in _write_node_file.
            rows.append(
                [
                    curie,  # id
                    STUB_ONTOLOGY_CATEGORY,  # category
                    label,  # name
                    None,  # description
                    _join_pipe(xrefs),  # xref
                    ONTOLOGIES_STUBS_SOURCE_NAME,  # provided_by
                    _join_pipe(synonyms),  # synonym
                    None,  # deprecated
                    None,  # same_as
                ]
            )

        self._write_node_file(output_file, rows)
        print(
            f" [{prefix}] wrote {len(rows)} stub nodes to {output_file.name} "
            f"(knowledge_source={knowledge_source}, missing labels: {len(missing)})"
        )
        if missing:
            print(f" [{prefix}] CURIEs with no SemSQL label (used CURIE as name): {missing}")

    def _open_adapter(self, prefix: str, db_path: Path):
        """
        Open an OAK SemSQL adapter against the local DB.

        If the unzipped ``.db`` is missing but a sibling ``.db.gz`` is present,
        it is decompressed first. Decompression goes to a temporary ``.part``
        sibling that is renamed onto ``db_path`` only once complete, so an
        interrupted run cannot leave a truncated ``.db`` that a later run would
        mistake for a valid database.

        :param prefix: CURIE prefix, used in messages only.
        :param db_path: Expected location of the unzipped SemSQL DB.
        :return: The adapter, or ``None`` when neither ``db_path`` nor its
            ``.gz`` sibling exists.
        :raises RuntimeError: If ``oaklib`` cannot be imported.
        """
        if not db_path.is_file():
            gz_path = db_path.with_suffix(db_path.suffix + ".gz")
            if not gz_path.is_file():
                return None
            print(f" [{prefix}] decompressing {gz_path.name} → {db_path.name}")
            tmp_path = db_path.with_suffix(db_path.suffix + ".part")
            try:
                with gzip.open(gz_path, "rb") as src, tmp_path.open("wb") as dst:
                    shutil.copyfileobj(src, dst)
                # Same-directory rename: the finished DB appears in one step.
                tmp_path.replace(db_path)
            except BaseException:
                # Also covers KeyboardInterrupt: never leave a partial file behind.
                if tmp_path.exists():
                    tmp_path.unlink()
                raise
        try:
            from oaklib import get_adapter
        except ImportError as exc:  # pragma: no cover — oaklib is a dep
            raise RuntimeError(
                f"oaklib import failed while opening SemSQL adapter for {prefix}: {exc}"
            ) from exc
        return get_adapter(f"sqlite:{db_path}")

    def _fetch_metadata(self, adapter, curie: str) -> tuple[str, List[str], List[str]]:
        """
        Return ``(label, synonyms, xrefs)`` for ``curie`` via the OAK adapter.

        Each lookup is best-effort: if an adapter call raises, that field is
        left empty rather than aborting the whole transform.

        :param adapter: Adapter returned by :meth:`_open_adapter`.
        :param curie: CURIE to look up.
        :return: The label (``""`` when absent), then the synonyms and the
            xrefs, each as a sorted list of distinct non-empty strings. The
            label itself is never included among the synonyms.
        """
        label = ""
        synonyms: Set[str] = set()
        xrefs: Set[str] = set()
        try:
            label = adapter.label(curie) or ""
        except Exception:  # noqa: S110 — obsolete CURIEs are expected to miss
            pass
        try:
            synonyms = {s for s in adapter.entity_aliases(curie) if s}
        except Exception:  # noqa: S110
            pass
        # Drop the canonical label out of the synonym set to keep them disjoint.
        synonyms.discard(label)
        try:
            metadata = adapter.entity_metadata_map(curie) or {}
        except Exception:  # noqa: S110
            metadata = {}
        # OAK returns metadata keyed by short-form predicate. dbxref entries
        # land under "oio:hasDbXref" (or "oboInOwl:hasDbXref" on older
        # adapters). Accept both.
        for predicate_key in ("oio:hasDbXref", "oboInOwl:hasDbXref"):
            for value in metadata.get(predicate_key, []) or []:
                if value:
                    xrefs.add(str(value))
        return label, sorted(synonyms), sorted(xrefs)

    def _write_node_file(self, path: Path, rows: Iterable[Iterable[Optional[str]]]) -> None:
        """
        Write ``rows`` to ``path`` as a tab-separated node file with a header.

        Parent directories are created as needed and an existing file is
        overwritten. ``None`` cells are written as empty strings.

        :param path: Destination TSV.
        :param rows: Rows whose cells follow the header order used below.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        # Nine-column node header; row cells must follow this exact order.
        header = [
            ID_COLUMN,
            CATEGORY_COLUMN,
            NAME_COLUMN,
            DESCRIPTION_COLUMN,
            XREF_COLUMN,
            PROVIDED_BY_COLUMN,
            SYNONYM_COLUMN,
            DEPRECATED_COLUMN,
            SAME_AS_COLUMN,
        ]
        with path.open("w", newline="", encoding="utf-8") as fh:
            writer = csv.writer(fh, delimiter="\t", lineterminator="\n")
            writer.writerow(header)
            for row in rows:
                writer.writerow(["" if cell is None else cell for cell in row])


def _join_pipe(values: Iterable[str]) -> str:
"""Pipe-join a sequence; return ``""`` when empty (matches existing TSV convention)."""
items = [v for v in values if v]
return "|".join(items) if items else ""
30 changes: 24 additions & 6 deletions kg_microbe/utils/isolation_source_mapping_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,30 @@
# but that are NOT loaded by the ontologies transform (see ONTOLOGIES_MAP in
# kg_microbe/transform_utils/ontologies/ontologies_transform.py). Each prefix
# either has only a tiny number of distinct IDs in use, or its full load is
# impractical (mesh and NCIT are huge clinical thesauri), so the BacDive
# transform writes a thin node row per resolved CURIE using the object_label
# from the mapping TSV. The category is biolink:OntologyClass for all stubs
# because they're typically categorical terms (host body site, microbial
# community, abscess, etc.) rather than specific anatomy / environmental
# features whose canonical metadata would come from a loaded ontology.
# impractical (mesh and NCIT are huge clinical thesauri).
#
# Two stub-import paths exist for these prefixes:
#
# 1. NCIT and mesh: a SemSQL-backed enriched stub source. The
# OntologiesStubsTransform (kg_microbe/transform_utils/ontologies_stubs/)
# queries data/raw/ncit.db and data/raw/mesh.db via OAK to fetch
# rdfs:label, exact synonyms, and dbxrefs for every NCIT/mesh CURIE that
# appears anywhere under mappings/. Output:
# data/transformed/ontologies_stubs/{ncit,mesh}_nodes.tsv. This is the
# preferred path — stubs carry full metadata, not just a label. The
# BacDive inline emit at bacdive.py defers to this transform for these
# two prefixes (see the `not in {"NCIT", "mesh"}` branch there).
#
# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, BTO, SNOMED): each
# has 1-3 IDs in the whole repo, so the BacDive transform writes a thin
# label-only node row inline at edge-emit time using the object_label
# from the mapping TSV. Setting up SemSQL DBs for these would be
# overkill.
#
# The category is biolink:OntologyClass for all stubs because they're
# typically categorical terms (host body site, microbial community,
# abscess, etc.) rather than specific anatomy / environmental features
# whose canonical metadata would come from a loaded ontology.
#
# Codex adversarial review #558 found that without stubs for these prefixes
# the BacDive transform was emitting edges to dangling node IDs because the
Expand Down
Loading
Loading