Knowledge-Graph-Hub · realmarcin · May 20, 2026 · May 12, 2026
diff --git a/download.yaml b/download.yaml
@@ -423,3 +423,22 @@
 -
   url: https://raw.githubusercontent.com/biolink/kgx/master/docs/kgx_format.md
   local_name: kgx-format.md
+
+
+#
+# **** Selective stub-import ontologies (NCIT, MESH) ****
+#
+# KG-Microbe does NOT load the full NCIT or MESH ontologies — those belong to
+# kg-microbe-biomedical. But the chemical-mapping consolidator and BacDive
+# isolation-source mapper reference ~150 NCIT/MESH IDs as canonical xrefs for
+# ingredients (e.g. NCIT:C29298 'Oatmeal', mesh:D011136 'Tween'). The
+# OntologiesStubsTransform queries these SemSQL DBs to harvest just the
+# referenced IDs (label + synonyms + xrefs), emitting one labelled stub node
+# each. The DBs themselves are never loaded into the merged KG.
+#
+-
+  url: https://s3.amazonaws.com/bbop-sqlite/ncit.db.gz
+  local_name: ncit.db.gz
+-
+  url: https://s3.amazonaws.com/bbop-sqlite/mesh.db.gz
+  local_name: mesh.db.gz
diff --git a/kg_microbe/transform.py b/kg_microbe/transform.py
@@ -20,6 +20,7 @@
     METATRAITS,
     METATRAITS_GTDB,
     ONTOLOGIES,
+    ONTOLOGIES_STUBS,
     RHEAMAPPINGS,
 )
 from kg_microbe.transform_utils.gtdb.gtdb import GTDBTransform
@@ -32,6 +33,9 @@
     ONTOLOGIES_MAP,
     OntologiesTransform,
 )
+from kg_microbe.transform_utils.ontologies_stubs.ontologies_stubs_transform import (
+    OntologiesStubsTransform,
+)
 from kg_microbe.transform_utils.rhea_mappings.rhea_mappings import RheaMappingsTransform
 
 DATA_SOURCES = {
@@ -44,6 +48,10 @@
     # "ProteinAtlasTransform": ProteinAtlasTransform,
     # "STRINGTransform": STRINGTransform,
     ONTOLOGIES: OntologiesTransform,
+    # ontologies_stubs sits next to ontologies. Its inputs (ncit.db.gz, mesh.db.gz)
+    # come from the download step; its stub-node TSVs land in
+    # data/transformed/ontologies_stubs/ before the merge step picks them up.
+    ONTOLOGIES_STUBS: OntologiesStubsTransform,
     BACDIVE: BacDiveTransform,
     BAKTA: BaktaTransform,
     COG: COGTransform,

diff --git a/kg_microbe/transform_utils/bacdive/bacdive.py b/kg_microbe/transform_utils/bacdive/bacdive.py
@@ -2987,8 +2987,22 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu
                             # emit a thin node row here instead of pulling in the full
                             # ontology. Loaded-ontology targets (UBERON, ENVO, ...) get
                             # their canonical node from the ontologies transform.
+                            #
+                            # NCIT and mesh stub nodes are NOT emitted here — the
+                            # OntologiesStubsTransform (kg_microbe/transform_utils/
+                            # ontologies_stubs/) writes label+synonym+xref-enriched
+                            # stubs from the SemSQL DBs, which is strictly richer
+                            # than the label-only fallback below. Emitting both
+                            # here and there would produce duplicate node rows
+                            # that the merge would have to dedupe. The PRIDE/PCO/
+                            # GENEPIO/FAO/BTO/SNOMED prefixes stay on the inline
+                            # path because each has 1-3 IDs in the whole repo —
+                            # not worth a SemSQL fetch.
                             stub_prefix = subject_id.split(":", 1)[0] if ":" in subject_id else ""
-                            if stub_prefix in STUB_ONTOLOGY_PREFIXES:
+                            if stub_prefix in STUB_ONTOLOGY_PREFIXES and stub_prefix not in {
+                                "NCIT",
+                                "mesh",
+                            }:
                                 node_writer.writerow(
                                     self._create_node_row(
                                         subject_id,

diff --git a/kg_microbe/transform_utils/constants.py b/kg_microbe/transform_utils/constants.py
@@ -13,6 +13,7 @@
 KEGG = "kegg"
 RHEAMAPPINGS = "rhea_mappings"
 ONTOLOGIES = "ontologies"
+ONTOLOGIES_STUBS = "ontologies_stubs"
 WALLEN_ETAL = "wallen_etal"
 CTD = "ctd"
 DISBIOME = "disbiome"

diff --git a/kg_microbe/transform_utils/ontologies_stubs/__init__.py b/kg_microbe/transform_utils/ontologies_stubs/__init__.py
@@ -0,0 +1,7 @@
+"""Ontologies-stubs transform package."""
+
+from kg_microbe.transform_utils.ontologies_stubs.ontologies_stubs_transform import (
+    OntologiesStubsTransform,
+)
+
+__all__ = ["OntologiesStubsTransform"]
diff --git a/kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py b/kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py
@@ -0,0 +1,253 @@
+"""
+Ontologies-stubs transform.
+
+KG-Microbe deliberately does NOT load the full NCIT or MESH ontologies — those
+belong to the sibling ``kg-microbe-biomedical`` pipeline. But the
+chemical-mapping consolidator and the BacDive isolation-source mapper reference
+~150 NCIT and MESH IDs as canonical xrefs for ingredients (e.g.
+``NCIT:C29298 'Oatmeal'``, ``mesh:D011136 'Tween'``). Without this transform
+those CURIEs would appear as dangling node ids in the merged KG: edges point at
+them but no node row carries the label.
+
+This transform:
+
+1. Calls :func:`~kg_microbe.utils.stub_curie_collection.collect_stub_curies` to
+   discover every NCIT and MESH CURIE referenced anywhere under ``mappings/``.
+2. For each CURIE, queries the local SemSQL DB (``data/raw/ncit.db``,
+   ``data/raw/mesh.db``) via OAK to fetch its ``rdfs:label``, its aliases
+   (``entity_aliases``, minus the label) and dbxrefs. The same pattern is used by the
+   chemical-mapping consolidator for ChEBI in ``scripts/consolidate_chemical_mappings.py``.
+3. Writes one KGX node TSV per stub ontology to
+   ``data/transformed/ontologies_stubs/{ncit,mesh}_nodes.tsv`` using the
+   standard nine-column node header; only ``id, category, name, xref,
+   provided_by, synonym`` are populated (there is no ``knowledge_source`` column).
+   No edges file — stubs are isolated nodes; edges arrive from the source
+   transforms (BacDive, MediaDive ingredients via the chemical-mapping path, etc.).
+
+Note for downstream consumers: if a KG built with this transform is ever
+merged with a kg-microbe-biomedical KG that loads NCIT/MESH fully, biolink
+merge semantics will union nodes — the stub node here is a strict subset of
+what the full ontology would emit (label/synonym/xref only; no edges, no
+deprecated flag, no parent classes), so the union will simply pick the
+fuller record.
+"""
+
+from __future__ import annotations
+
+import csv
+import gzip
+import shutil
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Set
+
+from kg_microbe.transform_utils.constants import (
+    CATEGORY_COLUMN,
+    DEPRECATED_COLUMN,
+    DESCRIPTION_COLUMN,
+    ID_COLUMN,
+    NAME_COLUMN,
+    PROVIDED_BY_COLUMN,
+    SAME_AS_COLUMN,
+    SYNONYM_COLUMN,
+    XREF_COLUMN,
+)
+from kg_microbe.transform_utils.transform import Transform
+from kg_microbe.utils.isolation_source_mapping_utils import STUB_ONTOLOGY_CATEGORY
+from kg_microbe.utils.stub_curie_collection import collect_stub_curies
+
+# Stub ontologies handled by this transform. Each entry maps the canonical
+# CURIE prefix (case-sensitive — must match how the prefix appears in
+# existing mapping rows) to the local SemSQL DB filename and an infores
+# knowledge-source string (currently only echoed in the per-ontology summary).
+STUB_ONTOLOGY_SOURCES: Dict[str, Dict[str, str]] = {
+    "NCIT": {
+        "db_filename": "ncit.db",
+        "knowledge_source": "infores:ncit",
+    },
+    "mesh": {
+        "db_filename": "mesh.db",
+        "knowledge_source": "infores:mesh",
+    },
+}
+
+ONTOLOGIES_STUBS_SOURCE_NAME = "ontologies_stubs"
+
+
+class OntologiesStubsTransform(Transform):
+
+    """Emit one labelled stub node per referenced NCIT / MESH CURIE."""
+
+    def __init__(
+        self,
+        input_dir: Optional[Path] = None,
+        output_dir: Optional[Path] = None,
+    ) -> None:
+        """
+        Register the transform under ``ONTOLOGIES_STUBS_SOURCE_NAME``.
+
+        :param input_dir: Where the SemSQL DBs live (defaults to ``data/raw/``).
+        :param output_dir: Where ``ontologies_stubs/{ncit,mesh}_nodes.tsv`` are
+            written (defaults to ``data/transformed/``).
+        """
+        super().__init__(ONTOLOGIES_STUBS_SOURCE_NAME, input_dir, output_dir)
+
+    def run(self, data_file=None) -> None:  # noqa: D401 — base class signature
+        """
+        Collect stub CURIEs, fetch metadata via OAK, write per-ontology node TSVs.
+
+        Every configured prefix gets a node file, header-only when nothing was found.
+
+        :param data_file: Unused (kept for the base-class signature). The
+            transform discovers its inputs from the mapping TSVs and the
+            SemSQL DBs in ``input_base_dir``.
+        """
+        curies_by_prefix = collect_stub_curies(list(STUB_ONTOLOGY_SOURCES))
+
+        # Iterate the config rather than the collector result: a prefix the
+        # collector leaves out still gets its file, and stray keys cannot KeyError.
+        for prefix, cfg in STUB_ONTOLOGY_SOURCES.items():
+            self._write_stub_nodes(
+                prefix=prefix,
+                curies=sorted(curies_by_prefix.get(prefix, ())),
+                db_path=self.input_base_dir / cfg["db_filename"],
+                knowledge_source=cfg["knowledge_source"],
+                output_file=self.output_dir / f"{prefix.lower()}_nodes.tsv",
+            )
+
+    # ------------------------------------------------------------------
+    # internal helpers
+    # ------------------------------------------------------------------
+
+    def _write_stub_nodes(
+        self,
+        prefix: str,
+        curies: List[str],
+        db_path: Path,
+        knowledge_source: str,
+        output_file: Path,
+    ) -> None:
+        """
+        Build one node row per CURIE from SemSQL metadata and write the node TSV.
+
+        ``knowledge_source`` is only logged; raises FileNotFoundError if no adapter opens.
+        """
+        if not curies:
+            print(f"  [{prefix}] no CURIEs to import; skipping {output_file.name}")
+            # A header-only file keeps an input declared in merge.yaml present.
+            self._write_node_file(output_file, [])
+            return
+
+        adapter = self._open_adapter(prefix, db_path)
+        if adapter is None:
+            raise FileNotFoundError(
+                f"OAK adapter for {prefix} could not be opened (expected SemSQL DB at "
+                f"{db_path}). Run `poetry run kg download` to fetch it. The stub "
+                f"transform refuses to silently emit unlabelled nodes — that would "
+                f"reintroduce the dangling-xref hazard this transform exists to fix."
+            )
+
+        rows: List[List[Optional[str]]] = []
+        missing: List[str] = []
+        for curie in curies:
+            label, synonyms, xrefs = self._fetch_metadata(adapter, curie)
+            if not label:
+                missing.append(curie)  # reported below so curators can chase it upstream
+            rows.append(
+                [
+                    curie,  # id
+                    STUB_ONTOLOGY_CATEGORY,  # category
+                    label or curie,  # name (CURIE as last-resort fallback)
+                    None,  # description
+                    _join_pipe(xrefs),  # xref
+                    ONTOLOGIES_STUBS_SOURCE_NAME,  # provided_by
+                    _join_pipe(synonyms),  # synonym
+                    None,  # deprecated
+                    None,  # same_as
+                ]
+            )
+        self._write_node_file(output_file, rows)
+        print(
+            f"  [{prefix}] wrote {len(rows)} stub nodes to {output_file.name} "
+            f"(knowledge_source={knowledge_source}, missing labels: {len(missing)})"
+        )
+        if missing:
+            print(f"  [{prefix}] CURIEs with no SemSQL label (used CURIE as name): {missing}")
+
+    def _open_adapter(self, prefix: str, db_path: Path):
+        """
+        Open an OAK SemSQL adapter on ``db_path``; return None when no DB file exists.
+
+        ``download.yaml`` stores the DBs gzipped. A sibling ``.db.gz`` is decompressed
+        via a temp file renamed into place, so no truncated ``.db`` is left behind.
+        """
+        if not db_path.is_file():
+            gz_path = db_path.with_suffix(db_path.suffix + ".gz")
+            if not gz_path.is_file():
+                return None
+            print(f"  [{prefix}] decompressing {gz_path.name} → {db_path.name}")
+            tmp_path = db_path.with_suffix(db_path.suffix + ".part")
+            with gzip.open(gz_path, "rb") as src, tmp_path.open("wb") as dst:
+                shutil.copyfileobj(src, dst)
+            tmp_path.replace(db_path)  # publish only a fully written DB
+        try:
+            from oaklib import get_adapter
+        except ImportError as exc:  # pragma: no cover — oaklib is a dep
+            raise RuntimeError(
+                f"oaklib import failed while opening SemSQL adapter for {prefix}: {exc}"
+            ) from exc
+        return get_adapter(f"sqlite:{db_path}")
+
+    def _fetch_metadata(self, adapter, curie: str) -> tuple[str, List[str], List[str]]:
+        """
+        Return ``(label, sorted synonyms, sorted xrefs)`` for ``curie`` via the OAK adapter.
+
+        Lookups are best-effort: a failure is printed and yields an empty value.
+        """
+        label = ""
+        synonyms: Set[str] = set()
+        metadata: dict = {}
+        try:
+            label = adapter.label(curie) or ""
+        except Exception as exc:  # best-effort, but never silent
+            print(f"  [{curie}] label lookup failed: {exc!r}")
+        try:
+            synonyms = {s for s in adapter.entity_aliases(curie) if s}
+        except Exception as exc:  # best-effort, but never silent
+            print(f"  [{curie}] synonym lookup failed: {exc!r}")
+        # Drop the canonical label out of the synonym set to keep them disjoint.
+        synonyms.discard(label)
+        try:
+            metadata = adapter.entity_metadata_map(curie) or {}
+        except Exception as exc:  # best-effort, but never silent
+            print(f"  [{curie}] metadata lookup failed: {exc!r}")
+        # dbxrefs sit under "oio:hasDbXref" ("oboInOwl:hasDbXref" on older adapters).
+        keys = ("oio:hasDbXref", "oboInOwl:hasDbXref")
+        xrefs = {str(v) for key in keys for v in (metadata.get(key, []) or []) if v}
+        return label, sorted(synonyms), sorted(xrefs)
+
+    def _write_node_file(self, path: Path, rows: Iterable[Iterable[Optional[str]]]) -> None:
+        """Create ``path`` (and parents) and write the node header plus ``rows`` as TSV."""
+        # Column order must match the row layout built in _write_stub_nodes.
+        columns = (
+            ID_COLUMN,
+            CATEGORY_COLUMN,
+            NAME_COLUMN,
+            DESCRIPTION_COLUMN,
+            XREF_COLUMN,
+            PROVIDED_BY_COLUMN,
+            SYNONYM_COLUMN,
+            DEPRECATED_COLUMN,
+            SAME_AS_COLUMN,
+        )
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with path.open("w", newline="", encoding="utf-8") as handle:
+            tsv = csv.writer(handle, delimiter="\t", lineterminator="\n")
+            tsv.writerow(columns)
+            # None cells are written as empty strings.
+            tsv.writerows(["" if cell is None else cell for cell in row] for row in rows)
+
+
+def _join_pipe(values: Iterable[str]) -> str:
+    """Join the truthy entries of ``values`` with ``|``; an empty input gives an empty string."""
+    kept = filter(None, values)  # drops "" and None, same as a truthiness test
+    return "|".join(kept)
diff --git a/kg_microbe/utils/isolation_source_mapping_utils.py b/kg_microbe/utils/isolation_source_mapping_utils.py
@@ -86,12 +86,30 @@
 # but that are NOT loaded by the ontologies transform (see ONTOLOGIES_MAP in
 # kg_microbe/transform_utils/ontologies/ontologies_transform.py). Each prefix
 # either has only a tiny number of distinct IDs in use, or its full load is
-# impractical (mesh and NCIT are huge clinical thesauri), so the BacDive
-# transform writes a thin node row per resolved CURIE using the object_label
-# from the mapping TSV. The category is biolink:OntologyClass for all stubs
-# because they're typically categorical terms (host body site, microbial
-# community, abscess, etc.) rather than specific anatomy / environmental
-# features whose canonical metadata would come from a loaded ontology.
+# impractical (mesh and NCIT are huge clinical thesauri).
+#
+# Two stub-import paths exist for these prefixes:
+#
+# 1. NCIT and mesh: a SemSQL-backed enriched stub source. The
+#    OntologiesStubsTransform (kg_microbe/transform_utils/ontologies_stubs/)
+#    queries data/raw/ncit.db and data/raw/mesh.db via OAK to fetch
+#    rdfs:label, synonyms, and dbxrefs for every NCIT/mesh CURIE that
+#    appears anywhere under mappings/. Output:
+#    data/transformed/ontologies_stubs/{ncit,mesh}_nodes.tsv. This is the
+#    preferred path — stubs carry label, synonyms and xrefs, not just a label. The
+#    BacDive inline emit at bacdive.py defers to this transform for these
+#    two prefixes (see the `not in {"NCIT", "mesh"}` branch there).
+#
+# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, BTO, SNOMED): each
+#    has 1-3 IDs in the whole repo, so the BacDive transform writes a thin
+#    label-only node row inline at edge-emit time using the object_label
+#    from the mapping TSV. Setting up SemSQL DBs for these would be
+#    overkill.
+#
+# The category is biolink:OntologyClass for all stubs because they're
+# typically categorical terms (host body site, microbial community,
+# abscess, etc.) rather than specific anatomy / environmental features
+# whose canonical metadata would come from a loaded ontology.
 #
 # Codex adversarial review #558 found that without stubs for these prefixes
 # the BacDive transform was emitting edges to dangling node IDs because the