From 03dfc79e752444ccca24d2e037c828ff6c68a337 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 10:24:40 -0600
Subject: [PATCH 01/41] chore: bump zFPKM version requirement

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 pyproject.toml | 2 +-
 uv.lock        | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ede17cb1..fb234ecc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ dependencies = [
     "statsmodels>=0.13.0; python_version < '3.12'",
     "statsmodels>=0.14.0; python_version >= '3.12'",
     "troppo@git+https://github.com/JoshLoecker/troppo@master",
-    "zfpkm>=1.0.3",
+    "zfpkm>=1.1.0",
 ]
 
 [project.optional-dependencies]
diff --git a/uv.lock b/uv.lock
index 9139aea2..64be9f70 100644
--- a/uv.lock
+++ b/uv.lock
@@ -526,7 +526,7 @@ requires-dist = [
     { name = "statsmodels", marker = "python_full_version < '3.12'", specifier = ">=0.13.0" },
     { name = "statsmodels", marker = "python_full_version >= '3.12'", specifier = ">=0.14.0" },
     { name = "troppo", git = "https://github.com/JoshLoecker/troppo?rev=master" },
-    { name = "zfpkm", specifier = ">=1.0.3" },
+    { name = "zfpkm", specifier = ">=1.1.0" },
 ]
 provides-extras = ["gurobi", "interactive"]
 
@@ -3697,7 +3697,7 @@ wheels = [
 
 [[package]]
 name = "zfpkm"
-version = "1.0.3"
+version = "1.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "loguru" },
@@ -3705,9 +3705,9 @@ dependencies = [
     { name = "numpy" },
     { name = "pandas" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/e3/7f/ff714f85601cd66439f2beed0d740772509b32b8be5a8b01a53652248714/zfpkm-1.0.3.tar.gz", hash = "sha256:58830ea61e6adc0c75f28d5304885bd03a33a6e9e56aa693856cbb37e30a5046", size = 15410, upload-time = "2025-11-10T16:47:45.614Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/0d/bf/43471c26e47fc38b5f748723e3bfa104f77ee0352f5b6a242f8b01786ae4/zfpkm-1.1.0.tar.gz", hash = "sha256:c159f78703d9d853c5f8cbbc590fb9381f097329487947d4eaf95c72ab309870", size = 15623, upload-time = "2026-03-06T16:22:29.929Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/11/f8/ef2baeaf2d15682d5d663c3f5165b63abad114fc0c4cd90b67b1ed0a6456/zfpkm-1.0.3-py3-none-any.whl", hash = "sha256:085007f97e75e50d686677ee28e3fceba5fc19958b35e9fbad3756ca2302a219", size = 17841, upload-time = "2025-11-10T16:47:44.805Z" },
+    { url = "https://files.pythonhosted.org/packages/85/a1/10cf3c5268131d37a4a968a1f5f935a07e81cb78dbb83e8bedc4a835c048/zfpkm-1.1.0-py3-none-any.whl", hash = "sha256:765d1785a22729adeb89732da01ba47abcbc9597c17c587e42176398a758b7a3", size = 18103, upload-time = "2026-03-06T16:22:29.104Z" },
 ]
 
 [[package]]

From 1e2e14f434b45ceea2aa6d986b2ed3d1ed977d45 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 10:25:05 -0600
Subject: [PATCH 02/41] chore: allow ruff to fix imports and `__all__` sections

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 ruff.toml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/ruff.toml b/ruff.toml
index 691022d2..edc19396 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -1,3 +1,4 @@
+src = ["main/como"]
 line-length = 120
 extend-include = ["docs/**/*.py", "tests/**/*.py", "**/*.ipynb"]
 
@@ -6,7 +7,11 @@ quote-style = "double"
 docstring-code-format = true
 
 [lint]
-extend-fixable = ["I001"]
+extend-fixable = [
+    "I001",
+    "RUF022",
+    "W293",
+]
 # Linting rules: https://docs.astral.sh/ruff/rules/
 unfixable = [
     "F401", # warn about, but do not remove, unused imports
@@ -45,6 +50,9 @@ ignore = [
     "TRY003", # allow exception messages outside the `Exception` class
 ]
 
+[lint.isort]
+known-first-party = ["como"]
+
 [lint.pydocstyle]
 convention = "google"
 

From a6b25e9d548b0d487d18c2317580b1ee2440125d Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 10:27:52 -0600
Subject: [PATCH 03/41] chore: bump bioservices requirement

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 pyproject.toml | 2 +-
 uv.lock        | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index fb234ecc..7a277114 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ requires-python = ">=3.11,<3.14"
 dependencies = [
     "aioftp>=0.23.1",
     "anndata>=0.12.0",
-    "bioservices>=1.12.1",
+    "bioservices>=1.13.0",
     "cobamp@git+https://github.com/JoshLoecker/cobamp@master",
     "cobra>=0.28.0",
     "joypy>=0.2.6",
diff --git a/uv.lock b/uv.lock
index 64be9f70..09d79c73 100644
--- a/uv.lock
+++ b/uv.lock
@@ -181,7 +181,7 @@ wheels = [
 
 [[package]]
 name = "bioservices"
-version = "1.12.1"
+version = "1.13.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "appdirs" },
@@ -201,9 +201,9 @@ dependencies = [
     { name = "wrapt" },
     { name = "xmltodict" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/5e/51/1a8dc87ffc6c27e0a896b982da3ff1b483f5ef85a47ae1ffafbc6a479bb4/bioservices-1.12.1.tar.gz", hash = "sha256:0f31782ae50930d4ab82b43f98d1ca2cc9befaa699f3a7a6cba512a8f9e2cab0", size = 218752, upload-time = "2025-02-27T22:38:58.628Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/99/8c/b9a20b6656446daaea5f3085d5465aad8f536c95fd14ffd6fcdcdf410031/bioservices-1.13.0.tar.gz", hash = "sha256:24d30be650d37c1377b4fd452abead9ec5ee891dbdaa96a21543ab508d401386", size = 220384, upload-time = "2026-03-02T14:49:11.002Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b8/c9/656854139abbdc0a09d544bacc6fa3132245ebf9847dd45da9a8e416c5a5/bioservices-1.12.1-py3-none-any.whl", hash = "sha256:898033fe0158a0e31ea5ad93ff02f5000aa67ff9d06e9dc6477583d3f60ace80", size = 258032, upload-time = "2025-02-27T22:38:56.406Z" },
+    { url = "https://files.pythonhosted.org/packages/27/49/93a413b3b70a96e525f11212d0ca7992c4c7b35c47da78bf227a52282e85/bioservices-1.13.0-py3-none-any.whl", hash = "sha256:aaafb5ee246bccd9323a21de12ea6a35163e01c1ed4777e45acbda565804fc32", size = 259727, upload-time = "2026-03-02T14:49:09.879Z" },
 ]
 
 [[package]]
@@ -504,7 +504,7 @@ dev = [
 requires-dist = [
     { name = "aioftp", specifier = ">=0.23.1" },
     { name = "anndata", specifier = ">=0.12.0" },
-    { name = "bioservices", specifier = ">=1.12.1" },
+    { name = "bioservices", specifier = ">=1.13.0" },
     { name = "cobamp", git = "https://github.com/JoshLoecker/cobamp?rev=master" },
     { name = "cobra", specifier = ">=0.28.0" },
     { name = "gurobipy", marker = "extra == 'gurobi'", specifier = "<14" },

From 027f97a38d80277852af1f5f12bf4baa13b814a3 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 10:29:06 -0600
Subject: [PATCH 04/41] chore: remove the `log_and_raise_error` helper function
 for better visibility of where errors are being raised

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/main/como/utils.py b/main/como/utils.py
index 98daca16..a0c51b80 100644
--- a/main/como/utils.py
+++ b/main/como/utils.py
@@ -18,7 +18,6 @@
 T = TypeVar("T")
 __all__ = [
     "get_missing_gene_data",
-    "log_and_raise_error",
     "num_columns",
     "num_rows",
     "read_file",

From fa75fc6bda552d4a1cc3f73ee7f884cf0ddad4b2 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 10:31:50 -0600
Subject: [PATCH 05/41] chore: remove the internal `log_and_raise_error` helper
 function for better visibility of where errors are being raised

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/merge_xomics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py
index 2cfdd1a1..564c2025 100644
--- a/main/como/merge_xomics.py
+++ b/main/como/merge_xomics.py
@@ -22,7 +22,7 @@
     _SourceWeights,
 )
 from como.project import Config
-from como.utils import get_missing_gene_data, log_and_raise_error, read_file, return_placeholder_data, set_up_logging
+from como.utils import get_missing_gene_data, read_file, return_placeholder_data, set_up_logging
 
 
 class _MergedHeaderNames:

From f59e21a67153ff1737aaf01fc9e14b5233725787 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 10:36:07 -0600
Subject: [PATCH 06/41] feat: add genomic conversion pipelines directly into
 COMO using `bioservices`

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/pipelines/identifier.py | 221 +++++++++++++++++++++++-------
 1 file changed, 170 insertions(+), 51 deletions(-)

diff --git a/main/como/pipelines/identifier.py b/main/como/pipelines/identifier.py
index 59fe7444..e75c46af 100644
--- a/main/como/pipelines/identifier.py
+++ b/main/como/pipelines/identifier.py
@@ -1,37 +1,55 @@
-from collections.abc import Sequence
-from typing import Literal
+from collections.abc import Iterable, Iterator, Sequence
+from typing import Any, Literal, overload
 
 import pandas as pd
 from bioservices.mygeneinfo import MyGeneInfo
+from tqdm import tqdm
 
 __all__ = [
-    "convert",
+    "build_gene_info",
     "determine_gene_type",
+    "get_remaining_identifiers",
 ]
 
+T_IDS = int | str | Iterable[int] | Iterable[str] | Iterable[int | str]
 T_MG_SCOPE = Literal["entrezgene", "ensembl.gene", "symbol"]
 T_MG_TRANSLATE = Literal["entrez_gene_id", "ensembl_gene_id", "gene_symbol"]
 T_MG_RETURN = list[dict[T_MG_TRANSLATE, str]]
 
 
-def _get_conversion(info: MyGeneInfo, values: list[str], scope: T_MG_SCOPE, fields: str, taxon: str) -> T_MG_RETURN:
-    value_str = ",".join(map(str, values))
-    results = info.get_queries(query=value_str, dotfield=True, scopes=scope, fields=fields, species=taxon)
-    if not isinstance(results, list):
-        raise TypeError(f"Expected results to be a list, but got {type(results)}")
-    if not isinstance(results[0], dict):
-        raise TypeError(f"Expected each result to be a dict, but got {type(results[0])}")
-
-    data: T_MG_RETURN = []
-    for result in results:
-        ensembl = result.get("query" if scope == "ensembl.gene" else "ensembl.gene")
-        entrez = result.get("query" if scope == "entrezgene" else "entrezgene")
-        symbol = result.get("query" if scope == "symbol" else "symbol")
-        data.append({"ensembl_gene_id": ensembl, "entrez_gene_id": entrez, "gene_symbol": symbol})
+def _get_conversion(info: MyGeneInfo, values: T_IDS, taxon: str | int) -> list[dict[str, Any]]:
+    value_list = sorted(map(str, [values] if isinstance(values, (int, str)) else values))
+    data_type = determine_gene_type(value_list)
+    if not all(v == data_type[value_list[0]] for v in data_type.values()):
+        raise ValueError("All items in ids must be of the same type (Entrez, Ensembl, or symbols).")
+
+    chunk_size = 1000
+    taxon_str = str(taxon)
+    scope: T_MG_SCOPE = next(iter(data_type.values()))
+    data = []
+    chunks = range(0, len(value_list), chunk_size)
+
+    for i in tqdm(chunks, desc=f"Getting info for '{scope}'"):
+        result = info.get_queries(
+            query=",".join(map(str, value_list[i : i + chunk_size])),
+            dotfield=True,
+            scopes=scope,
+            fields="ensembl.gene,entrezgene,symbol,genomic_pos.start,genomic_pos.end,taxid,notfound",
+            species=taxon_str,
+        )
+        if isinstance(result, int) and result == 414:
+            raise ValueError(
+                f"Query too long. Reduce the number of IDs in each query chunk (current chunk size: {chunk_size})."
+            )
+        if not isinstance(result, list):
+            raise TypeError(f"Expected results to be a list, but got {type(result)}")
+        if not isinstance(result[0], dict):
+            raise TypeError(f"Expected each result to be a dict, but got {type(result[0])}")
+        data.extend(result)
     return data
 
 
-def convert(ids: int | str | Sequence[int] | Sequence[str] | Sequence[int | str], taxon: int | str, cache: bool = True):
+def get_remaining_identifiers(ids: T_IDS, taxon: int | str, cache: bool = True):
     """Convert between genomic identifiers.
 
     This function will convert between the following components:
@@ -46,33 +64,111 @@ def convert(ids: int | str | Sequence[int] | Sequence[str] | Sequence[int | str]
     :return: DataFrame with columns "entrez_gene_id", "ensembl_gene_id", and "gene_symbol"
     """
     my_geneinfo = MyGeneInfo(cache=cache)
-    chunk_size = 1000
-    id_list = list(map(str, [ids] if isinstance(ids, (int, str)) else ids))
-    chunks = list(range(0, len(id_list), chunk_size))
+    gene_data = _get_conversion(info=my_geneinfo, values=ids, taxon=taxon)
+    df = (
+        pd.json_normalize(gene_data)
+        .rename(
+            columns={
+                "ensembl.gene": "ensembl_gene_id",
+                "entrezgene": "entrez_gene_id",
+                "symbol": "gene_symbol",
+                "taxid": "taxon_id",
+            }
+        )
+        .drop(
+            columns=["query", "_id", "_score", "genomic_pos.end", "genomic_pos.start"],
+            errors="ignore",
+        )
+    )
+    df = df[df["taxon_id"] == taxon]
+    df["taxon_id"] = df["taxon_id"].astype(int, copy=True)
+
+    # BUG: For an unknown reason, some Ensembl IDs are actually Entrez IDs
+    #   To filter these, two approaches can be done:
+    #   1) Remove rows where Ensembl IDs are integers
+    #   2) Remove rows where Ensembl IDs equal Entrez IDs
+    # We are selecting option 1 because it goes for the root cause: Ensembl IDs are not pure integers
+    mask = df["ensembl_gene_id"].astype(str).str.fullmatch(r"\d+").fillna(False)
+    df = df[
+        (df["ensembl_gene_id"].astype("string").notna())  # remove NA values
+        & (~df["ensembl_gene_id"].astype("string").str.fullmatch(r"\d+"))  # remove Entrez IDs
+    ]
+    return df
+
+
+def _to_scalar(val) -> int:
+    """Calculate the distance between end (e) and start (s)."""
+    if isinstance(val, list):
+        return int(sum(val) / len(val)) if val else 0  # `if val` checks that the list contains items
+    if pd.isna(val):
+        return 0
+    return int(val)
+
+
+def build_gene_info(ids: T_IDS, taxon: int | str, cache: bool = True):
+    """Get genomic information from a given set of IDs.
+
+    The input should be of the same type, otherwise this function will fail.
+    Expected types are:
+        - Ensembl Gene ID
+        - Entrez Gene ID
+        - Gene Symbol
 
-    data_type = determine_gene_type(id_list)
-    if not all(v == data_type[id_list[0]] for v in data_type.values()):
-        raise ValueError("All items in ids must be of the same type (Entrez, Ensembl, or symbols).")
+    The returned data frame will have the following columns:
+        - ensembl_gene_id
+        - entrez_gene_id
+        - gene_symbol
+        - size (distance between genomic end and start)
 
-    scope = next(iter(data_type.values()))
-    fields = ",".join({"ensembl.gene", "entrezgene", "symbol"} - {scope})
-    taxon_str = str(taxon)
-    return pd.DataFrame(
-        [
-            row
-            for i in chunks
-            for row in _get_conversion(
-                info=my_geneinfo,
-                values=id_list[i : i + chunk_size],
-                scope=scope,
-                fields=fields,
-                taxon=taxon_str,
-            )
-        ]
+    :param ids: IDs to be converted
+    :param taxon: Taxonomic identifier
+    :param cache: Should local caching be used for queries
+    :return: pandas.DataFrame
+    """
+    my_geneinfo = MyGeneInfo(cache=cache)
+    gene_data = _get_conversion(info=my_geneinfo, values=ids, taxon=taxon)
+    df = pd.json_normalize(gene_data).rename(columns={"taxid": "taxon_id"})
+    df = df[df["taxon_id"] == taxon]
+    df["taxon_id"] = df["taxon_id"].astype(int, copy=True)
+
+    df["size"] = df["genomic_pos.end"].fillna(0).map(_to_scalar) - df["genomic_pos.start"].fillna(0).map(_to_scalar)
+    df = (
+        df[~(df["size"] == 0)]
+        .drop(
+            columns=[
+                "query",
+                "_id",
+                "_score",
+                "genomic_pos.start",
+                "genomic_pos.end",
+                "notfound",
+            ],
+            inplace=False,
+            errors="ignore",
+        )
+        .rename(
+            columns={
+                "ensembl.gene": "ensembl_gene_id",
+                "entrezgene": "entrez_gene_id",
+                "symbol": "gene_symbol",
+            }
+        )
+        .explode(column=["ensembl_gene_id"])
+        .sort_values(by="ensembl_gene_id", inplace=False)
     )
 
+    return df
+
+
+@overload
+def determine_gene_type(items: str, /) -> T_MG_SCOPE: ...
+
+
+@overload
+def determine_gene_type(items: Sequence[str], /) -> dict[str, T_MG_SCOPE]: ...
+
 
-def determine_gene_type(items: str | list[str], /) -> dict[str, T_MG_SCOPE]:
+def determine_gene_type(items: str | Sequence[str], /) -> str | dict[str, T_MG_SCOPE]:
     """Determine the genomic data type.
 
     :param items: A string or list of strings representing gene identifiers.
@@ -85,16 +181,39 @@ def determine_gene_type(items: str | list[str], /) -> dict[str, T_MG_SCOPE]:
             followed by a specific format (length greater than 11 and the last 11 characters are digits).
         - "gene_symbol": If the item does not match the above criteria, it is assumed to be a gene symbol.
     """
-    items = [items] if isinstance(items, str) else items
-
-    determine: dict[str, Literal["entrezgene", "ensembl.gene", "symbol"]] = {}
-    for i in items:
-        i_str = str(i).split(".")[0] if isinstance(i, float) else str(i)
-        if i_str.isdigit():
-            determine[i_str] = "entrezgene"
-        elif i_str.startswith("ENS") and (len(i_str) > 11 and all(i.isdigit() for i in i_str[-11:])):
-            determine[i_str] = "ensembl.gene"
+    values = (items,) if isinstance(items, str) else items
+    result: dict[str, Literal["entrezgene", "ensembl.gene", "symbol"]] = {}
+
+    for i in values:
+        s = str(i).partition(".")[0]  # remove any transcripts that may exist
+
+        if s.startswith("ENS") and len(s) > 11 and s[-11:].isdigit():
+            result[s] = "ensembl.gene"
+        elif s.isdigit():
+            result[s] = "entrezgene"
         else:
-            determine[i_str] = "symbol"
+            result[s] = "symbol"
+
+    if isinstance(items, str):
+        return result[items]
+    return result
+
 
-    return determine
+def contains_identical_gene_types(values: dict[str, T_MG_SCOPE] | Sequence[T_MG_SCOPE]) -> bool:
+    data = values if not isinstance(values, dict) else list(values.values())
+    return all(v == data[0] for v in data)
+
+
+if __name__ == "__main__":
+    r = build_gene_info(
+        ids=[
+            "ENSG00000284484",
+            "ENSG00000299311",
+            "ENSG00000202151",
+            "ENSG00000226053",
+            "ENSG00000131831",
+        ],
+        taxon=9606,
+        cache=True,
+    )
+    print(r.columns)

From 391188020a3b6e489c1d9c5fee850c4192c5c8d0 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 10:41:21 -0600
Subject: [PATCH 07/41] feat: add genomic conversion pipelines directly into
 COMO using `bioservices`

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main/como/utils.py b/main/como/utils.py
index a0c51b80..11a34f16 100644
--- a/main/como/utils.py
+++ b/main/como/utils.py
@@ -13,7 +13,7 @@
 from loguru import logger
 
 from como.data_types import LOG_FORMAT, Algorithm, LogLevel
-from como.pipelines.identifier import convert
+from como.pipelines.identifier import get_remaining_identifiers
 
 T = TypeVar("T")
 __all__ = [
@@ -134,7 +134,7 @@ def get_missing_gene_data(values: Sequence[str] | pd.DataFrame | sc.AnnData, tax
     # second isinstance required for static type check to be happy
     # if isinstance(values, list) and not isinstance(values, pd.DataFrame):
     if isinstance(values, list):
-        return convert(ids=values, taxon=taxon_id)
+        return get_remaining_identifiers(ids=values, taxon=taxon_id)
     elif isinstance(values, pd.DataFrame):
         # raise error if duplicate column names exist
         if any(values.columns.duplicated(keep=False)):

From 46f30cd336b15701c2a36c98f548d33163640817 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 10:41:55 -0600
Subject: [PATCH 08/41] feat: add genomic conversion pipelines directly into
 COMO using `bioservices`

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/rnaseq_preprocess.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py
index 24d5e8b6..b4ef2841 100644
--- a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -18,7 +18,7 @@
 from loguru import logger
 
 from como.data_types import LogLevel, RNAType
-from como.pipelines.identifier import convert
+from como.pipelines.identifier import build_gene_info, get_remaining_identifiers
 from como.utils import read_file, set_up_logging
 
 
@@ -473,7 +473,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]:
         if isinstance(data_, pd.DataFrame):
             return data_["ensembl_gene_id"].tolist()
         try:
-            conversion = convert(ids=data_.var_names.tolist(), taxon=taxon)
+            conversion = get_remaining_identifiers(ids=data_.var_names.tolist(), taxon=taxon)
         except json.JSONDecodeError as e:
             raise ValueError(f"Got a JSON decode error for file '{counts_matrix_filepaths}' ({e})")
 

From f2b3277e5202a7229fbc3a24396c907fa22455e3 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 10:43:16 -0600
Subject: [PATCH 09/41] feat: add genomic conversion pipelines directly into
 COMO using `bioservices`

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/combine_distributions.py | 5 ++---
 main/como/rnaseq_gen.py            | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py
index 5bb39d24..24f8889a 100644
--- a/main/como/combine_distributions.py
+++ b/main/como/combine_distributions.py
@@ -18,8 +18,7 @@
     _OutputCombinedSourceFilepath,
     _SourceWeights,
 )
-from como.pipelines.identifier import convert
-from como.utils import LogLevel, get_missing_gene_data, log_and_raise_error, num_columns
+from como.pipelines.identifier import get_remaining_identifiers
 from como.utils import num_columns
 
 
@@ -287,7 +286,7 @@ async def _begin_combining_distributions(
                 matrix_subset = matrix_subset.set_index(keys=[GeneIdentifier.entrez_gene_id.value], drop=True)
                 matrix_subset = matrix_subset.drop(columns=["gene_symbol", "ensembl_gene_id"], errors="ignore")
             elif isinstance(matrix, sc.AnnData):
-                conversion = convert(ids=matrix.var_names.tolist(), taxon=taxon)
+                conversion = get_remaining_identifiers(ids=matrix.var_names.tolist(), taxon=taxon)
                 conversion.reset_index(drop=False, inplace=True)
                 matrix: pd.DataFrame = matrix.to_df().T
                 matrix.reset_index(inplace=True, drop=False, names=["gene_symbol"])
diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index 3f8c00db..5a5588ae 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -24,7 +24,7 @@
 
 from como.data_types import FilteringTechnique, LogLevel, RNAType
 from como.migrations import gene_info_migrations
-from como.pipelines.identifier import convert
+from como.pipelines.identifier import contains_identical_gene_types, determine_gene_type
 from como.project import Config
 from como.utils import read_file, set_up_logging
 

From d4e7ea8d3fc8260a1c40851ad154a53a0f08ff8a Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 10:45:05 -0600
Subject: [PATCH 10/41] feat: extract gene-info building as a pipeline
 component

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/pipelines/identifier.py | 21 +++-----
 main/como/rnaseq_preprocess.py    | 86 +------------------------------
 2 files changed, 8 insertions(+), 99 deletions(-)

diff --git a/main/como/pipelines/identifier.py b/main/como/pipelines/identifier.py
index e75c46af..e09bd9dc 100644
--- a/main/como/pipelines/identifier.py
+++ b/main/como/pipelines/identifier.py
@@ -7,6 +7,7 @@
 
 __all__ = [
     "build_gene_info",
+    "contains_identical_gene_types",
     "determine_gene_type",
     "get_remaining_identifiers",
 ]
@@ -200,20 +201,10 @@ def determine_gene_type(items: str | Sequence[str], /) -> str | dict[str, T_MG_S
 
 
 def contains_identical_gene_types(values: dict[str, T_MG_SCOPE] | Sequence[T_MG_SCOPE]) -> bool:
+    """Check if all values in the input are identical.
+
+    :param values: A dictionary mapping gene identifiers to their types or a sequence of gene types.
+    :return: True if all values are identical, False otherwise.
+    """
     data = values if not isinstance(values, dict) else list(values.values())
     return all(v == data[0] for v in data)
-
-
-if __name__ == "__main__":
-    r = build_gene_info(
-        ids=[
-            "ENSG00000284484",
-            "ENSG00000299311",
-            "ENSG00000202151",
-            "ENSG00000226053",
-            "ENSG00000131831",
-        ],
-        taxon=9606,
-        cache=True,
-    )
-    print(r.columns)
diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py
index b4ef2841..ec1a4f81 100644
--- a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -486,90 +486,8 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]:
         "depending on the number of genes and your internet connection"
     )
 
-    ensembl_ids: set[str] = set(
-        chain.from_iterable(await asyncio.gather(*[read_ensembl_gene_ids(f) for f in counts_matrix_filepaths]))
-    )
-    gene_data: list[dict[str, str | int | list[str] | list[int] | None]] = await MyGene(cache=cache).query(
-        items=list(ensembl_ids),
-        taxon=taxon,
-        scopes="ensemblgene",
-    )
-
-    n = len(gene_data)
-    all_gene_symbols: list[str] = ["-"] * n
-    all_entrez_ids: list[str | int] = ["-"] * n
-    all_ensembl_ids: list[str] = ["-"] * n
-    all_sizes: list[int] = [-1] * n
-
-    def _avg_pos(value: int | list[int] | None) -> int:
-        if value is None:
-            return 0
-        if isinstance(value, list):
-            return int(sum(value) / len(value)) if value else 0
-        return int(value)
-
-    for i, data in enumerate(gene_data):
-        data: dict[str, str | int | list[str] | list[int] | None]
-        if "genomic_pos.start" not in data:
-            log_and_raise_error(
-                message="Unexpectedly missing key 'genomic_pos.start'", error=KeyError, level=LogLevel.WARNING
-            )
-        if "genomic_pos.end" not in data:
-            log_and_raise_error(
-                message="Unexpectedly missing key 'genomic_pos.end'", error=KeyError, level=LogLevel.WARNING
-            )
-        if "ensembl.gene" not in data:
-            log_and_raise_error(
-                message="Unexpectedly missing key 'ensembl.gene'", error=KeyError, level=LogLevel.WARNING
-            )
-
-        start = data["genomic_pos.start"]
-        end = data["genomic_pos.end"]
-        ensembl_id = data["ensembl.gene"]
-
-        if not isinstance(start, int):
-            log_and_raise_error(
-                message=f"Unexpected type for 'genomic_pos.start': expected int, got {type(start)}",
-                error=TypeError,
-                level=LogLevel.WARNING,
-            )
-        if not isinstance(end, int):
-            log_and_raise_error(
-                message=f"Unexpected type for 'genomic_pos.end': expected int, got {type(start)}",
-                error=TypeError,
-                level=LogLevel.WARNING,
-            )
-        if not isinstance(ensembl_id, str):
-            log_and_raise_error(
-                message=f"Unexpected type for 'ensembl.gene': expected str, got {type(ensembl_id)}",
-                error=ValueError,
-                level=LogLevel.WARNING,
-            )
-
-        size = end - start
-        all_ensembl_ids[i] = ",".join(map(str, ensembl_id)) if isinstance(ensembl_id, list) else ensembl_id
-        all_gene_symbols[i] = str(data.get("symbol", "-"))
-        all_entrez_ids[i] = str(data.get("entrezgene", "-"))
-        all_sizes[i] = max(size, -1)  # use `size` otherwise -1
-
-    gene_info: pd.DataFrame = pd.DataFrame(
-        {
-            "ensembl_gene_id": all_ensembl_ids,
-            "gene_symbol": all_gene_symbols,
-            "entrez_gene_id": all_entrez_ids,
-            "size": all_sizes,
-        }
-    )
-
-    # remove rows where every gene size value is -1 (not available)
-    gene_info = gene_info[~(gene_info == -1).all(axis=1)]
-
-    gene_info["ensembl_gene_id"] = gene_info["ensembl_gene_id"].str.split(",")  # extend lists into multiple rows
-    gene_info = gene_info.explode(column=["ensembl_gene_id"])
-    # we would set `entrez_gene_id` to int here as well, but not all ensembl ids are mapped to entrez ids,
-    #   and as a result, there are still "-" values in the entrez id column that cannot be converted to an integer
-
-    gene_info = gene_info.sort_values(by="ensembl_gene_id")
+    ensembl_ids: set[str] = set(chain.from_iterable(read_ensembl_gene_ids(f) for f in counts_matrix_filepaths))
+    gene_info = build_gene_info(ids=ensembl_ids, taxon=taxon, cache=cache)
     output_filepath.parent.mkdir(parents=True, exist_ok=True)
     gene_info.to_csv(output_filepath, index=False)
     logger.success(f"Gene Info file written at '{output_filepath}'")

From b75e126f47edba9d0df3217bbde76ef4b29f8cf1 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 10:58:29 -0600
Subject: [PATCH 11/41] chore: add types for ModelBuildSettings

- Add `ModelBuildSettings` dataclass for solver parameters
- Remove unused PeakIdentificationParameters and _BuildResults
- Minor formatting cleanup

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/data_types.py | 122 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 107 insertions(+), 15 deletions(-)

diff --git a/main/como/data_types.py b/main/como/data_types.py
index ca0f9b8c..07cd3ba8 100644
--- a/main/como/data_types.py
+++ b/main/como/data_types.py
@@ -4,9 +4,11 @@
 from dataclasses import dataclass, field, fields
 from enum import Enum
 from pathlib import Path
-from typing import ClassVar, NamedTuple
+from typing import ClassVar, NamedTuple, NotRequired, TypedDict
 
 import cobra
+import numpy as np
+import numpy.typing as npt
 import pandas as pd
 from loguru import logger
 
@@ -78,11 +80,6 @@ class SourceTypes(str, Enum):
     proteomics = "proteomics"
 
 
-class PeakIdentificationParameters(NamedTuple):
-    height: float = 0.02
-    distance: float = 1.0
-
-
 class CobraCompartments:
     """Convert from compartment "long-hand" to "short-hand".
 
@@ -135,7 +132,9 @@ class CobraCompartments:
         "s": ["eyespot", "eyespot apparatus", "stigma"],
     }
 
-    _REVERSE_LOOKUP: ClassVar[dict[str, list[str]]] = {value.lower(): key for key, values in SHORTHAND.items() for value in values}
+    _REVERSE_LOOKUP: ClassVar[dict[str, list[str]]] = {
+        value.lower(): key for key, values in SHORTHAND.items() for value in values
+    }
 
     @classmethod
     def get_shorthand(cls, longhand: str) -> str | None:
@@ -163,14 +162,6 @@ def get_longhand(cls, shorthand: str) -> str | None:
         return longhand[0] if longhand else None
 
 
-class _BuildResults(NamedTuple):
-    """Results of building a context specific model."""
-
-    model: cobra.Model
-    expression_index_list: list[int]
-    infeasible_reactions: pd.DataFrame
-
-
 class _BoundaryReactions(NamedTuple):
     """Boundary reactions to be used in the context specific model."""
 
@@ -269,3 +260,104 @@ class _SourceWeights(_BaseDataType):
     mrna: int
     scrna: int
     proteomics: int
+
+
+@dataclass
+class ModelBuildSettings:
+    troppo_epsilon: float = 1e-4
+    min_reaction_flux: float = 1e-7
+    solver_timeout: int = 1800  # time in seconds, defaults to 30 minutes
+
+    """
+    Verbosity
+        Type: int
+        Default value: 0
+        Range: [0, 3]
+        From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#csclientlog
+    """
+    solver_verbosity: int = 0
+
+    """
+    Feasibility
+        Type: double
+        Default value: 1e-6
+        Range: [1e-9, 1e-2]
+        From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#feasibilitytol
+    """
+    solver_feasibility: float = 1e-6
+
+    """
+    OptimalityTol
+        Type: double
+        Default value: 1e-6
+        Range: [1e-9, 1e-2]
+        From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#optimalitytol
+    """
+    solver_optimality: float = 1e-6
+
+    """
+    IntFeasTol
+        Type: double
+        Default value: 1e-5
+        Range: [1e-9, 1e-1]
+        From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#intfeastol
+    """
+    solver_integrality: float = 1e-6
+
+    """
+    MIPGap
+        Relative MIP optimality gap
+        Default value: 1e-4
+        Range: [0, inf)
+        From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#mipgap
+    """
+    gurobi_mipgap: float = 1e-4
+
+    """
+    IntegralityFocus
+        Type: int
+        Default value: 0
+        Range: [0, 1]
+        From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#integralityfocus
+    """
+    gurobi_integrality_focus: int = 0
+
+    """
+    NumericFocus
+        Type: int
+        Default value: 0
+        Range: [0, 3]
+        From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#numericfocus
+    """
+    gurobi_numeric_focus: int = 2
+
+    def __post_init__(self):  # noqa: C901
+        """Validate provided arguments.
+
+        :raises: ValueError if any check fails.
+        """
+        if self.troppo_epsilon < 0:
+            raise ValueError("ModelBuildSettings: `troppo_epsilon` must be a non-negative float")
+        if self.min_reaction_flux < 0:
+            raise ValueError("ModelBuildSettings: `min_reaction_flux` must be a non-negative float")
+        if self.solver_verbosity not in {0, 1, 2, 3} or not isinstance(self.solver_verbosity, int):
+            raise ValueError("ModelBuildSettings: `solver_verbosity` must be an integer in the range [0, 3]")
+        if self.solver_timeout < 0 or not isinstance(self.solver_timeout, int):
+            raise ValueError("ModelBuildSettings: `solver_timeout` must be a non-negative integer")
+        if not (1e-9 < self.solver_feasibility < 1e-2):
+            raise ValueError("ModelBuildSettings: `solver_feasibility` must be a float in the range [1e-9, 1e-2]")
+        if not (1e-9 < self.solver_optimality < 1e-2):
+            raise ValueError("ModelBuildSettings: `solver_optimality` must be a float in the range [1e-9, 1e-2]")
+        if not (1e-9 < self.solver_integrality < 1e-1):
+            raise ValueError("ModelBuildSettings: `solver_integrality` must be a float in the range [1e-9, 1e-1]")
+        if self.gurobi_mipgap < 0:
+            raise ValueError("ModelBuildSettings: `gurobi_mipgap` must be a float in the range [1e-4, inf.)")
+        if self.gurobi_integrality_focus not in {0, 1} or not isinstance(self.gurobi_integrality_focus, int):
+            raise ValueError("ModelBuildSettings: `gurobi_integrality_focus` must be an integer in the range [0, 1]")
+        if self.gurobi_numeric_focus not in {0, 1, 2, 3} or not isinstance(self.gurobi_numeric_focus, int):
+            raise ValueError("ModelBuildSettings: `gurobi_numeric_focus` must be an integer in the range [0, 3]")
+        if self.troppo_epsilon < self.min_reaction_flux * 1000:
+            logger.warning(
+                "ModelBuildSettings: `troppo_epsilon` and `min_reaction_flux` have similar values. "
+                "This can cause inconsistent model builds. Consider ~1000x fold change between the two. "
+            )

From e06f64038beb531a63b83be5010e0cf428f4eeff Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:01:48 -0600
Subject: [PATCH 12/41] feat: add TPM processing from Salmon quantification

- Adds TPM dataframe output support
- !Converts all async functions to sync
- Refactors `_write_counts_matrix` to `_write_matrices
- Add better validation for output filepaths
- No longer utilize STAR gene count files

![BREAKING-CHANGE]: Converts all async functions to sync

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/rnaseq_preprocess.py | 381 +++++++++++++++++++--------------
 1 file changed, 223 insertions(+), 158 deletions(-)

diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py
index ec1a4f81..edd1c88d 100644
--- a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -1,19 +1,15 @@
 from __future__ import annotations
 
-import asyncio
-import csv
 import functools
-import io
 import json
 import re
 import sys
 from dataclasses import dataclass, field
 from itertools import chain
 from pathlib import Path
-from typing import Final, Literal, TextIO, cast
+from typing import Final, Literal, TextIO
 
 import numpy as np
-import numpy.typing as npt
 import pandas as pd
 from loguru import logger
 
@@ -101,6 +97,7 @@ def __post_init__(self):
 class SampleConfiguration:
     sample_name: str
     effective_lengths: pd.DataFrame
+    tpm: pd.DataFrame
     mean_effective_length: float
     layout: str
     strand: str
@@ -122,31 +119,36 @@ def __post_init__(self):
             raise ValueError(f"Sample '{self.sample_name}' is missing 'effective_length' column")
 
     @classmethod
-    def to_dataframe(cls, samples: list[SampleConfiguration]) -> tuple[pd.DataFrame, pd.DataFrame]:
+    def to_dataframe(cls, samples: list[SampleConfiguration]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
         """Convert a list of SampleConfiguration to a dataframe.
 
         :param samples: The list of SampleConfiguration objects to convert.
         :return: A tuple of dataframes:
             [0]: The sample configuration as a dataframe
-            [1]: The effective lengths as a separate data frame with `same_name` as columns
+            [1]: The effective lengths as a separate data frame with `sample_name` as columns
+            [2]: The salmon TPM values as a separate data frame with `sample_name` as columns
         """
-        config = pd.DataFrame(
-            columns=["sample_name", "mean_effective_length", "layout", "strand", "study", "library_prep"]
+        config: pd.DataFrame = pd.DataFrame.from_records(
+            [(s.sample_name, s.mean_effective_length, s.layout, s.strand, s.study, s.library_prep) for s in samples],
+            columns=["sample_name", "mean_effective_length", "layout", "strand", "study", "library_prep"],
         )
+        lengths = pd.concat(
+            [
+                s.effective_lengths.set_index("name")["effective_length"]
+                .astype(np.float64, copy=False)
+                .rename(s.sample_name)
+                for s in samples
+            ],
+            axis=1,
+        ).fillna(0.0)
+        lengths.index.name = "ensembl_gene_id"
+        tpms = pd.concat(
+            [s.tpm.set_index("name")["tpm"].astype(np.float64, copy=False).rename(s.sample_name) for s in samples],
+            axis=1,
+        )
+        tpms.index.name = "ensembl_gene_id"
 
-        genes = set()
-        for s in samples:
-            genes.update(s.effective_lengths["name"].to_list())
-
-        lengths = pd.DataFrame(data=np.float64(0.0), columns=[s.sample_name for s in samples], index=list(genes))
-        for sample in samples:
-            ids: list[str] = sample.effective_lengths["name"].to_list()
-            data: npt.NDArray[np.floating] = sample.effective_lengths["effective_length"].to_numpy(dtype=np.float64)
-            lengths.loc[ids, sample.sample_name] = data
-
-        return config, lengths
-
-        return config, lengths
+        return config, lengths, tpms
 
 
 def _sample_name_from_filepath(file: Path) -> str:
@@ -234,10 +236,7 @@ def _process_first_multirun_sample(strand_file: Path, all_quant_files: list[Path
     sample_counts = sample_counts.fillna(value=0)
     sample_counts["counts"] = sample_counts["counts"].astype(float)
 
-    count_avg = cast(  # type checkers think `.groupby(...).mean()` returns a pd.Series, force pd.DataFrame
-        pd.DataFrame, cast(object, sample_counts.groupby("ensembl_gene_id", as_index=False)["counts"].mean())
-    )
-    count_avg["counts"] = np.ceil(count_avg["counts"].astype(int))
+    count_avg = sample_counts.groupby("ensembl_gene_id", as_index=False).mean()
     count_avg.columns = ["ensembl_gene_id", _sample_name_from_filepath(strand_file)]
     return count_avg
 
@@ -270,6 +269,9 @@ def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame:
         strand_file=metrics.strand_files[0],
         all_quant_files=metrics.quant_files,
     )
+    if counts is None:
+        raise ValueError(f"Unable to process the first sample for study '{metrics.study_name}'")
+    counts = counts.drop(columns=["length", "effective_length", "tpm"])
 
     for i in range(1, metrics.num_samples):
         new_counts = _prepare_sample_counts(
@@ -283,6 +285,7 @@ def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame:
             continue
 
         assert isinstance(counts, pd.DataFrame)  # noqa: S101
+        new_counts = new_counts.drop(columns=["length", "effective_length", "tpm"])
         counts = counts.merge(new_counts, on="ensembl_gene_id", how="outer")
         counts = counts.fillna(value=0)
 
@@ -298,13 +301,15 @@ def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame:
     return counts
 
 
-def _write_counts_matrix(
+def _write_matrices(
     *,
     config_df: pd.DataFrame,
     fragment_lengths: pd.DataFrame,
+    tpm_df: pd.DataFrame,
     como_context_dir: Path,
-    output_counts_matrix_filepath: Path,
-    output_fragment_lengths_filepath: Path,
+    out_counts_filepath: Path,
+    out_frag_length_filepath: Path,
+    out_tpm_filepath: Path,
     rna: RNAType,
 ) -> pd.DataFrame:
     """Create a counts matrix file by reading gene counts table(s).
@@ -312,46 +317,49 @@ def _write_counts_matrix(
     :param config_df: Configuration DataFrame containing sample information.
     :param fragment_lengths: DataFrame containing effective lengths for each gene and sample,
         used for zFPKM normalization.
+    :param tpm_df: DataFrame containing TPM values for each gene and sample, used for zFPKM normalization.
     :param como_context_dir: Path to the COMO_input directory containing gene count files.
-    :param output_counts_matrix_filepath: Path where the output counts matrix CSV will be saved.
-    :param output_fragment_lengths_filepath: Path where the output fragment lengths CSV will be saved.
+    :param out_counts_filepath: Path where the output counts matrix CSV will be saved.
+    :param out_frag_length_filepath: Path where the output fragment lengths CSV will be saved.
+    :param out_tpm_filepath: Path where the output TPM values CSV will be saved.
     :param rna: RNAType enum indicating whether to process 'trna' or 'mrna' samples.
     :return: A pandas DataFrame representing the final counts matrix.
     """
     study_metrics = _organize_gene_counts_files(data_dir=como_context_dir)
     counts: list[pd.DataFrame] = [_create_sample_counts_matrix(metric) for metric in study_metrics]
-    rna_specific_sample_names = set(
-        config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist()
+    rna_specific_sample_names = sorted(
+        set(config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist())
     )
 
     final_matrix: pd.DataFrame = functools.reduce(
         lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts
     )
     final_matrix.fillna(value=0, inplace=True)
-    final_matrix.iloc[:, 1:] = final_matrix.iloc[:, 1:].astype(int)
     final_matrix = final_matrix[["ensembl_gene_id", *rna_specific_sample_names]]
+    final_matrix = final_matrix.reindex(columns=["ensembl_gene_id", *rna_specific_sample_names])
+    fragment_lengths = fragment_lengths.reindex(columns=rna_specific_sample_names).sort_index()
+    tpm_df = tpm_df.reindex(columns=rna_specific_sample_names).sort_index()
 
-    output_counts_matrix_filepath.parent.mkdir(parents=True, exist_ok=True)
-    output_fragment_lengths_filepath.parent.mkdir(parents=True, exist_ok=True)
-
-    final_matrix.to_csv(output_counts_matrix_filepath, index=False)
-    fragment_lengths[rna_specific_sample_names].to_csv(output_fragment_lengths_filepath, index=True)
+    out_counts_filepath.parent.mkdir(parents=True, exist_ok=True)
+    out_frag_length_filepath.parent.mkdir(parents=True, exist_ok=True)
+    final_matrix.to_csv(out_counts_filepath, index=False, float_format="%.15g")
+    fragment_lengths[rna_specific_sample_names].to_csv(out_frag_length_filepath, index=True, float_format="%.15g")
+    tpm_df[rna_specific_sample_names].to_csv(out_tpm_filepath, index=True, float_format="%.15g")
 
-    logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{output_counts_matrix_filepath}'")
+    logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{out_counts_filepath}'")
 
     return final_matrix
 
 
-def _create_config_df(  # noqa: C901
+def _create_config_df(
     context_name: str,
     /,
     como_context_dir: Path,
-    gene_count_dirname: str = "geneCounts",
     layout_dirname: str = "layouts",
     strandedness_dirname: str = "strandedness",
     quantification_dir: str = "quantification",
     prep_method_dirname: str = "prepMethods",
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """Create configuration sheet.
 
     The configuration file created is based on the gene counts matrix.
@@ -361,7 +369,6 @@ def _create_config_df(  # noqa: C901
         context_name: Name of the context, used as a prefix for sample names.
         como_context_dir: Path to the COMO_input directory containing subdirectories for
             gene counts, layouts, strandedness, fragment sizes, and prep methods.
-        gene_count_dirname: Name of the subdirectory containing gene count files.
         layout_dirname: Name of the subdirectory containing layout files.
         strandedness_dirname: Name of the subdirectory containing strandedness files.
         quantification_dir: Name of the subdirectory containing Salmon's quantification files.
@@ -370,12 +377,13 @@ def _create_config_df(  # noqa: C901
     Returns:
         [0]: A pandas DataFrame representing the configuration sheet.
         [1]: Fragment lengths for downstream calculations
+        [2]: Salmon TPM values for downstream calculations
     """
     label_regex: Final = re.compile(r"(?P<study>S\d{1,3})(?P<rep>R\d{1,3})(?P<run>r\d{1,3})?")
-    quant_files: list[Path] = list((como_context_dir / quantification_dir).rglob("*.genes.sf"))
+    quant_files: list[Path] = sorted((como_context_dir / quantification_dir).rglob("*.genes.sf"))
     # gene_counts: list[Path] = list((como_context_dir / gene_count_dirname).rglob("*.tab"))
     if not quant_files:
-        raise FileNotFoundError(f"No gene count files found in '{gene_count_dirname}'")
+        raise FileNotFoundError(f"No gene count files found in '{quantification_dir}'")
 
     auxillary_directories = {
         "layout": como_context_dir / layout_dirname,
@@ -385,18 +393,14 @@ def _create_config_df(  # noqa: C901
     }
     aux_lookup: dict[str, dict[str, Path]] = {kind: {} for kind in auxillary_directories}
     for kind, root in auxillary_directories.items():
-        kind: str
-        root: Path
-        for p in root.rglob("*"):
-            if p.is_file():
-                m = label_regex.search(p.stem)
-                if m:
-                    aux_lookup[kind][m.group(0)] = p
-    if "layout" not in aux_lookup:
-        raise ValueError
+        files = (p for p in root.rglob("*") if p.is_file())
+        for file in files:
+            m = label_regex.search(file.stem)
+            if m:
+                aux_lookup[kind][m.group(0)] = file
 
     rows: list[SampleConfiguration] = []
-    for quant_file in sorted(quant_files):
+    for quant_file in quant_files:
         m = label_regex.search(quant_file.as_posix())
         if m is None:
             raise ValueError(f"Filename '{quant_file.name}' does not match contextName_SXRYrZ.tab pattern")
@@ -416,35 +420,23 @@ def _create_config_df(  # noqa: C901
         if prep not in {"total", "mrna"}:
             raise ValueError(f"Prep method must be 'total' or 'mrna' (got '{prep}') for {label}")
         if layout == "":
-            raise FileNotFoundError(message=f"No layout file found for '{label}'.")
-
-        quant_paths = [p for p in aux_lookup["quantification"].values() if p.name == f"{sample_id}_quant.genes.sf"]
-        if (
-            not quant_paths
-            and layout in ["paired-end", "", None]
-            and prep.lower() in [RNAType.TRNA.value.lower(), RNAType.MRNA.value.lower()]
-        ):
-            log_and_raise_error(
-                message=f"No quantification file found for '{label}'; defaulting to 100 bp (needed for zFPKM).",
-                error=FileNotFoundError,
-                level=LogLevel.WARNING,
-            )
-        elif len(quant_paths) == 1 and layout == "single-end":
-            effective_len = pd.DataFrame({"Name": [], "EffectiveLength": []})
-            mean_effective_len = 0.0  # cannot compute FPKM for single-ended data
-        else:
-            df = read_file(quant_file)
-            df.columns = [c.lower() for c in df.columns]
-            df = df.rename(columns={"effectivelength": "effective_length"})
-
-            effective_len = df[["name", "effective_length"]]
-            effective_len["effective_length"] = effective_len["effective_length"].astype(np.float64)
-            mean_effective_len: float = effective_len["effective_length"].sum() / len(df)
+            raise FileNotFoundError(f"No layout file found for '{label}'.")
+
+        df = read_file(quant_file)
+        df.columns = [c.lower() for c in df.columns]
+        df = df.rename(columns={"effectivelength": "effective_length", "TPM": "tpm"})
+        df["effective_length"] = df["effective_length"].astype(float)
+        df["tpm"] = df["tpm"].astype(float)
+
+        tpm = df[["name", "tpm"]].copy()
+        effective_len = df[["name", "effective_length"]].copy()
+        mean_effective_len: float = effective_len["effective_length"].sum() / df.shape[1]
 
         rows.append(
             SampleConfiguration(
                 sample_name=sample_id,
                 effective_lengths=effective_len,
+                tpm=tpm,
                 mean_effective_length=mean_effective_len,
                 layout=layout,
                 strand=strand,
@@ -456,7 +448,7 @@ def _create_config_df(  # noqa: C901
     return SampleConfiguration.to_dataframe(rows)
 
 
-async def _create_gene_info_file(  # noqa: C901
+def _create_gene_info_file(
     *,
     counts_matrix_filepaths: list[Path],
     output_filepath: Path,
@@ -468,17 +460,18 @@ async def _create_gene_info_file(  # noqa: C901
     The gene information file will be created by reading each matrix filepath in the provided list
     """
 
-    async def read_ensembl_gene_ids(file: Path) -> list[str]:
+    def read_ensembl_gene_ids(file: Path) -> list[str]:
         data_ = read_file(file, h5ad_as_df=False)
         if isinstance(data_, pd.DataFrame):
             return data_["ensembl_gene_id"].tolist()
         try:
             conversion = get_remaining_identifiers(ids=data_.var_names.tolist(), taxon=taxon)
         except json.JSONDecodeError as e:
-            raise ValueError(f"Got a JSON decode error for file '{counts_matrix_filepaths}' ({e})")
+            raise ValueError(f"Got a JSON decode error for file '{counts_matrix_filepaths}' ({e})") from e
 
         # Remove NA values from entrez_gene_id dataframe column
         conversion = conversion[~conversion["ensembl_gene_id"].isna()]
+        conversion = conversion.explode(column=["ensembl_gene_id"])
         return conversion["ensembl_gene_id"].tolist()
 
     logger.info(
@@ -495,90 +488,96 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]:
 
 def _process_como_input(
     context_name: str,
-    output_config_filepath: Path,
     como_context_dir: Path,
-    output_counts_matrix_filepath: Path,
-    output_fragment_lengths_filepath: Path,
-    rna: RNAType,
+    out_mrna_config: Path | None,
+    out_trna_config: Path | None,
+    out_mrna_counts: Path | None,
+    out_trna_counts: Path | None,
+    out_mrna_fragments: Path | None,
+    out_trna_fragments: Path | None,
+    out_mrna_tpm: Path | None,
+    out_trna_tpm: Path | None,
 ) -> None:
-    config_df, fragment_lengths = _create_config_df(context_name, como_context_dir=como_context_dir)
-
-    _write_counts_matrix(
-        config_df=config_df,
-        fragment_lengths=fragment_lengths,
-        como_context_dir=como_context_dir,
-        output_counts_matrix_filepath=output_counts_matrix_filepath,
-        output_fragment_lengths_filepath=output_fragment_lengths_filepath,
-        rna=rna,
-    )
-    with pd.ExcelWriter(output_config_filepath) as writer:
-        subset_config = config_df[config_df["library_prep"].str.lower() == rna.value.lower()]
-        subset_config.to_excel(writer, sheet_name=context_name, header=True, index=False)
+    config_df, fragment_lengths, tpm_df = _create_config_df(context_name, como_context_dir=como_context_dir)
+
+    if out_trna_config and out_trna_counts and out_trna_fragments and out_trna_tpm:
+        _write_matrices(
+            config_df=config_df,
+            fragment_lengths=fragment_lengths,
+            tpm_df=tpm_df,
+            como_context_dir=como_context_dir,
+            out_counts_filepath=out_trna_counts,
+            out_frag_length_filepath=out_trna_fragments,
+            out_tpm_filepath=out_trna_tpm,
+            rna=RNAType.TRNA,
+        )
+        with pd.ExcelWriter(out_trna_config) as writer:
+            subset_config = config_df[config_df["library_prep"].str.lower() == RNAType.TRNA.value.lower()]
+            subset_config.to_excel(writer, sheet_name=context_name, header=True, index=False)
+
+    if out_mrna_config and out_mrna_counts and out_mrna_fragments and out_mrna_tpm:
+        _write_matrices(
+            config_df=config_df,
+            fragment_lengths=fragment_lengths,
+            tpm_df=tpm_df,
+            como_context_dir=como_context_dir,
+            out_counts_filepath=out_mrna_counts,
+            out_frag_length_filepath=out_mrna_fragments,
+            out_tpm_filepath=out_mrna_tpm,
+            rna=RNAType.MRNA,
+        )
+        with pd.ExcelWriter(out_mrna_config) as writer:
+            subset_config = config_df[config_df["library_prep"].str.lower() == RNAType.MRNA.value.lower()]
+            subset_config.to_excel(writer, sheet_name=context_name, header=True, index=False)
 
 
-async def _process(
+def _process(
     context_name: str,
     taxon: int,
     output_gene_info_filepath: Path,
     como_context_dir: Path | None,
-    input_matrix_filepath: list[Path] | None,
-    output_trna_fragment_lengths_filepath: Path | None,
-    output_mrna_fragment_lengths_filepath: Path | None,
-    output_trna_config_filepath: Path | None,
-    output_mrna_config_filepath: Path | None,
-    output_trna_matrix_filepath: Path | None,
-    output_mrna_matrix_filepath: Path | None,
+    in_matrix_fp: list[Path] | None,
+    out_trna_frag_lengths_fp: Path | None,
+    out_mrna_frag_lengths_fp: Path | None,
+    out_trna_config_fp: Path | None,
+    out_mrna_config_fp: Path | None,
+    out_trna_matrix_fp: Path | None,
+    out_mrna_matrix_fp: Path | None,
+    out_mrna_tpm_fp: Path | None,
+    out_trna_tpm_fp: Path | None,
     *,
     cache: bool,
     create_gene_info_only: bool,
 ):
-    rna_types: list[tuple[RNAType, Path, Path, Path]] = []
-    if output_trna_config_filepath and output_trna_matrix_filepath and output_trna_fragment_lengths_filepath:
-        rna_types.append(
-            (
-                RNAType.TRNA,
-                output_trna_config_filepath,
-                output_trna_matrix_filepath,
-                output_trna_fragment_lengths_filepath,
-            )
-        )
-    if output_mrna_config_filepath and output_mrna_matrix_filepath and output_mrna_fragment_lengths_filepath:
-        rna_types.append(
-            (
-                RNAType.MRNA,
-                output_mrna_config_filepath,
-                output_mrna_matrix_filepath,
-                output_mrna_fragment_lengths_filepath,
-            )
-        )
-
-    # if provided, iterate through como-input specific directories
     if not create_gene_info_only:
         if como_context_dir is None:
             raise ValueError("como_context_dir must be provided if create_gene_info_only is False")
-        if output_trna_fragment_lengths_filepath is None:
+        if out_trna_frag_lengths_fp is None:
             raise ValueError("output_fragment_lengths_filepath must be provided if create_gene_info_only is False")
 
-        for rna, out_config, out_matrix, out_frag_len in rna_types:
-            _process_como_input(
-                context_name=context_name,
-                output_config_filepath=out_config,
-                como_context_dir=como_context_dir,
-                output_counts_matrix_filepath=out_matrix,
-                output_fragment_lengths_filepath=out_frag_len,
-                rna=rna,
-            )
+        _process_como_input(
+            context_name=context_name,
+            como_context_dir=como_context_dir,
+            out_mrna_config=out_mrna_config_fp,
+            out_trna_config=out_trna_config_fp,
+            out_mrna_counts=out_mrna_matrix_fp,
+            out_trna_counts=out_trna_matrix_fp,
+            out_mrna_fragments=out_mrna_frag_lengths_fp,
+            out_trna_fragments=out_trna_frag_lengths_fp,
+            out_mrna_tpm=out_mrna_tpm_fp,
+            out_trna_tpm=out_trna_tpm_fp,
+        )
 
     # create the gene info filepath based on provided data
     input_files = []
-    if input_matrix_filepath:
-        input_files.extend(input_matrix_filepath)
-    if output_trna_matrix_filepath:
-        input_files.append(output_trna_matrix_filepath)
-    if output_mrna_matrix_filepath:
-        input_files.append(output_mrna_matrix_filepath)
-
-    await _create_gene_info_file(
+    if in_matrix_fp:
+        input_files.extend(in_matrix_fp)
+    if out_trna_matrix_fp:
+        input_files.append(out_trna_matrix_fp)
+    if out_mrna_matrix_fp:
+        input_files.append(out_mrna_matrix_fp)
+
+    _create_gene_info_file(
         counts_matrix_filepaths=input_files,
         output_filepath=output_gene_info_filepath,
         taxon=taxon,
@@ -586,7 +585,7 @@ async def _process(
     )
 
 
-async def rnaseq_preprocess(  # noqa: C901
+def rnaseq_preprocess(  # noqa: C901
     context_name: str,
     taxon: int,
     output_gene_info_filepath: str | Path,
@@ -598,6 +597,8 @@ async def rnaseq_preprocess(  # noqa: C901
     output_mrna_metadata_filepath: str | Path | None = None,
     output_trna_count_matrix_filepath: str | Path | None = None,
     output_mrna_count_matrix_filepath: str | Path | None = None,
+    output_trna_tpm_filepath: str | Path | None = None,
+    output_mrna_tpm_filepath: str | Path | None = None,
     cache: bool = True,
     log_level: LogLevel | str = LogLevel.INFO,
     log_location: str | TextIO = sys.stderr,
@@ -618,6 +619,8 @@ async def rnaseq_preprocess(  # noqa: C901
     :param output_mrna_metadata_filepath: Path to the output mRNA config file (if in "create" mode)
     :param output_trna_count_matrix_filepath: The path to write total RNA count matrices
     :param output_mrna_count_matrix_filepath: The path to write messenger RNA count matrices
+    :param output_trna_tpm_filepath: The path to write total RNA TPM matrices
+    :param output_mrna_tpm_filepath: The path to write messenger RNA TPM matrices
     :param como_context_dir: If in "create" mode, the input path(s) to the COMO_input directory of the current context
         i.e., the directory containing "fragmentSizes", "geneCounts", "insertSizeMetrics", etc. directories
     :param input_matrix_filepath: If in "provide" mode, the path(s) to the count matrices to be processed~
@@ -628,10 +631,67 @@ async def rnaseq_preprocess(  # noqa: C901
     """
     set_up_logging(level=log_level, location=log_location)
 
-    # ruff: disable[ASYNC240]
     if not output_gene_info_filepath:
         raise ValueError("output_gene_info_filepath must be provided")
 
+    if any(
+        (
+            output_trna_fragment_lengths_filepath,
+            output_trna_metadata_filepath,
+            output_trna_count_matrix_filepath,
+            output_trna_tpm_filepath,
+        ),
+    ) and not all(
+        (
+            output_trna_fragment_lengths_filepath,
+            output_trna_metadata_filepath,
+            output_trna_count_matrix_filepath,
+            output_trna_tpm_filepath,
+        ),
+    ):
+        missing = [
+            name
+            for name, value in {
+                "output_trna_fragment_lengths_filepath": output_trna_fragment_lengths_filepath,
+                "output_trna_metadata_filepath": output_trna_metadata_filepath,
+                "output_trna_count_matrix_filepath": output_trna_count_matrix_filepath,
+                "output_trna_tpm_filepath": output_trna_tpm_filepath,
+            }.items()
+            if value is None
+        ]
+        raise ValueError(
+            f"If any tRNA output filepaths are provided, all must be provided. Missing: {', '.join(missing)}"
+        )
+
+    if any(
+        (
+            output_mrna_fragment_lengths_filepath,
+            output_mrna_metadata_filepath,
+            output_mrna_count_matrix_filepath,
+            output_mrna_tpm_filepath,
+        ),
+    ) and not all(
+        (
+            output_mrna_fragment_lengths_filepath,
+            output_mrna_metadata_filepath,
+            output_mrna_count_matrix_filepath,
+            output_mrna_tpm_filepath,
+        ),
+    ):
+        missing = [
+            name
+            for name, value in {
+                "output_mrna_fragment_lengths_filepath": output_mrna_fragment_lengths_filepath,
+                "output_mrna_metadata_filepath": output_mrna_metadata_filepath,
+                "output_mrna_count_matrix_filepath": output_mrna_count_matrix_filepath,
+                "output_mrna_tpm_filepath": output_mrna_tpm_filepath,
+            }.items()
+            if value is None
+        ]
+        raise ValueError(
+            f"If any mRNA output filepaths are provided, all must be provided. Missing: {', '.join(missing)}"
+        )
+
     output_gene_info_filepath = Path(output_gene_info_filepath).resolve()
 
     context_dir = None
@@ -657,20 +717,25 @@ async def rnaseq_preprocess(  # noqa: C901
         output_trna_fragment_lengths_filepath = Path(output_trna_fragment_lengths_filepath).resolve()
     if output_mrna_fragment_lengths_filepath is not None:
         output_mrna_fragment_lengths_filepath = Path(output_mrna_fragment_lengths_filepath).resolve()
+    if output_trna_tpm_filepath is not None:
+        output_trna_tpm_filepath = Path(output_trna_tpm_filepath).resolve()
+    if output_mrna_tpm_filepath is not None:
+        output_mrna_tpm_filepath = Path(output_mrna_tpm_filepath).resolve()
 
-    await _process(
+    _process(
         context_name=context_name,
         taxon=taxon,
         como_context_dir=context_dir,
-        input_matrix_filepath=in_matrix,
+        in_matrix_fp=in_matrix,
         output_gene_info_filepath=output_gene_info_filepath,
-        output_trna_config_filepath=output_trna_metadata_filepath,
-        output_trna_matrix_filepath=output_trna_count_matrix_filepath,
-        output_trna_fragment_lengths_filepath=output_trna_fragment_lengths_filepath,
-        output_mrna_config_filepath=output_mrna_metadata_filepath,
-        output_mrna_matrix_filepath=output_mrna_count_matrix_filepath,
-        output_mrna_fragment_lengths_filepath=output_mrna_fragment_lengths_filepath,
+        out_trna_config_fp=output_trna_metadata_filepath,
+        out_trna_matrix_fp=output_trna_count_matrix_filepath,
+        out_trna_frag_lengths_fp=output_trna_fragment_lengths_filepath,
+        out_mrna_config_fp=output_mrna_metadata_filepath,
+        out_mrna_matrix_fp=output_mrna_count_matrix_filepath,
+        out_mrna_frag_lengths_fp=output_mrna_fragment_lengths_filepath,
+        out_trna_tpm_fp=output_trna_tpm_filepath,
+        out_mrna_tpm_fp=output_mrna_tpm_filepath,
         cache=cache,
         create_gene_info_only=create_gene_info_only,
     )
-    # ruff: enable[ASYNC240]

From 8c18566df188425abea88c223a33520cd33ec1ed Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:03:24 -0600
Subject: [PATCH 13/41] feat: refactor filtering and normalization

- !Converts from async functions to sync
- Change `fragment_lengths` from a per-sample value to a per-gene value (np.ndarray to pd.DataFrame)
- Refactor TPM, FPKM, and zFPKM functions
- Adds fpkm output CSV file support
- Improves type hints and validations

[BREAKING-CHANGE]: This removes all async functions for synchronous counterparts

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/rnaseq_gen.py | 508 ++++++++++++++++++++--------------------
 1 file changed, 253 insertions(+), 255 deletions(-)

diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index 5a5588ae..1d2b258a 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -16,8 +16,7 @@
 import scanpy as sc
 import sklearn
 import sklearn.neighbors
-from anndata.compat import XDataArray
-from anndata.experimental.backed import Dataset2D
+import sklearn.preprocessing
 from loguru import logger
 from scipy import sparse
 from zfpkm import zFPKM, zfpkm_plot
@@ -49,7 +48,7 @@ class _StudyMetrics:
     study: str
     num_samples: int
     count_matrix: pd.DataFrame | sc.AnnData
-    fragment_lengths: npt.NDArray[np.floating] | None
+    eff_length: pd.DataFrame  # The effective length of the fragment from, e.g., Salmon Quantification
     sample_names: list[str]
     layout: list[LayoutMethod]
     entrez_gene_ids: npt.NDArray[np.integer]
@@ -59,6 +58,11 @@ class _StudyMetrics:
     __high_confidence_entrez_gene_ids: list[str] = field(default_factory=list)
 
     def __post_init__(self):
+        if not self.layout:
+            raise ValueError("Layout list cannot be empty")
+        if not self.sample_names:
+            raise ValueError("Sample names list cannot be empty")
+
         for layout in self.layout:
             if layout not in LayoutMethod:
                 raise ValueError(f"Layout must be 'paired-end' or 'single-end'; got: {layout}")
@@ -134,23 +138,24 @@ def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDAr
     Returns:
         A NumPy array of the filtered data.
     """
-    if not isinstance(data, pd.DataFrame | np.ndarray):
+    if not isinstance(data, (pd.DataFrame, np.ndarray)):
         raise TypeError(f"Unsupported data type. Must be a Pandas DataFrame or a NumPy array, got '{type(data)}'")
 
     return (
-        data.apply(filter_func, axis=1).to_numpy()
+        data.apply(filter_func, axis=1).to_numpy(copy=False)
         if isinstance(data, pd.DataFrame)
         else np.apply_along_axis(filter_func, axis=1, arr=data)
     )
 
 
-async def _build_matrix_results(
+def _build_matrix_results(
     matrix: pd.DataFrame | sc.AnnData,
     *,
     gene_info: pd.DataFrame,
     metadata_df: pd.DataFrame,
-    fragment_df: pd.DataFrame | None,
+    normalize_df: pd.DataFrame,
     taxon: int,
+    normalize_df_is_fragment_length: bool = True,
 ) -> tuple[NamedMetrics, list[int]]:
     """Read the counts matrix and returns the results.
 
@@ -159,91 +164,53 @@ async def _build_matrix_results(
     :param taxon: The NCBI Taxon ID
     :returns: A dataclass `ReadMatrixResults`
     """
-    if isinstance(matrix, sc.AnnData):
-        if not isinstance(matrix.var, pd.DataFrame):
-            raise TypeError("AnnData.var is expected to be a pandas.DataFrame")
-
-        matrix.var = matrix.var.reset_index(drop=False, names=["gene_symbol"])
-        conversion = convert(ids=matrix.var["gene_symbol"].tolist(), taxon=taxon)
-    else:
-        if "ensembl_gene_id" not in matrix.columns:
-            log_and_raise_error(
-                message="'ensembl_gene_id' column not found in the provided DataFrame.",
-                error=ValueError,
-                level=LogLevel.CRITICAL,
-            )
-        conversion: pd.DataFrame = convert(ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon)
-    # If the entrez gene id column is empty, it is indicative that the incorrect taxon id was provided
-    if conversion["entrez_gene_id"].eq("-").all():
-        logger.critical(
-            f"Conversion of Ensembl Gene IDs to Entrez IDs and Gene Symbols was empty - "
-            f"is '{taxon}' the correct taxon ID for this data?"
-        )
-    conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",")
-    conversion = conversion.explode("ensembl_gene_id")
-    conversion = conversion[conversion["entrez_gene_id"] != "-"]
-    conversion["entrez_gene_id"] = conversion["entrez_gene_id"]
-    conversion = conversion.reset_index(drop=False)
-
-    # conversion_merge_on should contain at least one of "ensembl_gene_id", "entrez_gene_id", or "gene_symbol"
-    conversion_merge_on: list[str] = list(
-        set(matrix.columns if isinstance(matrix, pd.DataFrame) else matrix.var.columns) & set(conversion.columns)
-    )
-    if not conversion_merge_on:
-        log_and_raise_error(
-            (
-                "No columns to merge on, unable to find at least one of `ensembl_gene_id`, `entrez_gene_id`, or `gene_symbol`. "
-                "Please check your input files."
-            ),
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
-
-    if isinstance(matrix, pd.DataFrame):
-        if "entrez_gene_id" in matrix.columns:
-            matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int)
-        matrix = matrix.merge(conversion, on=conversion_merge_on, how="left")
-    elif isinstance(matrix, sc.AnnData):
-        if "entrez_gene_id" in matrix.var.columns:
-            matrix.var["entrez_gene_id"] = matrix.var["entrez_gene_id"].astype(int)
-        matrix.var = matrix.var.merge(conversion, on=conversion_merge_on, how="left")
-
     gene_info = gene_info_migrations(gene_info)
+    gene_info = gene_info[~gene_info["entrez_gene_id"].isna()]
     gene_info = gene_info[gene_info["entrez_gene_id"] != "-"]
-    gene_info.loc[:, "entrez_gene_id"] = gene_info.loc[:, "entrez_gene_id"].astype(int)
+    gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int)
 
-    gene_info_merge_on: list[str] = list(
-        set(matrix.columns if isinstance(matrix, pd.DataFrame) else matrix.var.columns) & set(gene_info.columns)
-    )
-
-    if "entrez_gene_id" in gene_info_merge_on:
-        gene_info = gene_info[~gene_info["entrez_gene_id"].isna()]
-        gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int)
-
-        if isinstance(matrix, pd.DataFrame):
-            matrix = matrix[~matrix["entrez_gene_id"].isna()]
-            matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int)
-        elif isinstance(matrix, sc.AnnData):
-            if isinstance(matrix.var, XDataArray):
-                raise TypeError("Expected matrix.var object to be 'pd.DataFrame', got 'anndata.compat.XDataArray'")
-            matrix = matrix[:, ~matrix.var["entrez_gene_id"].isna()]
-            matrix.var["entrez_gene_id"] = matrix.var["entrez_gene_id"].astype(int)
+    if not normalize_df.empty:  # This is likely empty when using single-cell data
+        normalize_merge_on = normalize_df.columns.intersection(gene_info.columns).to_list()
+        if not normalize_merge_on:
+            raise ValueError(
+                f"No common columns to merge on between effective length dataframe and gene info dataframe. "
+                f"Effective length columns: {normalize_df.columns.tolist()}, gene info columns: {gene_info.columns.tolist()}"
+            )
+        normalize_df = normalize_df.merge(gene_info, on=normalize_merge_on, how="left")
+        normalize_df = normalize_df[~normalize_df["entrez_gene_id"].isna()]
 
     if isinstance(matrix, pd.DataFrame):
-        matrix = matrix.merge(gene_info, on=gene_info_merge_on, how="inner")
+        matrix_merge_on = matrix.columns.intersection(gene_info.columns).to_list()
+        matrix = matrix.merge(gene_info, on=matrix_merge_on, how="inner")
     elif isinstance(matrix, sc.AnnData):
         if not isinstance(matrix.var, pd.DataFrame):
             raise TypeError(f"Expected matrix.var object to be 'pd.DataFrame', got '{type(matrix.var)}'")
-        matrix.var["original_index"] = matrix.var.index
-        new_var = matrix.var.merge(gene_info, on=gene_info_merge_on, how="inner")
-        new_matrix = matrix[:, new_var["original_index"]].copy()
-        new_matrix.var = new_var
-        new_matrix.var = new_matrix.var.drop(columns=["original_index"])
-        new_matrix.var.reset_index(drop=True)
-        matrix = new_matrix
 
-        non_duplicates = ~matrix.var.duplicated(subset=matrix.var.columns, keep="first")
-        matrix = matrix[:, non_duplicates].copy()
+        gene_info = gene_info.sort_values(["entrez_gene_id", "size"], ascending=[True, True]).drop_duplicates(
+            subset=["entrez_gene_id"], keep="first"
+        )
+        types: dict[str, str] = determine_gene_type(matrix.var_names)
+        if contains_identical_gene_types(types):
+            # set matrix index name to gene type
+            matrix.var.index.name = "gene_symbol"
+
+        matrix.var = matrix.var.reset_index(drop=False).merge(gene_info, on="gene_symbol", how="left")
+        matrix = matrix[:, matrix.var["entrez_gene_id"].notna()]
+        matrix.var = matrix.var.astype(
+            {
+                "entrez_gene_id": int,
+                "taxon_id": int,
+                "size": int,
+            }
+        )
+
+        # new_matrix = matrix[:, new_var["original_index"]].copy()
+        # new_matrix.var = new_var
+        # new_matrix.var = new_matrix.var.drop(columns=["original_index"])
+        # new_matrix.var.reset_index(drop=True)
+        # matrix = new_matrix
+        # non_duplicates = ~matrix.var.duplicated(subset=matrix.var.columns, keep="first")
+        # matrix = matrix[:, non_duplicates].copy()
 
     metrics: NamedMetrics = {}
     for study in metadata_df["study"].unique():
@@ -253,109 +220,62 @@ async def _build_matrix_results(
         if isinstance(matrix, pd.DataFrame):
             subset = matrix.set_index(keys=["entrez_gene_id"], drop=True)
             subset = subset[subset.columns.intersection(study_sample_names)]
-            subset.index = subset.index.astype(int)
+            # subset = subset.groupby(subset.index).mean()
             entrez_gene_ids = subset.index.to_numpy(copy=False)
-            gene_sizes = matrix["size"].to_numpy(dtype=int, copy=False)
+
         elif isinstance(matrix, sc.AnnData):
             # matrix.var = matrix.var.set_index(keys=["entrez_gene_id"], drop=True)
             subset = matrix[matrix.obs_names.intersection(study_sample_names)]
             entrez_gene_ids = subset.var["entrez_gene_id"].to_numpy(dtype=int)
-            gene_sizes = subset.var["size"].to_numpy(dtype=int)
         else:
             raise TypeError(f"Matrix must be a pandas DataFrame or scanpy AnnData object, got: '{type(matrix)}'.")
 
-        frag_lengths = None
-        if fragment_df is not None:
-            frag_lengths = fragment_df["effective_length"].to_numpy(dtype=np.float64)
+        normalize_df = normalize_df.drop(columns=["ensembl_gene_id", "gene_symbol"], errors="ignore")
+        if normalize_df.index.name != "entrez_gene_id" and "entrez_gene_id" in normalize_df.columns:
+            normalize_df = normalize_df.set_index("entrez_gene_id", drop=True)
+        normalize_df.index = normalize_df.index.astype(int)
+
+        # gene_sizes = gene_info["size"].to_numpy(dtype=int, copy=False)
         metrics[study] = _StudyMetrics(
             count_matrix=subset,
-            fragment_lengths=frag_lengths,
+            eff_length=normalize_df[study_sample_names] if not normalize_df.empty else normalize_df,
             sample_names=study_sample_names,
             layout=[LayoutMethod(layout) for layout in layouts],
             num_samples=len(study_sample_names),
             entrez_gene_ids=entrez_gene_ids,
-            gene_sizes=gene_sizes,
+            gene_sizes=gene_info.loc[gene_info["entrez_gene_id"].isin(entrez_gene_ids), "size"].to_numpy(
+                dtype=int, copy=False
+            ),
             study=study,
         )
 
     return metrics, gene_info["entrez_gene_id"].astype(int).tolist()
 
 
-def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics:
-    """Calculate the Transcripts Per Million (TPM) for each sample in the metrics dictionary.
-
-    Args:
-        metrics: A dictionary of study metrics to calculate TPM for.
-
-    Returns:
-        A dictionary of study metrics with TPM calculated.
-    """
-    for sample, metric in metrics.items():
-        if isinstance(metric.count_matrix, sc.AnnData):
-            adata = metric.count_matrix
-            gene_sizes = pd.Series(metric.gene_sizes, index=adata.var_names)
-            counts_df = pd.DataFrame(
-                data=np.asarray(adata.X.toarray() if sparse.issparse(adata.X) else adata.X),
-                index=adata.var_names,
-                columns=adata.obs_names,
-            )
-        else:
-            counts_df = metric.count_matrix
-            gene_sizes = pd.Series(metric.gene_sizes)
-
-        adjusted_counts = counts_df.add(1e-6)
-        tpm_matrix = adjusted_counts.div(gene_sizes, axis=0)  # (count + 1) / gene_length
-        tpm_matrix = tpm_matrix.div(tpm_matrix.sum(axis=0), axis=1)  # normalize by total
-        tpm_matrix = tpm_matrix.mul(1e6)  # scale to per-million
-        metrics[sample].normalization_matrix = tpm_matrix
-
-    return metrics
-
-
-def _calculate_fpkm(metrics: NamedMetrics, scale: float = 1e6) -> NamedMetrics:
-    """Calculate the Fragments Per Kilobase of transcript per Million mapped reads (FPKM) for each i in the metrics dictionary.
+def _calculate_fpkm(metrics: NamedMetrics) -> NamedMetrics:
+    """Calculate the Fragments Per Kilobase of transcript per Million mapped reads (FPKM).
 
     Args:
         metrics: A dictionary of study metrics to calculate FPKM for.
-        scale: The scaling factor for normalization (default is 1e6).
 
     Returns:
         A dictionary of study metrics with FPKM calculated.
     """
     for study in metrics:
-        matrix_values: dict[str, npt.NDArray[np.floating]] = {}
-        count_matrix = metrics[study].count_matrix
-        if not isinstance(count_matrix, pd.DataFrame):
-            raise TypeError("FPKM cannot be performed on scanpy.AnnData objects!")
-
-        study_counts = count_matrix.to_numpy(dtype=int, copy=False)
-        for i in range(metrics[study].num_samples):
-            layout = metrics[study].layout[i]
-            sample_name = metrics[study].sample_names[i]
-            length = metrics[study].fragment_lengths if layout == LayoutMethod.paired_end else metrics[study].gene_sizes
-            counts = study_counts[:, i]
-            mapped_reads = counts.sum()
-            matrix_values[sample_name] = ((counts * 1e9) / (length * mapped_reads)).astype(int)
-
-        metrics[study].normalization_matrix = pd.DataFrame(matrix_values, index=metrics[study].entrez_gene_ids)
-    return metrics
-
+        count_matrix = metrics[study].count_matrix.copy()
+        length = metrics[study].eff_length
 
-def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics:
-    """Calculate the z-score for each sample in the metrics dictionary.
-
-    Args:
-        metrics: A dictionary of study metrics to calculate z-scores for.
+        if not isinstance(count_matrix, pd.DataFrame):
+            raise TypeError("FPKM can only be performed on pandas.DataFrame objects")
+        non_overlapping_cols = count_matrix.columns.difference(length.columns)
+        if not non_overlapping_cols.empty:
+            raise ValueError(
+                f"Count matrix columns and effective length columns must match for FPKM calculation. "
+                f"Non-overlapping columns: {non_overlapping_cols}"
+            )
 
-    Returns:
-        A dictionary of study metrics with z-scores calculated.
-    """
-    for sample in metrics:
-        log_matrix = np.log(metrics[sample].normalization_matrix)
-        z_matrix = pd.DataFrame(
-            data=sklearn.preprocessing.scale(log_matrix, axis=1), columns=metrics[sample].sample_names
-        )
-        metrics[sample].z_score_matrix = z_matrix
+        mapped_reads = count_matrix.sum()
+        metrics[study].normalization_matrix = (count_matrix * 1e9) / (mapped_reads * length)
     return metrics
 
 
@@ -410,7 +330,7 @@ def cpm_filter(
         test_bools = pd.DataFrame({"entrez_gene_ids": entrez_ids})
         for i in range(len(counts_per_million.columns)):
             median_sum = float(np.median(np.sum(counts[:, i])))
-            if cut_off == "default":  # noqa: SIM108
+            if np.isnan(cut_off):  # noqa: SIM108
                 cutoff = float(10e6 / median_sum)
             else:
                 cutoff = float(1e6 * cut_off) / median_sum
@@ -419,22 +339,29 @@ def cpm_filter(
     return metrics
 
 
-def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions) -> NamedMetrics:
+def tpm_filter(
+    *,
+    metrics: NamedMetrics,
+    filtering_options: _FilteringOptions,
+    tpm_df: pd.DataFrame,
+) -> NamedMetrics:
     """Apply quantile-based filtering to the TPM matrix for a given sample.
 
-    Args:
-        metrics: A dictionary of study metrics to filter.
-        filtering_options: Options for filtering the count matrix.
-
-    Returns:
-        A dictionary of filtered study metrics.
+    :param metrics: A dictionary of study metrics to filter.
+    :param filtering_options: Options for filtering the count matrix.
+    :param tpm_df: A dataframe containing TPM values for each gene in each sample, used for TPM quantile filtering
+    :returns: A dictionary of filtered study metrics.
     """
     # TODO: Write the TPM matrix to disk
 
-    n_exp = filtering_options.replicate_ratio
-    n_top = filtering_options.high_replicate_ratio
+    min_sample_expression = filtering_options.replicate_ratio
+    high_confidence_sample_expression = filtering_options.high_replicate_ratio
     cut_off = filtering_options.cut_off
-    metrics = calculate_tpm(metrics)
+
+    # Map the TPM results directly onto the sample
+    for sample, metric in metrics.items():
+        metrics[sample].normalization_matrix = tpm_df[metric.sample_names]
+        metrics[sample].normalization_matrix.index = tpm_df["ensembl_gene_id"]
 
     sample: str
     metric: _StudyMetrics
@@ -443,33 +370,36 @@ def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringO
         gene_size = metric.gene_sizes
         tpm_matrix: pd.DataFrame = metric.normalization_matrix
 
-        min_samples = round(n_exp * len(tpm_matrix.columns))
-        top_samples = round(n_top * len(tpm_matrix.columns))
-
         tpm_quantile = tpm_matrix[tpm_matrix > 0]
-        quantile_cutoff = np.quantile(a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0)
-        boolean_expression = pd.DataFrame(
-            data=tpm_matrix > quantile_cutoff, index=tpm_matrix.index, columns=tpm_matrix.columns
-        ).astype(int)
-
-        min_func = k_over_a(min_samples, 0.9)
-        top_func = k_over_a(top_samples, 0.9)
 
-        min_genes: npt.NDArray[np.bool] = genefilter(boolean_expression, min_func)
-        top_genes: npt.NDArray[np.bool] = genefilter(boolean_expression, top_func)
+        # TODO: Is using quantile correct?
+        # quantile_cutoff = np.nanquantile(a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0)
+        tpm_above_cutoff = tpm_matrix[tpm_matrix > filtering_options.cut_off]
 
         # Only keep `entrez_gene_ids` that pass `min_genes`
-        metric.entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, min_genes, strict=True) if keep]
-        metric.gene_sizes = np.asarray(gene for gene, keep in zip(gene_size, min_genes, strict=True) if keep)
-        metric.count_matrix = cast(pd.DataFrame, metric.count_matrix.iloc[min_genes, :])
-        metric.normalization_matrix = cast(pd.DataFrame, metrics[sample].normalization_matrix.iloc[min_genes, :])
-
-        keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes, strict=True) if keep]
-        metric.high_confidence_entrez_gene_ids = [
-            gene for gene, keep in zip(entrez_ids, keep_top_genes, strict=True) if keep
-        ]
-
-    metrics = calculate_z_score(metrics)
+        min_samples = round(min_sample_expression * len(tpm_matrix.columns))
+        min_func = k_over_a(min_samples, filtering_options.cut_off)
+        min_genes: npt.NDArray[np.bool] = genefilter(tpm_above_cutoff, min_func)
+        metric.entrez_gene_ids = [gene for gene, keep in zip(tpm_matrix.index, min_genes, strict=True) if keep]
+        # metric.gene_sizes = np.asarray(gene for gene, keep in zip(gene_size, min_genes, strict=True) if keep)
+        # metric.count_matrix = metric.count_matrix.iloc[min_genes, :]
+        metrics[sample].normalization_matrix = metrics[sample].normalization_matrix.iloc[min_genes, :]
+
+        # top_samples = round(high_confidence_sample_expression * len(tpm_matrix.columns))
+        # top_func = k_over_a(top_samples, filtering_options.cut_off)
+        # top_genes: npt.NDArray[np.bool] = genefilter(boolean_expression, top_func)
+        # keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes, strict=True) if keep]
+        # metric.high_confidence_entrez_gene_ids = [
+        #     gene for gene, keep in zip(entrez_ids, keep_top_genes, strict=True) if keep
+        # ]
+
+        # Calculate z-scores
+        log_matrix = np.log1p(metrics[sample].normalization_matrix.values)
+        metrics[sample].z_score_matrix = pd.DataFrame(
+            sklearn.preprocessing.scale(log_matrix, axis=0),
+            columns=metrics[sample].normalization_matrix.columns,
+            index=metrics[sample].normalization_matrix.index,
+        )
 
     return metrics
 
@@ -480,8 +410,7 @@ def zfpkm_filter(
     filtering_options: _FilteringOptions,
     calculate_fpkm: bool,
     force_zfpkm_plot: bool,
-    min_peak_height: float,
-    min_peak_distance: int,
+    output_fpkm_filepath: Path | None,
     output_png_dirpath: Path | None,
     force_negative_to_zero: bool = False,
 ) -> NamedMetrics:
@@ -506,8 +435,14 @@ def zfpkm_filter(
     cut_off = filtering_options.cut_off
     metrics = _calculate_fpkm(metrics) if calculate_fpkm else metrics
 
+    if output_fpkm_filepath:
+        combined_fpkm = pd.concat([df.normalization_matrix for df in metrics.values()], axis=1)
+        combined_fpkm = combined_fpkm.sort_index().reindex(columns=sorted(combined_fpkm.columns))
+        combined_fpkm.to_csv(output_fpkm_filepath, index=True)
+        context = next(iter(metrics.values())).sample_names[0].partition("_")[0]
+        logger.success(f"Wrote FPKM data for {context} to: {output_fpkm_filepath}")
+
     for metric in metrics.values():
-        metric: _StudyMetrics
         # if fpkm was not calculated, the normalization matrix will be empty; collect the count matrix instead
         matrix = metric.count_matrix if metric.normalization_matrix.empty else metric.normalization_matrix
         if not isinstance(matrix, pd.DataFrame):
@@ -587,29 +522,28 @@ def umi_filter(
         logger.warning(
             "Setting a minimum sample expression greater than ~20% for UMI-based filtering will likely result in very few/no genes being marked as active. "  # noqa: E501
             "Activity values ranging from 10-20% are recommended based on recent literature. "
-            f"Got: {min_sample_expression} for option 'replicate_ratio'"
+            f"Got: {min_sample_expression:%} for option 'replicate_ratio'"
         )
     if high_confidence_sample_expression > 0.40:
         logger.warning(
             f"Setting high-confidence expression greater than ~40% for UMI-based filtering will likely result in very few to no genes being marked as highly active. "  # noqa: E501
             "Activity values ranging from 20-30% are recommended based on recent literature. "
-            f"Got: {high_confidence_sample_expression} for option 'high_replicate_ratio'."
+            f"Got: {high_confidence_sample_expression:%} for option 'high_replicate_ratio'."
         )
 
     for metric in metrics.values():
         metric: _StudyMetrics
         if not isinstance(metric.count_matrix, sc.AnnData):
             raise TypeError(f"Expected a scanpy.AnnData for UMI filtering, got: '{type(metric.count_matrix)}'")
-        adata: sc.AnnData = metric.count_matrix
+        adata: sc.AnnData = metric.count_matrix.copy()
 
         if perform_normalization:
             if adata.raw is not None:
                 adata.X = adata.raw.X.copy()
-            sc.pp.filter_cells(adata, min_genes=20)
+            sc.pp.filter_cells(adata, min_genes=10)
             sc.pp.filter_genes(adata, min_cells=1)
             sc.pp.normalize_total(adata, target_sum=target_sum)
             sc.pp.log1p(adata)
-            # sc.pp.scale(adata, max_value=15)  # abs(values)>10 standard deviations away will be set to +/-10
 
         metric.z_score_matrix = adata
 
@@ -622,9 +556,8 @@ def umi_filter(
         for j in range(n_genes):
             col = adata_x.getcol(j).toarray().ravel() if sparse.issparse(adata_x) else adata_x[:, j]
             min_genes_mask[j] = min_func(col)
-        metric.entrez_gene_ids = (
-            adata.var.loc[min_genes_mask, "entrez_gene_id"].dropna().tolist()
-        )  # at this point we do not need/want NA entrez IDs
+        # at this point we do not need/want NA entrez IDs
+        metric.entrez_gene_ids = adata.var.loc[min_genes_mask, "entrez_gene_id"].dropna().tolist()
 
         top_samples = round(high_confidence_sample_expression * n_cells)
         top_func = k_over_a(top_samples, cut_off)
@@ -645,10 +578,11 @@ def filter_counts(
     filtering_options: _FilteringOptions,
     prep: RNAType,
     force_zfpkm_plot: bool,
-    zfpkm_min_peak_height: float,
-    zfpkm_min_peak_distance: int,
     umi_target_sum: int = 10_000,
+    tpm_df: pd.DataFrame | None,
+    calculate_fpkm: bool = True,
     umi_perform_normalization: bool = False,
+    output_fpkm_filepath: Path | None,
     output_zfpkm_plot_dirpath: Path | None = None,
     force_negative_to_zero: bool = False,
 ) -> NamedMetrics:
@@ -660,32 +594,34 @@ def filter_counts(
     :param filtering_options: Options for filtering the count matrix.
     :param prep: The RNA preparation type.
     :param force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples.
-    :param zfpkm_min_peak_height: Minimum peak height for zFPKM peak identification.
-    :param zfpkm_min_peak_distance: Minimum peak distance for zFPKM peak identification.
     :param umi_target_sum: The target sum for UMI normalization.
+    :param tpm_df: A dataframe containing TPM values for each gene in each sample, used for TPM quantile filtering
+    :param calculate_fpkm: If using paired-end data, should FPKM be calculated?
     :param umi_perform_normalization: Whether to perform normalization before UMI filtering.
     :param output_zfpkm_plot_dirpath: Optional filepath to save the zFPKM plot.
+    :param output_fpkm_filepath: Optional filepath to save the FPKM matrix (only applies if `calculate_fpkm=True` and `technique=FilteringTechnique.ZFPKM`)
     :param force_negative_to_zero: Should negative values be forcibly set to 0?
             This could happen as a result of normalization producing negative near-zero values (e.g., -0.001)
 
     :returns: A dictionary of filtered study metrics.
-    """
+    """  # noqa: E501
     match technique:
         case FilteringTechnique.CPM:
             return cpm_filter(
                 context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep
             )
         case FilteringTechnique.TPM:
-            return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options)
+            if tpm_df is None:
+                raise ValueError("TPM DataFrame must be provided for TPM quantile filtering")
+            return tpm_filter(metrics=metrics, filtering_options=filtering_options, tpm_df=tpm_df)
         case FilteringTechnique.ZFPKM:
             return zfpkm_filter(
                 metrics=metrics,
                 filtering_options=filtering_options,
                 calculate_fpkm=True,
                 force_zfpkm_plot=force_zfpkm_plot,
-                min_peak_height=zfpkm_min_peak_height,
-                min_peak_distance=zfpkm_min_peak_distance,
                 output_png_dirpath=output_zfpkm_plot_dirpath,
+                output_fpkm_filepath=output_fpkm_filepath,
                 force_negative_to_zero=force_negative_to_zero,
             )
         case FilteringTechnique.UMI:
@@ -695,16 +631,15 @@ def filter_counts(
                 target_sum=umi_target_sum,
                 perform_normalization=umi_perform_normalization,
             )
-        case _:
-            raise ValueError(f"Technique must be one of {FilteringTechnique}, got '{technique.value}'")
 
 
-async def _process(
+def _process(
     context_name: str,
     rnaseq_matrix_filepath: Path,
     metadata_df: pd.DataFrame,
     gene_info_df: pd.DataFrame,
-    fragment_df: pd.DataFrame | None,
+    fragment_df: pd.DataFrame,
+    tpm_df: pd.DataFrame,
     prep: RNAType,
     taxon: int,
     replicate_ratio: float,
@@ -714,12 +649,11 @@ async def _process(
     technique: FilteringTechnique,
     cut_off: int | float,
     force_zfpkm_plot: bool,
-    zfpkm_min_peak_height: float,
-    zfpkm_min_peak_distance: int,
     umi_target_sum: int,
     umi_perform_normalization: bool,
     output_boolean_activity_filepath: Path,
     output_zscore_normalization_filepath: Path,
+    output_fpkm_filepath: Path | None,
     output_zfpkm_plot_dirpath: Path | None,
     force_negative_to_zero: bool,
 ):
@@ -735,12 +669,15 @@ async def _process(
         high_batch_ratio=high_batch_ratio,
     )
 
-    metrics, entrez_gene_ids = await _build_matrix_results(
+    if fragment_df.empty and tpm_df.empty and prep != RNAType.SCRNA:
+        raise ValueError("Effective length dataframe and TPM dataframe cannot both be empty for RNA-Seq processing")
+    metrics, entrez_gene_ids = _build_matrix_results(
         rnaseq_matrix,
         gene_info=gene_info_df,
         metadata_df=metadata_df,
-        fragment_df=fragment_df,
+        normalize_df=fragment_df if not fragment_df.empty else tpm_df,
         taxon=taxon,
+        normalize_df_is_fragment_length=not bool(fragment_df.empty),
     )
     metrics = filter_counts(
         context_name=context_name,
@@ -748,12 +685,12 @@ async def _process(
         technique=technique,
         filtering_options=filtering_options,
         prep=prep,
+        tpm_df=tpm_df,
         force_zfpkm_plot=force_zfpkm_plot,
-        zfpkm_min_peak_height=zfpkm_min_peak_height,
-        zfpkm_min_peak_distance=zfpkm_min_peak_distance,
         umi_target_sum=umi_target_sum,
         umi_perform_normalization=umi_perform_normalization,
         output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath,
+        output_fpkm_filepath=output_fpkm_filepath,
         force_negative_to_zero=force_negative_to_zero,
     )
 
@@ -771,20 +708,20 @@ async def _process(
         )
 
         merged_zscores = merged_zscores.reindex(columns=sorted(merged_zscores.columns))
-        merged_zscores = merged_zscores.groupby("entrez_gene_id").mean()
+        merged_zscores = merged_zscores.groupby(by=merged_zscores.index.name).mean()
         merged_zscores.to_csv(output_zscore_normalization_filepath, index=True)
     elif isinstance(rnaseq_matrix, sc.AnnData):
         merged_zscores = ad.concat([m.z_score_matrix for m in metrics.values()], axis="obs")
         merged_zscores.var.index.name = "entrez_gene_id"
         merged_zscores.obs = merged_zscores.obs.reindex(columns=sorted(merged_zscores.obs.columns))
         merged_zscores.write_h5ad(output_zscore_normalization_filepath.with_suffix(".h5ad"))
+    logger.success(f"Wrote z-score normalization matrix to {output_zscore_normalization_filepath}")
+
     expressed_genes: list[str] = list(itertools.chain.from_iterable(m.entrez_gene_ids for m in metrics.values()))
     top_genes: list[str] = list(
         itertools.chain.from_iterable(m.high_confidence_entrez_gene_ids for m in metrics.values())
     )
 
-    logger.success(f"Wrote z-score normalization matrix to {output_zscore_normalization_filepath}")
-
     expression_frequency = pd.Series(expressed_genes).value_counts()
     expression_df = pd.DataFrame(
         {"entrez_gene_id": expression_frequency.index, "frequency": expression_frequency.values}
@@ -812,16 +749,17 @@ async def _process(
     # TODO: 2025-OCT-31: commented out dropping entrez ids, should this be kept?
     # boolean_matrix.dropna(subset="entrez_gene_id", inplace=True)
     boolean_matrix = boolean_matrix.groupby("entrez_gene_id", as_index=False).mean()
-    boolean_matrix["expressed"] = boolean_matrix["expressed"].copy().astype(int)
+    boolean_matrix["expressed"] = boolean_matrix["expressed"].copy().round(0)
     boolean_matrix["high"] = boolean_matrix["high"].copy().astype(int)
     boolean_matrix.to_csv(output_boolean_activity_filepath, index=False)
     logger.info(
-        f"{context_name} - Found {expressed_count} expressed genes, {high_confidence_count} of which are confidently expressed"
+        f"{context_name} - Found {expressed_count} expressed genes, "
+        f"{high_confidence_count} of which are confidently expressed"
     )
     logger.success(f"Wrote boolean matrix to {output_boolean_activity_filepath}")
 
 
-async def rnaseq_gen(  # noqa: C901
+def rnaseq_gen(  # noqa: C901
     context_name: str,
     input_rnaseq_filepath: Path,
     input_gene_info_filepath: Path,
@@ -835,10 +773,10 @@ async def rnaseq_gen(  # noqa: C901
     batch_ratio: float = 0.5,
     high_batch_ratio: float = 0.75,
     technique: FilteringTechnique | str = FilteringTechnique.ZFPKM,
-    zfpkm_min_peak_height: float = 0.02,
-    zfpkm_min_peak_distance: int = 1,
     umi_target_sum: int = 10_000,
+    output_fpkm_filepath: Path | None = None,
     input_fragment_lengths: Path | None = None,
+    input_tpm_filepath: Path | None = None,
     umi_perform_normalization: bool = False,
     cutoff: int | float | None = None,
     force_zfpkm_plot: bool = False,
@@ -862,6 +800,7 @@ async def rnaseq_gen(  # noqa: C901
     :param taxon_id: The NCBI Taxon ID
     :param input_metadata_filepath_or_df: The filepath or dataframe containing metadata information
     :param input_fragment_lengths: The filepath to the fragment lengths file, if applicable.
+    :param input_tpm_filepath: The filepath to the TPM values file, if applicable.
     :param replicate_ratio: The percentage of replicates that a gene must
         appear in for a gene to be marked as "active" in a batch/study
     :param batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked as 'active"
@@ -870,11 +809,13 @@ async def rnaseq_gen(  # noqa: C901
     :param high_batch_ratio: The percentage of batches that a gene must
         appear in for a gene to be marked "highly confident" in its expression
     :param technique: The filtering technique to use
-    :param zfpkm_min_peak_height: The height of the zFPKM peak
-    :param zfpkm_min_peak_distance: The distance of the zFPKM peak
     :param umi_target_sum: The target sum for UMI normalization
-    :param umi_perform_normalization: Should UMI normalization be performed?
+    :param output_fpkm_filepath: Optional filepath to write the FPKM matrix
+        (required if using zFPKM filtering with paired-end data)
+    :param umi_perform_normalization: Should UMI normalization be performed
     :param cutoff: The cutoff value to use for the provided filtering technique
+        TPM: This is a TPM-based value; if `replicate_ratio` genes have a value less than this, the gene will be marked as inactive
+        zFPKM: This is a zFPKM-based value; if `replicate_ratio` genes have a value less than this, the gene will be marked as inactive
     :param force_zfpkm_plot: If too many samples exist, should plotting be done anyway?
     :param log_level: The level of logging to output
     :param log_location: The location to write logs to
@@ -883,28 +824,32 @@ async def rnaseq_gen(  # noqa: C901
         This could happen as a result of normalization producing negative near-zero values (e.g., -0.001)
 
     :return: None
-    """
+    """  # noqa: E501
     set_up_logging(level=log_level, location=log_location)
 
+    fragment_df = pd.DataFrame()
     technique = FilteringTechnique(technique) if isinstance(technique, str) else technique
-    match technique:
-        case FilteringTechnique.TPM:
-            cutoff: int | float = cutoff or 25
-            if cutoff < 1 or cutoff > 100:
-                raise ValueError("Quantile must be between 1 - 100")
-
-        case FilteringTechnique.CPM:
-            if cutoff and cutoff < 0:
-                raise ValueError("Cutoff must be greater than or equal to 0")
-            elif cutoff:
-                cutoff = "default"
-
-        case FilteringTechnique.ZFPKM:
-            cutoff: int | float = cutoff or -3
-        case FilteringTechnique.UMI:
-            cutoff: int = cutoff or 1
-        case _:
-            raise ValueError(f"Technique must be one of {','.join(FilteringTechnique)}. Got: {technique.value}")
+    if technique == FilteringTechnique.TPM:
+        if input_tpm_filepath is None:
+            raise ValueError("Missing argument: `input_tpm_filepath` is required for TPM filtering")
+        cutoff = cutoff or 1
+    elif technique == FilteringTechnique.CPM:
+        if isinstance(cutoff, int) and cutoff < 0:
+            raise ValueError("Cutoff must be greater than or equal to 0")
+        elif not cutoff:
+            cutoff = np.nan
+    elif technique == FilteringTechnique.ZFPKM:
+        if not input_fragment_lengths:
+            raise ValueError("Fragment lengths file is required for zFPKM filtering")
+        if not output_fpkm_filepath:
+            raise ValueError("Output FPKM filepath is required for zFPKM filtering")
+        cutoff = cutoff or -3
+        fragment_df = read_file(input_fragment_lengths)
+
+    elif technique == FilteringTechnique.UMI:
+        cutoff = cutoff or 1
+    else:
+        raise ValueError(f"Technique must be one of {','.join(FilteringTechnique)}. Got: {technique.value}")
 
     if not input_rnaseq_filepath.exists():
         raise FileNotFoundError(f"Input RNA-seq file not found! Searching for: '{input_rnaseq_filepath}'")
@@ -923,7 +868,7 @@ async def rnaseq_gen(  # noqa: C901
         if input_metadata_filepath_or_df.suffix not in {".xlsx", ".xls"}:
             raise ValueError(
                 f"Expected an excel file with extension of '.xlsx' or '.xls', "
-                f"got '{input_metadata_filepath_or_df.suffix}'"
+                f"got '{input_metadata_filepath_or_df.suffix}'."
             )
         if not input_metadata_filepath_or_df.exists():
             raise FileNotFoundError(f"Input metadata file not found! Searching for: '{input_metadata_filepath_or_df}'")
@@ -934,13 +879,13 @@ async def rnaseq_gen(  # noqa: C901
             f"Expected a pandas DataFrame or Path object as metadata, got '{type(input_metadata_filepath_or_df)}'"
         )
 
-    logger.debug(f"Starting '{context_name}'")
-    await _process(
+    _process(
         context_name=context_name,
         rnaseq_matrix_filepath=input_rnaseq_filepath,
         metadata_df=metadata_df,
         gene_info_df=read_file(input_gene_info_filepath),
-        fragment_df=read_file(input_fragment_lengths),
+        fragment_df=fragment_df,
+        tpm_df=read_file(input_tpm_filepath) if input_tpm_filepath else pd.DataFrame(),
         prep=prep,
         taxon=taxon_id,
         replicate_ratio=replicate_ratio,
@@ -950,12 +895,65 @@ async def rnaseq_gen(  # noqa: C901
         technique=technique,
         cut_off=cutoff,
         force_zfpkm_plot=force_zfpkm_plot,
-        zfpkm_min_peak_height=zfpkm_min_peak_height,
-        zfpkm_min_peak_distance=zfpkm_min_peak_distance,
         umi_target_sum=umi_target_sum,
         umi_perform_normalization=umi_perform_normalization,
+        output_fpkm_filepath=output_fpkm_filepath,
         output_boolean_activity_filepath=output_boolean_activity_filepath,
         output_zscore_normalization_filepath=output_zscore_normalization_filepath,
         output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath,
         force_negative_to_zero=force_negative_counts_to_zero,
     )
+
+
+if __name__ == "__main__":
+    import matplotlib.pyplot as plt
+
+    data = pd.read_csv("/Users/joshl/Downloads/fpkm_example_data/CD8.genes.results.txt", sep="\t")
+    data["gene_id"] = data["gene_id"].str.partition(".")[0]
+    counts = (
+        data[["gene_id", "expected_count"]]
+        .copy()
+        .set_index("gene_id")
+        .sort_index()
+        .rename(columns={"expected_count": "actual"})
+    )
+    eff_len = (
+        data[["gene_id", "effective_length"]]
+        .copy()
+        .set_index("gene_id")
+        .sort_index()
+        .rename(columns={"effective_length": "actual"})
+    )
+    expected_fpkm = (
+        data[["gene_id", "FPKM"]].copy().set_index("gene_id").sort_index().rename(columns={"FPKM": "expected"})
+    )
+
+    metrics = {
+        "S1": _StudyMetrics(
+            study="S1",
+            num_samples=1,
+            count_matrix=counts,
+            eff_length=eff_len,
+            sample_names=[""],
+            layout=[LayoutMethod.paired_end],
+            entrez_gene_ids=np.ndarray([0]),
+            gene_sizes=np.ndarray([0]),
+        )
+    }
+    calculated_fpkm = _calculate_fpkm(metrics)["S1"].normalization_matrix
+    calculated_fpkm = calculated_fpkm.round(2)
+
+    joined = calculated_fpkm.join(expected_fpkm, how="inner")
+    joined["actual"] = joined["actual"].replace([np.nan, np.inf], 0)
+
+    zfpkm_df, _ = zFPKM(joined, remove_na=True)
+    zfpkm_df = zfpkm_df.replace(-np.inf, np.nan)
+
+    fig, axes = cast(tuple[plt.Figure, list[plt.Axes]], plt.subplots(nrows=2, ncols=1))
+    axes[0].hist(zfpkm_df["actual"].to_numpy())
+    axes[0].set_title("Expected zFPKM")
+
+    axes[1].hist(zfpkm_df["expected"].to_numpy())
+    axes[1].set_title("Actual zFPKM")
+    fig.tight_layout()
+    fig.show()

From 0f66f122a816332cc40f9e06b11b976cb7f5fdc0 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:04:43 -0600
Subject: [PATCH 14/41] feat: simplify merging logic

- Removes `_get_transcriptomic_details` function as it was unused
- Renames `_merge_xomics` to `_trinarize_data` for -1/0/+1 binning
- Simplify missing data handling

[BREAKING-CHANGE]: Removes all async functions for synchronous counterparts

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/merge_xomics.py | 195 ++++++++++++--------------------------
 1 file changed, 58 insertions(+), 137 deletions(-)

diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py
index 564c2025..6221e455 100644
--- a/main/como/merge_xomics.py
+++ b/main/como/merge_xomics.py
@@ -88,7 +88,6 @@ def load_dummy_dict():
         return load_dummy_dict()
 
 
-# Merge Output
 def _merge_logical_table(df: pd.DataFrame):
     """Merge rows of Logical Table belonging to the same entrez_gene_id.
 
@@ -100,12 +99,14 @@ def _merge_logical_table(df: pd.DataFrame):
     """
     # step 1: get all plural ENTREZ_GENE_IDs in the input table, extract unique IDs
     df.dropna(subset=["entrez_gene_id"], inplace=True)
+    df["entrez_gene_id"] = df["entrez_gene_id"].copy().astype(str)
     df.loc[:, "entrez_gene_id"] = df.loc[:, "entrez_gene_id"].astype(str).str.replace(" /// ", "//").astype(str)
 
-    id_list: list[str] = df.loc[~df["entrez_gene_id"].str.contains("//"), "entrez_gene_id"].tolist()  # Collect "single" ids, like "123"
-    multiple_entrez_ids: list[str] = df.loc[
-        df["entrez_gene_id"].str.contains("//"), "entrez_gene_id"
-    ].tolist()  # Collect "double" ids, like "123//456"
+    # Collect "single" ids, like "123"
+    id_list: list[str] = df.loc[~df["entrez_gene_id"].str.contains("//"), "entrez_gene_id"].tolist()
+
+    # Collect "double" ids, like "123//456"
+    multiple_entrez_ids: list[str] = df.loc[df["entrez_gene_id"].str.contains("//"), "entrez_gene_id"].tolist()
 
     for i in multiple_entrez_ids:
         ids = i.split("//")
@@ -174,81 +175,7 @@ def _merge_logical_table(df: pd.DataFrame):
     return df
 
 
-async def _get_transcriptmoic_details(merged_df: pd.DataFrame, taxon_id: int) -> pd.DataFrame:
-    """Get details of transcriptomic data.
-
-    This function will get the following details of transcriptomic data:
-    - Gene Symbol
-    - Gene Name
-    - entrez_gene_id
-
-    The resulting dataframe will have its columns created in the order listed above
-    It will return a pandas dataframe with this information
-
-    Args:
-        merged_df: A dataframe containing all active transcriptomic and proteomic genes
-        taxon_id: The NCBI taxonomy ID of the organism
-
-    Returns:
-        A dataframe with the above-listed columns
-    """
-    # If _ExpressedHeaderNames.PROTEOMICS.value is in the dataframe, lower the required expression by 1
-    # We are only trying to get details for transcriptomic data
-    logger.debug("Obtaining transcriptomic details")
-    transcriptomic_df: pd.DataFrame = merged_df.copy()
-    if _ExpressedHeaderNames.PROTEOMICS in merged_df.columns:
-        logger.trace("Proteomic data found, modifying required and total expression values")
-        # Get the number of sources required for a gene to be marked "expressed"
-        required_expression = merged_df["required"].iloc[0]
-
-        # Subtract 1 from merged_df["TotalExpressed"] if the current value is greater than or equal to 1
-        # This is done to take into account the removal of proteomic expression
-        merged_df["total_expressed"] = merged_df["total_expressed"].apply(lambda x: x - 1 if x >= 1 else x)
-
-        # Subtract required_expression by 1 if it is greater than 1
-        if required_expression > 1:
-            required_expression -= 1
-
-        transcriptomic_df: pd.DataFrame = merged_df.drop(
-            columns=[
-                _ExpressedHeaderNames.PROTEOMICS,
-                _HighExpressionHeaderNames.PROTEOMICS,
-            ],
-            inplace=False,
-        )
-        logger.trace(f"Modified transcriptomic dataframe: {transcriptomic_df.shape}")
-
-        # Must recalculate TotalExpressed because proteomic data was removed
-        # If the TotalExpressed column is less than the Required column, set active to 1, otherwise set it to 0
-        transcriptomic_df.loc[
-            transcriptomic_df["total_expressed"] >= transcriptomic_df["required"],
-            "active",
-        ] = 1
-
-    my_gene = MyGene()
-    gene_details: pd.DataFrame = pd.DataFrame(
-        data=pd.NA,
-        columns=["entrez_gene_id", "gene_symbol", "description", "gene_type"],
-        index=list(range(len(transcriptomic_df))),
-    )
-    logger.trace(f"Querying MyGene for details on {len(transcriptomic_df)} genes")
-    for i, detail in enumerate(
-        await my_gene.query(
-            items=transcriptomic_df["entrez_gene_id"].astype(int).tolist(),
-            taxon=taxon_id,
-            scopes="entrezgene",
-        )
-    ):
-        gene_details.at[i, "entrez_gene_id"] = detail["entrezgene"]
-        gene_details.at[i, "gene_symbol"] = detail["symbol"]
-        gene_details.at[i, "description"] = detail["name"]
-        gene_details.at[i, "gene_type"] = detail["type_of_gene"]
-
-    logger.debug("Finished obtaining transcriptomic details")
-    return gene_details
-
-
-async def _merge_xomics(
+def _trinarize_data(
     context_name: str,
     expression_requirement: int,
     trna_boolean_matrix: pd.DataFrame | None,
@@ -257,8 +184,6 @@ async def _merge_xomics(
     proteomic_boolean_matrix: pd.DataFrame | None,
     output_merged_filepath: Path,
     output_gene_activity_filepath: Path,
-    output_transcriptomic_details_filepath: Path,
-    taxon_id: int,
     force_activate_high_confidence: bool = True,
     adjust_for_missing_sources: bool = False,
 ):
@@ -277,17 +202,18 @@ async def _merge_xomics(
             logger.trace(f"Skipping {expressed_sourcetype} because it's matrix does not exist")
             continue
 
-        matrix: pd.DataFrame  # re-define type to assist in type hinting for IDEs
         expression_list.append(expressed_sourcetype)
         high_confidence_list.append(high_expressed_sourcetype)
         matrix.rename(columns={"expressed": expressed_sourcetype, "high": high_expressed_sourcetype}, inplace=True)
-        matrix = cast(pd.DataFrame, matrix[matrix["entrez_gene_id"] != "-"])
+        matrix = matrix[matrix["entrez_gene_id"] != "-"]
         matrix.loc[:, "entrez_gene_id"] = matrix.loc[:, "entrez_gene_id"].astype(int)
         merge_data = matrix if merge_data.empty else merge_data.merge(matrix, on="entrez_gene_id", how="outer")
 
     logger.trace(f"Shape of merged data before merging logical tables: {merge_data.shape}")
     if merge_data.empty:
-        logger.warning(f"No data is available for the '{context_name}' context. If this is intentional, ignore this error.")
+        logger.warning(
+            f"No data is available for the '{context_name}' context. If this is intentional, ignore this error."
+        )
         return {}
 
     merge_data = _merge_logical_table(merge_data)
@@ -300,7 +226,11 @@ async def _merge_xomics(
     if adjust_for_missing_sources:  # Subtract 1 from requirement per missing source
         logger.trace("Adjusting for missing data sources")
         merge_data.loc[:, "required"] = merge_data[expression_list].apply(
-            lambda x: expression_requirement - (num_sources - x.count()) if (expression_requirement - (num_sources - x.count()) > 0) else 1,
+            lambda x: (
+                expression_requirement - (num_sources - x.count())
+                if (expression_requirement - (num_sources - x.count()) > 0)
+                else 1
+            ),
             axis=1,
         )
     else:  # Do not adjust for missing sources
@@ -321,29 +251,26 @@ async def _merge_xomics(
         merge_data.loc[merge_data[high_confidence_list].sum(axis=1) > 0, "active"] = 1
 
     merge_data.dropna(inplace=True)
+    merge_data = merge_data.groupby("entrez_gene_id", as_index=False).mean()
     merge_data.to_csv(output_merged_filepath, index=False)
     logger.success(f"Saved merged data to {output_merged_filepath}")
 
-    logger.debug(f"Generating transcriptomic details using {output_merged_filepath}")
-    transcriptomic_details = await _get_transcriptmoic_details(merge_data, taxon_id=taxon_id)
-    logger.debug(f"Saving transcriptomic details to {output_transcriptomic_details_filepath}")
-    transcriptomic_details.dropna(inplace=True)
-    transcriptomic_details.to_csv(output_transcriptomic_details_filepath, index=False)
-    logger.success(f"Saved transcriptomic details to {output_transcriptomic_details_filepath}")
     return {context_name: output_gene_activity_filepath.as_posix()}
 
 
-async def _update_missing_data(input_matrices: _InputMatrices, taxon_id: int) -> _InputMatrices:
+def _update_missing_data(input_matrices: _InputMatrices, taxon_id: int) -> _InputMatrices:
     logger.trace("Updating missing genomic data")
-    matrix_keys: dict[str, list[pd.DataFrame]] = {
+    matrix_keys: dict[str, list[pd.DataFrame | None]] = {
         "trna": [input_matrices.trna],
         "mrna": [input_matrices.mrna],
         "scrna": [input_matrices.scrna],
         "proteomics": [input_matrices.proteomics],
     }
     logger.trace(f"Gathering missing data for data sources: {','.join(key for key in matrix_keys if key is not None)}")
+
     # ruff: disable[E501]
     # fmt: off
+    # TODO: Use the local `gene_info.csv` file
     results: tuple[pd.DataFrame | None, ...] = (
         get_missing_gene_data(values=input_matrices.trna, taxon_id=taxon_id) if input_matrices.trna is not None else None,
         get_missing_gene_data(values=input_matrices.mrna, taxon_id=taxon_id) if input_matrices.mrna is not None else None,
@@ -356,34 +283,29 @@ async def _update_missing_data(input_matrices: _InputMatrices, taxon_id: int) ->
         matrix_keys[key].append(results[i])
 
     for matrix_name, (matrix, conversion) in matrix_keys.items():
-        matrix: pd.DataFrame
-        if matrix is not None:
-            # fmt: off
-            existing_data = (
-                "gene_symbol" if "gene_symbol" in matrix.columns
-                else "entrez_gene_id" if "entrez_gene_id" in matrix.columns
-                else "ensembl_gene_id"
-            )
-            # fmt: on
-            logger.trace(f"Merging conversion data for {matrix_name}, existing id column is: {existing_data}")
-            # input_matrices[matrix_name][existing_data] = input_matrices[matrix_name][existing_data].astype(str)
-            if existing_data == "entrez_gene_id":
-                input_matrices[matrix_name]["entrez_gene_id"] = input_matrices[matrix_name]["entrez_gene_id"].astype(int)
-                conversion.index = conversion.index.astype(int)
-
-            input_matrices[matrix_name] = (
-                input_matrices[matrix_name].merge(conversion, how="left", left_on=existing_data, right_index=True).reset_index(drop=True)
-            )
+        if matrix is None or conversion is None:
+            continue
 
-            # input_matrices[matrix_name] = (
-            #     input_matrices[matrix_name].merge(conversion, how="left", left_index=True, right_index=True).dropna().reset_index(drop=True)
-            # )
+        merge_on = matrix.columns.intersection(conversion.columns).to_list()
+        logger.trace(f"Merging conversion data for {matrix_name} on column(s): {','.join(merge_on)}")
+        if "entrez_gene_id" in merge_on:
+            matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int)
+            conversion["entrez_gene_id"] = conversion["entrez_gene_id"].astype(int)
+        conversion = conversion.explode(column="ensembl_gene_id", ignore_index=True)
+
+        if "notfound" in conversion.columns:
+            conversion["notfound"] = conversion["notfound"].replace(np.nan, False)
+            conversion = conversion[~conversion["notfound"]]
+            conversion = conversion.drop(columns=["notfound"])
+        matrix = matrix.merge(conversion, how="outer", on=merge_on).reset_index(drop=True)
+        matrix = matrix[~matrix["entrez_gene_id"].isna()]  # we need to exclude NA entrez IDs for model building
+        input_matrices[matrix_name] = matrix
 
     logger.debug("Updated missing genomic data")
     return input_matrices
 
 
-async def _process(
+def _process(
     *,
     context_name: str,
     input_matrices: _InputMatrices,
@@ -396,11 +318,10 @@ async def _process(
     weighted_z_floor: int,
     weighted_z_ceiling: int,
     adjust_method: AdjustmentMethod,
-    merge_zfpkm_distribution: bool,
+    merge_zscore_distribution: bool,
     force_activate_high_confidence: bool,
     adjust_for_missing_sources: bool,
     output_merge_activity_filepath: Path,
-    output_transcriptomic_details_filepath: Path,
     output_activity_filepaths: _OutputCombinedSourceFilepath,
     output_final_model_scores_filepath: Path,
     output_figure_dirpath: Path | None,
@@ -409,18 +330,19 @@ async def _process(
     logger.trace(
         f"Settings: Min Expression: {minimum_source_expression}, Expression Requirement: {expression_requirement}, "
         f"Weighted Z-Score Floor: {weighted_z_floor}, Weighted Z-Score Ceiling: {weighted_z_ceiling}, "
-        f"Adjust Method: {adjust_method.value}, Merge Z-Scores: {merge_zfpkm_distribution}, "
+        f"Adjust Method: {adjust_method.value}, Merge Z-Scores: {merge_zscore_distribution}, "
         f"Force High Confidence: {force_activate_high_confidence}, Adjust for Missing: {adjust_for_missing_sources}"
     )
 
     # Collect missing genomic data for each of the input items in asynchronous parallel
-    input_matrices = await _update_missing_data(input_matrices, taxon_id)
+    input_matrices = _update_missing_data(input_matrices, taxon_id)
     logger.trace("Missing data updated")
 
-    if merge_zfpkm_distribution:
+    if merge_zscore_distribution:
         logger.trace("Merging Z-Scores")
-        await _begin_combining_distributions(
+        _begin_combining_distributions(
             context_name=context_name,
+            taxon=taxon_id,
             input_matrices=input_matrices,
             batch_names=batch_names,
             source_weights=source_weights,
@@ -471,7 +393,7 @@ async def _process(
         adjusted_expression_requirement = 1
 
     logger.debug(f"Final Expression Requirement: {adjusted_expression_requirement}")
-    await _merge_xomics(
+    _trinarize_data(
         context_name=context_name,
         expression_requirement=adjusted_expression_requirement,
         trna_boolean_matrix=boolean_matrices.trna,
@@ -480,8 +402,6 @@ async def _process(
         proteomic_boolean_matrix=boolean_matrices.proteomics,
         output_merged_filepath=output_merge_activity_filepath,
         output_gene_activity_filepath=output_final_model_scores_filepath,
-        output_transcriptomic_details_filepath=output_transcriptomic_details_filepath,
-        taxon_id=taxon_id,
         force_activate_high_confidence=force_activate_high_confidence,
         adjust_for_missing_sources=adjust_for_missing_sources,
     )
@@ -494,14 +414,17 @@ def _build_batches(
     proteomic_metadata: pd.DataFrame | None,
 ) -> _BatchNames:
     batch_names = _BatchNames()
-    for source, metadata in zip(SourceTypes.__members__.values(), [trna_metadata, mrna_metadata, scrna_metadata, proteomic_metadata], strict=True):
+    metadata: pd.DataFrame | None
+    for source, metadata in zip(
+        SourceTypes.__members__.values(),
+        [trna_metadata, mrna_metadata, scrna_metadata, proteomic_metadata],
+        strict=True,
+    ):
         source: SourceTypes
-        metadata: pd.DataFrame
         if metadata is None:
             logger.trace(f"Metadata for source '{source.value}' is None, skipping")
             continue
 
-        metadata: pd.DataFrame  # Re-assign type to assist in type hinting
         for study in sorted(metadata["study"].unique()):
             batch_search = re.search(r"\d+", study)
             if not batch_search:
@@ -535,10 +458,9 @@ def _validate_source_arguments(
         raise ValueError(f"Must specify all or none of '{source.value}' arguments")
 
 
-async def merge_xomics(  # noqa: C901
+def merge_xomics(  # noqa: C901
     context_name: str,
     output_merge_activity_filepath: Path,
-    output_transcriptomic_details_filepath: Path,
     output_final_model_scores_filepath: Path,
     output_figure_dirpath: Path | None,
     taxon_id: int,
@@ -567,7 +489,7 @@ async def merge_xomics(  # noqa: C901
     adjust_method: AdjustmentMethod = AdjustmentMethod.FLAT,
     force_activate_high_confidence: bool = False,
     adjust_for_na: bool = False,
-    merge_zfpkm_distribution: bool = False,
+    merge_zscore_distributions: bool = True,
     weighted_z_floor: int = -6,
     weighted_z_ceiling: int = 6,
     log_level: LogLevel = LogLevel.INFO,
@@ -600,7 +522,9 @@ async def merge_xomics(  # noqa: C901
         raise ValueError("No data was passed!")
 
     if expression_requirement and expression_requirement < 1:
-        logger.warning(f"Expression requirement must be at least 1! Setting to the minimum of 1 now. Got: {expression_requirement}")
+        logger.warning(
+            f"Expression requirement must be at least 1! Setting to the minimum of 1 now. Got: {expression_requirement}"
+        )
         expression_requirement = 1
 
     if expression_requirement is None:
@@ -619,8 +543,6 @@ async def merge_xomics(  # noqa: C901
     output_final_model_scores_filepath.parent.mkdir(parents=True, exist_ok=True)
     if output_merge_activity_filepath:
         output_merge_activity_filepath.parent.mkdir(parents=True, exist_ok=True)
-    if output_transcriptomic_details_filepath:
-        output_transcriptomic_details_filepath.parent.mkdir(parents=True, exist_ok=True)
     if output_trna_activity_filepath:
         output_trna_activity_filepath.parent.mkdir(parents=True, exist_ok=True)
     if output_mrna_activity_filepath:
@@ -674,7 +596,7 @@ async def merge_xomics(  # noqa: C901
         proteomic_metadata=proteomic_metadata,
     )
 
-    await _process(
+    _process(
         context_name=context_name,
         input_matrices=input_matrices,
         boolean_matrices=boolean_matrices,
@@ -686,12 +608,11 @@ async def merge_xomics(  # noqa: C901
         weighted_z_floor=weighted_z_floor,
         weighted_z_ceiling=weighted_z_ceiling,
         adjust_method=adjust_method,
-        merge_zfpkm_distribution=merge_zfpkm_distribution,
+        merge_zscore_distribution=merge_zscore_distributions,
         force_activate_high_confidence=force_activate_high_confidence,
         adjust_for_missing_sources=adjust_for_na,
         output_activity_filepaths=output_activity_filepaths,
         output_merge_activity_filepath=output_merge_activity_filepath,
-        output_transcriptomic_details_filepath=output_transcriptomic_details_filepath,
         output_final_model_scores_filepath=output_final_model_scores_filepath,
         output_figure_dirpath=output_figure_dirpath,
     )

From 85434fd5c3a62f5b7d7d44adbc3ba3c5c40521d8 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:05:59 -0600
Subject: [PATCH 15/41] fix: z-score combination logic

- Fixes Z-score combination formula to use equal weights/Stouffer's method
- Adds proper handling for NaN/Inf values
- Fixes output formatting

[BREAKING-CHANGE]: Converts all async functions to their synchronous counterparts

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/combine_distributions.py | 151 +++++++++++++++++------------
 1 file changed, 88 insertions(+), 63 deletions(-)

diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py
index 24f8889a..f08430fd 100644
--- a/main/como/combine_distributions.py
+++ b/main/como/combine_distributions.py
@@ -2,10 +2,10 @@
 
 from collections.abc import Sequence
 from pathlib import Path
-from typing import cast
 
 import numpy as np
 import pandas as pd
+import scanpy as sc
 from loguru import logger
 
 from como.data_types import (
@@ -29,23 +29,17 @@ def _combine_z_distribution_for_batch(
     source: SourceTypes,
     output_combined_matrix_filepath: Path,
     output_figure_dirpath: Path,
-    weighted_z_floor: int,
-    weighted_z_ceiling: int,
 ) -> pd.DataFrame:
     """Combine z-score distributions across samples for a single batch.
 
-    Args:
-        context_name: Name of the context (e.g., tissue or condition).
-        batch: Batch entry containing batch number and sample names.
-        matrix: DataFrame with 'ensembl_gene_id' and sample columns.
-        source: Source type (e.g., trna, mrna, scrna, proteomics).
-        output_combined_matrix_filepath: Path to save the combined z-score matrix.
-        output_figure_dirpath: Path to save the z-score distribution figure.
-        weighted_z_floor: Minimum z-score value after combining.
-        weighted_z_ceiling: Maximum z-score value after combining.
+    :param context_name: Name of the context (e.g., tissue or condition).
+    :param batch: Batch entry containing batch number and sample names.
+    :param matrix: DataFrame with 'ensembl_gene_id' and sample columns.
+    :param source: Source type (e.g., trna, mrna, scrna, proteomics).
+    :param output_combined_matrix_filepath: Path to save the combined z-score matrix.
+    :param output_figure_dirpath: Path to save the z-score distribution figure.
 
-    Returns:
-            A pandas dataframe of the weighted z-distributions
+    :returns: A pandas dataframe of the weighted z-distributions
     """
     output_combined_matrix_filepath.parent.mkdir(parents=True, exist_ok=True)
     output_figure_dirpath.mkdir(parents=True, exist_ok=True)
@@ -58,20 +52,30 @@ def _combine_z_distribution_for_batch(
     )
     if num_columns(matrix) < 2:
         logger.trace(
-            f"A single sample exists for batch '{batch.batch_num}'. Returning as-is because no additional combining can be done"
+            f"A single sample exists for batch '{batch.batch_num}'. "
+            f"Returning as-is because no additional combining can be done"
         )
         with_batch_num = matrix.copy()
         with_batch_num.columns = [batch.batch_num]
         return with_batch_num
 
-    values = matrix.values
-    weighted_matrix = np.clip(
-        a=np.sum(values, axis=1) / np.sqrt(values.shape[1]),  # calculate a weighted matrix
-        a_min=weighted_z_floor,
-        a_max=weighted_z_ceiling,
-    ).astype(float)
-
-    weighted_matrix = pd.DataFrame({"combine_z": weighted_matrix}, index=matrix.index)
+    values = matrix.copy().replace([np.inf, -np.inf], np.nan)
+    values_array = values.to_numpy(dtype=float)
+    mask = ~np.isnan(values_array)
+    weights = np.ones(values.shape[1], dtype=float)
+    masked_values = np.where(mask, values_array * weights, 0.0)
+    weighted_sum = masked_values.sum(axis=1)
+
+    weights_squared = weights**2
+    sum_weights_squared = (mask * weights_squared).sum(axis=1)
+    denominator = np.sqrt(sum_weights_squared)
+    combined_z = np.divide(
+        weighted_sum,
+        denominator,
+        out=np.full_like(weighted_sum, np.nan, dtype=float),
+        where=denominator != 0.0,
+    )
+    df = pd.DataFrame({"combine_z": combined_z}, index=values.index)
 
     # merge_df = pd.concat([matrix, pd.Series(weighted_matrix, name="combined")], axis=1)
     # stack_df = pd.melt(
@@ -91,9 +95,11 @@ def _combine_z_distribution_for_batch(
     #     / f"{context_name}_{source.value}_batch{batch.batch_num}_combined_zscore_distribution.pdf",
     # )
 
-    weighted_matrix.columns = [batch.batch_num]
-    weighted_matrix.to_csv(output_combined_matrix_filepath, index=True)
-    return weighted_matrix
+    df.columns = [batch.batch_num]
+    df.index = df.index.astype(int)
+    df = df.sort_index()
+    df.to_csv(output_combined_matrix_filepath, index=True)
+    return df
 
 
 def _combine_z_distribution_for_source(
@@ -108,7 +114,8 @@ def _combine_z_distribution_for_source(
     """Combine z-score distributions across batches for a single source.
 
     Args:
-        merged_source_data : DataFrame with columns: ["ensembl_gene_id", <batch1>, <batch2>, ...]. Each batch column contains the within-batch combined z for that gene.
+        merged_source_data : DataFrame with columns: ["ensembl_gene_id", <batch1>, <batch2>, ...].
+            Each batch column contains the within-batch combined z for that gene.
         context_name: tissue or condition name
         replicates_per_batch: vector of replicate counts per batch, aligned exactly to the order of
             batch columns in `merged_source_data.iloc[:, 1:]`.
@@ -141,19 +148,20 @@ def _combine_z_distribution_for_source(
         )
 
     logger.trace(
-        f"[{context_name}] Combining {expected_num_batches} batches with replicate counts: {dict(zip(batch_column_names, replicates_per_batch))}"
+        f"[{context_name}] Combining {expected_num_batches} batches with"
+        f" replicate counts: {dict(zip(batch_column_names, replicates_per_batch, strict=True))}"
     )
 
     # extract values and build masks (shape is in gene x batch)
     gene_by_batch_values: np.ndarray = merged_source_data.to_numpy(dtype=float)
     present_mask: np.ndarray = ~np.isnan(gene_by_batch_values)
 
-    # per-batch weights from replicate counts, normalized over all batches
+    # per-batch weights: use equal weights (Stouffer's Z method)
+    # Replicate counts are not statistically valid weights for z-scores because
+    #   z-scores are already standardized (mean=0, std=1 under null hypothesis)
+    #   Weighting by sample size assumes information about precision that was lost during standardization
     replicates_per_batch_array: np.ndarray = np.asarray(replicates_per_batch, dtype=float)
-    total_replicates: float = replicates_per_batch_array.sum()
-    if total_replicates <= 0.0:
-        raise ValueError(f"[{context_name}] Sum of replicates_per_batch must be > 0; got {total_replicates}.")
-    base_weights_per_batch: np.ndarray = replicates_per_batch_array / total_replicates  # shape: (B,)
+    base_weights_per_batch = np.ones_like(replicates_per_batch_array, dtype=float)  # shape: (B,)
 
     # drop NA columns per gene by zeroing their weights
     masked_weights_per_batch: np.ndarray = np.where(present_mask, base_weights_per_batch, 0.0)  # (G,B)
@@ -173,16 +181,17 @@ def _combine_z_distribution_for_source(
 
     logger.trace(f"[{context_name}] Finished combining batch z-distributions for source.")
 
-    return pd.DataFrame({"combine_z": combined_z_per_gene}, index=merged_source_data.index)
+    df = pd.DataFrame({"combine_z": combined_z_per_gene}, index=merged_source_data.index)
+    df.index = df.index.astype(int)
+    df = df.sort_index()
+    return df
 
 
 def _combine_z_distribution_for_context(
     context: str,
     zscore_results: list[_CombineOmicsInput],
     output_graph_filepath: Path,
-    weighted_z_floor: int = -6,
-    weighted_z_ceiling: int = 6,
-):
+) -> pd.DataFrame:
     if not zscore_results:
         logger.warning("No zscore results exist, returning empty dataframe")
         return pd.DataFrame({"ensembl_gene_id": [], "combine_z": []})
@@ -197,13 +206,12 @@ def _combine_z_distribution_for_context(
             )
 
         matrix.columns = [res.type.value.lower()]
-        matrix = cast(pd.DataFrame, matrix.loc[matrix.index != "-"])
+        matrix = matrix.loc[matrix.index != "-"]
 
         # if the matrix has duplicate ids (i.e., multiple entrez ids map to a single ensembl id, etc.)
-        #   take the mean value of them to remove the duplicates
+        #   take the max value of them to remove the duplicates
         if not matrix.index.is_unique:
-            # matrix = cast(pd.DataFrame, matrix.groupby(level=0).mean())
-            matrix = cast(pd.DataFrame, matrix.groupby(level=0).max())
+            matrix = matrix.groupby(level=0).max()
         z_matrices.append(matrix)
 
     z_matrix = pd.concat(z_matrices, axis="columns", join="outer", ignore_index=False)
@@ -214,20 +222,23 @@ def _combine_z_distribution_for_context(
         z_matrix.columns = ["combine_z"]
         return z_matrix
 
-    values = z_matrix.iloc[:, 1:].values
+    values = z_matrix.values
     weights = np.array([r.weight for r in zscore_results])
     mask = ~np.isnan(values)
     masked_values = np.where(mask, values, 0)
     masked_weights = np.where(mask, weights, 0)
 
-    normalized_weights = masked_weights / np.sum(masked_weights, axis=1, keepdims=True)
-    numerator = np.sum(normalized_weights * masked_values, axis=1)
-    denominator = np.sqrt(np.sum(normalized_weights**2, axis=1))
-    combined_z_matrix = numerator / denominator
-    combined_z_matrix = np.clip(combined_z_matrix, weighted_z_floor, weighted_z_ceiling, dtype=float)
+    numerator = np.sum(masked_weights * masked_values, axis=1)
+    denominator = np.sqrt(np.sum(masked_weights**2, axis=1))
+    combined_z_matrix = np.divide(
+        numerator,
+        denominator,
+        out=np.full_like(numerator, np.nan, dtype=float),
+        where=denominator != 0.0,
+    )
     combined_z_matrix_df = pd.DataFrame(
         {
-            "ensembl_gene_id": z_matrix.index,
+            "entrez_gene_id": z_matrix.index,
             "combine_z": combined_z_matrix,
         }
     )
@@ -252,11 +263,14 @@ def _combine_z_distribution_for_context(
     #     title=f"Combined Z-score Distribution for {context}",
     #     output_filepath=output_graph_filepath,
     # )
+    combined_z_matrix_df["entrez_gene_id"] = combined_z_matrix_df["entrez_gene_id"].astype(int)
+    combined_z_matrix_df = combined_z_matrix_df.sort_values(by="entrez_gene_id")
     return combined_z_matrix_df
 
 
-async def _begin_combining_distributions(
+def _begin_combining_distributions(
     context_name: str,
+    taxon: int,
     input_matrices: _InputMatrices,
     batch_names: _BatchNames,
     source_weights: _SourceWeights,
@@ -270,6 +284,8 @@ async def _begin_combining_distributions(
     output_figure_dirpath.mkdir(parents=True, exist_ok=True)
 
     z_score_results: list[_CombineOmicsInput] = []
+
+    matrix: pd.DataFrame | None
     for source, matrix in input_matrices:
         if matrix is None:
             logger.trace(f"Source '{source.value}' is None, skipping")
@@ -288,21 +304,23 @@ async def _begin_combining_distributions(
             elif isinstance(matrix, sc.AnnData):
                 conversion = get_remaining_identifiers(ids=matrix.var_names.tolist(), taxon=taxon)
                 conversion.reset_index(drop=False, inplace=True)
-                matrix: pd.DataFrame = matrix.to_df().T
+                matrix = matrix.to_df().T
                 matrix.reset_index(inplace=True, drop=False, names=["gene_symbol"])
-                matrix: pd.DataFrame = matrix.merge(
-                    conversion, left_on="gene_symbol", right_on="gene_symbol", how="left"
-                )
+                matrix = matrix.merge(conversion, left_on="gene_symbol", right_on="gene_symbol", how="left")
                 matrix_subset: pd.DataFrame = matrix[[GeneIdentifier.entrez_gene_id.value, *batch.sample_names]]
                 matrix_subset: pd.DataFrame = matrix_subset.set_index(
                     keys=[GeneIdentifier.entrez_gene_id.value], drop=True
                 )
             else:
                 raise TypeError(
-                    f"Unexpected matrix type for source '{source.value}'; expected 'pandas.DataFrame' or 'scanpy.AnnData': {type(matrix)}"
+                    f"Unexpected matrix type for source '{source.value}'; "
+                    f"expected 'pandas.DataFrame' or 'scanpy.AnnData': {type(matrix)}"
                 )
 
-            output_fp = output_filepaths[source.value].parent / f"{source.value}_batch{batch.batch_num}_combined_z_distrobution_{context_name}.csv"
+            output_fp = (
+                output_filepaths[source.value].parent
+                / f"{source.value}_batch{batch.batch_num}_combined_z_distrobution_{context_name}.csv"
+            )
             batch_results.append(
                 _combine_z_distribution_for_batch(
                     context_name=context_name,
@@ -311,8 +329,6 @@ async def _begin_combining_distributions(
                     source=source,
                     output_combined_matrix_filepath=output_fp,
                     output_figure_dirpath=output_figure_dirpath,
-                    weighted_z_floor=weighted_z_floor,
-                    weighted_z_ceiling=weighted_z_ceiling,
                 )
             )
 
@@ -333,14 +349,17 @@ async def _begin_combining_distributions(
         merged_batch_results = pd.concat(batch_results, axis="columns")
         merged_batch_results.index.name = index_name
 
-        replicates_by_batch_num_map: dict[str, int] = {str(batch.batch_num): batch.num_samples for batch in batch_names[source.value]}
+        replicates_by_batch_num_map: dict[str, int] = {
+            str(batch.batch_num): batch.num_samples for batch in batch_names[source.value]
+        }
         batch_column_names: list[str] = list(merged_batch_results.columns)
         replicates_per_batch: list[int] = []
         for col in batch_column_names:
             key = str(col)
             if key not in replicates_by_batch_num_map:
                 raise KeyError(
-                    f"Missing replicate count for batch column '{col}' in source '{source.value}'. Known: {list(replicates_by_batch_num_map.keys())}"
+                    f"Missing replicate count for batch column '{col}' in source '{source.value}'. "
+                    f"Known: {list(replicates_by_batch_num_map.keys())}"
                 )
             replicates_per_batch.append(replicates_by_batch_num_map[key])
 
@@ -349,9 +368,12 @@ async def _begin_combining_distributions(
             context_name=context_name,
             replicates_per_batch=replicates_per_batch,
             output_combined_matrix_filepath=(
-                output_filepaths[source.value].parent / f"{context_name}_{source.value}_combined_zscore_distribution.csv"
+                output_filepaths[source.value].parent
+                / f"{context_name}_{source.value}_combined_zscore_distribution.csv"
+            ),
+            output_figure_filepath=(
+                output_figure_dirpath / f"{context_name}_{source.value}_combined_zscore_distribution.pdf"
             ),
-            output_figure_filepath=(output_figure_dirpath / f"{context_name}_{source.value}_combined_zscore_distribution.pdf"),
             weighted_z_floor=weighted_z_floor,
             weighted_z_ceiling=weighted_z_ceiling,
         )
@@ -363,7 +385,10 @@ async def _begin_combining_distributions(
             )
         )
         merged_source_results.to_csv(output_filepaths[source.value], index=True)
-        logger.success(f"Wrote z-scores for source '{source.value}' in context '{context_name}' to '{output_filepaths[source.value]}'")
+        logger.success(
+            f"Wrote z-scores for source '{source.value}' in context '{context_name}' "
+            f"to '{output_filepaths[source.value]}'"
+        )
 
     logger.trace(f"Combining z-score distributions for all sources in context '{context_name}'")
     merged_context_results = _combine_z_distribution_for_context(
@@ -371,5 +396,5 @@ async def _begin_combining_distributions(
         zscore_results=z_score_results,
         output_graph_filepath=output_figure_dirpath / f"{context_name}_combined_omics_distribution.pdf",
     )
-    merged_context_results.to_csv(output_final_model_scores, index=True)
+    merged_context_results.to_csv(output_final_model_scores, index=len(merged_context_results.columns) <= 1)
     logger.success(f"Wrote combined z-scores for context '{context_name}' to {output_final_model_scores}")

From 2db1fc4be8d3a65d471474a8a7f4d89600edc908 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:14:57 -0600
Subject: [PATCH 16/41] refactor: remove async/await and simplify I/O
 operations

- Convert all async functions to synchronous
- Remove unused imports/types
- Replace `await _read_file` with `pd.read_csv` calls

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 42 ++++++++++++----------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 518bf6b3..2813b8cc 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -1,25 +1,24 @@
 from __future__ import annotations
 
-import asyncio
 import collections
 import os
 import re
 import sys
-from collections.abc import Coroutine, Sequence
+from collections.abc import Sequence
 from io import TextIOWrapper
+from math import isfinite
+from multiprocessing import Value
 from pathlib import Path
-from typing import Any, Literal, TextIO, cast
+from typing import Literal, Never, TextIO, cast, reveal_type
 
+import cobamp.core.optimization
 import cobra
 import cobra.util.array
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
-from cobra import Model
-from cobra.flux_analysis import pfba
-from fast_bioservices.common import Taxon
-from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol
 from loguru import logger
+from troppo.methods.reconstruction.corda import CORDA, CORDAProperties
 from troppo.methods.reconstruction.fastcore import FASTcore, FastcoreProperties
 from troppo.methods.reconstruction.gimme import GIMME, GIMMEProperties
 from troppo.methods.reconstruction.imat import IMAT, IMATProperties
@@ -27,7 +26,6 @@
 
 from como.data_types import (
     Algorithm,
-    BuildResults,
     CobraCompartments,
     LogLevel,
     ModelBuildSettings,
@@ -453,6 +451,7 @@ def _read_reference_model(filepath: Path) -> cobra.Model:
 
 async def _build_model(
     general_model_file: Path,
+def _build_model(
     gene_expression_file: Path,
     recon_algorithm: Algorithm,
     objective: str,
@@ -640,7 +639,8 @@ async def _build_model(
     )
 
 
-async def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.DataFrame:
+
+def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.DataFrame:
     if path.suffix not in {".csv", ".tsv"}:
         raise ValueError(f"File must be a .csv or .tsv file, got '{path.suffix}'")
     df: pd.DataFrame = await _read_file(path=path, header=0, sep="," if path.suffix == ".csv" else "\t", h5ad_as_df=True)
@@ -657,8 +657,14 @@ async def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.Dat
     return df
 
 
-async def _collect_boundary_reactions(path: Path) -> _BoundaryReactions:
-    df: pd.DataFrame = await _create_df(path, lowercase_col_names=True)
+def _collect_boundary_reactions(
+    path: Path,
+    reference_model: cobra.Model,
+    *,
+    close_unlisted_exchanges: bool,
+) -> _BoundaryReactions:
+    df: pd.DataFrame = _create_df(path, lowercase_col_names=True)
+
     for column in df.columns:
         if column not in [
             "reaction",
@@ -692,7 +698,7 @@ async def _collect_boundary_reactions(path: Path) -> _BoundaryReactions:
     )
 
 
-async def _write_model_to_disk(
+def _write_model_to_disk(
     context_name: str,
     model: cobra.Model,
     output_filepaths: list[Path],
@@ -700,24 +706,22 @@ async def _write_model_to_disk(
     json_suffix: set[str],
     xml_suffix: set[str],
 ) -> None:
-    tasks: set[Coroutine[Any, Any, None]] = set()
     for path in output_filepaths:
         path.parent.mkdir(parents=True, exist_ok=True)
         if path.suffix in mat_suffix:
-            tasks.add(asyncio.to_thread(cobra.io.save_matlab_model, model=model, file_name=path))
+            cobra.io.save_matlab_model(model=model, file_name=path)
         elif path.suffix in json_suffix:
-            tasks.add(asyncio.to_thread(cobra.io.save_json_model, model=model, filename=path, pretty=True))
+            cobra.io.save_json_model(model=model, filename=path, pretty=True)
         elif path.suffix in xml_suffix:
-            tasks.add(asyncio.to_thread(cobra.io.write_sbml_model, model=model, filename=path))
+            cobra.io.write_sbml_model(cobra_model=model, filename=path)
         else:
             raise ValueError(
                 f"Invalid output model filetype. Should be one of .xml, .sbml, .mat, or .json. Got '{path.suffix}'"
             )
-        logger.success(f"Will save metabolic model for context '{context_name}' to: '{path}'")
-    await asyncio.gather(*tasks)
+        logger.success(f"Model for context '{context_name}' saved to to: '{path}'")
 
 
-async def create_context_specific_model(  # noqa: C901
+def create_context_specific_model(  # noqa: C901
     context_name: str,
     taxon: int | str | Taxon,
     reference_model: Path,

From 5a99d13d8b173b651940ca2b770c6c6797b052d2 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:15:20 -0600
Subject: [PATCH 17/41] refactor: inline setting boundary reaction conditions

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 52 +++++-----------------
 1 file changed, 11 insertions(+), 41 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 2813b8cc..2ad48b80 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -122,39 +122,6 @@ def _gene_rule_logical(gpr_expression: str, level: int = 0) -> str:
     return expression_out
 
 
-def _set_boundaries(
-    model: cobra.Model,
-    boundary_reactions: list[str],
-    lower_bounds: list[float],
-    upper_bounds: list[float],
-) -> cobra.Model:
-    # get boundary reactions
-    exchange_rxns = [rxn.id for rxn in model.reactions if rxn.id.startswith("EX_")]
-    sink_rxns = [rxn.id for rxn in model.reactions if rxn.id.startswith("sink_")]
-    demand_rxns = [rxn.id for rxn in model.reactions if rxn.id.startswith("DM_")]
-
-    # Allows all boundary reactions to be used if none are given
-    allow_all_boundary_rxns = not boundary_reactions
-
-    # close sinks and demands not in boundary reactions unless no boundary reactions were given
-    if not allow_all_boundary_rxns:
-        for i, rxn in enumerate(sink_rxns):  # set sinks to 0
-            getattr(model.reactions, rxn).lower_bounds = lower_bounds[i] if rxn in boundary_reactions else 0
-            getattr(model.reactions, rxn).upper_bounds = upper_bounds[i] if rxn in boundary_reactions else 1000
-
-        for i, rxn in enumerate(demand_rxns):
-            getattr(model.reactions, rxn).lower_bounds = 0
-            getattr(model.reactions, rxn).upper_bounds = upper_bounds[i] if rxn in boundary_reactions else 0
-
-    # Reaction media
-    medium = model.medium
-    for rxn in exchange_rxns:  # open exchanges from exchange file, close unspecified exchanges
-        medium[rxn] = -float(lower_bounds[boundary_reactions.index(rxn)]) if rxn in boundary_reactions else 0.0
-    model.medium = medium
-
-    return model
-
-
 def _feasibility_test(model_cobra: cobra.Model, step: str):
     # check number of unsolvable reactions for reference model under media assumptions
     # create flux consistant model (rmemoves some reactions)
@@ -449,9 +416,8 @@ def _read_reference_model(filepath: Path) -> cobra.Model:
             raise ValueError(f"Reference model format must be .xml, .mat, or .json; found '{filepath.suffix}'")
 
 
-async def _build_model(
-    general_model_file: Path,
 def _build_model(
+    reference_model: cobra.Model,
     gene_expression_file: Path,
     recon_algorithm: Algorithm,
     objective: str,
@@ -502,12 +468,16 @@ def _build_model(
     reference_model = _set_boundaries(reference_model, boundary_reactions, lower_bounds, upper_bounds)
     reference_model.solver = solver.lower()
 
-    # check number of unsolvable reactions for reference model under media assumptions
-    # inconsistent_reactions, cobra_model = _feasibility_test(cobra_model, "before_seeding")
-    inconsistent_reactions = []
-    s_matrix = cobra.util.array.create_stoichiometric_matrix(reference_model, array_type="dense")
-    lower_bounds: list[int] = []
-    upper_bounds: list[int] = []
+    # Set reference model boundaries
+    for rxn, lb, ub in zip(boundary_reactions, lower_bounds, upper_bounds, strict=True):
+        cast(cobra.Reaction, reference_model.reactions.get_by_id(rxn)).lower_bound = lb  # type: ignore[bad-argument-count]
+        cast(cobra.Reaction, reference_model.reactions.get_by_id(rxn)).upper_bound = ub  # type: ignore[bad-argument-count]
+    reference_model.solver = solver.lower()  # type: ignore[bad-argument-count]
+    reference_model.objective_direction = objective_direction  # type: ignore[bad-argument-count]
+
+    # Collect the lower/upper bounds for reactions in the reference model to provide it to reconstruction methods
+    ref_lb: npt.NDArray[np.floating] = np.full(shape=(len(reference_model.reactions)), fill_value=np.nan, dtype=float)
+    ref_ub: npt.NDArray[np.floating] = np.full(shape=(len(reference_model.reactions)), fill_value=np.nan, dtype=float)
     reaction_ids: list[str] = []
     for i, rxn in enumerate(reference_model.reactions):
         rxn: cobra.Reaction

From b9c773e9fd84aa88e56f1864e020c701e35d7308 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:17:09 -0600
Subject: [PATCH 18/41] refactor: update `_map_expression_to_reaction`
 signature

- Change from threshold-based (low_thresh, high_thresh) to percentile-based (low_percentile, high_percentile)
- Return tuple with (reaction_expression, min_val, low_expr, high_expr) instead of just a dictionary
- Adjust expression values to have a minimum of 0 for consistency and downstream building with Troppo
- use np.nanpercentile for threshold computing

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 112 +++++++++++++--------
 1 file changed, 72 insertions(+), 40 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 2ad48b80..83bfed5b 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -67,7 +67,7 @@ def _correct_bracket(rule: str, name: str) -> str:
         right_name = name[name_match.span()[1] :]
         operator = rule_match.group()
 
-    new_right_rule = []
+    new_right_rule: list[str] = []
     for char in list(left_rule):
         if char.isspace() or char.isdigit():
             new_right_rule.append(char)
@@ -320,10 +320,10 @@ def _map_expression_to_reaction(
     reference_model: cobra.Model,
     gene_expression_file: Path,
     recon_algorithm: Algorithm,
-    low_thresh: float,
-    high_thresh: float,
-    taxon: int | str | Taxon,
-) -> collections.OrderedDict[str, int]:
+    taxon: int | str,
+    low_percentile: int,
+    high_percentile: int,
+) -> tuple[collections.OrderedDict[str, float], float, float, float]:
     """Map gene ids to a reaction based on GPR (gene to protein to reaction) association rules.
 
     These rules should be defined in the general genome-scale metabolic model
@@ -332,76 +332,108 @@ def _map_expression_to_reaction(
         reference_model: A COBRA model object representing the general genome-scale metabolic model.
         gene_expression_file: Path to a gene expression file (.csv, .tsv, .xlsx, or .xls)
         recon_algorithm: Algorithm to use for reconstruction (GIMME, FASTCORE, iMAT, or tINIT)
-        low_thresh: Low expression threshold for algorithms that require it (iMAT, tINIT)
-        high_thresh: High expression threshold for algorithms that require it (iMAT, tINIT)
         taxon: Taxon ID or Taxon object for gene ID conversion.
+        low_percentile: Low percentile for threshold calculation
+        high_percentile: High percentile for threshold calculation
 
     Returns:
-        An ordered dictionary mapping reaction IDs to their corresponding expression values.
+        [0]: An ordered dictionary mapping reaction IDs to their corresponding expression values.
+        [1]: The abs(minimum value) from the expression vector
+        [2]: The adjusted `low_expr` value
+        [3]: The adjusted `high_expr` value
 
     Raises:
         ValueError: If neither 'entrez_gene_id' nor 'ensembl_gene_id' columns are found in the gene expression file.
     """
-    expression_data = await _read_file(gene_expression_file)
-    identifier_column = next((col for col in ("entrez_gene_id", "ensembl_gene_id") if col in expression_data.columns), "")
-
-    if not identifier_column:
-        raise ValueError(
-            f"At least one column of 'entrez_gene_id' or 'ensembl_gene_id' could not be found in the gene expression file '{gene_expression_file}'"
-        )
+    expression_data = pd.read_csv(gene_expression_file)
     gene_activity = split_gene_expression_data(
-        expression_data,
-        identifier_column=cast(Literal["ensembl_gene_id", "entrez_gene_id"], identifier_column),
-        recon_algorithm=recon_algorithm,
+        expression_data, identifier_column="entrez_gene_id", recon_algorithm=recon_algorithm
     )
 
-    # convert ensembl IDs to entrez ids to map expression data to reactions in the reference model
-    if identifier_column == "ensembl_gene_id":
-        conversion = await ensembl_to_gene_id_and_symbol(ids=gene_activity.index.tolist(), taxon=taxon)
-        gene_activity = gene_activity.merge(conversion, how="left", left_index=True, right_on="ensembl_gene_id")
-        gene_activity = gene_activity.set_index(keys=["entrez_gene_id"])
-        gene_activity = gene_activity.drop(columns=["ensembl_gene_id", "gene_symbol"])
+    # adjust expression to have a minimum value of 0; if the minimum value is already 0, this does nothing
+    gene_arr = gene_activity["active"].to_numpy(dtype=float, copy=False)
+    min_val = float(gene_arr[np.isfinite(gene_arr)].min())
+    if min_val < 0:
+        gene_arr[np.isfinite(gene_arr)] += abs(min_val)
 
-    reaction_expression = collections.OrderedDict()
+    # We are computing percentiles on non-zero data because we need to make sure that
+    #   zero values are definitely placed in the low bin
+    low_expr, high_expr = np.nanpercentile(gene_arr[gene_arr > 0], [low_percentile, high_percentile])
+    gene_activity["active"] = gene_arr
+    gene_activity.index = gene_activity.index.astype(str)  # maintain strings for reference model gene_ids
 
     # fmt: off
-    # Define a default expression value if a gene ID is not found
-    default_expression = (
-        np.mean([low_thresh, high_thresh]) if recon_algorithm in {Algorithm.IMAT, Algorithm.TINIT}
-        else -1 if recon_algorithm == Algorithm.GIMME
+    # Define a default expression values for the following conditions:
+    # 1. `no_expression`: If a reaction has a gene reaction rule but is not found in the data, then
+    #       we can be positive that the reaction should be marked as inactive
+    #   For all algorithms, placing no-expression data at an expression value of 0.0 will
+    #       mark it as inactive, because we have adjusted the data to be >= 0 above
+    no_expression = 0
+    # 2. `missing_gpr`: If a reaction has no gene reaction rule, we don't know if it should be marked as (in)active
+    #   Thus, place it in a no-cost location where possible, otherwise mark it as inactive
+    #   IMAT/TINIT/GIMME: Place in mid-bin, which is a "no-cost weight" value
+    #   FASTCORE: Mark as inactive, which is the best we can do with FASTCORE
+    missing_gpr = (
+        # -1 if recon_algorithm in {Algorithm.IMAT, Algorithm.TINIT, Algorithm.GIMME}
+        (low_expr + high_expr) / 2 if recon_algorithm in {Algorithm.IMAT, Algorithm.TINIT, Algorithm.GIMME}
         else 0 if recon_algorithm == Algorithm.FASTCORE
         else 1
     )
     # fmt: on
 
+    reaction_expression: collections.OrderedDict[str, float] = collections.OrderedDict()
+    activity_map = cast(dict[str, float], gene_activity.loc[:, "active"].to_dict())
     error_count = 0
+    _base_warn = "could not generate evaluable gene rule."
     for rxn in reference_model.reactions:
         rxn: cobra.Reaction
-
         gene_reaction_rule = rxn.gene_reaction_rule
+
         if not gene_reaction_rule:
+            reaction_expression[rxn.id] = missing_gpr
             continue
 
-        gene_ids = set(re.findall(r"\d+", gene_reaction_rule))
-        reaction_expression[rxn.id] = default_expression
+        gene_ids: set[str] = set(re.findall(r"\d+", gene_reaction_rule))
         for gene_id in gene_ids:
-            activity = gene_activity.at[gene_id, "active"] if gene_id in gene_activity.index else f"{default_expression!s}"
-            # replace gene_id with activity, using optional whitespace before and after the gene id
-            # Do not replace the whitespace (if it exists) before and after the gene ID
-            gene_reaction_rule = re.sub(pattern=rf"\b{gene_id}\b", repl=str(activity), string=str(gene_reaction_rule))
+            activity = activity_map.get(gene_id, no_expression)
+            if np.isposinf(activity):  # For positive infinity values, place in high bin
+                activity = high_expr + 1
+            elif not isfinite(activity):  # For non-finite values, place in low/non-active bin
+                activity = no_expression
+
+            # Matches the standalone gene ID only when it's not part of a larger number.
+            # It ensures the ID is not immediately preceded by a digit or decimal point and not immediately followed by
+            #   a digit, so if the gene id is '48', decimals like 1.48 or numbers like 1648 are left unchanged.
+            gene_reaction_rule = re.sub(
+                pattern=rf"(?<![\d.]){gene_id}(?!\d)",
+                repl=str(activity),
+                string=str(gene_reaction_rule),
+            )
 
+        evaluable_gene_rule = None
         try:
             # We are using eval here because ast.literal_eval is unable to process an evaluable such as `max(-4, 0, 1)`
-            # This isn't ideal, but ultimately the only other option is writing and maintaining a custom parsing engine, which is too much work
+            # This isn't ideal, but ultimately the only other option is writing and maintaining a custom parsing engine
+            #   but this is too much work
             evaluable_gene_rule = _gene_rule_logical(gene_reaction_rule).replace("{", "(").replace("}", ")")
             reaction_expression[rxn.id] = eval(evaluable_gene_rule)  # noqa: S307
+
         except ValueError:
+            logger.warning(f"ValueError for Reaction '{rxn.id}': {evaluable_gene_rule or _base_warn}")
+            error_count += 1
+        except NameError:
+            logger.warning(f"NameError for Reaction '{rxn.id}': {evaluable_gene_rule or _base_warn}")
             error_count += 1
+        except SyntaxError as e:
+            raise SyntaxError(f"SyntaxError on Reaction '{rxn.id}': {evaluable_gene_rule or _base_warn}") from e
+        except Exception as e:
+            error_type = type(e).__name__
+            raise RuntimeError(
+                f"Unexpected error {error_type} on Reaction '{rxn.id}': {evaluable_gene_rule or _base_warn}"
+            ) from e
 
     logger.debug(f"Mapped gene expression to reactions, found {error_count} error(s).")
-    # expr_vector = np.array(list(reaction_expression.values()), dtype=float)
-
-    return reaction_expression
+    return reaction_expression, float(min_val), float(low_expr), float(high_expr)
 
 
 def _read_reference_model(filepath: Path) -> cobra.Model:

From 41d1b56f2c203f8d18f2046ae3ecefc94341c56d Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:18:31 -0600
Subject: [PATCH 19/41] refactor: add and update types for model build
 functions

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 37 +++++++++++++---------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 83bfed5b..0867eaaa 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -152,10 +152,12 @@ def _feasibility_test(model_cobra: cobra.Model, step: str):
 
 def _build_with_gimme(
     reference_model: cobra.Model,
-    lower_bounds: Sequence[float] | npt.NDArray[float],
-    upper_bounds: Sequence[float] | npt.NDArray[float],
+    expression_vector: Sequence[float] | npt.NDArray[np.floating],
     idx_objective: int,
-    expr_vector: npt.NDArray[float],
+    lower_bounds: npt.NDArray[np.floating],
+    upper_bounds: npt.NDArray[np.floating],
+    solver: str,
+    threshold_percentile: int = 30,
 ):
     model_reconstruction = reference_model.copy()
     s_matrix: list[float] = list(cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction))
@@ -218,12 +220,14 @@ def _build_with_fastcore(
 
 def _build_with_imat(
     reference_model: cobra.Model,
-    lower_bounds: Sequence[float] | npt.NDArray[float],
-    upper_bounds: Sequence[float] | npt.NDArray[float],
+    lower_bounds: npt.NDArray[np.floating],
+    upper_bounds: npt.NDArray[np.floating],
     expr_vector: npt.NDArray,
-    expr_thresh: tuple[float, float],
-    force_reaction_indices: Sequence[int],
+    low_expression_threshold: float,
+    high_expression_threshold: float,
+    force_reaction_indices: npt.NDArray[np.integer],
     solver: str,
+    build_settings: ModelBuildSettings,
 ) -> cobra.Model:
     properties: IMATProperties = IMATProperties(
         exp_vector=expr_vector,
@@ -459,13 +463,15 @@ def _build_model(
     lower_bounds: list[float],
     upper_bounds: list[float],
     solver: str,
-    low_thresh: float,
-    high_thresh: float,
+    low_percentile: int,
+    high_percentile: int,
     output_flux_result_filepath: Path,
-    taxon: int | str | Taxon,
+    taxon: int | str,
+    objective_direction: Literal["min", "max"],
+    build_settings: ModelBuildSettings,
     *,
     force_boundary_rxn_inclusion: bool,
-) -> _BuildResults:
+) -> cobra.Model:
     """Seed a context specific reference_model.
 
     Core reactions are determined from GPR associations with gene expression logicals.
@@ -474,7 +480,7 @@ def _build_model(
     force excluded even if they meet GPR association requirements using the force exclude file.
 
     Args:
-        general_model_file: Path to a COBRA model file (.xml, .mat, or .json)
+        reference_model: The reference model used in reconstruction
         gene_expression_file: Path to a gene expression file (.csv, .tsv, .xlsx, or .xls)
         recon_algorithm: Algorithm to use for reconstruction (GIMME, FASTCORE, iMAT, or tINIT)
         objective: Objective reaction ID in the general model
@@ -484,14 +490,15 @@ def _build_model(
         lower_bounds: List of lower bounds corresponding to boundary reactions
         upper_bounds: List of upper bounds corresponding to boundary reactions
         solver: Solver to use (e.g., 'glpk', 'cplex', 'gurobi')
-        low_thresh: Low expression threshold for algorithms that require it (iMAT, tINIT)
-        high_thresh: High expression threshold for algorithms that require it (iMAT, tINIT)
         output_flux_result_filepath: Path to save flux results (for iMAT only)
         taxon: Taxon ID or Taxon object for gene ID conversion.
         force_boundary_rxn_inclusion: If True, ensure that all boundary reactions are included in the final model.
 
     Returns:
-        A _BuildResults object containing the context-specific model, list of expression indices used, and a DataFrame of infeasible reactions.
+        A _BuildResults object containing:
+         the context-specific model
+         list of expression indices used
+        A DataFrame of infeasible reactions.
     """
     reference_model: cobra.Model = _read_reference_model(general_model_file)
 

From 4b28e851cd72e502987554bd631aa12a0ad9ca14 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:20:06 -0600
Subject: [PATCH 20/41] fix: correct iMAT implementation with Troppo

- Uses cobamp for optimization handling
- Properly partition solution into high-bin and low-bin reactions
- Fix high/low index calculation to match Troppo's expected behavior
- Add validation for expected solution length
- Use 0.5 midpoint for binarizing reaction inclusion

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 100 ++++++++++++++++-----
 1 file changed, 80 insertions(+), 20 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 0867eaaa..a25dcd37 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -229,35 +229,95 @@ def _build_with_imat(
     solver: str,
     build_settings: ModelBuildSettings,
 ) -> cobra.Model:
+    model = reference_model.copy()
+
+    # `list()` is required because imat uses `if core:`; this returns an error on numpy arrays
+    force_rxn_list = list(np.asarray(force_reaction_indices, dtype=int))
     properties: IMATProperties = IMATProperties(
         exp_vector=expr_vector,
-        exp_thresholds=expr_thresh,
-        # core=np.array(force_gene_indices).tolist(),
-        core=list(force_reaction_indices),  # `list()` is required because imat uses `if core:`; this returns an error on numpy arrays
-        epsilon=0.01,
-        solver=solver.upper(),
+        exp_thresholds=(
+            low_expression_threshold,
+            high_expression_threshold,
+        ),
+        tolerance=build_settings.min_reaction_flux,
+        core=force_rxn_list,
+        epsilon=build_settings.troppo_epsilon,
+        solver=solver,
     )
 
-    # Creating a copy of the model ensures we don't make any in-place modifications by accident
     # Using cobra to create the stoichiometry matrix means we have less work to do
-    force_reaction_indices = np.array(force_reaction_indices, dtype=np.uint16)
-    model_reconstruction: cobra.Model = reference_model.copy()
-    s_matrix: npt.NDArray[float] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction)
-    algorithm: IMAT = IMAT(S=s_matrix, lb=np.array(lower_bounds), ub=np.array(upper_bounds), properties=properties)
-    rxns_from_imat: npt.NDArray[np.uint16] = algorithm.run().astype(np.uint16)
+    s_matrix: npt.NDArray[np.floating] = cast(
+        npt.NDArray[np.floating], cobra.util.array.create_stoichiometric_matrix(model=model)
+    )
+    algorithm: IMAT = IMAT(S=s_matrix, lb=lower_bounds, ub=upper_bounds, properties=properties)
+
+    # Match Troppo's iMAT high/low bin calculations
+    high_index = np.where(expr_vector >= high_expression_threshold)[0].astype(int)
+    low_index = np.where((expr_vector >= 0) & (expr_vector < low_expression_threshold))[0].astype(int)
+    if force_reaction_indices.size > 0:
+        high_index = np.union1d(high_index, force_reaction_indices)
+    lso, lsystem = algorithm.generate_imat_problem(
+        S=algorithm.S,
+        lb=algorithm.lb,
+        ub=algorithm.ub,
+        high_idx=high_index,
+        low_idx=low_index,
+        epsilon=build_settings.troppo_epsilon,
+    )
+    # reset default Troppo/Cobamp thresholds
+    cfg = lsystem.get_configuration()
+    cfg.verbosity = build_settings.solver_verbosity
+    cfg.timeout = build_settings.solver_timeout
+    cfg.tolerances.feasibility = build_settings.solver_feasibility
+    cfg.tolerances.optimality = build_settings.solver_optimality
+    cfg.tolerances.integrality = build_settings.solver_integrality
+    grb_model = getattr(getattr(cfg, "problem", None), "problem", None)  # `getattr` to handle non-gurobi solvers
+    if grb_model is not None:
+        grb_model.Params.MIPGap = build_settings.gurobi_mipgap
+        grb_model.Params.IntegralityFocus = build_settings.gurobi_integrality_focus
+        grb_model.Params.NumericFocus = build_settings.gurobi_numeric_focus
+
+    # Keep reactions from imat + force-included reactions
+    solution: cobamp.core.optimization.Solution = lso.optimize()
+    x_sol: npt.NDArray[np.floating] = solution.x().astype(float)
+
+    # partition the solution into the following components:
+    #   1) High-bin, forward-direction reactions
+    #   2) High-bin, reverse-direction reactions
+    #   3) Activated low-bin reactions
+    n_rxns = len(reference_model.reactions)
+    expected_len = n_rxns + (2 * high_index.size) + low_index.size
+    if expected_len != x_sol.size:
+        raise ValueError(
+            f"Unexpected solution length: got {x_sol.size}, expected {expected_len}"
+            f"(n_rxns={n_rxns}, n_high_index={high_index.size}, n_low_index={low_index.size}"
+        )
 
-    # Collect all reaction IDs and their associated index (e.g., HEX1 is at index 123)
-    all_rxn_ids: npt.NDArray[str] = np.array([r.id for r in model_reconstruction.reactions], dtype=object)
-    all_rxn_indices: npt.NDArray[np.uint16] = np.array(range(len(model_reconstruction.reactions)), dtype=np.uint16)
+    part: int = 0
+    flux_vector: npt.NDArray[np.floating] = x_sol[part : part + n_rxns]
+    part += n_rxns
+    high_forward: npt.NDArray[np.floating] = x_sol[part : part + high_index.size]
+    part += high_index.size
+    high_reverse: npt.NDArray[np.floating] = x_sol[part : part + high_index.size]
+    part += high_index.size + low_index.size
 
-    rxn_indices_to_keep: npt.NDArray[int] = np.unique(np.concatenate([rxns_from_imat, force_reaction_indices], dtype=int))
+    # iMAT binarizes at ~0 and ~1 for off/on reactions,
+    #   so we can use 0.5 as the midpoint for defining inclusion/exclusion
+    high_forward_rxns: npt.NDArray[np.integer] = high_index[high_forward > 0.5]
+    high_reverse_rxns: npt.NDArray[np.integer] = high_index[high_reverse > 0.5]
+    high_on_rxns = np.unique(np.concatenate([high_forward_rxns, high_reverse_rxns])).astype(int)
 
-    # Reaction indices to exclude from the model are thus reactions that are not marked to be included in the model
-    # Assume unique is false because every value that is in `rxn_indices_to_keep` is included in `all_rxn_indices`
-    rxn_indices_to_remove: npt.NDArray[np.uint16] = np.setdiff1d(all_rxn_indices, rxn_indices_to_keep, assume_unique=False)
-    model_reconstruction.remove_reactions(reactions=all_rxn_ids[rxn_indices_to_remove].tolist(), remove_orphans=True)
+    # Low bin reactions may be included if it has a flux greater than `min_reaction_flux`
+    flux_keep = np.where(np.abs(solution.x())[:n_rxns] >= build_settings.min_reaction_flux)[0].astype(int)
+    to_keep = np.unique(np.concatenate([flux_keep, high_on_rxns, force_reaction_indices]))
 
-    return model_reconstruction
+    # Collect all reaction IDs and their associated index (e.g., HEX1 is at index 123)
+    all_rxn_ids: npt.NDArray[np.str_] = np.asarray([r.id for r in model.reactions], dtype=str)
+    to_remove = np.ones(shape=len(model.reactions), dtype=bool)
+    to_remove[to_keep] = False
+    remove_rxn_ids: list[str | cobra.Reaction] = list(all_rxn_ids[to_remove])
+    model.remove_reactions(reactions=remove_rxn_ids, remove_orphans=True)
+    return model
 
 
 def _build_with_tinit(

From 78f62f8504fd2850ca63eb5e51bbe51b5bcaea68 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:22:03 -0600
Subject: [PATCH 21/41] fix: update boundary reaction collection logic

- Adds `reference_model` parameter to internal `_collect_boundary_reactions` to define constraints for all boundary reactions
- Validate compartments exist in reference model

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 71 +++++++++++++++++-----
 1 file changed, 55 insertions(+), 16 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index a25dcd37..8169fa8a 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -739,31 +739,70 @@ def _collect_boundary_reactions(
             "reaction",
             "abbreviation",
             "compartment",
-            "minimum reaction rate",
-            "maximum reaction rate",
+            "lower bound",
+            "upper bound",
         ]:
             raise ValueError(
-                f"Boundary reactions file must have columns named 'Reaction', 'Abbreviation', 'Compartment', "
-                f"'Minimum Reaction Rate', and 'Maximum Reaction Rate'. Found: {column}"
+                f"Boundary reactions file must have columns named 'reaction', 'abbreviation', 'compartment', "
+                f"'lower bound', and 'upper bound'. Found: '{column}'"
             )
 
-    reactions: list[str] = [""] * len(df)
-    boundary_type: list[str] = df["reaction"].tolist()
-    reaction_abbreviation: list[str] = list(df["abbreviation"].astype(str))
-    reaction_compartment: list[str] = list(df["compartment"].astype(str))
+    for compartment in df["compartment"].unique():
+        as_shorthand = CobraCompartments.get_shorthand(compartment) if len(compartment) > 2 else compartment
+        if as_shorthand not in reference_model.compartments:
+            raise ValueError(f"Unknown compartment: '{compartment}'")
+
     boundary_map = {"exchange": "EX", "demand": "DM", "sink": "SK"}
-    for i in range(len(boundary_type)):
-        boundary: str = boundary_type[i].lower()
-        if boundary not in boundary_map:
-            raise ValueError(f"Boundary reaction type must be 'Exchange', 'Demand', or 'Sink'. Found: {boundary}")
+    for exchange in df["reaction"].str.lower().unique():
+        if boundary_map.get(exchange) is None:
+            raise ValueError(
+                f"Unable to determine reaction type for: '{exchange}'\n"
+                f"Validate the boundary reactions file has a 'Reaction' value of: 'Exchange', 'Demand', or 'Sink'."
+            )
 
-        shorthand_compartment = CobraCompartments.get_shorthand(reaction_compartment[i])
-        reactions[i] = f"{boundary_map.get(boundary)}_{reaction_abbreviation[i]}[{shorthand_compartment}]"
+    reactions: list[str] = []
+    lower_bounds: list[float] = []
+    upper_bounds: list[float] = []
+    for ref_rxn in reference_model.boundary:
+        ref_rxn: cobra.Reaction
+
+        # Remove prefixes from reactions: EX_glc_D[e] -> glc_D[e]; DM_h2o[c] -> h2o[c]; sink_o2[m] -> o2[m]
+        # Then remove suffixes from reactions: glc_D[e] -> glc_D; h2o[c] -> h2o; o2[m] -> o2
+        base_rxn_id: str = ref_rxn.id.split("_", maxsplit=1)[1]
+        base_rxn_id = base_rxn_id.split("[", maxsplit=1)[0]
+
+        # If we are excluding unlisted exchange reactions from the model,
+        #   then we need to set their lower/upper bounds to 0
+        # Otherwise, keep the unlisted exchanges as their default
+        if base_rxn_id not in df["abbreviation"].values:
+            reactions.append(ref_rxn.id)
+            lower_bounds.append(0 if close_unlisted_exchanges else ref_rxn.lower_bound)
+            upper_bounds.append(0 if close_unlisted_exchanges else ref_rxn.upper_bound)
+            continue
+
+        rxn_index = df[df["abbreviation"] == base_rxn_id].index.item()
+        shorthand_compartment = CobraCompartments.get_shorthand(str(df.at[rxn_index, "compartment"]))
+        exch_type: str | None = boundary_map.get(str(df.at[rxn_index, "reaction"]).lower())
+        lower_bound, upper_bound = df.loc[rxn_index, ["lower bound", "upper bound"]].astype(float).values.ravel()
+
+        # The boundary reactions are built on the base reaction id;
+        #   if the reference model boundary has a matching base ID, but a non-matching
+        #   boundary type (exchange, sink, demand), this will check to make sure we are not
+        #   mistakingly setting boundaries that should not exist
+        formatted_boundary_reaction = f"{exch_type}_{base_rxn_id}[{shorthand_compartment}]"
+        if ref_rxn.id == formatted_boundary_reaction:
+            reactions.append(formatted_boundary_reaction)
+            lower_bounds.append(lower_bound)
+            upper_bounds.append(upper_bound)
+        else:
+            reactions.append(ref_rxn.id)
+            lower_bounds.append(0 if close_unlisted_exchanges else ref_rxn.lower_bound)
+            upper_bounds.append(0 if close_unlisted_exchanges else ref_rxn.upper_bound)
 
     return _BoundaryReactions(
         reactions=reactions,
-        lower_bounds=df["minimum reaction rate"].tolist(),
-        upper_bounds=df["maximum reaction rate"].tolist(),
+        lower_bounds=lower_bounds,
+        upper_bounds=upper_bounds,
     )
 
 

From bf30d2dde8ff23e9d207e790cd2ec3cc74844600 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:23:56 -0600
Subject: [PATCH 22/41] feat: add `close_unlisted_exchanges` to forcibly set
 bounds to 0 for exchange reactions not included in the boundary reactions
 input file

This option defaults to False to maintain compatibility with preivous releases

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 8169fa8a..957f8ecb 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -849,8 +849,9 @@ def create_context_specific_model(  # noqa: C901
     log_level: LogLevel = LogLevel.INFO,
     log_location: str | TextIO | TextIOWrapper = sys.stderr,
     *,
-    force_boundary_rxn_inclusion: bool = True,
-):
+    force_boundary_rxn_inclusion: bool = False,
+    close_unlisted_exchanges: bool = False,
+) -> cobra.Model:
     """Create a context-specific model using the provided data.
 
     Args:
@@ -909,11 +910,14 @@ def create_context_specific_model(  # noqa: C901
             for path in output_model_filepaths
             if path.suffix not in {*mat_suffix, *json_suffix, *xml_suffix}
         )
-        log_and_raise_error(
-            f"Invalid output filetype. Should be 'xml', 'sbml', 'mat', or 'json'. Got:\n{invalid_suffix}'",
-            error=ValueError,
-            level=LogLevel.ERROR,
         raise ValueError(f"Invalid output filetype. Should be 'xml', 'sbml', 'mat', or 'json'. Got:\n{invalid_suffix}'")
+
+    reference_model = _read_reference_model(reference_model_filepath)
+    boundary_reactions = (
+        _collect_boundary_reactions(
+            boundary_rxns_filepath,
+            reference_model=reference_model,
+            close_unlisted_exchanges=close_unlisted_exchanges,
         )
 
     boundary_reactions = None

From b92b555d453e96b2d46369011bdb52daa7f1a989 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:24:38 -0600
Subject: [PATCH 23/41] refactor: streamline GIMME model reconstruction and
 expression handling

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 58 +++++++++++++++-------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 957f8ecb..812842fd 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -159,30 +159,54 @@ def _build_with_gimme(
     solver: str,
     threshold_percentile: int = 30,
 ):
-    model_reconstruction = reference_model.copy()
-    s_matrix: list[float] = list(cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction))
+    model = reference_model
+    if threshold_percentile < 1:
+        logger.critical(
+            f"GIMME quantile calculation is less than 1 ({threshold_percentile}), but expected an integer of ~25! "
+            f"This will likely result in a model with many more reactions than expected."
+        )
+
+    expression: npt.NDArray[np.floating] = np.asarray(expression_vector, dtype=float)
+    nonzero = expression[expression > 0]
+    min_expr_percentile = np.percentile(nonzero, threshold_percentile).astype(float)
+    logger.debug(f"GIMME: minimum expression is {expression.min():.4f}")
+    logger.debug(f"GIMME: minimum non-zero expression is {nonzero.min():.4f}")
+    logger.debug(f"GIMME: maximum expression is {expression.max():.4f}")
+
+    expression[(expression < min_expr_percentile) & (expression > 0)] = 0
+    expression[expression >= min_expr_percentile] = 1
+
+    suffix: str = {1: "st", 2: "nd", 3: "rd"}.get(threshold_percentile, "th")
+    logger.debug(f"GIMME: {threshold_percentile}{suffix} percentile is {min_expr_percentile:.4f}")
+
+    s_matrix: list[float] = list(np.asarray(cobra.util.array.create_stoichiometric_matrix(model=model), dtype=float))
+    reaction_ids: list[str] = [r.id for r in model.reactions]
+    metabolite_ids: list[str] = [m.id for m in model.metabolites]
+
     # `Becker and Palsson (2008). Context-specific metabolic networks are
     # consistent with experiments. PLoS Comput. Biol. 4, e1000082.`
     properties = GIMMEProperties(
-        exp_vector=expr_vector,  # np.array(gimme_data['0']),
-        obj_frac=0.9,
+        exp_vector=list(expression),
         objectives=[{idx_objective: 1}],
         preprocess=True,
-        flux_threshold=0.9,
+        reaction_ids=reaction_ids,
+        metabolite_ids=metabolite_ids,
+        solver=solver.upper(),
     )
     algorithm = GIMME(s_matrix, list(lower_bounds), list(upper_bounds), properties)
-    gene_activity = algorithm.run()
-    reaction_ids = [r.id for r in model_reconstruction.reactions]
-    to_remove_ids = [reaction_ids[r] for r in np.where(gene_activity == 0)[0]]
+    active_rxn_indices: list[int] = algorithm.run()
+    to_remove_ids: list[str | cobra.Reaction] = [
+        reaction_ids[i] for i in range(len(model.reactions)) if i not in active_rxn_indices
+    ]
+    model.remove_reactions(to_remove_ids, True)
 
-    model_reconstruction.remove_reactions(to_remove_ids, True)
-    psol = pfba(model_reconstruction)  # noqa: F841
+    # psol = pfba(model)
     # reaction_ids = [r.id for r in context_cobra_model.reactions]
     # psol = context_cobra_model.optimize()
     # to_remove_ids = [reaction_ids[r] for r in np.where(abs(psol.fluxes) < 1e-8)[0]]
     # context_cobra_model.remove_reactions(to_remove_ids, True)
 
-    return model_reconstruction
+    return model
 
 
 def _build_with_fastcore(
@@ -338,12 +362,12 @@ def _build_with_tinit(
         allow_excretion=False,
         no_reverse_loops=True,
     )
-    model_reconstruction = reference_model.copy()
-    s_matrix: npt.NDArray[float] = np.asarray(cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction), dtype=float)
-    algorithm = tINIT(s_matrix, lower_bounds, upper_bounds, properties)
-    algorithm.preprocessing()
-    algorithm.build_problem()
-    _log_and_raise_error("tINIT is not yet implemented.", error=NotImplementedError, level=LogLevel.CRITICAL)
+    s_matrix: npt.NDArray[np.floating] = np.asarray(
+        cobra.util.array.create_stoichiometric_matrix(model=model), dtype=float
+    )
+    algorithm = tINIT(s_matrix, np.asarray(lower_bounds), np.asarray(upper_bounds), properties)
+    active_rxn_indices: npt.NDArray[np.integer] = algorithm.run()
+
 
 def _build_with_corda(
     reference_model: cobra.Model,

From 46b89aa9790af53155f5b5141d8f4d7e83dc1f50 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:24:58 -0600
Subject: [PATCH 24/41] refactor: improve context-specific model creation logic

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 812842fd..0ac816c6 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -230,16 +230,15 @@ def _build_with_fastcore(
             "Lower bounds, upper bounds, and stoichiometric matrix must have the same number of reactions."
         )
     logger.debug("Creating feasible model")
-    _, cobra_model = _feasibility_test(cobra_model, "other")
-    properties = FastcoreProperties(core=exp_idx_list, solver=solver)
-    algorithm = FASTcore(s_matrix, lower_bounds, upper_bounds, properties)
+    _, reference_model = _feasibility_test(reference_model, "other")
+    properties = FastcoreProperties(core=list(exp_idx_list), solver=solver)
+    algorithm = FASTcore(s_matrix, np.asarray(lower_bounds), np.asarray(upper_bounds), properties)
     context_rxns = algorithm.fastcore()
-    context_cobra_model = cobra_model.copy()
-    r_ids = [r.id for r in context_cobra_model.reactions]
+    r_ids = [r.id for r in model.reactions]
     remove_rxns = [r_ids[int(i)] for i in range(s_matrix.shape[1]) if i not in context_rxns]
-    context_cobra_model.remove_reactions(remove_rxns, True)
+    model.remove_reactions(remove_rxns, True)
 
-    return context_cobra_model
+    return model
 
 
 def _build_with_imat(

From f2d7199cd65093a00c3f9cd32dd697119ff5591e Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:25:12 -0600
Subject: [PATCH 25/41] chore: add type hints for reconstructing with tINIT

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 0ac816c6..7b867da2 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -345,18 +345,18 @@ def _build_with_imat(
 
 def _build_with_tinit(
     reference_model: cobra.Model,
-    lower_bounds,
-    upper_bounds,
-    expr_vector,
-    solver,
-    idx_force,
-) -> Model:
+    lower_bounds: npt.NDArray[np.floating],
+    upper_bounds: npt.NDArray[np.floating],
+    expr_vector: npt.NDArray[np.floating],
+    solver: str,
+    idx_force: npt.NDArray[np.integer],
+) -> None:
     raise NotImplementedError("tINIT is not yet implemented.")
     model = reference_model
     properties = tINITProperties(
-        reactions_scores=expr_vector,
+        reactions_scores=list(expr_vector),
         solver=solver,
-        essential_reactions=idx_force,
+        essential_reactions=list(idx_force),
         production_weight=0.0,
         allow_excretion=False,
         no_reverse_loops=True,

From 98a54365fe34c4561968fd3bb1fe316ad74867ef Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:26:13 -0600
Subject: [PATCH 26/41] chore: do not use match case; reduces indentation

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 7b867da2..4a9e5ccd 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -524,15 +524,13 @@ def _map_expression_to_reaction(
 
 
 def _read_reference_model(filepath: Path) -> cobra.Model:
-    match filepath.suffix:
-        case ".mat":
-            reference_model = cobra.io.load_matlab_model(filepath)
-        case ".xml" | ".sbml":
-            reference_model = cobra.io.read_sbml_model(filepath)
-        case ".json":
-            reference_model = cobra.io.load_json_model(filepath)
-        case _:
-            raise ValueError(f"Reference model format must be .xml, .mat, or .json; found '{filepath.suffix}'")
+    if filepath.suffix == ".json":
+        return cobra.io.load_json_model(filepath)
+    if filepath.suffix == ".mat":
+        return cobra.io.load_matlab_model(filepath)
+    if filepath.suffix in {".xml", ".sbml"}:
+        return cobra.io.read_sbml_model(filepath)
+    raise ValueError(f"Reference model format must be .xml, .mat, or .json; found '{filepath.suffix}'")
 
 
 def _build_model(

From b83842c5092ca8e821257be50050395f21f9bed2 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:26:49 -0600
Subject: [PATCH 27/41] perf: pass in cobra.Model object to prevent reading
 multiple times

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 4a9e5ccd..c8ce8a0d 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -581,12 +581,8 @@ def _build_model(
          list of expression indices used
         A DataFrame of infeasible reactions.
     """
-    reference_model: cobra.Model = _read_reference_model(general_model_file)
-
     if objective not in force_reactions:
-        force_reactions.append(objective)
-    reference_model = _set_boundaries(reference_model, boundary_reactions, lower_bounds, upper_bounds)
-    reference_model.solver = solver.lower()
+        force_reactions.append(objective)  # forcibly include the objective in the reconstructed model
 
     # Set reference model boundaries
     for rxn, lb, ub in zip(boundary_reactions, lower_bounds, upper_bounds, strict=True):

From a419bce82aca58213451ca2a1190515f5f068a7d Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:27:19 -0600
Subject: [PATCH 28/41] refactor: enhance reaction expression mapping and
 logging for missing reactions

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index c8ce8a0d..d488879e 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -612,16 +612,19 @@ def _build_model(
     if np.isnan(ref_ub).any():
         raise ValueError("Upper bounds contains unfilled values!")
 
-    # get expressed reactions
-    reaction_expression: collections.OrderedDict[str, int] = await _map_expression_to_reaction(
-        reference_model, gene_expression_file, recon_algorithm, high_thresh=high_thresh, low_thresh=low_thresh, taxon=taxon
+    reaction_expression, min_expr_val, low_expr, high_expr = _map_expression_to_reaction(
+        reference_model,
+        gene_expression_file,
+        recon_algorithm,
+        taxon=taxon,
+        low_percentile=low_percentile,
+        high_percentile=high_percentile,
     )
-    expression_vector: npt.NDArray[float] = np.array(list(reaction_expression.values()), dtype=float)
-
-    for rxn in force_reactions:
-        if rxn not in reaction_ids:
+    expression_vector: npt.NDArray[np.floating] = np.asarray(list(reaction_expression.values()), dtype=float)
+    for rxn_id in force_reactions:
+        if rxn_id not in reaction_ids:
             logger.warning(
-                f"The force reaction '{rxn}' was not found in the reference model. "
+                f"The force reaction '{rxn_id}' was not found in the reference model. "
                 f"Check BiGG, or the relevant database for your reference model, for synonyms."
             )
 

From 3a8f131d2ce54da312e5d9009ac249772071eba9 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:27:40 -0600
Subject: [PATCH 29/41] refactor: streamline reaction expression handling in
 model creation

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 26 ++++++++--------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index d488879e..1d8e9f5f 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -628,23 +628,15 @@ def _build_model(
                 f"Check BiGG, or the relevant database for your reference model, for synonyms."
             )
 
-    # collect list of reactions that are infeasible but active in expression data or user defined
-    infeasible_expression_reactions = []
-    infeasible_force_reactions = []
-
-    for i, rxn in enumerate(reaction_expression):
-        # log reactions in expressed and force lists that are infeasible that the user may wish to review
-        if rxn in inconsistent_reactions and expression_vector[i] == 1:
-            infeasible_expression_reactions.append(rxn)
-        if rxn in inconsistent_reactions and rxn in force_reactions:
-            infeasible_force_reactions.append(rxn)
-
-        if rxn in force_reactions:
-            expression_vector[i] = high_thresh + 0.1 if recon_algorithm in {Algorithm.TINIT, Algorithm.IMAT} else 1
-        if rxn in inconsistent_reactions or rxn in exclude_reactions:
-            expression_vector[i] = low_thresh - 0.1 if recon_algorithm in {Algorithm.TINIT, Algorithm.IMAT} else 0
-
-    objective_index = reaction_ids.index(objective)
+    # Set force-include reactions to have an expression higher than the upper threshold
+    # Set force-exclude reactions to have an expression lower than the lower threshold
+    rxn_ids = list(reaction_expression.keys())
+    force_mask = np.isin(rxn_ids, force_reactions)
+    exclude_mask = np.isin(rxn_ids, exclude_reactions)
+    expression_vector[force_mask] = high_expr + 1 if recon_algorithm in {Algorithm.TINIT, Algorithm.IMAT} else 1
+    expression_vector[exclude_mask] = (
+        max(0.0, low_expr - 1) if recon_algorithm in {Algorithm.TINIT, Algorithm.IMAT} else 0
+    )
 
     if force_boundary_rxn_inclusion:
         all_forced: set[str] = {*force_reactions, *boundary_reactions}

From 4cab288abca5f0fc6ecbad1a85f9e379fd5192fb Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:28:00 -0600
Subject: [PATCH 30/41] refactor: improve reaction index handling and
 expression vector initialization

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 1d8e9f5f..3b8384d2 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -640,16 +640,15 @@ def _build_model(
 
     if force_boundary_rxn_inclusion:
         all_forced: set[str] = {*force_reactions, *boundary_reactions}
-        force_reaction_indices: npt.NDArray[np.uint16] = np.array(
-            [reaction_ids.index(rxn) for rxn in all_forced if rxn in reaction_ids], dtype=np.uint16
+        force_reaction_indices: npt.NDArray[np.integer] = np.asarray(
+            [reaction_ids.index(rxn_id) for rxn_id in all_forced if rxn_id in reaction_ids], dtype=int
         )
     else:
-        force_reaction_indices: npt.NDArray[np.uint16] = np.array(
-            [reaction_ids.index(rxn) for rxn in force_reactions if rxn in reaction_ids], dtype=np.uint16
+        force_reaction_indices: npt.NDArray[np.integer] = np.asarray(
+            [reaction_ids.index(rxn_id) for rxn_id in force_reactions if rxn_id in reaction_ids], dtype=int
         )
 
-    expression_vector_indices = [i for (i, val) in enumerate(expression_vector) if val > 0]
-    expression_threshold = (low_thresh, high_thresh)
+    expression_vector_indices: list[int] = list(range(len(expression_vector)))
 
     if recon_algorithm == Algorithm.IMAT:
         context_model_cobra: cobra.Model = _build_with_imat(
@@ -657,8 +656,8 @@ def _build_model(
             lower_bounds=ref_lb,
             upper_bounds=ref_ub,
             expr_vector=expression_vector,
-            low_expression_threshold=updated_low_thresh,
-            high_expression_threshold=updated_high_thresh,
+            low_expression_threshold=low_expr,
+            high_expression_threshold=high_expr,
             force_reaction_indices=force_reaction_indices,
             solver=solver,
             build_settings=build_settings,
@@ -673,7 +672,7 @@ def _build_model(
         context_model_cobra: cobra.Model = _build_with_gimme(
             reference_model=reference_model,
             expression_vector=expression_vector,
-            idx_objective=objective_index,
+            idx_objective=reaction_ids.index(objective),
             lower_bounds=ref_lb,
             upper_bounds=ref_ub,
             solver=solver,

From e4b90b5799238cf29228eef1a4dde55e68ce02ff Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:28:15 -0600
Subject: [PATCH 31/41] refactor: update reaction subsystem assignment and
 optimize flux output handling

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 33 +++++++++++-----------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 3b8384d2..71b8baf4 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -701,23 +701,22 @@ def _build_model(
             f"Got: {recon_algorithm.value}"
         )
 
-    inconsistent_and_infeasible_reactions: pd.DataFrame = pd.concat(
-        [
-            pd.DataFrame({"infeasible_reactions": inconsistent_reactions}),
-            pd.DataFrame({"expressed_infeasible_reactions": infeasible_expression_reactions}),
-            pd.DataFrame({"infeasible_force_reactions": infeasible_force_reactions}),
-            pd.DataFrame({"infeasible_context_reactions": []}),  # Included to maintain legacy support
-        ],
-        ignore_index=True,
-        axis=0,
-    )
-
-    return _BuildResults(
-        model=context_model_cobra,
-        expression_index_list=expression_vector_indices,
-        infeasible_reactions=inconsistent_and_infeasible_reactions,
-    )
-
+    # Set each reaction subsystem based on the reference model
+    for rxn in context_model_cobra.reactions:
+        cast(cobra.Reaction, context_model_cobra.reactions.get_by_id(rxn.id)).subsystem = cast(
+            cobra.Reaction, reference_model.reactions.get_by_id(rxn.id)
+        ).subsystem
+
+    context_model_cobra.objective = objective
+    flux_sol: cobra.Solution = context_model_cobra.optimize()
+    fluxes = flux_sol.fluxes
+    model_reactions: list[str] = [reaction.id for reaction in context_model_cobra.reactions]
+    reaction_intersections: set[str] = set(fluxes.index).intersection(model_reactions)
+    flux_df = fluxes[~fluxes.index.isin(reaction_intersections)]
+    flux_df.dropna(inplace=True)
+    flux_df.to_csv(output_flux_result_filepath)
+
+    return context_model_cobra
 
 
 def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.DataFrame:

From 2f6e09e0cda6b57d9a36dbf169717556bf84c171 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:28:43 -0600
Subject: [PATCH 32/41] refactor: simplify DataFrame creation by replacing
 async file read with pandas read_csv

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 71b8baf4..0a8d31ff 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -722,15 +722,8 @@ def _build_model(
 def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.DataFrame:
     if path.suffix not in {".csv", ".tsv"}:
         raise ValueError(f"File must be a .csv or .tsv file, got '{path.suffix}'")
-    df: pd.DataFrame = await _read_file(path=path, header=0, sep="," if path.suffix == ".csv" else "\t", h5ad_as_df=True)
-
-    if not isinstance(df, pd.DataFrame):
-        _log_and_raise_error(
-            f"Expected a pandas.DataFrame, got {type(df)}",
-            error=TypeError,
-            level=LogLevel.ERROR,
-        )
 
+    df = pd.read_csv(path, header=0, sep="," if path.suffix == ".csv" else "\t")
     if lowercase_col_names:
         df.columns = [c.lower() for c in df.columns]
     return df

From d2246395057aa991aebb08442e1c77707d4f2310 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:29:04 -0600
Subject: [PATCH 33/41] refactor: update parameter names for clarity and
 improve docstring formatting

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 61 ++++++++++++----------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index 0a8d31ff..cbcf3a08 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -834,53 +834,58 @@ def _write_model_to_disk(
 
 def create_context_specific_model(  # noqa: C901
     context_name: str,
-    taxon: int | str | Taxon,
-    reference_model: Path,
+    taxon_id: int | str,
+    reference_model_filepath: Path,
     active_genes_filepath: Path,
     output_infeasible_reactions_filepath: Path,
     output_flux_result_filepath: Path,
     output_model_filepaths: Path | list[Path],
     output_fastcore_expression_index_filepath: Path | None = None,
     objective: str = "biomass_reaction",
+    objective_direction: Literal["min", "max"] = "max",
+    contextualized_model_id: str | None = None,
     boundary_rxns_filepath: str | Path | None = None,
     exclude_rxns_filepath: str | Path | None = None,
     force_rxns_filepath: str | Path | None = None,
     algorithm: Algorithm = Algorithm.GIMME,
-    low_threshold: float = -5,
-    high_threshold: float = -3,
+    low_percentile: int | None = None,
+    high_percentile: int | None = None,
     solver: Solver = Solver.GLPK,
     log_level: LogLevel = LogLevel.INFO,
     log_location: str | TextIO | TextIOWrapper = sys.stderr,
+    build_settings: ModelBuildSettings | None = None,
     *,
     force_boundary_rxn_inclusion: bool = False,
     close_unlisted_exchanges: bool = False,
 ) -> cobra.Model:
     """Create a context-specific model using the provided data.
 
-    Args:
-        context_name: Name of the context-specific model.
-        taxon: NCBI taxonomy ID or name for the organism of interest.
-        reference_model: Path to the general genome-scale metabolic model file (.xml, .mat, or .json).
-        active_genes_filepath: Path to the gene expression data file (csv, tsv, or Excel).
-        output_infeasible_reactions_filepath: Path to save infeasible reactions (csv).
-        output_flux_result_filepath: Path to save flux results (csv).
-        output_model_filepaths: Path or list of paths to save the context-specific model (.xml, .mat, or .json).
-        output_fastcore_expression_index_filepath: Path to save Fastcore expression indices (txt). Required if using Fastcore.
-        objective: Objective function reaction ID.
-        boundary_rxns_filepath: Optional path to boundary reactions file (csv, tsv, or Excel).
-        exclude_rxns_filepath: Optional path to reactions to exclude file (csv, tsv, or Excel).
-        force_rxns_filepath: Optional path to reactions to force include file (csv, tsv, or Excel).
-        algorithm: Algorithm to use for reconstruction. One of Algorithm.GIMME, Algorithm.FASTCORE, Algorithm.IMAT, Algorithm.TINIT.
-        low_threshold: Low expression threshold for algorithms that require it.
-        high_threshold: High expression threshold for algorithms that require it.
-        solver: Solver to use. One of Solver.GLPK, Solver.CPLEX, Solver.GUROBI
-        log_level: Logging level. One of LogLevel.DEBUG, LogLevel.INFO, LogLevel.WARNING, LogLevel.ERROR, LogLevel.CRITICAL
-        log_location: Location for log output. Can be a file path or sys.stderr/sys.stdout.
-        force_boundary_rxn_inclusion: If True, ensure that all provided boundary reactions are included in the final model.
-
-    Raises:
-        ImportError: If Gurobi solver is selected but gurobipy is not installed.
-    """
+    :param context_name: Name of the context-specific model.
+    :param taxon_id: NCBI taxonomy ID or name for the organism of interest.
+    :param reference_model_filepath: Path to the general genome-scale metabolic model file (.xml, .mat, or .json).
+    :param active_genes_filepath: Path to the gene expression data file (csv, tsv, or Excel).
+    :param output_infeasible_reactions_filepath: Path to save infeasible reactions (csv).
+    :param output_flux_result_filepath: Path to save flux results (csv).
+    :param output_model_filepaths: Path or list of paths to save the context-specific model (.xml, .mat, or .json).
+    :param output_fastcore_expression_index_filepath: Path to save Fastcore expression indices (txt).
+    :param objective: Objective function reaction ID.
+    :param objective_direction: Direction of the objective function, either 'min' or 'max'.
+    :param contextualized_model_id: A custom name to save under `model.id`, otherwise the `context_name` value will be used.
+    :param boundary_rxns_filepath: Optional path to boundary reactions file (csv, tsv, or Excel).
+    :param exclude_rxns_filepath: Optional path to reactions to exclude file (csv, tsv, or Excel).
+    :param force_rxns_filepath: Optional path to reactions to force include file (csv, tsv, or Excel).
+    :param algorithm: Algorithm to use for reconstruction.
+    :param low_percentile: Low percentile for gene expression threshold calculation.
+    :param high_percentile: High percentile for gene expression threshold calculation.
+    :param solver: Solver to use. One of Solver.GLPK, Solver.CPLEX, Solver.GUROBI
+    :param log_level: Logging level
+    :param log_location: Location for log output. Can be a file path or sys.stderr/sys.stdout.
+    :param force_boundary_rxn_inclusion: If True, ensure that all provided boundary reactions are included in the final model.
+    :param build_settings: Optional ModelBuildSettings object to customize model building parameters.
+    :param close_unlisted_exchanges: If True, all exchanges not listed in the boundary reactions input will be closed
+
+    :raises ImportError: If Gurobi solver is selected but gurobipy is not installed.
+    """  # noqa: E501
     set_up_logging(level=log_level, location=log_location)
     if low_percentile is None:
         raise ValueError("low_percentile must be provided")

From e1b2e79b9b33a21a92c9434cb4cf003832f61502 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:29:26 -0600
Subject: [PATCH 34/41] refactor: improve error handling and streamline file
 path assignments in model creation

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 32 +++++++++++++---------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index cbcf3a08..d73106b1 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -891,13 +891,14 @@ def create_context_specific_model(  # noqa: C901
         raise ValueError("low_percentile must be provided")
     if high_percentile is None:
         raise ValueError("high_percentile must be provided")
-    # TODO: set up zfpkm threshold defaults
 
-    boundary_rxns_filepath: Path | None = Path(boundary_rxns_filepath) if boundary_rxns_filepath else None
-    output_model_filepaths = [output_model_filepaths] if isinstance(output_model_filepaths, Path) else output_model_filepaths
+    boundary_rxns_filepath = Path(boundary_rxns_filepath) if boundary_rxns_filepath else None
+    output_model_filepaths = (
+        [output_model_filepaths] if isinstance(output_model_filepaths, Path) else output_model_filepaths
+    )
 
-    if not reference_model.exists():
-        raise FileNotFoundError(f"Reference model not found at {reference_model}")
+    if not reference_model_filepath.exists():
+        raise FileNotFoundError(f"Reference model not found at {reference_model_filepath}")
     if not active_genes_filepath.exists():
         raise FileNotFoundError(f"Active genes file not found at {active_genes_filepath}")
     if algorithm == Algorithm.FASTCORE and not output_fastcore_expression_index_filepath:
@@ -927,23 +928,22 @@ def create_context_specific_model(  # noqa: C901
             reference_model=reference_model,
             close_unlisted_exchanges=close_unlisted_exchanges,
         )
-
-    boundary_reactions = None
-    if boundary_rxns_filepath:
-        boundary_reactions = await _collect_boundary_reactions(boundary_rxns_filepath)
+        if boundary_rxns_filepath
+        else _BoundaryReactions(reactions=[], lower_bounds=[], upper_bounds=[])
+    )
 
     exclude_rxns: list[str] = []
     if exclude_rxns_filepath:
-        exclude_rxns_filepath: Path = Path(exclude_rxns_filepath)
-        df = await _create_df(exclude_rxns_filepath)
+        exclude_rxns_filepath = Path(exclude_rxns_filepath)
+        df = _create_df(exclude_rxns_filepath)
         if "abbreviation" not in df.columns:
             raise ValueError("The exclude reactions file should have a single column with a header named Abbreviation")
         exclude_rxns = df["abbreviation"].tolist()
 
     force_rxns: list[str] = []
     if force_rxns_filepath:
-        force_rxns_filepath: Path = Path(force_rxns_filepath)
-        df = await _create_df(force_rxns_filepath, lowercase_col_names=True)
+        force_rxns_filepath = Path(force_rxns_filepath)
+        df = _create_df(force_rxns_filepath, lowercase_col_names=True)
         if "abbreviation" not in df.columns:
             raise ValueError("The force reactions file should have a single column with a header named Abbreviation")
         force_rxns = df["abbreviation"].tolist()
@@ -966,6 +966,12 @@ def create_context_specific_model(  # noqa: C901
                 "COMO will continue, but it is HIGHLY unlikely the resulting model will be valid."
             )
 
+    overlapping_rxns = set(force_rxns) & set(exclude_rxns)
+    if overlapping_rxns:
+        raise ValueError(
+            f"Reactions found in force-include and force-exclude sets: {','.join(sorted(overlapping_rxns))}"
+        )
+
     logger.info(f"context='{context_name}', reconstruction method='{algorithm.value}', solver='{solver.value}'")
     build_results: _BuildResults = await _build_model(
         general_model_file=reference_model,

From f7de63b175e5294919ca375a3812126e99b9978c Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:29:45 -0600
Subject: [PATCH 35/41] refactor: enhance model building parameters for clarity
 and consistency

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index d73106b1..c6eb4269 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -973,22 +973,24 @@ def create_context_specific_model(  # noqa: C901
         )
 
     logger.info(f"context='{context_name}', reconstruction method='{algorithm.value}', solver='{solver.value}'")
-    build_results: _BuildResults = await _build_model(
-        general_model_file=reference_model,
+    context_model: cobra.Model = _build_model(
+        reference_model=reference_model,
         gene_expression_file=active_genes_filepath,
         recon_algorithm=algorithm,
         objective=objective,
-        boundary_reactions=boundary_reactions.reactions if boundary_reactions else [],
+        objective_direction=objective_direction,
+        boundary_reactions=boundary_reactions.reactions,
         exclude_reactions=exclude_rxns,
         force_reactions=force_rxns,
-        lower_bounds=boundary_reactions.lower_bounds if boundary_reactions else [],
-        upper_bounds=boundary_reactions.upper_bounds if boundary_reactions else [],
+        lower_bounds=boundary_reactions.lower_bounds,
+        upper_bounds=boundary_reactions.upper_bounds,
         solver=solver.value.lower(),
-        low_thresh=low_threshold,
-        high_thresh=high_threshold,
+        low_percentile=low_percentile,
+        high_percentile=high_percentile,
         output_flux_result_filepath=output_flux_result_filepath,
         force_boundary_rxn_inclusion=force_boundary_rxn_inclusion,
-        taxon=taxon,
+        taxon=taxon_id,
+        build_settings=build_settings or ModelBuildSettings(),
     )
 
     build_results.infeasible_reactions.dropna(inplace=True)

From d1784919dde9f744d7811fa09676b22024639661 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:30:07 -0600
Subject: [PATCH 36/41] refactor: streamline context model creation and improve
 logging output

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/create_context_specific_model.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
index c6eb4269..b0dff92a 100644
--- a/main/como/create_context_specific_model.py
+++ b/main/como/create_context_specific_model.py
@@ -992,23 +992,25 @@ def create_context_specific_model(  # noqa: C901
         taxon=taxon_id,
         build_settings=build_settings or ModelBuildSettings(),
     )
-
-    build_results.infeasible_reactions.dropna(inplace=True)
-    build_results.infeasible_reactions.to_csv(output_infeasible_reactions_filepath, index=False)
+    context_model.id = contextualized_model_id or context_name
 
     if algorithm == Algorithm.FASTCORE:
-        fastcore_df = pd.DataFrame(build_results.expression_index_list)
+        fastcore_df = pd.DataFrame([r.id for r in context_model.reactions])
         fastcore_df.dropna(inplace=True)
         fastcore_df.to_csv(output_fastcore_expression_index_filepath, index=False)
 
-    await _write_model_to_disk(
+    _write_model_to_disk(
         context_name=context_name,
-        model=build_results.model,
+        model=context_model,
         output_filepaths=output_model_filepaths,
         mat_suffix=mat_suffix,
         json_suffix=json_suffix,
         xml_suffix=xml_suffix,
     )
     logger.info(
-        f"context={context_name}, reactions={len(build_results.model.reactions)}, genes={len(build_results.model.genes)}, metabolites={len(build_results.model.metabolites)}"
+        f"context={context_name}, "
+        f"reactions={len(context_model.reactions)}, "
+        f"genes={len(context_model.genes)}, "
+        f"metabolites={len(context_model.metabolites)}"
     )
+    return context_model

From cae84ea7124324a6fb6593127dc958fa2944a4f7 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:30:35 -0600
Subject: [PATCH 37/41] chore: use proper NumPy types for better compatibility

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/stats/_two_sample.py       |  2 +-
 main/como/stats/fisher_exact_test.py |  2 +-
 main/como/stats/ks_test.py           | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/main/como/stats/_two_sample.py b/main/como/stats/_two_sample.py
index e97dffbf..1b38bd4b 100644
--- a/main/como/stats/_two_sample.py
+++ b/main/como/stats/_two_sample.py
@@ -40,7 +40,7 @@ def _run(
         df2: pd.DataFrame,
         cores: int = 1,
         worker_kwargs: dict | None = None,
-    ) -> tuple[list[str], Mapping[str, npt.NDArray[float | np.uint8]]]:
+    ) -> tuple[list[str], Mapping[str, npt.NDArray[np.floating]]]:
         all_reactions = list(set(df1.columns) & set(df2.columns))
         array_a = df1[all_reactions].to_numpy(dtype=float, copy=False)
         array_b = df2[all_reactions].to_numpy(dtype=float, copy=False)
diff --git a/main/como/stats/fisher_exact_test.py b/main/como/stats/fisher_exact_test.py
index 068eb4c2..4312069c 100644
--- a/main/como/stats/fisher_exact_test.py
+++ b/main/como/stats/fisher_exact_test.py
@@ -97,5 +97,5 @@ def run(
                 else:
                     d += 1
 
-        result = scipy.stats.fisher_exact(np.array([[a, b], [c, d]]), alternative=alternative)
+        result = scipy.stats.fisher_exact(np.asarray([[a, b], [c, d]]), alternative=alternative)
         return cls(statistic=result.statistic, pvalue=result.pvalue, pathway=pathway, a=a, b=b, c=c, d=d)
diff --git a/main/como/stats/ks_test.py b/main/como/stats/ks_test.py
index 68fb1c2f..b41d57f0 100644
--- a/main/como/stats/ks_test.py
+++ b/main/como/stats/ks_test.py
@@ -21,13 +21,13 @@ class KSTest(BaseTwoSample[KS_RESULT]):
     }
 
     reaction_ids: list[str]
-    statistic: npt.NDArray[float]
-    pvalue: npt.NDArray[float]
-    statistic_location: npt.NDArray[float]
-    statistic_sign: npt.NDArray[int]
+    statistic: npt.NDArray[np.floating]
+    pvalue: npt.NDArray[np.floating]
+    statistic_location: npt.NDArray[np.floating]
+    statistic_sign: npt.NDArray[np.integer]
 
     @staticmethod
-    def _worker(a: npt.NDArray[float], b: npt.NDArray[float], **kwargs) -> KS_RESULT:
+    def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) -> KS_RESULT:
         """Calculate the KS statistic.
 
         Args:

From 53b183059eeb29497fccc24e8b6cf2bc720cb3f2 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:31:00 -0600
Subject: [PATCH 38/41] chore: use `mod.get_remaining_identifiers` in
 replacement of `mod.convert`

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 tests/unit/test_identifier_convert.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/unit/test_identifier_convert.py b/tests/unit/test_identifier_convert.py
index 72118877..736518cd 100644
--- a/tests/unit/test_identifier_convert.py
+++ b/tests/unit/test_identifier_convert.py
@@ -172,7 +172,7 @@ def test_get_conversion_scope_specific_mapping(mod, scope, results, expected):
 def test_convert_rejects_mixed_id_types(mod):
     # Mixed: Entrez-like ("1017") + symbol ("TP53") => ValueError
     with pytest.raises(ValueError, match="All items in ids must be of the same type"):
-        mod.convert(ids=["1017", "TP53"], taxon=9606)
+        mod.get_remaining_identifiers(ids=["1017", "TP53"], taxon=9606)
 
 
 def test_convert_single_id_passes_expected_scope_fields_taxon_and_cache(monkeypatch, mod):
@@ -190,7 +190,7 @@ def fake_get_conversion(*, info, values, scope, fields, taxon):
     monkeypatch.setattr(mod, "MyGeneInfo", CapturingMyGeneInfo)
     monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion)
 
-    df = mod.convert(ids=1017, taxon=9606, cache=False)
+    df = mod.get_remaining_identifiers(ids=1017, taxon=9606, cache=False)
 
     assert isinstance(df, pd.DataFrame)
     assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"}
@@ -228,7 +228,7 @@ def fake_get_conversion(*, info, values, scope, fields, taxon):
     monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion)
 
     ids = list(range(1, 1002))  # 1001 ids => should create 2 chunks: 1000 and 1
-    df = mod.convert(ids=ids, taxon="9606")
+    df = mod.get_remaining_identifiers(ids=ids, taxon="9606")
 
     assert len(df) == 1001
     assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"}
@@ -262,7 +262,7 @@ def fake_get_conversion(*, info, values, scope, fields, taxon):
     monkeypatch.setattr(mod, "MyGeneInfo", CapturingMyGeneInfo)
     monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion)
 
-    df = mod.convert(ids=["TP53", "BRCA1"], taxon=9606)
+    df = mod.get_remaining_identifiers(ids=["TP53", "BRCA1"], taxon=9606)
 
     assert len(df) == 2
     assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"}

From b661349fc288d180b57a844210f2f6c9e42d6067 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:39:53 -0600
Subject: [PATCH 39/41] chore: update built-in boundary reaction bounds to
 match expected format

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 .../boundary_rxns/naiveB_boundary_rxns.csv    | 102 +++++++++---------
 main/data/boundary_rxns/smB_boundary_rxns.csv |   2 +-
 2 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/main/data/boundary_rxns/naiveB_boundary_rxns.csv b/main/data/boundary_rxns/naiveB_boundary_rxns.csv
index be3eec35..11dac2c6 100644
--- a/main/data/boundary_rxns/naiveB_boundary_rxns.csv
+++ b/main/data/boundary_rxns/naiveB_boundary_rxns.csv
@@ -1,7 +1,7 @@
-Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate
-Exchange,glc_D,Extracellular,-1000,100
+Reaction,Abbreviation,Compartment,Lower Bound,Upper Bound
+Exchange,glc_D,Extracellular,-100,1000
 Exchange,fe2,Extracellular,-1000,1000
-Exchange,gal,Extracellular,-1000,0
+Exchange,gal,Extracellular,-0,1000
 Exchange,zn2,Extracellular,-1000,1000
 Exchange,ca2,Extracellular,-1000,1000
 Exchange,na1,Extracellular,-1000,1000
@@ -16,55 +16,55 @@ Exchange,co2,Extracellular,-1000,1000
 Exchange,nh4,Extracellular,-1000,1000
 Exchange,no2,Extracellular,-1000,1000
 Exchange,co,Extracellular,-1000,1000
-Exchange,asn_L,Extracellular,-1000,1
-Exchange,asp_L,Extracellular,-1000,1
-Exchange,glu_L,Extracellular,-1000,1
-Exchange,ile_L,Extracellular,-1000,1
-Exchange,leu_L,Extracellular,-1000,1
-Exchange,lys_L,Extracellular,-1000,1
-Exchange,met_L,Extracellular,-1000,1
-Exchange,pro_L,Extracellular,-1000,1
-Exchange,ser_L,Extracellular,-1000,1
-Exchange,val_L,Extracellular,-1000,1
-Exchange,gly,Extracellular,-1000,1
-Exchange,cys_L,Extracellular,-1000,1
-Exchange,ala_L,Extracellular,-1000,1
-Exchange,his_L,Extracellular,-1000,1
-Exchange,thr_L,Extracellular,-1000,1
-Exchange,gln_L,Extracellular,-1000,1
-Exchange,phe_L,Extracellular,-1000,1
-Exchange,tyr_L,Extracellular,-1000,1
-Exchange,arg_L,Extracellular,-1000,1
-Exchange,trp_L,Extracellular,-1000,1
-Exchange,eicostet,Extracellular,-1000,1
-Exchange,hdca,Extracellular,-1000,1
-Exchange,hdcea,Extracellular,-1000,1
-Exchange,lnlc,Extracellular,-1000,1
-Exchange,lnlnca,Extracellular,-1000,1
-Exchange,lnlncg,Extracellular,-1000,1
-Exchange,ocdca,Extracellular,-1000,1
-Exchange,ocdcea,Extracellular,-1000,1
-Exchange,thmmp,Extracellular,-1000,0
-Exchange,thmtp,Extracellular,-1000,0
-Exchange,ncam,Extracellular,-1000,0
-Exchange,pnto_R,Extracellular,-1000,0
-Exchange,pydxn,Extracellular,-1000,0
-Exchange,pydx,Extracellular,-1000,0
-Exchange,pydam,Extracellular,-1000,0
-Exchange,btn,Extracellular,-1000,0
-Exchange,fol,Extracellular,-1000,10
-Exchange,aqcobal,Extracellular,-1000,0
-Exchange,vitd3,Extracellular,-1000,1
-Exchange,ascb_L,Extracellular,-1000,0
-Exchange,retinol,Extracellular,-1000,10
-Exchange,retinal,Extracellular,-1000,10
-Exchange,ala_B,Extracellular,-1000,1
-Exchange,ala_D,Extracellular,-1000,1
+Exchange,asn_L,Extracellular,-1,1000
+Exchange,asp_L,Extracellular,-1,1000
+Exchange,glu_L,Extracellular,-1,1000
+Exchange,ile_L,Extracellular,-1,1000
+Exchange,leu_L,Extracellular,-1,1000
+Exchange,lys_L,Extracellular,-1,1000
+Exchange,met_L,Extracellular,-1,1000
+Exchange,pro_L,Extracellular,-1,1000
+Exchange,ser_L,Extracellular,-1,1000
+Exchange,val_L,Extracellular,-1,1000
+Exchange,gly,Extracellular,-1,1000
+Exchange,cys_L,Extracellular,-1,1000
+Exchange,ala_L,Extracellular,-1,1000
+Exchange,his_L,Extracellular,-1,1000
+Exchange,thr_L,Extracellular,-1,1000
+Exchange,gln_L,Extracellular,-1,1000
+Exchange,phe_L,Extracellular,-1,1000
+Exchange,tyr_L,Extracellular,-1,1000
+Exchange,arg_L,Extracellular,-1,1000
+Exchange,trp_L,Extracellular,-1,1000
+Exchange,eicostet,Extracellular,-1,1000
+Exchange,hdca,Extracellular,-1,1000
+Exchange,hdcea,Extracellular,-1,1000
+Exchange,lnlc,Extracellular,-1,1000
+Exchange,lnlnca,Extracellular,-1,1000
+Exchange,lnlncg,Extracellular,-1,1000
+Exchange,ocdca,Extracellular,-1,1000
+Exchange,ocdcea,Extracellular,-1,1000
+Exchange,thmmp,Extracellular,-0,1000
+Exchange,thmtp,Extracellular,-0,1000
+Exchange,ncam,Extracellular,-0,1000
+Exchange,pnto_R,Extracellular,-0,1000
+Exchange,pydxn,Extracellular,-0,1000
+Exchange,pydx,Extracellular,-0,1000
+Exchange,pydam,Extracellular,-0,1000
+Exchange,btn,Extracellular,-0,1000
+Exchange,fol,Extracellular,-10,1000
+Exchange,aqcobal,Extracellular,-0,1000
+Exchange,vitd3,Extracellular,-1,1000
+Exchange,ascb_L,Extracellular,-0,1000
+Exchange,retinol,Extracellular,-10,1000
+Exchange,retinal,Extracellular,-10,1000
+Exchange,ala_B,Extracellular,-1,1000
+Exchange,ala_D,Extracellular,-1,1000
 Exchange,h2co3,Extracellular,-1000,1000
 Exchange,h2o2,Extracellular,-1000,1000
 Exchange,hco3,Extracellular,-1000,1000
-Exchange,orn,Extracellular,-1000,1
-Exchange,orn_D,Extracellular,-1000,1
+Exchange,orn,Extracellular,-1,1000
+Exchange,orn_D,Extracellular,-1,1000
 Exchange,so4,Extracellular,-1000,1000
-Exchange,ribflv,Extracellular,-1000,1
-Exchange,Lcystin,Extracellular,-1000,1
+Exchange,ribflv,Extracellular,-1,1000
+Exchange,Lcystin,Extracellular,-1,1000
diff --git a/main/data/boundary_rxns/smB_boundary_rxns.csv b/main/data/boundary_rxns/smB_boundary_rxns.csv
index 9f0f9385..be12d9b7 100644
--- a/main/data/boundary_rxns/smB_boundary_rxns.csv
+++ b/main/data/boundary_rxns/smB_boundary_rxns.csv
@@ -1,4 +1,4 @@
-Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate
+Reaction,Abbreviation,Compartment,Lower Bound,Upper Bound
 Exchange,glc_D,Extracellular,-100,1000
 Exchange,fe2,Extracellular,-1000,1000
 Exchange,gal,Extracellular,0,1000

From 2628568be5fc3ad375cd95767f4b158fc5f3d1e6 Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 11:40:30 -0600
Subject: [PATCH 40/41] chore: ruff formatting

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 main/como/proteomics/FTPManager.py            |  8 +++++++-
 main/como/proteomics/proteomics_preprocess.py | 18 +++++++++++++++---
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/main/como/proteomics/FTPManager.py b/main/como/proteomics/FTPManager.py
index 8cca5095..be5a54a3 100644
--- a/main/como/proteomics/FTPManager.py
+++ b/main/como/proteomics/FTPManager.py
@@ -20,7 +20,13 @@
 from como.data_types import LogLevel
 
 
-async def aioftp_client(host: str, username: str = "anonymous", password: str = "guest", port: int = 21, max_attempts: int = 3) -> aioftp.Client:
+async def aioftp_client(
+    host: str,
+    username: str = "anonymous",
+    password: str = "guest",
+    port: int = 21,
+    max_attempts: int = 3,
+) -> aioftp.Client:
     """This class is responsible for creating a "client" connection"""
     connection_successful: bool = False
     attempt_num: int = 1
diff --git a/main/como/proteomics/proteomics_preprocess.py b/main/como/proteomics/proteomics_preprocess.py
index 04242443..79ff9850 100644
--- a/main/como/proteomics/proteomics_preprocess.py
+++ b/main/como/proteomics/proteomics_preprocess.py
@@ -159,7 +159,10 @@ def _gather_data(self):
 
             # Iterate through the URLs available
             for url, study in zip(ftp_urls, studies, strict=True):
-                ftp_data: FTPManager.Reader = FTPManager.Reader(root_link=url, file_extensions=self._preferred_extensions)
+                ftp_data: FTPManager.Reader = FTPManager.Reader(
+                    root_link=url,
+                    file_extensions=self._preferred_extensions,
+                )
 
                 urls = list(ftp_data.files)
                 sizes = list(ftp_data.file_sizes)
@@ -167,7 +170,14 @@ def _gather_data(self):
 
                 # Iterate through all files and sizes found for url_##
                 for file, size in zip(urls, sizes, strict=True):
-                    self.file_information.append(FileInformation(cell_type=cell_type, download_url=file, file_size=size, study=study))
+                    self.file_information.append(
+                        FileInformation(
+                            cell_type=cell_type,
+                            download_url=file,
+                            file_size=size,
+                            study=study,
+                        )
+                    )
 
     def print_download_size(self):
         """Print the total size to download if we must download data."""
@@ -327,7 +337,9 @@ def parse_args() -> argparse.Namespace:
     else:
         args.core_count = int(args.core_count)
         if args.core_count > os.cpu_count():
-            logger.info(f"{args.core_count} cores not available, system only has {os.cpu_count()} cores. Setting '--cores' to {os.cpu_count()}")
+            logger.info(
+                f"{args.core_count} cores not available, system only has {os.cpu_count()} cores. Setting '--cores' to {os.cpu_count()}"
+            )
             args.core_count = os.cpu_count()
 
     return args

From cd61f79c84c755be197865871b8f7fdc6ddc977b Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Fri, 6 Mar 2026 13:29:36 -0600
Subject: [PATCH 41/41] chore: update identifier pipeline tests

Signed-off-by: Josh Loecker <joshloecker@icloud.com>
---
 tests/unit/test_identifier_convert.py | 219 +++++++++++++-------------
 1 file changed, 109 insertions(+), 110 deletions(-)

diff --git a/tests/unit/test_identifier_convert.py b/tests/unit/test_identifier_convert.py
index 736518cd..3df2289f 100644
--- a/tests/unit/test_identifier_convert.py
+++ b/tests/unit/test_identifier_convert.py
@@ -1,4 +1,3 @@
-# tests/test_gene_conversion.py
 from __future__ import annotations
 
 import importlib
@@ -22,38 +21,38 @@ def mod(monkeypatch):
     # Stub bioservices.mygeneinfo.MyGeneInfo so importing the module works
     bioservices = types.ModuleType("bioservices")
     mygeneinfo = types.ModuleType("bioservices.mygeneinfo")
-
+    
     class DummyMyGeneInfo:
         def __init__(self, cache: bool = True):
             self.cache = cache
-
+        
         def get_queries(self, **kwargs):
             raise AssertionError(
                 "DummyMyGeneInfo.get_queries was called unexpectedly. "
                 "Patch mod.MyGeneInfo or use _get_conversion tests with a fake info object."
             )
-
+    
     mygeneinfo.MyGeneInfo = DummyMyGeneInfo
     bioservices.mygeneinfo = mygeneinfo
-
+    
     monkeypatch.setitem(sys.modules, "bioservices", bioservices)
     monkeypatch.setitem(sys.modules, "bioservices.mygeneinfo", mygeneinfo)
-
+    
     # Ensure a clean import each test
     if MODULE_NAME in sys.modules:
         del sys.modules[MODULE_NAME]
-
+    
     return importlib.import_module(MODULE_NAME)
 
 
 def test_determine_gene_type_accepts_single_string(mod):
-    assert mod.determine_gene_type("1017") == {"1017": "entrezgene"}
+    assert mod.determine_gene_type("1017") == "entrezgene"
 
 
 def test_determine_gene_type_classifies_common_formats_and_float(mod):
     # Includes a float to exercise the float-specific split(".")[0] branch.
     out = mod.determine_gene_type(["1017", "ENSG00000141510", "TP53", 123.0])
-
+    
     assert out["1017"] == "entrezgene"
     assert out["ENSG00000141510"] == "ensembl.gene"
     assert out["TP53"] == "symbol"
@@ -70,19 +69,14 @@ def test_determine_gene_type_ensembl_format_requires_ens_prefix_and_11_trailing_
     assert mod.determine_gene_type(["ENSGABCDEFGHIJK"])["ENSGABCDEFGHIJK"] == "symbol"
 
 
-# ---------------------------------------------------------------------------
-# _get_conversion tests
-# ---------------------------------------------------------------------------
-
-
 @dataclass
 class FakeInfo:
     return_value: object
     calls: list[dict] = None
-
+    
     def __post_init__(self):  # noqa: D105
         self.calls = []
-
+    
     def get_queries(self, **kwargs):
         self.calls.append(kwargs)
         return self.return_value
@@ -91,13 +85,13 @@ def get_queries(self, **kwargs):
 def test_get_conversion_raises_typeerror_when_results_not_list(mod):
     info = FakeInfo(return_value={"not": "a list"})
     with pytest.raises(TypeError, match="Expected results to be a list"):
-        mod._get_conversion(info=info, values=["1017"], scope="entrezgene", fields="ensembl.gene,symbol", taxon="9606")
+        mod._get_conversion(info=info, values=["1017"], taxon="9606")
 
 
 def test_get_conversion_raises_typeerror_when_first_item_not_dict(mod):
     info = FakeInfo(return_value=["not a dict"])
     with pytest.raises(TypeError, match="Expected each result to be a dict"):
-        mod._get_conversion(info=info, values=["1017"], scope="entrezgene", fields="ensembl.gene,symbol", taxon="9606")
+        mod._get_conversion(info=info, values=["1017"], taxon="9606")
 
 
 def test_get_conversion_calls_get_queries_with_expected_arguments(mod):
@@ -106,67 +100,58 @@ def test_get_conversion_calls_get_queries_with_expected_arguments(mod):
             {"query": "1017", "ensembl.gene": "ENSG00000123374", "symbol": "CDK2"},
         ]
     )
-
-    out = mod._get_conversion(
-        info=info, values=["1017"], scope="entrezgene", fields="ensembl.gene,symbol", taxon="9606"
-    )
-
+    
+    out = mod._get_conversion(info=info, values=["1017"], taxon="9606")
+    
+    # _get_conversion returns raw API results
     assert out == [
-        {"ensembl_gene_id": "ENSG00000123374", "entrez_gene_id": "1017", "gene_symbol": "CDK2"},
+        {"query": "1017", "ensembl.gene": "ENSG00000123374", "symbol": "CDK2"},
     ]
-
+    
     assert len(info.calls) == 1
     call = info.calls[0]
     assert call["query"] == "1017"
     assert call["dotfield"] is True
     assert call["scopes"] == "entrezgene"
-    assert call["fields"] == "ensembl.gene,symbol"
+    assert set(call["fields"].split(",")) == {
+        "ensembl.gene",
+        "entrezgene",
+        "symbol",
+        "genomic_pos.start",
+        "genomic_pos.end",
+        "taxid",
+        "notfound",
+    }
     assert call["species"] == "9606"
 
 
 @pytest.mark.parametrize(
-    ("scope", "results", "expected"),
+    ("results",),
     [
+        # Test data format: list of raw API results to pass through
         (
-            "entrezgene",
-            [
-                {"query": "1017", "ensembl.gene": "ENSG00000123374", "symbol": "CDK2"},
-                {"query": "1018", "ensembl.gene": "ENSG00000123400", "symbol": "CDK3"},
-            ],
-            [
-                {"ensembl_gene_id": "ENSG00000123374", "entrez_gene_id": "1017", "gene_symbol": "CDK2"},
-                {"ensembl_gene_id": "ENSG00000123400", "entrez_gene_id": "1018", "gene_symbol": "CDK3"},
-            ],
+                [
+                    {"query": "1017", "ensembl.gene": "ENSG00000123374", "symbol": "CDK2"},
+                    {"query": "1018", "ensembl.gene": "ENSG00000123400", "symbol": "CDK3"},
+                ],
         ),
         (
-            "ensembl.gene",
-            [
-                {"query": "ENSG00000141510", "entrezgene": "7157", "symbol": "TP53"},
-            ],
-            [
-                {"ensembl_gene_id": "ENSG00000141510", "entrez_gene_id": "7157", "gene_symbol": "TP53"},
-            ],
+                [
+                    {"query": "ENSG00000141510", "entrezgene": "7157", "symbol": "TP53"},
+                ],
         ),
         (
-            "symbol",
-            [
-                {"query": "TP53", "ensembl.gene": "ENSG00000141510", "entrezgene": "7157"},
-            ],
-            [
-                {"ensembl_gene_id": "ENSG00000141510", "entrez_gene_id": "7157", "gene_symbol": "TP53"},
-            ],
+                [
+                    {"query": "TP53", "ensembl.gene": "ENSG00000141510", "entrezgene": "7157"},
+                ],
         ),
     ],
 )
-def test_get_conversion_scope_specific_mapping(mod, scope, results, expected):
+def test_get_conversion_returns_raw_results(mod, results):
+    """Test that _get_conversion returns raw API results without transformation."""
     info = FakeInfo(return_value=results)
-    out = mod._get_conversion(info=info, values=[r["query"] for r in results], scope=scope, fields="x", taxon="9606")
-    assert out == expected
-
-
-# ---------------------------------------------------------------------------
-# convert tests
-# ---------------------------------------------------------------------------
+    out = mod._get_conversion(info=info, values=[r["query"] for r in results], taxon="9606")
+    assert out == results
 
 
 def test_convert_rejects_mixed_id_types(mod):
@@ -177,69 +162,75 @@ def test_convert_rejects_mixed_id_types(mod):
 
 def test_convert_single_id_passes_expected_scope_fields_taxon_and_cache(monkeypatch, mod):
     calls = []
-
+    
     class CapturingMyGeneInfo:
         def __init__(self, cache: bool = True):
             self.cache = cache
-
-    def fake_get_conversion(*, info, values, scope, fields, taxon):
-        calls.append({"info": info, "values": values, "scope": scope, "fields": fields, "taxon": taxon})
-        # Return one row per value
-        return [{"ensembl_gene_id": "ENSG_TEST", "entrez_gene_id": v, "gene_symbol": "SYM_TEST"} for v in values]
-
+    
+    def fake_get_conversion(*, info, values, taxon):
+        calls.append({"info": info, "values": values, "taxon": taxon})
+        # Return raw API format with taxid for filtering
+        value_list = [values] if isinstance(values, (int, str)) else values
+        return [
+            {"query": str(v), "ensembl.gene": "ENSG_TEST", "entrezgene": str(v), "symbol": f"SYM{v}", "taxid": taxon}
+            for v in value_list
+        ]
+    
     monkeypatch.setattr(mod, "MyGeneInfo", CapturingMyGeneInfo)
     monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion)
-
+    
     df = mod.get_remaining_identifiers(ids=1017, taxon=9606, cache=False)
-
+    
     assert isinstance(df, pd.DataFrame)
-    assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"}
+    assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol", "taxon_id"}
     assert len(df) == 1
     assert df.loc[0, "entrez_gene_id"] == "1017"
     assert df.loc[0, "ensembl_gene_id"] == "ENSG_TEST"
-    assert df.loc[0, "gene_symbol"] == "SYM_TEST"
-
+    assert df.loc[0, "gene_symbol"] == "SYM1017"
+    assert df.loc[0, "taxon_id"] == 9606
+    
     assert len(calls) == 1
     call = calls[0]
     assert isinstance(call["info"], CapturingMyGeneInfo)
     assert call["info"].cache is False
-    assert call["values"] == ["1017"]
-    assert call["scope"] == "entrezgene"
-    assert call["taxon"] == "9606"
+    assert call["values"] == 1017
+    assert call["taxon"] == 9606
 
-    # Fields are computed via set arithmetic; order is not guaranteed.
-    assert set(call["fields"].split(",")) == {"ensembl.gene", "symbol"}
 
-
-def test_convert_chunks_inputs_over_1000(monkeypatch, mod):
+def test_convert_large_input_list(monkeypatch, mod):
+    """Test that get_remaining_identifiers handles large input lists correctly."""
     calls = []
-
+    
     class CapturingMyGeneInfo:
         def __init__(self, cache: bool = True):
             self.cache = cache
-
-    def fake_get_conversion(*, info, values, scope, fields, taxon):
-        calls.append({"values_len": len(values), "scope": scope, "fields": fields, "taxon": taxon})
+    
+    def fake_get_conversion(*, info, values, taxon):
+        calls.append({"values_len": len(values), "taxon": taxon})
+        value_list = [values] if isinstance(values, (int, str)) else values
         return [
-            {"ensembl_gene_id": f"ENSG{v.zfill(11)}", "entrez_gene_id": v, "gene_symbol": f"SYM{v}"} for v in values
+            {
+                "query": str(v),
+                "ensembl.gene": f"ENSG{str(v).zfill(11)}",
+                "entrezgene": str(v),
+                "symbol": f"SYM{v}",
+                "taxid": taxon,
+            }
+            for v in value_list
         ]
-
+    
     monkeypatch.setattr(mod, "MyGeneInfo", CapturingMyGeneInfo)
     monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion)
-
-    ids = list(range(1, 1002))  # 1001 ids => should create 2 chunks: 1000 and 1
+    
+    ids = list(range(1, 1002))
     df = mod.get_remaining_identifiers(ids=ids, taxon="9606")
-
+    
     assert len(df) == 1001
-    assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"}
-
-    # Ensure chunking happened as expected
-    assert [c["values_len"] for c in calls] == [1000, 1]
-    for c in calls:
-        assert c["scope"] == "entrezgene"
-        assert c["taxon"] == "9606"
-        assert set(c["fields"].split(",")) == {"ensembl.gene", "symbol"}
-
+    assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol", "taxon_id"}
+    assert len(calls) == 1
+    assert calls[0]["values_len"] == 1001
+    assert calls[0]["taxon"] == "9606"
+    
     # Spot-check first and last rows keep ordering
     assert df.loc[0, "entrez_gene_id"] == "1"
     assert df.loc[0, "gene_symbol"] == "SYM1"
@@ -249,26 +240,34 @@ def fake_get_conversion(*, info, values, scope, fields, taxon):
 
 def test_convert_symbol_scope_fields_exclude_symbol(monkeypatch, mod):
     calls = []
-
+    
     class CapturingMyGeneInfo:
         def __init__(self, cache: bool = True):
             self.cache = cache
-
-    def fake_get_conversion(*, info, values, scope, fields, taxon):
-        calls.append({"scope": scope, "fields": fields, "taxon": taxon, "values": values})
-        # Return minimal row(s)
-        return [{"ensembl_gene_id": None, "entrez_gene_id": None, "gene_symbol": v} for v in values]
-
+    
+    def fake_get_conversion(*, info, values, taxon):
+        calls.append({"taxon": taxon, "values": values})
+        return [
+            {
+                "query": v,
+                "ensembl.gene": f"ENSG{v}_TEST",
+                "entrezgene": "123",
+                "symbol": v,
+                "taxid": taxon,
+            }
+            for v in values
+        ]
+    
     monkeypatch.setattr(mod, "MyGeneInfo", CapturingMyGeneInfo)
     monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion)
-
+    
     df = mod.get_remaining_identifiers(ids=["TP53", "BRCA1"], taxon=9606)
-
+    
     assert len(df) == 2
-    assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"}
-
+    assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol", "taxon_id"}
+    assert df.loc[0, "gene_symbol"] == "TP53"
+    assert df.loc[1, "gene_symbol"] == "BRCA1"
+    assert df.loc[0, "taxon_id"] == 9606
+    
     assert len(calls) == 1
-    assert calls[0]["scope"] == "symbol"
-    assert calls[0]["taxon"] == "9606"
-    # When scope is "symbol", requested fields should be the other two
-    assert set(calls[0]["fields"].split(",")) == {"ensembl.gene", "entrezgene"}
+    assert calls[0]["taxon"] == 9606