From 03dfc79e752444ccca24d2e037c828ff6c68a337 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 10:24:40 -0600 Subject: [PATCH 01/41] chore: bump zFPKM version requirement Signed-off-by: Josh Loecker --- pyproject.toml | 2 +- uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ede17cb1..fb234ecc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "statsmodels>=0.13.0; python_version < '3.12'", "statsmodels>=0.14.0; python_version >= '3.12'", "troppo@git+https://github.com/JoshLoecker/troppo@master", - "zfpkm>=1.0.3", + "zfpkm>=1.1.0", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index 9139aea2..64be9f70 100644 --- a/uv.lock +++ b/uv.lock @@ -526,7 +526,7 @@ requires-dist = [ { name = "statsmodels", marker = "python_full_version < '3.12'", specifier = ">=0.13.0" }, { name = "statsmodels", marker = "python_full_version >= '3.12'", specifier = ">=0.14.0" }, { name = "troppo", git = "https://github.com/JoshLoecker/troppo?rev=master" }, - { name = "zfpkm", specifier = ">=1.0.3" }, + { name = "zfpkm", specifier = ">=1.1.0" }, ] provides-extras = ["gurobi", "interactive"] @@ -3697,7 +3697,7 @@ wheels = [ [[package]] name = "zfpkm" -version = "1.0.3" +version = "1.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "loguru" }, @@ -3705,9 +3705,9 @@ dependencies = [ { name = "numpy" }, { name = "pandas" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e3/7f/ff714f85601cd66439f2beed0d740772509b32b8be5a8b01a53652248714/zfpkm-1.0.3.tar.gz", hash = "sha256:58830ea61e6adc0c75f28d5304885bd03a33a6e9e56aa693856cbb37e30a5046", size = 15410, upload-time = "2025-11-10T16:47:45.614Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0d/bf/43471c26e47fc38b5f748723e3bfa104f77ee0352f5b6a242f8b01786ae4/zfpkm-1.1.0.tar.gz", hash = "sha256:c159f78703d9d853c5f8cbbc590fb9381f097329487947d4eaf95c72ab309870", size = 15623, upload-time = "2026-03-06T16:22:29.929Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/11/f8/ef2baeaf2d15682d5d663c3f5165b63abad114fc0c4cd90b67b1ed0a6456/zfpkm-1.0.3-py3-none-any.whl", hash = "sha256:085007f97e75e50d686677ee28e3fceba5fc19958b35e9fbad3756ca2302a219", size = 17841, upload-time = "2025-11-10T16:47:44.805Z" }, + { url = "https://files.pythonhosted.org/packages/85/a1/10cf3c5268131d37a4a968a1f5f935a07e81cb78dbb83e8bedc4a835c048/zfpkm-1.1.0-py3-none-any.whl", hash = "sha256:765d1785a22729adeb89732da01ba47abcbc9597c17c587e42176398a758b7a3", size = 18103, upload-time = "2026-03-06T16:22:29.104Z" }, ] [[package]] From 1e2e14f434b45ceea2aa6d986b2ed3d1ed977d45 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 10:25:05 -0600 Subject: [PATCH 02/41] chore: allow ruff to fix imports and `__all__` sections Signed-off-by: Josh Loecker --- ruff.toml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ruff.toml b/ruff.toml index 691022d2..edc19396 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,3 +1,4 @@ +src = ["main/como"] line-length = 120 extend-include = ["docs/**/*.py", "tests/**/*.py", "**/*.ipynb"] @@ -6,7 +7,11 @@ quote-style = "double" docstring-code-format = true [lint] -extend-fixable = ["I001"] +extend-fixable = [ + "I001", + "RUF022", + "W293", +] # Linting rules: https://docs.astral.sh/ruff/rules/ unfixable = [ "F401", # warn about, but do not remove, unused imports @@ -45,6 +50,9 @@ ignore = [ "TRY003", # allow exception messages outside the `Exception` class ] +[lint.isort] +known-first-party = ["como"] + [lint.pydocstyle] convention = "google" From a6b25e9d548b0d487d18c2317580b1ee2440125d Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 10:27:52 -0600 Subject: [PATCH 03/41] chore: bump bioservices requirement Signed-off-by: Josh Loecker --- pyproject.toml | 2 +- uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fb234ecc..7a277114 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ requires-python = ">=3.11,<3.14" dependencies = [ "aioftp>=0.23.1", "anndata>=0.12.0", - "bioservices>=1.12.1", + "bioservices>=1.13.0", "cobamp@git+https://github.com/JoshLoecker/cobamp@master", "cobra>=0.28.0", "joypy>=0.2.6", diff --git a/uv.lock b/uv.lock index 64be9f70..09d79c73 100644 --- a/uv.lock +++ b/uv.lock @@ -181,7 +181,7 @@ wheels = [ [[package]] name = "bioservices" -version = "1.12.1" +version = "1.13.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "appdirs" }, @@ -201,9 +201,9 @@ dependencies = [ { name = "wrapt" }, { name = "xmltodict" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/5e/51/1a8dc87ffc6c27e0a896b982da3ff1b483f5ef85a47ae1ffafbc6a479bb4/bioservices-1.12.1.tar.gz", hash = "sha256:0f31782ae50930d4ab82b43f98d1ca2cc9befaa699f3a7a6cba512a8f9e2cab0", size = 218752, upload-time = "2025-02-27T22:38:58.628Z" } +sdist = { url = "https://files.pythonhosted.org/packages/99/8c/b9a20b6656446daaea5f3085d5465aad8f536c95fd14ffd6fcdcdf410031/bioservices-1.13.0.tar.gz", hash = "sha256:24d30be650d37c1377b4fd452abead9ec5ee891dbdaa96a21543ab508d401386", size = 220384, upload-time = "2026-03-02T14:49:11.002Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b8/c9/656854139abbdc0a09d544bacc6fa3132245ebf9847dd45da9a8e416c5a5/bioservices-1.12.1-py3-none-any.whl", hash = "sha256:898033fe0158a0e31ea5ad93ff02f5000aa67ff9d06e9dc6477583d3f60ace80", size = 258032, upload-time = "2025-02-27T22:38:56.406Z" }, + { url = "https://files.pythonhosted.org/packages/27/49/93a413b3b70a96e525f11212d0ca7992c4c7b35c47da78bf227a52282e85/bioservices-1.13.0-py3-none-any.whl", hash = "sha256:aaafb5ee246bccd9323a21de12ea6a35163e01c1ed4777e45acbda565804fc32", size = 259727, upload-time = "2026-03-02T14:49:09.879Z" }, ] [[package]] @@ -504,7 +504,7 @@ dev = [ requires-dist = [ { name = "aioftp", specifier = ">=0.23.1" }, { name = "anndata", specifier = ">=0.12.0" }, - { name = "bioservices", specifier = ">=1.12.1" }, + { name = "bioservices", specifier = ">=1.13.0" }, { name = "cobamp", git = "https://github.com/JoshLoecker/cobamp?rev=master" }, { name = "cobra", specifier = ">=0.28.0" }, { name = "gurobipy", marker = "extra == 'gurobi'", specifier = "<14" }, From 027f97a38d80277852af1f5f12bf4baa13b814a3 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 10:29:06 -0600 Subject: [PATCH 04/41] chore: remove the `log_and_raise_error` helper function for better visibility of where errors are being raised Signed-off-by: Josh Loecker --- main/como/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/main/como/utils.py b/main/como/utils.py index 98daca16..a0c51b80 100644 --- a/main/como/utils.py +++ b/main/como/utils.py @@ -18,7 +18,6 @@ T = TypeVar("T") __all__ = [ "get_missing_gene_data", - "log_and_raise_error", "num_columns", "num_rows", "read_file", From fa75fc6bda552d4a1cc3f73ee7f884cf0ddad4b2 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 10:31:50 -0600 Subject: [PATCH 05/41] chore: remove the internal `log_and_raise_error` helper function for better visibility of where errors are being raised Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index 2cfdd1a1..564c2025 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -22,7 +22,7 @@ _SourceWeights, ) from como.project import Config -from como.utils import get_missing_gene_data, log_and_raise_error, read_file, return_placeholder_data, set_up_logging +from como.utils import get_missing_gene_data, read_file, return_placeholder_data, set_up_logging class _MergedHeaderNames: From f59e21a67153ff1737aaf01fc9e14b5233725787 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 10:36:07 -0600 Subject: [PATCH 06/41] feat: add genomic conversion pipelines directly into COMO using `bioservices` Signed-off-by: Josh Loecker --- main/como/pipelines/identifier.py | 221 +++++++++++++++++++++++------- 1 file changed, 170 insertions(+), 51 deletions(-) diff --git a/main/como/pipelines/identifier.py b/main/como/pipelines/identifier.py index 59fe7444..e75c46af 100644 --- a/main/como/pipelines/identifier.py +++ b/main/como/pipelines/identifier.py @@ -1,37 +1,55 @@ -from collections.abc import Sequence -from typing import Literal +from collections.abc import Iterable, Iterator, Sequence +from typing import Any, Literal, overload import pandas as pd from bioservices.mygeneinfo import MyGeneInfo +from tqdm import tqdm __all__ = [ - "convert", + "build_gene_info", "determine_gene_type", + "get_remaining_identifiers", ] +T_IDS = int | str | Iterable[int] | Iterable[str] | Iterable[int | str] T_MG_SCOPE = Literal["entrezgene", "ensembl.gene", "symbol"] T_MG_TRANSLATE = Literal["entrez_gene_id", "ensembl_gene_id", "gene_symbol"] T_MG_RETURN = list[dict[T_MG_TRANSLATE, str]] -def _get_conversion(info: MyGeneInfo, values: list[str], scope: T_MG_SCOPE, fields: str, taxon: str) -> T_MG_RETURN: - value_str = ",".join(map(str, values)) - results = info.get_queries(query=value_str, dotfield=True, scopes=scope, fields=fields, species=taxon) - if not isinstance(results, list): - raise TypeError(f"Expected results to be a list, but got {type(results)}") - if not isinstance(results[0], dict): - raise TypeError(f"Expected each result to be a dict, but got {type(results[0])}") - - data: T_MG_RETURN = [] - for result in results: - ensembl = result.get("query" if scope == "ensembl.gene" else "ensembl.gene") - entrez = result.get("query" if scope == "entrezgene" else "entrezgene") - symbol = result.get("query" if scope == "symbol" else "symbol") - data.append({"ensembl_gene_id": ensembl, "entrez_gene_id": entrez, "gene_symbol": symbol}) +def _get_conversion(info: MyGeneInfo, values: T_IDS, taxon: str | int) -> list[dict[str, Any]]: + value_list = sorted(map(str, [values] if isinstance(values, (int, str)) else values)) + data_type = determine_gene_type(value_list) + if not all(v == data_type[value_list[0]] for v in data_type.values()): + raise ValueError("All items in ids must be of the same type (Entrez, Ensembl, or symbols).") + + chunk_size = 1000 + taxon_str = str(taxon) + scope: T_MG_SCOPE = next(iter(data_type.values())) + data = [] + chunks = range(0, len(value_list), chunk_size) + + for i in tqdm(chunks, desc=f"Getting info for '{scope}'"): + result = info.get_queries( + query=",".join(map(str, value_list[i : i + chunk_size])), + dotfield=True, + scopes=scope, + fields="ensembl.gene,entrezgene,symbol,genomic_pos.start,genomic_pos.end,taxid,notfound", + species=taxon_str, + ) + if isinstance(result, int) and result == 414: + raise ValueError( + f"Query too long. Reduce the number of IDs in each query chunk (current chunk size: {chunk_size})." + ) + if not isinstance(result, list): + raise TypeError(f"Expected results to be a list, but got {type(result)}") + if not isinstance(result[0], dict): + raise TypeError(f"Expected each result to be a dict, but got {type(result[0])}") + data.extend(result) return data -def convert(ids: int | str | Sequence[int] | Sequence[str] | Sequence[int | str], taxon: int | str, cache: bool = True): +def get_remaining_identifiers(ids: T_IDS, taxon: int | str, cache: bool = True): """Convert between genomic identifiers. This function will convert between the following components: @@ -46,33 +64,111 @@ def convert(ids: int | str | Sequence[int] | Sequence[str] | Sequence[int | str] :return: DataFrame with columns "entrez_gene_id", "ensembl_gene_id", and "gene_symbol" """ my_geneinfo = MyGeneInfo(cache=cache) - chunk_size = 1000 - id_list = list(map(str, [ids] if isinstance(ids, (int, str)) else ids)) - chunks = list(range(0, len(id_list), chunk_size)) + gene_data = _get_conversion(info=my_geneinfo, values=ids, taxon=taxon) + df = ( + pd.json_normalize(gene_data) + .rename( + columns={ + "ensembl.gene": "ensembl_gene_id", + "entrezgene": "entrez_gene_id", + "symbol": "gene_symbol", + "taxid": "taxon_id", + } + ) + .drop( + columns=["query", "_id", "_score", "genomic_pos.end", "genomic_pos.start"], + errors="ignore", + ) + ) + df = df[df["taxon_id"] == taxon] + df["taxon_id"] = df["taxon_id"].astype(int, copy=True) + + # BUG: For an unknown reason, some Ensembl IDs are actually Entrez IDs + # To filter these, two approaches can be done: + # 1) Remove rows where Ensembl IDs are integers + # 2) Remove rows where Ensembl IDs equal Entrez IDs + # We are selecting option 1 because it goes for the root cause: Ensembl IDs are not pure integers + mask = df["ensembl_gene_id"].astype(str).str.fullmatch(r"\d+").fillna(False) + df = df[ + (df["ensembl_gene_id"].astype("string").notna()) # remove NA values + & (~df["ensembl_gene_id"].astype("string").str.fullmatch(r"\d+")) # remove Entrez IDs + ] + return df + + +def _to_scalar(val) -> int: + """Calculate the distance between end (e) and start (s).""" + if isinstance(val, list): + return int(sum(val) / len(val)) if val else 0 # `if val` checks that the list contains items + if pd.isna(val): + return 0 + return int(val) + + +def build_gene_info(ids: T_IDS, taxon: int | str, cache: bool = True): + """Get genomic information from a given set of IDs. + + The input should be of the same type, otherwise this function will fail. + Expected types are: + - Ensembl Gene ID + - Entrez Gene ID + - Gene Symbol - data_type = determine_gene_type(id_list) - if not all(v == data_type[id_list[0]] for v in data_type.values()): - raise ValueError("All items in ids must be of the same type (Entrez, Ensembl, or symbols).") + The returned data frame will have the following columns: + - ensembl_gene_id + - entrez_gene_id + - gene_symbol + - size (distance between genomic end and start) - scope = next(iter(data_type.values())) - fields = ",".join({"ensembl.gene", "entrezgene", "symbol"} - {scope}) - taxon_str = str(taxon) - return pd.DataFrame( - [ - row - for i in chunks - for row in _get_conversion( - info=my_geneinfo, - values=id_list[i : i + chunk_size], - scope=scope, - fields=fields, - taxon=taxon_str, - ) - ] + :param ids: IDs to be converted + :param taxon: Taxonomic identifier + :param cache: Should local caching be used for queries + :return: pandas.DataFrame + """ + my_geneinfo = MyGeneInfo(cache=cache) + gene_data = _get_conversion(info=my_geneinfo, values=ids, taxon=taxon) + df = pd.json_normalize(gene_data).rename(columns={"taxid": "taxon_id"}) + df = df[df["taxon_id"] == taxon] + df["taxon_id"] = df["taxon_id"].astype(int, copy=True) + + df["size"] = df["genomic_pos.end"].fillna(0).map(_to_scalar) - df["genomic_pos.start"].fillna(0).map(_to_scalar) + df = ( + df[~(df["size"] == 0)] + .drop( + columns=[ + "query", + "_id", + "_score", + "genomic_pos.start", + "genomic_pos.end", + "notfound", + ], + inplace=False, + errors="ignore", + ) + .rename( + columns={ + "ensembl.gene": "ensembl_gene_id", + "entrezgene": "entrez_gene_id", + "symbol": "gene_symbol", + } + ) + .explode(column=["ensembl_gene_id"]) + .sort_values(by="ensembl_gene_id", inplace=False) ) + return df + + +@overload +def determine_gene_type(items: str, /) -> T_MG_SCOPE: ... + + +@overload +def determine_gene_type(items: Sequence[str], /) -> dict[str, T_MG_SCOPE]: ... + -def determine_gene_type(items: str | list[str], /) -> dict[str, T_MG_SCOPE]: +def determine_gene_type(items: str | Sequence[str], /) -> str | dict[str, T_MG_SCOPE]: """Determine the genomic data type. :param items: A string or list of strings representing gene identifiers. @@ -85,16 +181,39 @@ def determine_gene_type(items: str | list[str], /) -> dict[str, T_MG_SCOPE]: followed by a specific format (length greater than 11 and the last 11 characters are digits). - "gene_symbol": If the item does not match the above criteria, it is assumed to be a gene symbol. """ - items = [items] if isinstance(items, str) else items - - determine: dict[str, Literal["entrezgene", "ensembl.gene", "symbol"]] = {} - for i in items: - i_str = str(i).split(".")[0] if isinstance(i, float) else str(i) - if i_str.isdigit(): - determine[i_str] = "entrezgene" - elif i_str.startswith("ENS") and (len(i_str) > 11 and all(i.isdigit() for i in i_str[-11:])): - determine[i_str] = "ensembl.gene" + values = (items,) if isinstance(items, str) else items + result: dict[str, Literal["entrezgene", "ensembl.gene", "symbol"]] = {} + + for i in values: + s = str(i).partition(".")[0] # remove any transcripts that may exist + + if s.startswith("ENS") and len(s) > 11 and s[-11:].isdigit(): + result[s] = "ensembl.gene" + elif s.isdigit(): + result[s] = "entrezgene" else: - determine[i_str] = "symbol" + result[s] = "symbol" + + if isinstance(items, str): + return result[items] + return result + - return determine +def contains_identical_gene_types(values: dict[str, T_MG_SCOPE] | Sequence[T_MG_SCOPE]) -> bool: + data = values if not isinstance(values, dict) else list(values.values()) + return all(v == data[0] for v in data) + + +if __name__ == "__main__": + r = build_gene_info( + ids=[ + "ENSG00000284484", + "ENSG00000299311", + "ENSG00000202151", + "ENSG00000226053", + "ENSG00000131831", + ], + taxon=9606, + cache=True, + ) + print(r.columns) From 391188020a3b6e489c1d9c5fee850c4192c5c8d0 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 10:41:21 -0600 Subject: [PATCH 07/41] feat: add genomic conversion pipelines directly into COMO using `bioservices` Signed-off-by: Josh Loecker --- main/como/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/como/utils.py b/main/como/utils.py index a0c51b80..11a34f16 100644 --- a/main/como/utils.py +++ b/main/como/utils.py @@ -13,7 +13,7 @@ from loguru import logger from como.data_types import LOG_FORMAT, Algorithm, LogLevel -from como.pipelines.identifier import convert +from como.pipelines.identifier import get_remaining_identifiers T = TypeVar("T") __all__ = [ @@ -134,7 +134,7 @@ def get_missing_gene_data(values: Sequence[str] | pd.DataFrame | sc.AnnData, tax # second isinstance required for static type check to be happy # if isinstance(values, list) and not isinstance(values, pd.DataFrame): if isinstance(values, list): - return convert(ids=values, taxon=taxon_id) + return get_remaining_identifiers(ids=values, taxon=taxon_id) elif isinstance(values, pd.DataFrame): # raise error if duplicate column names exist if any(values.columns.duplicated(keep=False)): From 46f30cd336b15701c2a36c98f548d33163640817 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 10:41:55 -0600 Subject: [PATCH 08/41] feat: add genomic conversion pipelines directly into COMO using `bioservices` Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 24d5e8b6..b4ef2841 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -18,7 +18,7 @@ from loguru import logger from como.data_types import LogLevel, RNAType -from como.pipelines.identifier import convert +from como.pipelines.identifier import build_gene_info, get_remaining_identifiers from como.utils import read_file, set_up_logging @@ -473,7 +473,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]: if isinstance(data_, pd.DataFrame): return data_["ensembl_gene_id"].tolist() try: - conversion = convert(ids=data_.var_names.tolist(), taxon=taxon) + conversion = get_remaining_identifiers(ids=data_.var_names.tolist(), taxon=taxon) except json.JSONDecodeError as e: raise ValueError(f"Got a JSON decode error for file '{counts_matrix_filepaths}' ({e})") From f2b3277e5202a7229fbc3a24396c907fa22455e3 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 10:43:16 -0600 Subject: [PATCH 09/41] feat: add genomic conversion pipelines directly into COMO using `bioservices` Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 5 ++--- main/como/rnaseq_gen.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 5bb39d24..24f8889a 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -18,8 +18,7 @@ _OutputCombinedSourceFilepath, _SourceWeights, ) -from como.pipelines.identifier import convert -from como.utils import LogLevel, get_missing_gene_data, log_and_raise_error, num_columns +from como.pipelines.identifier import get_remaining_identifiers from como.utils import num_columns @@ -287,7 +286,7 @@ async def _begin_combining_distributions( matrix_subset = matrix_subset.set_index(keys=[GeneIdentifier.entrez_gene_id.value], drop=True) matrix_subset = matrix_subset.drop(columns=["gene_symbol", "ensembl_gene_id"], errors="ignore") elif isinstance(matrix, sc.AnnData): - conversion = convert(ids=matrix.var_names.tolist(), taxon=taxon) + conversion = get_remaining_identifiers(ids=matrix.var_names.tolist(), taxon=taxon) conversion.reset_index(drop=False, inplace=True) matrix: pd.DataFrame = matrix.to_df().T matrix.reset_index(inplace=True, drop=False, names=["gene_symbol"]) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 3f8c00db..5a5588ae 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -24,7 +24,7 @@ from como.data_types import FilteringTechnique, LogLevel, RNAType from como.migrations import gene_info_migrations -from como.pipelines.identifier import convert +from como.pipelines.identifier import contains_identical_gene_types, determine_gene_type from como.project import Config from como.utils import read_file, set_up_logging From d4e7ea8d3fc8260a1c40851ad154a53a0f08ff8a Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 10:45:05 -0600 Subject: [PATCH 10/41] feat: extract gene-info building as a pipeline component Signed-off-by: Josh Loecker --- main/como/pipelines/identifier.py | 21 +++----- main/como/rnaseq_preprocess.py | 86 +------------------------------ 2 files changed, 8 insertions(+), 99 deletions(-) diff --git a/main/como/pipelines/identifier.py b/main/como/pipelines/identifier.py index e75c46af..e09bd9dc 100644 --- a/main/como/pipelines/identifier.py +++ b/main/como/pipelines/identifier.py @@ -7,6 +7,7 @@ __all__ = [ "build_gene_info", + "contains_identical_gene_types", "determine_gene_type", "get_remaining_identifiers", ] @@ -200,20 +201,10 @@ def determine_gene_type(items: str | Sequence[str], /) -> str | dict[str, T_MG_S def contains_identical_gene_types(values: dict[str, T_MG_SCOPE] | Sequence[T_MG_SCOPE]) -> bool: + """Check if all values in the input are identical. + + :param values: A dictionary mapping gene identifiers to their types or a sequence of gene types. + :return: True if all values are identical, False otherwise. + """ data = values if not isinstance(values, dict) else list(values.values()) return all(v == data[0] for v in data) - - -if __name__ == "__main__": - r = build_gene_info( - ids=[ - "ENSG00000284484", - "ENSG00000299311", - "ENSG00000202151", - "ENSG00000226053", - "ENSG00000131831", - ], - taxon=9606, - cache=True, - ) - print(r.columns) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index b4ef2841..ec1a4f81 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -486,90 +486,8 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]: "depending on the number of genes and your internet connection" ) - ensembl_ids: set[str] = set( - chain.from_iterable(await asyncio.gather(*[read_ensembl_gene_ids(f) for f in counts_matrix_filepaths])) - ) - gene_data: list[dict[str, str | int | list[str] | list[int] | None]] = await MyGene(cache=cache).query( - items=list(ensembl_ids), - taxon=taxon, - scopes="ensemblgene", - ) - - n = len(gene_data) - all_gene_symbols: list[str] = ["-"] * n - all_entrez_ids: list[str | int] = ["-"] * n - all_ensembl_ids: list[str] = ["-"] * n - all_sizes: list[int] = [-1] * n - - def _avg_pos(value: int | list[int] | None) -> int: - if value is None: - return 0 - if isinstance(value, list): - return int(sum(value) / len(value)) if value else 0 - return int(value) - - for i, data in enumerate(gene_data): - data: dict[str, str | int | list[str] | list[int] | None] - if "genomic_pos.start" not in data: - log_and_raise_error( - message="Unexpectedly missing key 'genomic_pos.start'", error=KeyError, level=LogLevel.WARNING - ) - if "genomic_pos.end" not in data: - log_and_raise_error( - message="Unexpectedly missing key 'genomic_pos.end'", error=KeyError, level=LogLevel.WARNING - ) - if "ensembl.gene" not in data: - log_and_raise_error( - message="Unexpectedly missing key 'ensembl.gene'", error=KeyError, level=LogLevel.WARNING - ) - - start = data["genomic_pos.start"] - end = data["genomic_pos.end"] - ensembl_id = data["ensembl.gene"] - - if not isinstance(start, int): - log_and_raise_error( - message=f"Unexpected type for 'genomic_pos.start': expected int, got {type(start)}", - error=TypeError, - level=LogLevel.WARNING, - ) - if not isinstance(end, int): - log_and_raise_error( - message=f"Unexpected type for 'genomic_pos.end': expected int, got {type(start)}", - error=TypeError, - level=LogLevel.WARNING, - ) - if not isinstance(ensembl_id, str): - log_and_raise_error( - message=f"Unexpected type for 'ensembl.gene': expected str, got {type(ensembl_id)}", - error=ValueError, - level=LogLevel.WARNING, - ) - - size = end - start - all_ensembl_ids[i] = ",".join(map(str, ensembl_id)) if isinstance(ensembl_id, list) else ensembl_id - all_gene_symbols[i] = str(data.get("symbol", "-")) - all_entrez_ids[i] = str(data.get("entrezgene", "-")) - all_sizes[i] = max(size, -1) # use `size` otherwise -1 - - gene_info: pd.DataFrame = pd.DataFrame( - { - "ensembl_gene_id": all_ensembl_ids, - "gene_symbol": all_gene_symbols, - "entrez_gene_id": all_entrez_ids, - "size": all_sizes, - } - ) - - # remove rows where every gene size value is -1 (not available) - gene_info = gene_info[~(gene_info == -1).all(axis=1)] - - gene_info["ensembl_gene_id"] = gene_info["ensembl_gene_id"].str.split(",") # extend lists into multiple rows - gene_info = gene_info.explode(column=["ensembl_gene_id"]) - # we would set `entrez_gene_id` to int here as well, but not all ensembl ids are mapped to entrez ids, - # and as a result, there are still "-" values in the entrez id column that cannot be converted to an integer - - gene_info = gene_info.sort_values(by="ensembl_gene_id") + ensembl_ids: set[str] = set(chain.from_iterable(read_ensembl_gene_ids(f) for f in counts_matrix_filepaths)) + gene_info = build_gene_info(ids=ensembl_ids, taxon=taxon, cache=cache) output_filepath.parent.mkdir(parents=True, exist_ok=True) gene_info.to_csv(output_filepath, index=False) logger.success(f"Gene Info file written at '{output_filepath}'") From b75e126f47edba9d0df3217bbde76ef4b29f8cf1 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 10:58:29 -0600 Subject: [PATCH 11/41] chore: add types for ModelBuildSettings - Add `ModelBuildSettings` dataclass for solver parameters - Remove unused PeakIdentificationParameters and _BuildResults - Minor formatting cleanup Signed-off-by: Josh Loecker --- main/como/data_types.py | 122 +++++++++++++++++++++++++++++++++++----- 1 file changed, 107 insertions(+), 15 deletions(-) diff --git a/main/como/data_types.py b/main/como/data_types.py index ca0f9b8c..07cd3ba8 100644 --- a/main/como/data_types.py +++ b/main/como/data_types.py @@ -4,9 +4,11 @@ from dataclasses import dataclass, field, fields from enum import Enum from pathlib import Path -from typing import ClassVar, NamedTuple +from typing import ClassVar, NamedTuple, NotRequired, TypedDict import cobra +import numpy as np +import numpy.typing as npt import pandas as pd from loguru import logger @@ -78,11 +80,6 @@ class SourceTypes(str, Enum): proteomics = "proteomics" -class PeakIdentificationParameters(NamedTuple): - height: float = 0.02 - distance: float = 1.0 - - class CobraCompartments: """Convert from compartment "long-hand" to "short-hand". @@ -135,7 +132,9 @@ class CobraCompartments: "s": ["eyespot", "eyespot apparatus", "stigma"], } - _REVERSE_LOOKUP: ClassVar[dict[str, list[str]]] = {value.lower(): key for key, values in SHORTHAND.items() for value in values} + _REVERSE_LOOKUP: ClassVar[dict[str, list[str]]] = { + value.lower(): key for key, values in SHORTHAND.items() for value in values + } @classmethod def get_shorthand(cls, longhand: str) -> str | None: @@ -163,14 +162,6 @@ def get_longhand(cls, shorthand: str) -> str | None: return longhand[0] if longhand else None -class _BuildResults(NamedTuple): - """Results of building a context specific model.""" - - model: cobra.Model - expression_index_list: list[int] - infeasible_reactions: pd.DataFrame - - class _BoundaryReactions(NamedTuple): """Boundary reactions to be used in the context specific model.""" @@ -269,3 +260,104 @@ class _SourceWeights(_BaseDataType): mrna: int scrna: int proteomics: int + + +@dataclass +class ModelBuildSettings: + troppo_epsilon: float = 1e-4 + min_reaction_flux: float = 1e-7 + solver_timeout: int = 1800 # time in seconds, defaults to 30 minutes + + """ + Verbosity + Type: int + Default value: 0 + Range: [0, 3] + From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#csclientlog + """ + solver_verbosity: int = 0 + + """ + Feasibility + Type: double + Default value: 1e-6 + Range: [1e-9, 1e-2] + From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#feasibilitytol + """ + solver_feasibility: float = 1e-6 + + """ + OptimalityTol + Type: double + Default value: 1e-6 + Range: [1e-9, 1e-2] + From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#optimalitytol + """ + solver_optimality: float = 1e-6 + + """ + IntFeasTol + Type: double + Default value: 1e-5 + Range: [1e-9, 1e-1] + From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#intfeastol + """ + solver_integrality: float = 1e-6 + + """ + MIPGap + Relative MIP optimality gap + Default value: 1e-4 + Range: [0, inf) + From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#mipgap + """ + gurobi_mipgap: float = 1e-4 + + """ + IntegralityFocus + Type: int + Default value: 0 + Range: [0, 1] + From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#integralityfocus + """ + gurobi_integrality_focus: int = 0 + + """ + NumericFocus + Type: int + Default value: 0 + Range: [0, 3] + From: https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#numericfocus + """ + gurobi_numeric_focus: int = 2 + + def __post_init__(self): # noqa: C901 + """Validate provided arguments. + + :raises: ValueError if any check fails. + """ + if self.troppo_epsilon < 0: + raise ValueError("ModelBuildSettings: `troppo_epsilon` must be a non-negative float") + if self.min_reaction_flux < 0: + raise ValueError("ModelBuildSettings: `min_reaction_flux` must be a non-negative float") + if self.solver_verbosity not in {0, 1, 2, 3} or not isinstance(self.solver_verbosity, int): + raise ValueError("ModelBuildSettings: `solver_verbosity` must be an integer in the range [0, 3]") + if self.solver_timeout < 0 or not isinstance(self.solver_timeout, int): + raise ValueError("ModelBuildSettings: `solver_timeout` must be a non-negative integer") + if not (1e-9 < self.solver_feasibility < 1e-2): + raise ValueError("ModelBuildSettings: `solver_feasibility` must be a float in the range [1e-9, 1e-2]") + if not (1e-9 < self.solver_optimality < 1e-2): + raise ValueError("ModelBuildSettings: `solver_optimality` must be a float in the range [1e-9, 1e-2]") + if not (1e-9 < self.solver_integrality < 1e-1): + raise ValueError("ModelBuildSettings: `solver_integrality` must be a float in the range [1e-9, 1e-1]") + if self.gurobi_mipgap < 0: + raise ValueError("ModelBuildSettings: `gurobi_mipgap` must be a float in the range [1e-4, inf.)") + if self.gurobi_integrality_focus not in {0, 1} or not isinstance(self.gurobi_integrality_focus, int): + raise ValueError("ModelBuildSettings: `gurobi_integrality_focus` must be an integer in the range [0, 1]") + if self.gurobi_numeric_focus not in {0, 1, 2, 3} or not isinstance(self.gurobi_numeric_focus, int): + raise ValueError("ModelBuildSettings: `gurobi_numeric_focus` must be an integer in the range [0, 3]") + if self.troppo_epsilon < self.min_reaction_flux * 1000: + logger.warning( + "ModelBuildSettings: `troppo_epsilon` and `min_reaction_flux` have similar values. " + "This can cause inconsistent model builds. Consider ~1000x fold change between the two. " + ) From e06f64038beb531a63b83be5010e0cf428f4eeff Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:01:48 -0600 Subject: [PATCH 12/41] feat: add TPM processing from Salmon quantification - Adds TPM dataframe output support - !Converts all async functions to sync - Refactors `_write_counts_matrix` to `_write_matrices - Add better validation for output filepaths - No longer utilize STAR gene count files ![BREAKING-CHANGE]: Converts all async functions to sync Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 381 +++++++++++++++++++-------------- 1 file changed, 223 insertions(+), 158 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index ec1a4f81..edd1c88d 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -1,19 +1,15 @@ from __future__ import annotations -import asyncio -import csv import functools -import io import json import re import sys from dataclasses import dataclass, field from itertools import chain from pathlib import Path -from typing import Final, Literal, TextIO, cast +from typing import Final, Literal, TextIO import numpy as np -import numpy.typing as npt import pandas as pd from loguru import logger @@ -101,6 +97,7 @@ def __post_init__(self): class SampleConfiguration: sample_name: str effective_lengths: pd.DataFrame + tpm: pd.DataFrame mean_effective_length: float layout: str strand: str @@ -122,31 +119,36 @@ def __post_init__(self): raise ValueError(f"Sample '{self.sample_name}' is missing 'effective_length' column") @classmethod - def to_dataframe(cls, samples: list[SampleConfiguration]) -> tuple[pd.DataFrame, pd.DataFrame]: + def to_dataframe(cls, samples: list[SampleConfiguration]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """Convert a list of SampleConfiguration to a dataframe. :param samples: The list of SampleConfiguration objects to convert. :return: A tuple of dataframes: [0]: The sample configuration as a dataframe - [1]: The effective lengths as a separate data frame with `same_name` as columns + [1]: The effective lengths as a separate data frame with `sample_name` as columns + [2]: The salmon TPM values as a separate data frame with `sample_name` as columns """ - config = pd.DataFrame( - columns=["sample_name", "mean_effective_length", "layout", "strand", "study", "library_prep"] + config: pd.DataFrame = pd.DataFrame.from_records( + [(s.sample_name, s.mean_effective_length, s.layout, s.strand, s.study, s.library_prep) for s in samples], + columns=["sample_name", "mean_effective_length", "layout", "strand", "study", "library_prep"], ) + lengths = pd.concat( + [ + s.effective_lengths.set_index("name")["effective_length"] + .astype(np.float64, copy=False) + .rename(s.sample_name) + for s in samples + ], + axis=1, + ).fillna(0.0) + lengths.index.name = "ensembl_gene_id" + tpms = pd.concat( + [s.tpm.set_index("name")["tpm"].astype(np.float64, copy=False).rename(s.sample_name) for s in samples], + axis=1, + ) + tpms.index.name = "ensembl_gene_id" - genes = set() - for s in samples: - genes.update(s.effective_lengths["name"].to_list()) - - lengths = pd.DataFrame(data=np.float64(0.0), columns=[s.sample_name for s in samples], index=list(genes)) - for sample in samples: - ids: list[str] = sample.effective_lengths["name"].to_list() - data: npt.NDArray[np.floating] = sample.effective_lengths["effective_length"].to_numpy(dtype=np.float64) - lengths.loc[ids, sample.sample_name] = data - - return config, lengths - - return config, lengths + return config, lengths, tpms def _sample_name_from_filepath(file: Path) -> str: @@ -234,10 +236,7 @@ def _process_first_multirun_sample(strand_file: Path, all_quant_files: list[Path sample_counts = sample_counts.fillna(value=0) sample_counts["counts"] = sample_counts["counts"].astype(float) - count_avg = cast( # type checkers think `.groupby(...).mean()` returns a pd.Series, force pd.DataFrame - pd.DataFrame, cast(object, sample_counts.groupby("ensembl_gene_id", as_index=False)["counts"].mean()) - ) - count_avg["counts"] = np.ceil(count_avg["counts"].astype(int)) + count_avg = sample_counts.groupby("ensembl_gene_id", as_index=False).mean() count_avg.columns = ["ensembl_gene_id", _sample_name_from_filepath(strand_file)] return count_avg @@ -270,6 +269,9 @@ def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame: strand_file=metrics.strand_files[0], all_quant_files=metrics.quant_files, ) + if counts is None: + raise ValueError(f"Unable to process the first sample for study '{metrics.study_name}'") + counts = counts.drop(columns=["length", "effective_length", "tpm"]) for i in range(1, metrics.num_samples): new_counts = _prepare_sample_counts( @@ -283,6 +285,7 @@ def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame: continue assert isinstance(counts, pd.DataFrame) # noqa: S101 + new_counts = new_counts.drop(columns=["length", "effective_length", "tpm"]) counts = counts.merge(new_counts, on="ensembl_gene_id", how="outer") counts = counts.fillna(value=0) @@ -298,13 +301,15 @@ def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame: return counts -def _write_counts_matrix( +def _write_matrices( *, config_df: pd.DataFrame, fragment_lengths: pd.DataFrame, + tpm_df: pd.DataFrame, como_context_dir: Path, - output_counts_matrix_filepath: Path, - output_fragment_lengths_filepath: Path, + out_counts_filepath: Path, + out_frag_length_filepath: Path, + out_tpm_filepath: Path, rna: RNAType, ) -> pd.DataFrame: """Create a counts matrix file by reading gene counts table(s). @@ -312,46 +317,49 @@ def _write_counts_matrix( :param config_df: Configuration DataFrame containing sample information. :param fragment_lengths: DataFrame containing effective lengths for each gene and sample, used for zFPKM normalization. + :param tpm_df: DataFrame containing TPM values for each gene and sample, used for zFPKM normalization. :param como_context_dir: Path to the COMO_input directory containing gene count files. - :param output_counts_matrix_filepath: Path where the output counts matrix CSV will be saved. - :param output_fragment_lengths_filepath: Path where the output fragment lengths CSV will be saved. + :param out_counts_filepath: Path where the output counts matrix CSV will be saved. + :param out_frag_length_filepath: Path where the output fragment lengths CSV will be saved. + :param out_tpm_filepath: Path where the output TPM values CSV will be saved. :param rna: RNAType enum indicating whether to process 'trna' or 'mrna' samples. :return: A pandas DataFrame representing the final counts matrix. """ study_metrics = _organize_gene_counts_files(data_dir=como_context_dir) counts: list[pd.DataFrame] = [_create_sample_counts_matrix(metric) for metric in study_metrics] - rna_specific_sample_names = set( - config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist() + rna_specific_sample_names = sorted( + set(config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist()) ) final_matrix: pd.DataFrame = functools.reduce( lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts ) final_matrix.fillna(value=0, inplace=True) - final_matrix.iloc[:, 1:] = final_matrix.iloc[:, 1:].astype(int) final_matrix = final_matrix[["ensembl_gene_id", *rna_specific_sample_names]] + final_matrix = final_matrix.reindex(columns=["ensembl_gene_id", *rna_specific_sample_names]) + fragment_lengths = fragment_lengths.reindex(columns=rna_specific_sample_names).sort_index() + tpm_df = tpm_df.reindex(columns=rna_specific_sample_names).sort_index() - output_counts_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) - output_fragment_lengths_filepath.parent.mkdir(parents=True, exist_ok=True) - - final_matrix.to_csv(output_counts_matrix_filepath, index=False) - fragment_lengths[rna_specific_sample_names].to_csv(output_fragment_lengths_filepath, index=True) + out_counts_filepath.parent.mkdir(parents=True, exist_ok=True) + out_frag_length_filepath.parent.mkdir(parents=True, exist_ok=True) + final_matrix.to_csv(out_counts_filepath, index=False, float_format="%.15g") + fragment_lengths[rna_specific_sample_names].to_csv(out_frag_length_filepath, index=True, float_format="%.15g") + tpm_df[rna_specific_sample_names].to_csv(out_tpm_filepath, index=True, float_format="%.15g") - logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{output_counts_matrix_filepath}'") + logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{out_counts_filepath}'") return final_matrix -def _create_config_df( # noqa: C901 +def _create_config_df( context_name: str, /, como_context_dir: Path, - gene_count_dirname: str = "geneCounts", layout_dirname: str = "layouts", strandedness_dirname: str = "strandedness", quantification_dir: str = "quantification", prep_method_dirname: str = "prepMethods", -) -> tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """Create configuration sheet. The configuration file created is based on the gene counts matrix. @@ -361,7 +369,6 @@ def _create_config_df( # noqa: C901 context_name: Name of the context, used as a prefix for sample names. como_context_dir: Path to the COMO_input directory containing subdirectories for gene counts, layouts, strandedness, fragment sizes, and prep methods. - gene_count_dirname: Name of the subdirectory containing gene count files. layout_dirname: Name of the subdirectory containing layout files. strandedness_dirname: Name of the subdirectory containing strandedness files. quantification_dir: Name of the subdirectory containing Salmon's quantification files. @@ -370,12 +377,13 @@ def _create_config_df( # noqa: C901 Returns: [0]: A pandas DataFrame representing the configuration sheet. [1]: Fragment lengths for downstream calculations + [2]: Salmon TPM values for downstream calculations """ label_regex: Final = re.compile(r"(?PS\d{1,3})(?PR\d{1,3})(?Pr\d{1,3})?") - quant_files: list[Path] = list((como_context_dir / quantification_dir).rglob("*.genes.sf")) + quant_files: list[Path] = sorted((como_context_dir / quantification_dir).rglob("*.genes.sf")) # gene_counts: list[Path] = list((como_context_dir / gene_count_dirname).rglob("*.tab")) if not quant_files: - raise FileNotFoundError(f"No gene count files found in '{gene_count_dirname}'") + raise FileNotFoundError(f"No gene count files found in '{quantification_dir}'") auxillary_directories = { "layout": como_context_dir / layout_dirname, @@ -385,18 +393,14 @@ def _create_config_df( # noqa: C901 } aux_lookup: dict[str, dict[str, Path]] = {kind: {} for kind in auxillary_directories} for kind, root in auxillary_directories.items(): - kind: str - root: Path - for p in root.rglob("*"): - if p.is_file(): - m = label_regex.search(p.stem) - if m: - aux_lookup[kind][m.group(0)] = p - if "layout" not in aux_lookup: - raise ValueError + files = (p for p in root.rglob("*") if p.is_file()) + for file in files: + m = label_regex.search(file.stem) + if m: + aux_lookup[kind][m.group(0)] = file rows: list[SampleConfiguration] = [] - for quant_file in sorted(quant_files): + for quant_file in quant_files: m = label_regex.search(quant_file.as_posix()) if m is None: raise ValueError(f"Filename '{quant_file.name}' does not match contextName_SXRYrZ.tab pattern") @@ -416,35 +420,23 @@ def _create_config_df( # noqa: C901 if prep not in {"total", "mrna"}: raise ValueError(f"Prep method must be 'total' or 'mrna' (got '{prep}') for {label}") if layout == "": - raise FileNotFoundError(message=f"No layout file found for '{label}'.") - - quant_paths = [p for p in aux_lookup["quantification"].values() if p.name == f"{sample_id}_quant.genes.sf"] - if ( - not quant_paths - and layout in ["paired-end", "", None] - and prep.lower() in [RNAType.TRNA.value.lower(), RNAType.MRNA.value.lower()] - ): - log_and_raise_error( - message=f"No quantification file found for '{label}'; defaulting to 100 bp (needed for zFPKM).", - error=FileNotFoundError, - level=LogLevel.WARNING, - ) - elif len(quant_paths) == 1 and layout == "single-end": - effective_len = pd.DataFrame({"Name": [], "EffectiveLength": []}) - mean_effective_len = 0.0 # cannot compute FPKM for single-ended data - else: - df = read_file(quant_file) - df.columns = [c.lower() for c in df.columns] - df = df.rename(columns={"effectivelength": "effective_length"}) - - effective_len = df[["name", "effective_length"]] - effective_len["effective_length"] = effective_len["effective_length"].astype(np.float64) - mean_effective_len: float = effective_len["effective_length"].sum() / len(df) + raise FileNotFoundError(f"No layout file found for '{label}'.") + + df = read_file(quant_file) + df.columns = [c.lower() for c in df.columns] + df = df.rename(columns={"effectivelength": "effective_length", "TPM": "tpm"}) + df["effective_length"] = df["effective_length"].astype(float) + df["tpm"] = df["tpm"].astype(float) + + tpm = df[["name", "tpm"]].copy() + effective_len = df[["name", "effective_length"]].copy() + mean_effective_len: float = effective_len["effective_length"].sum() / df.shape[1] rows.append( SampleConfiguration( sample_name=sample_id, effective_lengths=effective_len, + tpm=tpm, mean_effective_length=mean_effective_len, layout=layout, strand=strand, @@ -456,7 +448,7 @@ def _create_config_df( # noqa: C901 return SampleConfiguration.to_dataframe(rows) -async def _create_gene_info_file( # noqa: C901 +def _create_gene_info_file( *, counts_matrix_filepaths: list[Path], output_filepath: Path, @@ -468,17 +460,18 @@ async def _create_gene_info_file( # noqa: C901 The gene information file will be created by reading each matrix filepath in the provided list """ - async def read_ensembl_gene_ids(file: Path) -> list[str]: + def read_ensembl_gene_ids(file: Path) -> list[str]: data_ = read_file(file, h5ad_as_df=False) if isinstance(data_, pd.DataFrame): return data_["ensembl_gene_id"].tolist() try: conversion = get_remaining_identifiers(ids=data_.var_names.tolist(), taxon=taxon) except json.JSONDecodeError as e: - raise ValueError(f"Got a JSON decode error for file '{counts_matrix_filepaths}' ({e})") + raise ValueError(f"Got a JSON decode error for file '{counts_matrix_filepaths}' ({e})") from e # Remove NA values from entrez_gene_id dataframe column conversion = conversion[~conversion["ensembl_gene_id"].isna()] + conversion = conversion.explode(column=["ensembl_gene_id"]) return conversion["ensembl_gene_id"].tolist() logger.info( @@ -495,90 +488,96 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]: def _process_como_input( context_name: str, - output_config_filepath: Path, como_context_dir: Path, - output_counts_matrix_filepath: Path, - output_fragment_lengths_filepath: Path, - rna: RNAType, + out_mrna_config: Path | None, + out_trna_config: Path | None, + out_mrna_counts: Path | None, + out_trna_counts: Path | None, + out_mrna_fragments: Path | None, + out_trna_fragments: Path | None, + out_mrna_tpm: Path | None, + out_trna_tpm: Path | None, ) -> None: - config_df, fragment_lengths = _create_config_df(context_name, como_context_dir=como_context_dir) - - _write_counts_matrix( - config_df=config_df, - fragment_lengths=fragment_lengths, - como_context_dir=como_context_dir, - output_counts_matrix_filepath=output_counts_matrix_filepath, - output_fragment_lengths_filepath=output_fragment_lengths_filepath, - rna=rna, - ) - with pd.ExcelWriter(output_config_filepath) as writer: - subset_config = config_df[config_df["library_prep"].str.lower() == rna.value.lower()] - subset_config.to_excel(writer, sheet_name=context_name, header=True, index=False) + config_df, fragment_lengths, tpm_df = _create_config_df(context_name, como_context_dir=como_context_dir) + + if out_trna_config and out_trna_counts and out_trna_fragments and out_trna_tpm: + _write_matrices( + config_df=config_df, + fragment_lengths=fragment_lengths, + tpm_df=tpm_df, + como_context_dir=como_context_dir, + out_counts_filepath=out_trna_counts, + out_frag_length_filepath=out_trna_fragments, + out_tpm_filepath=out_trna_tpm, + rna=RNAType.TRNA, + ) + with pd.ExcelWriter(out_trna_config) as writer: + subset_config = config_df[config_df["library_prep"].str.lower() == RNAType.TRNA.value.lower()] + subset_config.to_excel(writer, sheet_name=context_name, header=True, index=False) + + if out_mrna_config and out_mrna_counts and out_mrna_fragments and out_mrna_tpm: + _write_matrices( + config_df=config_df, + fragment_lengths=fragment_lengths, + tpm_df=tpm_df, + como_context_dir=como_context_dir, + out_counts_filepath=out_mrna_counts, + out_frag_length_filepath=out_mrna_fragments, + out_tpm_filepath=out_mrna_tpm, + rna=RNAType.MRNA, + ) + with pd.ExcelWriter(out_mrna_config) as writer: + subset_config = config_df[config_df["library_prep"].str.lower() == RNAType.MRNA.value.lower()] + subset_config.to_excel(writer, sheet_name=context_name, header=True, index=False) -async def _process( +def _process( context_name: str, taxon: int, output_gene_info_filepath: Path, como_context_dir: Path | None, - input_matrix_filepath: list[Path] | None, - output_trna_fragment_lengths_filepath: Path | None, - output_mrna_fragment_lengths_filepath: Path | None, - output_trna_config_filepath: Path | None, - output_mrna_config_filepath: Path | None, - output_trna_matrix_filepath: Path | None, - output_mrna_matrix_filepath: Path | None, + in_matrix_fp: list[Path] | None, + out_trna_frag_lengths_fp: Path | None, + out_mrna_frag_lengths_fp: Path | None, + out_trna_config_fp: Path | None, + out_mrna_config_fp: Path | None, + out_trna_matrix_fp: Path | None, + out_mrna_matrix_fp: Path | None, + out_mrna_tpm_fp: Path | None, + out_trna_tpm_fp: Path | None, *, cache: bool, create_gene_info_only: bool, ): - rna_types: list[tuple[RNAType, Path, Path, Path]] = [] - if output_trna_config_filepath and output_trna_matrix_filepath and output_trna_fragment_lengths_filepath: - rna_types.append( - ( - RNAType.TRNA, - output_trna_config_filepath, - output_trna_matrix_filepath, - output_trna_fragment_lengths_filepath, - ) - ) - if output_mrna_config_filepath and output_mrna_matrix_filepath and output_mrna_fragment_lengths_filepath: - rna_types.append( - ( - RNAType.MRNA, - output_mrna_config_filepath, - output_mrna_matrix_filepath, - output_mrna_fragment_lengths_filepath, - ) - ) - - # if provided, iterate through como-input specific directories if not create_gene_info_only: if como_context_dir is None: raise ValueError("como_context_dir must be provided if create_gene_info_only is False") - if output_trna_fragment_lengths_filepath is None: + if out_trna_frag_lengths_fp is None: raise ValueError("output_fragment_lengths_filepath must be provided if create_gene_info_only is False") - for rna, out_config, out_matrix, out_frag_len in rna_types: - _process_como_input( - context_name=context_name, - output_config_filepath=out_config, - como_context_dir=como_context_dir, - output_counts_matrix_filepath=out_matrix, - output_fragment_lengths_filepath=out_frag_len, - rna=rna, - ) + _process_como_input( + context_name=context_name, + como_context_dir=como_context_dir, + out_mrna_config=out_mrna_config_fp, + out_trna_config=out_trna_config_fp, + out_mrna_counts=out_mrna_matrix_fp, + out_trna_counts=out_trna_matrix_fp, + out_mrna_fragments=out_mrna_frag_lengths_fp, + out_trna_fragments=out_trna_frag_lengths_fp, + out_mrna_tpm=out_mrna_tpm_fp, + out_trna_tpm=out_trna_tpm_fp, + ) # create the gene info filepath based on provided data input_files = [] - if input_matrix_filepath: - input_files.extend(input_matrix_filepath) - if output_trna_matrix_filepath: - input_files.append(output_trna_matrix_filepath) - if output_mrna_matrix_filepath: - input_files.append(output_mrna_matrix_filepath) - - await _create_gene_info_file( + if in_matrix_fp: + input_files.extend(in_matrix_fp) + if out_trna_matrix_fp: + input_files.append(out_trna_matrix_fp) + if out_mrna_matrix_fp: + input_files.append(out_mrna_matrix_fp) + + _create_gene_info_file( counts_matrix_filepaths=input_files, output_filepath=output_gene_info_filepath, taxon=taxon, @@ -586,7 +585,7 @@ async def _process( ) -async def rnaseq_preprocess( # noqa: C901 +def rnaseq_preprocess( # noqa: C901 context_name: str, taxon: int, output_gene_info_filepath: str | Path, @@ -598,6 +597,8 @@ async def rnaseq_preprocess( # noqa: C901 output_mrna_metadata_filepath: str | Path | None = None, output_trna_count_matrix_filepath: str | Path | None = None, output_mrna_count_matrix_filepath: str | Path | None = None, + output_trna_tpm_filepath: str | Path | None = None, + output_mrna_tpm_filepath: str | Path | None = None, cache: bool = True, log_level: LogLevel | str = LogLevel.INFO, log_location: str | TextIO = sys.stderr, @@ -618,6 +619,8 @@ async def rnaseq_preprocess( # noqa: C901 :param output_mrna_metadata_filepath: Path to the output mRNA config file (if in "create" mode) :param output_trna_count_matrix_filepath: The path to write total RNA count matrices :param output_mrna_count_matrix_filepath: The path to write messenger RNA count matrices + :param output_trna_tpm_filepath: The path to write total RNA TPM matrices + :param output_mrna_tpm_filepath: The path to write messenger RNA TPM matrices :param como_context_dir: If in "create" mode, the input path(s) to the COMO_input directory of the current context i.e., the directory containing "fragmentSizes", "geneCounts", "insertSizeMetrics", etc. directories :param input_matrix_filepath: If in "provide" mode, the path(s) to the count matrices to be processed~ @@ -628,10 +631,67 @@ async def rnaseq_preprocess( # noqa: C901 """ set_up_logging(level=log_level, location=log_location) - # ruff: disable[ASYNC240] if not output_gene_info_filepath: raise ValueError("output_gene_info_filepath must be provided") + if any( + ( + output_trna_fragment_lengths_filepath, + output_trna_metadata_filepath, + output_trna_count_matrix_filepath, + output_trna_tpm_filepath, + ), + ) and not all( + ( + output_trna_fragment_lengths_filepath, + output_trna_metadata_filepath, + output_trna_count_matrix_filepath, + output_trna_tpm_filepath, + ), + ): + missing = [ + name + for name, value in { + "output_trna_fragment_lengths_filepath": output_trna_fragment_lengths_filepath, + "output_trna_metadata_filepath": output_trna_metadata_filepath, + "output_trna_count_matrix_filepath": output_trna_count_matrix_filepath, + "output_trna_tpm_filepath": output_trna_tpm_filepath, + }.items() + if value is None + ] + raise ValueError( + f"If any tRNA output filepaths are provided, all must be provided. Missing: {', '.join(missing)}" + ) + + if any( + ( + output_mrna_fragment_lengths_filepath, + output_mrna_metadata_filepath, + output_mrna_count_matrix_filepath, + output_mrna_tpm_filepath, + ), + ) and not all( + ( + output_mrna_fragment_lengths_filepath, + output_mrna_metadata_filepath, + output_mrna_count_matrix_filepath, + output_mrna_tpm_filepath, + ), + ): + missing = [ + name + for name, value in { + "output_mrna_fragment_lengths_filepath": output_mrna_fragment_lengths_filepath, + "output_mrna_metadata_filepath": output_mrna_metadata_filepath, + "output_mrna_count_matrix_filepath": output_mrna_count_matrix_filepath, + "output_mrna_tpm_filepath": output_mrna_tpm_filepath, + }.items() + if value is None + ] + raise ValueError( + f"If any mRNA output filepaths are provided, all must be provided. Missing: {', '.join(missing)}" + ) + output_gene_info_filepath = Path(output_gene_info_filepath).resolve() context_dir = None @@ -657,20 +717,25 @@ async def rnaseq_preprocess( # noqa: C901 output_trna_fragment_lengths_filepath = Path(output_trna_fragment_lengths_filepath).resolve() if output_mrna_fragment_lengths_filepath is not None: output_mrna_fragment_lengths_filepath = Path(output_mrna_fragment_lengths_filepath).resolve() + if output_trna_tpm_filepath is not None: + output_trna_tpm_filepath = Path(output_trna_tpm_filepath).resolve() + if output_mrna_tpm_filepath is not None: + output_mrna_tpm_filepath = Path(output_mrna_tpm_filepath).resolve() - await _process( + _process( context_name=context_name, taxon=taxon, como_context_dir=context_dir, - input_matrix_filepath=in_matrix, + in_matrix_fp=in_matrix, output_gene_info_filepath=output_gene_info_filepath, - output_trna_config_filepath=output_trna_metadata_filepath, - output_trna_matrix_filepath=output_trna_count_matrix_filepath, - output_trna_fragment_lengths_filepath=output_trna_fragment_lengths_filepath, - output_mrna_config_filepath=output_mrna_metadata_filepath, - output_mrna_matrix_filepath=output_mrna_count_matrix_filepath, - output_mrna_fragment_lengths_filepath=output_mrna_fragment_lengths_filepath, + out_trna_config_fp=output_trna_metadata_filepath, + out_trna_matrix_fp=output_trna_count_matrix_filepath, + out_trna_frag_lengths_fp=output_trna_fragment_lengths_filepath, + out_mrna_config_fp=output_mrna_metadata_filepath, + out_mrna_matrix_fp=output_mrna_count_matrix_filepath, + out_mrna_frag_lengths_fp=output_mrna_fragment_lengths_filepath, + out_trna_tpm_fp=output_trna_tpm_filepath, + out_mrna_tpm_fp=output_mrna_tpm_filepath, cache=cache, create_gene_info_only=create_gene_info_only, ) - # ruff: enable[ASYNC240] From 8c18566df188425abea88c223a33520cd33ec1ed Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:03:24 -0600 Subject: [PATCH 13/41] feat: refactor filtering and normalization - !Converts from async functions to sync - Change `fragment_lengths` from a per-sample value to a per-gene value (np.ndarray to pd.DataFrame) - Refactor TPM, FPKM, and zFPKM functions - Adds fpkm output CSV file support - Improves type hints and validations [BREAKING-CHANGE]: This removes all async functions for synchronous counterparts Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 508 ++++++++++++++++++++-------------------- 1 file changed, 253 insertions(+), 255 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 5a5588ae..1d2b258a 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -16,8 +16,7 @@ import scanpy as sc import sklearn import sklearn.neighbors -from anndata.compat import XDataArray -from anndata.experimental.backed import Dataset2D +import sklearn.preprocessing from loguru import logger from scipy import sparse from zfpkm import zFPKM, zfpkm_plot @@ -49,7 +48,7 @@ class _StudyMetrics: study: str num_samples: int count_matrix: pd.DataFrame | sc.AnnData - fragment_lengths: npt.NDArray[np.floating] | None + eff_length: pd.DataFrame # The effective length of the fragment from, e.g., Salmon Quantification sample_names: list[str] layout: list[LayoutMethod] entrez_gene_ids: npt.NDArray[np.integer] @@ -59,6 +58,11 @@ class _StudyMetrics: __high_confidence_entrez_gene_ids: list[str] = field(default_factory=list) def __post_init__(self): + if not self.layout: + raise ValueError("Layout list cannot be empty") + if not self.sample_names: + raise ValueError("Sample names list cannot be empty") + for layout in self.layout: if layout not in LayoutMethod: raise ValueError(f"Layout must be 'paired-end' or 'single-end'; got: {layout}") @@ -134,23 +138,24 @@ def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDAr Returns: A NumPy array of the filtered data. """ - if not isinstance(data, pd.DataFrame | np.ndarray): + if not isinstance(data, (pd.DataFrame, np.ndarray)): raise TypeError(f"Unsupported data type. Must be a Pandas DataFrame or a NumPy array, got '{type(data)}'") return ( - data.apply(filter_func, axis=1).to_numpy() + data.apply(filter_func, axis=1).to_numpy(copy=False) if isinstance(data, pd.DataFrame) else np.apply_along_axis(filter_func, axis=1, arr=data) ) -async def _build_matrix_results( +def _build_matrix_results( matrix: pd.DataFrame | sc.AnnData, *, gene_info: pd.DataFrame, metadata_df: pd.DataFrame, - fragment_df: pd.DataFrame | None, + normalize_df: pd.DataFrame, taxon: int, + normalize_df_is_fragment_length: bool = True, ) -> tuple[NamedMetrics, list[int]]: """Read the counts matrix and returns the results. @@ -159,91 +164,53 @@ async def _build_matrix_results( :param taxon: The NCBI Taxon ID :returns: A dataclass `ReadMatrixResults` """ - if isinstance(matrix, sc.AnnData): - if not isinstance(matrix.var, pd.DataFrame): - raise TypeError("AnnData.var is expected to be a pandas.DataFrame") - - matrix.var = matrix.var.reset_index(drop=False, names=["gene_symbol"]) - conversion = convert(ids=matrix.var["gene_symbol"].tolist(), taxon=taxon) - else: - if "ensembl_gene_id" not in matrix.columns: - log_and_raise_error( - message="'ensembl_gene_id' column not found in the provided DataFrame.", - error=ValueError, - level=LogLevel.CRITICAL, - ) - conversion: pd.DataFrame = convert(ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon) - # If the entrez gene id column is empty, it is indicative that the incorrect taxon id was provided - if conversion["entrez_gene_id"].eq("-").all(): - logger.critical( - f"Conversion of Ensembl Gene IDs to Entrez IDs and Gene Symbols was empty - " - f"is '{taxon}' the correct taxon ID for this data?" - ) - conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",") - conversion = conversion.explode("ensembl_gene_id") - conversion = conversion[conversion["entrez_gene_id"] != "-"] - conversion["entrez_gene_id"] = conversion["entrez_gene_id"] - conversion = conversion.reset_index(drop=False) - - # conversion_merge_on should contain at least one of "ensembl_gene_id", "entrez_gene_id", or "gene_symbol" - conversion_merge_on: list[str] = list( - set(matrix.columns if isinstance(matrix, pd.DataFrame) else matrix.var.columns) & set(conversion.columns) - ) - if not conversion_merge_on: - log_and_raise_error( - ( - "No columns to merge on, unable to find at least one of `ensembl_gene_id`, `entrez_gene_id`, or `gene_symbol`. " - "Please check your input files." - ), - error=ValueError, - level=LogLevel.ERROR, - ) - - if isinstance(matrix, pd.DataFrame): - if "entrez_gene_id" in matrix.columns: - matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int) - matrix = matrix.merge(conversion, on=conversion_merge_on, how="left") - elif isinstance(matrix, sc.AnnData): - if "entrez_gene_id" in matrix.var.columns: - matrix.var["entrez_gene_id"] = matrix.var["entrez_gene_id"].astype(int) - matrix.var = matrix.var.merge(conversion, on=conversion_merge_on, how="left") - gene_info = gene_info_migrations(gene_info) + gene_info = gene_info[~gene_info["entrez_gene_id"].isna()] gene_info = gene_info[gene_info["entrez_gene_id"] != "-"] - gene_info.loc[:, "entrez_gene_id"] = gene_info.loc[:, "entrez_gene_id"].astype(int) + gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int) - gene_info_merge_on: list[str] = list( - set(matrix.columns if isinstance(matrix, pd.DataFrame) else matrix.var.columns) & set(gene_info.columns) - ) - - if "entrez_gene_id" in gene_info_merge_on: - gene_info = gene_info[~gene_info["entrez_gene_id"].isna()] - gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int) - - if isinstance(matrix, pd.DataFrame): - matrix = matrix[~matrix["entrez_gene_id"].isna()] - matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int) - elif isinstance(matrix, sc.AnnData): - if isinstance(matrix.var, XDataArray): - raise TypeError("Expected matrix.var object to be 'pd.DataFrame', got 'anndata.compat.XDataArray'") - matrix = matrix[:, ~matrix.var["entrez_gene_id"].isna()] - matrix.var["entrez_gene_id"] = matrix.var["entrez_gene_id"].astype(int) + if not normalize_df.empty: # This is likely empty when using single-cell data + normalize_merge_on = normalize_df.columns.intersection(gene_info.columns).to_list() + if not normalize_merge_on: + raise ValueError( + f"No common columns to merge on between effective length dataframe and gene info dataframe. " + f"Effective length columns: {normalize_df.columns.tolist()}, gene info columns: {gene_info.columns.tolist()}" + ) + normalize_df = normalize_df.merge(gene_info, on=normalize_merge_on, how="left") + normalize_df = normalize_df[~normalize_df["entrez_gene_id"].isna()] if isinstance(matrix, pd.DataFrame): - matrix = matrix.merge(gene_info, on=gene_info_merge_on, how="inner") + matrix_merge_on = matrix.columns.intersection(gene_info.columns).to_list() + matrix = matrix.merge(gene_info, on=matrix_merge_on, how="inner") elif isinstance(matrix, sc.AnnData): if not isinstance(matrix.var, pd.DataFrame): raise TypeError(f"Expected matrix.var object to be 'pd.DataFrame', got '{type(matrix.var)}'") - matrix.var["original_index"] = matrix.var.index - new_var = matrix.var.merge(gene_info, on=gene_info_merge_on, how="inner") - new_matrix = matrix[:, new_var["original_index"]].copy() - new_matrix.var = new_var - new_matrix.var = new_matrix.var.drop(columns=["original_index"]) - new_matrix.var.reset_index(drop=True) - matrix = new_matrix - non_duplicates = ~matrix.var.duplicated(subset=matrix.var.columns, keep="first") - matrix = matrix[:, non_duplicates].copy() + gene_info = gene_info.sort_values(["entrez_gene_id", "size"], ascending=[True, True]).drop_duplicates( + subset=["entrez_gene_id"], keep="first" + ) + types: dict[str, str] = determine_gene_type(matrix.var_names) + if contains_identical_gene_types(types): + # set matrix index name to gene type + matrix.var.index.name = "gene_symbol" + + matrix.var = matrix.var.reset_index(drop=False).merge(gene_info, on="gene_symbol", how="left") + matrix = matrix[:, matrix.var["entrez_gene_id"].notna()] + matrix.var = matrix.var.astype( + { + "entrez_gene_id": int, + "taxon_id": int, + "size": int, + } + ) + + # new_matrix = matrix[:, new_var["original_index"]].copy() + # new_matrix.var = new_var + # new_matrix.var = new_matrix.var.drop(columns=["original_index"]) + # new_matrix.var.reset_index(drop=True) + # matrix = new_matrix + # non_duplicates = ~matrix.var.duplicated(subset=matrix.var.columns, keep="first") + # matrix = matrix[:, non_duplicates].copy() metrics: NamedMetrics = {} for study in metadata_df["study"].unique(): @@ -253,109 +220,62 @@ async def _build_matrix_results( if isinstance(matrix, pd.DataFrame): subset = matrix.set_index(keys=["entrez_gene_id"], drop=True) subset = subset[subset.columns.intersection(study_sample_names)] - subset.index = subset.index.astype(int) + # subset = subset.groupby(subset.index).mean() entrez_gene_ids = subset.index.to_numpy(copy=False) - gene_sizes = matrix["size"].to_numpy(dtype=int, copy=False) + elif isinstance(matrix, sc.AnnData): # matrix.var = matrix.var.set_index(keys=["entrez_gene_id"], drop=True) subset = matrix[matrix.obs_names.intersection(study_sample_names)] entrez_gene_ids = subset.var["entrez_gene_id"].to_numpy(dtype=int) - gene_sizes = subset.var["size"].to_numpy(dtype=int) else: raise TypeError(f"Matrix must be a pandas DataFrame or scanpy AnnData object, got: '{type(matrix)}'.") - frag_lengths = None - if fragment_df is not None: - frag_lengths = fragment_df["effective_length"].to_numpy(dtype=np.float64) + normalize_df = normalize_df.drop(columns=["ensembl_gene_id", "gene_symbol"], errors="ignore") + if normalize_df.index.name != "entrez_gene_id" and "entrez_gene_id" in normalize_df.columns: + normalize_df = normalize_df.set_index("entrez_gene_id", drop=True) + normalize_df.index = normalize_df.index.astype(int) + + # gene_sizes = gene_info["size"].to_numpy(dtype=int, copy=False) metrics[study] = _StudyMetrics( count_matrix=subset, - fragment_lengths=frag_lengths, + eff_length=normalize_df[study_sample_names] if not normalize_df.empty else normalize_df, sample_names=study_sample_names, layout=[LayoutMethod(layout) for layout in layouts], num_samples=len(study_sample_names), entrez_gene_ids=entrez_gene_ids, - gene_sizes=gene_sizes, + gene_sizes=gene_info.loc[gene_info["entrez_gene_id"].isin(entrez_gene_ids), "size"].to_numpy( + dtype=int, copy=False + ), study=study, ) return metrics, gene_info["entrez_gene_id"].astype(int).tolist() -def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: - """Calculate the Transcripts Per Million (TPM) for each sample in the metrics dictionary. - - Args: - metrics: A dictionary of study metrics to calculate TPM for. - - Returns: - A dictionary of study metrics with TPM calculated. - """ - for sample, metric in metrics.items(): - if isinstance(metric.count_matrix, sc.AnnData): - adata = metric.count_matrix - gene_sizes = pd.Series(metric.gene_sizes, index=adata.var_names) - counts_df = pd.DataFrame( - data=np.asarray(adata.X.toarray() if sparse.issparse(adata.X) else adata.X), - index=adata.var_names, - columns=adata.obs_names, - ) - else: - counts_df = metric.count_matrix - gene_sizes = pd.Series(metric.gene_sizes) - - adjusted_counts = counts_df.add(1e-6) - tpm_matrix = adjusted_counts.div(gene_sizes, axis=0) # (count + 1) / gene_length - tpm_matrix = tpm_matrix.div(tpm_matrix.sum(axis=0), axis=1) # normalize by total - tpm_matrix = tpm_matrix.mul(1e6) # scale to per-million - metrics[sample].normalization_matrix = tpm_matrix - - return metrics - - -def _calculate_fpkm(metrics: NamedMetrics, scale: float = 1e6) -> NamedMetrics: - """Calculate the Fragments Per Kilobase of transcript per Million mapped reads (FPKM) for each i in the metrics dictionary. +def _calculate_fpkm(metrics: NamedMetrics) -> NamedMetrics: + """Calculate the Fragments Per Kilobase of transcript per Million mapped reads (FPKM). Args: metrics: A dictionary of study metrics to calculate FPKM for. - scale: The scaling factor for normalization (default is 1e6). Returns: A dictionary of study metrics with FPKM calculated. """ for study in metrics: - matrix_values: dict[str, npt.NDArray[np.floating]] = {} - count_matrix = metrics[study].count_matrix - if not isinstance(count_matrix, pd.DataFrame): - raise TypeError("FPKM cannot be performed on scanpy.AnnData objects!") - - study_counts = count_matrix.to_numpy(dtype=int, copy=False) - for i in range(metrics[study].num_samples): - layout = metrics[study].layout[i] - sample_name = metrics[study].sample_names[i] - length = metrics[study].fragment_lengths if layout == LayoutMethod.paired_end else metrics[study].gene_sizes - counts = study_counts[:, i] - mapped_reads = counts.sum() - matrix_values[sample_name] = ((counts * 1e9) / (length * mapped_reads)).astype(int) - - metrics[study].normalization_matrix = pd.DataFrame(matrix_values, index=metrics[study].entrez_gene_ids) - return metrics - + count_matrix = metrics[study].count_matrix.copy() + length = metrics[study].eff_length -def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics: - """Calculate the z-score for each sample in the metrics dictionary. - - Args: - metrics: A dictionary of study metrics to calculate z-scores for. + if not isinstance(count_matrix, pd.DataFrame): + raise TypeError("FPKM can only be performed on pandas.DataFrame objects") + non_overlapping_cols = count_matrix.columns.difference(length.columns) + if not non_overlapping_cols.empty: + raise ValueError( + f"Count matrix columns and effective length columns must match for FPKM calculation. " + f"Non-overlapping columns: {non_overlapping_cols}" + ) - Returns: - A dictionary of study metrics with z-scores calculated. - """ - for sample in metrics: - log_matrix = np.log(metrics[sample].normalization_matrix) - z_matrix = pd.DataFrame( - data=sklearn.preprocessing.scale(log_matrix, axis=1), columns=metrics[sample].sample_names - ) - metrics[sample].z_score_matrix = z_matrix + mapped_reads = count_matrix.sum() + metrics[study].normalization_matrix = (count_matrix * 1e9) / (mapped_reads * length) return metrics @@ -410,7 +330,7 @@ def cpm_filter( test_bools = pd.DataFrame({"entrez_gene_ids": entrez_ids}) for i in range(len(counts_per_million.columns)): median_sum = float(np.median(np.sum(counts[:, i]))) - if cut_off == "default": # noqa: SIM108 + if np.isnan(cut_off): # noqa: SIM108 cutoff = float(10e6 / median_sum) else: cutoff = float(1e6 * cut_off) / median_sum @@ -419,22 +339,29 @@ def cpm_filter( return metrics -def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions) -> NamedMetrics: +def tpm_filter( + *, + metrics: NamedMetrics, + filtering_options: _FilteringOptions, + tpm_df: pd.DataFrame, +) -> NamedMetrics: """Apply quantile-based filtering to the TPM matrix for a given sample. - Args: - metrics: A dictionary of study metrics to filter. - filtering_options: Options for filtering the count matrix. - - Returns: - A dictionary of filtered study metrics. + :param metrics: A dictionary of study metrics to filter. + :param filtering_options: Options for filtering the count matrix. + :param tpm_df: A dataframe containing TPM values for each gene in each sample, used for TPM quantile filtering + :returns: A dictionary of filtered study metrics. """ # TODO: Write the TPM matrix to disk - n_exp = filtering_options.replicate_ratio - n_top = filtering_options.high_replicate_ratio + min_sample_expression = filtering_options.replicate_ratio + high_confidence_sample_expression = filtering_options.high_replicate_ratio cut_off = filtering_options.cut_off - metrics = calculate_tpm(metrics) + + # Map the TPM results directly onto the sample + for sample, metric in metrics.items(): + metrics[sample].normalization_matrix = tpm_df[metric.sample_names] + metrics[sample].normalization_matrix.index = tpm_df["ensembl_gene_id"] sample: str metric: _StudyMetrics @@ -443,33 +370,36 @@ def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringO gene_size = metric.gene_sizes tpm_matrix: pd.DataFrame = metric.normalization_matrix - min_samples = round(n_exp * len(tpm_matrix.columns)) - top_samples = round(n_top * len(tpm_matrix.columns)) - tpm_quantile = tpm_matrix[tpm_matrix > 0] - quantile_cutoff = np.quantile(a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0) - boolean_expression = pd.DataFrame( - data=tpm_matrix > quantile_cutoff, index=tpm_matrix.index, columns=tpm_matrix.columns - ).astype(int) - - min_func = k_over_a(min_samples, 0.9) - top_func = k_over_a(top_samples, 0.9) - min_genes: npt.NDArray[np.bool] = genefilter(boolean_expression, min_func) - top_genes: npt.NDArray[np.bool] = genefilter(boolean_expression, top_func) + # TODO: Is using quantile correct? + # quantile_cutoff = np.nanquantile(a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0) + tpm_above_cutoff = tpm_matrix[tpm_matrix > filtering_options.cut_off] # Only keep `entrez_gene_ids` that pass `min_genes` - metric.entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, min_genes, strict=True) if keep] - metric.gene_sizes = np.asarray(gene for gene, keep in zip(gene_size, min_genes, strict=True) if keep) - metric.count_matrix = cast(pd.DataFrame, metric.count_matrix.iloc[min_genes, :]) - metric.normalization_matrix = cast(pd.DataFrame, metrics[sample].normalization_matrix.iloc[min_genes, :]) - - keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes, strict=True) if keep] - metric.high_confidence_entrez_gene_ids = [ - gene for gene, keep in zip(entrez_ids, keep_top_genes, strict=True) if keep - ] - - metrics = calculate_z_score(metrics) + min_samples = round(min_sample_expression * len(tpm_matrix.columns)) + min_func = k_over_a(min_samples, filtering_options.cut_off) + min_genes: npt.NDArray[np.bool] = genefilter(tpm_above_cutoff, min_func) + metric.entrez_gene_ids = [gene for gene, keep in zip(tpm_matrix.index, min_genes, strict=True) if keep] + # metric.gene_sizes = np.asarray(gene for gene, keep in zip(gene_size, min_genes, strict=True) if keep) + # metric.count_matrix = metric.count_matrix.iloc[min_genes, :] + metrics[sample].normalization_matrix = metrics[sample].normalization_matrix.iloc[min_genes, :] + + # top_samples = round(high_confidence_sample_expression * len(tpm_matrix.columns)) + # top_func = k_over_a(top_samples, filtering_options.cut_off) + # top_genes: npt.NDArray[np.bool] = genefilter(boolean_expression, top_func) + # keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes, strict=True) if keep] + # metric.high_confidence_entrez_gene_ids = [ + # gene for gene, keep in zip(entrez_ids, keep_top_genes, strict=True) if keep + # ] + + # Calculate z-scores + log_matrix = np.log1p(metrics[sample].normalization_matrix.values) + metrics[sample].z_score_matrix = pd.DataFrame( + sklearn.preprocessing.scale(log_matrix, axis=0), + columns=metrics[sample].normalization_matrix.columns, + index=metrics[sample].normalization_matrix.index, + ) return metrics @@ -480,8 +410,7 @@ def zfpkm_filter( filtering_options: _FilteringOptions, calculate_fpkm: bool, force_zfpkm_plot: bool, - min_peak_height: float, - min_peak_distance: int, + output_fpkm_filepath: Path | None, output_png_dirpath: Path | None, force_negative_to_zero: bool = False, ) -> NamedMetrics: @@ -506,8 +435,14 @@ def zfpkm_filter( cut_off = filtering_options.cut_off metrics = _calculate_fpkm(metrics) if calculate_fpkm else metrics + if output_fpkm_filepath: + combined_fpkm = pd.concat([df.normalization_matrix for df in metrics.values()], axis=1) + combined_fpkm = combined_fpkm.sort_index().reindex(columns=sorted(combined_fpkm.columns)) + combined_fpkm.to_csv(output_fpkm_filepath, index=True) + context = next(iter(metrics.values())).sample_names[0].partition("_")[0] + logger.success(f"Wrote FPKM data for {context} to: {output_fpkm_filepath}") + for metric in metrics.values(): - metric: _StudyMetrics # if fpkm was not calculated, the normalization matrix will be empty; collect the count matrix instead matrix = metric.count_matrix if metric.normalization_matrix.empty else metric.normalization_matrix if not isinstance(matrix, pd.DataFrame): @@ -587,29 +522,28 @@ def umi_filter( logger.warning( "Setting a minimum sample expression greater than ~20% for UMI-based filtering will likely result in very few/no genes being marked as active. " # noqa: E501 "Activity values ranging from 10-20% are recommended based on recent literature. " - f"Got: {min_sample_expression} for option 'replicate_ratio'" + f"Got: {min_sample_expression:%} for option 'replicate_ratio'" ) if high_confidence_sample_expression > 0.40: logger.warning( f"Setting high-confidence expression greater than ~40% for UMI-based filtering will likely result in very few to no genes being marked as highly active. " # noqa: E501 "Activity values ranging from 20-30% are recommended based on recent literature. " - f"Got: {high_confidence_sample_expression} for option 'high_replicate_ratio'." + f"Got: {high_confidence_sample_expression:%} for option 'high_replicate_ratio'." ) for metric in metrics.values(): metric: _StudyMetrics if not isinstance(metric.count_matrix, sc.AnnData): raise TypeError(f"Expected a scanpy.AnnData for UMI filtering, got: '{type(metric.count_matrix)}'") - adata: sc.AnnData = metric.count_matrix + adata: sc.AnnData = metric.count_matrix.copy() if perform_normalization: if adata.raw is not None: adata.X = adata.raw.X.copy() - sc.pp.filter_cells(adata, min_genes=20) + sc.pp.filter_cells(adata, min_genes=10) sc.pp.filter_genes(adata, min_cells=1) sc.pp.normalize_total(adata, target_sum=target_sum) sc.pp.log1p(adata) - # sc.pp.scale(adata, max_value=15) # abs(values)>10 standard deviations away will be set to +/-10 metric.z_score_matrix = adata @@ -622,9 +556,8 @@ def umi_filter( for j in range(n_genes): col = adata_x.getcol(j).toarray().ravel() if sparse.issparse(adata_x) else adata_x[:, j] min_genes_mask[j] = min_func(col) - metric.entrez_gene_ids = ( - adata.var.loc[min_genes_mask, "entrez_gene_id"].dropna().tolist() - ) # at this point we do not need/want NA entrez IDs + # at this point we do not need/want NA entrez IDs + metric.entrez_gene_ids = adata.var.loc[min_genes_mask, "entrez_gene_id"].dropna().tolist() top_samples = round(high_confidence_sample_expression * n_cells) top_func = k_over_a(top_samples, cut_off) @@ -645,10 +578,11 @@ def filter_counts( filtering_options: _FilteringOptions, prep: RNAType, force_zfpkm_plot: bool, - zfpkm_min_peak_height: float, - zfpkm_min_peak_distance: int, umi_target_sum: int = 10_000, + tpm_df: pd.DataFrame | None, + calculate_fpkm: bool = True, umi_perform_normalization: bool = False, + output_fpkm_filepath: Path | None, output_zfpkm_plot_dirpath: Path | None = None, force_negative_to_zero: bool = False, ) -> NamedMetrics: @@ -660,32 +594,34 @@ def filter_counts( :param filtering_options: Options for filtering the count matrix. :param prep: The RNA preparation type. :param force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples. - :param zfpkm_min_peak_height: Minimum peak height for zFPKM peak identification. - :param zfpkm_min_peak_distance: Minimum peak distance for zFPKM peak identification. :param umi_target_sum: The target sum for UMI normalization. + :param tpm_df: A dataframe containing TPM values for each gene in each sample, used for TPM quantile filtering + :param calculate_fpkm: If using paired-end data, should FPKM be calculated? :param umi_perform_normalization: Whether to perform normalization before UMI filtering. :param output_zfpkm_plot_dirpath: Optional filepath to save the zFPKM plot. + :param output_fpkm_filepath: Optional filepath to save the FPKM matrix (only applies if `calculate_fpkm=True` and `technique=FilteringTechnique.ZFPKM`) :param force_negative_to_zero: Should negative values be forcibly set to 0? This could happen as a result of normalization producing negative near-zero values (e.g., -0.001) :returns: A dictionary of filtered study metrics. - """ + """ # noqa: E501 match technique: case FilteringTechnique.CPM: return cpm_filter( context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep ) case FilteringTechnique.TPM: - return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options) + if tpm_df is None: + raise ValueError("TPM DataFrame must be provided for TPM quantile filtering") + return tpm_filter(metrics=metrics, filtering_options=filtering_options, tpm_df=tpm_df) case FilteringTechnique.ZFPKM: return zfpkm_filter( metrics=metrics, filtering_options=filtering_options, calculate_fpkm=True, force_zfpkm_plot=force_zfpkm_plot, - min_peak_height=zfpkm_min_peak_height, - min_peak_distance=zfpkm_min_peak_distance, output_png_dirpath=output_zfpkm_plot_dirpath, + output_fpkm_filepath=output_fpkm_filepath, force_negative_to_zero=force_negative_to_zero, ) case FilteringTechnique.UMI: @@ -695,16 +631,15 @@ def filter_counts( target_sum=umi_target_sum, perform_normalization=umi_perform_normalization, ) - case _: - raise ValueError(f"Technique must be one of {FilteringTechnique}, got '{technique.value}'") -async def _process( +def _process( context_name: str, rnaseq_matrix_filepath: Path, metadata_df: pd.DataFrame, gene_info_df: pd.DataFrame, - fragment_df: pd.DataFrame | None, + fragment_df: pd.DataFrame, + tpm_df: pd.DataFrame, prep: RNAType, taxon: int, replicate_ratio: float, @@ -714,12 +649,11 @@ async def _process( technique: FilteringTechnique, cut_off: int | float, force_zfpkm_plot: bool, - zfpkm_min_peak_height: float, - zfpkm_min_peak_distance: int, umi_target_sum: int, umi_perform_normalization: bool, output_boolean_activity_filepath: Path, output_zscore_normalization_filepath: Path, + output_fpkm_filepath: Path | None, output_zfpkm_plot_dirpath: Path | None, force_negative_to_zero: bool, ): @@ -735,12 +669,15 @@ async def _process( high_batch_ratio=high_batch_ratio, ) - metrics, entrez_gene_ids = await _build_matrix_results( + if fragment_df.empty and tpm_df.empty and prep != RNAType.SCRNA: + raise ValueError("Effective length dataframe and TPM dataframe cannot both be empty for RNA-Seq processing") + metrics, entrez_gene_ids = _build_matrix_results( rnaseq_matrix, gene_info=gene_info_df, metadata_df=metadata_df, - fragment_df=fragment_df, + normalize_df=fragment_df if not fragment_df.empty else tpm_df, taxon=taxon, + normalize_df_is_fragment_length=not bool(fragment_df.empty), ) metrics = filter_counts( context_name=context_name, @@ -748,12 +685,12 @@ async def _process( technique=technique, filtering_options=filtering_options, prep=prep, + tpm_df=tpm_df, force_zfpkm_plot=force_zfpkm_plot, - zfpkm_min_peak_height=zfpkm_min_peak_height, - zfpkm_min_peak_distance=zfpkm_min_peak_distance, umi_target_sum=umi_target_sum, umi_perform_normalization=umi_perform_normalization, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, + output_fpkm_filepath=output_fpkm_filepath, force_negative_to_zero=force_negative_to_zero, ) @@ -771,20 +708,20 @@ async def _process( ) merged_zscores = merged_zscores.reindex(columns=sorted(merged_zscores.columns)) - merged_zscores = merged_zscores.groupby("entrez_gene_id").mean() + merged_zscores = merged_zscores.groupby(by=merged_zscores.index.name).mean() merged_zscores.to_csv(output_zscore_normalization_filepath, index=True) elif isinstance(rnaseq_matrix, sc.AnnData): merged_zscores = ad.concat([m.z_score_matrix for m in metrics.values()], axis="obs") merged_zscores.var.index.name = "entrez_gene_id" merged_zscores.obs = merged_zscores.obs.reindex(columns=sorted(merged_zscores.obs.columns)) merged_zscores.write_h5ad(output_zscore_normalization_filepath.with_suffix(".h5ad")) + logger.success(f"Wrote z-score normalization matrix to {output_zscore_normalization_filepath}") + expressed_genes: list[str] = list(itertools.chain.from_iterable(m.entrez_gene_ids for m in metrics.values())) top_genes: list[str] = list( itertools.chain.from_iterable(m.high_confidence_entrez_gene_ids for m in metrics.values()) ) - logger.success(f"Wrote z-score normalization matrix to {output_zscore_normalization_filepath}") - expression_frequency = pd.Series(expressed_genes).value_counts() expression_df = pd.DataFrame( {"entrez_gene_id": expression_frequency.index, "frequency": expression_frequency.values} @@ -812,16 +749,17 @@ async def _process( # TODO: 2025-OCT-31: commented out dropping entrez ids, should this be kept? # boolean_matrix.dropna(subset="entrez_gene_id", inplace=True) boolean_matrix = boolean_matrix.groupby("entrez_gene_id", as_index=False).mean() - boolean_matrix["expressed"] = boolean_matrix["expressed"].copy().astype(int) + boolean_matrix["expressed"] = boolean_matrix["expressed"].copy().round(0) boolean_matrix["high"] = boolean_matrix["high"].copy().astype(int) boolean_matrix.to_csv(output_boolean_activity_filepath, index=False) logger.info( - f"{context_name} - Found {expressed_count} expressed genes, {high_confidence_count} of which are confidently expressed" + f"{context_name} - Found {expressed_count} expressed genes, " + f"{high_confidence_count} of which are confidently expressed" ) logger.success(f"Wrote boolean matrix to {output_boolean_activity_filepath}") -async def rnaseq_gen( # noqa: C901 +def rnaseq_gen( # noqa: C901 context_name: str, input_rnaseq_filepath: Path, input_gene_info_filepath: Path, @@ -835,10 +773,10 @@ async def rnaseq_gen( # noqa: C901 batch_ratio: float = 0.5, high_batch_ratio: float = 0.75, technique: FilteringTechnique | str = FilteringTechnique.ZFPKM, - zfpkm_min_peak_height: float = 0.02, - zfpkm_min_peak_distance: int = 1, umi_target_sum: int = 10_000, + output_fpkm_filepath: Path | None = None, input_fragment_lengths: Path | None = None, + input_tpm_filepath: Path | None = None, umi_perform_normalization: bool = False, cutoff: int | float | None = None, force_zfpkm_plot: bool = False, @@ -862,6 +800,7 @@ async def rnaseq_gen( # noqa: C901 :param taxon_id: The NCBI Taxon ID :param input_metadata_filepath_or_df: The filepath or dataframe containing metadata information :param input_fragment_lengths: The filepath to the fragment lengths file, if applicable. + :param input_tpm_filepath: The filepath to the TPM values file, if applicable. :param replicate_ratio: The percentage of replicates that a gene must appear in for a gene to be marked as "active" in a batch/study :param batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked as 'active" @@ -870,11 +809,13 @@ async def rnaseq_gen( # noqa: C901 :param high_batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked "highly confident" in its expression :param technique: The filtering technique to use - :param zfpkm_min_peak_height: The height of the zFPKM peak - :param zfpkm_min_peak_distance: The distance of the zFPKM peak :param umi_target_sum: The target sum for UMI normalization - :param umi_perform_normalization: Should UMI normalization be performed? + :param output_fpkm_filepath: Optional filepath to write the FPKM matrix + (required if using zFPKM filtering with paired-end data) + :param umi_perform_normalization: Should UMI normalization be performed :param cutoff: The cutoff value to use for the provided filtering technique + TPM: This is a TPM-based value; if `replicate_ratio` genes have a value less than this, the gene will be marked as inactive + zFPKM: This is a zFPKM-based value; if `replicate_ratio` genes have a value less than this, the gene will be marked as inactive :param force_zfpkm_plot: If too many samples exist, should plotting be done anyway? :param log_level: The level of logging to output :param log_location: The location to write logs to @@ -883,28 +824,32 @@ async def rnaseq_gen( # noqa: C901 This could happen as a result of normalization producing negative near-zero values (e.g., -0.001) :return: None - """ + """ # noqa: E501 set_up_logging(level=log_level, location=log_location) + fragment_df = pd.DataFrame() technique = FilteringTechnique(technique) if isinstance(technique, str) else technique - match technique: - case FilteringTechnique.TPM: - cutoff: int | float = cutoff or 25 - if cutoff < 1 or cutoff > 100: - raise ValueError("Quantile must be between 1 - 100") - - case FilteringTechnique.CPM: - if cutoff and cutoff < 0: - raise ValueError("Cutoff must be greater than or equal to 0") - elif cutoff: - cutoff = "default" - - case FilteringTechnique.ZFPKM: - cutoff: int | float = cutoff or -3 - case FilteringTechnique.UMI: - cutoff: int = cutoff or 1 - case _: - raise ValueError(f"Technique must be one of {','.join(FilteringTechnique)}. Got: {technique.value}") + if technique == FilteringTechnique.TPM: + if input_tpm_filepath is None: + raise ValueError("Missing argument: `input_tpm_filepath` is required for TPM filtering") + cutoff = cutoff or 1 + elif technique == FilteringTechnique.CPM: + if isinstance(cutoff, int) and cutoff < 0: + raise ValueError("Cutoff must be greater than or equal to 0") + elif not cutoff: + cutoff = np.nan + elif technique == FilteringTechnique.ZFPKM: + if not input_fragment_lengths: + raise ValueError("Fragment lengths file is required for zFPKM filtering") + if not output_fpkm_filepath: + raise ValueError("Output FPKM filepath is required for zFPKM filtering") + cutoff = cutoff or -3 + fragment_df = read_file(input_fragment_lengths) + + elif technique == FilteringTechnique.UMI: + cutoff = cutoff or 1 + else: + raise ValueError(f"Technique must be one of {','.join(FilteringTechnique)}. Got: {technique.value}") if not input_rnaseq_filepath.exists(): raise FileNotFoundError(f"Input RNA-seq file not found! Searching for: '{input_rnaseq_filepath}'") @@ -923,7 +868,7 @@ async def rnaseq_gen( # noqa: C901 if input_metadata_filepath_or_df.suffix not in {".xlsx", ".xls"}: raise ValueError( f"Expected an excel file with extension of '.xlsx' or '.xls', " - f"got '{input_metadata_filepath_or_df.suffix}'" + f"got '{input_metadata_filepath_or_df.suffix}'." ) if not input_metadata_filepath_or_df.exists(): raise FileNotFoundError(f"Input metadata file not found! Searching for: '{input_metadata_filepath_or_df}'") @@ -934,13 +879,13 @@ async def rnaseq_gen( # noqa: C901 f"Expected a pandas DataFrame or Path object as metadata, got '{type(input_metadata_filepath_or_df)}'" ) - logger.debug(f"Starting '{context_name}'") - await _process( + _process( context_name=context_name, rnaseq_matrix_filepath=input_rnaseq_filepath, metadata_df=metadata_df, gene_info_df=read_file(input_gene_info_filepath), - fragment_df=read_file(input_fragment_lengths), + fragment_df=fragment_df, + tpm_df=read_file(input_tpm_filepath) if input_tpm_filepath else pd.DataFrame(), prep=prep, taxon=taxon_id, replicate_ratio=replicate_ratio, @@ -950,12 +895,65 @@ async def rnaseq_gen( # noqa: C901 technique=technique, cut_off=cutoff, force_zfpkm_plot=force_zfpkm_plot, - zfpkm_min_peak_height=zfpkm_min_peak_height, - zfpkm_min_peak_distance=zfpkm_min_peak_distance, umi_target_sum=umi_target_sum, umi_perform_normalization=umi_perform_normalization, + output_fpkm_filepath=output_fpkm_filepath, output_boolean_activity_filepath=output_boolean_activity_filepath, output_zscore_normalization_filepath=output_zscore_normalization_filepath, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, force_negative_to_zero=force_negative_counts_to_zero, ) + + +if __name__ == "__main__": + import matplotlib.pyplot as plt + + data = pd.read_csv("/Users/joshl/Downloads/fpkm_example_data/CD8.genes.results.txt", sep="\t") + data["gene_id"] = data["gene_id"].str.partition(".")[0] + counts = ( + data[["gene_id", "expected_count"]] + .copy() + .set_index("gene_id") + .sort_index() + .rename(columns={"expected_count": "actual"}) + ) + eff_len = ( + data[["gene_id", "effective_length"]] + .copy() + .set_index("gene_id") + .sort_index() + .rename(columns={"effective_length": "actual"}) + ) + expected_fpkm = ( + data[["gene_id", "FPKM"]].copy().set_index("gene_id").sort_index().rename(columns={"FPKM": "expected"}) + ) + + metrics = { + "S1": _StudyMetrics( + study="S1", + num_samples=1, + count_matrix=counts, + eff_length=eff_len, + sample_names=[""], + layout=[LayoutMethod.paired_end], + entrez_gene_ids=np.ndarray([0]), + gene_sizes=np.ndarray([0]), + ) + } + calculated_fpkm = _calculate_fpkm(metrics)["S1"].normalization_matrix + calculated_fpkm = calculated_fpkm.round(2) + + joined = calculated_fpkm.join(expected_fpkm, how="inner") + joined["actual"] = joined["actual"].replace([np.nan, np.inf], 0) + + zfpkm_df, _ = zFPKM(joined, remove_na=True) + zfpkm_df = zfpkm_df.replace(-np.inf, np.nan) + + fig, axes = cast(tuple[plt.Figure, list[plt.Axes]], plt.subplots(nrows=2, ncols=1)) + axes[0].hist(zfpkm_df["actual"].to_numpy()) + axes[0].set_title("Expected zFPKM") + + axes[1].hist(zfpkm_df["expected"].to_numpy()) + axes[1].set_title("Actual zFPKM") + fig.tight_layout() + fig.show() From 0f66f122a816332cc40f9e06b11b976cb7f5fdc0 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:04:43 -0600 Subject: [PATCH 14/41] feat: simplify merging logic - Removes `_get_transcriptomic_details` function as it was unused - Renames `_merge_xomics` to `_trinarize_data` for -1/0/+1 binning - Simplify missing data handling [BREAKING-CHANGE]: Removes all async functions for synchronous counterparts Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 195 ++++++++++++-------------------------- 1 file changed, 58 insertions(+), 137 deletions(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index 564c2025..6221e455 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -88,7 +88,6 @@ def load_dummy_dict(): return load_dummy_dict() -# Merge Output def _merge_logical_table(df: pd.DataFrame): """Merge rows of Logical Table belonging to the same entrez_gene_id. @@ -100,12 +99,14 @@ def _merge_logical_table(df: pd.DataFrame): """ # step 1: get all plural ENTREZ_GENE_IDs in the input table, extract unique IDs df.dropna(subset=["entrez_gene_id"], inplace=True) + df["entrez_gene_id"] = df["entrez_gene_id"].copy().astype(str) df.loc[:, "entrez_gene_id"] = df.loc[:, "entrez_gene_id"].astype(str).str.replace(" /// ", "//").astype(str) - id_list: list[str] = df.loc[~df["entrez_gene_id"].str.contains("//"), "entrez_gene_id"].tolist() # Collect "single" ids, like "123" - multiple_entrez_ids: list[str] = df.loc[ - df["entrez_gene_id"].str.contains("//"), "entrez_gene_id" - ].tolist() # Collect "double" ids, like "123//456" + # Collect "single" ids, like "123" + id_list: list[str] = df.loc[~df["entrez_gene_id"].str.contains("//"), "entrez_gene_id"].tolist() + + # Collect "double" ids, like "123//456" + multiple_entrez_ids: list[str] = df.loc[df["entrez_gene_id"].str.contains("//"), "entrez_gene_id"].tolist() for i in multiple_entrez_ids: ids = i.split("//") @@ -174,81 +175,7 @@ def _merge_logical_table(df: pd.DataFrame): return df -async def _get_transcriptmoic_details(merged_df: pd.DataFrame, taxon_id: int) -> pd.DataFrame: - """Get details of transcriptomic data. - - This function will get the following details of transcriptomic data: - - Gene Symbol - - Gene Name - - entrez_gene_id - - The resulting dataframe will have its columns created in the order listed above - It will return a pandas dataframe with this information - - Args: - merged_df: A dataframe containing all active transcriptomic and proteomic genes - taxon_id: The NCBI taxonomy ID of the organism - - Returns: - A dataframe with the above-listed columns - """ - # If _ExpressedHeaderNames.PROTEOMICS.value is in the dataframe, lower the required expression by 1 - # We are only trying to get details for transcriptomic data - logger.debug("Obtaining transcriptomic details") - transcriptomic_df: pd.DataFrame = merged_df.copy() - if _ExpressedHeaderNames.PROTEOMICS in merged_df.columns: - logger.trace("Proteomic data found, modifying required and total expression values") - # Get the number of sources required for a gene to be marked "expressed" - required_expression = merged_df["required"].iloc[0] - - # Subtract 1 from merged_df["TotalExpressed"] if the current value is greater than or equal to 1 - # This is done to take into account the removal of proteomic expression - merged_df["total_expressed"] = merged_df["total_expressed"].apply(lambda x: x - 1 if x >= 1 else x) - - # Subtract required_expression by 1 if it is greater than 1 - if required_expression > 1: - required_expression -= 1 - - transcriptomic_df: pd.DataFrame = merged_df.drop( - columns=[ - _ExpressedHeaderNames.PROTEOMICS, - _HighExpressionHeaderNames.PROTEOMICS, - ], - inplace=False, - ) - logger.trace(f"Modified transcriptomic dataframe: {transcriptomic_df.shape}") - - # Must recalculate TotalExpressed because proteomic data was removed - # If the TotalExpressed column is less than the Required column, set active to 1, otherwise set it to 0 - transcriptomic_df.loc[ - transcriptomic_df["total_expressed"] >= transcriptomic_df["required"], - "active", - ] = 1 - - my_gene = MyGene() - gene_details: pd.DataFrame = pd.DataFrame( - data=pd.NA, - columns=["entrez_gene_id", "gene_symbol", "description", "gene_type"], - index=list(range(len(transcriptomic_df))), - ) - logger.trace(f"Querying MyGene for details on {len(transcriptomic_df)} genes") - for i, detail in enumerate( - await my_gene.query( - items=transcriptomic_df["entrez_gene_id"].astype(int).tolist(), - taxon=taxon_id, - scopes="entrezgene", - ) - ): - gene_details.at[i, "entrez_gene_id"] = detail["entrezgene"] - gene_details.at[i, "gene_symbol"] = detail["symbol"] - gene_details.at[i, "description"] = detail["name"] - gene_details.at[i, "gene_type"] = detail["type_of_gene"] - - logger.debug("Finished obtaining transcriptomic details") - return gene_details - - -async def _merge_xomics( +def _trinarize_data( context_name: str, expression_requirement: int, trna_boolean_matrix: pd.DataFrame | None, @@ -257,8 +184,6 @@ async def _merge_xomics( proteomic_boolean_matrix: pd.DataFrame | None, output_merged_filepath: Path, output_gene_activity_filepath: Path, - output_transcriptomic_details_filepath: Path, - taxon_id: int, force_activate_high_confidence: bool = True, adjust_for_missing_sources: bool = False, ): @@ -277,17 +202,18 @@ async def _merge_xomics( logger.trace(f"Skipping {expressed_sourcetype} because it's matrix does not exist") continue - matrix: pd.DataFrame # re-define type to assist in type hinting for IDEs expression_list.append(expressed_sourcetype) high_confidence_list.append(high_expressed_sourcetype) matrix.rename(columns={"expressed": expressed_sourcetype, "high": high_expressed_sourcetype}, inplace=True) - matrix = cast(pd.DataFrame, matrix[matrix["entrez_gene_id"] != "-"]) + matrix = matrix[matrix["entrez_gene_id"] != "-"] matrix.loc[:, "entrez_gene_id"] = matrix.loc[:, "entrez_gene_id"].astype(int) merge_data = matrix if merge_data.empty else merge_data.merge(matrix, on="entrez_gene_id", how="outer") logger.trace(f"Shape of merged data before merging logical tables: {merge_data.shape}") if merge_data.empty: - logger.warning(f"No data is available for the '{context_name}' context. If this is intentional, ignore this error.") + logger.warning( + f"No data is available for the '{context_name}' context. If this is intentional, ignore this error." + ) return {} merge_data = _merge_logical_table(merge_data) @@ -300,7 +226,11 @@ async def _merge_xomics( if adjust_for_missing_sources: # Subtract 1 from requirement per missing source logger.trace("Adjusting for missing data sources") merge_data.loc[:, "required"] = merge_data[expression_list].apply( - lambda x: expression_requirement - (num_sources - x.count()) if (expression_requirement - (num_sources - x.count()) > 0) else 1, + lambda x: ( + expression_requirement - (num_sources - x.count()) + if (expression_requirement - (num_sources - x.count()) > 0) + else 1 + ), axis=1, ) else: # Do not adjust for missing sources @@ -321,29 +251,26 @@ async def _merge_xomics( merge_data.loc[merge_data[high_confidence_list].sum(axis=1) > 0, "active"] = 1 merge_data.dropna(inplace=True) + merge_data = merge_data.groupby("entrez_gene_id", as_index=False).mean() merge_data.to_csv(output_merged_filepath, index=False) logger.success(f"Saved merged data to {output_merged_filepath}") - logger.debug(f"Generating transcriptomic details using {output_merged_filepath}") - transcriptomic_details = await _get_transcriptmoic_details(merge_data, taxon_id=taxon_id) - logger.debug(f"Saving transcriptomic details to {output_transcriptomic_details_filepath}") - transcriptomic_details.dropna(inplace=True) - transcriptomic_details.to_csv(output_transcriptomic_details_filepath, index=False) - logger.success(f"Saved transcriptomic details to {output_transcriptomic_details_filepath}") return {context_name: output_gene_activity_filepath.as_posix()} -async def _update_missing_data(input_matrices: _InputMatrices, taxon_id: int) -> _InputMatrices: +def _update_missing_data(input_matrices: _InputMatrices, taxon_id: int) -> _InputMatrices: logger.trace("Updating missing genomic data") - matrix_keys: dict[str, list[pd.DataFrame]] = { + matrix_keys: dict[str, list[pd.DataFrame | None]] = { "trna": [input_matrices.trna], "mrna": [input_matrices.mrna], "scrna": [input_matrices.scrna], "proteomics": [input_matrices.proteomics], } logger.trace(f"Gathering missing data for data sources: {','.join(key for key in matrix_keys if key is not None)}") + # ruff: disable[E501] # fmt: off + # TODO: Use the local `gene_info.csv` file results: tuple[pd.DataFrame | None, ...] = ( get_missing_gene_data(values=input_matrices.trna, taxon_id=taxon_id) if input_matrices.trna is not None else None, get_missing_gene_data(values=input_matrices.mrna, taxon_id=taxon_id) if input_matrices.mrna is not None else None, @@ -356,34 +283,29 @@ async def _update_missing_data(input_matrices: _InputMatrices, taxon_id: int) -> matrix_keys[key].append(results[i]) for matrix_name, (matrix, conversion) in matrix_keys.items(): - matrix: pd.DataFrame - if matrix is not None: - # fmt: off - existing_data = ( - "gene_symbol" if "gene_symbol" in matrix.columns - else "entrez_gene_id" if "entrez_gene_id" in matrix.columns - else "ensembl_gene_id" - ) - # fmt: on - logger.trace(f"Merging conversion data for {matrix_name}, existing id column is: {existing_data}") - # input_matrices[matrix_name][existing_data] = input_matrices[matrix_name][existing_data].astype(str) - if existing_data == "entrez_gene_id": - input_matrices[matrix_name]["entrez_gene_id"] = input_matrices[matrix_name]["entrez_gene_id"].astype(int) - conversion.index = conversion.index.astype(int) - - input_matrices[matrix_name] = ( - input_matrices[matrix_name].merge(conversion, how="left", left_on=existing_data, right_index=True).reset_index(drop=True) - ) + if matrix is None or conversion is None: + continue - # input_matrices[matrix_name] = ( - # input_matrices[matrix_name].merge(conversion, how="left", left_index=True, right_index=True).dropna().reset_index(drop=True) - # ) + merge_on = matrix.columns.intersection(conversion.columns).to_list() + logger.trace(f"Merging conversion data for {matrix_name} on column(s): {','.join(merge_on)}") + if "entrez_gene_id" in merge_on: + matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int) + conversion["entrez_gene_id"] = conversion["entrez_gene_id"].astype(int) + conversion = conversion.explode(column="ensembl_gene_id", ignore_index=True) + + if "notfound" in conversion.columns: + conversion["notfound"] = conversion["notfound"].replace(np.nan, False) + conversion = conversion[~conversion["notfound"]] + conversion = conversion.drop(columns=["notfound"]) + matrix = matrix.merge(conversion, how="outer", on=merge_on).reset_index(drop=True) + matrix = matrix[~matrix["entrez_gene_id"].isna()] # we need to exclude NA entrez IDs for model building + input_matrices[matrix_name] = matrix logger.debug("Updated missing genomic data") return input_matrices -async def _process( +def _process( *, context_name: str, input_matrices: _InputMatrices, @@ -396,11 +318,10 @@ async def _process( weighted_z_floor: int, weighted_z_ceiling: int, adjust_method: AdjustmentMethod, - merge_zfpkm_distribution: bool, + merge_zscore_distribution: bool, force_activate_high_confidence: bool, adjust_for_missing_sources: bool, output_merge_activity_filepath: Path, - output_transcriptomic_details_filepath: Path, output_activity_filepaths: _OutputCombinedSourceFilepath, output_final_model_scores_filepath: Path, output_figure_dirpath: Path | None, @@ -409,18 +330,19 @@ async def _process( logger.trace( f"Settings: Min Expression: {minimum_source_expression}, Expression Requirement: {expression_requirement}, " f"Weighted Z-Score Floor: {weighted_z_floor}, Weighted Z-Score Ceiling: {weighted_z_ceiling}, " - f"Adjust Method: {adjust_method.value}, Merge Z-Scores: {merge_zfpkm_distribution}, " + f"Adjust Method: {adjust_method.value}, Merge Z-Scores: {merge_zscore_distribution}, " f"Force High Confidence: {force_activate_high_confidence}, Adjust for Missing: {adjust_for_missing_sources}" ) # Collect missing genomic data for each of the input items in asynchronous parallel - input_matrices = await _update_missing_data(input_matrices, taxon_id) + input_matrices = _update_missing_data(input_matrices, taxon_id) logger.trace("Missing data updated") - if merge_zfpkm_distribution: + if merge_zscore_distribution: logger.trace("Merging Z-Scores") - await _begin_combining_distributions( + _begin_combining_distributions( context_name=context_name, + taxon=taxon_id, input_matrices=input_matrices, batch_names=batch_names, source_weights=source_weights, @@ -471,7 +393,7 @@ async def _process( adjusted_expression_requirement = 1 logger.debug(f"Final Expression Requirement: {adjusted_expression_requirement}") - await _merge_xomics( + _trinarize_data( context_name=context_name, expression_requirement=adjusted_expression_requirement, trna_boolean_matrix=boolean_matrices.trna, @@ -480,8 +402,6 @@ async def _process( proteomic_boolean_matrix=boolean_matrices.proteomics, output_merged_filepath=output_merge_activity_filepath, output_gene_activity_filepath=output_final_model_scores_filepath, - output_transcriptomic_details_filepath=output_transcriptomic_details_filepath, - taxon_id=taxon_id, force_activate_high_confidence=force_activate_high_confidence, adjust_for_missing_sources=adjust_for_missing_sources, ) @@ -494,14 +414,17 @@ def _build_batches( proteomic_metadata: pd.DataFrame | None, ) -> _BatchNames: batch_names = _BatchNames() - for source, metadata in zip(SourceTypes.__members__.values(), [trna_metadata, mrna_metadata, scrna_metadata, proteomic_metadata], strict=True): + metadata: pd.DataFrame | None + for source, metadata in zip( + SourceTypes.__members__.values(), + [trna_metadata, mrna_metadata, scrna_metadata, proteomic_metadata], + strict=True, + ): source: SourceTypes - metadata: pd.DataFrame if metadata is None: logger.trace(f"Metadata for source '{source.value}' is None, skipping") continue - metadata: pd.DataFrame # Re-assign type to assist in type hinting for study in sorted(metadata["study"].unique()): batch_search = re.search(r"\d+", study) if not batch_search: @@ -535,10 +458,9 @@ def _validate_source_arguments( raise ValueError(f"Must specify all or none of '{source.value}' arguments") -async def merge_xomics( # noqa: C901 +def merge_xomics( # noqa: C901 context_name: str, output_merge_activity_filepath: Path, - output_transcriptomic_details_filepath: Path, output_final_model_scores_filepath: Path, output_figure_dirpath: Path | None, taxon_id: int, @@ -567,7 +489,7 @@ async def merge_xomics( # noqa: C901 adjust_method: AdjustmentMethod = AdjustmentMethod.FLAT, force_activate_high_confidence: bool = False, adjust_for_na: bool = False, - merge_zfpkm_distribution: bool = False, + merge_zscore_distributions: bool = True, weighted_z_floor: int = -6, weighted_z_ceiling: int = 6, log_level: LogLevel = LogLevel.INFO, @@ -600,7 +522,9 @@ async def merge_xomics( # noqa: C901 raise ValueError("No data was passed!") if expression_requirement and expression_requirement < 1: - logger.warning(f"Expression requirement must be at least 1! Setting to the minimum of 1 now. Got: {expression_requirement}") + logger.warning( + f"Expression requirement must be at least 1! Setting to the minimum of 1 now. Got: {expression_requirement}" + ) expression_requirement = 1 if expression_requirement is None: @@ -619,8 +543,6 @@ async def merge_xomics( # noqa: C901 output_final_model_scores_filepath.parent.mkdir(parents=True, exist_ok=True) if output_merge_activity_filepath: output_merge_activity_filepath.parent.mkdir(parents=True, exist_ok=True) - if output_transcriptomic_details_filepath: - output_transcriptomic_details_filepath.parent.mkdir(parents=True, exist_ok=True) if output_trna_activity_filepath: output_trna_activity_filepath.parent.mkdir(parents=True, exist_ok=True) if output_mrna_activity_filepath: @@ -674,7 +596,7 @@ async def merge_xomics( # noqa: C901 proteomic_metadata=proteomic_metadata, ) - await _process( + _process( context_name=context_name, input_matrices=input_matrices, boolean_matrices=boolean_matrices, @@ -686,12 +608,11 @@ async def merge_xomics( # noqa: C901 weighted_z_floor=weighted_z_floor, weighted_z_ceiling=weighted_z_ceiling, adjust_method=adjust_method, - merge_zfpkm_distribution=merge_zfpkm_distribution, + merge_zscore_distribution=merge_zscore_distributions, force_activate_high_confidence=force_activate_high_confidence, adjust_for_missing_sources=adjust_for_na, output_activity_filepaths=output_activity_filepaths, output_merge_activity_filepath=output_merge_activity_filepath, - output_transcriptomic_details_filepath=output_transcriptomic_details_filepath, output_final_model_scores_filepath=output_final_model_scores_filepath, output_figure_dirpath=output_figure_dirpath, ) From 85434fd5c3a62f5b7d7d44adbc3ba3c5c40521d8 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:05:59 -0600 Subject: [PATCH 15/41] fix: z-score combination logic - Fixes Z-score combination formula to use equal weights/Stouffer's method - Adds proper handling for NaN/Inf values - Fixes output formatting [BREAKING-CHANGE]: Converts all async functions to their synchronous counterparts Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 151 +++++++++++++++++------------ 1 file changed, 88 insertions(+), 63 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 24f8889a..f08430fd 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -2,10 +2,10 @@ from collections.abc import Sequence from pathlib import Path -from typing import cast import numpy as np import pandas as pd +import scanpy as sc from loguru import logger from como.data_types import ( @@ -29,23 +29,17 @@ def _combine_z_distribution_for_batch( source: SourceTypes, output_combined_matrix_filepath: Path, output_figure_dirpath: Path, - weighted_z_floor: int, - weighted_z_ceiling: int, ) -> pd.DataFrame: """Combine z-score distributions across samples for a single batch. - Args: - context_name: Name of the context (e.g., tissue or condition). - batch: Batch entry containing batch number and sample names. - matrix: DataFrame with 'ensembl_gene_id' and sample columns. - source: Source type (e.g., trna, mrna, scrna, proteomics). - output_combined_matrix_filepath: Path to save the combined z-score matrix. - output_figure_dirpath: Path to save the z-score distribution figure. - weighted_z_floor: Minimum z-score value after combining. - weighted_z_ceiling: Maximum z-score value after combining. + :param context_name: Name of the context (e.g., tissue or condition). + :param batch: Batch entry containing batch number and sample names. + :param matrix: DataFrame with 'ensembl_gene_id' and sample columns. + :param source: Source type (e.g., trna, mrna, scrna, proteomics). + :param output_combined_matrix_filepath: Path to save the combined z-score matrix. + :param output_figure_dirpath: Path to save the z-score distribution figure. - Returns: - A pandas dataframe of the weighted z-distributions + :returns: A pandas dataframe of the weighted z-distributions """ output_combined_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) output_figure_dirpath.mkdir(parents=True, exist_ok=True) @@ -58,20 +52,30 @@ def _combine_z_distribution_for_batch( ) if num_columns(matrix) < 2: logger.trace( - f"A single sample exists for batch '{batch.batch_num}'. Returning as-is because no additional combining can be done" + f"A single sample exists for batch '{batch.batch_num}'. " + f"Returning as-is because no additional combining can be done" ) with_batch_num = matrix.copy() with_batch_num.columns = [batch.batch_num] return with_batch_num - values = matrix.values - weighted_matrix = np.clip( - a=np.sum(values, axis=1) / np.sqrt(values.shape[1]), # calculate a weighted matrix - a_min=weighted_z_floor, - a_max=weighted_z_ceiling, - ).astype(float) - - weighted_matrix = pd.DataFrame({"combine_z": weighted_matrix}, index=matrix.index) + values = matrix.copy().replace([np.inf, -np.inf], np.nan) + values_array = values.to_numpy(dtype=float) + mask = ~np.isnan(values_array) + weights = np.ones(values.shape[1], dtype=float) + masked_values = np.where(mask, values_array * weights, 0.0) + weighted_sum = masked_values.sum(axis=1) + + weights_squared = weights**2 + sum_weights_squared = (mask * weights_squared).sum(axis=1) + denominator = np.sqrt(sum_weights_squared) + combined_z = np.divide( + weighted_sum, + denominator, + out=np.full_like(weighted_sum, np.nan, dtype=float), + where=denominator != 0.0, + ) + df = pd.DataFrame({"combine_z": combined_z}, index=values.index) # merge_df = pd.concat([matrix, pd.Series(weighted_matrix, name="combined")], axis=1) # stack_df = pd.melt( @@ -91,9 +95,11 @@ def _combine_z_distribution_for_batch( # / f"{context_name}_{source.value}_batch{batch.batch_num}_combined_zscore_distribution.pdf", # ) - weighted_matrix.columns = [batch.batch_num] - weighted_matrix.to_csv(output_combined_matrix_filepath, index=True) - return weighted_matrix + df.columns = [batch.batch_num] + df.index = df.index.astype(int) + df = df.sort_index() + df.to_csv(output_combined_matrix_filepath, index=True) + return df def _combine_z_distribution_for_source( @@ -108,7 +114,8 @@ def _combine_z_distribution_for_source( """Combine z-score distributions across batches for a single source. Args: - merged_source_data : DataFrame with columns: ["ensembl_gene_id", , , ...]. Each batch column contains the within-batch combined z for that gene. + merged_source_data : DataFrame with columns: ["ensembl_gene_id", , , ...]. + Each batch column contains the within-batch combined z for that gene. context_name: tissue or condition name replicates_per_batch: vector of replicate counts per batch, aligned exactly to the order of batch columns in `merged_source_data.iloc[:, 1:]`. @@ -141,19 +148,20 @@ def _combine_z_distribution_for_source( ) logger.trace( - f"[{context_name}] Combining {expected_num_batches} batches with replicate counts: {dict(zip(batch_column_names, replicates_per_batch))}" + f"[{context_name}] Combining {expected_num_batches} batches with" + f" replicate counts: {dict(zip(batch_column_names, replicates_per_batch, strict=True))}" ) # extract values and build masks (shape is in gene x batch) gene_by_batch_values: np.ndarray = merged_source_data.to_numpy(dtype=float) present_mask: np.ndarray = ~np.isnan(gene_by_batch_values) - # per-batch weights from replicate counts, normalized over all batches + # per-batch weights: use equal weights (Stouffer's Z method) + # Replicate counts are not statistically valid weights for z-scores because + # z-scores are already standardized (mean=0, std=1 under null hypothesis) + # Weighting by sample size assumes information about precision that was lost during standardization replicates_per_batch_array: np.ndarray = np.asarray(replicates_per_batch, dtype=float) - total_replicates: float = replicates_per_batch_array.sum() - if total_replicates <= 0.0: - raise ValueError(f"[{context_name}] Sum of replicates_per_batch must be > 0; got {total_replicates}.") - base_weights_per_batch: np.ndarray = replicates_per_batch_array / total_replicates # shape: (B,) + base_weights_per_batch = np.ones_like(replicates_per_batch_array, dtype=float) # shape: (B,) # drop NA columns per gene by zeroing their weights masked_weights_per_batch: np.ndarray = np.where(present_mask, base_weights_per_batch, 0.0) # (G,B) @@ -173,16 +181,17 @@ def _combine_z_distribution_for_source( logger.trace(f"[{context_name}] Finished combining batch z-distributions for source.") - return pd.DataFrame({"combine_z": combined_z_per_gene}, index=merged_source_data.index) + df = pd.DataFrame({"combine_z": combined_z_per_gene}, index=merged_source_data.index) + df.index = df.index.astype(int) + df = df.sort_index() + return df def _combine_z_distribution_for_context( context: str, zscore_results: list[_CombineOmicsInput], output_graph_filepath: Path, - weighted_z_floor: int = -6, - weighted_z_ceiling: int = 6, -): +) -> pd.DataFrame: if not zscore_results: logger.warning("No zscore results exist, returning empty dataframe") return pd.DataFrame({"ensembl_gene_id": [], "combine_z": []}) @@ -197,13 +206,12 @@ def _combine_z_distribution_for_context( ) matrix.columns = [res.type.value.lower()] - matrix = cast(pd.DataFrame, matrix.loc[matrix.index != "-"]) + matrix = matrix.loc[matrix.index != "-"] # if the matrix has duplicate ids (i.e., multiple entrez ids map to a single ensembl id, etc.) - # take the mean value of them to remove the duplicates + # take the max value of them to remove the duplicates if not matrix.index.is_unique: - # matrix = cast(pd.DataFrame, matrix.groupby(level=0).mean()) - matrix = cast(pd.DataFrame, matrix.groupby(level=0).max()) + matrix = matrix.groupby(level=0).max() z_matrices.append(matrix) z_matrix = pd.concat(z_matrices, axis="columns", join="outer", ignore_index=False) @@ -214,20 +222,23 @@ def _combine_z_distribution_for_context( z_matrix.columns = ["combine_z"] return z_matrix - values = z_matrix.iloc[:, 1:].values + values = z_matrix.values weights = np.array([r.weight for r in zscore_results]) mask = ~np.isnan(values) masked_values = np.where(mask, values, 0) masked_weights = np.where(mask, weights, 0) - normalized_weights = masked_weights / np.sum(masked_weights, axis=1, keepdims=True) - numerator = np.sum(normalized_weights * masked_values, axis=1) - denominator = np.sqrt(np.sum(normalized_weights**2, axis=1)) - combined_z_matrix = numerator / denominator - combined_z_matrix = np.clip(combined_z_matrix, weighted_z_floor, weighted_z_ceiling, dtype=float) + numerator = np.sum(masked_weights * masked_values, axis=1) + denominator = np.sqrt(np.sum(masked_weights**2, axis=1)) + combined_z_matrix = np.divide( + numerator, + denominator, + out=np.full_like(numerator, np.nan, dtype=float), + where=denominator != 0.0, + ) combined_z_matrix_df = pd.DataFrame( { - "ensembl_gene_id": z_matrix.index, + "entrez_gene_id": z_matrix.index, "combine_z": combined_z_matrix, } ) @@ -252,11 +263,14 @@ def _combine_z_distribution_for_context( # title=f"Combined Z-score Distribution for {context}", # output_filepath=output_graph_filepath, # ) + combined_z_matrix_df["entrez_gene_id"] = combined_z_matrix_df["entrez_gene_id"].astype(int) + combined_z_matrix_df = combined_z_matrix_df.sort_values(by="entrez_gene_id") return combined_z_matrix_df -async def _begin_combining_distributions( +def _begin_combining_distributions( context_name: str, + taxon: int, input_matrices: _InputMatrices, batch_names: _BatchNames, source_weights: _SourceWeights, @@ -270,6 +284,8 @@ async def _begin_combining_distributions( output_figure_dirpath.mkdir(parents=True, exist_ok=True) z_score_results: list[_CombineOmicsInput] = [] + + matrix: pd.DataFrame | None for source, matrix in input_matrices: if matrix is None: logger.trace(f"Source '{source.value}' is None, skipping") @@ -288,21 +304,23 @@ async def _begin_combining_distributions( elif isinstance(matrix, sc.AnnData): conversion = get_remaining_identifiers(ids=matrix.var_names.tolist(), taxon=taxon) conversion.reset_index(drop=False, inplace=True) - matrix: pd.DataFrame = matrix.to_df().T + matrix = matrix.to_df().T matrix.reset_index(inplace=True, drop=False, names=["gene_symbol"]) - matrix: pd.DataFrame = matrix.merge( - conversion, left_on="gene_symbol", right_on="gene_symbol", how="left" - ) + matrix = matrix.merge(conversion, left_on="gene_symbol", right_on="gene_symbol", how="left") matrix_subset: pd.DataFrame = matrix[[GeneIdentifier.entrez_gene_id.value, *batch.sample_names]] matrix_subset: pd.DataFrame = matrix_subset.set_index( keys=[GeneIdentifier.entrez_gene_id.value], drop=True ) else: raise TypeError( - f"Unexpected matrix type for source '{source.value}'; expected 'pandas.DataFrame' or 'scanpy.AnnData': {type(matrix)}" + f"Unexpected matrix type for source '{source.value}'; " + f"expected 'pandas.DataFrame' or 'scanpy.AnnData': {type(matrix)}" ) - output_fp = output_filepaths[source.value].parent / f"{source.value}_batch{batch.batch_num}_combined_z_distrobution_{context_name}.csv" + output_fp = ( + output_filepaths[source.value].parent + / f"{source.value}_batch{batch.batch_num}_combined_z_distrobution_{context_name}.csv" + ) batch_results.append( _combine_z_distribution_for_batch( context_name=context_name, @@ -311,8 +329,6 @@ async def _begin_combining_distributions( source=source, output_combined_matrix_filepath=output_fp, output_figure_dirpath=output_figure_dirpath, - weighted_z_floor=weighted_z_floor, - weighted_z_ceiling=weighted_z_ceiling, ) ) @@ -333,14 +349,17 @@ async def _begin_combining_distributions( merged_batch_results = pd.concat(batch_results, axis="columns") merged_batch_results.index.name = index_name - replicates_by_batch_num_map: dict[str, int] = {str(batch.batch_num): batch.num_samples for batch in batch_names[source.value]} + replicates_by_batch_num_map: dict[str, int] = { + str(batch.batch_num): batch.num_samples for batch in batch_names[source.value] + } batch_column_names: list[str] = list(merged_batch_results.columns) replicates_per_batch: list[int] = [] for col in batch_column_names: key = str(col) if key not in replicates_by_batch_num_map: raise KeyError( - f"Missing replicate count for batch column '{col}' in source '{source.value}'. Known: {list(replicates_by_batch_num_map.keys())}" + f"Missing replicate count for batch column '{col}' in source '{source.value}'. " + f"Known: {list(replicates_by_batch_num_map.keys())}" ) replicates_per_batch.append(replicates_by_batch_num_map[key]) @@ -349,9 +368,12 @@ async def _begin_combining_distributions( context_name=context_name, replicates_per_batch=replicates_per_batch, output_combined_matrix_filepath=( - output_filepaths[source.value].parent / f"{context_name}_{source.value}_combined_zscore_distribution.csv" + output_filepaths[source.value].parent + / f"{context_name}_{source.value}_combined_zscore_distribution.csv" + ), + output_figure_filepath=( + output_figure_dirpath / f"{context_name}_{source.value}_combined_zscore_distribution.pdf" ), - output_figure_filepath=(output_figure_dirpath / f"{context_name}_{source.value}_combined_zscore_distribution.pdf"), weighted_z_floor=weighted_z_floor, weighted_z_ceiling=weighted_z_ceiling, ) @@ -363,7 +385,10 @@ async def _begin_combining_distributions( ) ) merged_source_results.to_csv(output_filepaths[source.value], index=True) - logger.success(f"Wrote z-scores for source '{source.value}' in context '{context_name}' to '{output_filepaths[source.value]}'") + logger.success( + f"Wrote z-scores for source '{source.value}' in context '{context_name}' " + f"to '{output_filepaths[source.value]}'" + ) logger.trace(f"Combining z-score distributions for all sources in context '{context_name}'") merged_context_results = _combine_z_distribution_for_context( @@ -371,5 +396,5 @@ async def _begin_combining_distributions( zscore_results=z_score_results, output_graph_filepath=output_figure_dirpath / f"{context_name}_combined_omics_distribution.pdf", ) - merged_context_results.to_csv(output_final_model_scores, index=True) + merged_context_results.to_csv(output_final_model_scores, index=len(merged_context_results.columns) <= 1) logger.success(f"Wrote combined z-scores for context '{context_name}' to {output_final_model_scores}") From 2db1fc4be8d3a65d471474a8a7f4d89600edc908 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:14:57 -0600 Subject: [PATCH 16/41] refactor: remove async/await and simplify I/O operations - Convert all async functions to synchronous - Remove unused imports/types - Replace `await _read_file` with `pd.read_csv` calls Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 42 ++++++++++++---------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 518bf6b3..2813b8cc 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -1,25 +1,24 @@ from __future__ import annotations -import asyncio import collections import os import re import sys -from collections.abc import Coroutine, Sequence +from collections.abc import Sequence from io import TextIOWrapper +from math import isfinite +from multiprocessing import Value from pathlib import Path -from typing import Any, Literal, TextIO, cast +from typing import Literal, Never, TextIO, cast, reveal_type +import cobamp.core.optimization import cobra import cobra.util.array import numpy as np import numpy.typing as npt import pandas as pd -from cobra import Model -from cobra.flux_analysis import pfba -from fast_bioservices.common import Taxon -from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol from loguru import logger +from troppo.methods.reconstruction.corda import CORDA, CORDAProperties from troppo.methods.reconstruction.fastcore import FASTcore, FastcoreProperties from troppo.methods.reconstruction.gimme import GIMME, GIMMEProperties from troppo.methods.reconstruction.imat import IMAT, IMATProperties @@ -27,7 +26,6 @@ from como.data_types import ( Algorithm, - BuildResults, CobraCompartments, LogLevel, ModelBuildSettings, @@ -453,6 +451,7 @@ def _read_reference_model(filepath: Path) -> cobra.Model: async def _build_model( general_model_file: Path, +def _build_model( gene_expression_file: Path, recon_algorithm: Algorithm, objective: str, @@ -640,7 +639,8 @@ async def _build_model( ) -async def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.DataFrame: + +def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.DataFrame: if path.suffix not in {".csv", ".tsv"}: raise ValueError(f"File must be a .csv or .tsv file, got '{path.suffix}'") df: pd.DataFrame = await _read_file(path=path, header=0, sep="," if path.suffix == ".csv" else "\t", h5ad_as_df=True) @@ -657,8 +657,14 @@ async def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.Dat return df -async def _collect_boundary_reactions(path: Path) -> _BoundaryReactions: - df: pd.DataFrame = await _create_df(path, lowercase_col_names=True) +def _collect_boundary_reactions( + path: Path, + reference_model: cobra.Model, + *, + close_unlisted_exchanges: bool, +) -> _BoundaryReactions: + df: pd.DataFrame = _create_df(path, lowercase_col_names=True) + for column in df.columns: if column not in [ "reaction", @@ -692,7 +698,7 @@ async def _collect_boundary_reactions(path: Path) -> _BoundaryReactions: ) -async def _write_model_to_disk( +def _write_model_to_disk( context_name: str, model: cobra.Model, output_filepaths: list[Path], @@ -700,24 +706,22 @@ async def _write_model_to_disk( json_suffix: set[str], xml_suffix: set[str], ) -> None: - tasks: set[Coroutine[Any, Any, None]] = set() for path in output_filepaths: path.parent.mkdir(parents=True, exist_ok=True) if path.suffix in mat_suffix: - tasks.add(asyncio.to_thread(cobra.io.save_matlab_model, model=model, file_name=path)) + cobra.io.save_matlab_model(model=model, file_name=path) elif path.suffix in json_suffix: - tasks.add(asyncio.to_thread(cobra.io.save_json_model, model=model, filename=path, pretty=True)) + cobra.io.save_json_model(model=model, filename=path, pretty=True) elif path.suffix in xml_suffix: - tasks.add(asyncio.to_thread(cobra.io.write_sbml_model, model=model, filename=path)) + cobra.io.write_sbml_model(cobra_model=model, filename=path) else: raise ValueError( f"Invalid output model filetype. Should be one of .xml, .sbml, .mat, or .json. Got '{path.suffix}'" ) - logger.success(f"Will save metabolic model for context '{context_name}' to: '{path}'") - await asyncio.gather(*tasks) + logger.success(f"Model for context '{context_name}' saved to to: '{path}'") -async def create_context_specific_model( # noqa: C901 +def create_context_specific_model( # noqa: C901 context_name: str, taxon: int | str | Taxon, reference_model: Path, From 5a99d13d8b173b651940ca2b770c6c6797b052d2 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:15:20 -0600 Subject: [PATCH 17/41] refactor: inline setting boundary reaction conditions Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 52 +++++----------------- 1 file changed, 11 insertions(+), 41 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 2813b8cc..2ad48b80 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -122,39 +122,6 @@ def _gene_rule_logical(gpr_expression: str, level: int = 0) -> str: return expression_out -def _set_boundaries( - model: cobra.Model, - boundary_reactions: list[str], - lower_bounds: list[float], - upper_bounds: list[float], -) -> cobra.Model: - # get boundary reactions - exchange_rxns = [rxn.id for rxn in model.reactions if rxn.id.startswith("EX_")] - sink_rxns = [rxn.id for rxn in model.reactions if rxn.id.startswith("sink_")] - demand_rxns = [rxn.id for rxn in model.reactions if rxn.id.startswith("DM_")] - - # Allows all boundary reactions to be used if none are given - allow_all_boundary_rxns = not boundary_reactions - - # close sinks and demands not in boundary reactions unless no boundary reactions were given - if not allow_all_boundary_rxns: - for i, rxn in enumerate(sink_rxns): # set sinks to 0 - getattr(model.reactions, rxn).lower_bounds = lower_bounds[i] if rxn in boundary_reactions else 0 - getattr(model.reactions, rxn).upper_bounds = upper_bounds[i] if rxn in boundary_reactions else 1000 - - for i, rxn in enumerate(demand_rxns): - getattr(model.reactions, rxn).lower_bounds = 0 - getattr(model.reactions, rxn).upper_bounds = upper_bounds[i] if rxn in boundary_reactions else 0 - - # Reaction media - medium = model.medium - for rxn in exchange_rxns: # open exchanges from exchange file, close unspecified exchanges - medium[rxn] = -float(lower_bounds[boundary_reactions.index(rxn)]) if rxn in boundary_reactions else 0.0 - model.medium = medium - - return model - - def _feasibility_test(model_cobra: cobra.Model, step: str): # check number of unsolvable reactions for reference model under media assumptions # create flux consistant model (rmemoves some reactions) @@ -449,9 +416,8 @@ def _read_reference_model(filepath: Path) -> cobra.Model: raise ValueError(f"Reference model format must be .xml, .mat, or .json; found '{filepath.suffix}'") -async def _build_model( - general_model_file: Path, def _build_model( + reference_model: cobra.Model, gene_expression_file: Path, recon_algorithm: Algorithm, objective: str, @@ -502,12 +468,16 @@ def _build_model( reference_model = _set_boundaries(reference_model, boundary_reactions, lower_bounds, upper_bounds) reference_model.solver = solver.lower() - # check number of unsolvable reactions for reference model under media assumptions - # inconsistent_reactions, cobra_model = _feasibility_test(cobra_model, "before_seeding") - inconsistent_reactions = [] - s_matrix = cobra.util.array.create_stoichiometric_matrix(reference_model, array_type="dense") - lower_bounds: list[int] = [] - upper_bounds: list[int] = [] + # Set reference model boundaries + for rxn, lb, ub in zip(boundary_reactions, lower_bounds, upper_bounds, strict=True): + cast(cobra.Reaction, reference_model.reactions.get_by_id(rxn)).lower_bound = lb # type: ignore[bad-argument-count] + cast(cobra.Reaction, reference_model.reactions.get_by_id(rxn)).upper_bound = ub # type: ignore[bad-argument-count] + reference_model.solver = solver.lower() # type: ignore[bad-argument-count] + reference_model.objective_direction = objective_direction # type: ignore[bad-argument-count] + + # Collect the lower/upper bounds for reactions in the reference model to provide it to reconstruction methods + ref_lb: npt.NDArray[np.floating] = np.full(shape=(len(reference_model.reactions)), fill_value=np.nan, dtype=float) + ref_ub: npt.NDArray[np.floating] = np.full(shape=(len(reference_model.reactions)), fill_value=np.nan, dtype=float) reaction_ids: list[str] = [] for i, rxn in enumerate(reference_model.reactions): rxn: cobra.Reaction From b9c773e9fd84aa88e56f1864e020c701e35d7308 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:17:09 -0600 Subject: [PATCH 18/41] refactor: update `_map_expression_to_reaction` signature - Change from threshold-based (low_thresh, high_thresh) to percentile-based (low_percentile, high_percentile) - Return tuple with (reaction_expression, min_val, low_expr, high_expr) instead of just a dictionary - Adjust expression values to have a minimum of 0 for consistency and downstream building with Troppo - use np.nanpercentile for threshold computing Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 112 +++++++++++++-------- 1 file changed, 72 insertions(+), 40 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 2ad48b80..83bfed5b 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -67,7 +67,7 @@ def _correct_bracket(rule: str, name: str) -> str: right_name = name[name_match.span()[1] :] operator = rule_match.group() - new_right_rule = [] + new_right_rule: list[str] = [] for char in list(left_rule): if char.isspace() or char.isdigit(): new_right_rule.append(char) @@ -320,10 +320,10 @@ def _map_expression_to_reaction( reference_model: cobra.Model, gene_expression_file: Path, recon_algorithm: Algorithm, - low_thresh: float, - high_thresh: float, - taxon: int | str | Taxon, -) -> collections.OrderedDict[str, int]: + taxon: int | str, + low_percentile: int, + high_percentile: int, +) -> tuple[collections.OrderedDict[str, float], float, float, float]: """Map gene ids to a reaction based on GPR (gene to protein to reaction) association rules. These rules should be defined in the general genome-scale metabolic model @@ -332,76 +332,108 @@ def _map_expression_to_reaction( reference_model: A COBRA model object representing the general genome-scale metabolic model. gene_expression_file: Path to a gene expression file (.csv, .tsv, .xlsx, or .xls) recon_algorithm: Algorithm to use for reconstruction (GIMME, FASTCORE, iMAT, or tINIT) - low_thresh: Low expression threshold for algorithms that require it (iMAT, tINIT) - high_thresh: High expression threshold for algorithms that require it (iMAT, tINIT) taxon: Taxon ID or Taxon object for gene ID conversion. + low_percentile: Low percentile for threshold calculation + high_percentile: High percentile for threshold calculation Returns: - An ordered dictionary mapping reaction IDs to their corresponding expression values. + [0]: An ordered dictionary mapping reaction IDs to their corresponding expression values. + [1]: The abs(minimum value) from the expression vector + [2]: The adjusted `low_expr` value + [3]: The adjusted `high_expr` value Raises: ValueError: If neither 'entrez_gene_id' nor 'ensembl_gene_id' columns are found in the gene expression file. """ - expression_data = await _read_file(gene_expression_file) - identifier_column = next((col for col in ("entrez_gene_id", "ensembl_gene_id") if col in expression_data.columns), "") - - if not identifier_column: - raise ValueError( - f"At least one column of 'entrez_gene_id' or 'ensembl_gene_id' could not be found in the gene expression file '{gene_expression_file}'" - ) + expression_data = pd.read_csv(gene_expression_file) gene_activity = split_gene_expression_data( - expression_data, - identifier_column=cast(Literal["ensembl_gene_id", "entrez_gene_id"], identifier_column), - recon_algorithm=recon_algorithm, + expression_data, identifier_column="entrez_gene_id", recon_algorithm=recon_algorithm ) - # convert ensembl IDs to entrez ids to map expression data to reactions in the reference model - if identifier_column == "ensembl_gene_id": - conversion = await ensembl_to_gene_id_and_symbol(ids=gene_activity.index.tolist(), taxon=taxon) - gene_activity = gene_activity.merge(conversion, how="left", left_index=True, right_on="ensembl_gene_id") - gene_activity = gene_activity.set_index(keys=["entrez_gene_id"]) - gene_activity = gene_activity.drop(columns=["ensembl_gene_id", "gene_symbol"]) + # adjust expression to have a minimum value of 0; if the minimum value is already 0, this does nothing + gene_arr = gene_activity["active"].to_numpy(dtype=float, copy=False) + min_val = float(gene_arr[np.isfinite(gene_arr)].min()) + if min_val < 0: + gene_arr[np.isfinite(gene_arr)] += abs(min_val) - reaction_expression = collections.OrderedDict() + # We are computing percentiles on non-zero data because we need to make sure that + # zero values are definitely placed in the low bin + low_expr, high_expr = np.nanpercentile(gene_arr[gene_arr > 0], [low_percentile, high_percentile]) + gene_activity["active"] = gene_arr + gene_activity.index = gene_activity.index.astype(str) # maintain strings for reference model gene_ids # fmt: off - # Define a default expression value if a gene ID is not found - default_expression = ( - np.mean([low_thresh, high_thresh]) if recon_algorithm in {Algorithm.IMAT, Algorithm.TINIT} - else -1 if recon_algorithm == Algorithm.GIMME + # Define a default expression values for the following conditions: + # 1. `no_expression`: If a reaction has a gene reaction rule but is not found in the data, then + # we can be positive that the reaction should be marked as inactive + # For all algorithms, placing no-expression data at an expression value of 0.0 will + # mark it as inactive, because we have adjusted the data to be >= 0 above + no_expression = 0 + # 2. `missing_gpr`: If a reaction has no gene reaction rule, we don't know if it should be marked as (in)active + # Thus, place it in a no-cost location where possible, otherwise mark it as inactive + # IMAT/TINIT/GIMME: Place in mid-bin, which is a "no-cost weight" value + # FASTCORE: Mark as inactive, which is the best we can do with FASTCORE + missing_gpr = ( + # -1 if recon_algorithm in {Algorithm.IMAT, Algorithm.TINIT, Algorithm.GIMME} + (low_expr + high_expr) / 2 if recon_algorithm in {Algorithm.IMAT, Algorithm.TINIT, Algorithm.GIMME} else 0 if recon_algorithm == Algorithm.FASTCORE else 1 ) # fmt: on + reaction_expression: collections.OrderedDict[str, float] = collections.OrderedDict() + activity_map = cast(dict[str, float], gene_activity.loc[:, "active"].to_dict()) error_count = 0 + _base_warn = "could not generate evaluable gene rule." for rxn in reference_model.reactions: rxn: cobra.Reaction - gene_reaction_rule = rxn.gene_reaction_rule + if not gene_reaction_rule: + reaction_expression[rxn.id] = missing_gpr continue - gene_ids = set(re.findall(r"\d+", gene_reaction_rule)) - reaction_expression[rxn.id] = default_expression + gene_ids: set[str] = set(re.findall(r"\d+", gene_reaction_rule)) for gene_id in gene_ids: - activity = gene_activity.at[gene_id, "active"] if gene_id in gene_activity.index else f"{default_expression!s}" - # replace gene_id with activity, using optional whitespace before and after the gene id - # Do not replace the whitespace (if it exists) before and after the gene ID - gene_reaction_rule = re.sub(pattern=rf"\b{gene_id}\b", repl=str(activity), string=str(gene_reaction_rule)) + activity = activity_map.get(gene_id, no_expression) + if np.isposinf(activity): # For positive infinity values, place in high bin + activity = high_expr + 1 + elif not isfinite(activity): # For non-finite values, place in low/non-active bin + activity = no_expression + + # Matches the standalone gene ID only when it's not part of a larger number. + # It ensures the ID is not immediately preceded by a digit or decimal point and not immediately followed by + # a digit, so if the gene id is '48', decimals like 1.48 or numbers like 1648 are left unchanged. + gene_reaction_rule = re.sub( + pattern=rf"(? cobra.Model: From 41d1b56f2c203f8d18f2046ae3ecefc94341c56d Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:18:31 -0600 Subject: [PATCH 19/41] refactor: add and update types for model build functions Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 37 +++++++++++++--------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 83bfed5b..0867eaaa 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -152,10 +152,12 @@ def _feasibility_test(model_cobra: cobra.Model, step: str): def _build_with_gimme( reference_model: cobra.Model, - lower_bounds: Sequence[float] | npt.NDArray[float], - upper_bounds: Sequence[float] | npt.NDArray[float], + expression_vector: Sequence[float] | npt.NDArray[np.floating], idx_objective: int, - expr_vector: npt.NDArray[float], + lower_bounds: npt.NDArray[np.floating], + upper_bounds: npt.NDArray[np.floating], + solver: str, + threshold_percentile: int = 30, ): model_reconstruction = reference_model.copy() s_matrix: list[float] = list(cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction)) @@ -218,12 +220,14 @@ def _build_with_fastcore( def _build_with_imat( reference_model: cobra.Model, - lower_bounds: Sequence[float] | npt.NDArray[float], - upper_bounds: Sequence[float] | npt.NDArray[float], + lower_bounds: npt.NDArray[np.floating], + upper_bounds: npt.NDArray[np.floating], expr_vector: npt.NDArray, - expr_thresh: tuple[float, float], - force_reaction_indices: Sequence[int], + low_expression_threshold: float, + high_expression_threshold: float, + force_reaction_indices: npt.NDArray[np.integer], solver: str, + build_settings: ModelBuildSettings, ) -> cobra.Model: properties: IMATProperties = IMATProperties( exp_vector=expr_vector, @@ -459,13 +463,15 @@ def _build_model( lower_bounds: list[float], upper_bounds: list[float], solver: str, - low_thresh: float, - high_thresh: float, + low_percentile: int, + high_percentile: int, output_flux_result_filepath: Path, - taxon: int | str | Taxon, + taxon: int | str, + objective_direction: Literal["min", "max"], + build_settings: ModelBuildSettings, *, force_boundary_rxn_inclusion: bool, -) -> _BuildResults: +) -> cobra.Model: """Seed a context specific reference_model. Core reactions are determined from GPR associations with gene expression logicals. @@ -474,7 +480,7 @@ def _build_model( force excluded even if they meet GPR association requirements using the force exclude file. Args: - general_model_file: Path to a COBRA model file (.xml, .mat, or .json) + reference_model: The reference model used in reconstruction gene_expression_file: Path to a gene expression file (.csv, .tsv, .xlsx, or .xls) recon_algorithm: Algorithm to use for reconstruction (GIMME, FASTCORE, iMAT, or tINIT) objective: Objective reaction ID in the general model @@ -484,14 +490,15 @@ def _build_model( lower_bounds: List of lower bounds corresponding to boundary reactions upper_bounds: List of upper bounds corresponding to boundary reactions solver: Solver to use (e.g., 'glpk', 'cplex', 'gurobi') - low_thresh: Low expression threshold for algorithms that require it (iMAT, tINIT) - high_thresh: High expression threshold for algorithms that require it (iMAT, tINIT) output_flux_result_filepath: Path to save flux results (for iMAT only) taxon: Taxon ID or Taxon object for gene ID conversion. force_boundary_rxn_inclusion: If True, ensure that all boundary reactions are included in the final model. Returns: - A _BuildResults object containing the context-specific model, list of expression indices used, and a DataFrame of infeasible reactions. + A _BuildResults object containing: + the context-specific model + list of expression indices used + A DataFrame of infeasible reactions. """ reference_model: cobra.Model = _read_reference_model(general_model_file) From 4b28e851cd72e502987554bd631aa12a0ad9ca14 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:20:06 -0600 Subject: [PATCH 20/41] fix: correct iMAT implementation with Troppo - Uses cobamp for optimization handling - Properly partition solution into high-bin and low-bin reactions - Fix high/low index calculation to match Troppo's expected behavior - Add validation for expected solution length - Use 0.5 midpoint for binarizing reaction inclusion Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 100 ++++++++++++++++----- 1 file changed, 80 insertions(+), 20 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 0867eaaa..a25dcd37 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -229,35 +229,95 @@ def _build_with_imat( solver: str, build_settings: ModelBuildSettings, ) -> cobra.Model: + model = reference_model.copy() + + # `list()` is required because imat uses `if core:`; this returns an error on numpy arrays + force_rxn_list = list(np.asarray(force_reaction_indices, dtype=int)) properties: IMATProperties = IMATProperties( exp_vector=expr_vector, - exp_thresholds=expr_thresh, - # core=np.array(force_gene_indices).tolist(), - core=list(force_reaction_indices), # `list()` is required because imat uses `if core:`; this returns an error on numpy arrays - epsilon=0.01, - solver=solver.upper(), + exp_thresholds=( + low_expression_threshold, + high_expression_threshold, + ), + tolerance=build_settings.min_reaction_flux, + core=force_rxn_list, + epsilon=build_settings.troppo_epsilon, + solver=solver, ) - # Creating a copy of the model ensures we don't make any in-place modifications by accident # Using cobra to create the stoichiometry matrix means we have less work to do - force_reaction_indices = np.array(force_reaction_indices, dtype=np.uint16) - model_reconstruction: cobra.Model = reference_model.copy() - s_matrix: npt.NDArray[float] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction) - algorithm: IMAT = IMAT(S=s_matrix, lb=np.array(lower_bounds), ub=np.array(upper_bounds), properties=properties) - rxns_from_imat: npt.NDArray[np.uint16] = algorithm.run().astype(np.uint16) + s_matrix: npt.NDArray[np.floating] = cast( + npt.NDArray[np.floating], cobra.util.array.create_stoichiometric_matrix(model=model) + ) + algorithm: IMAT = IMAT(S=s_matrix, lb=lower_bounds, ub=upper_bounds, properties=properties) + + # Match Troppo's iMAT high/low bin calculations + high_index = np.where(expr_vector >= high_expression_threshold)[0].astype(int) + low_index = np.where((expr_vector >= 0) & (expr_vector < low_expression_threshold))[0].astype(int) + if force_reaction_indices.size > 0: + high_index = np.union1d(high_index, force_reaction_indices) + lso, lsystem = algorithm.generate_imat_problem( + S=algorithm.S, + lb=algorithm.lb, + ub=algorithm.ub, + high_idx=high_index, + low_idx=low_index, + epsilon=build_settings.troppo_epsilon, + ) + # reset default Troppo/Cobamp thresholds + cfg = lsystem.get_configuration() + cfg.verbosity = build_settings.solver_verbosity + cfg.timeout = build_settings.solver_timeout + cfg.tolerances.feasibility = build_settings.solver_feasibility + cfg.tolerances.optimality = build_settings.solver_optimality + cfg.tolerances.integrality = build_settings.solver_integrality + grb_model = getattr(getattr(cfg, "problem", None), "problem", None) # `getattr` to handle non-gurobi solvers + if grb_model is not None: + grb_model.Params.MIPGap = build_settings.gurobi_mipgap + grb_model.Params.IntegralityFocus = build_settings.gurobi_integrality_focus + grb_model.Params.NumericFocus = build_settings.gurobi_numeric_focus + + # Keep reactions from imat + force-included reactions + solution: cobamp.core.optimization.Solution = lso.optimize() + x_sol: npt.NDArray[np.floating] = solution.x().astype(float) + + # partition the solution into the following components: + # 1) High-bin, forward-direction reactions + # 2) High-bin, reverse-direction reactions + # 3) Activated low-bin reactions + n_rxns = len(reference_model.reactions) + expected_len = n_rxns + (2 * high_index.size) + low_index.size + if expected_len != x_sol.size: + raise ValueError( + f"Unexpected solution length: got {x_sol.size}, expected {expected_len}" + f"(n_rxns={n_rxns}, n_high_index={high_index.size}, n_low_index={low_index.size}" + ) - # Collect all reaction IDs and their associated index (e.g., HEX1 is at index 123) - all_rxn_ids: npt.NDArray[str] = np.array([r.id for r in model_reconstruction.reactions], dtype=object) - all_rxn_indices: npt.NDArray[np.uint16] = np.array(range(len(model_reconstruction.reactions)), dtype=np.uint16) + part: int = 0 + flux_vector: npt.NDArray[np.floating] = x_sol[part : part + n_rxns] + part += n_rxns + high_forward: npt.NDArray[np.floating] = x_sol[part : part + high_index.size] + part += high_index.size + high_reverse: npt.NDArray[np.floating] = x_sol[part : part + high_index.size] + part += high_index.size + low_index.size - rxn_indices_to_keep: npt.NDArray[int] = np.unique(np.concatenate([rxns_from_imat, force_reaction_indices], dtype=int)) + # iMAT binarizes at ~0 and ~1 for off/on reactions, + # so we can use 0.5 as the midpoint for defining inclusion/exclusion + high_forward_rxns: npt.NDArray[np.integer] = high_index[high_forward > 0.5] + high_reverse_rxns: npt.NDArray[np.integer] = high_index[high_reverse > 0.5] + high_on_rxns = np.unique(np.concatenate([high_forward_rxns, high_reverse_rxns])).astype(int) - # Reaction indices to exclude from the model are thus reactions that are not marked to be included in the model - # Assume unique is false because every value that is in `rxn_indices_to_keep` is included in `all_rxn_indices` - rxn_indices_to_remove: npt.NDArray[np.uint16] = np.setdiff1d(all_rxn_indices, rxn_indices_to_keep, assume_unique=False) - model_reconstruction.remove_reactions(reactions=all_rxn_ids[rxn_indices_to_remove].tolist(), remove_orphans=True) + # Low bin reactions may be included if it has a flux greater than `min_reaction_flux` + flux_keep = np.where(np.abs(solution.x())[:n_rxns] >= build_settings.min_reaction_flux)[0].astype(int) + to_keep = np.unique(np.concatenate([flux_keep, high_on_rxns, force_reaction_indices])) - return model_reconstruction + # Collect all reaction IDs and their associated index (e.g., HEX1 is at index 123) + all_rxn_ids: npt.NDArray[np.str_] = np.asarray([r.id for r in model.reactions], dtype=str) + to_remove = np.ones(shape=len(model.reactions), dtype=bool) + to_remove[to_keep] = False + remove_rxn_ids: list[str | cobra.Reaction] = list(all_rxn_ids[to_remove]) + model.remove_reactions(reactions=remove_rxn_ids, remove_orphans=True) + return model def _build_with_tinit( From 78f62f8504fd2850ca63eb5e51bbe51b5bcaea68 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:22:03 -0600 Subject: [PATCH 21/41] fix: update boundary reaction collection logic - Adds `reference_model` parameter to internal `_collect_boundary_reactions` to define constraints for all boundary reactions - Validate compartments exist in reference model Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 71 +++++++++++++++++----- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index a25dcd37..8169fa8a 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -739,31 +739,70 @@ def _collect_boundary_reactions( "reaction", "abbreviation", "compartment", - "minimum reaction rate", - "maximum reaction rate", + "lower bound", + "upper bound", ]: raise ValueError( - f"Boundary reactions file must have columns named 'Reaction', 'Abbreviation', 'Compartment', " - f"'Minimum Reaction Rate', and 'Maximum Reaction Rate'. Found: {column}" + f"Boundary reactions file must have columns named 'reaction', 'abbreviation', 'compartment', " + f"'lower bound', and 'upper bound'. Found: '{column}'" ) - reactions: list[str] = [""] * len(df) - boundary_type: list[str] = df["reaction"].tolist() - reaction_abbreviation: list[str] = list(df["abbreviation"].astype(str)) - reaction_compartment: list[str] = list(df["compartment"].astype(str)) + for compartment in df["compartment"].unique(): + as_shorthand = CobraCompartments.get_shorthand(compartment) if len(compartment) > 2 else compartment + if as_shorthand not in reference_model.compartments: + raise ValueError(f"Unknown compartment: '{compartment}'") + boundary_map = {"exchange": "EX", "demand": "DM", "sink": "SK"} - for i in range(len(boundary_type)): - boundary: str = boundary_type[i].lower() - if boundary not in boundary_map: - raise ValueError(f"Boundary reaction type must be 'Exchange', 'Demand', or 'Sink'. Found: {boundary}") + for exchange in df["reaction"].str.lower().unique(): + if boundary_map.get(exchange) is None: + raise ValueError( + f"Unable to determine reaction type for: '{exchange}'\n" + f"Validate the boundary reactions file has a 'Reaction' value of: 'Exchange', 'Demand', or 'Sink'." + ) - shorthand_compartment = CobraCompartments.get_shorthand(reaction_compartment[i]) - reactions[i] = f"{boundary_map.get(boundary)}_{reaction_abbreviation[i]}[{shorthand_compartment}]" + reactions: list[str] = [] + lower_bounds: list[float] = [] + upper_bounds: list[float] = [] + for ref_rxn in reference_model.boundary: + ref_rxn: cobra.Reaction + + # Remove prefixes from reactions: EX_glc_D[e] -> glc_D[e]; DM_h2o[c] -> h2o[c]; sink_o2[m] -> o2[m] + # Then remove suffixes from reactions: glc_D[e] -> glc_D; h2o[c] -> h2o; o2[m] -> o2 + base_rxn_id: str = ref_rxn.id.split("_", maxsplit=1)[1] + base_rxn_id = base_rxn_id.split("[", maxsplit=1)[0] + + # If we are excluding unlisted exchange reactions from the model, + # then we need to set their lower/upper bounds to 0 + # Otherwise, keep the unlisted exchanges as their default + if base_rxn_id not in df["abbreviation"].values: + reactions.append(ref_rxn.id) + lower_bounds.append(0 if close_unlisted_exchanges else ref_rxn.lower_bound) + upper_bounds.append(0 if close_unlisted_exchanges else ref_rxn.upper_bound) + continue + + rxn_index = df[df["abbreviation"] == base_rxn_id].index.item() + shorthand_compartment = CobraCompartments.get_shorthand(str(df.at[rxn_index, "compartment"])) + exch_type: str | None = boundary_map.get(str(df.at[rxn_index, "reaction"]).lower()) + lower_bound, upper_bound = df.loc[rxn_index, ["lower bound", "upper bound"]].astype(float).values.ravel() + + # The boundary reactions are built on the base reaction id; + # if the reference model boundary has a matching base ID, but a non-matching + # boundary type (exchange, sink, demand), this will check to make sure we are not + # mistakingly setting boundaries that should not exist + formatted_boundary_reaction = f"{exch_type}_{base_rxn_id}[{shorthand_compartment}]" + if ref_rxn.id == formatted_boundary_reaction: + reactions.append(formatted_boundary_reaction) + lower_bounds.append(lower_bound) + upper_bounds.append(upper_bound) + else: + reactions.append(ref_rxn.id) + lower_bounds.append(0 if close_unlisted_exchanges else ref_rxn.lower_bound) + upper_bounds.append(0 if close_unlisted_exchanges else ref_rxn.upper_bound) return _BoundaryReactions( reactions=reactions, - lower_bounds=df["minimum reaction rate"].tolist(), - upper_bounds=df["maximum reaction rate"].tolist(), + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, ) From bf30d2dde8ff23e9d207e790cd2ec3cc74844600 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:23:56 -0600 Subject: [PATCH 22/41] feat: add `close_unlisted_exchanges` to forcibly set bounds to 0 for exchange reactions not included in the boundary reactions input file This option defaults to False to maintain compatibility with preivous releases Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 8169fa8a..957f8ecb 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -849,8 +849,9 @@ def create_context_specific_model( # noqa: C901 log_level: LogLevel = LogLevel.INFO, log_location: str | TextIO | TextIOWrapper = sys.stderr, *, - force_boundary_rxn_inclusion: bool = True, -): + force_boundary_rxn_inclusion: bool = False, + close_unlisted_exchanges: bool = False, +) -> cobra.Model: """Create a context-specific model using the provided data. Args: @@ -909,11 +910,14 @@ def create_context_specific_model( # noqa: C901 for path in output_model_filepaths if path.suffix not in {*mat_suffix, *json_suffix, *xml_suffix} ) - log_and_raise_error( - f"Invalid output filetype. Should be 'xml', 'sbml', 'mat', or 'json'. Got:\n{invalid_suffix}'", - error=ValueError, - level=LogLevel.ERROR, raise ValueError(f"Invalid output filetype. Should be 'xml', 'sbml', 'mat', or 'json'. Got:\n{invalid_suffix}'") + + reference_model = _read_reference_model(reference_model_filepath) + boundary_reactions = ( + _collect_boundary_reactions( + boundary_rxns_filepath, + reference_model=reference_model, + close_unlisted_exchanges=close_unlisted_exchanges, ) boundary_reactions = None From b92b555d453e96b2d46369011bdb52daa7f1a989 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:24:38 -0600 Subject: [PATCH 23/41] refactor: streamline GIMME model reconstruction and expression handling Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 58 +++++++++++++++------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 957f8ecb..812842fd 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -159,30 +159,54 @@ def _build_with_gimme( solver: str, threshold_percentile: int = 30, ): - model_reconstruction = reference_model.copy() - s_matrix: list[float] = list(cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction)) + model = reference_model + if threshold_percentile < 1: + logger.critical( + f"GIMME quantile calculation is less than 1 ({threshold_percentile}), but expected an integer of ~25! " + f"This will likely result in a model with many more reactions than expected." + ) + + expression: npt.NDArray[np.floating] = np.asarray(expression_vector, dtype=float) + nonzero = expression[expression > 0] + min_expr_percentile = np.percentile(nonzero, threshold_percentile).astype(float) + logger.debug(f"GIMME: minimum expression is {expression.min():.4f}") + logger.debug(f"GIMME: minimum non-zero expression is {nonzero.min():.4f}") + logger.debug(f"GIMME: maximum expression is {expression.max():.4f}") + + expression[(expression < min_expr_percentile) & (expression > 0)] = 0 + expression[expression >= min_expr_percentile] = 1 + + suffix: str = {1: "st", 2: "nd", 3: "rd"}.get(threshold_percentile, "th") + logger.debug(f"GIMME: {threshold_percentile}{suffix} percentile is {min_expr_percentile:.4f}") + + s_matrix: list[float] = list(np.asarray(cobra.util.array.create_stoichiometric_matrix(model=model), dtype=float)) + reaction_ids: list[str] = [r.id for r in model.reactions] + metabolite_ids: list[str] = [m.id for m in model.metabolites] + # `Becker and Palsson (2008). Context-specific metabolic networks are # consistent with experiments. PLoS Comput. Biol. 4, e1000082.` properties = GIMMEProperties( - exp_vector=expr_vector, # np.array(gimme_data['0']), - obj_frac=0.9, + exp_vector=list(expression), objectives=[{idx_objective: 1}], preprocess=True, - flux_threshold=0.9, + reaction_ids=reaction_ids, + metabolite_ids=metabolite_ids, + solver=solver.upper(), ) algorithm = GIMME(s_matrix, list(lower_bounds), list(upper_bounds), properties) - gene_activity = algorithm.run() - reaction_ids = [r.id for r in model_reconstruction.reactions] - to_remove_ids = [reaction_ids[r] for r in np.where(gene_activity == 0)[0]] + active_rxn_indices: list[int] = algorithm.run() + to_remove_ids: list[str | cobra.Reaction] = [ + reaction_ids[i] for i in range(len(model.reactions)) if i not in active_rxn_indices + ] + model.remove_reactions(to_remove_ids, True) - model_reconstruction.remove_reactions(to_remove_ids, True) - psol = pfba(model_reconstruction) # noqa: F841 + # psol = pfba(model) # reaction_ids = [r.id for r in context_cobra_model.reactions] # psol = context_cobra_model.optimize() # to_remove_ids = [reaction_ids[r] for r in np.where(abs(psol.fluxes) < 1e-8)[0]] # context_cobra_model.remove_reactions(to_remove_ids, True) - return model_reconstruction + return model def _build_with_fastcore( @@ -338,12 +362,12 @@ def _build_with_tinit( allow_excretion=False, no_reverse_loops=True, ) - model_reconstruction = reference_model.copy() - s_matrix: npt.NDArray[float] = np.asarray(cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction), dtype=float) - algorithm = tINIT(s_matrix, lower_bounds, upper_bounds, properties) - algorithm.preprocessing() - algorithm.build_problem() - _log_and_raise_error("tINIT is not yet implemented.", error=NotImplementedError, level=LogLevel.CRITICAL) + s_matrix: npt.NDArray[np.floating] = np.asarray( + cobra.util.array.create_stoichiometric_matrix(model=model), dtype=float + ) + algorithm = tINIT(s_matrix, np.asarray(lower_bounds), np.asarray(upper_bounds), properties) + active_rxn_indices: npt.NDArray[np.integer] = algorithm.run() + def _build_with_corda( reference_model: cobra.Model, From 46b89aa9790af53155f5b5141d8f4d7e83dc1f50 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:24:58 -0600 Subject: [PATCH 24/41] refactor: improve context-specific model creation logic Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 812842fd..0ac816c6 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -230,16 +230,15 @@ def _build_with_fastcore( "Lower bounds, upper bounds, and stoichiometric matrix must have the same number of reactions." ) logger.debug("Creating feasible model") - _, cobra_model = _feasibility_test(cobra_model, "other") - properties = FastcoreProperties(core=exp_idx_list, solver=solver) - algorithm = FASTcore(s_matrix, lower_bounds, upper_bounds, properties) + _, reference_model = _feasibility_test(reference_model, "other") + properties = FastcoreProperties(core=list(exp_idx_list), solver=solver) + algorithm = FASTcore(s_matrix, np.asarray(lower_bounds), np.asarray(upper_bounds), properties) context_rxns = algorithm.fastcore() - context_cobra_model = cobra_model.copy() - r_ids = [r.id for r in context_cobra_model.reactions] + r_ids = [r.id for r in model.reactions] remove_rxns = [r_ids[int(i)] for i in range(s_matrix.shape[1]) if i not in context_rxns] - context_cobra_model.remove_reactions(remove_rxns, True) + model.remove_reactions(remove_rxns, True) - return context_cobra_model + return model def _build_with_imat( From f2d7199cd65093a00c3f9cd32dd697119ff5591e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:25:12 -0600 Subject: [PATCH 25/41] chore: add type hints for reconstructing with tINIT Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 0ac816c6..7b867da2 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -345,18 +345,18 @@ def _build_with_imat( def _build_with_tinit( reference_model: cobra.Model, - lower_bounds, - upper_bounds, - expr_vector, - solver, - idx_force, -) -> Model: + lower_bounds: npt.NDArray[np.floating], + upper_bounds: npt.NDArray[np.floating], + expr_vector: npt.NDArray[np.floating], + solver: str, + idx_force: npt.NDArray[np.integer], +) -> None: raise NotImplementedError("tINIT is not yet implemented.") model = reference_model properties = tINITProperties( - reactions_scores=expr_vector, + reactions_scores=list(expr_vector), solver=solver, - essential_reactions=idx_force, + essential_reactions=list(idx_force), production_weight=0.0, allow_excretion=False, no_reverse_loops=True, From 98a54365fe34c4561968fd3bb1fe316ad74867ef Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:26:13 -0600 Subject: [PATCH 26/41] chore: do not use match case; reduces indentation Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 7b867da2..4a9e5ccd 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -524,15 +524,13 @@ def _map_expression_to_reaction( def _read_reference_model(filepath: Path) -> cobra.Model: - match filepath.suffix: - case ".mat": - reference_model = cobra.io.load_matlab_model(filepath) - case ".xml" | ".sbml": - reference_model = cobra.io.read_sbml_model(filepath) - case ".json": - reference_model = cobra.io.load_json_model(filepath) - case _: - raise ValueError(f"Reference model format must be .xml, .mat, or .json; found '{filepath.suffix}'") + if filepath.suffix == ".json": + return cobra.io.load_json_model(filepath) + if filepath.suffix == ".mat": + return cobra.io.load_matlab_model(filepath) + if filepath.suffix in {".xml", ".sbml"}: + return cobra.io.read_sbml_model(filepath) + raise ValueError(f"Reference model format must be .xml, .mat, or .json; found '{filepath.suffix}'") def _build_model( From b83842c5092ca8e821257be50050395f21f9bed2 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:26:49 -0600 Subject: [PATCH 27/41] perf: pass in cobra.Model object to prevent reading multiple times Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 4a9e5ccd..c8ce8a0d 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -581,12 +581,8 @@ def _build_model( list of expression indices used A DataFrame of infeasible reactions. """ - reference_model: cobra.Model = _read_reference_model(general_model_file) - if objective not in force_reactions: - force_reactions.append(objective) - reference_model = _set_boundaries(reference_model, boundary_reactions, lower_bounds, upper_bounds) - reference_model.solver = solver.lower() + force_reactions.append(objective) # forcibly include the objective in the reconstructed model # Set reference model boundaries for rxn, lb, ub in zip(boundary_reactions, lower_bounds, upper_bounds, strict=True): From a419bce82aca58213451ca2a1190515f5f068a7d Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:27:19 -0600 Subject: [PATCH 28/41] refactor: enhance reaction expression mapping and logging for missing reactions Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index c8ce8a0d..d488879e 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -612,16 +612,19 @@ def _build_model( if np.isnan(ref_ub).any(): raise ValueError("Upper bounds contains unfilled values!") - # get expressed reactions - reaction_expression: collections.OrderedDict[str, int] = await _map_expression_to_reaction( - reference_model, gene_expression_file, recon_algorithm, high_thresh=high_thresh, low_thresh=low_thresh, taxon=taxon + reaction_expression, min_expr_val, low_expr, high_expr = _map_expression_to_reaction( + reference_model, + gene_expression_file, + recon_algorithm, + taxon=taxon, + low_percentile=low_percentile, + high_percentile=high_percentile, ) - expression_vector: npt.NDArray[float] = np.array(list(reaction_expression.values()), dtype=float) - - for rxn in force_reactions: - if rxn not in reaction_ids: + expression_vector: npt.NDArray[np.floating] = np.asarray(list(reaction_expression.values()), dtype=float) + for rxn_id in force_reactions: + if rxn_id not in reaction_ids: logger.warning( - f"The force reaction '{rxn}' was not found in the reference model. " + f"The force reaction '{rxn_id}' was not found in the reference model. " f"Check BiGG, or the relevant database for your reference model, for synonyms." ) From 3a8f131d2ce54da312e5d9009ac249772071eba9 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:27:40 -0600 Subject: [PATCH 29/41] refactor: streamline reaction expression handling in model creation Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 26 ++++++++-------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index d488879e..1d8e9f5f 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -628,23 +628,15 @@ def _build_model( f"Check BiGG, or the relevant database for your reference model, for synonyms." ) - # collect list of reactions that are infeasible but active in expression data or user defined - infeasible_expression_reactions = [] - infeasible_force_reactions = [] - - for i, rxn in enumerate(reaction_expression): - # log reactions in expressed and force lists that are infeasible that the user may wish to review - if rxn in inconsistent_reactions and expression_vector[i] == 1: - infeasible_expression_reactions.append(rxn) - if rxn in inconsistent_reactions and rxn in force_reactions: - infeasible_force_reactions.append(rxn) - - if rxn in force_reactions: - expression_vector[i] = high_thresh + 0.1 if recon_algorithm in {Algorithm.TINIT, Algorithm.IMAT} else 1 - if rxn in inconsistent_reactions or rxn in exclude_reactions: - expression_vector[i] = low_thresh - 0.1 if recon_algorithm in {Algorithm.TINIT, Algorithm.IMAT} else 0 - - objective_index = reaction_ids.index(objective) + # Set force-include reactions to have an expression higher than the upper threshold + # Set force-exclude reactions to have an expression lower than the lower threshold + rxn_ids = list(reaction_expression.keys()) + force_mask = np.isin(rxn_ids, force_reactions) + exclude_mask = np.isin(rxn_ids, exclude_reactions) + expression_vector[force_mask] = high_expr + 1 if recon_algorithm in {Algorithm.TINIT, Algorithm.IMAT} else 1 + expression_vector[exclude_mask] = ( + max(0.0, low_expr - 1) if recon_algorithm in {Algorithm.TINIT, Algorithm.IMAT} else 0 + ) if force_boundary_rxn_inclusion: all_forced: set[str] = {*force_reactions, *boundary_reactions} From 4cab288abca5f0fc6ecbad1a85f9e379fd5192fb Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:28:00 -0600 Subject: [PATCH 30/41] refactor: improve reaction index handling and expression vector initialization Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 1d8e9f5f..3b8384d2 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -640,16 +640,15 @@ def _build_model( if force_boundary_rxn_inclusion: all_forced: set[str] = {*force_reactions, *boundary_reactions} - force_reaction_indices: npt.NDArray[np.uint16] = np.array( - [reaction_ids.index(rxn) for rxn in all_forced if rxn in reaction_ids], dtype=np.uint16 + force_reaction_indices: npt.NDArray[np.integer] = np.asarray( + [reaction_ids.index(rxn_id) for rxn_id in all_forced if rxn_id in reaction_ids], dtype=int ) else: - force_reaction_indices: npt.NDArray[np.uint16] = np.array( - [reaction_ids.index(rxn) for rxn in force_reactions if rxn in reaction_ids], dtype=np.uint16 + force_reaction_indices: npt.NDArray[np.integer] = np.asarray( + [reaction_ids.index(rxn_id) for rxn_id in force_reactions if rxn_id in reaction_ids], dtype=int ) - expression_vector_indices = [i for (i, val) in enumerate(expression_vector) if val > 0] - expression_threshold = (low_thresh, high_thresh) + expression_vector_indices: list[int] = list(range(len(expression_vector))) if recon_algorithm == Algorithm.IMAT: context_model_cobra: cobra.Model = _build_with_imat( @@ -657,8 +656,8 @@ def _build_model( lower_bounds=ref_lb, upper_bounds=ref_ub, expr_vector=expression_vector, - low_expression_threshold=updated_low_thresh, - high_expression_threshold=updated_high_thresh, + low_expression_threshold=low_expr, + high_expression_threshold=high_expr, force_reaction_indices=force_reaction_indices, solver=solver, build_settings=build_settings, @@ -673,7 +672,7 @@ def _build_model( context_model_cobra: cobra.Model = _build_with_gimme( reference_model=reference_model, expression_vector=expression_vector, - idx_objective=objective_index, + idx_objective=reaction_ids.index(objective), lower_bounds=ref_lb, upper_bounds=ref_ub, solver=solver, From e4b90b5799238cf29228eef1a4dde55e68ce02ff Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:28:15 -0600 Subject: [PATCH 31/41] refactor: update reaction subsystem assignment and optimize flux output handling Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 33 +++++++++++----------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 3b8384d2..71b8baf4 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -701,23 +701,22 @@ def _build_model( f"Got: {recon_algorithm.value}" ) - inconsistent_and_infeasible_reactions: pd.DataFrame = pd.concat( - [ - pd.DataFrame({"infeasible_reactions": inconsistent_reactions}), - pd.DataFrame({"expressed_infeasible_reactions": infeasible_expression_reactions}), - pd.DataFrame({"infeasible_force_reactions": infeasible_force_reactions}), - pd.DataFrame({"infeasible_context_reactions": []}), # Included to maintain legacy support - ], - ignore_index=True, - axis=0, - ) - - return _BuildResults( - model=context_model_cobra, - expression_index_list=expression_vector_indices, - infeasible_reactions=inconsistent_and_infeasible_reactions, - ) - + # Set each reaction subsystem based on the reference model + for rxn in context_model_cobra.reactions: + cast(cobra.Reaction, context_model_cobra.reactions.get_by_id(rxn.id)).subsystem = cast( + cobra.Reaction, reference_model.reactions.get_by_id(rxn.id) + ).subsystem + + context_model_cobra.objective = objective + flux_sol: cobra.Solution = context_model_cobra.optimize() + fluxes = flux_sol.fluxes + model_reactions: list[str] = [reaction.id for reaction in context_model_cobra.reactions] + reaction_intersections: set[str] = set(fluxes.index).intersection(model_reactions) + flux_df = fluxes[~fluxes.index.isin(reaction_intersections)] + flux_df.dropna(inplace=True) + flux_df.to_csv(output_flux_result_filepath) + + return context_model_cobra def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.DataFrame: From 2f6e09e0cda6b57d9a36dbf169717556bf84c171 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:28:43 -0600 Subject: [PATCH 32/41] refactor: simplify DataFrame creation by replacing async file read with pandas read_csv Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 71b8baf4..0a8d31ff 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -722,15 +722,8 @@ def _build_model( def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.DataFrame: if path.suffix not in {".csv", ".tsv"}: raise ValueError(f"File must be a .csv or .tsv file, got '{path.suffix}'") - df: pd.DataFrame = await _read_file(path=path, header=0, sep="," if path.suffix == ".csv" else "\t", h5ad_as_df=True) - - if not isinstance(df, pd.DataFrame): - _log_and_raise_error( - f"Expected a pandas.DataFrame, got {type(df)}", - error=TypeError, - level=LogLevel.ERROR, - ) + df = pd.read_csv(path, header=0, sep="," if path.suffix == ".csv" else "\t") if lowercase_col_names: df.columns = [c.lower() for c in df.columns] return df From d2246395057aa991aebb08442e1c77707d4f2310 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:29:04 -0600 Subject: [PATCH 33/41] refactor: update parameter names for clarity and improve docstring formatting Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 61 ++++++++++++---------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 0a8d31ff..cbcf3a08 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -834,53 +834,58 @@ def _write_model_to_disk( def create_context_specific_model( # noqa: C901 context_name: str, - taxon: int | str | Taxon, - reference_model: Path, + taxon_id: int | str, + reference_model_filepath: Path, active_genes_filepath: Path, output_infeasible_reactions_filepath: Path, output_flux_result_filepath: Path, output_model_filepaths: Path | list[Path], output_fastcore_expression_index_filepath: Path | None = None, objective: str = "biomass_reaction", + objective_direction: Literal["min", "max"] = "max", + contextualized_model_id: str | None = None, boundary_rxns_filepath: str | Path | None = None, exclude_rxns_filepath: str | Path | None = None, force_rxns_filepath: str | Path | None = None, algorithm: Algorithm = Algorithm.GIMME, - low_threshold: float = -5, - high_threshold: float = -3, + low_percentile: int | None = None, + high_percentile: int | None = None, solver: Solver = Solver.GLPK, log_level: LogLevel = LogLevel.INFO, log_location: str | TextIO | TextIOWrapper = sys.stderr, + build_settings: ModelBuildSettings | None = None, *, force_boundary_rxn_inclusion: bool = False, close_unlisted_exchanges: bool = False, ) -> cobra.Model: """Create a context-specific model using the provided data. - Args: - context_name: Name of the context-specific model. - taxon: NCBI taxonomy ID or name for the organism of interest. - reference_model: Path to the general genome-scale metabolic model file (.xml, .mat, or .json). - active_genes_filepath: Path to the gene expression data file (csv, tsv, or Excel). - output_infeasible_reactions_filepath: Path to save infeasible reactions (csv). - output_flux_result_filepath: Path to save flux results (csv). - output_model_filepaths: Path or list of paths to save the context-specific model (.xml, .mat, or .json). - output_fastcore_expression_index_filepath: Path to save Fastcore expression indices (txt). Required if using Fastcore. - objective: Objective function reaction ID. - boundary_rxns_filepath: Optional path to boundary reactions file (csv, tsv, or Excel). - exclude_rxns_filepath: Optional path to reactions to exclude file (csv, tsv, or Excel). - force_rxns_filepath: Optional path to reactions to force include file (csv, tsv, or Excel). - algorithm: Algorithm to use for reconstruction. One of Algorithm.GIMME, Algorithm.FASTCORE, Algorithm.IMAT, Algorithm.TINIT. - low_threshold: Low expression threshold for algorithms that require it. - high_threshold: High expression threshold for algorithms that require it. - solver: Solver to use. One of Solver.GLPK, Solver.CPLEX, Solver.GUROBI - log_level: Logging level. One of LogLevel.DEBUG, LogLevel.INFO, LogLevel.WARNING, LogLevel.ERROR, LogLevel.CRITICAL - log_location: Location for log output. Can be a file path or sys.stderr/sys.stdout. - force_boundary_rxn_inclusion: If True, ensure that all provided boundary reactions are included in the final model. - - Raises: - ImportError: If Gurobi solver is selected but gurobipy is not installed. - """ + :param context_name: Name of the context-specific model. + :param taxon_id: NCBI taxonomy ID or name for the organism of interest. + :param reference_model_filepath: Path to the general genome-scale metabolic model file (.xml, .mat, or .json). + :param active_genes_filepath: Path to the gene expression data file (csv, tsv, or Excel). + :param output_infeasible_reactions_filepath: Path to save infeasible reactions (csv). + :param output_flux_result_filepath: Path to save flux results (csv). + :param output_model_filepaths: Path or list of paths to save the context-specific model (.xml, .mat, or .json). + :param output_fastcore_expression_index_filepath: Path to save Fastcore expression indices (txt). + :param objective: Objective function reaction ID. + :param objective_direction: Direction of the objective function, either 'min' or 'max'. + :param contextualized_model_id: A custom name to save under `model.id`, otherwise the `context_name` value will be used. + :param boundary_rxns_filepath: Optional path to boundary reactions file (csv, tsv, or Excel). + :param exclude_rxns_filepath: Optional path to reactions to exclude file (csv, tsv, or Excel). + :param force_rxns_filepath: Optional path to reactions to force include file (csv, tsv, or Excel). + :param algorithm: Algorithm to use for reconstruction. + :param low_percentile: Low percentile for gene expression threshold calculation. + :param high_percentile: High percentile for gene expression threshold calculation. + :param solver: Solver to use. One of Solver.GLPK, Solver.CPLEX, Solver.GUROBI + :param log_level: Logging level + :param log_location: Location for log output. Can be a file path or sys.stderr/sys.stdout. + :param force_boundary_rxn_inclusion: If True, ensure that all provided boundary reactions are included in the final model. + :param build_settings: Optional ModelBuildSettings object to customize model building parameters. + :param close_unlisted_exchanges: If True, all exchanges not listed in the boundary reactions input will be closed + + :raises ImportError: If Gurobi solver is selected but gurobipy is not installed. + """ # noqa: E501 set_up_logging(level=log_level, location=log_location) if low_percentile is None: raise ValueError("low_percentile must be provided") From e1b2e79b9b33a21a92c9434cb4cf003832f61502 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:29:26 -0600 Subject: [PATCH 34/41] refactor: improve error handling and streamline file path assignments in model creation Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 32 +++++++++++++--------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index cbcf3a08..d73106b1 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -891,13 +891,14 @@ def create_context_specific_model( # noqa: C901 raise ValueError("low_percentile must be provided") if high_percentile is None: raise ValueError("high_percentile must be provided") - # TODO: set up zfpkm threshold defaults - boundary_rxns_filepath: Path | None = Path(boundary_rxns_filepath) if boundary_rxns_filepath else None - output_model_filepaths = [output_model_filepaths] if isinstance(output_model_filepaths, Path) else output_model_filepaths + boundary_rxns_filepath = Path(boundary_rxns_filepath) if boundary_rxns_filepath else None + output_model_filepaths = ( + [output_model_filepaths] if isinstance(output_model_filepaths, Path) else output_model_filepaths + ) - if not reference_model.exists(): - raise FileNotFoundError(f"Reference model not found at {reference_model}") + if not reference_model_filepath.exists(): + raise FileNotFoundError(f"Reference model not found at {reference_model_filepath}") if not active_genes_filepath.exists(): raise FileNotFoundError(f"Active genes file not found at {active_genes_filepath}") if algorithm == Algorithm.FASTCORE and not output_fastcore_expression_index_filepath: @@ -927,23 +928,22 @@ def create_context_specific_model( # noqa: C901 reference_model=reference_model, close_unlisted_exchanges=close_unlisted_exchanges, ) - - boundary_reactions = None - if boundary_rxns_filepath: - boundary_reactions = await _collect_boundary_reactions(boundary_rxns_filepath) + if boundary_rxns_filepath + else _BoundaryReactions(reactions=[], lower_bounds=[], upper_bounds=[]) + ) exclude_rxns: list[str] = [] if exclude_rxns_filepath: - exclude_rxns_filepath: Path = Path(exclude_rxns_filepath) - df = await _create_df(exclude_rxns_filepath) + exclude_rxns_filepath = Path(exclude_rxns_filepath) + df = _create_df(exclude_rxns_filepath) if "abbreviation" not in df.columns: raise ValueError("The exclude reactions file should have a single column with a header named Abbreviation") exclude_rxns = df["abbreviation"].tolist() force_rxns: list[str] = [] if force_rxns_filepath: - force_rxns_filepath: Path = Path(force_rxns_filepath) - df = await _create_df(force_rxns_filepath, lowercase_col_names=True) + force_rxns_filepath = Path(force_rxns_filepath) + df = _create_df(force_rxns_filepath, lowercase_col_names=True) if "abbreviation" not in df.columns: raise ValueError("The force reactions file should have a single column with a header named Abbreviation") force_rxns = df["abbreviation"].tolist() @@ -966,6 +966,12 @@ def create_context_specific_model( # noqa: C901 "COMO will continue, but it is HIGHLY unlikely the resulting model will be valid." ) + overlapping_rxns = set(force_rxns) & set(exclude_rxns) + if overlapping_rxns: + raise ValueError( + f"Reactions found in force-include and force-exclude sets: {','.join(sorted(overlapping_rxns))}" + ) + logger.info(f"context='{context_name}', reconstruction method='{algorithm.value}', solver='{solver.value}'") build_results: _BuildResults = await _build_model( general_model_file=reference_model, From f7de63b175e5294919ca375a3812126e99b9978c Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:29:45 -0600 Subject: [PATCH 35/41] refactor: enhance model building parameters for clarity and consistency Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index d73106b1..c6eb4269 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -973,22 +973,24 @@ def create_context_specific_model( # noqa: C901 ) logger.info(f"context='{context_name}', reconstruction method='{algorithm.value}', solver='{solver.value}'") - build_results: _BuildResults = await _build_model( - general_model_file=reference_model, + context_model: cobra.Model = _build_model( + reference_model=reference_model, gene_expression_file=active_genes_filepath, recon_algorithm=algorithm, objective=objective, - boundary_reactions=boundary_reactions.reactions if boundary_reactions else [], + objective_direction=objective_direction, + boundary_reactions=boundary_reactions.reactions, exclude_reactions=exclude_rxns, force_reactions=force_rxns, - lower_bounds=boundary_reactions.lower_bounds if boundary_reactions else [], - upper_bounds=boundary_reactions.upper_bounds if boundary_reactions else [], + lower_bounds=boundary_reactions.lower_bounds, + upper_bounds=boundary_reactions.upper_bounds, solver=solver.value.lower(), - low_thresh=low_threshold, - high_thresh=high_threshold, + low_percentile=low_percentile, + high_percentile=high_percentile, output_flux_result_filepath=output_flux_result_filepath, force_boundary_rxn_inclusion=force_boundary_rxn_inclusion, - taxon=taxon, + taxon=taxon_id, + build_settings=build_settings or ModelBuildSettings(), ) build_results.infeasible_reactions.dropna(inplace=True) From d1784919dde9f744d7811fa09676b22024639661 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:30:07 -0600 Subject: [PATCH 36/41] refactor: streamline context model creation and improve logging output Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index c6eb4269..b0dff92a 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -992,23 +992,25 @@ def create_context_specific_model( # noqa: C901 taxon=taxon_id, build_settings=build_settings or ModelBuildSettings(), ) - - build_results.infeasible_reactions.dropna(inplace=True) - build_results.infeasible_reactions.to_csv(output_infeasible_reactions_filepath, index=False) + context_model.id = contextualized_model_id or context_name if algorithm == Algorithm.FASTCORE: - fastcore_df = pd.DataFrame(build_results.expression_index_list) + fastcore_df = pd.DataFrame([r.id for r in context_model.reactions]) fastcore_df.dropna(inplace=True) fastcore_df.to_csv(output_fastcore_expression_index_filepath, index=False) - await _write_model_to_disk( + _write_model_to_disk( context_name=context_name, - model=build_results.model, + model=context_model, output_filepaths=output_model_filepaths, mat_suffix=mat_suffix, json_suffix=json_suffix, xml_suffix=xml_suffix, ) logger.info( - f"context={context_name}, reactions={len(build_results.model.reactions)}, genes={len(build_results.model.genes)}, metabolites={len(build_results.model.metabolites)}" + f"context={context_name}, " + f"reactions={len(context_model.reactions)}, " + f"genes={len(context_model.genes)}, " + f"metabolites={len(context_model.metabolites)}" ) + return context_model From cae84ea7124324a6fb6593127dc958fa2944a4f7 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:30:35 -0600 Subject: [PATCH 37/41] chore: use proper NumPy types for better compatibility Signed-off-by: Josh Loecker --- main/como/stats/_two_sample.py | 2 +- main/como/stats/fisher_exact_test.py | 2 +- main/como/stats/ks_test.py | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/main/como/stats/_two_sample.py b/main/como/stats/_two_sample.py index e97dffbf..1b38bd4b 100644 --- a/main/como/stats/_two_sample.py +++ b/main/como/stats/_two_sample.py @@ -40,7 +40,7 @@ def _run( df2: pd.DataFrame, cores: int = 1, worker_kwargs: dict | None = None, - ) -> tuple[list[str], Mapping[str, npt.NDArray[float | np.uint8]]]: + ) -> tuple[list[str], Mapping[str, npt.NDArray[np.floating]]]: all_reactions = list(set(df1.columns) & set(df2.columns)) array_a = df1[all_reactions].to_numpy(dtype=float, copy=False) array_b = df2[all_reactions].to_numpy(dtype=float, copy=False) diff --git a/main/como/stats/fisher_exact_test.py b/main/como/stats/fisher_exact_test.py index 068eb4c2..4312069c 100644 --- a/main/como/stats/fisher_exact_test.py +++ b/main/como/stats/fisher_exact_test.py @@ -97,5 +97,5 @@ def run( else: d += 1 - result = scipy.stats.fisher_exact(np.array([[a, b], [c, d]]), alternative=alternative) + result = scipy.stats.fisher_exact(np.asarray([[a, b], [c, d]]), alternative=alternative) return cls(statistic=result.statistic, pvalue=result.pvalue, pathway=pathway, a=a, b=b, c=c, d=d) diff --git a/main/como/stats/ks_test.py b/main/como/stats/ks_test.py index 68fb1c2f..b41d57f0 100644 --- a/main/como/stats/ks_test.py +++ b/main/como/stats/ks_test.py @@ -21,13 +21,13 @@ class KSTest(BaseTwoSample[KS_RESULT]): } reaction_ids: list[str] - statistic: npt.NDArray[float] - pvalue: npt.NDArray[float] - statistic_location: npt.NDArray[float] - statistic_sign: npt.NDArray[int] + statistic: npt.NDArray[np.floating] + pvalue: npt.NDArray[np.floating] + statistic_location: npt.NDArray[np.floating] + statistic_sign: npt.NDArray[np.integer] @staticmethod - def _worker(a: npt.NDArray[float], b: npt.NDArray[float], **kwargs) -> KS_RESULT: + def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) -> KS_RESULT: """Calculate the KS statistic. Args: From 53b183059eeb29497fccc24e8b6cf2bc720cb3f2 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:31:00 -0600 Subject: [PATCH 38/41] chore: use `mod.get_remaining_identifiers` in replacement of `mod.convert` Signed-off-by: Josh Loecker --- tests/unit/test_identifier_convert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_identifier_convert.py b/tests/unit/test_identifier_convert.py index 72118877..736518cd 100644 --- a/tests/unit/test_identifier_convert.py +++ b/tests/unit/test_identifier_convert.py @@ -172,7 +172,7 @@ def test_get_conversion_scope_specific_mapping(mod, scope, results, expected): def test_convert_rejects_mixed_id_types(mod): # Mixed: Entrez-like ("1017") + symbol ("TP53") => ValueError with pytest.raises(ValueError, match="All items in ids must be of the same type"): - mod.convert(ids=["1017", "TP53"], taxon=9606) + mod.get_remaining_identifiers(ids=["1017", "TP53"], taxon=9606) def test_convert_single_id_passes_expected_scope_fields_taxon_and_cache(monkeypatch, mod): @@ -190,7 +190,7 @@ def fake_get_conversion(*, info, values, scope, fields, taxon): monkeypatch.setattr(mod, "MyGeneInfo", CapturingMyGeneInfo) monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion) - df = mod.convert(ids=1017, taxon=9606, cache=False) + df = mod.get_remaining_identifiers(ids=1017, taxon=9606, cache=False) assert isinstance(df, pd.DataFrame) assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"} @@ -228,7 +228,7 @@ def fake_get_conversion(*, info, values, scope, fields, taxon): monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion) ids = list(range(1, 1002)) # 1001 ids => should create 2 chunks: 1000 and 1 - df = mod.convert(ids=ids, taxon="9606") + df = mod.get_remaining_identifiers(ids=ids, taxon="9606") assert len(df) == 1001 assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"} @@ -262,7 +262,7 @@ def fake_get_conversion(*, info, values, scope, fields, taxon): monkeypatch.setattr(mod, "MyGeneInfo", CapturingMyGeneInfo) monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion) - df = mod.convert(ids=["TP53", "BRCA1"], taxon=9606) + df = mod.get_remaining_identifiers(ids=["TP53", "BRCA1"], taxon=9606) assert len(df) == 2 assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"} From b661349fc288d180b57a844210f2f6c9e42d6067 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:39:53 -0600 Subject: [PATCH 39/41] chore: update built-in boundary reaction bounds to match expected format Signed-off-by: Josh Loecker --- .../boundary_rxns/naiveB_boundary_rxns.csv | 102 +++++++++--------- main/data/boundary_rxns/smB_boundary_rxns.csv | 2 +- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/main/data/boundary_rxns/naiveB_boundary_rxns.csv b/main/data/boundary_rxns/naiveB_boundary_rxns.csv index be3eec35..11dac2c6 100644 --- a/main/data/boundary_rxns/naiveB_boundary_rxns.csv +++ b/main/data/boundary_rxns/naiveB_boundary_rxns.csv @@ -1,7 +1,7 @@ -Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate -Exchange,glc_D,Extracellular,-1000,100 +Reaction,Abbreviation,Compartment,Lower Bound,Upper Bound +Exchange,glc_D,Extracellular,-100,1000 Exchange,fe2,Extracellular,-1000,1000 -Exchange,gal,Extracellular,-1000,0 +Exchange,gal,Extracellular,-0,1000 Exchange,zn2,Extracellular,-1000,1000 Exchange,ca2,Extracellular,-1000,1000 Exchange,na1,Extracellular,-1000,1000 @@ -16,55 +16,55 @@ Exchange,co2,Extracellular,-1000,1000 Exchange,nh4,Extracellular,-1000,1000 Exchange,no2,Extracellular,-1000,1000 Exchange,co,Extracellular,-1000,1000 -Exchange,asn_L,Extracellular,-1000,1 -Exchange,asp_L,Extracellular,-1000,1 -Exchange,glu_L,Extracellular,-1000,1 -Exchange,ile_L,Extracellular,-1000,1 -Exchange,leu_L,Extracellular,-1000,1 -Exchange,lys_L,Extracellular,-1000,1 -Exchange,met_L,Extracellular,-1000,1 -Exchange,pro_L,Extracellular,-1000,1 -Exchange,ser_L,Extracellular,-1000,1 -Exchange,val_L,Extracellular,-1000,1 -Exchange,gly,Extracellular,-1000,1 -Exchange,cys_L,Extracellular,-1000,1 -Exchange,ala_L,Extracellular,-1000,1 -Exchange,his_L,Extracellular,-1000,1 -Exchange,thr_L,Extracellular,-1000,1 -Exchange,gln_L,Extracellular,-1000,1 -Exchange,phe_L,Extracellular,-1000,1 -Exchange,tyr_L,Extracellular,-1000,1 -Exchange,arg_L,Extracellular,-1000,1 -Exchange,trp_L,Extracellular,-1000,1 -Exchange,eicostet,Extracellular,-1000,1 -Exchange,hdca,Extracellular,-1000,1 -Exchange,hdcea,Extracellular,-1000,1 -Exchange,lnlc,Extracellular,-1000,1 -Exchange,lnlnca,Extracellular,-1000,1 -Exchange,lnlncg,Extracellular,-1000,1 -Exchange,ocdca,Extracellular,-1000,1 -Exchange,ocdcea,Extracellular,-1000,1 -Exchange,thmmp,Extracellular,-1000,0 -Exchange,thmtp,Extracellular,-1000,0 -Exchange,ncam,Extracellular,-1000,0 -Exchange,pnto_R,Extracellular,-1000,0 -Exchange,pydxn,Extracellular,-1000,0 -Exchange,pydx,Extracellular,-1000,0 -Exchange,pydam,Extracellular,-1000,0 -Exchange,btn,Extracellular,-1000,0 -Exchange,fol,Extracellular,-1000,10 -Exchange,aqcobal,Extracellular,-1000,0 -Exchange,vitd3,Extracellular,-1000,1 -Exchange,ascb_L,Extracellular,-1000,0 -Exchange,retinol,Extracellular,-1000,10 -Exchange,retinal,Extracellular,-1000,10 -Exchange,ala_B,Extracellular,-1000,1 -Exchange,ala_D,Extracellular,-1000,1 +Exchange,asn_L,Extracellular,-1,1000 +Exchange,asp_L,Extracellular,-1,1000 +Exchange,glu_L,Extracellular,-1,1000 +Exchange,ile_L,Extracellular,-1,1000 +Exchange,leu_L,Extracellular,-1,1000 +Exchange,lys_L,Extracellular,-1,1000 +Exchange,met_L,Extracellular,-1,1000 +Exchange,pro_L,Extracellular,-1,1000 +Exchange,ser_L,Extracellular,-1,1000 +Exchange,val_L,Extracellular,-1,1000 +Exchange,gly,Extracellular,-1,1000 +Exchange,cys_L,Extracellular,-1,1000 +Exchange,ala_L,Extracellular,-1,1000 +Exchange,his_L,Extracellular,-1,1000 +Exchange,thr_L,Extracellular,-1,1000 +Exchange,gln_L,Extracellular,-1,1000 +Exchange,phe_L,Extracellular,-1,1000 +Exchange,tyr_L,Extracellular,-1,1000 +Exchange,arg_L,Extracellular,-1,1000 +Exchange,trp_L,Extracellular,-1,1000 +Exchange,eicostet,Extracellular,-1,1000 +Exchange,hdca,Extracellular,-1,1000 +Exchange,hdcea,Extracellular,-1,1000 +Exchange,lnlc,Extracellular,-1,1000 +Exchange,lnlnca,Extracellular,-1,1000 +Exchange,lnlncg,Extracellular,-1,1000 +Exchange,ocdca,Extracellular,-1,1000 +Exchange,ocdcea,Extracellular,-1,1000 +Exchange,thmmp,Extracellular,-0,1000 +Exchange,thmtp,Extracellular,-0,1000 +Exchange,ncam,Extracellular,-0,1000 +Exchange,pnto_R,Extracellular,-0,1000 +Exchange,pydxn,Extracellular,-0,1000 +Exchange,pydx,Extracellular,-0,1000 +Exchange,pydam,Extracellular,-0,1000 +Exchange,btn,Extracellular,-0,1000 +Exchange,fol,Extracellular,-10,1000 +Exchange,aqcobal,Extracellular,-0,1000 +Exchange,vitd3,Extracellular,-1,1000 +Exchange,ascb_L,Extracellular,-0,1000 +Exchange,retinol,Extracellular,-10,1000 +Exchange,retinal,Extracellular,-10,1000 +Exchange,ala_B,Extracellular,-1,1000 +Exchange,ala_D,Extracellular,-1,1000 Exchange,h2co3,Extracellular,-1000,1000 Exchange,h2o2,Extracellular,-1000,1000 Exchange,hco3,Extracellular,-1000,1000 -Exchange,orn,Extracellular,-1000,1 -Exchange,orn_D,Extracellular,-1000,1 +Exchange,orn,Extracellular,-1,1000 +Exchange,orn_D,Extracellular,-1,1000 Exchange,so4,Extracellular,-1000,1000 -Exchange,ribflv,Extracellular,-1000,1 -Exchange,Lcystin,Extracellular,-1000,1 +Exchange,ribflv,Extracellular,-1,1000 +Exchange,Lcystin,Extracellular,-1,1000 diff --git a/main/data/boundary_rxns/smB_boundary_rxns.csv b/main/data/boundary_rxns/smB_boundary_rxns.csv index 9f0f9385..be12d9b7 100644 --- a/main/data/boundary_rxns/smB_boundary_rxns.csv +++ b/main/data/boundary_rxns/smB_boundary_rxns.csv @@ -1,4 +1,4 @@ -Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate +Reaction,Abbreviation,Compartment,Lower Bound,Upper Bound Exchange,glc_D,Extracellular,-100,1000 Exchange,fe2,Extracellular,-1000,1000 Exchange,gal,Extracellular,0,1000 From 2628568be5fc3ad375cd95767f4b158fc5f3d1e6 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 11:40:30 -0600 Subject: [PATCH 40/41] chore: ruff formatting Signed-off-by: Josh Loecker --- main/como/proteomics/FTPManager.py | 8 +++++++- main/como/proteomics/proteomics_preprocess.py | 18 +++++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/main/como/proteomics/FTPManager.py b/main/como/proteomics/FTPManager.py index 8cca5095..be5a54a3 100644 --- a/main/como/proteomics/FTPManager.py +++ b/main/como/proteomics/FTPManager.py @@ -20,7 +20,13 @@ from como.data_types import LogLevel -async def aioftp_client(host: str, username: str = "anonymous", password: str = "guest", port: int = 21, max_attempts: int = 3) -> aioftp.Client: +async def aioftp_client( + host: str, + username: str = "anonymous", + password: str = "guest", + port: int = 21, + max_attempts: int = 3, +) -> aioftp.Client: """This class is responsible for creating a "client" connection""" connection_successful: bool = False attempt_num: int = 1 diff --git a/main/como/proteomics/proteomics_preprocess.py b/main/como/proteomics/proteomics_preprocess.py index 04242443..79ff9850 100644 --- a/main/como/proteomics/proteomics_preprocess.py +++ b/main/como/proteomics/proteomics_preprocess.py @@ -159,7 +159,10 @@ def _gather_data(self): # Iterate through the URLs available for url, study in zip(ftp_urls, studies, strict=True): - ftp_data: FTPManager.Reader = FTPManager.Reader(root_link=url, file_extensions=self._preferred_extensions) + ftp_data: FTPManager.Reader = FTPManager.Reader( + root_link=url, + file_extensions=self._preferred_extensions, + ) urls = list(ftp_data.files) sizes = list(ftp_data.file_sizes) @@ -167,7 +170,14 @@ def _gather_data(self): # Iterate through all files and sizes found for url_## for file, size in zip(urls, sizes, strict=True): - self.file_information.append(FileInformation(cell_type=cell_type, download_url=file, file_size=size, study=study)) + self.file_information.append( + FileInformation( + cell_type=cell_type, + download_url=file, + file_size=size, + study=study, + ) + ) def print_download_size(self): """Print the total size to download if we must download data.""" @@ -327,7 +337,9 @@ def parse_args() -> argparse.Namespace: else: args.core_count = int(args.core_count) if args.core_count > os.cpu_count(): - logger.info(f"{args.core_count} cores not available, system only has {os.cpu_count()} cores. Setting '--cores' to {os.cpu_count()}") + logger.info( + f"{args.core_count} cores not available, system only has {os.cpu_count()} cores. Setting '--cores' to {os.cpu_count()}" + ) args.core_count = os.cpu_count() return args From cd61f79c84c755be197865871b8f7fdc6ddc977b Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Fri, 6 Mar 2026 13:29:36 -0600 Subject: [PATCH 41/41] chore: update identifier pipeline tests Signed-off-by: Josh Loecker --- tests/unit/test_identifier_convert.py | 219 +++++++++++++------------- 1 file changed, 109 insertions(+), 110 deletions(-) diff --git a/tests/unit/test_identifier_convert.py b/tests/unit/test_identifier_convert.py index 736518cd..3df2289f 100644 --- a/tests/unit/test_identifier_convert.py +++ b/tests/unit/test_identifier_convert.py @@ -1,4 +1,3 @@ -# tests/test_gene_conversion.py from __future__ import annotations import importlib @@ -22,38 +21,38 @@ def mod(monkeypatch): # Stub bioservices.mygeneinfo.MyGeneInfo so importing the module works bioservices = types.ModuleType("bioservices") mygeneinfo = types.ModuleType("bioservices.mygeneinfo") - + class DummyMyGeneInfo: def __init__(self, cache: bool = True): self.cache = cache - + def get_queries(self, **kwargs): raise AssertionError( "DummyMyGeneInfo.get_queries was called unexpectedly. " "Patch mod.MyGeneInfo or use _get_conversion tests with a fake info object." ) - + mygeneinfo.MyGeneInfo = DummyMyGeneInfo bioservices.mygeneinfo = mygeneinfo - + monkeypatch.setitem(sys.modules, "bioservices", bioservices) monkeypatch.setitem(sys.modules, "bioservices.mygeneinfo", mygeneinfo) - + # Ensure a clean import each test if MODULE_NAME in sys.modules: del sys.modules[MODULE_NAME] - + return importlib.import_module(MODULE_NAME) def test_determine_gene_type_accepts_single_string(mod): - assert mod.determine_gene_type("1017") == {"1017": "entrezgene"} + assert mod.determine_gene_type("1017") == "entrezgene" def test_determine_gene_type_classifies_common_formats_and_float(mod): # Includes a float to exercise the float-specific split(".")[0] branch. out = mod.determine_gene_type(["1017", "ENSG00000141510", "TP53", 123.0]) - + assert out["1017"] == "entrezgene" assert out["ENSG00000141510"] == "ensembl.gene" assert out["TP53"] == "symbol" @@ -70,19 +69,14 @@ def test_determine_gene_type_ensembl_format_requires_ens_prefix_and_11_trailing_ assert mod.determine_gene_type(["ENSGABCDEFGHIJK"])["ENSGABCDEFGHIJK"] == "symbol" -# --------------------------------------------------------------------------- -# _get_conversion tests -# --------------------------------------------------------------------------- - - @dataclass class FakeInfo: return_value: object calls: list[dict] = None - + def __post_init__(self): # noqa: D105 self.calls = [] - + def get_queries(self, **kwargs): self.calls.append(kwargs) return self.return_value @@ -91,13 +85,13 @@ def get_queries(self, **kwargs): def test_get_conversion_raises_typeerror_when_results_not_list(mod): info = FakeInfo(return_value={"not": "a list"}) with pytest.raises(TypeError, match="Expected results to be a list"): - mod._get_conversion(info=info, values=["1017"], scope="entrezgene", fields="ensembl.gene,symbol", taxon="9606") + mod._get_conversion(info=info, values=["1017"], taxon="9606") def test_get_conversion_raises_typeerror_when_first_item_not_dict(mod): info = FakeInfo(return_value=["not a dict"]) with pytest.raises(TypeError, match="Expected each result to be a dict"): - mod._get_conversion(info=info, values=["1017"], scope="entrezgene", fields="ensembl.gene,symbol", taxon="9606") + mod._get_conversion(info=info, values=["1017"], taxon="9606") def test_get_conversion_calls_get_queries_with_expected_arguments(mod): @@ -106,67 +100,58 @@ def test_get_conversion_calls_get_queries_with_expected_arguments(mod): {"query": "1017", "ensembl.gene": "ENSG00000123374", "symbol": "CDK2"}, ] ) - - out = mod._get_conversion( - info=info, values=["1017"], scope="entrezgene", fields="ensembl.gene,symbol", taxon="9606" - ) - + + out = mod._get_conversion(info=info, values=["1017"], taxon="9606") + + # _get_conversion returns raw API results assert out == [ - {"ensembl_gene_id": "ENSG00000123374", "entrez_gene_id": "1017", "gene_symbol": "CDK2"}, + {"query": "1017", "ensembl.gene": "ENSG00000123374", "symbol": "CDK2"}, ] - + assert len(info.calls) == 1 call = info.calls[0] assert call["query"] == "1017" assert call["dotfield"] is True assert call["scopes"] == "entrezgene" - assert call["fields"] == "ensembl.gene,symbol" + assert set(call["fields"].split(",")) == { + "ensembl.gene", + "entrezgene", + "symbol", + "genomic_pos.start", + "genomic_pos.end", + "taxid", + "notfound", + } assert call["species"] == "9606" @pytest.mark.parametrize( - ("scope", "results", "expected"), + ("results",), [ + # Test data format: list of raw API results to pass through ( - "entrezgene", - [ - {"query": "1017", "ensembl.gene": "ENSG00000123374", "symbol": "CDK2"}, - {"query": "1018", "ensembl.gene": "ENSG00000123400", "symbol": "CDK3"}, - ], - [ - {"ensembl_gene_id": "ENSG00000123374", "entrez_gene_id": "1017", "gene_symbol": "CDK2"}, - {"ensembl_gene_id": "ENSG00000123400", "entrez_gene_id": "1018", "gene_symbol": "CDK3"}, - ], + [ + {"query": "1017", "ensembl.gene": "ENSG00000123374", "symbol": "CDK2"}, + {"query": "1018", "ensembl.gene": "ENSG00000123400", "symbol": "CDK3"}, + ], ), ( - "ensembl.gene", - [ - {"query": "ENSG00000141510", "entrezgene": "7157", "symbol": "TP53"}, - ], - [ - {"ensembl_gene_id": "ENSG00000141510", "entrez_gene_id": "7157", "gene_symbol": "TP53"}, - ], + [ + {"query": "ENSG00000141510", "entrezgene": "7157", "symbol": "TP53"}, + ], ), ( - "symbol", - [ - {"query": "TP53", "ensembl.gene": "ENSG00000141510", "entrezgene": "7157"}, - ], - [ - {"ensembl_gene_id": "ENSG00000141510", "entrez_gene_id": "7157", "gene_symbol": "TP53"}, - ], + [ + {"query": "TP53", "ensembl.gene": "ENSG00000141510", "entrezgene": "7157"}, + ], ), ], ) -def test_get_conversion_scope_specific_mapping(mod, scope, results, expected): +def test_get_conversion_returns_raw_results(mod, results): + """Test that _get_conversion returns raw API results without transformation.""" info = FakeInfo(return_value=results) - out = mod._get_conversion(info=info, values=[r["query"] for r in results], scope=scope, fields="x", taxon="9606") - assert out == expected - - -# --------------------------------------------------------------------------- -# convert tests -# --------------------------------------------------------------------------- + out = mod._get_conversion(info=info, values=[r["query"] for r in results], taxon="9606") + assert out == results def test_convert_rejects_mixed_id_types(mod): @@ -177,69 +162,75 @@ def test_convert_rejects_mixed_id_types(mod): def test_convert_single_id_passes_expected_scope_fields_taxon_and_cache(monkeypatch, mod): calls = [] - + class CapturingMyGeneInfo: def __init__(self, cache: bool = True): self.cache = cache - - def fake_get_conversion(*, info, values, scope, fields, taxon): - calls.append({"info": info, "values": values, "scope": scope, "fields": fields, "taxon": taxon}) - # Return one row per value - return [{"ensembl_gene_id": "ENSG_TEST", "entrez_gene_id": v, "gene_symbol": "SYM_TEST"} for v in values] - + + def fake_get_conversion(*, info, values, taxon): + calls.append({"info": info, "values": values, "taxon": taxon}) + # Return raw API format with taxid for filtering + value_list = [values] if isinstance(values, (int, str)) else values + return [ + {"query": str(v), "ensembl.gene": "ENSG_TEST", "entrezgene": str(v), "symbol": f"SYM{v}", "taxid": taxon} + for v in value_list + ] + monkeypatch.setattr(mod, "MyGeneInfo", CapturingMyGeneInfo) monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion) - + df = mod.get_remaining_identifiers(ids=1017, taxon=9606, cache=False) - + assert isinstance(df, pd.DataFrame) - assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"} + assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol", "taxon_id"} assert len(df) == 1 assert df.loc[0, "entrez_gene_id"] == "1017" assert df.loc[0, "ensembl_gene_id"] == "ENSG_TEST" - assert df.loc[0, "gene_symbol"] == "SYM_TEST" - + assert df.loc[0, "gene_symbol"] == "SYM1017" + assert df.loc[0, "taxon_id"] == 9606 + assert len(calls) == 1 call = calls[0] assert isinstance(call["info"], CapturingMyGeneInfo) assert call["info"].cache is False - assert call["values"] == ["1017"] - assert call["scope"] == "entrezgene" - assert call["taxon"] == "9606" + assert call["values"] == 1017 + assert call["taxon"] == 9606 - # Fields are computed via set arithmetic; order is not guaranteed. - assert set(call["fields"].split(",")) == {"ensembl.gene", "symbol"} - -def test_convert_chunks_inputs_over_1000(monkeypatch, mod): +def test_convert_large_input_list(monkeypatch, mod): + """Test that get_remaining_identifiers handles large input lists correctly.""" calls = [] - + class CapturingMyGeneInfo: def __init__(self, cache: bool = True): self.cache = cache - - def fake_get_conversion(*, info, values, scope, fields, taxon): - calls.append({"values_len": len(values), "scope": scope, "fields": fields, "taxon": taxon}) + + def fake_get_conversion(*, info, values, taxon): + calls.append({"values_len": len(values), "taxon": taxon}) + value_list = [values] if isinstance(values, (int, str)) else values return [ - {"ensembl_gene_id": f"ENSG{v.zfill(11)}", "entrez_gene_id": v, "gene_symbol": f"SYM{v}"} for v in values + { + "query": str(v), + "ensembl.gene": f"ENSG{str(v).zfill(11)}", + "entrezgene": str(v), + "symbol": f"SYM{v}", + "taxid": taxon, + } + for v in value_list ] - + monkeypatch.setattr(mod, "MyGeneInfo", CapturingMyGeneInfo) monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion) - - ids = list(range(1, 1002)) # 1001 ids => should create 2 chunks: 1000 and 1 + + ids = list(range(1, 1002)) df = mod.get_remaining_identifiers(ids=ids, taxon="9606") - + assert len(df) == 1001 - assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"} - - # Ensure chunking happened as expected - assert [c["values_len"] for c in calls] == [1000, 1] - for c in calls: - assert c["scope"] == "entrezgene" - assert c["taxon"] == "9606" - assert set(c["fields"].split(",")) == {"ensembl.gene", "symbol"} - + assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol", "taxon_id"} + assert len(calls) == 1 + assert calls[0]["values_len"] == 1001 + assert calls[0]["taxon"] == "9606" + # Spot-check first and last rows keep ordering assert df.loc[0, "entrez_gene_id"] == "1" assert df.loc[0, "gene_symbol"] == "SYM1" @@ -249,26 +240,34 @@ def fake_get_conversion(*, info, values, scope, fields, taxon): def test_convert_symbol_scope_fields_exclude_symbol(monkeypatch, mod): calls = [] - + class CapturingMyGeneInfo: def __init__(self, cache: bool = True): self.cache = cache - - def fake_get_conversion(*, info, values, scope, fields, taxon): - calls.append({"scope": scope, "fields": fields, "taxon": taxon, "values": values}) - # Return minimal row(s) - return [{"ensembl_gene_id": None, "entrez_gene_id": None, "gene_symbol": v} for v in values] - + + def fake_get_conversion(*, info, values, taxon): + calls.append({"taxon": taxon, "values": values}) + return [ + { + "query": v, + "ensembl.gene": f"ENSG{v}_TEST", + "entrezgene": "123", + "symbol": v, + "taxid": taxon, + } + for v in values + ] + monkeypatch.setattr(mod, "MyGeneInfo", CapturingMyGeneInfo) monkeypatch.setattr(mod, "_get_conversion", fake_get_conversion) - + df = mod.get_remaining_identifiers(ids=["TP53", "BRCA1"], taxon=9606) - + assert len(df) == 2 - assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol"} - + assert set(df.columns) == {"ensembl_gene_id", "entrez_gene_id", "gene_symbol", "taxon_id"} + assert df.loc[0, "gene_symbol"] == "TP53" + assert df.loc[1, "gene_symbol"] == "BRCA1" + assert df.loc[0, "taxon_id"] == 9606 + assert len(calls) == 1 - assert calls[0]["scope"] == "symbol" - assert calls[0]["taxon"] == "9606" - # When scope is "symbol", requested fields should be the other two - assert set(calls[0]["fields"].split(",")) == {"ensembl.gene", "entrezgene"} + assert calls[0]["taxon"] == 9606