From 0996dbcc6375038aa7c407ef4c705ef9414547c0 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Thu, 26 Mar 2026 21:01:53 -0700 Subject: [PATCH 1/5] update workflows --- .github/workflows/publish-pypi.yml | 52 +++++++++++++++++++++ .github/workflows/run-tests.yml | 73 ++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 .github/workflows/publish-pypi.yml create mode 100644 .github/workflows/run-tests.yml diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml new file mode 100644 index 0000000..405fee0 --- /dev/null +++ b/.github/workflows/publish-pypi.yml @@ -0,0 +1,52 @@ +name: Publish to PyPI + +on: + push: + tags: "*" + +jobs: + build: + runs-on: ubuntu-latest + permissions: + id-token: write + repository-projects: write + contents: write + pages: write + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: 3.12 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox + + - name: Test with tox + run: | + tox + + - name: Build Project and Publish + run: | + python -m tox -e clean,build + + # This uses the trusted publisher workflow so no token is required. + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + - name: Build docs + run: | + tox -e docs + + - run: touch ./docs/_build/html/.nojekyll + + - name: GH Pages Deployment + uses: JamesIves/github-pages-deploy-action@v4 + with: + branch: gh-pages # The branch the action should deploy to. + folder: ./docs/_build/html + clean: true # Automatically remove deleted files from the deploy branch diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml new file mode 100644 index 0000000..01f4e9a --- /dev/null +++ b/.github/workflows/run-tests.yml @@ -0,0 +1,73 @@ +name: Test the library + +on: + push: + branches: + - master # for legacy repos + - main + pull_request: + branches: + - master # for legacy repos + - main + workflow_dispatch: # Allow manually triggering the workflow + schedule: + # Run roughly every 15 days at 00:00 UTC + # (useful to check if updates on dependencies break the package) + - cron: "0 0 1,16 * *" + +permissions: + contents: read + +concurrency: + group: >- + ${{ github.workflow }}-${{ github.ref_type }}- + ${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + +jobs: + test: + strategy: + matrix: + python: ["3.10", "3.11", "3.12", "3.13", "3.14"] + platform: + - ubuntu-latest + # - macos-latest + # - windows-latest + runs-on: ${{ matrix.platform }} + name: Python ${{ matrix.python }}, ${{ matrix.platform }} + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + id: setup-python + with: + python-version: ${{ matrix.python }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox coverage + + - name: Run tests + run: >- + pipx run --python '${{ steps.setup-python.outputs.python-path }}' + tox + -- -rFEx --durations 10 --color yes --cov --cov-branch --cov-report=xml # pytest args + + - name: Check for codecov token availability + id: codecov-check + shell: bash + run: | + if [ ${{ secrets.CODECOV_TOKEN }} != '' ]; then + echo "codecov=true" >> $GITHUB_OUTPUT; + else + echo "codecov=false" >> $GITHUB_OUTPUT; + fi + + - name: Upload coverage reports to Codecov with GitHub Action + uses: codecov/codecov-action@v5 + if: ${{ steps.codecov-check.outputs.codecov == 'true' }} + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + slug: ${{ github.repository }} + flags: ${{ matrix.platform }} - py${{ matrix.python }} From 059ad4582842c8f0c53c0a43839a800a470cd699 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Thu, 26 Mar 2026 21:25:29 -0700 Subject: [PATCH 2/5] skip time taking tests --- tests/test_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index c6c91f5..6e98d46 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -9,8 +9,8 @@ from expressionatlas import ExpressionAtlasClient from expressionatlas.validation import is_valid_accession - @pytest.mark.integration +@pytest.mark.skip("takes too long") class TestExpressionAtlasClientIntegration: """Integration tests for ExpressionAtlasClient.""" From bfa85c8d4a9daf0d309fb85e8efb457857de6fce Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Thu, 26 Mar 2026 21:26:57 -0700 Subject: [PATCH 3/5] cleaning up imports --- src/expressionatlas/__init__.py | 3 --- src/expressionatlas/client.py | 2 +- src/expressionatlas/download.py | 3 +-- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/expressionatlas/__init__.py b/src/expressionatlas/__init__.py index ad588c0..abea65e 100644 --- a/src/expressionatlas/__init__.py +++ b/src/expressionatlas/__init__.py @@ -26,7 +26,6 @@ finally: del version, PackageNotFoundError - from expressionatlas.client import ExpressionAtlasClient from expressionatlas.download import ( get_atlas_data, @@ -41,5 +40,3 @@ InvalidAccessionError, ) from expressionatlas.models import SearchResult -from summarizedexperiment import SummarizedExperiment -from biocutils import NamedList diff --git a/src/expressionatlas/client.py b/src/expressionatlas/client.py index a37c612..b694239 100644 --- a/src/expressionatlas/client.py +++ b/src/expressionatlas/client.py @@ -6,11 +6,11 @@ from collections.abc import Sequence import pandas as pd +from biocutils import NamedList from expressionatlas.api import BioStudiesAPI from expressionatlas.download import get_atlas_data, get_atlas_experiment from expressionatlas.models import search_results_to_dataframe -from biocutils import NamedList from expressionatlas.validation import validate_accession logger = logging.getLogger(__name__) diff --git a/src/expressionatlas/download.py b/src/expressionatlas/download.py index 85b7d46..3c59885 100644 --- a/src/expressionatlas/download.py +++ b/src/expressionatlas/download.py @@ -19,9 +19,8 @@ import numpy as np import pandas as pd - -from biocutils import NamedList from biocframe import BiocFrame +from biocutils import NamedList from summarizedexperiment import SummarizedExperiment from expressionatlas.exceptions import DownloadError From c475c46048b382d47ee65f8904a8fe6dfcf65808 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Mon, 30 Mar 2026 11:05:25 -0700 Subject: [PATCH 4/5] replace pandas with biocframe --- README.md | 51 ++-- setup.cfg | 1 - src/expressionatlas/client.py | 30 ++- src/expressionatlas/converter.py | 410 ------------------------------- src/expressionatlas/download.py | 123 ++++++---- src/expressionatlas/models.py | 33 ++- tests/test_integration.py | 18 +- tests/test_models.py | 29 +-- 8 files changed, 167 insertions(+), 528 deletions(-) delete mode 100644 src/expressionatlas/converter.py diff --git a/README.md b/README.md index f2ef102..19bb2a7 100644 --- a/README.md +++ b/README.md @@ -37,10 +37,18 @@ results = client.search_experiments( properties=["cancer", "breast"], species="homo sapiens" ) -print(results.head()) -# Accession Species Type ... -# 0 E-MTAB-1624 homo sapiens microarray data ... +print(results) ``` + BiocFrame with 208 rows and 4 columns + Accession Species Type Title + + [0] E-MTAB-8198 Homo sapiens Cell line - High-thr... Functional effect of... + [1] E-MTAB-8532 Homo sapiens Human - One-color mi... DNA microarray studi... + [2] E-GEOD-43306 Homo sapiens RNA-seq of coding RNA Translating transcri... + ... ... ... ... + [205] E-MTAB-779 Homo sapiens transcription profil... OncomiRs like let-7 ... + [206] E-TABM-1118 Homo sapiens transcription profil... Transcrption profili... + [207] E-TABM-601 Homo sapiens transcription profil... Transcription profil... ### Download RNA-seq Data @@ -53,44 +61,35 @@ rnaseq = exp["rnaseq"] counts = rnaseq.assay("counts") # numpy array: genes × samples print(f"Shape: {counts.shape[0]} genes × {counts.shape[1]} samples") -# Shape: 58735 genes × 48 samples +# Shape: 58735 genes × 24 samples # Sample metadata (BiocFrame) sample_info = rnaseq.get_column_data() print(sample_info.get_column_names()) +# ['cell line', 'compound', 'developmental stage', 'disease', 'dose', 'genotype', 'organism', 'organism part'] # Gene annotations (BiocFrame) gene_info = rnaseq.get_row_data() print(gene_info.shape) -``` - -### Download Microarray Data - -```python -exp = client.get_experiment("E-MTAB-1624") - -# Microarray data is keyed by array design -array_design = "A-AFFY-126" -eset = exp[array_design] # This is also a SummarizedExperiment now +# (58735, 1) -# Expression matrix (probes × samples) -intensities = eset.assay("exprs") -print(intensities.shape) -# (54675, 96) - -# Sample metadata (BiocFrame) -sample_annotations = eset.get_column_data() -print(sample_annotations.shape) - -# Feature annotations (BiocFrame) -probe_annotations = eset.get_row_data() +print(rnaseq) ``` + class: SummarizedExperiment + dimensions: (58735, 24) + assays(1): ['counts'] + row_data columns(1): ['Gene Name'] + row_names(58735): ['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419', ..., 'ENSG00000285992', 'ENSG00000285993', 'ENSG00000285994'] + column_data columns(8): ['cell line', 'compound', 'developmental stage', 'disease', 'dose', 'genotype', 'organism', 'organism part'] + column_names(24): ['ERR3456453', 'ERR3456442', 'ERR3456443', ..., 'ERR3456450', 'ERR3456459', 'ERR3456444'] + metadata(2): accession source + ### Batch Downloads ```python # Download multiple experiments -accessions = results["Accession"].head(10).tolist() +accessions = results.get_column("Accession")[:10] experiments = client.get_experiments(accessions) # Access individual experiments diff --git a/setup.cfg b/setup.cfg index b889390..36c0dfc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,6 @@ python_requires = >=3.9 install_requires = importlib-metadata; python_version<"3.8" requests - pandas numpy biocframe summarizedexperiment diff --git a/src/expressionatlas/client.py b/src/expressionatlas/client.py index b694239..c4e4c87 100644 --- a/src/expressionatlas/client.py +++ b/src/expressionatlas/client.py @@ -5,12 +5,12 @@ import logging from collections.abc import Sequence -import pandas as pd +from biocframe import BiocFrame from biocutils import NamedList from expressionatlas.api import BioStudiesAPI from expressionatlas.download import get_atlas_data, get_atlas_experiment -from expressionatlas.models import search_results_to_dataframe +from expressionatlas.models import search_results_to_biocframe from expressionatlas.validation import validate_accession logger = logging.getLogger(__name__) @@ -74,7 +74,7 @@ def search_experiments( self, properties: str | Sequence[str], species: str | None = None, - ) -> pd.DataFrame: + ) -> BiocFrame: """ Search for Expression Atlas experiments matching given criteria. @@ -90,8 +90,8 @@ def search_experiments( Returns ------- - pandas.DataFrame - DataFrame with columns: Accession, Species, Type, Title. + BiocFrame + BiocFrame with columns: Accession, Species, Type, Title. Sorted by Species, Type, then Accession. Raises @@ -130,8 +130,8 @@ def search_experiments( results = self.api.search(properties=list(properties), species=species) - # Filter out connection errors and convert to DataFrame - df = search_results_to_dataframe(results) + # Filter out connection errors and convert to BiocFrame + df = search_results_to_biocframe(results) # Log warning if any connection errors occurred error_count = sum(1 for r in results if r.connection_error) @@ -232,16 +232,14 @@ def get_experiments( ... species="homo sapiens", ... ) >>> # Download all RNA-seq experiments from search results - >>> rnaseq_accessions = results[ - ... results[ - ... "Type" - ... ].str.contains( - ... "RNA-seq", - ... na=False, - ... ) - ... ]["Accession"] + >>> types = results.get_column("Type") + >>> accessions = results.get_column("Accession") + >>> rnaseq_accessions = [ + ... acc for acc, typ in zip(accessions, types) + ... if typ and "RNA-seq" in typ + ... ] >>> experiments = client.get_experiments( - ... rnaseq_accessions.tolist() + ... rnaseq_accessions ... ) >>> # Access: experiments["E-MTAB-XXXX"]["rnaseq"].assays["counts"] """ diff --git a/src/expressionatlas/converter.py b/src/expressionatlas/converter.py deleted file mode 100644 index fcad96a..0000000 --- a/src/expressionatlas/converter.py +++ /dev/null @@ -1,410 +0,0 @@ -"""Client for the Expression Atlas RData Converter AWS service.""" - -from __future__ import annotations - -import io -import json -import logging -import os -import tempfile -import zipfile -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any -from urllib.request import Request, urlopen - -import numpy as np -import pandas as pd - -logger = logging.getLogger(__name__) - - -class ConverterError(Exception): - """Error from the converter service.""" - - def __init__(self, message: str, status_code: int | None = None): - super().__init__(message) - self.status_code = status_code - - -@dataclass -class ConvertedBundle: - """Container for converted Expression Atlas data.""" - - # Expression matrix (genes x samples) - matrix: np.ndarray | None = None - - # Gene annotations - genes: pd.DataFrame = field(default_factory=pd.DataFrame) - - # Sample annotations - samples: pd.DataFrame = field(default_factory=pd.DataFrame) - - # Metadata from conversion - meta: dict[str, Any] = field(default_factory=dict) - - # Row names (gene IDs) - rownames: list[str] = field(default_factory=list) - - # Column names (sample IDs) - colnames: list[str] = field(default_factory=list) - - @property - def shape(self) -> tuple[int, int]: - """Return (n_genes, n_samples) shape.""" - if self.matrix is not None: - return self.matrix.shape - return (len(self.rownames), len(self.colnames)) - - -class ConverterClient: - """ - Client for the Expression Atlas RData Converter service (AWS). - - This client calls the AWS App Runner/ECS service to convert .RData files - to portable formats that Python can read without R. - - Parameters - ---------- - service_url : str, optional - URL of the converter service. Defaults to CONVERTER_URL env var. - use_iam_auth : bool - If True, use AWS IAM authentication (SigV4). - If False, use API key from CONVERTER_API_KEY env var. - cache_dir : Path, optional - Directory to cache downloaded bundles. Defaults to temp dir. - timeout : int - Request timeout in seconds. - - Examples - -------- - >>> client = ConverterClient( - ... use_iam_auth=False - ... ) - >>> bundle = client.convert_and_load( - ... "ftp://ftp.ebi.ac.uk/.../E-MTAB-7841-atlasExperimentSummary.Rdata", - ... "E-MTAB-7841", - ... ) - >>> print( - ... bundle.matrix.shape - ... ) - (58735, 48) - """ - - def __init__( - self, - service_url: str | None = None, - use_iam_auth: bool = False, - cache_dir: Path | None = None, - timeout: int = 600, - ): - self.service_url = service_url or os.environ.get("CONVERTER_URL", "") - self.use_iam_auth = use_iam_auth - self.cache_dir = cache_dir or Path(tempfile.gettempdir()) / "atlas_converter_cache" - self.timeout = timeout - - if not self.service_url: - logger.warning( - "CONVERTER_URL not set. Converter client will not work. " - "Set CONVERTER_URL environment variable or pass service_url parameter." - ) - - def _get_auth_headers(self) -> dict[str, str]: - """Get authentication headers for the request.""" - headers = {"Content-Type": "application/json"} - - if self.use_iam_auth: - # Use AWS SigV4 signing for IAM auth - try: - from botocore.auth import SigV4Auth - from botocore.awsrequest import AWSRequest - from botocore.session import Session - - session = Session() - credentials = session.get_credentials() - if credentials: - # Create a request to sign - aws_request = AWSRequest( - method="POST", - url=f"{self.service_url.rstrip('/')}/convert", - headers=headers, - ) - SigV4Auth(credentials, "execute-api", os.environ.get("AWS_REGION", "us-east-1")).add_auth( - aws_request - ) - headers.update(dict(aws_request.headers)) - except ImportError: - logger.warning("botocore not installed. Cannot use IAM auth. " "Install with: pip install botocore") - except Exception as e: - logger.warning(f"Failed to sign request: {e}") - else: - # Use API key - api_key = os.environ.get("CONVERTER_API_KEY", "") - if api_key: - headers["X-API-Key"] = api_key - - return headers - - def convert( - self, - rdata_url: str, - accession: str, - output_format: str = "mtx_bundle", - assay_name: str | None = None, - force: bool = False, - ) -> dict[str, Any]: - """ - Request conversion of an .RData file. - - Parameters - ---------- - rdata_url : str - URL to the .RData file. - accession : str - Experiment accession (e.g., E-MTAB-7841). - output_format : str - Output format (mtx_bundle or tsv_bundle). - assay_name : str, optional - Specific assay to extract. - force : bool - Force re-conversion even if cached. - - Returns - ------- - dict - Response from the converter service including signed_url. - """ - if not self.service_url: - raise ConverterError("CONVERTER_URL not configured") - - endpoint = f"{self.service_url.rstrip('/')}/convert" - - payload = { - "rdata_url": rdata_url, - "accession": accession, - "output_format": output_format, - "force": force, - } - if assay_name: - payload["assay_name"] = assay_name - - headers = self._get_auth_headers() - - logger.info(f"Requesting conversion for {accession}") - - try: - req = Request( - endpoint, - data=json.dumps(payload).encode("utf-8"), - headers=headers, - method="POST", - ) - with urlopen(req, timeout=self.timeout) as response: - result = json.loads(response.read().decode("utf-8")) - - if result.get("status") == "error": - raise ConverterError(result.get("detail", result.get("error", "Unknown error"))) - - logger.info(f"Conversion {'cache hit' if result.get('cache_hit') else 'complete'} " f"for {accession}") - return result - - except Exception as e: - if isinstance(e, ConverterError): - raise - raise ConverterError(f"Request failed: {e}") from e - - def download_bundle(self, signed_url: str, accession: str) -> Path: - """ - Download converted bundle from signed URL. - - Parameters - ---------- - signed_url : str - Signed URL from convert() response. - accession : str - Experiment accession (for cache path). - - Returns - ------- - Path - Path to extracted bundle directory. - """ - # Create cache directory - bundle_dir = self.cache_dir / accession - bundle_dir.mkdir(parents=True, exist_ok=True) - - logger.info(f"Downloading bundle for {accession}") - - try: - with urlopen(signed_url, timeout=self.timeout) as response: - zip_data = response.read() - - # Extract zip - with zipfile.ZipFile(io.BytesIO(zip_data)) as zf: - zf.extractall(bundle_dir) - - logger.info(f"Bundle extracted to {bundle_dir}") - return bundle_dir - - except Exception as e: - raise ConverterError(f"Failed to download bundle: {e}") from e - - def load_bundle(self, bundle_dir: Path) -> dict[str, ConvertedBundle]: - """ - Load converted data from bundle directory. - - Parameters - ---------- - bundle_dir : Path - Path to extracted bundle directory. - - Returns - ------- - dict[str, ConvertedBundle] - Dict mapping dataset name to ConvertedBundle. - """ - results = {} - - # Find all dataset directories - for item in bundle_dir.iterdir(): - if item.is_dir() and item.name.startswith("dataset_"): - dataset_name = item.name.replace("dataset_", "") - results[dataset_name] = self._load_dataset(item) - - # Load metadata - meta_path = bundle_dir / "meta.json" - if meta_path.exists(): - with open(meta_path) as f: - meta = json.load(f) - # Attach to each bundle - for bundle in results.values(): - bundle.meta = meta - - return results - - def _load_dataset(self, dataset_dir: Path) -> ConvertedBundle: - """Load a single dataset from directory.""" - bundle = ConvertedBundle() - - # Load matrix - mtx_path = dataset_dir / "matrix.mtx" - tsv_path = dataset_dir / "counts.tsv.gz" - - if mtx_path.exists(): - bundle.matrix = self._load_mtx(mtx_path) - elif tsv_path.exists(): - df = pd.read_csv(tsv_path, sep="\t", index_col=0, compression="gzip") - bundle.matrix = df.values - bundle.rownames = df.index.tolist() - bundle.colnames = df.columns.tolist() - - # Load row/column names from separate files if MTX format - barcodes_path = dataset_dir / "barcodes.tsv" - features_path = dataset_dir / "features.tsv" - - if barcodes_path.exists(): - bundle.colnames = pd.read_csv(barcodes_path, header=None)[0].tolist() - if features_path.exists(): - bundle.rownames = pd.read_csv(features_path, header=None)[0].tolist() - - # Load genes (rowData) - genes_path = dataset_dir / "genes.csv" - if genes_path.exists(): - bundle.genes = pd.read_csv(genes_path, index_col=0) - if not bundle.rownames: - bundle.rownames = bundle.genes.index.tolist() - - # Load samples (colData) - samples_path = dataset_dir / "samples.csv" - if samples_path.exists(): - bundle.samples = pd.read_csv(samples_path, index_col=0) - if not bundle.colnames: - bundle.colnames = bundle.samples.index.tolist() - - return bundle - - def _load_mtx(self, mtx_path: Path) -> np.ndarray: - """Load Matrix Market file.""" - try: - from scipy.io import mmread - - sparse_matrix = mmread(str(mtx_path)) - return sparse_matrix.toarray() - except ImportError: - logger.warning("scipy not installed, cannot load MTX files efficiently") - # Fallback: manual parsing (slow) - return self._parse_mtx_manual(mtx_path) - - def _parse_mtx_manual(self, mtx_path: Path) -> np.ndarray: - """Parse MTX file manually (fallback if scipy not available).""" - with open(mtx_path) as f: - # Skip comments - line = f.readline() - while line.startswith("%"): - line = f.readline() - - # Read dimensions - parts = line.strip().split() - nrows, ncols, _ = int(parts[0]), int(parts[1]), int(parts[2]) - - # Create dense matrix - matrix = np.zeros((nrows, ncols)) - - # Read entries - for line in f: - parts = line.strip().split() - if len(parts) >= 3: - i, j, val = int(parts[0]) - 1, int(parts[1]) - 1, float(parts[2]) - matrix[i, j] = val - - return matrix - - def convert_and_load( - self, - rdata_url: str, - accession: str, - output_format: str = "mtx_bundle", - assay_name: str | None = None, - force: bool = False, - ) -> dict[str, ConvertedBundle]: - """ - Convert .RData and load the result in one call. - - Parameters - ---------- - rdata_url : str - URL to the .RData file. - accession : str - Experiment accession. - output_format : str - Output format. - assay_name : str, optional - Specific assay to extract. - force : bool - Force re-conversion. - - Returns - ------- - dict[str, ConvertedBundle] - Dict mapping dataset name to ConvertedBundle. - """ - # Check local cache first - bundle_dir = self.cache_dir / accession - meta_path = bundle_dir / "meta.json" - - if not force and meta_path.exists(): - logger.info(f"Loading from local cache: {bundle_dir}") - return self.load_bundle(bundle_dir) - - # Request conversion - result = self.convert(rdata_url, accession, output_format, assay_name, force) - - # Download and extract - bundle_dir = self.download_bundle(result["signed_url"], accession) - - # Load and return - return self.load_bundle(bundle_dir) - - def is_configured(self) -> bool: - """Check if the converter client is properly configured.""" - return bool(self.service_url) diff --git a/src/expressionatlas/download.py b/src/expressionatlas/download.py index 3c59885..1a1c18e 100644 --- a/src/expressionatlas/download.py +++ b/src/expressionatlas/download.py @@ -10,15 +10,16 @@ from __future__ import annotations +import csv import io import logging import tempfile from pathlib import Path from urllib.error import URLError from urllib.request import urlopen +from typing import TypedDict, Dict, List, Optional, Any import numpy as np -import pandas as pd from biocframe import BiocFrame from biocutils import NamedList from summarizedexperiment import SummarizedExperiment @@ -216,15 +217,25 @@ def _download_tsv_fallback(accession: str) -> NamedList: raise DownloadError(accession, "No TSV or RDS data files found.") -def _download_tsv(url: str) -> pd.DataFrame: - """Download and parse a TSV file from URL.""" +def _download_tsv(url: str) -> dict[str, list[str]]: + """Download and parse a TSV file from URL into a column-oriented dictionary.""" logger.debug(f"Downloading: {url}") with urlopen(url, timeout=60) as response: content = response.read().decode("utf-8") - return pd.read_csv(io.StringIO(content), sep="\t") - - -def _try_download_sdrf(url: str) -> pd.DataFrame | None: + + reader = csv.reader(io.StringIO(content), delimiter="\t") + header = next(reader) + data = {h: [] for h in header} + + for row in reader: + for i, h in enumerate(header): + val = row[i] if i < len(row) else None + data[h].append(val) + + return data + + +def _try_download_sdrf(url: str) -> dict[str, dict[str, str]] | None: try: logger.debug(f"Downloading sample annotations: {url}") with urlopen(url, timeout=60) as response: @@ -240,71 +251,100 @@ def _try_download_sdrf(url: str) -> pd.DataFrame | None: else: sample_idx, attr_idx, value_idx = 1, 3, 4 - records: list[tuple[str, str, str]] = [] + records = {} for line in lines: parts = line.split("\t") if len(parts) > max(sample_idx, attr_idx, value_idx): sample_id = parts[sample_idx] attr_name = parts[attr_idx] attr_value = parts[value_idx] - records.append((sample_id, attr_name, attr_value)) - - if not records: - return None - - df = pd.DataFrame(records, columns=["sample_id", "attribute", "value"]) - - result = df.pivot_table( - index="sample_id", - columns="attribute", - values="value", - aggfunc="first", - ) - - result.columns.name = None - result.index.name = "sample_id" - - return result + + if sample_id not in records: + records[sample_id] = {} + + if attr_name not in records[sample_id]: + records[sample_id][attr_name] = attr_value + + return records except Exception as e: logger.debug(f"Could not download sample annotations: {e}") return None def _create_summarized_experiment_from_tsv( - df_data: pd.DataFrame, design_df: pd.DataFrame | None, accession: str, assay_name: str = "counts" + df_data: dict[str, list[str]], design_data: dict[str, dict[str, str]] | None, accession: str, assay_name: str = "counts" ) -> SummarizedExperiment: """Create SummarizedExperiment from TSV data.""" - if df_data.empty: + if not df_data: return SummarizedExperiment() - numeric_cols = df_data.select_dtypes(include=[np.number]).columns.tolist() - annotation_cols = [c for c in df_data.columns if c not in numeric_cols] + all_cols = list(df_data.keys()) + if not all_cols: + return SummarizedExperiment() + + numeric_cols = [] + annotation_cols = [] + + for col in all_cols: + vals = df_data[col] + is_num = True + for v in vals: + if v is not None and v.strip() != "" and v.strip().lower() != "na": + try: + float(v) + except ValueError: + is_num = False + break + if is_num: + numeric_cols.append(col) + else: + annotation_cols.append(col) if not numeric_cols: logger.warning("No numeric columns found in TSV") return SummarizedExperiment() - gene_col = annotation_cols[0] if annotation_cols else df_data.columns[0] + gene_col = annotation_cols[0] if annotation_cols else all_cols[0] sample_cols = numeric_cols - rownames = df_data[gene_col].tolist() + rownames = df_data[gene_col] colnames = sample_cols - assays = {assay_name: df_data[sample_cols].values.astype(np.float64)} + matrix_data = [] + for c in sample_cols: + col_float = [] + for v in df_data[c]: + if v is None or v.strip() == "" or v.strip().lower() == "na": + col_float.append(np.nan) + else: + col_float.append(float(v)) + matrix_data.append(col_float) + + matrix = np.array(matrix_data, dtype=np.float64).T + assays = {assay_name: matrix} row_data = {} for col in annotation_cols: if col != gene_col: - row_data[col] = df_data[col].values.tolist() + row_data[col] = df_data[col] row_bioc = BiocFrame(row_data, row_names=rownames) col_data = {} - if design_df is not None and not design_df.empty: - reindexed_df = design_df.reindex(colnames) - for col in reindexed_df.columns: - col_data[col] = reindexed_df[col].values.tolist() - + if design_data is not None and len(design_data) > 0: + all_attrs = set() + for s in colnames: + if s in design_data: + all_attrs.update(design_data[s].keys()) + + all_attrs = sorted(list(all_attrs)) + + for attr in all_attrs: + col_data[attr] = [] + for s in colnames: + val = design_data.get(s, {}).get(attr, None) + col_data[attr].append(val) + col_bioc = BiocFrame(col_data, row_names=colnames) metadata = {"accession": accession, "source": "tsv"} @@ -328,8 +368,9 @@ def _download_via_converter(rdata_url: str, accession: str) -> NamedList: for name, bundle in bundles.items(): key = name.replace("dataset_", "") if name.startswith("dataset_") else name - row_bioc = BiocFrame(bundle.genes.to_dict("list"), row_names=bundle.rownames) - col_bioc = BiocFrame(bundle.samples.to_dict("list"), row_names=bundle.colnames) + # We need to make sure bundle.genes and bundle.samples return dictionaries of column names mapping to lists of values + row_bioc = BiocFrame(bundle.genes, row_names=bundle.rownames) + col_bioc = BiocFrame(bundle.samples, row_names=bundle.colnames) assays = {} if bundle.matrix is not None: diff --git a/src/expressionatlas/models.py b/src/expressionatlas/models.py index 74ccf84..b0c5d73 100644 --- a/src/expressionatlas/models.py +++ b/src/expressionatlas/models.py @@ -6,7 +6,7 @@ from enum import Enum from typing import Any -import pandas as pd +from biocframe import BiocFrame class ExperimentType(str, Enum): @@ -58,16 +58,27 @@ def to_dict(self) -> dict[str, Any]: } -def search_results_to_dataframe(results: list[SearchResult]) -> pd.DataFrame: - """Convert list of SearchResult objects to a pandas DataFrame.""" +def search_results_to_biocframe(results: list[SearchResult]) -> BiocFrame: + """Convert list of SearchResult objects to a BiocFrame.""" + columns = ["Accession", "Species", "Type", "Title"] if not results: - return pd.DataFrame(columns=["Accession", "Species", "Type", "Title"]) - - data = [r.to_dict() for r in results if not r.connection_error] - df = pd.DataFrame(data) + return BiocFrame({col: [] for col in columns}, column_names=columns) + valid_results = [r for r in results if not r.connection_error] + # Sort by Species, Type, then Accession (matching R package behavior) - if not df.empty: - df = df.sort_values(["Species", "Type", "Accession"]).reset_index(drop=True) - - return df + valid_results.sort( + key=lambda r: ( + r.species if r.species is not None else "", + r.experiment_type if r.experiment_type is not None else "", + r.accession, + ) + ) + + data = {col: [] for col in columns} + for r in valid_results: + d = r.to_dict() + for col in columns: + data[col].append(d[col]) + + return BiocFrame(data, column_names=columns) diff --git a/tests/test_integration.py b/tests/test_integration.py index 6e98d46..4d8cc60 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -10,7 +10,7 @@ from expressionatlas.validation import is_valid_accession @pytest.mark.integration -@pytest.mark.skip("takes too long") +# @pytest.mark.skip("takes too long") class TestExpressionAtlasClientIntegration: """Integration tests for ExpressionAtlasClient.""" @@ -19,14 +19,14 @@ def test_search_cancer_human(self) -> None: client = ExpressionAtlasClient() results = client.search_experiments(properties=["cancer"], species="homo sapiens") - assert len(results) > 0 - assert "Accession" in results.columns - assert "Species" in results.columns - assert "Type" in results.columns - assert "Title" in results.columns + assert results.shape[0] > 0 + columns = results.get_column_names() + assert "Accession" in columns + assert "Species" in columns + assert "Type" in columns + assert "Title" in columns - # All accessions should be valid - for acc in results["Accession"]: + for acc in results.get_column("Accession"): assert is_valid_accession(acc) def test_search_salt_oryza(self) -> None: @@ -34,7 +34,7 @@ def test_search_salt_oryza(self) -> None: client = ExpressionAtlasClient() results = client.search_experiments(properties=["salt"], species="oryza sativa") - assert len(results) > 0 + assert results.shape[0] > 0 def test_download_single_experiment(self) -> None: """Download a single experiment should succeed.""" diff --git a/tests/test_models.py b/tests/test_models.py index bffceb3..e213fec 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -4,7 +4,7 @@ from expressionatlas.models import ( ExperimentType, SearchResult, - search_results_to_dataframe, + search_results_to_biocframe, ) @@ -65,14 +65,14 @@ def test_default_connection_error(self) -> None: assert result.connection_error is False -class TestSearchResultsToDataframe: - """Tests for search_results_to_dataframe function.""" +class TestSearchResultsToBiocframe: + """Tests for search_results_to_biocframe function.""" def test_empty_list(self) -> None: - """Empty list should return empty DataFrame with correct columns.""" - df = search_results_to_dataframe([]) - assert list(df.columns) == ["Accession", "Species", "Type", "Title"] - assert len(df) == 0 + """Empty list should return empty BiocFrame with correct columns.""" + bf = search_results_to_biocframe([]) + assert list(bf.get_column_names()) == ["Accession", "Species", "Type", "Title"] + assert bf.shape[0] == 0 def test_filters_connection_errors(self) -> None: """Should exclude results with connection errors.""" @@ -80,9 +80,9 @@ def test_filters_connection_errors(self) -> None: SearchResult("E-MTAB-1624", "Human", "RNA-seq", "Test 1"), SearchResult("E-MTAB-1625", None, None, None, connection_error=True), ] - df = search_results_to_dataframe(results) - assert len(df) == 1 - assert df.iloc[0]["Accession"] == "E-MTAB-1624" + bf = search_results_to_biocframe(results) + assert bf.shape[0] == 1 + assert bf.get_column("Accession")[0] == "E-MTAB-1624" def test_sorts_by_species_type_accession(self) -> None: """Should sort by Species, Type, then Accession.""" @@ -91,8 +91,9 @@ def test_sorts_by_species_type_accession(self) -> None: SearchResult("E-MTAB-1", "Human", "Array", "Test 1"), SearchResult("E-MTAB-3", "Human", "RNA-seq", "Test 3"), ] - df = search_results_to_dataframe(results) + bf = search_results_to_biocframe(results) # Human Array, Human RNA-seq, Zebra RNA-seq - assert df.iloc[0]["Accession"] == "E-MTAB-1" - assert df.iloc[1]["Accession"] == "E-MTAB-3" - assert df.iloc[2]["Accession"] == "E-MTAB-2" + ids = bf.get_column("Accession") + assert ids[0] == "E-MTAB-1" + assert ids[1] == "E-MTAB-3" + assert ids[2] == "E-MTAB-2" From eb46703be3fba547b418778a7f0a3636217d08c6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 30 Mar 2026 18:05:37 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/expressionatlas/client.py | 18 ++++++++++++++---- src/expressionatlas/download.py | 26 ++++++++++++++------------ src/expressionatlas/models.py | 2 +- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/expressionatlas/client.py b/src/expressionatlas/client.py index c4e4c87..9a8bcb4 100644 --- a/src/expressionatlas/client.py +++ b/src/expressionatlas/client.py @@ -232,11 +232,21 @@ def get_experiments( ... species="homo sapiens", ... ) >>> # Download all RNA-seq experiments from search results - >>> types = results.get_column("Type") - >>> accessions = results.get_column("Accession") + >>> types = results.get_column( + ... "Type" + ... ) + >>> accessions = results.get_column( + ... "Accession" + ... ) >>> rnaseq_accessions = [ - ... acc for acc, typ in zip(accessions, types) - ... if typ and "RNA-seq" in typ + ... acc + ... for acc, typ in zip( + ... accessions, + ... types, + ... ) + ... if typ + ... and "RNA-seq" + ... in typ ... ] >>> experiments = client.get_experiments( ... rnaseq_accessions diff --git a/src/expressionatlas/download.py b/src/expressionatlas/download.py index 1a1c18e..e57a581 100644 --- a/src/expressionatlas/download.py +++ b/src/expressionatlas/download.py @@ -17,7 +17,6 @@ from pathlib import Path from urllib.error import URLError from urllib.request import urlopen -from typing import TypedDict, Dict, List, Optional, Any import numpy as np from biocframe import BiocFrame @@ -222,16 +221,16 @@ def _download_tsv(url: str) -> dict[str, list[str]]: logger.debug(f"Downloading: {url}") with urlopen(url, timeout=60) as response: content = response.read().decode("utf-8") - + reader = csv.reader(io.StringIO(content), delimiter="\t") header = next(reader) data = {h: [] for h in header} - + for row in reader: for i, h in enumerate(header): val = row[i] if i < len(row) else None data[h].append(val) - + return data @@ -258,10 +257,10 @@ def _try_download_sdrf(url: str) -> dict[str, dict[str, str]] | None: sample_id = parts[sample_idx] attr_name = parts[attr_idx] attr_value = parts[value_idx] - + if sample_id not in records: records[sample_id] = {} - + if attr_name not in records[sample_id]: records[sample_id][attr_name] = attr_value @@ -272,7 +271,10 @@ def _try_download_sdrf(url: str) -> dict[str, dict[str, str]] | None: def _create_summarized_experiment_from_tsv( - df_data: dict[str, list[str]], design_data: dict[str, dict[str, str]] | None, accession: str, assay_name: str = "counts" + df_data: dict[str, list[str]], + design_data: dict[str, dict[str, str]] | None, + accession: str, + assay_name: str = "counts", ) -> SummarizedExperiment: """Create SummarizedExperiment from TSV data.""" if not df_data: @@ -284,7 +286,7 @@ def _create_summarized_experiment_from_tsv( numeric_cols = [] annotation_cols = [] - + for col in all_cols: vals = df_data[col] is_num = True @@ -319,7 +321,7 @@ def _create_summarized_experiment_from_tsv( else: col_float.append(float(v)) matrix_data.append(col_float) - + matrix = np.array(matrix_data, dtype=np.float64).T assays = {assay_name: matrix} @@ -336,15 +338,15 @@ def _create_summarized_experiment_from_tsv( for s in colnames: if s in design_data: all_attrs.update(design_data[s].keys()) - + all_attrs = sorted(list(all_attrs)) - + for attr in all_attrs: col_data[attr] = [] for s in colnames: val = design_data.get(s, {}).get(attr, None) col_data[attr].append(val) - + col_bioc = BiocFrame(col_data, row_names=colnames) metadata = {"accession": accession, "source": "tsv"} diff --git a/src/expressionatlas/models.py b/src/expressionatlas/models.py index b0c5d73..5f937ab 100644 --- a/src/expressionatlas/models.py +++ b/src/expressionatlas/models.py @@ -65,7 +65,7 @@ def search_results_to_biocframe(results: list[SearchResult]) -> BiocFrame: return BiocFrame({col: [] for col in columns}, column_names=columns) valid_results = [r for r in results if not r.connection_error] - + # Sort by Species, Type, then Accession (matching R package behavior) valid_results.sort( key=lambda r: (