From 25ab506fd093c831183f474ff4ed6deb5a6e9784 Mon Sep 17 00:00:00 2001 From: Ken Brewer Date: Wed, 29 Apr 2026 12:23:20 -0400 Subject: [PATCH 1/3] Fix multi-source convert path collision (#442) Sibling sources whose parent directories share a name (e.g. analyses/{1,2,3}/analysis) wrote to identical intermediate parquet paths, causing ArrowInvalid (race) or FileNotFoundError (overwrite). Add a stable SHA-1 hash of the source's parent path to the intermediate directory, and extend cleanup in _concat_source_group to remove the new dir and its parent. Co-Authored-By: Claude Opus 4.7 --- cytotable/convert.py | 26 +++++++++------ tests/test_convert_threaded.py | 59 ++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 9 deletions(-) diff --git a/cytotable/convert.py b/cytotable/convert.py index 57e940a8..a5bfcc81 100644 --- a/cytotable/convert.py +++ b/cytotable/convert.py @@ -412,6 +412,7 @@ def _source_pageset_to_parquet( A string of the output filepath. """ + import hashlib import pathlib import duckdb @@ -426,10 +427,15 @@ def _source_pageset_to_parquet( source_type = str(source["source_path"].suffix).lower() - # attempt to build dest_path + # hash of parent path discriminates sources whose parent dirs share a name + # (e.g. analyses/{1,2}/analysis) — see cytomining/CytoTable#442 + source_parent_hash = hashlib.sha1( + str(source["source_path"].parent).encode("utf-8") + ).hexdigest()[:12] source_dest_path = ( f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/" - f"{str(source['source_path'].parent.name).lower()}" + f"{str(source['source_path'].parent.name).lower()}/" + f"{source_parent_hash}" ) pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True) @@ -825,13 +831,15 @@ def _concat_source_group( # remove the file which was written in the concatted parquet file (we no longer need it) pathlib.Path(table).unlink() - # attempt to clean up dir containing original table(s) only if it's empty - try: - pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir() - except OSError as os_err: - # raise only if we don't have a dir not empty errno - if os_err.errno != errno.ENOTEMPTY: - raise + # clean up the per-source hash dir and its parent if empty + chunk_parent = pathlib.Path(source["table"][0]).parent + for cleanup_dir in (chunk_parent, chunk_parent.parent): + try: + cleanup_dir.rmdir() + except OSError as os_err: + if os_err.errno != errno.ENOTEMPTY: + raise + break # return the concatted parquet filename concatted[0]["table"] = [destination_path] diff --git a/tests/test_convert_threaded.py b/tests/test_convert_threaded.py index 938b5780..7a5409b6 100644 --- a/tests/test_convert_threaded.py +++ b/tests/test_convert_threaded.py @@ -6,6 +6,7 @@ import pathlib +import shutil from typing import List import anndata as ad @@ -470,3 +471,61 @@ def test_convert_nested_dirs(fx_tempdir: pathlib.Path): table = parquet.read_table(source=result) assert table.shape == (397, 6049) + + +def test_convert_multi_source_colliding_parent_dir_names( + load_parsl_threaded: None, + fx_tempdir: str, + data_dir_cellprofiler: str, +): + """ + Regression test for cytomining/CytoTable#442: multi-source convert with + identical parent dir names (analyses/{1,2,3}/analysis) used to collide on + intermediate parquet paths. + """ + + src_root = pathlib.Path(fx_tempdir) / "analyses" + for site in ("1", "2", "3"): + site_dir = src_root / site / "analysis" + site_dir.mkdir(parents=True, exist_ok=True) + for table_name in ("Cells.csv", "Cytoplasm.csv", "Nuclei.csv", "Image.csv"): + shutil.copy( + f"{data_dir_cellprofiler}/ExampleHuman/{table_name}", + site_dir / table_name, + ) + + result = convert( + source_path=str(src_root), + dest_path=f"{fx_tempdir}/multi_site.parquet", + dest_datatype="parquet", + preset="cellprofiler_csv", + join=False, + ) + + assert set(result.keys()) == { + "Cells.csv", + "Cytoplasm.csv", + "Nuclei.csv", + "Image.csv", + } + + single_site_result = convert( + source_path=f"{data_dir_cellprofiler}/ExampleHuman", + dest_path=f"{fx_tempdir}/single_site.parquet", + dest_datatype="parquet", + source_datatype="csv", + preset="cellprofiler_csv", + join=False, + ) + + for compartment in ("Cells.csv", "Cytoplasm.csv", "Nuclei.csv"): + multi_rows = parquet.read_table( + source=result[compartment][0]["table"][0] + ).num_rows + single_rows = parquet.read_table( + source=single_site_result[compartment][0]["table"][0] + ).num_rows + assert multi_rows == 3 * single_rows, ( + f"{compartment}: expected 3x single-site rows ({3 * single_rows})," + f" got {multi_rows}" + ) From 6e5aa4e9e63b69a341752493319915e90ac1b4dd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci-lite[bot]" <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com> Date: Wed, 29 Apr 2026 16:25:25 +0000 Subject: [PATCH 2/3] [pre-commit.ci lite] apply automatic fixes --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5e9dfb43..9a86806e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -63,7 +63,7 @@ repos: hooks: - id: vulture - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.20.1 + rev: v1.20.2 hooks: - id: mypy exclude: > From 91d1d7545ed3a9d1d09b4da29a3fd59500bca15b Mon Sep 17 00:00:00 2001 From: Ken Brewer Date: Wed, 29 Apr 2026 12:54:25 -0400 Subject: [PATCH 3/3] Fix pre-commit failures from #442 - Mark sha1 path-discriminator hash as usedforsecurity=False (bandit B324). - Narrow convert() return type with isinstance(dict) asserts in the new multi-source regression test so it passes mypy (test_convert.py is mypy-excluded; test_convert_threaded.py is not). Co-Authored-By: Claude Opus 4.7 --- cytotable/convert.py | 3 ++- tests/test_convert_threaded.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cytotable/convert.py b/cytotable/convert.py index a5bfcc81..5283a231 100644 --- a/cytotable/convert.py +++ b/cytotable/convert.py @@ -430,7 +430,8 @@ def _source_pageset_to_parquet( # hash of parent path discriminates sources whose parent dirs share a name # (e.g. analyses/{1,2}/analysis) — see cytomining/CytoTable#442 source_parent_hash = hashlib.sha1( - str(source["source_path"].parent).encode("utf-8") + str(source["source_path"].parent).encode("utf-8"), + usedforsecurity=False, ).hexdigest()[:12] source_dest_path = ( f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/" diff --git a/tests/test_convert_threaded.py b/tests/test_convert_threaded.py index 7a5409b6..8ff935a7 100644 --- a/tests/test_convert_threaded.py +++ b/tests/test_convert_threaded.py @@ -501,6 +501,7 @@ def test_convert_multi_source_colliding_parent_dir_names( preset="cellprofiler_csv", join=False, ) + assert isinstance(result, dict) assert set(result.keys()) == { "Cells.csv", @@ -517,6 +518,7 @@ def test_convert_multi_source_colliding_parent_dir_names( preset="cellprofiler_csv", join=False, ) + assert isinstance(single_site_result, dict) for compartment in ("Cells.csv", "Cytoplasm.csv", "Nuclei.csv"): multi_rows = parquet.read_table(