From 25ab506fd093c831183f474ff4ed6deb5a6e9784 Mon Sep 17 00:00:00 2001
From: Ken Brewer <kenibrewer@users.noreply.github.com>
Date: Wed, 29 Apr 2026 12:23:20 -0400
Subject: [PATCH 1/3] Fix multi-source convert path collision (#442)

Sibling sources whose parent directories share a name (e.g.
analyses/{1,2,3}/analysis) wrote to identical intermediate parquet paths,
causing ArrowInvalid (race) or FileNotFoundError (overwrite). Add a stable
SHA-1 hash of the source's parent path to the intermediate directory, and
extend cleanup in _concat_source_group to remove the new dir and its parent.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 cytotable/convert.py           | 26 +++++++++------
 tests/test_convert_threaded.py | 59 ++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 9 deletions(-)

diff --git a/cytotable/convert.py b/cytotable/convert.py
index 57e940a8..a5bfcc81 100644
--- a/cytotable/convert.py
+++ b/cytotable/convert.py
@@ -412,6 +412,7 @@ def _source_pageset_to_parquet(
             A string of the output filepath.
     """
 
+    import hashlib
     import pathlib
 
     import duckdb
@@ -426,10 +427,15 @@ def _source_pageset_to_parquet(
 
     source_type = str(source["source_path"].suffix).lower()
 
-    # attempt to build dest_path
+    # hash of parent path discriminates sources whose parent dirs share a name
+    # (e.g. analyses/{1,2}/analysis) — see cytomining/CytoTable#442
+    source_parent_hash = hashlib.sha1(
+        str(source["source_path"].parent).encode("utf-8")
+    ).hexdigest()[:12]
     source_dest_path = (
         f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
-        f"{str(source['source_path'].parent.name).lower()}"
+        f"{str(source['source_path'].parent.name).lower()}/"
+        f"{source_parent_hash}"
     )
     pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
 
@@ -825,13 +831,15 @@ def _concat_source_group(
                 # remove the file which was written in the concatted parquet file (we no longer need it)
                 pathlib.Path(table).unlink()
 
-            # attempt to clean up dir containing original table(s) only if it's empty
-            try:
-                pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir()
-            except OSError as os_err:
-                # raise only if we don't have a dir not empty errno
-                if os_err.errno != errno.ENOTEMPTY:
-                    raise
+            # clean up the per-source hash dir and its parent if empty
+            chunk_parent = pathlib.Path(source["table"][0]).parent
+            for cleanup_dir in (chunk_parent, chunk_parent.parent):
+                try:
+                    cleanup_dir.rmdir()
+                except OSError as os_err:
+                    if os_err.errno != errno.ENOTEMPTY:
+                        raise
+                    break
 
     # return the concatted parquet filename
     concatted[0]["table"] = [destination_path]
diff --git a/tests/test_convert_threaded.py b/tests/test_convert_threaded.py
index 938b5780..7a5409b6 100644
--- a/tests/test_convert_threaded.py
+++ b/tests/test_convert_threaded.py
@@ -6,6 +6,7 @@
 
 
 import pathlib
+import shutil
 from typing import List
 
 import anndata as ad
@@ -470,3 +471,61 @@ def test_convert_nested_dirs(fx_tempdir: pathlib.Path):
 
     table = parquet.read_table(source=result)
     assert table.shape == (397, 6049)
+
+
+def test_convert_multi_source_colliding_parent_dir_names(
+    load_parsl_threaded: None,
+    fx_tempdir: str,
+    data_dir_cellprofiler: str,
+):
+    """
+    Regression test for cytomining/CytoTable#442: multi-source convert with
+    identical parent dir names (analyses/{1,2,3}/analysis) used to collide on
+    intermediate parquet paths.
+    """
+
+    src_root = pathlib.Path(fx_tempdir) / "analyses"
+    for site in ("1", "2", "3"):
+        site_dir = src_root / site / "analysis"
+        site_dir.mkdir(parents=True, exist_ok=True)
+        for table_name in ("Cells.csv", "Cytoplasm.csv", "Nuclei.csv", "Image.csv"):
+            shutil.copy(
+                f"{data_dir_cellprofiler}/ExampleHuman/{table_name}",
+                site_dir / table_name,
+            )
+
+    result = convert(
+        source_path=str(src_root),
+        dest_path=f"{fx_tempdir}/multi_site.parquet",
+        dest_datatype="parquet",
+        preset="cellprofiler_csv",
+        join=False,
+    )
+
+    assert set(result.keys()) == {
+        "Cells.csv",
+        "Cytoplasm.csv",
+        "Nuclei.csv",
+        "Image.csv",
+    }
+
+    single_site_result = convert(
+        source_path=f"{data_dir_cellprofiler}/ExampleHuman",
+        dest_path=f"{fx_tempdir}/single_site.parquet",
+        dest_datatype="parquet",
+        source_datatype="csv",
+        preset="cellprofiler_csv",
+        join=False,
+    )
+
+    for compartment in ("Cells.csv", "Cytoplasm.csv", "Nuclei.csv"):
+        multi_rows = parquet.read_table(
+            source=result[compartment][0]["table"][0]
+        ).num_rows
+        single_rows = parquet.read_table(
+            source=single_site_result[compartment][0]["table"][0]
+        ).num_rows
+        assert multi_rows == 3 * single_rows, (
+            f"{compartment}: expected 3x single-site rows ({3 * single_rows}),"
+            f" got {multi_rows}"
+        )

From 6e5aa4e9e63b69a341752493319915e90ac1b4dd Mon Sep 17 00:00:00 2001
From: "pre-commit-ci-lite[bot]"
 <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Date: Wed, 29 Apr 2026 16:25:25 +0000
Subject: [PATCH 2/3] [pre-commit.ci lite] apply automatic fixes

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5e9dfb43..9a86806e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -63,7 +63,7 @@ repos:
     hooks:
     -   id: vulture
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.20.1
+    rev: v1.20.2
     hooks:
     -   id: mypy
         exclude: >

From 91d1d7545ed3a9d1d09b4da29a3fd59500bca15b Mon Sep 17 00:00:00 2001
From: Ken Brewer <kenibrewer@users.noreply.github.com>
Date: Wed, 29 Apr 2026 12:54:25 -0400
Subject: [PATCH 3/3] Fix pre-commit failures from #442

- Mark sha1 path-discriminator hash as usedforsecurity=False (bandit B324).
- Narrow convert() return type with isinstance(dict) asserts in the new
  multi-source regression test so it passes mypy (test_convert.py is
  mypy-excluded; test_convert_threaded.py is not).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 cytotable/convert.py           | 3 ++-
 tests/test_convert_threaded.py | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/cytotable/convert.py b/cytotable/convert.py
index a5bfcc81..5283a231 100644
--- a/cytotable/convert.py
+++ b/cytotable/convert.py
@@ -430,7 +430,8 @@ def _source_pageset_to_parquet(
     # hash of parent path discriminates sources whose parent dirs share a name
     # (e.g. analyses/{1,2}/analysis) — see cytomining/CytoTable#442
     source_parent_hash = hashlib.sha1(
-        str(source["source_path"].parent).encode("utf-8")
+        str(source["source_path"].parent).encode("utf-8"),
+        usedforsecurity=False,
     ).hexdigest()[:12]
     source_dest_path = (
         f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
diff --git a/tests/test_convert_threaded.py b/tests/test_convert_threaded.py
index 7a5409b6..8ff935a7 100644
--- a/tests/test_convert_threaded.py
+++ b/tests/test_convert_threaded.py
@@ -501,6 +501,7 @@ def test_convert_multi_source_colliding_parent_dir_names(
         preset="cellprofiler_csv",
         join=False,
     )
+    assert isinstance(result, dict)
 
     assert set(result.keys()) == {
         "Cells.csv",
@@ -517,6 +518,7 @@ def test_convert_multi_source_colliding_parent_dir_names(
         preset="cellprofiler_csv",
         join=False,
     )
+    assert isinstance(single_site_result, dict)
 
     for compartment in ("Cells.csv", "Cytoplasm.csv", "Nuclei.csv"):
         multi_rows = parquet.read_table(