-
Notifications
You must be signed in to change notification settings - Fork 6
Fix multi-source convert path collision (#442) #444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -412,6 +412,7 @@ def _source_pageset_to_parquet( | |
| A string of the output filepath. | ||
| """ | ||
|
|
||
| import hashlib | ||
| import pathlib | ||
|
|
||
| import duckdb | ||
|
|
@@ -426,10 +427,16 @@ def _source_pageset_to_parquet( | |
|
|
||
| source_type = str(source["source_path"].suffix).lower() | ||
|
|
||
| # attempt to build dest_path | ||
| # hash of parent path discriminates sources whose parent dirs share a name | ||
| # (e.g. analyses/{1,2}/analysis) — see cytomining/CytoTable#442 | ||
| source_parent_hash = hashlib.sha1( | ||
| str(source["source_path"].parent).encode("utf-8"), | ||
| usedforsecurity=False, | ||
| ).hexdigest()[:12] | ||
| source_dest_path = ( | ||
| f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/" | ||
| f"{str(source['source_path'].parent.name).lower()}" | ||
| f"{str(source['source_path'].parent.name).lower()}/" | ||
| f"{source_parent_hash}" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this will affect returned paths when using Perhaps some kind of combination of both here could work ( |
||
| ) | ||
| pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True) | ||
|
|
||
|
|
@@ -825,13 +832,15 @@ def _concat_source_group( | |
| # remove the file which was written in the concatted parquet file (we no longer need it) | ||
| pathlib.Path(table).unlink() | ||
|
|
||
| # attempt to clean up dir containing original table(s) only if it's empty | ||
| try: | ||
| pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir() | ||
| except OSError as os_err: | ||
| # raise only if we don't have a dir not empty errno | ||
| if os_err.errno != errno.ENOTEMPTY: | ||
| raise | ||
| # clean up the per-source hash dir and its parent if empty | ||
| chunk_parent = pathlib.Path(source["table"][0]).parent | ||
| for cleanup_dir in (chunk_parent, chunk_parent.parent): | ||
| try: | ||
| cleanup_dir.rmdir() | ||
| except OSError as os_err: | ||
| if os_err.errno != errno.ENOTEMPTY: | ||
| raise | ||
| break | ||
|
|
||
| # return the concatted parquet filename | ||
| concatted[0]["table"] = [destination_path] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,7 @@ | |
|
|
||
|
|
||
| import pathlib | ||
| import shutil | ||
| from typing import List | ||
|
|
||
| import anndata as ad | ||
|
|
@@ -470,3 +471,63 @@ def test_convert_nested_dirs(fx_tempdir: pathlib.Path): | |
|
|
||
| table = parquet.read_table(source=result) | ||
| assert table.shape == (397, 6049) | ||
|
|
||
|
|
||
| def test_convert_multi_source_colliding_parent_dir_names( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🙌 |
||
| load_parsl_threaded: None, | ||
| fx_tempdir: str, | ||
| data_dir_cellprofiler: str, | ||
| ): | ||
| """ | ||
| Regression test for cytomining/CytoTable#442: multi-source convert with | ||
| identical parent dir names (analyses/{1,2,3}/analysis) used to collide on | ||
| intermediate parquet paths. | ||
| """ | ||
|
|
||
| src_root = pathlib.Path(fx_tempdir) / "analyses" | ||
| for site in ("1", "2", "3"): | ||
| site_dir = src_root / site / "analysis" | ||
| site_dir.mkdir(parents=True, exist_ok=True) | ||
| for table_name in ("Cells.csv", "Cytoplasm.csv", "Nuclei.csv", "Image.csv"): | ||
| shutil.copy( | ||
| f"{data_dir_cellprofiler}/ExampleHuman/{table_name}", | ||
| site_dir / table_name, | ||
| ) | ||
|
|
||
| result = convert( | ||
| source_path=str(src_root), | ||
| dest_path=f"{fx_tempdir}/multi_site.parquet", | ||
| dest_datatype="parquet", | ||
| preset="cellprofiler_csv", | ||
| join=False, | ||
| ) | ||
| assert isinstance(result, dict) | ||
|
|
||
| assert set(result.keys()) == { | ||
| "Cells.csv", | ||
| "Cytoplasm.csv", | ||
| "Nuclei.csv", | ||
| "Image.csv", | ||
| } | ||
|
|
||
| single_site_result = convert( | ||
| source_path=f"{data_dir_cellprofiler}/ExampleHuman", | ||
| dest_path=f"{fx_tempdir}/single_site.parquet", | ||
| dest_datatype="parquet", | ||
| source_datatype="csv", | ||
| preset="cellprofiler_csv", | ||
| join=False, | ||
| ) | ||
| assert isinstance(single_site_result, dict) | ||
|
|
||
| for compartment in ("Cells.csv", "Cytoplasm.csv", "Nuclei.csv"): | ||
| multi_rows = parquet.read_table( | ||
| source=result[compartment][0]["table"][0] | ||
| ).num_rows | ||
| single_rows = parquet.read_table( | ||
| source=single_site_result[compartment][0]["table"][0] | ||
| ).num_rows | ||
| assert multi_rows == 3 * single_rows, ( | ||
| f"{compartment}: expected 3x single-site rows ({3 * single_rows})," | ||
| f" got {multi_rows}" | ||
| ) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🌐 Web query:
mypy version 1.20.2 release notes or changelog💡 Result:
No evidence found for a mypy version 1.20.2 release as of 2026-05-07. The latest versions are mypy 1.20.0 (released March 31, 2026 [1][2]) and 1.20.1 (April 2026 [3][4][5]). GitHub tags confirm up to v1.20.0 [1], no v1.20.2 tag or PyPI page [2]. Changelog at v1.20.1 includes 1.20.1 fixes like SQLite cache sync disable and narrowing fixes [3][4]. Official release notes for 1.20.0 cover better type narrowing, Python 3.9 support drop, SQLite cache default, local partial types improvements, and Mypy 2.0 plans [6][7]. For full 1.20 details, see https://github.com/python/mypy/blob/v1.20.1/CHANGELOG.md [3] or https://mypy.readthedocs.io/en/stable/changelog.html [7].
Citations:
Change mypy version to v1.20.1 or v1.20.0.
mypy v1.20.2 does not exist. As of May 2026, the latest available versions are v1.20.1 (April 2026) and v1.20.0 (March 2026). Update the revision to one of these releases.
🤖 Prompt for AI Agents