From 20cf022bbe56261dc33891e7d4a0de6d137c9dca Mon Sep 17 00:00:00 2001 From: Cyril Achard Date: Tue, 24 Mar 2026 11:43:43 -0500 Subject: [PATCH 1/5] Use copy2 and robust HuggingFace download handling Replace os.rename with shutil.copy2 when handling downloaded files to avoid cross-filesystem rename errors and to preserve metadata. Ensure target_dir exists, use the path returned by hf_hub_download instead of assuming cache folder layout, normalize rename_mapping into a mapping, and call _handle_downloaded_file with the actual downloaded file path. Track and remove the HF cache folder safely (ignore_errors) only if present. These changes make HuggingFace model downloads more reliable across environments. --- dlclibrary/dlcmodelzoo/modelzoo_download.py | 28 ++++++++++++--------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/dlclibrary/dlcmodelzoo/modelzoo_download.py b/dlclibrary/dlcmodelzoo/modelzoo_download.py index 12dff74..a613b0a 100644 --- a/dlclibrary/dlcmodelzoo/modelzoo_download.py +++ b/dlclibrary/dlcmodelzoo/modelzoo_download.py @@ -132,7 +132,9 @@ def _handle_downloaded_file( os.path.join(os.path.dirname(file_path), file_path_) ) file_path = file_path_ - os.rename(file_path, os.path.join(target_dir, file_name)) + import shutil + shutil.copy2(file_path, os.path.join(target_dir, file_name)) + def download_huggingface_model( @@ -189,24 +191,26 @@ def download_huggingface_model( if not os.path.isabs(target_dir): target_dir = os.path.abspath(target_dir) + os.makedirs(target_dir, exist_ok=True) + + last_hf_folder = None + for url in urls: url = url.split("/") repo_id, targzfn = url[0] + "/" + url[1], str(url[-1]) - hf_hub_download(repo_id, targzfn, cache_dir=str(target_dir)) - - # Create a new subfolder as indicated below, unzipping from there and deleting this folder - hf_folder = f"models--{url[0]}--{url[1]}" - path_ = os.path.join(target_dir, hf_folder, "snapshots") - commit = os.listdir(path_)[0] - file_name = os.path.join(path_, commit, 
targzfn) + downloaded = hf_hub_download(repo_id, targzfn, cache_dir=str(target_dir)) if isinstance(rename_mapping, str): - rename_mapping = {targzfn: rename_mapping} + mapping = {targzfn: rename_mapping} + else: + mapping = rename_mapping + + _handle_downloaded_file(downloaded, target_dir, mapping) - _handle_downloaded_file(file_name, target_dir, rename_mapping) + last_hf_folder = f"models--{url[0]}--{url[1]}" - if remove_hf_folder: + if remove_hf_folder and last_hf_folder is not None: import shutil + shutil.rmtree(os.path.join(target_dir, last_hf_folder), ignore_errors=True) - shutil.rmtree(os.path.join(target_dir, hf_folder)) From 8fe4e6bedf690387d15b1789148c6d177b001f89 Mon Sep 17 00:00:00 2001 From: Cyril Achard Date: Tue, 24 Mar 2026 11:43:47 -0500 Subject: [PATCH 2/5] Update test_modeldownload.py --- tests/test_modeldownload.py | 171 +++++++++++++++++++++++++++++++++--- 1 file changed, 157 insertions(+), 14 deletions(-) diff --git a/tests/test_modeldownload.py b/tests/test_modeldownload.py index 6532f0e..22ffc0d 100644 --- a/tests/test_modeldownload.py +++ b/tests/test_modeldownload.py @@ -8,27 +8,125 @@ # # Licensed under GNU Lesser General Public License v3.0 # -import dlclibrary +from __future__ import annotations + +import io import os +import tarfile +from pathlib import Path + import pytest + +import dlclibrary +import dlclibrary.dlcmodelzoo.modelzoo_download as modelzoo_download from dlclibrary.dlcmodelzoo.modelzoo_download import MODELOPTIONS -def test_download_huggingface_model(tmp_path_factory, model="full_cat"): - folder = tmp_path_factory.mktemp("temp") +def _fake_model_names(): + """ + Return a deterministic fake URL for each model. + Alternate between tar.gz and .pt to test both branches. 
+ """ + mapping = {} + for i, model in enumerate(MODELOPTIONS): + ext = ".tar.gz" if i % 2 == 0 else ".pt" + mapping[model] = f"fakeorg/{model}-repo/{model}{ext}" + return mapping + + +def _write_fake_tar_gz(path: Path): + """ + Create a fake tar.gz archive with the files the downloader expects + for archive-based DLC models. + """ + path.parent.mkdir(parents=True, exist_ok=True) + + with tarfile.open(path, mode="w:gz") as tar: + files = { + "pose_cfg.yaml": b"all_joints: [0, 1]\n", + "snapshot-103000.index": b"fake index", + "snapshot-103000.data-00000-of-00001": b"fake weights", + "snapshot-103000.meta": b"fake meta", + } + + for name, content in files.items(): + info = tarfile.TarInfo(name=name) + info.size = len(content) + tar.addfile(info, io.BytesIO(content)) + + +def _write_fake_pt(path: Path): + """ + Create a fake .pt / .pth weight file. + """ + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(b"fake pytorch weights") + + +@pytest.fixture +def mock_modelzoo(monkeypatch): + """ + Patch both: + - model name resolution + - hf_hub_download network call + + so all downloads are local and deterministic. 
+ """
+    fake_names = _fake_model_names()
+
+    monkeypatch.setattr(modelzoo_download, "_load_model_names", lambda: fake_names)
+
+    def fake_hf_hub_download(repo_id, filename, cache_dir):
+        cache_dir = Path(cache_dir)
+        hf_folder = cache_dir / f"models--{repo_id.replace('/', '--')}"
+        snapshot_dir = hf_folder / "snapshots" / "fakecommit123"
+        returned_file = snapshot_dir / filename
+
+        if filename.endswith(".tar.gz"):
+            _write_fake_tar_gz(returned_file)
+        elif filename.endswith(".pt") or filename.endswith(".pth"):
+            _write_fake_pt(returned_file)
+        else:
+            raise AssertionError(f"Unexpected mocked filename: {filename}")
+
+        return str(returned_file)
+
+    monkeypatch.setattr(modelzoo_download, "hf_hub_download", fake_hf_hub_download)
+
+    return fake_names
+
+
+def _assert_download_success(folder: Path, model: str):
+    """
+    Shared assertion helper for download_huggingface_model.
+    """
     dlclibrary.download_huggingface_model(model, str(folder))
-    try:  # These are not created for .pt models
-        assert os.path.exists(folder / "pose_cfg.yaml")
-        assert any(f.startswith("snapshot-") for f in os.listdir(folder))
-    except AssertionError:
-        assert any(f.endswith(".pth") for f in os.listdir(folder))
+    files = {p.name for p in folder.iterdir()}
+
+    # Archive-based DLC model
+    if "pose_cfg.yaml" in files:
+        assert "pose_cfg.yaml" in files
+        assert any(name.startswith("snapshot-") for name in files)
+
+    # Direct PyTorch model
+    else:
+        assert any(name.endswith((".pt", ".pth")) for name in files)
 
-    # Verify that the Hugging Face folder was removed
-    assert not any(f.startswith("models--") for f in os.listdir(folder))
+    # Verify that the Hugging Face cache folder was removed
+    assert not any(name.startswith("models--") for name in files)
 
 
-def test_download_huggingface_wrong_model():
+def test_download_huggingface_model_tar_or_pt(tmp_path, mock_modelzoo):
+    folder = tmp_path / "download_one"
+    folder.mkdir()
+
+    # "full_cat" may map to tar.gz or .pt depending on ordering;
+    # this assertion 
helper supports both branches. + _assert_download_success(folder, "full_cat") + + +def test_download_huggingface_wrong_model(mock_modelzoo): with pytest.raises(ValueError): dlclibrary.download_huggingface_model("wrong_model_name") @@ -40,6 +138,51 @@ def test_parse_superanimal_models(): @pytest.mark.parametrize("model", MODELOPTIONS) -def test_download_all_models(tmp_path_factory, model): - print("Downloading ...", model) - test_download_huggingface_model(tmp_path_factory, model) +def test_download_all_models(tmp_path, mock_modelzoo, model): + folder = tmp_path / model + folder.mkdir() + _assert_download_success(folder, model) + + +def test_download_with_rename_mapping_for_pt(tmp_path, mock_modelzoo): + """ + Explicitly test rename_mapping for a .pt model. + """ + # Pick one of the mocked .pt models + pt_model = None + for i, model in enumerate(MODELOPTIONS): + if i % 2 == 1: + pt_model = model + break + + assert pt_model is not None, "Expected at least one mocked .pt model" + + folder = tmp_path / "rename_pt" + folder.mkdir() + + dlclibrary.download_huggingface_model( + pt_model, + str(folder), + rename_mapping="renamed_weights.pt", + ) + + files = {p.name for p in folder.iterdir()} + assert "renamed_weights.pt" in files + assert not any(name.startswith("models--") for name in files) + + +def test_keep_hf_folder_when_requested(tmp_path, mock_modelzoo): + """ + If remove_hf_folder=False, the cache structure should still exist. 
+ """ + folder = tmp_path / "keep_cache" + folder.mkdir() + + dlclibrary.download_huggingface_model( + "full_cat", + str(folder), + remove_hf_folder=False, + ) + + files = {p.name for p in folder.iterdir()} + assert any(name.startswith("models--") for name in files) \ No newline at end of file From b6a32f6c82a5abfead42d3727668af0dc9a5478b Mon Sep 17 00:00:00 2001 From: Cyril Achard Date: Wed, 25 Mar 2026 13:13:44 -0500 Subject: [PATCH 3/5] Use temporary HF cache and fix tar extraction Use a temporary directory for HuggingFace cache (tempfile.TemporaryDirectory) so HF cache folders are not created under the target_dir. Improve tar.gz handling in _handle_downloaded_file by extracting members with tar.extractfile and shutil.copyfileobj (and fall back to shutil.copy2 for non-tar files), and remove the previous symlink-resolution and explicit HF-folder removal logic. Update hf_hub_download call to use repo_id/filename named args and thread per-file rename mappings through to the extractor. Adjusted tests to reflect that the HF cache is no longer created inside the target directory and to ensure the final artifact still exists. 
--- dlclibrary/dlcmodelzoo/modelzoo_download.py | 54 +++++++++------------ tests/test_modeldownload.py | 12 +++-- 2 files changed, 30 insertions(+), 36 deletions(-) diff --git a/dlclibrary/dlcmodelzoo/modelzoo_download.py b/dlclibrary/dlcmodelzoo/modelzoo_download.py index a613b0a..5e54a63 100644 --- a/dlclibrary/dlcmodelzoo/modelzoo_download.py +++ b/dlclibrary/dlcmodelzoo/modelzoo_download.py @@ -13,6 +13,8 @@ import json import os import tarfile +import shutil +import tempfile from pathlib import Path from huggingface_hub import hf_hub_download @@ -111,32 +113,27 @@ def get_available_models(dataset: str) -> list[str]: return list(_load_pytorch_dataset_models(dataset)["pose_models"].keys()) + def _handle_downloaded_file( file_path: str, target_dir: str, rename_mapping: dict | None = None ): - """Handle the downloaded file from HuggingFace""" + """Handle the downloaded file from HuggingFace cache and place the final artifact in target_dir.""" file_name = os.path.basename(file_path) + try: with tarfile.open(file_path, mode="r:gz") as tar: for member in tar: if not member.isdir(): fname = Path(member.name).name - tar.makefile(member, os.path.join(target_dir, fname)) - except tarfile.ReadError: # The model is a .pt file + extracted_path = os.path.join(target_dir, fname) + with tar.extractfile(member) as src, open(extracted_path, "wb") as dst: + shutil.copyfileobj(src, dst) + except tarfile.ReadError: if rename_mapping is not None: file_name = rename_mapping.get(file_name, file_name) - if os.path.islink(file_path): - file_path_ = os.readlink(file_path) - if not os.path.isabs(file_path_): - file_path_ = os.path.abspath( - os.path.join(os.path.dirname(file_path), file_path_) - ) - file_path = file_path_ - import shutil shutil.copy2(file_path, os.path.join(target_dir, file_name)) - def download_huggingface_model( model_name: str, target_dir: str = ".", @@ -190,27 +187,22 @@ def download_huggingface_model( if not os.path.isabs(target_dir): target_dir = 
os.path.abspath(target_dir) - os.makedirs(target_dir, exist_ok=True) - last_hf_folder = None - - for url in urls: - url = url.split("/") - repo_id, targzfn = url[0] + "/" + url[1], str(url[-1]) - - downloaded = hf_hub_download(repo_id, targzfn, cache_dir=str(target_dir)) - - if isinstance(rename_mapping, str): - mapping = {targzfn: rename_mapping} - else: - mapping = rename_mapping - - _handle_downloaded_file(downloaded, target_dir, mapping) + with tempfile.TemporaryDirectory(prefix="dlc_hf_") as hf_cache_dir: + for url in urls: + url = url.split("/") + repo_id, targzfn = url[0] + "/" + url[1], str(url[-1]) - last_hf_folder = f"models--{url[0]}--{url[1]}" + downloaded = hf_hub_download( + repo_id=repo_id, + filename=targzfn, + cache_dir=hf_cache_dir, + ) - if remove_hf_folder and last_hf_folder is not None: - import shutil - shutil.rmtree(os.path.join(target_dir, last_hf_folder), ignore_errors=True) + if isinstance(rename_mapping, str): + mapping = {targzfn: rename_mapping} + else: + mapping = rename_mapping + _handle_downloaded_file(downloaded, target_dir, mapping) diff --git a/tests/test_modeldownload.py b/tests/test_modeldownload.py index 22ffc0d..13a5159 100644 --- a/tests/test_modeldownload.py +++ b/tests/test_modeldownload.py @@ -171,10 +171,7 @@ def test_download_with_rename_mapping_for_pt(tmp_path, mock_modelzoo): assert not any(name.startswith("models--") for name in files) -def test_keep_hf_folder_when_requested(tmp_path, mock_modelzoo): - """ - If remove_hf_folder=False, the cache structure should still exist. 
- """ +def test_remove_hf_folder_flag_no_longer_affects_target_dir(tmp_path, mock_modelzoo): folder = tmp_path / "keep_cache" folder.mkdir() @@ -185,4 +182,9 @@ def test_keep_hf_folder_when_requested(tmp_path, mock_modelzoo): ) files = {p.name for p in folder.iterdir()} - assert any(name.startswith("models--") for name in files) \ No newline at end of file + + # Final artifact should still be there + assert "full_cat.pt" in files + + # HF cache should no longer be created inside target_dir + assert not any(name.startswith("models--") for name in files) \ No newline at end of file From 673ff9a607118895593494970912f8ba7f440c4b Mon Sep 17 00:00:00 2001 From: Cyril Achard Date: Wed, 25 Mar 2026 13:17:53 -0500 Subject: [PATCH 4/5] Remove remove_hf_folder param and change tar check Delete the remove_hf_folder parameter from download_huggingface_model (and its docstring) and remove the related unit test. Also modify the tar extraction logic in _handle_downloaded_file by switching the TarInfo check from member.isdir() to member.isfile(), changing which tar members are processed during extraction. 
--- dlclibrary/dlcmodelzoo/modelzoo_download.py | 7 +------ tests/test_modeldownload.py | 19 ------------------- 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/dlclibrary/dlcmodelzoo/modelzoo_download.py b/dlclibrary/dlcmodelzoo/modelzoo_download.py index 5e54a63..4f1e25d 100644 --- a/dlclibrary/dlcmodelzoo/modelzoo_download.py +++ b/dlclibrary/dlcmodelzoo/modelzoo_download.py @@ -123,7 +123,7 @@ def _handle_downloaded_file( try: with tarfile.open(file_path, mode="r:gz") as tar: for member in tar: - if not member.isdir(): + if not member.isfile(): fname = Path(member.name).name extracted_path = os.path.join(target_dir, fname) with tar.extractfile(member) as src, open(extracted_path, "wb") as dst: @@ -137,7 +137,6 @@ def _handle_downloaded_file( def download_huggingface_model( model_name: str, target_dir: str = ".", - remove_hf_folder: bool = True, rename_mapping: str | dict | None = None, ): """ @@ -150,10 +149,6 @@ def download_huggingface_model( target_dir (str, optional): Target directory where the model weights will be stored. Defaults to the current directory. - remove_hf_folder (bool, optional): - Whether to remove the directory structure created by HuggingFace - after downloading and decompressing the data into DeepLabCut format. - Defaults to True. rename_mapping (dict | str | None, optional): - If a dictionary, it should map the original Hugging Face filenames to new filenames (e.g. {"snapshot-12345.tar.gz": "mymodel.tar.gz"}). 
diff --git a/tests/test_modeldownload.py b/tests/test_modeldownload.py index 13a5159..2e0d7bc 100644 --- a/tests/test_modeldownload.py +++ b/tests/test_modeldownload.py @@ -169,22 +169,3 @@ def test_download_with_rename_mapping_for_pt(tmp_path, mock_modelzoo): files = {p.name for p in folder.iterdir()} assert "renamed_weights.pt" in files assert not any(name.startswith("models--") for name in files) - - -def test_remove_hf_folder_flag_no_longer_affects_target_dir(tmp_path, mock_modelzoo): - folder = tmp_path / "keep_cache" - folder.mkdir() - - dlclibrary.download_huggingface_model( - "full_cat", - str(folder), - remove_hf_folder=False, - ) - - files = {p.name for p in folder.iterdir()} - - # Final artifact should still be there - assert "full_cat.pt" in files - - # HF cache should no longer be created inside target_dir - assert not any(name.startswith("models--") for name in files) \ No newline at end of file From 9919240a9b7f0ccb3c790bdf85bc5d526f2cd1ac Mon Sep 17 00:00:00 2001 From: Cyril Achard Date: Wed, 25 Mar 2026 13:22:32 -0500 Subject: [PATCH 5/5] Make tar extraction more robust and permissive Open archives with a permissive compression mode (r:*) and only extract regular files. Skip members with empty basenames or where extractfile() returns None, use a context manager for file streams, and raise a ReadError if no regular files were extracted to fail loudly. Preserve previous fallback: if the file isn't an archive, copy it as a direct model file. Also update example usage to remove the explicit remove_hf_folder argument. 
--- dlclibrary/dlcmodelzoo/modelzoo_download.py | 34 ++++++++++++++++----- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/dlclibrary/dlcmodelzoo/modelzoo_download.py b/dlclibrary/dlcmodelzoo/modelzoo_download.py index 4f1e25d..49162a5 100644 --- a/dlclibrary/dlcmodelzoo/modelzoo_download.py +++ b/dlclibrary/dlcmodelzoo/modelzoo_download.py @@ -121,14 +121,34 @@ def _handle_downloaded_file( file_name = os.path.basename(file_path) try: - with tarfile.open(file_path, mode="r:gz") as tar: - for member in tar: + # Be permissive about compression type + with tarfile.open(file_path, mode="r:*") as tar: + extracted_any = False + for member in tar.getmembers(): + # Only extract regular files if not member.isfile(): - fname = Path(member.name).name - extracted_path = os.path.join(target_dir, fname) - with tar.extractfile(member) as src, open(extracted_path, "wb") as dst: - shutil.copyfileobj(src, dst) + continue + + fname = Path(member.name).name + if not fname: + continue + + src = tar.extractfile(member) + if src is None: + continue + + extracted_path = os.path.join(target_dir, fname) + with src, open(extracted_path, "wb") as dst: + shutil.copyfileobj(src, dst) + + extracted_any = True + + # If it opened as a tar but contained nothing useful, fail loudly + if not extracted_any: + raise tarfile.ReadError(f"No regular files extracted from archive: {file_path}") + except tarfile.ReadError: + # Not an archive -> treat as a direct model file (.pt/.pth/etc.) if rename_mapping is not None: file_name = rename_mapping.get(file_name, file_name) shutil.copy2(file_path, os.path.join(target_dir, file_name)) @@ -158,7 +178,7 @@ def download_huggingface_model( Examples: >>> # Download without renaming, keep original filename - download_huggingface_model("superanimal_bird_resnet_50", remove_hf_folder=False) + download_huggingface_model("superanimal_bird_resnet_50") >>> # Download and rename by specifying the new name directly download_huggingface_model(