Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
359 commits
Select commit Hold shift + click to select a range
567eca4
add test: test_tag_and_untag
geetu040 Feb 2, 2026
d23790b
Merge, post fix
JATAYU000 Feb 2, 2026
95e8890
md5_checksum, and request fix
JATAYU000 Feb 2, 2026
16c9251
Merge branch 'openml:main' into dataset_resource
JATAYU000 Feb 2, 2026
23fe19b
Merge commit /pull/1576
JATAYU000 Feb 2, 2026
be29dc9
Merge branch 'dataset_resource'
JATAYU000 Feb 2, 2026
b2287c3
implement get/set_config_values
geetu040 Feb 3, 2026
b7e285e
improve APIBackend.set_config_values
geetu040 Feb 3, 2026
fd43c48
use LegacyConfig
geetu040 Feb 3, 2026
f4aab6b
Revert "use LegacyConfig"
geetu040 Feb 3, 2026
d43cf86
implement _sync_api_config
geetu040 Feb 3, 2026
3e323ed
update tests with _sync_api_config
geetu040 Feb 3, 2026
9195fa6
rename config: timeout -> timeout_seconds
geetu040 Feb 3, 2026
5342eec
use timedelta for default ttl value
geetu040 Feb 3, 2026
adc0e74
update tests, adds v2/fallback
geetu040 Feb 3, 2026
bfb2d3e
add MinIOClient in TestBase
geetu040 Feb 3, 2026
707e1f1
publish,tag methods need testing
JATAYU000 Feb 3, 2026
cabaecf
fix linting for builder
geetu040 Feb 3, 2026
79cf49c
new migration tests
JATAYU000 Feb 3, 2026
5c8791a
Merge /1576
JATAYU000 Feb 3, 2026
85c1113
fix unbound variables: "code", "message"
geetu040 Feb 4, 2026
39bf86a
use requests.Session()
geetu040 Feb 4, 2026
7b66677
remove "timeout_seconds" entirely
geetu040 Feb 4, 2026
d2224c4
update/refactor tests
geetu040 Feb 4, 2026
9608c36
remove unused current_api_version from TestAPIBase
geetu040 Feb 5, 2026
f6bc7f7
make TestAPIBase inherit TestBase
geetu040 Feb 5, 2026
baa3a38
nits: test classes
geetu040 Feb 5, 2026
29c93d1
Review changes, new tests
JATAYU000 Feb 5, 2026
7674b3a
Merge bse migration
JATAYU000 Feb 5, 2026
ddb0774
Doc strings
JATAYU000 Feb 5, 2026
52b93fe
minor fix in _sync_api_config
geetu040 Feb 6, 2026
ec9477f
chore: rerun CI
geetu040 Feb 6, 2026
cea6188
delete mock, decorator
JATAYU000 Feb 9, 2026
3d4e84d
Merge base
JATAYU000 Feb 9, 2026
839bd33
delete url in test
JATAYU000 Feb 9, 2026
8417349
New test design
JATAYU000 Feb 10, 2026
10d134a
remove duplicates in _api/resources/__init__.py
geetu040 Feb 10, 2026
935f0f4
implement HTTPClient.download and add tests
geetu040 Feb 10, 2026
9514df8
add docstrings
geetu040 Feb 11, 2026
09f9ad6
Review changes
JATAYU000 Feb 12, 2026
0b52427
Merge base pr
JATAYU000 Feb 12, 2026
53bee94
update minio
geetu040 Feb 12, 2026
33b4ca0
make delay functions static
geetu040 Feb 13, 2026
a6b9a45
rename: retry_raise_e -> exception
geetu040 Feb 13, 2026
f924b32
use context-manager for requests.Session
geetu040 Feb 13, 2026
541b0f2
remove "assert response is not None"
geetu040 Feb 13, 2026
acb173f
verify checksum before caching
geetu040 Feb 13, 2026
3e8d1f0
update tests
geetu040 Feb 13, 2026
f83bdb5
minor fix in ResourceV1API.untag
geetu040 Feb 13, 2026
969c7d8
Merge branch 'main' into dataset_resource
JATAYU000 Feb 16, 2026
2a42712
remove cache.ttl
geetu040 Feb 16, 2026
001caad
replace config.cache.dir with config.cache_dir
geetu040 Feb 16, 2026
fb38a2d
make HTTPClient.cache compulsory
geetu040 Feb 17, 2026
03c4ca9
remove unused OpenMLCacheRequiredError
geetu040 Feb 17, 2026
8d708fd
implement and use TestAPIBase._create_resource
geetu040 Feb 17, 2026
4f75bba
make ResourceAPI.minio compulsory
geetu040 Feb 17, 2026
164f66f
Merge branch 'main' into migration
geetu040 Feb 17, 2026
c4dae43
rename: use_cache -> enable_cache; reset_cache -> refresh_cache
geetu040 Feb 17, 2026
36c20a2
use server config from TestBase
geetu040 Feb 17, 2026
ab3c1eb
tests: mock HTTP post calls to prevent race conditions
geetu040 Feb 17, 2026
0fc3c74
Merge bse_migration into dataset_resource
JATAYU000 Feb 17, 2026
741a66b
rename cache params
JATAYU000 Feb 17, 2026
81dff8d
Merge branch 'dataset_resource'
JATAYU000 Feb 17, 2026
27ac86f
Minio assertions, other reviews
JATAYU000 Feb 17, 2026
2a488ca
Merge branch 'main' into migration
geetu040 Feb 18, 2026
cbc7194
Merge base migration
JATAYU000 Feb 18, 2026
599c7e1
remove hardcoded server in TestHTTPClient.test_cache
geetu040 Feb 18, 2026
2867862
fix docstring in _resolve_default_cache_dir
geetu040 Feb 18, 2026
f09f3cd
fix docstring in ResourceAPI
geetu040 Feb 18, 2026
5f731ce
remove duplicates in __all__
geetu040 Feb 18, 2026
bad7842
remove ttl related code/docs
geetu040 Feb 18, 2026
aefdb38
remove delay methods in HTTPClient
geetu040 Feb 18, 2026
0f40b02
minor fix in _resolve_default_cache_dir
geetu040 Feb 18, 2026
7ac1672
update FallbackProxy
geetu040 Feb 18, 2026
6ac1dfe
simplify _backend creation
geetu040 Feb 18, 2026
62924c9
Merge branch 'main' into migration
geetu040 Feb 18, 2026
27696bb
req changes
satvshr Feb 20, 2026
190face
resolve conflicts
satvshr Feb 20, 2026
95daaa6
remove old config file
satvshr Feb 20, 2026
7841ea8
added OPENML_TEST_SERVER_ADMIN_KEY_ENV_VAR
satvshr Feb 20, 2026
cc515aa
bug fixing
satvshr Feb 20, 2026
e6a92df
armagh fix
satvshr Feb 20, 2026
1b8c22a
update content_type check
geetu040 Feb 20, 2026
fc839a6
Revert "make delay functions static"
geetu040 Feb 20, 2026
1c922af
Revert "remove delay methods in HTTPClient"
geetu040 Feb 20, 2026
ffa9ce9
Merge branch 'main' into migration
geetu040 Feb 20, 2026
a7b2d21
allow api_key=None
geetu040 Feb 20, 2026
27fe790
add tests for api_key=None
geetu040 Feb 20, 2026
8965112
update cache not found message
geetu040 Feb 23, 2026
72ea1a4
update docs for path in HTTPCache
geetu040 Feb 23, 2026
a696c49
remove elapsed from cached meta
geetu040 Feb 23, 2026
755636d
move self.headers to _HEADERS
geetu040 Feb 23, 2026
d07af34
fix indentation in docstrings of _resolve_default_cache_dir
geetu040 Feb 23, 2026
2d9c8ec
Update openml/_api/clients/http.py
geetu040 Feb 23, 2026
002b989
Merge branch 'main' into migration
geetu040 Feb 23, 2026
045d896
move _handle_delete_exception and_get_endpoint_name, legal_resources
geetu040 Feb 23, 2026
c437966
set HTTPClient.headers
geetu040 Feb 23, 2026
e27470a
remove main_tag
geetu040 Feb 23, 2026
d04d956
remove and merge TestAPIBase into TestBase
geetu040 Feb 23, 2026
9263f7f
minor change in TestHTTPClient.test_cache
geetu040 Feb 23, 2026
79dea29
make HTTPClient.request private
geetu040 Feb 23, 2026
f6497c2
Revert "update FallbackProxy"
geetu040 Feb 23, 2026
dce7f54
use st_ctime instead of st_ctime for cache refresh test
geetu040 Feb 23, 2026
40dd460
Merge branch 'main' into issue1564
geetu040 Feb 24, 2026
0fc917c
majore config refactor
geetu040 Feb 24, 2026
3d86b18
Merge branch 'pr-1577' into migration
geetu040 Feb 24, 2026
aba3d3e
update _config.py
geetu040 Feb 24, 2026
d99d54d
update test_openml_cache_dir_env_var
geetu040 Feb 24, 2026
dc22e3a
fix mutable SERVERS_REGISTRY
geetu040 Feb 25, 2026
7318573
update set_api_version for fallback
geetu040 Feb 25, 2026
29ef187
minor fix
geetu040 Feb 25, 2026
cf94c89
fixes for test_config
geetu040 Feb 25, 2026
298fbda
fixes in conftest urls
geetu040 Feb 25, 2026
9870502
update test_http.py
geetu040 Feb 25, 2026
33065c2
undo changes with test_openml_cache_dir_env_var
geetu040 Feb 25, 2026
76b92bb
fix server mode in test_config.py
geetu040 Feb 25, 2026
419edcb
move _HEADERS to confing
geetu040 Feb 25, 2026
cb6d937
add fixtures for migration tests
geetu040 Feb 25, 2026
8544c8a
update test_http.py with fixtures
geetu040 Feb 25, 2026
d4c413b
update test_versions.py
geetu040 Feb 25, 2026
fab1a15
update test_versions.py
geetu040 Feb 25, 2026
6392be8
Merge base-migration
JATAYU000 Feb 25, 2026
276324a
fix error message in HTTPClient.server
geetu040 Feb 26, 2026
73f7594
fixes in test_versions.py: use DummyTaskAPI instead of TaskAPI
geetu040 Feb 26, 2026
2ee7fa3
add clients in openml._backend
geetu040 Feb 26, 2026
4f37607
skip parquet env var
JATAYU000 Feb 26, 2026
c74754a
Merge base-migration
JATAYU000 Feb 26, 2026
2473208
Updated test,admin fixture
JATAYU000 Feb 26, 2026
7afb0e3
code qulity Reviews
JATAYU000 Feb 26, 2026
3b96559
Test fixes
JATAYU000 Feb 26, 2026
ea80785
remove unnecessary
JATAYU000 Feb 26, 2026
83a2e80
Fix mock delete
JATAYU000 Feb 26, 2026
9eb6c90
Exception review
JATAYU000 Feb 26, 2026
4be5bbd
fixes with openml.config.[server|apikey] leakage
geetu040 Feb 26, 2026
9027c01
remove unused fixtures: use_api_[v1|v2]
geetu040 Feb 26, 2026
c1efdeb
Merge base-mgration
JATAYU000 Feb 27, 2026
dd048d5
mock requests
JATAYU000 Feb 27, 2026
98041ed
skip v2 test for now
JATAYU000 Feb 27, 2026
e5461a9
add more config tests
geetu040 Feb 27, 2026
7d899a9
make SERVERS_REGISTRY private
geetu040 Feb 27, 2026
8587414
fix marker: uses_test_server->test_server
geetu040 Feb 27, 2026
23a3450
fix UserWarning
geetu040 Feb 27, 2026
ac28f82
update fixture: with_server
geetu040 Feb 27, 2026
4a66245
req changes
satvshr Mar 2, 2026
c762fb4
Merge branch 'issue1564' of https://github.com/satvshr/openml-python …
satvshr Mar 2, 2026
77c21f2
Update openml/_api/clients/http.py
geetu040 Mar 4, 2026
eac24fc
Update tests/test_api/test_http.py
geetu040 Mar 4, 2026
2ed65fe
update test_get_uses_cached_response
geetu040 Mar 4, 2026
f3b07de
test_get_with_api_key
geetu040 Mar 4, 2026
29db3f1
use .arff instead of .bin in tests
geetu040 Mar 4, 2026
3b4e538
update test_download_creates_file to use md5_checksum
geetu040 Mar 4, 2026
8ac886b
update test_download_is_cached_on_disk
geetu040 Mar 4, 2026
305f4f0
update APIBackendBuilder
geetu040 Mar 4, 2026
b2bf164
Merge branch 'main' into migration
geetu040 Mar 4, 2026
e97e6c2
Update openml/_api/clients/http.py
geetu040 Mar 4, 2026
c66d73c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2026
aa54e8e
pre-commit fixes
geetu040 Mar 4, 2026
2d452d3
Merge branch 'main' into issue1564
geetu040 Mar 6, 2026
c235812
Merge branch 'main' into issue1564
fkiraly Mar 6, 2026
39eb823
Trigger CI
satvshr Mar 6, 2026
50eed37
Merge branch 'main' into migration
geetu040 Mar 6, 2026
7a000eb
Merge branch 'main' into issue1564
geetu040 Mar 10, 2026
79f6187
Merge branch 'main' into issue1564
geetu040 Mar 10, 2026
b1a9e7f
Merge branch 'pr-1577' into migration (merge conflicts)
geetu040 Mar 12, 2026
d716ecf
update server methods in config
geetu040 Mar 12, 2026
3c29e71
fix api-version leakage in tests
geetu040 Mar 12, 2026
b4ff0b2
remove unused migration code
geetu040 Mar 12, 2026
93155ee
debug ci: separate cache for each test-case
geetu040 Mar 12, 2026
d3cc9a7
update port for localhost
geetu040 Mar 12, 2026
a6b82f4
Revert "debug ci: separate cache for each test-case"
geetu040 Mar 12, 2026
3419973
rerun CI
geetu040 Mar 12, 2026
8de99b7
Merge branch 'main' into migration
geetu040 Mar 12, 2026
d0202b0
Merge base migration
JATAYU000 Mar 13, 2026
0fa9e3b
Fix tests for new test setup
JATAYU000 Mar 16, 2026
7d61107
create enum ServerMode
geetu040 Mar 16, 2026
1ecbbba
update config for ServerMode
geetu040 Mar 16, 2026
65472ed
update tests for ServerMode
geetu040 Mar 16, 2026
9219266
Update status_update
JATAYU000 Mar 17, 2026
44b48b5
udpate apikey in _TEST_SERVERS_LOCAL
geetu040 Mar 17, 2026
11b19de
skip v2 status_update
JATAYU000 Mar 18, 2026
4df12d3
Merge base migration
JATAYU000 Mar 18, 2026
77d2af2
skip v2 status_update
JATAYU000 Mar 18, 2026
04bc83b
fix: remove duplicate server name in cache path
geetu040 Mar 23, 2026
f926092
test: remove check for ":" since windows CI expects it
geetu040 Mar 23, 2026
8072e34
adds marker
JATAYU000 Mar 23, 2026
47464e9
Merge base migration
JATAYU000 Mar 23, 2026
f059e71
switch 1 worker test
JATAYU000 Mar 23, 2026
b6d5e31
Merge main
JATAYU000 Mar 23, 2026
4ee28f1
reduce test workers
JATAYU000 Mar 23, 2026
509b4c3
revert workers, static patch
JATAYU000 Mar 24, 2026
6385597
fixes
JATAYU000 Mar 24, 2026
5fea9c9
exists check in load
JATAYU000 Mar 24, 2026
4b43003
updating cache tests
JATAYU000 Mar 24, 2026
f01db35
Revert "updating cache tests"
JATAYU000 Mar 24, 2026
e10d776
fixes
JATAYU000 Mar 24, 2026
ba7edd8
update lazy_behaviour tests, helper functions on paths
JATAYU000 Mar 24, 2026
f003425
file lock
JATAYU000 Mar 25, 2026
713356e
Merge main
JATAYU000 Mar 25, 2026
92bc246
mock get
JATAYU000 Mar 25, 2026
f9dddac
Merge branch 'main' into dataset_resource
JATAYU000 Mar 25, 2026
b90e7c4
path updates
JATAYU000 Mar 25, 2026
4f3ec74
chore: rerun CI
JATAYU000 Mar 25, 2026
98616db
isolate njobs>1
JATAYU000 Mar 25, 2026
ed35e69
taggin exception
JATAYU000 Mar 26, 2026
4af9cbe
Merge main resolve conflicts
JATAYU000 Mar 26, 2026
1c4f946
new tests files, conftest
JATAYU000 Mar 26, 2026
8c1c205
debugger
JATAYU000 Mar 26, 2026
0d99b8d
conftest update
JATAYU000 Mar 26, 2026
c0871f3
add path for id 1
JATAYU000 Mar 26, 2026
6791fb6
debug
JATAYU000 Mar 26, 2026
4164607
Merge main
JATAYU000 Mar 26, 2026
55f13ad
debug
JATAYU000 Mar 26, 2026
d6fe96a
debug fixes
JATAYU000 Mar 26, 2026
c0b3377
debug fixes
JATAYU000 Mar 26, 2026
8d37464
debug fixes
JATAYU000 Mar 26, 2026
e79bb91
debug fixes
JATAYU000 Mar 26, 2026
db74277
update cache clearing
JATAYU000 Mar 26, 2026
fac0240
chore: rerun CI
JATAYU000 Mar 26, 2026
95c68c6
chore: rerun CI
JATAYU000 Mar 26, 2026
2b7df47
Force refresh cache
JATAYU000 Mar 26, 2026
b5836b9
inc tries for fork, debug
JATAYU000 Mar 26, 2026
5e34368
update HTTPClient
geetu040 Mar 27, 2026
17fc002
replace _http.download with _http.get
geetu040 Mar 27, 2026
55b3f11
undo change in HTTPClient.download signature
geetu040 Mar 27, 2026
8fe5941
HTTPClient.cache_path_from_response
geetu040 Mar 27, 2026
37526bb
delete previously added files for tests/files
geetu040 Mar 27, 2026
3996bdd
update cache files in tests/files
geetu040 Mar 27, 2026
b3e9ab1
update cache files in tests/files
geetu040 Mar 27, 2026
ac3b903
update HTTPClient for zip files
geetu040 Mar 27, 2026
94ed2a6
update: cache_path_from_response -> cache_path_from_url
geetu040 Mar 27, 2026
df8b4b8
remove all meta.json
geetu040 Mar 27, 2026
3778204
update conftest.py
geetu040 Mar 27, 2026
b6c4b91
update test_list_all_few_results_available
geetu040 Mar 27, 2026
2edd5d0
update test_lazy_loading_metadata
geetu040 Mar 27, 2026
8d88959
update test_lazy_loading_metadata
geetu040 Mar 27, 2026
5d104c1
udpate test_get_dataset_force_refresh_cache
geetu040 Mar 27, 2026
9dffaae
remove test_get_dataset_force_refresh_cache_clean_start
geetu040 Mar 27, 2026
fe7cf96
Revert "remove test_get_dataset_force_refresh_cache_clean_start"
geetu040 Mar 27, 2026
7ae536a
add http_client in TestBase
geetu040 Mar 27, 2026
4fc0616
update test_lazy_loading_metadata
geetu040 Mar 27, 2026
c1c5544
update _get_body_filename_from_path
geetu040 Mar 27, 2026
140830e
update test_lazy_loading_metadata
geetu040 Mar 27, 2026
af8810d
update test_get_dataset_force_refresh_cache
geetu040 Mar 27, 2026
959d56b
remove test_deletion_of_cache_dir_faulty_download
geetu040 Mar 27, 2026
4719013
Revert class labels
JATAYU000 Mar 27, 2026
b33a895
Update improper paths in tests
JATAYU000 Mar 27, 2026
e2eddc6
test debuger
JATAYU000 Mar 27, 2026
9f73cca
fixup! test debuger
JATAYU000 Mar 27, 2026
28c9946
log file paths
JATAYU000 Mar 27, 2026
b499d7c
remove logger
JATAYU000 Mar 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 31 additions & 7 deletions openml/_api/clients/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from typing import Any, cast
from urllib.parse import urlencode, urljoin, urlparse

import arff
import requests
import xmltodict
from requests import Response
Expand Down Expand Up @@ -98,16 +99,32 @@ def _get_body_filename_from_response(self, response: Response) -> str:
if "text/xml" in content_type:
return "body.xml"

if response.content.startswith(b"PK\x03\x04"):
return "body.zip"

try:
arff.loads(response.text)
return "body.arff"
except arff.ArffException:
pass

return "body.txt"

def _get_body_filename_from_path(self, path: Path) -> str:
if (path / "body.json").exists():
return "body.json"
candidates = []
for p in path.iterdir():
if p.name.startswith("body.") and len(p.suffixes) == 1:
candidates.append(p)

if (path / "body.xml").exists():
return "body.xml"
if not candidates:
raise FileNotFoundError(f"No body file found in path: {path}")

return "body.txt"
if len(candidates) > 1:
raise FileNotFoundError(
f"Multiple body files found in path: {path} ({[p.name for p in candidates]})"
)

return candidates[0].name

def load(self, key: str) -> Response:
"""
Expand All @@ -132,6 +149,9 @@ def load(self, key: str) -> Response:
"""
path = self._key_to_path(key)

if not path.exists():
raise FileNotFoundError(f"Cache path not found: {path}")

meta_path = path / "meta.json"
meta_raw = meta_path.read_bytes() if meta_path.exists() else "{}"
meta = json.loads(meta_raw)
Expand All @@ -141,8 +161,6 @@ def load(self, key: str) -> Response:
headers = json.loads(headers_raw)

body_path = path / self._get_body_filename_from_path(path)
if not body_path.exists():
raise FileNotFoundError(f"Incomplete cache at {body_path}")
body = body_path.read_bytes()

response = Response()
Expand Down Expand Up @@ -825,3 +843,9 @@ def write_to_file(response: Response, path: Path, encoding: str) -> None:
handler = handler or write_to_file
handler(response, file_path, encoding)
return file_path

def cache_path_from_url(self, url: str) -> Path:
    """
    Return the path of the cached response body for ``url``.

    Parameters
    ----------
    url : str
        URL to look up; it is joined onto ``self.server`` before the
        cache key is computed (with empty query parameters).

    Returns
    -------
    pathlib.Path
        The cache directory derived from the key, joined with the name of
        the body file found in that directory.

    Notes
    -----
    Any exception raised by the cache while resolving the body filename
    propagates to the caller unchanged.
    """
    absolute_url = urljoin(self.server, url)
    cache_key = self.cache.get_key(absolute_url, params={})
    entry_dir = self.cache._key_to_path(cache_key)
    body_name = self.cache._get_body_filename_from_path(entry_dir)
    return entry_dir / body_name
133 changes: 132 additions & 1 deletion openml/_api/clients/minio.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
from __future__ import annotations

import contextlib
import shutil
import urllib
import zipfile
from pathlib import Path

import minio
import requests
from urllib3 import ProxyManager

import openml
from openml.utils import ProgressBar


class MinIOClient:
Expand All @@ -16,13 +25,135 @@ class MinIOClient:
Attributes
----------
path : pathlib.Path or None
path : pathlib.Path
Configured base path for storage operations.
headers : dict of str to str
Default HTTP headers, including a user-agent identifying the
OpenML Python client version.
"""

@property
def headers(self) -> dict[str, str]:
    """Default HTTP headers, read from ``openml.config._HEADERS`` on each access."""
    default_headers = openml.config._HEADERS
    return default_headers

@property
def path(self) -> Path:
    """Base path for storage operations: the configured OpenML cache directory."""
    cache_dir = openml.config.get_cache_directory()
    return Path(cache_dir)

def _get_path(self, url: str) -> Path:
parsed_url = urllib.parse.urlparse(url)
return self.path / "minio" / parsed_url.path.lstrip("/")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be made public and probably given a better name. We could then call this method directly to find the path where the file for a particular URL would be stored.
The use case is in tests.


def download_minio_file(
self,
source: str,
destination: str | Path | None = None,
exists_ok: bool = True, # noqa: FBT002
proxy: str | None = "auto",
) -> Path:
Copy link
Copy Markdown
Collaborator

@geetu040 geetu040 Mar 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can remove the `destination` and `exists_ok` parameters altogether, since the destination is always derived dynamically from the given URL using `self._get_path`.

"""Download file ``source`` from a MinIO Bucket and store it at ``destination``.
Parameters
----------
source : str
URL to a file in a MinIO bucket.
destination : str | Path
Path to store the file to, if a directory is provided the original filename is used.
exists_ok : bool, optional (default=True)
If False, raise FileExists if a file already exists in ``destination``.
proxy: str, optional (default = "auto")
The proxy server to use. By default it's "auto" which uses ``requests`` to
automatically find the proxy to use. Pass None or the environment variable
``no_proxy="*"`` to disable proxies.
"""
destination = self._get_path(source) if destination is None else Path(destination)
parsed_url = urllib.parse.urlparse(source)

# expect path format: /BUCKET/path/to/file.ext
bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1)
if destination.is_dir():
destination = Path(destination, object_name)
if destination.is_file() and not exists_ok:
raise FileExistsError(f"File already exists in {destination}.")

destination = destination.expanduser()
destination.parent.mkdir(parents=True, exist_ok=True)

if proxy == "auto":
resolved_proxies = requests.utils.get_environ_proxies(parsed_url.geturl())
proxy = requests.utils.select_proxy(parsed_url.geturl(), resolved_proxies) # type: ignore

proxy_client = ProxyManager(proxy) if proxy else None

client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client)
try:
client.fget_object(
bucket_name=bucket,
object_name=object_name,
file_path=str(destination),
progress=ProgressBar() if openml.config.show_progress else None,
request_headers=self.headers,
)
if destination.is_file() and destination.suffix == ".zip":
with zipfile.ZipFile(destination, "r") as zip_ref:
zip_ref.extractall(destination.parent)

except minio.error.S3Error as e:
if e.message is not None and e.message.startswith("Object does not exist"):
raise FileNotFoundError(f"Object at '{source}' does not exist.") from e
# e.g. permission error, or a bucket does not exist (which is also interpreted as a
# permission error on minio level).
raise FileNotFoundError("Bucket does not exist or is private.") from e

return destination

def download_minio_bucket(self, source: str, destination: str | Path | None = None) -> None:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as suggested above

Suggested change
def download_minio_bucket(self, source: str, destination: str | Path | None = None) -> None:
def download_minio_bucket(self, source: str,) -> None:

"""Download file ``source`` from a MinIO Bucket and store it at ``destination``.
Does not redownload files which already exist.
Parameters
----------
source : str
URL to a MinIO bucket.
destination : str | Path
Path to a directory to store the bucket content in.
"""
destination = self._get_path(source) if destination is None else Path(destination)
parsed_url = urllib.parse.urlparse(source)
if destination.suffix:
destination = destination.parent
# expect path format: /BUCKET/path/to/file.ext
_, bucket, *prefixes, _ = parsed_url.path.split("/")
prefix = "/".join(prefixes)

client = minio.Minio(endpoint=parsed_url.netloc, secure=False)

for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
if file_object.object_name is None:
raise ValueError(f"Object name is None for object {file_object!r}")
if file_object.etag is None:
raise ValueError(f"Object etag is None for object {file_object!r}")

marker = destination / file_object.etag
if marker.exists():
continue

file_destination = destination / file_object.object_name.rsplit("/", 1)[1]
if (file_destination.parent / file_destination.stem).exists():
# Marker is missing but archive exists means the server archive changed
# force a refresh
shutil.rmtree(file_destination.parent / file_destination.stem)

with contextlib.suppress(FileExistsError):
self.download_minio_file(
source=source.rsplit("/", 1)[0]
+ "/"
+ file_object.object_name.rsplit("/", 1)[1],
destination=file_destination,
exists_ok=False,
)

if file_destination.is_file() and file_destination.suffix == ".zip":
file_destination.unlink()
marker.touch()
111 changes: 110 additions & 1 deletion openml/_api/resources/base/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,13 @@
import builtins
from abc import abstractmethod
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal

if TYPE_CHECKING:
import pandas as pd

from openml.datasets.dataset import OpenMLDataFeature, OpenMLDataset
from openml.enums import ResourceType

from .base import ResourceAPI
Expand All @@ -21,6 +26,110 @@ class DatasetAPI(ResourceAPI):

resource_type: ResourceType = ResourceType.DATASET

# Abstract hook: every concrete dataset API must implement retrieval of a
# single dataset by id and return an OpenMLDataset. The boolean flags all
# default to False; what each one downloads is defined by the
# implementations, which are not visible here.
@abstractmethod
def get( # noqa: PLR0913
self,
dataset_id: int,
download_data: bool = False, # noqa: FBT002
cache_format: Literal["pickle", "feather"] = "pickle",
download_qualities: bool = False, # noqa: FBT002
download_features_meta_data: bool = False, # noqa: FBT002
download_all_files: bool = False, # noqa: FBT002
force_refresh_cache: bool = False, # noqa: FBT002
) -> OpenMLDataset: ...

# Abstract hook: list datasets as a pandas DataFrame. `limit` and `offset`
# are positional; `data_id` and any further filters are keyword-only.
# NOTE(review): the `# type: ignore` on `data_id` is presumably needed
# because this method's own name shadows the builtin `list` inside the
# class body -- confirm.
@abstractmethod
def list(
self,
limit: int,
offset: int,
*,
data_id: list[int] | None = None, # type: ignore
**kwargs: Any,
) -> pd.DataFrame: ...
Comment on lines +42 to +49
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we not have the same signature for all three methods: DatasetsAPI.list, DatasetsV1.list, DatasetsV2.list? Does it raise pre-commit failures because a few parameters might not be used?

Copy link
Copy Markdown
Contributor Author

@JATAYU000 JATAYU000 Jan 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, that v2 signature was experimental; I don't know how the pre-commit hooks did not catch that. I will make them the same.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is mypy supposed to catch that?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, unused parameters are caught under #ARG001, as seen with the cache_directory params.


# Abstract hook: edit the metadata of dataset `dataset_id`. Every metadata
# field is optional and defaults to None; how None is treated is decided by
# the implementation (presumably "leave unchanged" -- confirm).
# Returns an int (presumably the id of the edited dataset -- confirm).
@abstractmethod
def edit( # noqa: PLR0913
self,
dataset_id: int,
description: str | None = None,
creator: str | None = None,
contributor: str | None = None,
collection_date: str | None = None,
language: str | None = None,
default_target_attribute: str | None = None,
ignore_attribute: str | list[str] | None = None, # type: ignore
citation: str | None = None,
row_id_attribute: str | None = None,
original_data_url: str | None = None,
paper_url: str | None = None,
) -> int: ...

# Abstract hook: fork dataset `dataset_id`; returns an int
# (presumably the id of the new fork -- confirm).
@abstractmethod
def fork(self, dataset_id: int) -> int: ...

# Abstract hook: set the status of a dataset; only "active" and
# "deactivated" are accepted by the type signature.
@abstractmethod
def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: ...

# Abstract hook: return a list of quality names as strings. `builtins.list`
# is spelled out because this class also defines a method named `list`.
@abstractmethod
def list_qualities(self) -> builtins.list[str]: ...

# Abstract hooks: attach / detach an ontology string to the feature at
# position `index` of a dataset; both return a bool.
@abstractmethod
def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ...

@abstractmethod
def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ...

# Abstract hook: return the features of a dataset as a dict keyed by int
# (presumably the feature index -- confirm).
@abstractmethod
def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: ...

# Abstract hook: return the qualities of a dataset as name -> float, or
# None (when None is returned is defined by the implementation).
@abstractmethod
def get_qualities(self, dataset_id: int) -> dict[str, float] | None: ...

# Abstract hooks: parse a features / qualities file. Each takes the source
# file plus a pickle file path (presumably a parse cache -- confirm).
@abstractmethod
def parse_features_file(
self, features_file: Path, features_pickle_file: Path
) -> dict[int, OpenMLDataFeature]: ...

@abstractmethod
def parse_qualities_file(
self, qualities_file: Path, qualities_pickle_file: Path
) -> dict[str, float]: ...

# Abstract private hook: download the resource addressed by the URL
# extension `url_ext` and return a local Path.
@abstractmethod
def _download_file(self, url_ext: str) -> Path: ...

# Abstract hooks: download the features / qualities file of a dataset and
# return a local Path.
@abstractmethod
def download_features_file(self, dataset_id: int) -> Path: ...

@abstractmethod
def download_qualities_file(self, dataset_id: int) -> Path: ...

# Abstract hook: download the parquet file described by `description`
# (a raw description dict or an OpenMLDataset). May return None; the
# condition for that is defined by the implementation.
@abstractmethod
def download_dataset_parquet(
self,
description: dict | OpenMLDataset,
download_all_files: bool = False, # noqa: FBT002
) -> Path | None: ...

# Abstract hook: download the ARFF file described by `description`
# and return a local Path.
@abstractmethod
def download_dataset_arff(
self,
description: dict | OpenMLDataset,
) -> Path: ...

# Abstract hooks: add / remove a topic on a dataset; both return an int
# (presumably the dataset id -- confirm).
@abstractmethod
def add_topic(self, dataset_id: int, topic: str) -> int: ...

@abstractmethod
def delete_topic(self, dataset_id: int, topic: str) -> int: ...

# Abstract hook: return the format of the dataset as reported by the server.
@abstractmethod
def get_online_dataset_format(self, dataset_id: int) -> str: ...

# Abstract hook: return the ARFF content of the dataset as a string, or None.
@abstractmethod
def get_online_dataset_arff(self, dataset_id: int) -> str | None: ...

class TaskAPI(ResourceAPI):
"""Abstract API interface for task resources."""
Expand Down
Loading
Loading