-
-
Notifications
You must be signed in to change notification settings - Fork 272
[ENH] V1 → V2 API Migration - datasets #1608
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
567eca4
d23790b
95e8890
16c9251
23fe19b
be29dc9
b2287c3
b7e285e
fd43c48
f4aab6b
d43cf86
3e323ed
9195fa6
5342eec
adc0e74
bfb2d3e
707e1f1
cabaecf
79cf49c
5c8791a
85c1113
39bf86a
7b66677
d2224c4
9608c36
f6bc7f7
baa3a38
29c93d1
7674b3a
ddb0774
52b93fe
ec9477f
cea6188
3d4e84d
839bd33
8417349
10d134a
935f0f4
9514df8
09f9ad6
0b52427
53bee94
33b4ca0
a6b9a45
f924b32
541b0f2
acb173f
3e8d1f0
f83bdb5
969c7d8
2a42712
001caad
fb38a2d
03c4ca9
8d708fd
4f75bba
164f66f
c4dae43
36c20a2
ab3c1eb
0fc3c74
741a66b
81dff8d
27ac86f
2a488ca
cbc7194
599c7e1
2867862
f09f3cd
5f731ce
bad7842
aefdb38
0f40b02
7ac1672
6ac1dfe
62924c9
27696bb
190face
95daaa6
7841ea8
cc515aa
e6a92df
1b8c22a
fc839a6
1c922af
ffa9ce9
a7b2d21
27fe790
8965112
72ea1a4
a696c49
755636d
d07af34
2d9c8ec
002b989
045d896
c437966
e27470a
d04d956
9263f7f
79dea29
f6497c2
dce7f54
40dd460
0fc917c
3d86b18
aba3d3e
d99d54d
dc22e3a
7318573
29ef187
cf94c89
298fbda
9870502
33065c2
76b92bb
419edcb
cb6d937
8544c8a
d4c413b
fab1a15
6392be8
276324a
73f7594
2ee7fa3
4f37607
c74754a
2473208
7afb0e3
3b96559
ea80785
83a2e80
9eb6c90
4be5bbd
9027c01
c1efdeb
dd048d5
98041ed
e5461a9
7d899a9
8587414
23a3450
ac28f82
4a66245
c762fb4
77c21f2
eac24fc
2ed65fe
f3b07de
29db3f1
3b4e538
8ac886b
305f4f0
b2bf164
e97e6c2
c66d73c
aa54e8e
2d452d3
c235812
39eb823
50eed37
7a000eb
79f6187
b1a9e7f
d716ecf
3c29e71
b4ff0b2
93155ee
d3cc9a7
a6b82f4
3419973
8de99b7
d0202b0
0fa9e3b
7d61107
1ecbbba
65472ed
9219266
44b48b5
11b19de
4df12d3
77d2af2
04bc83b
f926092
8072e34
47464e9
f059e71
b6d5e31
4ee28f1
509b4c3
6385597
5fea9c9
4b43003
f01db35
e10d776
ba7edd8
f003425
713356e
92bc246
f9dddac
b90e7c4
4f3ec74
98616db
ed35e69
4af9cbe
1c4f946
8c1c205
0d99b8d
c0871f3
6791fb6
4164607
55f13ad
d6fe96a
c0b3377
8d37464
e79bb91
db74277
fac0240
95c68c6
2b7df47
b5836b9
5e34368
17fc002
55b3f11
8fe5941
37526bb
3996bdd
b3e9ab1
ac3b903
94ed2a6
df8b4b8
3778204
b6c4b91
2edd5d0
8d88959
5d104c1
9dffaae
fe7cf96
7ae536a
4fc0616
c1c5544
140830e
af8810d
959d56b
4719013
b33a895
e2eddc6
9f73cca
28c9946
b499d7c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,8 +1,17 @@ | ||||||
| from __future__ import annotations | ||||||
|
|
||||||
| import contextlib | ||||||
| import shutil | ||||||
| import urllib | ||||||
| import zipfile | ||||||
| from pathlib import Path | ||||||
|
|
||||||
| import minio | ||||||
| import requests | ||||||
| from urllib3 import ProxyManager | ||||||
|
|
||||||
| import openml | ||||||
| from openml.utils import ProgressBar | ||||||
|
|
||||||
|
|
||||||
| class MinIOClient: | ||||||
|
|
@@ -16,13 +25,135 @@ class MinIOClient: | |||||
| Attributes | ||||||
| ---------- | ||||||
| path : pathlib.Path or None | ||||||
| path : pathlib.Path | ||||||
| Configured base path for storage operations. | ||||||
| headers : dict of str to str | ||||||
| Default HTTP headers, including a user-agent identifying the | ||||||
| OpenML Python client version. | ||||||
| """ | ||||||
|
|
||||||
| @property | ||||||
| def headers(self) -> dict[str, str]: | ||||||
| return openml.config._HEADERS | ||||||
|
|
||||||
| @property | ||||||
| def path(self) -> Path: | ||||||
| return Path(openml.config.get_cache_directory()) | ||||||
|
|
||||||
| def _get_path(self, url: str) -> Path: | ||||||
| parsed_url = urllib.parse.urlparse(url) | ||||||
| return self.path / "minio" / parsed_url.path.lstrip("/") | ||||||
|
|
||||||
| def download_minio_file( | ||||||
| self, | ||||||
| source: str, | ||||||
| destination: str | Path | None = None, | ||||||
| exists_ok: bool = True, # noqa: FBT002 | ||||||
| proxy: str | None = "auto", | ||||||
| ) -> Path: | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we can remove these parameters: |
||||||
| """Download file ``source`` from a MinIO Bucket and store it at ``destination``. | ||||||
| Parameters | ||||||
| ---------- | ||||||
| source : str | ||||||
| URL to a file in a MinIO bucket. | ||||||
| destination : str | Path, optional | ||||||
| Path to store the file to; if None, a location inside the OpenML cache is used. If a directory is provided, the original filename is used. | ||||||
| exists_ok : bool, optional (default=True) | ||||||
| If False, raise FileExists if a file already exists in ``destination``. | ||||||
| proxy : str, optional (default="auto") | ||||||
| The proxy server to use. By default it is "auto", which uses ``requests`` to | ||||||
| automatically find the proxy to use. Pass None or set the environment variable | ||||||
| ``no_proxy="*"`` to disable proxies. | ||||||
| """ | ||||||
| destination = self._get_path(source) if destination is None else Path(destination) | ||||||
| parsed_url = urllib.parse.urlparse(source) | ||||||
|
|
||||||
| # expect path format: /BUCKET/path/to/file.ext | ||||||
| bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1) | ||||||
| if destination.is_dir(): | ||||||
| destination = Path(destination, object_name) | ||||||
| if destination.is_file() and not exists_ok: | ||||||
| raise FileExistsError(f"File already exists in {destination}.") | ||||||
|
|
||||||
| destination = destination.expanduser() | ||||||
| destination.parent.mkdir(parents=True, exist_ok=True) | ||||||
|
|
||||||
| if proxy == "auto": | ||||||
| resolved_proxies = requests.utils.get_environ_proxies(parsed_url.geturl()) | ||||||
| proxy = requests.utils.select_proxy(parsed_url.geturl(), resolved_proxies) # type: ignore | ||||||
|
|
||||||
| proxy_client = ProxyManager(proxy) if proxy else None | ||||||
|
|
||||||
| client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client) | ||||||
| try: | ||||||
| client.fget_object( | ||||||
| bucket_name=bucket, | ||||||
| object_name=object_name, | ||||||
| file_path=str(destination), | ||||||
| progress=ProgressBar() if openml.config.show_progress else None, | ||||||
| request_headers=self.headers, | ||||||
| ) | ||||||
| if destination.is_file() and destination.suffix == ".zip": | ||||||
| with zipfile.ZipFile(destination, "r") as zip_ref: | ||||||
| zip_ref.extractall(destination.parent) | ||||||
|
|
||||||
| except minio.error.S3Error as e: | ||||||
| if e.message is not None and e.message.startswith("Object does not exist"): | ||||||
| raise FileNotFoundError(f"Object at '{source}' does not exist.") from e | ||||||
| # e.g. permission error, or a bucket does not exist (which is also interpreted as a | ||||||
| # permission error on minio level). | ||||||
| raise FileNotFoundError("Bucket does not exist or is private.") from e | ||||||
|
|
||||||
| return destination | ||||||
|
|
||||||
| def download_minio_bucket(self, source: str, destination: str | Path | None = None) -> None: | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as suggested above
Suggested change
|
||||||
| """Download file ``source`` from a MinIO Bucket and store it at ``destination``. | ||||||
| Does not redownload files which already exist. | ||||||
| Parameters | ||||||
| ---------- | ||||||
| source : str | ||||||
| URL to a MinIO bucket. | ||||||
| destination : str | Path, optional | ||||||
| Path to a directory to store the bucket content in; if None, a location inside the OpenML cache is used. | ||||||
| """ | ||||||
| destination = self._get_path(source) if destination is None else Path(destination) | ||||||
| parsed_url = urllib.parse.urlparse(source) | ||||||
| if destination.suffix: | ||||||
| destination = destination.parent | ||||||
| # expect path format: /BUCKET/path/to/file.ext | ||||||
| _, bucket, *prefixes, _ = parsed_url.path.split("/") | ||||||
| prefix = "/".join(prefixes) | ||||||
|
|
||||||
| client = minio.Minio(endpoint=parsed_url.netloc, secure=False) | ||||||
|
|
||||||
| for file_object in client.list_objects(bucket, prefix=prefix, recursive=True): | ||||||
| if file_object.object_name is None: | ||||||
| raise ValueError(f"Object name is None for object {file_object!r}") | ||||||
| if file_object.etag is None: | ||||||
| raise ValueError(f"Object etag is None for object {file_object!r}") | ||||||
|
|
||||||
| marker = destination / file_object.etag | ||||||
| if marker.exists(): | ||||||
| continue | ||||||
|
|
||||||
| file_destination = destination / file_object.object_name.rsplit("/", 1)[1] | ||||||
| if (file_destination.parent / file_destination.stem).exists(): | ||||||
| # A missing marker together with an existing archive means the server archive changed, | ||||||
| # so force a refresh. | ||||||
| shutil.rmtree(file_destination.parent / file_destination.stem) | ||||||
|
|
||||||
| with contextlib.suppress(FileExistsError): | ||||||
| self.download_minio_file( | ||||||
| source=source.rsplit("/", 1)[0] | ||||||
| + "/" | ||||||
| + file_object.object_name.rsplit("/", 1)[1], | ||||||
| destination=file_destination, | ||||||
| exists_ok=False, | ||||||
| ) | ||||||
|
|
||||||
| if file_destination.is_file() and file_destination.suffix == ".zip": | ||||||
| file_destination.unlink() | ||||||
| marker.touch() | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,8 +3,13 @@ | |
| import builtins | ||
| from abc import abstractmethod | ||
| from collections.abc import Iterable | ||
| from typing import TYPE_CHECKING, Any | ||
| from pathlib import Path | ||
| from typing import TYPE_CHECKING, Any, Literal | ||
|
|
||
| if TYPE_CHECKING: | ||
| import pandas as pd | ||
|
|
||
| from openml.datasets.dataset import OpenMLDataFeature, OpenMLDataset | ||
| from openml.enums import ResourceType | ||
|
|
||
| from .base import ResourceAPI | ||
|
|
@@ -21,6 +26,110 @@ class DatasetAPI(ResourceAPI): | |
|
|
||
| resource_type: ResourceType = ResourceType.DATASET | ||
|
|
||
| @abstractmethod | ||
| def get( # noqa: PLR0913 | ||
| self, | ||
| dataset_id: int, | ||
| download_data: bool = False, # noqa: FBT002 | ||
| cache_format: Literal["pickle", "feather"] = "pickle", | ||
| download_qualities: bool = False, # noqa: FBT002 | ||
| download_features_meta_data: bool = False, # noqa: FBT002 | ||
| download_all_files: bool = False, # noqa: FBT002 | ||
| force_refresh_cache: bool = False, # noqa: FBT002 | ||
| ) -> OpenMLDataset: ... | ||
|
|
||
| @abstractmethod | ||
| def list( | ||
| self, | ||
| limit: int, | ||
| offset: int, | ||
| *, | ||
| data_id: list[int] | None = None, # type: ignore | ||
| **kwargs: Any, | ||
| ) -> pd.DataFrame: ... | ||
|
Comment on lines
+42
to
+49
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we not have same signature for all 3 methods:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh that v2 signature was experimental, idk how pre-commits did not catch that, Will make them same
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes unused parameters are caught under #ARG001 as seen in the |
||
|
|
||
| @abstractmethod | ||
| def edit( # noqa: PLR0913 | ||
| self, | ||
| dataset_id: int, | ||
| description: str | None = None, | ||
| creator: str | None = None, | ||
| contributor: str | None = None, | ||
| collection_date: str | None = None, | ||
| language: str | None = None, | ||
| default_target_attribute: str | None = None, | ||
| ignore_attribute: str | list[str] | None = None, # type: ignore | ||
| citation: str | None = None, | ||
| row_id_attribute: str | None = None, | ||
| original_data_url: str | None = None, | ||
| paper_url: str | None = None, | ||
| ) -> int: ... | ||
|
|
||
| @abstractmethod | ||
| def fork(self, dataset_id: int) -> int: ... | ||
|
|
||
| @abstractmethod | ||
| def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: ... | ||
|
|
||
| @abstractmethod | ||
| def list_qualities(self) -> builtins.list[str]: ... | ||
|
|
||
| @abstractmethod | ||
| def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ... | ||
|
|
||
| @abstractmethod | ||
| def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ... | ||
|
|
||
| @abstractmethod | ||
| def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: ... | ||
|
|
||
| @abstractmethod | ||
| def get_qualities(self, dataset_id: int) -> dict[str, float] | None: ... | ||
|
|
||
| @abstractmethod | ||
| def parse_features_file( | ||
| self, features_file: Path, features_pickle_file: Path | ||
| ) -> dict[int, OpenMLDataFeature]: ... | ||
|
|
||
| @abstractmethod | ||
| def parse_qualities_file( | ||
| self, qualities_file: Path, qualities_pickle_file: Path | ||
| ) -> dict[str, float]: ... | ||
|
|
||
| @abstractmethod | ||
| def _download_file(self, url_ext: str) -> Path: ... | ||
|
|
||
| @abstractmethod | ||
| def download_features_file(self, dataset_id: int) -> Path: ... | ||
|
|
||
| @abstractmethod | ||
| def download_qualities_file(self, dataset_id: int) -> Path: ... | ||
|
|
||
| @abstractmethod | ||
| def download_dataset_parquet( | ||
| self, | ||
| description: dict | OpenMLDataset, | ||
| download_all_files: bool = False, # noqa: FBT002 | ||
| ) -> Path | None: ... | ||
|
|
||
| @abstractmethod | ||
| def download_dataset_arff( | ||
| self, | ||
| description: dict | OpenMLDataset, | ||
| ) -> Path: ... | ||
|
|
||
| @abstractmethod | ||
| def add_topic(self, dataset_id: int, topic: str) -> int: ... | ||
|
|
||
| @abstractmethod | ||
| def delete_topic(self, dataset_id: int, topic: str) -> int: ... | ||
|
|
||
| @abstractmethod | ||
| def get_online_dataset_format(self, dataset_id: int) -> str: ... | ||
|
|
||
| @abstractmethod | ||
| def get_online_dataset_arff(self, dataset_id: int) -> str | None: ... | ||
|
|
||
|
|
||
| class TaskAPI(ResourceAPI): | ||
| """Abstract API interface for task resources.""" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this should be made public and probably give a better name? and we could directly use this method to find where the path could look for a particular url
use-case is in tests