-
-
Notifications
You must be signed in to change notification settings - Fork 272
[ENH] V1 → V2 API Migration - datasets #1608
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
567eca4
d23790b
95e8890
16c9251
23fe19b
be29dc9
b2287c3
b7e285e
fd43c48
f4aab6b
d43cf86
3e323ed
9195fa6
5342eec
adc0e74
bfb2d3e
707e1f1
cabaecf
79cf49c
5c8791a
85c1113
39bf86a
7b66677
d2224c4
9608c36
f6bc7f7
baa3a38
29c93d1
7674b3a
ddb0774
52b93fe
ec9477f
cea6188
3d4e84d
839bd33
8417349
10d134a
935f0f4
9514df8
09f9ad6
0b52427
53bee94
33b4ca0
a6b9a45
f924b32
541b0f2
acb173f
3e8d1f0
f83bdb5
969c7d8
2a42712
001caad
fb38a2d
03c4ca9
8d708fd
4f75bba
164f66f
c4dae43
36c20a2
ab3c1eb
0fc3c74
741a66b
81dff8d
27ac86f
2a488ca
cbc7194
599c7e1
2867862
f09f3cd
5f731ce
bad7842
aefdb38
0f40b02
7ac1672
6ac1dfe
62924c9
27696bb
190face
95daaa6
7841ea8
cc515aa
e6a92df
1b8c22a
fc839a6
1c922af
ffa9ce9
a7b2d21
27fe790
8965112
72ea1a4
a696c49
755636d
d07af34
2d9c8ec
002b989
045d896
c437966
e27470a
d04d956
9263f7f
79dea29
f6497c2
dce7f54
40dd460
0fc917c
3d86b18
aba3d3e
d99d54d
dc22e3a
7318573
29ef187
cf94c89
298fbda
9870502
33065c2
76b92bb
419edcb
cb6d937
8544c8a
d4c413b
fab1a15
6392be8
276324a
73f7594
2ee7fa3
4f37607
c74754a
2473208
7afb0e3
3b96559
ea80785
83a2e80
9eb6c90
4be5bbd
9027c01
c1efdeb
dd048d5
98041ed
e5461a9
7d899a9
8587414
23a3450
ac28f82
4a66245
c762fb4
77c21f2
eac24fc
2ed65fe
f3b07de
29db3f1
3b4e538
8ac886b
305f4f0
b2bf164
e97e6c2
c66d73c
aa54e8e
2d452d3
c235812
39eb823
50eed37
7a000eb
79f6187
b1a9e7f
d716ecf
3c29e71
b4ff0b2
93155ee
d3cc9a7
a6b82f4
3419973
8de99b7
d0202b0
0fa9e3b
7d61107
1ecbbba
65472ed
9219266
44b48b5
11b19de
4df12d3
77d2af2
04bc83b
f926092
8072e34
47464e9
f059e71
b6d5e31
4ee28f1
509b4c3
6385597
5fea9c9
4b43003
f01db35
e10d776
ba7edd8
f003425
713356e
92bc246
f9dddac
b90e7c4
4f3ec74
98616db
ed35e69
4af9cbe
1c4f946
8c1c205
0d99b8d
c0871f3
6791fb6
4164607
55f13ad
d6fe96a
c0b3377
8d37464
e79bb91
db74277
fac0240
95c68c6
2b7df47
b5836b9
5e34368
17fc002
55b3f11
8fe5941
37526bb
3996bdd
b3e9ab1
ac3b903
94ed2a6
df8b4b8
3778204
b6c4b91
2edd5d0
8d88959
5d104c1
9dffaae
fe7cf96
7ae536a
4fc0616
c1c5544
140830e
af8810d
959d56b
4719013
b33a895
e2eddc6
9f73cca
28c9946
b499d7c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,8 +1,17 @@ | ||||||
| from __future__ import annotations | ||||||
|
|
||||||
| import contextlib | ||||||
| import shutil | ||||||
| import urllib | ||||||
| import zipfile | ||||||
| from pathlib import Path | ||||||
|
|
||||||
| import minio | ||||||
| import requests | ||||||
| from urllib3 import ProxyManager | ||||||
|
|
||||||
| import openml | ||||||
| from openml.utils import ProgressBar | ||||||
|
|
||||||
|
|
||||||
| class MinIOClient: | ||||||
|
|
@@ -16,13 +25,135 @@ class MinIOClient: | |||||
| Attributes | ||||||
| ---------- | ||||||
| path : pathlib.Path or None | ||||||
| path : pathlib.Path | ||||||
| Configured base path for storage operations. | ||||||
| headers : dict of str to str | ||||||
| Default HTTP headers, including a user-agent identifying the | ||||||
| OpenML Python client version. | ||||||
| """ | ||||||
|
|
||||||
| @property | ||||||
| def headers(self) -> dict[str, str]: | ||||||
| return openml.config._HEADERS | ||||||
|
|
||||||
| @property | ||||||
| def path(self) -> Path: | ||||||
| return Path(openml.config.get_cache_directory()) | ||||||
|
|
||||||
| def _get_path(self, url: str) -> Path: | ||||||
| parsed_url = urllib.parse.urlparse(url) | ||||||
| return self.path / "minio" / parsed_url.path.lstrip("/") | ||||||
|
|
||||||
| def download_minio_file( | ||||||
| self, | ||||||
| source: str, | ||||||
| destination: str | Path | None = None, | ||||||
| exists_ok: bool = True, # noqa: FBT002 | ||||||
| proxy: str | None = "auto", | ||||||
| ) -> Path: | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we can remove these parameters: |
||||||
| """Download file ``source`` from a MinIO Bucket and store it at ``destination``. | ||||||
| Parameters | ||||||
| ---------- | ||||||
| source : str | ||||||
| URL to a file in a MinIO bucket. | ||||||
| destination : str | Path, optional | ||||||
| Path to store the file to; if None, a location inside the OpenML cache is used. If a directory is provided, the original filename is used. | ||||||
| exists_ok : bool, optional (default=True) | ||||||
| If False, raise FileExists if a file already exists in ``destination``. | ||||||
| proxy : str, optional (default="auto") | ||||||
| The proxy server to use. By default it is "auto", which uses ``requests`` to | ||||||
| automatically find the proxy to use. Pass None or set the environment variable | ||||||
| ``no_proxy="*"`` to disable proxies. | ||||||
| """ | ||||||
| destination = self._get_path(source) if destination is None else Path(destination) | ||||||
| parsed_url = urllib.parse.urlparse(source) | ||||||
|
|
||||||
| # expect path format: /BUCKET/path/to/file.ext | ||||||
| bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1) | ||||||
| if destination.is_dir(): | ||||||
| destination = Path(destination, object_name) | ||||||
| if destination.is_file() and not exists_ok: | ||||||
| raise FileExistsError(f"File already exists in {destination}.") | ||||||
|
|
||||||
| destination = destination.expanduser() | ||||||
| destination.parent.mkdir(parents=True, exist_ok=True) | ||||||
|
|
||||||
| if proxy == "auto": | ||||||
| resolved_proxies = requests.utils.get_environ_proxies(parsed_url.geturl()) | ||||||
| proxy = requests.utils.select_proxy(parsed_url.geturl(), resolved_proxies) # type: ignore | ||||||
|
|
||||||
| proxy_client = ProxyManager(proxy) if proxy else None | ||||||
|
|
||||||
| client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client) | ||||||
| try: | ||||||
| client.fget_object( | ||||||
| bucket_name=bucket, | ||||||
| object_name=object_name, | ||||||
| file_path=str(destination), | ||||||
| progress=ProgressBar() if openml.config.show_progress else None, | ||||||
| request_headers=self.headers, | ||||||
| ) | ||||||
| if destination.is_file() and destination.suffix == ".zip": | ||||||
| with zipfile.ZipFile(destination, "r") as zip_ref: | ||||||
| zip_ref.extractall(destination.parent) | ||||||
|
|
||||||
| except minio.error.S3Error as e: | ||||||
| if e.message is not None and e.message.startswith("Object does not exist"): | ||||||
| raise FileNotFoundError(f"Object at '{source}' does not exist.") from e | ||||||
| # e.g. permission error, or a bucket does not exist (which is also interpreted as a | ||||||
| # permission error on minio level). | ||||||
| raise FileNotFoundError("Bucket does not exist or is private.") from e | ||||||
|
|
||||||
| return destination | ||||||
|
|
||||||
| def download_minio_bucket(self, source: str, destination: str | Path | None = None) -> None: | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as suggested above
Suggested change
|
||||||
| """Download file ``source`` from a MinIO Bucket and store it at ``destination``. | ||||||
| Does not redownload files which already exist. | ||||||
| Parameters | ||||||
| ---------- | ||||||
| source : str | ||||||
| URL to a MinIO bucket. | ||||||
| destination : str | Path, optional | ||||||
| Path to a directory to store the bucket content in; if None, a location inside the OpenML cache is used. | ||||||
| """ | ||||||
| destination = self._get_path(source) if destination is None else Path(destination) | ||||||
| parsed_url = urllib.parse.urlparse(source) | ||||||
| if destination.suffix: | ||||||
| destination = destination.parent | ||||||
| # expect path format: /BUCKET/path/to/file.ext | ||||||
| _, bucket, *prefixes, _ = parsed_url.path.split("/") | ||||||
| prefix = "/".join(prefixes) | ||||||
|
|
||||||
| client = minio.Minio(endpoint=parsed_url.netloc, secure=False) | ||||||
|
|
||||||
| for file_object in client.list_objects(bucket, prefix=prefix, recursive=True): | ||||||
| if file_object.object_name is None: | ||||||
| raise ValueError(f"Object name is None for object {file_object!r}") | ||||||
| if file_object.etag is None: | ||||||
| raise ValueError(f"Object etag is None for object {file_object!r}") | ||||||
|
|
||||||
| marker = destination / file_object.etag | ||||||
| if marker.exists(): | ||||||
| continue | ||||||
|
|
||||||
| file_destination = destination / file_object.object_name.rsplit("/", 1)[1] | ||||||
| if (file_destination.parent / file_destination.stem).exists(): | ||||||
| # A missing marker together with an existing archive means the server archive changed, | ||||||
| # so force a refresh. | ||||||
| shutil.rmtree(file_destination.parent / file_destination.stem) | ||||||
|
|
||||||
| with contextlib.suppress(FileExistsError): | ||||||
| self.download_minio_file( | ||||||
| source=source.rsplit("/", 1)[0] | ||||||
| + "/" | ||||||
| + file_object.object_name.rsplit("/", 1)[1], | ||||||
| destination=file_destination, | ||||||
| exists_ok=False, | ||||||
| ) | ||||||
|
|
||||||
| if file_destination.is_file() and file_destination.suffix == ".zip": | ||||||
| file_destination.unlink() | ||||||
| marker.touch() | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,8 +3,13 @@ | |
| import builtins | ||
| from abc import abstractmethod | ||
| from collections.abc import Iterable | ||
| from typing import TYPE_CHECKING, Any | ||
| from pathlib import Path | ||
| from typing import TYPE_CHECKING, Any, Literal | ||
|
|
||
| if TYPE_CHECKING: | ||
| import pandas as pd | ||
|
|
||
| from openml.datasets.dataset import OpenMLDataFeature, OpenMLDataset | ||
| from openml.enums import ResourceType | ||
|
|
||
| from .base import ResourceAPI | ||
|
|
@@ -21,6 +26,110 @@ class DatasetAPI(ResourceAPI): | |
|
|
||
| resource_type: ResourceType = ResourceType.DATASET | ||
|
|
||
| @abstractmethod | ||
| def get( # noqa: PLR0913 | ||
| self, | ||
| dataset_id: int, | ||
| download_data: bool = False, # noqa: FBT002 | ||
| cache_format: Literal["pickle", "feather"] = "pickle", | ||
| download_qualities: bool = False, # noqa: FBT002 | ||
| download_features_meta_data: bool = False, # noqa: FBT002 | ||
| download_all_files: bool = False, # noqa: FBT002 | ||
| force_refresh_cache: bool = False, # noqa: FBT002 | ||
| ) -> OpenMLDataset: ... | ||
|
|
||
| @abstractmethod | ||
| def list( | ||
| self, | ||
| limit: int, | ||
| offset: int, | ||
| *, | ||
| data_id: list[int] | None = None, # type: ignore | ||
| **kwargs: Any, | ||
| ) -> pd.DataFrame: ... | ||
|
Comment on lines
+42
to
+49
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we not have same signature for all 3 methods:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh that v2 signature was experimental, idk how pre-commits did not catch that, Will make them same
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes unused parameters are caught under #ARG001 as seen in the |
||
|
|
||
| @abstractmethod | ||
| def edit( # noqa: PLR0913 | ||
| self, | ||
| dataset_id: int, | ||
| description: str | None = None, | ||
| creator: str | None = None, | ||
| contributor: str | None = None, | ||
| collection_date: str | None = None, | ||
| language: str | None = None, | ||
| default_target_attribute: str | None = None, | ||
| ignore_attribute: str | list[str] | None = None, # type: ignore | ||
| citation: str | None = None, | ||
| row_id_attribute: str | None = None, | ||
| original_data_url: str | None = None, | ||
| paper_url: str | None = None, | ||
| ) -> int: ... | ||
|
|
||
| @abstractmethod | ||
| def fork(self, dataset_id: int) -> int: ... | ||
|
|
||
| @abstractmethod | ||
| def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: ... | ||
|
|
||
| @abstractmethod | ||
| def list_qualities(self) -> builtins.list[str]: ... | ||
|
|
||
| @abstractmethod | ||
| def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ... | ||
|
|
||
| @abstractmethod | ||
| def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ... | ||
|
|
||
| @abstractmethod | ||
| def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: ... | ||
|
|
||
| @abstractmethod | ||
| def get_qualities(self, dataset_id: int) -> dict[str, float] | None: ... | ||
|
|
||
| @abstractmethod | ||
| def parse_features_file( | ||
| self, features_file: Path, features_pickle_file: Path | ||
| ) -> dict[int, OpenMLDataFeature]: ... | ||
|
|
||
| @abstractmethod | ||
| def parse_qualities_file( | ||
| self, qualities_file: Path, qualities_pickle_file: Path | ||
| ) -> dict[str, float]: ... | ||
|
|
||
| @abstractmethod | ||
| def _download_file(self, url_ext: str) -> Path: ... | ||
|
|
||
| @abstractmethod | ||
| def download_features_file(self, dataset_id: int) -> Path: ... | ||
|
|
||
| @abstractmethod | ||
| def download_qualities_file(self, dataset_id: int) -> Path: ... | ||
|
|
||
| @abstractmethod | ||
| def download_dataset_parquet( | ||
| self, | ||
| description: dict | OpenMLDataset, | ||
| download_all_files: bool = False, # noqa: FBT002 | ||
| ) -> Path | None: ... | ||
|
|
||
| @abstractmethod | ||
| def download_dataset_arff( | ||
| self, | ||
| description: dict | OpenMLDataset, | ||
| ) -> Path: ... | ||
|
|
||
| @abstractmethod | ||
| def add_topic(self, dataset_id: int, topic: str) -> int: ... | ||
|
|
||
| @abstractmethod | ||
| def delete_topic(self, dataset_id: int, topic: str) -> int: ... | ||
|
|
||
| @abstractmethod | ||
| def get_online_dataset_format(self, dataset_id: int) -> str: ... | ||
|
|
||
| @abstractmethod | ||
| def get_online_dataset_arff(self, dataset_id: int) -> str | None: ... | ||
|
|
||
|
|
||
| class TaskAPI(ResourceAPI): | ||
| """Abstract API interface for task resources.""" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this should be made public and probably give a better name? and we could directly use this method to find where the path could look for a particular url
use-case is in tests