diff --git a/openml/_api/clients/http.py b/openml/_api/clients/http.py
index 27eeaac22..2b80023fd 100644
--- a/openml/_api/clients/http.py
+++ b/openml/_api/clients/http.py
@@ -12,6 +12,7 @@
from typing import Any, cast
from urllib.parse import urlencode, urljoin, urlparse
+import arff
import requests
import xmltodict
from requests import Response
@@ -98,16 +99,32 @@ def _get_body_filename_from_response(self, response: Response) -> str:
if "text/xml" in content_type:
return "body.xml"
+ if response.content.startswith(b"PK\x03\x04"):
+ return "body.zip"
+
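+        # Content sniffing fallback: liac-arff raises ArffException on
+        # non-ARFF text, so a successful parse marks the body as ARFF.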
+ try:
+ arff.loads(response.text)
+ return "body.arff"
+ except arff.ArffException:
+ pass
+
return "body.txt"
def _get_body_filename_from_path(self, path: Path) -> str:
- if (path / "body.json").exists():
- return "body.json"
+ candidates = []
+ for p in path.iterdir():
+ if p.name.startswith("body.") and len(p.suffixes) == 1:
+ candidates.append(p)
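+        # Exactly one body.* file is expected per cache entry; zero or
+        # several indicate an incomplete or ambiguous cache directory.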
- if (path / "body.xml").exists():
- return "body.xml"
+ if not candidates:
+ raise FileNotFoundError(f"No body file found in path: {path}")
- return "body.txt"
+ if len(candidates) > 1:
+ raise FileNotFoundError(
+ f"Multiple body files found in path: {path} ({[p.name for p in candidates]})"
+ )
+
+ return candidates[0].name
def load(self, key: str) -> Response:
"""
@@ -132,6 +149,9 @@ def load(self, key: str) -> Response:
"""
path = self._key_to_path(key)
+ if not path.exists():
+ raise FileNotFoundError(f"Cache path not found: {path}")
+
meta_path = path / "meta.json"
meta_raw = meta_path.read_bytes() if meta_path.exists() else "{}"
meta = json.loads(meta_raw)
@@ -141,8 +161,6 @@ def load(self, key: str) -> Response:
headers = json.loads(headers_raw)
body_path = path / self._get_body_filename_from_path(path)
- if not body_path.exists():
- raise FileNotFoundError(f"Incomplete cache at {body_path}")
body = body_path.read_bytes()
response = Response()
@@ -825,3 +843,9 @@ def write_to_file(response: Response, path: Path, encoding: str) -> None:
handler = handler or write_to_file
handler(response, file_path, encoding)
return file_path
+
+ def cache_path_from_url(self, url: str) -> Path:
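+        """Return the local cache file holding the body for ``url``.
+
+        Assumes the response was cached with empty query parameters; the
+        body lookup raises FileNotFoundError if the entry is missing.
+        """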
+ full_url = urljoin(self.server, url)
+ key = self.cache.get_key(full_url, params={})
+ path = self.cache._key_to_path(key)
+ return path / self.cache._get_body_filename_from_path(path)
diff --git a/openml/_api/resources/base/resources.py b/openml/_api/resources/base/resources.py
index 0c60e69de..301483f25 100644
--- a/openml/_api/resources/base/resources.py
+++ b/openml/_api/resources/base/resources.py
@@ -10,10 +10,13 @@
from .base import ResourceAPI
if TYPE_CHECKING:
+ import pandas as pd
+
from openml.estimation_procedures import OpenMLEstimationProcedure
- from openml.evaluations import OpenMLEvaluation
+ from openml.evaluations.evaluation import OpenMLEvaluation
from openml.flows.flow import OpenMLFlow
from openml.setups.setup import OpenMLSetup
+ from openml.tasks.task import OpenMLTask, TaskType
class DatasetAPI(ResourceAPI):
@@ -27,6 +30,49 @@ class TaskAPI(ResourceAPI):
resource_type: ResourceType = ResourceType.TASK
+ @abstractmethod
+ def get(
+ self,
+ task_id: int,
+ ) -> OpenMLTask:
+ """
+ API v1:
+ GET /task/{task_id}
+
+ API v2:
+ GET /tasks/{task_id}
+ """
+ ...
+
+ @abstractmethod
+ def supports_download_splits(self) -> bool:
+ """Return whether the task API implementation supports split downloads."""
+ ...
+
+ # Task listing (V1 only)
+ @abstractmethod
+ def list(
+ self,
+ limit: int,
+ offset: int,
+ task_type: TaskType | int | None = None,
+ **kwargs: Any,
+ ) -> pd.DataFrame:
+ """
+ List tasks with filters.
+
+ API v1:
+ GET /task/list
+
+ API v2:
+ Not available.
+
+ Returns
+ -------
+ pandas.DataFrame
+ """
+ ...
+
class EvaluationMeasureAPI(ResourceAPI):
"""Abstract API interface for evaluation measure resources."""
diff --git a/openml/_api/resources/task.py b/openml/_api/resources/task.py
index 1f62aa3f3..3b6f504b9 100644
--- a/openml/_api/resources/task.py
+++ b/openml/_api/resources/task.py
@@ -1,11 +1,353 @@
from __future__ import annotations
+import warnings
+from typing import Any
+
+import pandas as pd
+import xmltodict
+
+from openml.tasks.functions import _get_estimation_procedure_list
+from openml.tasks.task import (
+ OpenMLClassificationTask,
+ OpenMLClusteringTask,
+ OpenMLLearningCurveTask,
+ OpenMLRegressionTask,
+ OpenMLTask,
+ TaskType,
+)
+
from .base import ResourceV1API, ResourceV2API, TaskAPI
+def _create_task_from_xml(xml: str) -> OpenMLTask:
+ """Create a task given a xml string.
+
+ Parameters
+ ----------
+ xml : string
+        Task XML representation.
+
+ Returns
+ -------
+ OpenMLTask
+ """
+ dic = xmltodict.parse(xml)["oml:task"]
+ estimation_parameters = {}
+ inputs = {}
+ # Due to the unordered structure we obtain, we first have to extract
+ # the possible keys of oml:input; dic["oml:input"] is a list of
+ # OrderedDicts
+
+ # Check if there is a list of inputs
+ if isinstance(dic["oml:input"], list):
+ for input_ in dic["oml:input"]:
+ name = input_["@name"]
+ inputs[name] = input_
+ # Single input case
+ elif isinstance(dic["oml:input"], dict):
+ name = dic["oml:input"]["@name"]
+ inputs[name] = dic["oml:input"]
+
+ evaluation_measures = None
+ if "evaluation_measures" in inputs:
+ evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][
+ "oml:evaluation_measure"
+ ]
+
+ task_type = TaskType(int(dic["oml:task_type_id"]))
+ common_kwargs = {
+ "task_id": dic["oml:task_id"],
+ "task_type": dic["oml:task_type"],
+ "task_type_id": task_type,
+ "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
+ "evaluation_measure": evaluation_measures,
+ }
+ # TODO: add OpenMLClusteringTask?
+ if task_type in (
+ TaskType.SUPERVISED_CLASSIFICATION,
+ TaskType.SUPERVISED_REGRESSION,
+ TaskType.LEARNING_CURVE,
+ ):
+ # Convert some more parameters
+ for parameter in inputs["estimation_procedure"]["oml:estimation_procedure"][
+ "oml:parameter"
+ ]:
+ name = parameter["@name"]
+ text = parameter.get("#text", "")
+ estimation_parameters[name] = text
+
+ common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][
+ "oml:estimation_procedure"
+ ]["oml:type"]
+ common_kwargs["estimation_procedure_id"] = int(
+ inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"]
+ )
+
+ common_kwargs["estimation_parameters"] = estimation_parameters
+ common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"]["oml:target_feature"]
+ common_kwargs["data_splits_url"] = inputs["estimation_procedure"][
+ "oml:estimation_procedure"
+ ]["oml:data_splits_url"]
+
+ cls = {
+ TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
+ TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
+ TaskType.CLUSTERING: OpenMLClusteringTask,
+ TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
+ }.get(task_type)
+ if cls is None:
+ raise NotImplementedError(f"Task type {common_kwargs['task_type']} not supported.")
+ return cls(**common_kwargs) # type: ignore
+
+
+def _build_url(
+ limit: int, offset: int, task_type: TaskType | int | None, kwargs: dict[str, Any]
+) -> str:
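+    """Assemble a v1 listing call, e.g. ``task/list/limit/5/offset/0/type/1``."""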
+ api_call = "task/list"
+ if limit is not None:
+ api_call += f"/limit/{limit}"
+ if offset is not None:
+ api_call += f"/offset/{offset}"
+ if task_type is not None:
+ tvalue = task_type.value if isinstance(task_type, TaskType) else task_type
+ api_call += f"/type/{tvalue}"
+ if kwargs is not None:
+ for operator, value in kwargs.items():
+ if value is not None:
+ if operator == "task_id":
+ value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901
+ api_call += f"/{operator}/{value}"
+ return api_call
+
+
class TaskV1API(ResourceV1API, TaskAPI):
- """Version 1 API implementation for task resources."""
+ def get(self, task_id: int) -> OpenMLTask:
+ """Download OpenML task for a given task ID.
+
+ Downloads the task representation.
+
+ Parameters
+ ----------
+ task_id : int
+ The OpenML task id of the task to download.
+
+ Returns
+ -------
+ task: OpenMLTask
+ """
+ if not isinstance(task_id, int):
+ raise TypeError(f"Task id should be integer, is {type(task_id)}")
+
+ response = self._http.get(f"task/{task_id}", enable_cache=True)
+ return _create_task_from_xml(response.text)
+
+ def supports_download_splits(self) -> bool:
+ return True
+
+ def list(
+ self,
+ limit: int,
+ offset: int,
+ task_type: TaskType | int | None = None,
+ **kwargs: Any,
+ ) -> pd.DataFrame:
+ """
+ Perform the api call to return a number of tasks having the given filters.
+
+ Parameters
+ ----------
+ Filter task_type is separated from the other filters because
+ it is used as task_type in the task description, but it is named
+ type when used as a filter in list tasks call.
+ limit: int
+ offset: int
+ task_type : TaskType, optional
+ Refers to the type of task.
+ kwargs: dict, optional
+ Legal filter operators: tag, task_id (list), data_tag, status, limit,
+ offset, data_id, data_name, number_instances, number_features,
+ number_classes, number_missing_values.
+
+ Returns
+ -------
+ dataframe
+ """
+ api_call = _build_url(limit, offset, task_type, kwargs)
+ return self._parse_list_xml(api_call=api_call)
+
+ def _parse_list_xml(self, api_call: str) -> pd.DataFrame: # noqa: C901, PLR0912
+ """Returns a Pandas DataFrame with information about OpenML tasks.
+
+ Parameters
+ ----------
+ api_call : str
+ The API call specifying which tasks to return.
+
+ Returns
+ -------
+ A Pandas DataFrame with information about OpenML tasks.
+
+ Raises
+ ------
+ ValueError
+ If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml',
+ or has an incorrect value for '@xmlns:oml'.
+ KeyError
+ If an invalid key is found in the XML for a task.
+ """
+ xml_string = self._http.get(api_call).text
+
+ tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
+ # Minimalistic check if the XML is useful
+ if "oml:tasks" not in tasks_dict:
+ raise ValueError(f'Error in return XML, does not contain "oml:runs": {tasks_dict}')
+
+ if "@xmlns:oml" not in tasks_dict["oml:tasks"]:
+ raise ValueError(
+ f'Error in return XML, does not contain "oml:runs"/@xmlns:oml: {tasks_dict}'
+ )
+
+ if tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml":
+ raise ValueError(
+ "Error in return XML, value of "
+ '"oml:runs"/@xmlns:oml is not '
+ f'"http://openml.org/openml": {tasks_dict!s}',
+ )
+
+ assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"])
+
+ tasks = {}
+ procs = _get_estimation_procedure_list()
+ proc_dict = {x["id"]: x for x in procs}
+
+ for task_ in tasks_dict["oml:tasks"]["oml:task"]:
+ tid = None
+ try:
+ tid = int(task_["oml:task_id"])
+ task_type_int = int(task_["oml:task_type_id"])
+ try:
+ task_type_id = TaskType(task_type_int)
+ except ValueError as e:
+ warnings.warn(
+ f"Could not create task type id for {task_type_int} due to error {e}",
+ RuntimeWarning,
+ stacklevel=2,
+ )
+ continue
+
+ task = {
+ "tid": tid,
+ "ttid": task_type_id,
+ "did": int(task_["oml:did"]),
+ "name": task_["oml:name"],
+ "task_type": task_["oml:task_type"],
+ "status": task_["oml:status"],
+ }
+
+ # Other task inputs
+ for _input in task_.get("oml:input", []):
+ if _input["@name"] == "estimation_procedure":
+ task[_input["@name"]] = proc_dict[int(_input["#text"])]["name"]
+ else:
+ value = _input.get("#text")
+ task[_input["@name"]] = value
+
+ # The number of qualities can range from 0 to infinity
+ for quality in task_.get("oml:quality", []):
+ if "#text" not in quality:
+ quality_value = 0.0
+ else:
+ quality["#text"] = float(quality["#text"])
+ if abs(int(quality["#text"]) - quality["#text"]) < 0.0000001:
+ quality["#text"] = int(quality["#text"])
+ quality_value = quality["#text"]
+ task[quality["@name"]] = quality_value
+ tasks[tid] = task
+ except KeyError as e:
+ if tid is not None:
+ warnings.warn(
+ f"Invalid xml for task {tid}: {e}\nFrom {task_}",
+ RuntimeWarning,
+ stacklevel=2,
+ )
+ else:
+ warnings.warn(
+ f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2
+ )
+
+ return pd.DataFrame.from_dict(tasks, orient="index")
+
+
+def _create_task_from_json(task_json: dict) -> OpenMLTask:
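+    """Create a task from a v2 JSON payload.
+
+    Assumes ``task_json["input"]`` is a list of named input blocks that
+    mirrors the v1 XML structure.
+    """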
+ task_type_id = TaskType(int(task_json["task_type_id"]))
+
+ inputs = {i["name"]: i for i in task_json.get("input", [])}
+
+ source = inputs["source_data"]["data_set"]
+
+ common_kwargs = {
+ "task_id": int(task_json["id"]),
+ "task_type": task_json["task_type"],
+ "task_type_id": task_type_id,
+ "data_set_id": int(source["data_set_id"]),
+ "evaluation_measure": None,
+ }
+
+ if task_type_id in (
+ TaskType.SUPERVISED_CLASSIFICATION,
+ TaskType.SUPERVISED_REGRESSION,
+ TaskType.LEARNING_CURVE,
+ ):
+ est = inputs.get("estimation_procedure", {}).get("estimation_procedure")
+
+ if est:
+ common_kwargs["estimation_procedure_id"] = int(est["id"])
+ common_kwargs["estimation_procedure_type"] = est["type"]
+ common_kwargs["estimation_parameters"] = {
+ p["name"]: p.get("value") for p in est.get("parameter", [])
+ }
+
+ common_kwargs["target_name"] = source.get("target_feature")
+
+ cls = {
+ TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
+ TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
+ TaskType.CLUSTERING: OpenMLClusteringTask,
+ TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
+ }[task_type_id]
+
+ return cls(**common_kwargs) # type: ignore
class TaskV2API(ResourceV2API, TaskAPI):
- """Version 2 API implementation for task resources."""
+ def get(self, task_id: int) -> OpenMLTask:
+ """Download OpenML task for a given task ID.
+
+ Downloads the task representation.
+
+ Parameters
+ ----------
+ task_id : int
+ The OpenML task id of the task to download.
+
+ Returns
+ -------
+ task: OpenMLTask
+ """
+ response = self._http.get(f"tasks/{task_id}", enable_cache=True)
+ return _create_task_from_json(response.json())
+
+ def list(
+ self,
+ limit: int, # noqa: ARG002
+ offset: int, # noqa: ARG002
+ task_type: TaskType | int | None = None, # noqa: ARG002
+ **kwargs: Any, # noqa: ARG002
+ ) -> pd.DataFrame:
+ raise self._not_supported(method="list")
+
+ def supports_download_splits(self) -> bool:
+ return False
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index 22fb26f9b..0b8aaecf0 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -1,19 +1,14 @@
# License: BSD 3-Clause
from __future__ import annotations
-import os
-import re
import warnings
from functools import partial
-from typing import Any
+from typing import TYPE_CHECKING, Any
import pandas as pd
-import xmltodict
-import openml._api_calls
import openml.utils
from openml.datasets import get_dataset
-from openml.exceptions import OpenMLCacheException
from .task import (
OpenMLClassificationTask,
@@ -21,55 +16,16 @@
OpenMLLearningCurveTask,
OpenMLRegressionTask,
OpenMLSupervisedTask,
- OpenMLTask,
TaskType,
)
+if TYPE_CHECKING:
+ from .task import (
+ OpenMLTask,
+ )
TASKS_CACHE_DIR_NAME = "tasks"
-def _get_cached_tasks() -> dict[int, OpenMLTask]:
- """Return a dict of all the tasks which are cached locally.
-
- Returns
- -------
- tasks : OrderedDict
- A dict of all the cached tasks. Each task is an instance of
- OpenMLTask.
- """
- task_cache_dir = openml.utils._create_cache_directory(TASKS_CACHE_DIR_NAME)
- directory_content = os.listdir(task_cache_dir) # noqa: PTH208
- directory_content.sort()
-
- # Find all dataset ids for which we have downloaded the dataset
- # description
- tids = (int(did) for did in directory_content if re.match(r"[0-9]*", did))
- return {tid: _get_cached_task(tid) for tid in tids}
-
-
-def _get_cached_task(tid: int) -> OpenMLTask:
- """Return a cached task based on the given id.
-
- Parameters
- ----------
- tid : int
- Id of the task.
-
- Returns
- -------
- OpenMLTask
- """
- tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, tid)
-
- task_xml_path = tid_cache_dir / "task.xml"
- try:
- with task_xml_path.open(encoding="utf8") as fh:
- return _create_task_from_xml(fh.read())
- except OSError as e:
- openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir)
- raise OpenMLCacheException(f"Task file for tid {tid} not cached") from e
-
-
def _get_estimation_procedure_list() -> list[dict[str, Any]]:
"""Return a list of all estimation procedures which are on OpenML.
@@ -133,7 +89,7 @@ def list_tasks( # noqa: PLR0913
calculated for the associated dataset, some of these are also returned.
"""
listing_call = partial(
- _list_tasks,
+ openml._backend.task.list,
task_type=task_type,
tag=tag,
data_tag=data_tag,
@@ -152,151 +108,6 @@ def list_tasks( # noqa: PLR0913
return pd.concat(batches)
-def _list_tasks(
- limit: int,
- offset: int,
- task_type: TaskType | int | None = None,
- **kwargs: Any,
-) -> pd.DataFrame:
- """
- Perform the api call to return a number of tasks having the given filters.
-
- Parameters
- ----------
- Filter task_type is separated from the other filters because
- it is used as task_type in the task description, but it is named
- type when used as a filter in list tasks call.
- limit: int
- offset: int
- task_type : TaskType, optional
- Refers to the type of task.
- kwargs: dict, optional
- Legal filter operators: tag, task_id (list), data_tag, status, limit,
- offset, data_id, data_name, number_instances, number_features,
- number_classes, number_missing_values.
-
- Returns
- -------
- dataframe
- """
- api_call = "task/list"
- if limit is not None:
- api_call += f"/limit/{limit}"
- if offset is not None:
- api_call += f"/offset/{offset}"
- if task_type is not None:
- tvalue = task_type.value if isinstance(task_type, TaskType) else task_type
- api_call += f"/type/{tvalue}"
- if kwargs is not None:
- for operator, value in kwargs.items():
- if value is not None:
- if operator == "task_id":
- value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901
- api_call += f"/{operator}/{value}"
-
- return __list_tasks(api_call=api_call)
-
-
-def __list_tasks(api_call: str) -> pd.DataFrame: # noqa: C901, PLR0912
- """Returns a Pandas DataFrame with information about OpenML tasks.
-
- Parameters
- ----------
- api_call : str
- The API call specifying which tasks to return.
-
- Returns
- -------
- A Pandas DataFrame with information about OpenML tasks.
-
- Raises
- ------
- ValueError
- If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml',
- or has an incorrect value for '@xmlns:oml'.
- KeyError
- If an invalid key is found in the XML for a task.
- """
- xml_string = openml._api_calls._perform_api_call(api_call, "get")
- tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
- # Minimalistic check if the XML is useful
- if "oml:tasks" not in tasks_dict:
- raise ValueError(f'Error in return XML, does not contain "oml:runs": {tasks_dict}')
-
- if "@xmlns:oml" not in tasks_dict["oml:tasks"]:
- raise ValueError(
- f'Error in return XML, does not contain "oml:runs"/@xmlns:oml: {tasks_dict}'
- )
-
- if tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml":
- raise ValueError(
- "Error in return XML, value of "
- '"oml:runs"/@xmlns:oml is not '
- f'"http://openml.org/openml": {tasks_dict!s}',
- )
-
- assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"])
-
- tasks = {}
- procs = _get_estimation_procedure_list()
- proc_dict = {x["id"]: x for x in procs}
-
- for task_ in tasks_dict["oml:tasks"]["oml:task"]:
- tid = None
- try:
- tid = int(task_["oml:task_id"])
- task_type_int = int(task_["oml:task_type_id"])
- try:
- task_type_id = TaskType(task_type_int)
- except ValueError as e:
- warnings.warn(
- f"Could not create task type id for {task_type_int} due to error {e}",
- RuntimeWarning,
- stacklevel=2,
- )
- continue
-
- task = {
- "tid": tid,
- "ttid": task_type_id,
- "did": int(task_["oml:did"]),
- "name": task_["oml:name"],
- "task_type": task_["oml:task_type"],
- "status": task_["oml:status"],
- }
-
- # Other task inputs
- for _input in task_.get("oml:input", []):
- if _input["@name"] == "estimation_procedure":
- task[_input["@name"]] = proc_dict[int(_input["#text"])]["name"]
- else:
- value = _input.get("#text")
- task[_input["@name"]] = value
-
- # The number of qualities can range from 0 to infinity
- for quality in task_.get("oml:quality", []):
- if "#text" not in quality:
- quality_value = 0.0
- else:
- quality["#text"] = float(quality["#text"])
- if abs(int(quality["#text"]) - quality["#text"]) < 0.0000001:
- quality["#text"] = int(quality["#text"])
- quality_value = quality["#text"]
- task[quality["@name"]] = quality_value
- tasks[tid] = task
- except KeyError as e:
- if tid is not None:
- warnings.warn(
- f"Invalid xml for task {tid}: {e}\nFrom {task_}",
- RuntimeWarning,
- stacklevel=2,
- )
- else:
- warnings.warn(f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2)
-
- return pd.DataFrame.from_dict(tasks, orient="index")
-
-
def get_tasks(
task_ids: list[int],
download_data: bool | None = None,
@@ -304,7 +115,7 @@ def get_tasks(
) -> list[OpenMLTask]:
"""Download tasks.
- This function iterates :meth:`openml.tasks.get_task`.
+    This function iterates over :meth:`openml.tasks.get_task`.
Parameters
----------
@@ -338,7 +149,11 @@ def get_tasks(
tasks = []
for task_id in task_ids:
tasks.append(
- get_task(task_id, download_data=download_data, download_qualities=download_qualities)
+ get_task(
+ task_id,
+ download_data=download_data,
+ download_qualities=download_qualities,
+ )
)
return tasks
@@ -373,133 +188,27 @@ def get_task(
if not isinstance(task_id, int):
raise TypeError(f"Task id should be integer, is {type(task_id)}")
- task_cache_directory = openml.utils._create_cache_directory_for_id(
- TASKS_CACHE_DIR_NAME, task_id
- )
- task_cache_directory_existed = task_cache_directory.exists()
- try:
- task = _get_task_description(task_id)
- dataset = get_dataset(task.dataset_id, **get_dataset_kwargs)
- # List of class labels available in dataset description
- # Including class labels as part of task meta data handles
- # the case where data download was initially disabled
- if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
- assert task.target_name is not None, (
- "Supervised tasks must define a target feature before retrieving class labels."
- )
- task.class_labels = dataset.retrieve_class_labels(task.target_name)
- # Clustering tasks do not have class labels
- # and do not offer download_split
- if download_splits and isinstance(task, OpenMLSupervisedTask):
- task.download_split()
- except Exception as e:
- if not task_cache_directory_existed:
- openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, task_cache_directory)
- raise e
-
- return task
-
+ task = openml._backend.task.get(task_id)
+ dataset = get_dataset(task.dataset_id, **get_dataset_kwargs)
-def _get_task_description(task_id: int) -> OpenMLTask:
- try:
- return _get_cached_task(task_id)
- except OpenMLCacheException:
- _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)
- xml_file = _cache_dir / "task.xml"
- task_xml = openml._api_calls._perform_api_call(f"task/{task_id}", "get")
-
- with xml_file.open("w", encoding="utf8") as fh:
- fh.write(task_xml)
- return _create_task_from_xml(task_xml)
-
-
-def _create_task_from_xml(xml: str) -> OpenMLTask:
- """Create a task given a xml string.
-
- Parameters
- ----------
- xml : string
- Task xml representation.
-
- Returns
- -------
- OpenMLTask
- """
- dic = xmltodict.parse(xml)["oml:task"]
- estimation_parameters = {}
- inputs = {}
- # Due to the unordered structure we obtain, we first have to extract
- # the possible keys of oml:input; dic["oml:input"] is a list of
- # OrderedDicts
-
- # Check if there is a list of inputs
- if isinstance(dic["oml:input"], list):
- for input_ in dic["oml:input"]:
- name = input_["@name"]
- inputs[name] = input_
- # Single input case
- elif isinstance(dic["oml:input"], dict):
- name = dic["oml:input"]["@name"]
- inputs[name] = dic["oml:input"]
-
- evaluation_measures = None
- if "evaluation_measures" in inputs:
- evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][
- "oml:evaluation_measure"
- ]
-
- task_type = TaskType(int(dic["oml:task_type_id"]))
- common_kwargs = {
- "task_id": dic["oml:task_id"],
- "task_type": dic["oml:task_type"],
- "task_type_id": task_type,
- "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
- "evaluation_measure": evaluation_measures,
- }
- # TODO: add OpenMLClusteringTask?
- if task_type in (
- TaskType.SUPERVISED_CLASSIFICATION,
- TaskType.SUPERVISED_REGRESSION,
- TaskType.LEARNING_CURVE,
+ if (
+ isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask))
+ and task.target_name is not None
):
- # Convert some more parameters
- for parameter in inputs["estimation_procedure"]["oml:estimation_procedure"][
- "oml:parameter"
- ]:
- name = parameter["@name"]
- text = parameter.get("#text", "")
- estimation_parameters[name] = text
-
- common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][
- "oml:estimation_procedure"
- ]["oml:type"]
- common_kwargs["estimation_procedure_id"] = int(
- inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"]
- )
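+        # Including class labels as part of the task metadata handles the
+        # case where data download was initially disabled.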
+ task.class_labels = dataset.retrieve_class_labels(task.target_name)
- common_kwargs["estimation_parameters"] = estimation_parameters
- common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"]["oml:target_feature"]
- common_kwargs["data_splits_url"] = inputs["estimation_procedure"][
- "oml:estimation_procedure"
- ]["oml:data_splits_url"]
-
- cls = {
- TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
- TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
- TaskType.CLUSTERING: OpenMLClusteringTask,
- TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
- }.get(task_type)
- if cls is None:
- raise NotImplementedError(
- f"Task type '{common_kwargs['task_type']}' is not supported. "
- f"Supported task types: SUPERVISED_CLASSIFICATION,"
- f"SUPERVISED_REGRESSION, CLUSTERING, LEARNING_CURVE."
- f"Please check the OpenML documentation for available task types."
- )
- return cls(**common_kwargs) # type: ignore
+ if download_splits and isinstance(task, OpenMLSupervisedTask):
+ if openml._backend.task.supports_download_splits():
+ task.download_split()
+ else:
+ warnings.warn(
+ "`download_splits` is not yet supported in the v2 API and will be ignored.",
+ stacklevel=2,
+ )
+
+ return task
-# TODO(eddiebergman): overload on `task_type`
def create_task(
task_type: TaskType,
dataset_id: int,
@@ -587,4 +296,4 @@ def delete_task(task_id: int) -> bool:
bool
True if the deletion was successful. False otherwise.
"""
- return openml.utils._delete_entity("task", task_id)
+ return openml._backend.task.delete(task_id)
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
index ab3cb3da4..a709fdb45 100644
--- a/openml/tasks/task.py
+++ b/openml/tasks/task.py
@@ -241,6 +241,48 @@ def _parse_publish_response(self, xml_response: dict) -> None:
"""Parse the id from the xml_response and assign it to self."""
self.task_id = int(xml_response["oml:upload_task"]["oml:id"])
+ def publish(self) -> OpenMLTask:
+ """Publish this task to OpenML server.
+
+ Returns
+ -------
+ self : OpenMLTask
+ """
+ file_elements = self._get_file_elements()
+ if "description" not in file_elements:
+ file_elements["description"] = self._to_xml()
+ task_id = openml._backend.task.publish(path="task", files=file_elements)
+ self.task_id = task_id
+ return self
+
+ def push_tag(self, tag: str) -> None:
+ """Annotates this task with a tag on the server.
+
+ Parameters
+ ----------
+ tag : str
+ Tag to attach to the task.
+ """
+ if self.task_id is None:
+ raise openml.exceptions.ObjectNotPublishedError(
+ "Please publish the task first before being able to tag it."
+ )
+ openml._backend.task.tag(self.task_id, tag)
+
+ def remove_tag(self, tag: str) -> None:
+ """Removes a tag from this task on the server.
+
+ Parameters
+ ----------
+ tag : str
+ Tag to remove from the task.
+ """
+ if self.task_id is None:
+ raise openml.exceptions.ObjectNotPublishedError(
+ "Please publish the task first before being able to untag it."
+ )
+ openml._backend.task.untag(self.task_id, tag)
+
class OpenMLSupervisedTask(OpenMLTask, ABC):
"""OpenML Supervised Classification object.
diff --git a/tests/conftest.py b/tests/conftest.py
index 1359e6247..0a663af15 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -205,7 +205,7 @@ def _expected_static_cache_state(root_dir: Path) -> list[Path]:
_c_root_dir = root_dir / "org" / "openml" / "test"
res_paths = [root_dir, _c_root_dir]
- for _d in ["datasets", "tasks", "runs"]:
+ for _d in ["datasets", "runs"]:
res_paths.append(_c_root_dir / _d)
for _id in ["-1", "2"]:
@@ -222,21 +222,21 @@ def _expected_static_cache_state(root_dir: Path) -> list[Path]:
res_paths.append(_c_root_dir / "datasets" / "30" / "dataset_30.pq")
res_paths.append(_c_root_dir / "runs" / "1" / "description.xml")
- for _id in ["1", "3", "1882"]:
- tmp_p = _c_root_dir / "tasks" / _id
- res_paths.extend(
- [
- tmp_p / "datasplits.arff",
- tmp_p / "task.xml",
- ]
- )
-
res_paths.extend([
_c_root_dir / "api" / "v1" / "xml" / "setup",
_c_root_dir / "api" / "v1" / "xml" / "setup" / "1",
_c_root_dir / "api" / "v1" / "xml" / "setup" / "1" / "body.xml",
])
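+    # New-style HTTP cache layout: task XML bodies live under
+    # api/v1/xml/task/<id>/ and split files under api_splits/get/<id>/<name>/.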
+ res_paths.extend([
+ _c_root_dir / "api_splits" / "get" / "1882" / "Task_1882_splits.arff" / "body.arff",
+ _c_root_dir / "api_splits" / "get" / "3" / "Task_3_splits.arff" / "body.arff",
+ _c_root_dir / "api_splits" / "get" / "1" / "Task_1_splits.arff" / "body.arff",
+ _c_root_dir / "api" / "v1" / "xml" / "task" / "1882" / "body.xml",
+ _c_root_dir / "api" / "v1" / "xml" / "task" / "3" / "body.xml",
+ _c_root_dir / "api" / "v1" / "xml" / "task" / "1" / "body.xml",
+ ])
+
return res_paths
@@ -324,8 +324,8 @@ def with_test_cache(test_files_directory, request):
openml.config.set_root_cache_directory(_root_cache_directory)
if tmp_cache.exists():
shutil.rmtree(tmp_cache)
-
+
@pytest.fixture
def static_cache_dir():
return Path(__file__).parent / "files"
diff --git a/tests/files/org/openml/test/tasks/1/task.xml b/tests/files/org/openml/test/api/v1/xml/task/1/body.xml
similarity index 100%
rename from tests/files/org/openml/test/tasks/1/task.xml
rename to tests/files/org/openml/test/api/v1/xml/task/1/body.xml
diff --git a/tests/files/org/openml/test/tasks/1882/task.xml b/tests/files/org/openml/test/api/v1/xml/task/1882/body.xml
similarity index 100%
rename from tests/files/org/openml/test/tasks/1882/task.xml
rename to tests/files/org/openml/test/api/v1/xml/task/1882/body.xml
diff --git a/tests/files/org/openml/test/tasks/3/task.xml b/tests/files/org/openml/test/api/v1/xml/task/3/body.xml
similarity index 100%
rename from tests/files/org/openml/test/tasks/3/task.xml
rename to tests/files/org/openml/test/api/v1/xml/task/3/body.xml
diff --git a/tests/files/org/openml/test/tasks/1/datasplits.arff b/tests/files/org/openml/test/api_splits/get/1/Task_1_splits.arff/body.arff
similarity index 100%
rename from tests/files/org/openml/test/tasks/1/datasplits.arff
rename to tests/files/org/openml/test/api_splits/get/1/Task_1_splits.arff/body.arff
diff --git a/tests/files/org/openml/test/tasks/1882/datasplits.arff b/tests/files/org/openml/test/api_splits/get/1882/Task_1882_splits.arff/body.arff
similarity index 100%
rename from tests/files/org/openml/test/tasks/1882/datasplits.arff
rename to tests/files/org/openml/test/api_splits/get/1882/Task_1882_splits.arff/body.arff
diff --git a/tests/files/org/openml/test/tasks/3/datasplits.arff b/tests/files/org/openml/test/api_splits/get/3/Task_3_splits.arff/body.arff
similarity index 100%
rename from tests/files/org/openml/test/tasks/3/datasplits.arff
rename to tests/files/org/openml/test/api_splits/get/3/Task_3_splits.arff/body.arff
diff --git a/tests/test_api/test_task.py b/tests/test_api/test_task.py
new file mode 100644
index 000000000..6cad784b3
--- /dev/null
+++ b/tests/test_api/test_task.py
@@ -0,0 +1,191 @@
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+from requests import Response, Session
+
+import openml
+from openml._api.resources.task import TaskV1API, TaskV2API
+from openml.exceptions import OpenMLNotSupportedError
+
+
+@pytest.fixture
+def task_v1(http_client_v1, minio_client) -> TaskV1API:
+ return TaskV1API(http=http_client_v1, minio=minio_client)
+
+
+@pytest.fixture
+def task_v2(http_client_v2, minio_client) -> TaskV2API:
+ return TaskV2API(http=http_client_v2, minio=minio_client)
+
+
+@pytest.mark.uses_test_server()
+def test_v1_list_tasks(task_v1):
+ """Verify V1 list endpoint returns a populated DataFrame."""
+ tasks_df = task_v1.list(limit=5, offset=0)
+ assert isinstance(tasks_df, pd.DataFrame)
+ assert not tasks_df.empty
+ assert "tid" in tasks_df.columns
+
+
+@pytest.mark.uses_test_server()
+def test_v1_get(task_v1):
+ """Verify V1 get endpoint returns a task."""
+ task = task_v1.get(1)
+ assert task is not None
+ assert task.task_id == 1
+
+
+@pytest.mark.uses_test_server()
+def test_v2_list_tasks(task_v2):
+ """Verify V2 list endpoint raises NotSupported."""
+ with pytest.raises(OpenMLNotSupportedError):
+ task_v2.list(limit=5, offset=0)
+
+
+@pytest.mark.uses_test_server()
+def test_v2_get(task_v2):
+ """Verify V2 get endpoint returns a task."""
+ task = task_v2.get(1)
+ assert task is not None
+ assert task.task_id == 1
+
+
+def test_v1_publish(task_v1):
+ resource_name = task_v1.resource_type.value
+ resource_files = {"description": "Resource Description File"}
+ resource_id = 123
+
+ with patch.object(Session, "request") as mock_request:
+ mock_request.return_value = Response()
+ mock_request.return_value.status_code = 200
+        # Mocked v1 publish response; the shape matches what
+        # _parse_publish_response reads (oml:upload_task / oml:id).
+        mock_request.return_value._content = (
+            f'<oml:upload_task xmlns:oml="http://openml.org/openml">\n'
+            f"\t<oml:id>{resource_id}</oml:id>\n"
+            f"</oml:upload_task>\n"
+        ).encode("utf-8")
+
+ published_resource_id = task_v1.publish(
+ resource_name,
+ files=resource_files,
+ )
+
+ assert resource_id == published_resource_id
+
+ mock_request.assert_called_once_with(
+ method="POST",
+ url=openml.config.server + resource_name,
+ params={},
+ data={"api_key": openml.config.apikey},
+ headers=openml.config._HEADERS,
+ files=resource_files,
+ )
+
+
+def test_v1_delete(task_v1):
+ resource_name = task_v1.resource_type.value
+ resource_id = 123
+
+ with patch.object(Session, "request") as mock_request:
+ mock_request.return_value = Response()
+ mock_request.return_value.status_code = 200
+        # Mocked v1 delete response; element names follow the v1
+        # task_delete convention (assumed shape).
+        mock_request.return_value._content = (
+            f'<oml:task_delete xmlns:oml="http://openml.org/openml">\n'
+            f" <oml:id>{resource_id}</oml:id>\n"
+            f"</oml:task_delete>\n"
+        ).encode("utf-8")
+
+ task_v1.delete(resource_id)
+
+ mock_request.assert_called_once_with(
+ method="DELETE",
+ url=(openml.config.server + resource_name + "/" + str(resource_id)),
+ params={"api_key": openml.config.apikey},
+ data={},
+ headers=openml.config._HEADERS,
+ files=None,
+ )
+
+
+def test_v1_tag(task_v1):
+ resource_id = 123
+ resource_tag = "TAG"
+
+ with patch.object(Session, "request") as mock_request:
+ mock_request.return_value = Response()
+ mock_request.return_value.status_code = 200
+        # Mocked v1 tag response; element names follow the v1 task_tag
+        # convention (assumed shape) and echo the tag back.
+        mock_request.return_value._content = (
+            f'<oml:task_tag xmlns:oml="http://openml.org/openml">'
+            f"<oml:id>{resource_id}</oml:id>"
+            f"<oml:tag>{resource_tag}</oml:tag>"
+            f"</oml:task_tag>"
+        ).encode("utf-8")
+
+ tags = task_v1.tag(resource_id, resource_tag)
+
+ assert resource_tag in tags
+
+ mock_request.assert_called_once_with(
+ method="POST",
+ url=(openml.config.server + task_v1.resource_type.value + "/tag"),
+ params={},
+ data={
+ "api_key": openml.config.apikey,
+ "task_id": resource_id,
+ "tag": resource_tag,
+ },
+ headers=openml.config._HEADERS,
+ files=None,
+ )
+
+
+def test_v1_untag(task_v1):
+ resource_id = 123
+ resource_tag = "TAG"
+
+ with patch.object(Session, "request") as mock_request:
+ mock_request.return_value = Response()
+ mock_request.return_value.status_code = 200
+        # Mocked v1 untag response; no oml:tag elements remain after
+        # untagging (assumed shape).
+        mock_request.return_value._content = (
+            f'<oml:task_untag xmlns:oml="http://openml.org/openml">'
+            f"<oml:id>{resource_id}</oml:id>"
+            f"</oml:task_untag>"
+        ).encode("utf-8")
+
+ tags = task_v1.untag(resource_id, resource_tag)
+
+ assert resource_tag not in tags
+
+ mock_request.assert_called_once_with(
+ method="POST",
+ url=(openml.config.server + task_v1.resource_type.value + "/untag"),
+ params={},
+ data={
+ "api_key": openml.config.apikey,
+ "task_id": resource_id,
+ "tag": resource_tag,
+ },
+ headers=openml.config._HEADERS,
+ files=None,
+ )
+
+
+def test_v2_publish(task_v2):
+ with pytest.raises(OpenMLNotSupportedError):
+ task_v2.publish(path=None, files=None)
+
+
+def test_v2_delete(task_v2):
+ with pytest.raises(OpenMLNotSupportedError):
+ task_v2.delete(resource_id=None)
+
+
+def test_v2_tag(task_v2):
+ with pytest.raises(OpenMLNotSupportedError):
+ task_v2.tag(resource_id=None, tag=None)
+
+
+def test_v2_untag(task_v2):
+ with pytest.raises(OpenMLNotSupportedError):
+ task_v2.untag(resource_id=None, tag=None)
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 80b0b4215..f885198f1 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -289,7 +289,9 @@ def test_get_dataset_cannot_access_private_data(self):
@pytest.mark.skip("Need to find dataset name of private dataset")
def test_dataset_by_name_cannot_access_private_data(self):
self.use_production_server()
- self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE")
+ self.assertRaises(
+ OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE"
+ )
@pytest.mark.test_server()
def test_get_dataset_lazy_all_functions(self):
@@ -299,7 +301,9 @@ def test_get_dataset_lazy_all_functions(self):
def ensure_absence_of_real_data():
assert not os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")
+ os.path.join(
+ openml.config.get_cache_directory(), "datasets", "1", "dataset.arff"
+ )
)
tag = "test_lazy_tag_%d" % random.randint(1, 1000000)
@@ -404,7 +408,6 @@ def test__download_minio_file_works_with_bucket_subdirectory(self):
file_destination
), "_download_minio_file can download from subdirectories"
-
@mock.patch("openml._api_calls._download_minio_file")
@pytest.mark.test_server()
def test__get_dataset_parquet_is_cached(self, patch):
@@ -524,13 +527,29 @@ def test_deletion_of_cache_dir(self):
@pytest.mark.test_server()
def test_deletion_of_cache_dir_faulty_download(self, patch):
patch.side_effect = Exception("Boom!")
- self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1)
- datasets_cache_dir = os.path.join(openml.config.get_cache_directory(), "datasets")
+ self.assertRaisesRegex(
+ Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1
+ )
+ datasets_cache_dir = os.path.join(
+ openml.config.get_cache_directory(), "datasets"
+ )
assert len(os.listdir(datasets_cache_dir)) == 0
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_publish_dataset(self):
- arff_file_path = self.static_cache_dir / "org" / "openml" / "test" / "datasets" / "2" / "dataset.arff"
+ arff_file_path = (
+ self.static_cache_dir
+ / "org"
+ / "openml"
+ / "test"
+ / "datasets"
+ / "2"
+ / "dataset.arff"
+ )
dataset = OpenMLDataset(
"anneal",
"test",
@@ -561,7 +580,9 @@ def test__retrieve_class_labels(self):
# Test workaround for string-typed class labels
custom_ds = openml.datasets.get_dataset(2)
custom_ds.features[31].data_type = "string"
- labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name)
+ labels = custom_ds.retrieve_class_labels(
+ target_name=custom_ds.features[31].name
+ )
assert labels == ["COIL", "SHEET"]
@pytest.mark.test_server()
@@ -682,11 +703,16 @@ def test_attributes_arff_from_df_unknown_dtype(self):
for arr, dt in zip(data, dtype):
df = pd.DataFrame(arr)
err_msg = (
- f"The dtype '{dt}' of the column '0' is not currently " "supported by liac-arff"
+ f"The dtype '{dt}' of the column '0' is not currently "
+ "supported by liac-arff"
)
with pytest.raises(ValueError, match=err_msg):
attributes_arff_from_df(df)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_create_dataset_numpy(self):
data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T
@@ -719,8 +745,14 @@ def test_create_dataset_numpy(self):
assert (
_get_online_dataset_arff(dataset.id) == dataset._dataset
), "Uploaded arff does not match original one"
- assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
+ assert (
+ _get_online_dataset_format(dataset.id) == "arff"
+ ), "Wrong format for dataset"
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_create_dataset_list(self):
data = [
@@ -774,8 +806,14 @@ def test_create_dataset_list(self):
assert (
_get_online_dataset_arff(dataset.id) == dataset._dataset
), "Uploaded ARFF does not match original one"
- assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
+ assert (
+ _get_online_dataset_format(dataset.id) == "arff"
+ ), "Wrong format for dataset"
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_create_dataset_sparse(self):
# test the scipy.sparse.coo_matrix
@@ -924,6 +962,10 @@ def test_get_online_dataset_format(self):
dataset_id
), "The format of the ARFF files is different"
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_create_dataset_pandas(self):
data = [
@@ -991,7 +1033,9 @@ def test_create_dataset_pandas(self):
column_names = ["input1", "input2", "y"]
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
# meta-information
- description = "Synthetic dataset created from a Pandas DataFrame with Sparse columns"
+ description = (
+ "Synthetic dataset created from a Pandas DataFrame with Sparse columns"
+ )
dataset = openml.datasets.functions.create_dataset(
name=name,
description=description,
@@ -1016,7 +1060,9 @@ def test_create_dataset_pandas(self):
assert (
_get_online_dataset_arff(dataset.id) == dataset._dataset
), "Uploaded ARFF does not match original one"
- assert _get_online_dataset_format(dataset.id) == "sparse_arff", "Wrong format for dataset"
+ assert (
+ _get_online_dataset_format(dataset.id) == "sparse_arff"
+ ), "Wrong format for dataset"
# Check that we can overwrite the attributes
data = [["a"], ["b"], ["c"], ["d"], ["e"]]
@@ -1046,7 +1092,9 @@ def test_create_dataset_pandas(self):
TestBase._mark_entity_for_removal("data", dataset.id)
TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {dataset.id}")
downloaded_data = _get_online_dataset_arff(dataset.id)
- assert downloaded_data == dataset._dataset, "Uploaded ARFF does not match original one"
+ assert (
+ downloaded_data == dataset._dataset
+ ), "Uploaded ARFF does not match original one"
assert "@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data
def test_ignore_attributes_dataset(self):
@@ -1149,6 +1197,10 @@ def test_ignore_attributes_dataset(self):
paper_url=paper_url,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_publish_fetch_ignore_attribute(self):
"""Test to upload and retrieve dataset and check ignore_attributes"""
@@ -1268,6 +1320,10 @@ def test_create_dataset_row_id_attribute_error(self):
paper_url=paper_url,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_create_dataset_row_id_attribute_inference(self):
# meta-information
@@ -1396,7 +1452,9 @@ def test_get_dataset_cache_format_feather(self):
cache_dir = openml.config.get_cache_directory()
cache_dir_for_id = os.path.join(cache_dir, "datasets", "128")
feather_file = os.path.join(cache_dir_for_id, "dataset.feather")
- pickle_file = os.path.join(cache_dir_for_id, "dataset.feather.attributes.pkl.py3")
+ pickle_file = os.path.join(
+ cache_dir_for_id, "dataset.feather.attributes.pkl.py3"
+ )
data = pd.read_feather(feather_file)
assert os.path.isfile(feather_file), "Feather file is missing"
assert os.path.isfile(pickle_file), "Attributes pickle file is missing"
@@ -1436,6 +1494,10 @@ def test_data_edit_non_critical_field(self):
edited_dataset = openml.datasets.get_dataset(did)
assert edited_dataset.description == desc
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_data_edit_critical_field(self):
# Case 2
@@ -1443,7 +1505,9 @@ def test_data_edit_critical_field(self):
# for this, we need to first clone a dataset to do changes
did = fork_dataset(1)
self._wait_for_dataset_being_processed(did)
- result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil")
+ result = edit_dataset(
+ did, default_target_attribute="shape", ignore_attribute="oil"
+ )
assert did == result
n_tries = 10
@@ -1451,7 +1515,9 @@ def test_data_edit_critical_field(self):
for i in range(n_tries):
edited_dataset = openml.datasets.get_dataset(did)
try:
- assert edited_dataset.default_target_attribute == "shape", edited_dataset
+ assert (
+ edited_dataset.default_target_attribute == "shape"
+ ), edited_dataset
assert edited_dataset.ignore_attribute == ["oil"], edited_dataset
break
except AssertionError as e:
@@ -1459,9 +1525,11 @@ def test_data_edit_critical_field(self):
raise e
time.sleep(10)
# Delete the cache dir to get the newer version of the dataset
-
+
shutil.rmtree(
- os.path.join(openml.config.get_cache_directory(), "datasets", str(did)),
+ os.path.join(
+ openml.config.get_cache_directory(), "datasets", str(did)
+ ),
)
@pytest.mark.test_server()
@@ -1488,6 +1556,10 @@ def test_data_edit_requires_valid_dataset(self):
description="xor operation dataset",
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self):
# Need to own a dataset to be able to edit meta-data
@@ -1540,7 +1612,6 @@ def test_data_fork(self):
data_id=999999,
)
-
@pytest.mark.production_server()
def test_list_datasets_with_high_size_parameter(self):
# Testing on prod since concurrent deletion of uploded datasets make the test fail
@@ -1626,7 +1697,9 @@ def test_invalid_attribute_validations(
(None, None, ["outlook", "windy"]),
],
)
-def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute):
+def test_valid_attribute_validations(
+ default_target_attribute, row_id_attribute, ignore_attribute
+):
data = [
["a", "sunny", 85.0, 85.0, "FALSE", "no"],
["b", "sunny", 80.0, 90.0, "TRUE", "no"],
@@ -1726,7 +1799,10 @@ def test_delete_dataset(self):
@mock.patch.object(requests.Session, "delete")
def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
content_file = (
- test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml"
+ test_files_directory
+ / "mock_responses"
+ / "datasets"
+ / "data_delete_not_owned.xml"
)
mock_delete.return_value = create_request_response(
status_code=412,
@@ -1747,7 +1823,10 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_server
@mock.patch.object(requests.Session, "delete")
def test_delete_dataset_with_run(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
content_file = (
- test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml"
+ test_files_directory
+ / "mock_responses"
+ / "datasets"
+ / "data_delete_has_tasks.xml"
)
mock_delete.return_value = create_request_response(
status_code=412,
@@ -1768,7 +1847,10 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_server_
@mock.patch.object(requests.Session, "delete")
def test_delete_dataset_success(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
content_file = (
- test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml"
+ test_files_directory
+ / "mock_responses"
+ / "datasets"
+ / "data_delete_successful.xml"
)
mock_delete.return_value = create_request_response(
status_code=200,
@@ -1786,7 +1868,10 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_server_v
@mock.patch.object(requests.Session, "delete")
def test_delete_unknown_dataset(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
content_file = (
- test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml"
+ test_files_directory
+ / "mock_responses"
+ / "datasets"
+ / "data_delete_not_exist.xml"
)
mock_delete.return_value = create_request_response(
status_code=412,
@@ -1956,9 +2041,15 @@ def test_get_dataset_lazy_behavior(
with_features=with_features,
with_data=with_data,
)
- assert dataset.features, "Features should be downloaded on-demand if not during get_dataset"
- assert dataset.qualities, "Qualities should be downloaded on-demand if not during get_dataset"
- assert dataset.get_data(), "Data should be downloaded on-demand if not during get_dataset"
+ assert (
+ dataset.features
+ ), "Features should be downloaded on-demand if not during get_dataset"
+ assert (
+ dataset.qualities
+ ), "Qualities should be downloaded on-demand if not during get_dataset"
+ assert (
+ dataset.get_data()
+ ), "Data should be downloaded on-demand if not during get_dataset"
_assert_datasets_retrieved_successfully(
[1], with_qualities=True, with_features=True, with_data=True
)
@@ -1977,7 +2068,9 @@ def test__get_dataset_parquet_not_cached():
"oml:parquet_url": "http://data.openml.org/dataset20/dataset_20.pq",
"oml:id": "20",
}
- path = _get_dataset_parquet(description, cache_directory=Path(openml.config.get_cache_directory()))
+ path = _get_dataset_parquet(
+ description, cache_directory=Path(openml.config.get_cache_directory())
+ )
assert isinstance(path, Path), "_get_dataset_parquet returns a path"
assert path.is_file(), "_get_dataset_parquet returns path to real file"
@@ -1986,7 +2079,10 @@ def test_read_features_from_xml_with_whitespace() -> None:
from openml.datasets.dataset import _read_features
features_file = (
- Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml"
+ Path(__file__).parent.parent
+ / "files"
+ / "misc"
+ / "features_with_whitespaces.xml"
)
dict = _read_features(features_file)
assert dict[1].nominal_values == [" - 50000.", " 50000+."]
@@ -1997,7 +2093,7 @@ def test_get_dataset_parquet(requests_mock, test_files_directory, test_server_v1
# Parquet functionality is disabled on the test server
# There is no parquet-copy of the test server yet.
content_file = (
- test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
+ test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
)
# While the mocked example is from production, unit tests by default connect to the test server.
requests_mock.get(test_server_v1 + "data/61", text=content_file.read_text())
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
index 4e391fd3b..108a05c3f 100644
--- a/tests/test_flows/test_flow.py
+++ b/tests/test_flows/test_flow.py
@@ -4,7 +4,7 @@
import collections
import copy
import hashlib
+import os
import re
-import os
import time
@@ -162,12 +163,16 @@ def test_from_xml_to_xml(self):
def test_to_xml_from_xml(self):
scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ "base_estimator"
+ if Version(sklearn.__version__) < Version("1.4")
+ else "estimator"
)
boosting = sklearn.ensemble.AdaBoostClassifier(
**{estimator_name: sklearn.tree.DecisionTreeClassifier()},
)
- model = sklearn.pipeline.Pipeline(steps=(("scaler", scaler), ("boosting", boosting)))
+ model = sklearn.pipeline.Pipeline(
+ steps=(("scaler", scaler), ("boosting", boosting))
+ )
flow = self.extension.model_to_flow(model)
flow.flow_id = -234
# end of setup
@@ -180,6 +185,10 @@ def test_to_xml_from_xml(self):
openml.flows.functions.assert_flows_equal(new_flow, flow)
assert new_flow is not flow
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_publish_flow(self):
@@ -204,7 +213,9 @@ def test_publish_flow(self):
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
assert isinstance(flow.flow_id, int)
@pytest.mark.sklearn()
@@ -214,7 +225,9 @@ def test_publish_existing_flow(self, flow_exists_mock):
flow = self.extension.model_to_flow(clf)
flow_exists_mock.return_value = 1
- with pytest.raises(openml.exceptions.PyOpenMLError, match="OpenMLFlow already exists"):
+ with pytest.raises(
+ openml.exceptions.PyOpenMLError, match="OpenMLFlow already exists"
+ ):
flow.publish(raise_error_if_exists=True)
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
@@ -222,6 +235,10 @@ def test_publish_existing_flow(self, flow_exists_mock):
f"collected from {__file__.split('/')[-1]}: {flow.flow_id}",
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_publish_flow_with_similar_components(self):
@@ -232,7 +249,9 @@ def test_publish_flow_with_similar_components(self):
flow, _ = self._add_sentinel_to_flow_name(flow, None)
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
# For a flow where both components are published together, the upload
# date should be equal
assert flow.upload_date == flow.components["lr"].upload_date, (
@@ -247,7 +266,9 @@ def test_publish_flow_with_similar_components(self):
flow1, sentinel = self._add_sentinel_to_flow_name(flow1, None)
flow1.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow1.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow1.flow_id}"
+ )
# In order to assign different upload times to the flows!
time.sleep(1)
@@ -259,20 +280,30 @@ def test_publish_flow_with_similar_components(self):
flow2, _ = self._add_sentinel_to_flow_name(flow2, sentinel)
flow2.publish()
TestBase._mark_entity_for_removal("flow", flow2.flow_id, flow2.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow2.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow2.flow_id}"
+ )
# If one component was published before the other, the components in
# the flow should have different upload dates
assert flow2.upload_date != flow2.components["dt"].upload_date
- clf3 = sklearn.ensemble.AdaBoostClassifier(sklearn.tree.DecisionTreeClassifier(max_depth=3))
+ clf3 = sklearn.ensemble.AdaBoostClassifier(
+ sklearn.tree.DecisionTreeClassifier(max_depth=3)
+ )
flow3 = self.extension.model_to_flow(clf3)
flow3, _ = self._add_sentinel_to_flow_name(flow3, sentinel)
# Child flow has different parameter. Check for storing the flow
# correctly on the server should thus not check the child's parameters!
flow3.publish()
TestBase._mark_entity_for_removal("flow", flow3.flow_id, flow3.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}"
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_semi_legal_flow(self):
@@ -280,7 +311,9 @@ def test_semi_legal_flow(self):
# should not throw error as it contains two differentiable forms of
# Bagging i.e., Bagging(Bagging(J48)) and Bagging(J48)
estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ "base_estimator"
+ if Version(sklearn.__version__) < Version("1.4")
+ else "estimator"
)
semi_legal = sklearn.ensemble.BaggingClassifier(
**{
@@ -296,7 +329,9 @@ def test_semi_legal_flow(self):
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
@pytest.mark.sklearn()
@mock.patch("openml.flows.functions.get_flow")
@@ -383,13 +418,21 @@ def get_sentinel():
flow_id = openml.flows.flow_exists(name, version)
assert not flow_id
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_existing_flow_exists(self):
# create a flow
nb = sklearn.naive_bayes.GaussianNB()
- sparse = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output"
+ sparse = (
+ "sparse"
+ if Version(sklearn.__version__) < Version("1.4")
+ else "sparse_output"
+ )
ohe_params = {sparse: False, "handle_unknown": "ignore"}
if Version(sklearn.__version__) >= Version("0.20"):
ohe_params["categories"] = "auto"
@@ -424,6 +467,10 @@ def test_existing_flow_exists(self):
)
assert downloaded_flow_id == flow.flow_id
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_sklearn_to_upload_to_flow(self):
@@ -444,13 +491,20 @@ def test_sklearn_to_upload_to_flow(self):
)
fu = sklearn.pipeline.FeatureUnion(transformer_list=[("pca", pca), ("fs", fs)])
estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ "base_estimator"
+ if Version(sklearn.__version__) < Version("1.4")
+ else "estimator"
)
boosting = sklearn.ensemble.AdaBoostClassifier(
**{estimator_name: sklearn.tree.DecisionTreeClassifier()},
)
model = sklearn.pipeline.Pipeline(
- steps=[("ohe", ohe), ("scaler", scaler), ("fu", fu), ("boosting", boosting)],
+ steps=[
+ ("ohe", ohe),
+ ("scaler", scaler),
+ ("fu", fu),
+ ("boosting", boosting),
+ ],
)
parameter_grid = {
"boosting__n_estimators": [1, 5, 10, 100],
@@ -477,7 +531,9 @@ def test_sklearn_to_upload_to_flow(self):
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
assert isinstance(flow.flow_id, int)
# Check whether we can load the flow again
@@ -560,7 +616,10 @@ def test_extract_tags(self):
tags = openml.utils.extract_xml_tags("oml:tag", flow_dict)
assert tags == ["study_14"]
- flow_xml = "OpenmlWeka\n" "weka"
+ flow_xml = (
+ "OpenmlWeka\n"
+ "weka"
+ )
flow_dict = xmltodict.parse(flow_xml)
tags = openml.utils.extract_xml_tags("oml:tag", flow_dict["oml:flow"])
assert tags == ["OpenmlWeka", "weka"]
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index 7a1331c45..f0709bb45 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -6,7 +6,7 @@
import unittest
from collections import OrderedDict
from multiprocessing.managers import Value
-
+import os
from openml_sklearn import SklearnExtension
from packaging.version import Version
from unittest import mock
@@ -153,7 +153,9 @@ def test_are_flows_equal(self):
openml.flows.functions.assert_flows_equal(flow, flow)
new_flow = copy.deepcopy(flow)
new_flow.parameters["abc"] = 3.0
- self.assertRaises(ValueError, openml.flows.functions.assert_flows_equal, flow, new_flow)
+ self.assertRaises(
+ ValueError, openml.flows.functions.assert_flows_equal, flow, new_flow
+ )
# Now test for components (subflows)
parent_flow = copy.deepcopy(flow)
@@ -195,24 +197,28 @@ def test_are_flows_equal_ignore_parameter_values(self):
)
openml.flows.functions.assert_flows_equal(flow, flow)
- openml.flows.functions.assert_flows_equal(flow, flow, ignore_parameter_values=True)
+ openml.flows.functions.assert_flows_equal(
+ flow, flow, ignore_parameter_values=True
+ )
new_flow = copy.deepcopy(flow)
new_flow.parameters["a"] = 7
with pytest.raises(ValueError) as excinfo:
openml.flows.functions.assert_flows_equal(flow, new_flow)
- assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(
- excinfo.value
- )
+ assert str(paramaters) in str(excinfo.value) and str(
+ new_flow.parameters
+ ) in str(excinfo.value)
- openml.flows.functions.assert_flows_equal(flow, new_flow, ignore_parameter_values=True)
+ openml.flows.functions.assert_flows_equal(
+ flow, new_flow, ignore_parameter_values=True
+ )
del new_flow.parameters["a"]
with pytest.raises(ValueError) as excinfo:
openml.flows.functions.assert_flows_equal(flow, new_flow)
- assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(
- excinfo.value
- )
+ assert str(paramaters) in str(excinfo.value) and str(
+ new_flow.parameters
+ ) in str(excinfo.value)
self.assertRaisesRegex(
ValueError,
@@ -246,7 +252,9 @@ def test_are_flows_equal_ignore_if_older(self):
upload_date=flow_upload_date,
)
- assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=flow_upload_date)
+ assert_flows_equal(
+ flow, flow, ignore_parameter_values_on_older_children=flow_upload_date
+ )
assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None)
new_flow = copy.deepcopy(flow)
new_flow.parameters["a"] = 7
@@ -296,7 +304,9 @@ def test_sklearn_to_flow_list_of_lists(self):
self._add_sentinel_to_flow_name(flow)
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
# Test deserialization works
server_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True)
assert server_flow.parameters["categories"] == "[[0, 1], [0, 1]]"
@@ -310,6 +320,10 @@ def test_get_flow1(self):
flow = openml.flows.get_flow(1)
assert flow.external_version is None
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_get_flow_reinstantiate_model(self):
@@ -318,10 +332,14 @@ def test_get_flow_reinstantiate_model(self):
flow = extension.model_to_flow(model)
flow.publish(raise_error_if_exists=False)
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True)
- assert isinstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier)
+ assert isinstance(
+ downloaded_flow.model, sklearn.ensemble.RandomForestClassifier
+ )
@pytest.mark.test_server()
def test_get_flow_reinstantiate_model_no_extension(self):
@@ -340,7 +358,9 @@ def test_get_flow_reinstantiate_model_no_extension(self):
reason="Requires scikit-learn!=0.19.1, because target flow is from that version.",
)
@pytest.mark.production_server()
- def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(self):
+ def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(
+ self,
+ ):
self.use_production_server()
flow = 8175
expected = "Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied."
@@ -363,7 +383,9 @@ def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(
@pytest.mark.production_server()
def test_get_flow_reinstantiate_flow_not_strict_post_1(self):
self.use_production_server()
- flow = openml.flows.get_flow(flow_id=19190, reinstantiate=True, strict_version=False)
+ flow = openml.flows.get_flow(
+ flow_id=19190, reinstantiate=True, strict_version=False
+ )
assert flow.flow_id is None
assert "sklearn==1.0.0" not in flow.dependencies
@@ -377,7 +399,9 @@ def test_get_flow_reinstantiate_flow_not_strict_post_1(self):
@pytest.mark.production_server()
def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self):
self.use_production_server()
- flow = openml.flows.get_flow(flow_id=18587, reinstantiate=True, strict_version=False)
+ flow = openml.flows.get_flow(
+ flow_id=18587, reinstantiate=True, strict_version=False
+ )
assert flow.flow_id is None
assert "sklearn==0.23.1" not in flow.dependencies
@@ -389,10 +413,16 @@ def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self):
@pytest.mark.production_server()
def test_get_flow_reinstantiate_flow_not_strict_pre_023(self):
self.use_production_server()
- flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False)
+ flow = openml.flows.get_flow(
+ flow_id=8175, reinstantiate=True, strict_version=False
+ )
assert flow.flow_id is None
assert "sklearn==0.19.1" not in flow.dependencies
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_get_flow_id(self):
@@ -402,13 +432,19 @@ def test_get_flow_id(self):
list_all = functools.lru_cache()(openml.utils._list_all)
with patch("openml.utils._list_all", list_all):
clf = sklearn.tree.DecisionTreeClassifier()
- flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish()
+ flow = (
+ openml.extensions.get_extension_by_model(clf)
+ .model_to_flow(clf)
+ .publish()
+ )
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
TestBase.logger.info(
f"collected from {__file__.split('/')[-1]}: {flow.flow_id}",
)
- assert openml.flows.get_flow_id(model=clf, exact_version=True) == flow.flow_id
+ assert (
+ openml.flows.get_flow_id(model=clf, exact_version=True) == flow.flow_id
+ )
flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
assert flow.flow_id in flow_ids
assert len(flow_ids) > 0
@@ -424,9 +460,13 @@ def test_get_flow_id(self):
exact_version=False,
)
assert flow.flow_id in flow_ids_exact_version_True
- assert set(flow_ids_exact_version_True).issubset(set(flow_ids_exact_version_False))
+ assert set(flow_ids_exact_version_True).issubset(
+ set(flow_ids_exact_version_False)
+ )
# The assertion below was previously used instead of the assertion above.
- pytest.skip(reason="Not sure why there should only be one version of this flow.")
+ pytest.skip(
+ reason="Not sure why there should only be one version of this flow."
+ )
assert flow_ids_exact_version_True == flow_ids_exact_version_False
@pytest.mark.test_server()
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
index f2a81be9f..538fbe59f 100644
--- a/tests/test_openml/test_api_calls.py
+++ b/tests/test_openml/test_api_calls.py
@@ -20,6 +20,10 @@ def test_too_long_uri(self):
with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"):
openml.datasets.list_datasets(data_id=list(range(10000)))
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@unittest.mock.patch("time.sleep")
@unittest.mock.patch("requests.Session")
@pytest.mark.test_server()
@@ -33,11 +38,17 @@ def test_retry_on_database_error(self, Session_class_mock, _):
"Please wait for N seconds and try again.\n"
""
)
- Session_class_mock.return_value.__enter__.return_value.get.return_value = response_mock
- with pytest.raises(openml.exceptions.OpenMLServerException, match="/abc returned code 107"):
+ Session_class_mock.return_value.__enter__.return_value.get.return_value = (
+ response_mock
+ )
+ with pytest.raises(
+ openml.exceptions.OpenMLServerException, match="/abc returned code 107"
+ ):
openml._api_calls._send_request("get", "/abc", {})
- assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20
+ assert (
+ Session_class_mock.return_value.__enter__.return_value.get.call_count == 20
+ )
class FakeObject(NamedTuple):
@@ -124,5 +135,9 @@ def test_authentication_endpoints_requiring_api_key_show_relevant_help_link(
) -> None:
# We need to temporarily disable the API key to test the error message
with openml.config.overwrite_config_context({"apikey": None}):
- with pytest.raises(openml.exceptions.OpenMLAuthenticationError, match=API_TOKEN_HELP_LINK):
- openml._api_calls._perform_api_call(call=endpoint, request_method=method, data=None)
+ with pytest.raises(
+ openml.exceptions.OpenMLAuthenticationError, match=API_TOKEN_HELP_LINK
+ ):
+ openml._api_calls._perform_api_call(
+ call=endpoint, request_method=method, data=None
+ )
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index 22a8bc936..05e8ef1dd 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -48,7 +48,10 @@ def test_tagging(self):
def _test_prediction_data_equal(run, run_prime):
# Determine which attributes are numeric and which not
num_cols = np.array(
- [d_type == "NUMERIC" for _, d_type in run._generate_arff_dict()["attributes"]],
+ [
+ d_type == "NUMERIC"
+ for _, d_type in run._generate_arff_dict()["attributes"]
+ ],
)
# Get run data consistently
# (For run from server, .data_content does not exist)
@@ -66,7 +69,9 @@ def _test_prediction_data_equal(run, run_prime):
def _test_run_obj_equals(self, run, run_prime):
for dictionary in ["evaluations", "fold_evaluations", "sample_evaluations"]:
if getattr(run, dictionary) is not None:
- self.assertDictEqual(getattr(run, dictionary), getattr(run_prime, dictionary))
+ self.assertDictEqual(
+ getattr(run, dictionary), getattr(run_prime, dictionary)
+ )
else:
# should be none or empty
other = getattr(run_prime, dictionary)
@@ -76,7 +81,9 @@ def _test_run_obj_equals(self, run, run_prime):
self._test_prediction_data_equal(run, run_prime)
# Test trace
- run_trace_content = run.trace.trace_to_arff()["data"] if run.trace is not None else None
+ run_trace_content = (
+ run.trace.trace_to_arff()["data"] if run.trace is not None else None
+ )
if run_prime.trace is not None:
run_prime_trace_content = run_prime.trace.trace_to_arff()["data"]
@@ -118,6 +125,10 @@ def _check_array(array, type_):
else:
assert run_prime_trace_content is None
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_to_from_filesystem_vanilla(self):
@@ -153,6 +164,10 @@ def test_to_from_filesystem_vanilla(self):
f"collected from {__file__.split('/')[-1]}: {run_prime.run_id}",
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.flaky()
@pytest.mark.test_server()
@@ -189,14 +204,23 @@ def test_to_from_filesystem_search(self):
f"collected from {__file__.split('/')[-1]}: {run_prime.run_id}",
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_to_from_filesystem_no_model(self):
model = Pipeline(
- [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())],
+ [
+ ("imputer", SimpleImputer(strategy="mean")),
+ ("classifier", DummyClassifier()),
+ ],
)
task = openml.tasks.get_task(119) # diabetes; crossvalidation
- run = openml.runs.run_model_on_task(model=model, task=task, add_local_measures=False)
+ run = openml.runs.run_model_on_task(
+ model=model, task=task, add_local_measures=False
+ )
cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
run.to_filesystem(cache_path, store_model=False)
@@ -265,7 +289,9 @@ def assert_run_prediction_data(task, run, model):
# Check correctness of y_true and y_pred in run
for fold_id in range(n_folds):
# Get data for fold
- _, test_indices = task.get_train_test_split_indices(repeat=0, fold=fold_id, sample=0)
+ _, test_indices = task.get_train_test_split_indices(
+ repeat=0, fold=fold_id, sample=0
+ )
train_mask = np.full(len(X), True)
train_mask[test_indices] = False
@@ -279,7 +305,9 @@ def assert_run_prediction_data(task, run, model):
y_pred = model.fit(X_train, y_train).predict(X_test)
# Get stored data for fold
- saved_fold_data = run.predictions[run.predictions["fold"] == fold_id].sort_values(
+ saved_fold_data = run.predictions[
+ run.predictions["fold"] == fold_id
+ ].sort_values(
by="row_id",
)
saved_y_pred = saved_fold_data["prediction"].values
@@ -295,6 +323,10 @@ def assert_run_prediction_data(task, run, model):
assert_method(y_pred, saved_y_pred)
assert_method(y_test, saved_y_test)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_publish_with_local_loaded_flow(self):
@@ -323,7 +355,9 @@ def test_publish_with_local_loaded_flow(self):
# Make sure that the prediction data stored in the run is correct.
self.assert_run_prediction_data(task, run, clone(model))
- cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
+ cache_path = os.path.join(
+ self.workdir, "runs", str(random.getrandbits(128))
+ )
run.to_filesystem(cache_path)
# obtain run from filesystem
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
@@ -339,6 +373,10 @@ def test_publish_with_local_loaded_flow(self):
assert openml.flows.flow_exists(flow.name, flow.external_version)
openml.runs.get_run(loaded_run.run_id)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
@pytest.mark.skip(reason="https://github.com/openml/openml-python/issues/1586")
@@ -362,7 +400,9 @@ def test_offline_and_online_run_identical(self):
assert not openml.flows.flow_exists(flow.name, flow.external_version)
# Load from filesystem
- cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
+ cache_path = os.path.join(
+ self.workdir, "runs", str(random.getrandbits(128))
+ )
run.to_filesystem(cache_path)
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
@@ -396,5 +436,7 @@ def test_run_setup_string_included_in_xml(self):
assert "oml:setup_string" in run_dict
assert run_dict["oml:setup_string"] == SETUP_STRING
- recreated_run = openml.runs.functions._create_run_from_xml(xml, from_server=False)
+ recreated_run = openml.runs.functions._create_run_from_xml(
+ xml, from_server=False
+ )
assert recreated_run.setup_string == SETUP_STRING
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 3728e0d78..3f7cc12e9 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -40,7 +40,8 @@
OpenMLNotAuthorizedError,
OpenMLServerException,
)
-#from openml.extensions.sklearn import cat, cont
+
+# from openml.extensions.sklearn import cat, cont
from openml.runs.functions import (
_run_task_get_arffcontent,
delete_run,
@@ -132,9 +133,9 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
time.sleep(10)
continue
- assert len(run.evaluations) > 0, (
- "Expect not-None evaluations to always contain elements."
- )
+ assert (
+ len(run.evaluations) > 0
+ ), "Expect not-None evaluations to always contain elements."
return
raise RuntimeError(
@@ -143,7 +144,10 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
)
def _assert_predictions_equal(self, predictions, predictions_prime):
- assert np.array(predictions_prime["data"]).shape == np.array(predictions["data"]).shape
+ assert (
+ np.array(predictions_prime["data"]).shape
+ == np.array(predictions["data"]).shape
+ )
# The original search model does not submit confidence
# bounds, so we cannot compare the arff line
@@ -164,7 +168,9 @@ def _assert_predictions_equal(self, predictions, predictions_prime):
else:
assert val_1 == val_2
- def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed, create_task_obj):
+ def _rerun_model_and_compare_predictions(
+ self, run_id, model_prime, seed, create_task_obj
+ ):
run = openml.runs.get_run(run_id)
# TODO: assert holdout task
@@ -251,9 +257,13 @@ def _perform_run(
"sklearn.pipeline.Pipeline",
]
if Version(sklearn.__version__) < Version("0.22"):
- classes_without_random_state.append("sklearn.linear_model.base.LinearRegression")
+ classes_without_random_state.append(
+ "sklearn.linear_model.base.LinearRegression"
+ )
else:
- classes_without_random_state.append("sklearn.linear_model._base.LinearRegression")
+ classes_without_random_state.append(
+ "sklearn.linear_model._base.LinearRegression"
+ )
def _remove_random_state(flow):
if "random_state" in flow.parameters:
@@ -305,9 +315,12 @@ def _remove_random_state(flow):
flow_server = self.extension.model_to_flow(clf_server)
if flow.class_name not in classes_without_random_state:
- error_msg = "Flow class %s (id=%d) does not have a random state parameter" % (
- flow.class_name,
- flow.flow_id,
+ error_msg = (
+ "Flow class %s (id=%d) does not have a random state parameter"
+ % (
+ flow.class_name,
+ flow.flow_id,
+ )
)
assert "random_state" in flow.parameters, error_msg
# If the flow is initialized from a model without a random
@@ -397,6 +410,10 @@ def _check_sample_evaluations(
assert evaluation > 0
assert evaluation < max_time_allowed
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_regression_on_classif_task(self):
@@ -407,13 +424,18 @@ def test_run_regression_on_classif_task(self):
# internally dataframe is loaded and targets are categorical
# which LinearRegression() cannot handle
with pytest.raises(
- AttributeError, match="'LinearRegression' object has no attribute 'classes_'"
+ AttributeError,
+ match="'LinearRegression' object has no attribute 'classes_'",
):
openml.runs.run_model_on_task(
model=clf,
task=task,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_check_erronous_sklearn_flow_fails(self):
@@ -479,7 +501,9 @@ def determine_grid_size(param_grid):
grid_iterations += determine_grid_size(sub_grid)
return grid_iterations
else:
- raise TypeError("Param Grid should be of type list (GridSearch only) or dict")
+ raise TypeError(
+ "Param Grid should be of type list (GridSearch only) or dict"
+ )
run = self._perform_run(
task_id,
@@ -627,6 +651,10 @@ def _run_and_upload_regression(
sentinel=sentinel,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_logistic_regression(self):
@@ -634,8 +662,14 @@ def test_run_and_upload_logistic_regression(self):
task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
- self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501")
+ self._run_and_upload_classification(
+ lr, task_id, n_missing_vals, n_test_obs, "62501"
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_linear_regression(self):
@@ -656,7 +690,9 @@ def test_run_and_upload_linear_regression(self):
if e.code == 614: # Task already exists
# the exception message contains the task_id that was matched in the format
# 'Task already exists. - matched id(s): [xxxx]'
- task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+ task_id = ast.literal_eval(
+ e.message.split("matched id(s):")[-1].strip()
+ )[0]
else:
raise Exception(repr(e))
# mark to remove the uploaded task
@@ -665,8 +701,14 @@ def test_run_and_upload_linear_regression(self):
n_missing_vals = self.TEST_SERVER_TASK_REGRESSION["n_missing_vals"]
n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"]
- self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501")
+ self._run_and_upload_regression(
+ lr, task_id, n_missing_vals, n_test_obs, "62501"
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_pipeline_dummy_pipeline(self):
@@ -679,8 +721,14 @@ def test_run_and_upload_pipeline_dummy_pipeline(self):
task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
- self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501")
+ self._run_and_upload_classification(
+ pipeline1, task_id, n_missing_vals, n_test_obs, "62501"
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -706,7 +754,9 @@ def get_ct_cf(nominal_indices, numeric_indices):
"nominal",
make_pipeline(
CustomImputer(strategy="most_frequent"),
- sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"),
+ sklearn.preprocessing.OneHotEncoder(
+ handle_unknown="ignore"
+ ),
),
nominal_indices,
),
@@ -782,7 +832,9 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
task_id = self.TEST_SERVER_TASK_MISSING_VALS["task_id"]
n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS["n_missing_vals"]
n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS["n_test_obs"]
- self._run_and_upload_classification(pipeline2, task_id, n_missing_vals, n_test_obs, "62501")
+ self._run_and_upload_classification(
+ pipeline2, task_id, n_missing_vals, n_test_obs, "62501"
+ )
# The warning raised is:
# "The total space of parameters 8 is smaller than n_iter=10.
# Running 8 iterations. For exhaustive searches, use GridSearchCV."
@@ -798,15 +850,24 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
call_count += 1
assert call_count == 3
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_gridsearch(self):
estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ "base_estimator"
+ if Version(sklearn.__version__) < Version("1.4")
+ else "estimator"
)
gridsearch = GridSearchCV(
BaggingClassifier(**{estimator_name: SVC()}),
- {f"{estimator_name}__C": [0.01, 0.1, 10], f"{estimator_name}__gamma": [0.01, 0.1, 10]},
+ {
+ f"{estimator_name}__C": [0.01, 0.1, 10],
+ f"{estimator_name}__gamma": [0.01, 0.1, 10],
+ },
cv=3,
)
task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
@@ -821,6 +882,10 @@ def test_run_and_upload_gridsearch(self):
)
assert len(run.trace.trace_iterations) == 9
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_randomsearch(self):
@@ -854,6 +919,10 @@ def test_run_and_upload_randomsearch(self):
trace = openml.runs.get_run_trace(run.run_id)
assert len(trace.trace_iterations) == 5
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_maskedarrays(self):
@@ -882,6 +951,10 @@ def test_run_and_upload_maskedarrays(self):
##########################################################################
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_learning_curve_task_1(self):
@@ -905,8 +978,14 @@ def test_learning_curve_task_1(self):
pipeline1,
flow_expected_rsv="62501",
)
- self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)
+ self._check_sample_evaluations(
+ run.sample_evaluations, num_repeats, num_folds, num_samples
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_learning_curve_task_2(self):
@@ -942,8 +1021,14 @@ def test_learning_curve_task_2(self):
pipeline2,
flow_expected_rsv="62501",
)
- self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)
+ self._check_sample_evaluations(
+ run.sample_evaluations, num_repeats, num_folds, num_samples
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.21"),
@@ -1023,6 +1108,10 @@ def _test_local_evaluations(self, run):
assert alt_scores[idx] >= 0
assert alt_scores[idx] <= 1
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_local_run_swapped_parameter_order_model(self):
@@ -1039,6 +1128,10 @@ def test_local_run_swapped_parameter_order_model(self):
self._test_local_evaluations(run)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.skip("https://github.com/openml/openml-python/issues/1586")
@unittest.skipIf(
@@ -1108,6 +1201,10 @@ def test_online_run_metric_score(self):
self._test_local_evaluations(run)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1141,7 +1238,9 @@ def test_initialize_model_from_run(self):
if e.code == 614: # Task already exists
# the exception message contains the task_id that was matched in the format
# 'Task already exists. - matched id(s): [xxxx]'
- task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+ task_id = ast.literal_eval(
+ e.message.split("matched id(s):")[-1].strip()
+ )[0]
else:
raise Exception(repr(e))
# mark to remove the uploaded task
@@ -1170,6 +1269,10 @@ def test_initialize_model_from_run(self):
assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"'
assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05"
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1230,6 +1333,10 @@ def test__run_exists(self):
run_ids = run_exists(task.task_id, setup_exists)
assert run_ids, (run_ids, clf)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_with_illegal_flow_id(self):
@@ -1243,13 +1350,19 @@ def test_run_with_illegal_flow_id(self):
expected_message_regex = (
r"Flow does not exist on the server, but 'flow.flow_id' is not None."
)
- with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex):
+ with pytest.raises(
+ openml.exceptions.PyOpenMLError, match=expected_message_regex
+ ):
openml.runs.run_flow_on_task(
task=task,
flow=flow,
avoid_duplicate_runs=True,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_with_illegal_flow_id_after_load(self):
@@ -1277,11 +1390,19 @@ def test_run_with_illegal_flow_id_after_load(self):
expected_message_regex = (
r"Flow does not exist on the server, but 'flow.flow_id' is not None."
)
- with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex):
+ with pytest.raises(
+ openml.exceptions.PyOpenMLError, match=expected_message_regex
+ ):
loaded_run.publish()
TestBase._mark_entity_for_removal("run", loaded_run.run_id)
- TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}")
+ TestBase.logger.info(
+ f"collected from test_run_functions: {loaded_run.run_id}"
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_with_illegal_flow_id_1(self):
@@ -1293,21 +1414,31 @@ def test_run_with_illegal_flow_id_1(self):
try:
flow_orig.publish() # ensures flow exist on server
TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name)
- TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}")
+ TestBase.logger.info(
+ f"collected from test_run_functions: {flow_orig.flow_id}"
+ )
except openml.exceptions.OpenMLServerException:
# flow already exists
pass
flow_new = self.extension.model_to_flow(clf)
flow_new.flow_id = -1
- expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
- with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex):
+ expected_message_regex = (
+ "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
+ )
+ with pytest.raises(
+ openml.exceptions.PyOpenMLError, match=expected_message_regex
+ ):
openml.runs.run_flow_on_task(
task=task,
flow=flow_new,
avoid_duplicate_runs=True,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_with_illegal_flow_id_1_after_load(self):
@@ -1319,7 +1450,9 @@ def test_run_with_illegal_flow_id_1_after_load(self):
try:
flow_orig.publish() # ensures flow exist on server
TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name)
- TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}")
+ TestBase.logger.info(
+ f"collected from test_run_functions: {flow_orig.flow_id}"
+ )
except openml.exceptions.OpenMLServerException:
# flow already exists
pass
@@ -1340,13 +1473,19 @@ def test_run_with_illegal_flow_id_1_after_load(self):
run.to_filesystem(cache_path)
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
- expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
+ expected_message_regex = (
+ "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
+ )
self.assertRaisesRegex(
openml.exceptions.PyOpenMLError,
expected_message_regex,
loaded_run.publish,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1577,6 +1716,10 @@ def test_get_runs_list_by_tag(self):
runs = openml.runs.list_runs(tag="curves", size=2)
assert len(runs) >= 1
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1598,7 +1741,10 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
cont_imp = make_pipeline(CustomImputer(), StandardScaler())
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
model = Pipeline(
- steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())],
+ steps=[
+ ("preprocess", ct),
+ ("estimator", sklearn.tree.DecisionTreeClassifier()),
+ ],
) # build a sklearn classifier
data_content, _, _, _ = _run_task_get_arffcontent(
@@ -1614,6 +1760,10 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
# repeat, fold, row_id, 6 confidences, prediction and correct label
assert len(row) == 12
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1642,7 +1792,10 @@ def test_run_on_dataset_with_missing_labels_array(self):
cont_imp = make_pipeline(CustomImputer(), StandardScaler())
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
model = Pipeline(
- steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())],
+ steps=[
+ ("preprocess", ct),
+ ("estimator", sklearn.tree.DecisionTreeClassifier()),
+ ],
) # build a sklearn classifier
data_content, _, _, _ = _run_task_get_arffcontent(
@@ -1668,6 +1821,10 @@ def test_get_uncached_run(self):
with pytest.raises(openml.exceptions.OpenMLCacheException):
openml.runs.functions._get_cached_run(10)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_flow_on_task_downloaded_flow(self):
@@ -1696,7 +1853,8 @@ def test_format_prediction_non_supervised(self):
clustering = openml.tasks.get_task(126033, download_data=False)
ignored_input = [0] * 5
with pytest.raises(
- NotImplementedError, match=r"Formatting for is not supported."
+ NotImplementedError,
+ match=r"Formatting for is not supported.",
):
format_prediction(clustering, *ignored_input)
@@ -1707,7 +1865,9 @@ def test_format_prediction_classification_no_probabilities(self):
download_data=False,
)
ignored_input = [0] * 5
- with pytest.raises(ValueError, match="`proba` is required for classification task"):
+ with pytest.raises(
+ ValueError, match="`proba` is required for classification task"
+ ):
format_prediction(classification, *ignored_input, proba=None)
@pytest.mark.test_server()
@@ -1718,8 +1878,12 @@ def test_format_prediction_classification_incomplete_probabilities(self):
)
ignored_input = [0] * 5
incomplete_probabilities = {c: 0.2 for c in classification.class_labels[1:]}
- with pytest.raises(ValueError, match="Each class should have a predicted probability"):
- format_prediction(classification, *ignored_input, proba=incomplete_probabilities)
+ with pytest.raises(
+ ValueError, match="Each class should have a predicted probability"
+ ):
+ format_prediction(
+ classification, *ignored_input, proba=incomplete_probabilities
+ )
@pytest.mark.test_server()
def test_format_prediction_task_without_classlabels_set(self):
@@ -1729,16 +1893,24 @@ def test_format_prediction_task_without_classlabels_set(self):
)
classification.class_labels = None
ignored_input = [0] * 5
- with pytest.raises(ValueError, match="The classification task must have class labels set"):
+ with pytest.raises(
+ ValueError, match="The classification task must have class labels set"
+ ):
format_prediction(classification, *ignored_input, proba={})
@pytest.mark.test_server()
def test_format_prediction_task_learning_curve_sample_not_set(self):
- learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation
+ learning_curve = openml.tasks.get_task(
+ 801, download_data=False
+ ) # diabetes;crossvalidation
probabilities = {c: 0.2 for c in learning_curve.class_labels}
ignored_input = [0] * 5
- with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"):
- format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities)
+ with pytest.raises(
+ ValueError, match="`sample` can not be none for LearningCurveTask"
+ ):
+ format_prediction(
+ learning_curve, *ignored_input, sample=None, proba=probabilities
+ )
@pytest.mark.test_server()
def test_format_prediction_task_regression(self):
@@ -1756,7 +1928,9 @@ def test_format_prediction_task_regression(self):
if e.code == 614: # Task already exists
# the exception message contains the task_id that was matched in the format
# 'Task already exists. - matched id(s): [xxxx]'
- task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+ task_id = ast.literal_eval(
+ e.message.split("matched id(s):")[-1].strip()
+ )[0]
else:
raise Exception(repr(e))
# mark to remove the uploaded task
@@ -1786,12 +1960,16 @@ def test_delete_run(self):
task = openml.tasks.get_task(32) # diabetes; crossvalidation
run = openml.runs.run_model_on_task(
- model=clf, task=task, seed=rs,
+ model=clf,
+ task=task,
+ seed=rs,
)
run.publish()
with pytest.raises(openml.exceptions.OpenMLRunsExistError):
- openml.runs.run_model_on_task(model=clf, task=task, seed=rs, avoid_duplicate_runs=True)
+ openml.runs.run_model_on_task(
+ model=clf, task=task, seed=rs, avoid_duplicate_runs=True
+ )
TestBase._mark_entity_for_removal("run", run.run_id)
TestBase.logger.info(f"collected from test_run_functions: {run.run_id}")
@@ -1799,7 +1977,9 @@ def test_delete_run(self):
_run_id = run.run_id
assert delete_run(_run_id)
- @pytest.mark.skip(reason="run id is in problematic state on test server due to PR#1454")
+ @pytest.mark.skip(
+ reason="run id is in problematic state on test server due to PR#1454"
+ )
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
reason="SimpleImputer doesn't handle mixed type DataFrame as input",
@@ -1866,15 +2046,19 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_server_v1, t
assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+@pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.21"),
reason="couldn't perform local tests successfully w/o bloating RAM",
- )
+)
@unittest.skipIf(
Version(sklearn.__version__) >= Version("1.8"),
reason="predictions differ significantly",
- )
+)
@mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
@pytest.mark.test_server()
def test__run_task_get_arffcontent_2(parallel_mock):
@@ -1903,8 +2087,11 @@ def test__run_task_get_arffcontent_2(parallel_mock):
]
)
n_jobs = 2
- backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+ backend = (
+ "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+ )
from openml_sklearn import SklearnExtension
+
extension = SklearnExtension()
with parallel_backend(backend, n_jobs=n_jobs):
res = openml.runs.functions._run_task_get_arffcontent(
@@ -1948,11 +2135,15 @@ def test__run_task_get_arffcontent_2(parallel_mock):
)
+@pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.21"),
reason="couldn't perform local tests successfully w/o bloating RAM",
- )
+)
@mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
@pytest.mark.parametrize(
("n_jobs", "backend", "call_count"),
@@ -1961,18 +2152,28 @@ def test__run_task_get_arffcontent_2(parallel_mock):
# spawns multiple processes if n_jobs != 1, which means the mock is not applied.
(2, None, 0),
(-1, None, 0),
- (1, None, 10), # with n_jobs=1 the mock *is* applied, since there is no new subprocess
+ (
+ 1,
+ None,
+ 10,
+ ), # with n_jobs=1 the mock *is* applied, since there is no new subprocess
(1, "sequential", 10),
(1, "threading", 10),
- (-1, "threading", 10), # the threading backend does preserve mocks even with parallelizing
- ]
+ (
+ -1,
+ "threading",
+ 10,
+ ), # the threading backend does preserve mocks even with parallelizing
+ ],
)
@pytest.mark.test_server()
def test_joblib_backends(parallel_mock, n_jobs, backend, call_count):
"""Tests evaluation of a run using various joblib backends and n_jobs."""
if backend is None:
backend = (
- "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+ "loky"
+ if Version(joblib.__version__) > Version("0.11")
+ else "multiprocessing"
)
task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
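Aside: the `test_joblib_backends` parametrization above encodes the rule that `unittest.mock` patches are only visible to workers sharing the parent interpreter. A standalone sketch of that behaviour, with `math.sqrt` as an illustrative patch target (not part of this patch):

import math
from unittest import mock

import joblib

def calls_sqrt(x):
    # Runs inside a joblib worker; it sees the patched math.sqrt only if
    # the worker lives in the parent process (sequential or threading).
    return math.sqrt(x)

with mock.patch("math.sqrt", return_value=-1.0):
    with joblib.parallel_backend("threading", n_jobs=2):
        results = joblib.Parallel()(joblib.delayed(calls_sqrt)(4) for _ in range(2))

assert results == [-1.0, -1.0]  # the mock survives into threads
# A process-based backend such as "loky" re-imports math in fresh worker
# processes, so the real sqrt would run there and return 2.0 instead.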
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index 0735925f2..da87c0cc9 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -34,6 +34,10 @@ def setUp(self):
self.extension = SklearnExtension()
super().setUp()
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_nonexisting_setup_exists(self):
@@ -45,7 +49,9 @@ def test_nonexisting_setup_exists(self):
flow.name = f"TEST{sentinel}{flow.name}"
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
# although the flow exists (created by the previous statement),
# we can be sure there are no setups (yet) as it was just created
@@ -58,7 +64,9 @@ def _existing_setup_exists(self, classif):
flow.name = f"TEST{get_sentinel()}{flow.name}"
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
# although the flow exists, we can be sure there are no
# setups (yet) as it hasn't been run
@@ -82,6 +90,10 @@ def _existing_setup_exists(self, classif):
setup_id = openml.setups.setup_exists(flow)
assert setup_id == run.setup_id
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_existing_setup_exists_1(self):
@@ -98,12 +110,20 @@ def side_effect(self):
nb = sklearn.naive_bayes.GaussianNB()
self._existing_setup_exists(nb)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_exisiting_setup_exists_2(self):
# Check a flow with one hyperparameter
self._existing_setup_exists(sklearn.naive_bayes.GaussianNB())
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_existing_setup_exists_3(self):
@@ -161,10 +181,14 @@ def test_list_setups_output_format(self):
flow_id = 6794
setups = openml.setups.list_setups(flow=flow_id, size=10)
assert isinstance(setups, dict)
- assert isinstance(setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup)
+ assert isinstance(
+ setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup
+ )
assert len(setups) == 10
- setups = openml.setups.list_setups(flow=flow_id, size=10, output_format="dataframe")
+ setups = openml.setups.list_setups(
+ flow=flow_id, size=10, output_format="dataframe"
+ )
assert isinstance(setups, pd.DataFrame)
assert len(setups) == 10
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
index bf2fcfeae..931855841 100644
--- a/tests/test_tasks/test_task_functions.py
+++ b/tests/test_tasks/test_task_functions.py
@@ -3,17 +3,18 @@
import os
import unittest
-from typing import cast
from unittest import mock
-import pandas as pd
import pytest
import requests
import openml
from openml import OpenMLSplit, OpenMLTask
-from openml.exceptions import OpenMLCacheException, OpenMLNotAuthorizedError, OpenMLServerException
-from openml.tasks import TaskType
+from openml.exceptions import (
+ OpenMLNotAuthorizedError,
+ OpenMLServerException,
+)
+from openml.tasks import TaskType, task
from openml.testing import TestBase, create_request_response
@@ -26,29 +27,6 @@ def setUp(self):
def tearDown(self):
super().tearDown()
- @pytest.mark.test_server()
- def test__get_cached_tasks(self):
- openml.config.set_root_cache_directory(self.static_cache_dir)
- tasks = openml.tasks.functions._get_cached_tasks()
- assert isinstance(tasks, dict)
- assert len(tasks) == 3
- assert isinstance(next(iter(tasks.values())), OpenMLTask)
-
- @pytest.mark.test_server()
- def test__get_cached_task(self):
- openml.config.set_root_cache_directory(self.static_cache_dir)
- task = openml.tasks.functions._get_cached_task(1)
- assert isinstance(task, OpenMLTask)
-
- def test__get_cached_task_not_cached(self):
- openml.config.set_root_cache_directory(self.static_cache_dir)
- self.assertRaisesRegex(
- OpenMLCacheException,
- "Task file for tid 2 not cached",
- openml.tasks.functions._get_cached_task,
- 2,
- )
-
@pytest.mark.test_server()
def test__get_estimation_procedure_list(self):
estimation_procedures = openml.tasks.functions._get_estimation_procedure_list()
@@ -141,7 +119,9 @@ def test_list_tasks_per_type_paginate(self):
@pytest.mark.test_server()
def test__get_task(self):
openml.config.set_root_cache_directory(self.static_cache_dir)
- openml.tasks.get_task(1882)
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ openml.tasks.get_task(1882)
+ mock_request.assert_not_called()
@unittest.skip(
"Please await outcome of discussion: https://github.com/openml/OpenML/issues/776",
@@ -155,21 +135,16 @@ def test__get_task_live(self):
@pytest.mark.test_server()
def test_get_task(self):
- task = openml.tasks.get_task(1, download_data=True) # anneal; crossvalidation
- assert isinstance(task, OpenMLTask)
- assert os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "tasks", "1", "task.xml")
- )
- assert not os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff")
- )
- assert os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset_1.pq")
- )
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ openml.tasks.get_task(1)
+ mock_request.assert_not_called()
@pytest.mark.test_server()
def test_get_task_lazy(self):
- task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation
+ mock_request.assert_not_called()
+
assert isinstance(task, OpenMLTask)
assert os.path.exists(
os.path.join(openml.config.get_cache_directory(), "tasks", "2", "task.xml")
@@ -177,16 +152,25 @@ def test_get_task_lazy(self):
assert task.class_labels == ["1", "2", "3", "4", "5", "U"]
assert not os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff")
+ os.path.join(
+ openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff"
+ )
)
# Since download_data=False is propagated to get_dataset
assert not os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "datasets", "2", "dataset.arff")
+ os.path.join(
+ openml.config.get_cache_directory(), "datasets", "2", "dataset.arff"
+ )
)
- task.download_split()
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ task.download_split()
+ mock_request.assert_not_called()
+
assert os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff")
+ os.path.join(
+ openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff"
+ )
)
@mock.patch("openml.tasks.functions.get_dataset")
@@ -211,7 +195,10 @@ def assert_and_raise(*args, **kwargs):
@pytest.mark.test_server()
def test_get_task_with_cache(self):
openml.config.set_root_cache_directory(self.static_cache_dir)
- task = openml.tasks.get_task(1)
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ task = openml.tasks.get_task(1)
+ mock_request.assert_not_called()
+
assert isinstance(task, OpenMLTask)
@pytest.mark.production_server()
@@ -226,11 +213,15 @@ def test_get_task_different_types(self):
@pytest.mark.test_server()
def test_download_split(self):
- task = openml.tasks.get_task(1) # anneal; crossvalidation
- split = task.download_split()
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ task = openml.tasks.get_task(1) # anneal; crossvalidation
+ split = task.download_split()
+ mock_request.assert_not_called()
assert type(split) == OpenMLSplit
assert os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff")
+ os.path.join(
+ openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff"
+ )
)
def test_deletion_of_cache_dir(self):
@@ -244,14 +235,13 @@ def test_deletion_of_cache_dir(self):
assert not os.path.exists(tid_cache_dir)
-@mock.patch.object(requests.Session, "delete")
-def test_delete_task_not_owned(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
+@mock.patch.object(requests.Session, "request")
+def test_delete_task_not_owned(mock_request, test_files_directory, test_server_v1, test_apikey_v1):
content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_owned.xml"
- mock_delete.return_value = create_request_response(
+ mock_request.return_value = create_request_response(
status_code=412,
content_filepath=content_file,
)
-
with pytest.raises(
OpenMLNotAuthorizedError,
match="The task can not be deleted because it was not uploaded by you.",
@@ -259,14 +249,14 @@ def test_delete_task_not_owned(mock_delete, test_files_directory, test_server_v1
openml.tasks.delete_task(1)
task_url = test_server_v1 + "task/1"
- assert task_url == mock_delete.call_args.args[0]
- assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+ assert task_url == mock_request.call_args.kwargs.get("url")
+ assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key")
-@mock.patch.object(requests.Session, "delete")
-def test_delete_task_with_run(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
+@mock.patch.object(requests.Session, "request")
+def test_delete_task_with_run(mock_request, test_files_directory, test_server_v1, test_apikey_v1):
content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_has_runs.xml"
- mock_delete.return_value = create_request_response(
+ mock_request.return_value = create_request_response(
status_code=412,
content_filepath=content_file,
)
@@ -278,14 +268,14 @@ def test_delete_task_with_run(mock_delete, test_files_directory, test_server_v1,
openml.tasks.delete_task(3496)
task_url = test_server_v1 + "task/3496"
- assert task_url == mock_delete.call_args.args[0]
- assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+ assert task_url == mock_request.call_args.kwargs.get("url")
+ assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key")
-@mock.patch.object(requests.Session, "delete")
-def test_delete_success(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
+@mock.patch.object(requests.Session, "request")
+def test_delete_success(mock_request, test_files_directory, test_server_v1, test_apikey_v1):
content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_successful.xml"
- mock_delete.return_value = create_request_response(
+ mock_request.return_value = create_request_response(
status_code=200,
content_filepath=content_file,
)
@@ -294,14 +284,14 @@ def test_delete_success(mock_delete, test_files_directory, test_server_v1, test_
assert success
task_url = test_server_v1 + "task/361323"
- assert task_url == mock_delete.call_args.args[0]
- assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+ assert task_url == mock_request.call_args.kwargs.get("url")
+ assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key")
-@mock.patch.object(requests.Session, "delete")
-def test_delete_unknown_task(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
+@mock.patch.object(requests.Session, "request")
+def test_delete_unknown_task(mock_request, test_files_directory, test_server_v1, test_apikey_v1):
content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_exist.xml"
- mock_delete.return_value = create_request_response(
+ mock_request.return_value = create_request_response(
status_code=412,
content_filepath=content_file,
)
@@ -313,5 +303,5 @@ def test_delete_unknown_task(mock_delete, test_files_directory, test_server_v1,
openml.tasks.delete_task(9_999_999)
task_url = test_server_v1 + "task/9999999"
- assert task_url == mock_delete.call_args.args[0]
- assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+ assert task_url == mock_request.call_args.kwargs.get("url")
+ assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key")
diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py
index 9316d0876..81c133edc 100644
--- a/tests/test_tasks/test_task_methods.py
+++ b/tests/test_tasks/test_task_methods.py
@@ -6,6 +6,7 @@
import openml
from openml.testing import TestBase
import pytest
+import unittest.mock
# Common methods between tasks
@@ -33,9 +34,13 @@ def test_tagging(self):
assert len(tasks) == 0
@pytest.mark.test_server()
- def test_get_train_and_test_split_indices(self):
+ def test_get_train_and_test_split_indices(self):
openml.config.set_root_cache_directory(self.static_cache_dir)
- task = openml.tasks.get_task(1882)
+
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ task = openml.tasks.get_task(1882)
+ mock_request.assert_not_called()
+
train_indices, test_indices = task.get_train_test_split_indices(0, 0)
assert train_indices[0] == 16
assert train_indices[-1] == 395
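Aside: the cache assertions introduced above share one pattern: patch `requests.sessions.Session.request`, run the call that should be served from the static cache, and verify that no HTTP traffic occurred. A minimal sketch of the pattern in isolation (illustrative task id, assuming the task is already in the local cache):

import unittest.mock

import openml

def assert_served_from_cache(task_id: int) -> None:
    # Session.request underlies every HTTP verb, so patching it intercepts
    # GET and POST alike; any network access would register on the mock.
    with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
        openml.tasks.get_task(task_id)
    mock_request.assert_not_called()

assert_served_from_cache(1882)  # id taken from the cached-task tests above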
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 111ff778c..b74294575 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -50,7 +50,7 @@ def _mocked_perform_api_call(call, request_method):
@pytest.mark.test_server()
def test_list_all():
- openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks)
+ openml.utils._list_all(listing_call=openml.tasks.functions.list_tasks)
@pytest.mark.test_server()
@@ -65,7 +65,7 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server):
# batches and at the same time do as few batches (roundtrips) as possible.
batch_size = min_number_tasks_on_test_server - 1
batches = openml.utils._list_all(
- listing_call=openml.tasks.functions._list_tasks,
+ listing_call=openml._backend.task.list,
batch_size=batch_size,
)
assert len(batches) >= 2
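Aside: the `OPENML_USE_LOCAL_SERVICES` guard added throughout this patch repeats the same four lines in every module. A sketch of defining it once in a shared helper (hypothetical module name, not part of this patch):

# tests/_markers.py (hypothetical)
import os

import pytest

# One definition of the guard; see issue #1657 for the tracked failures.
skip_on_local_services = pytest.mark.skipif(
    os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
    reason="Pending resolution of #1657",
)

Each test module could then apply `@skip_on_local_services` as a plain decorator instead of restating the condition.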