diff --git a/openml/_api/clients/http.py b/openml/_api/clients/http.py index 27eeaac22..2b80023fd 100644 --- a/openml/_api/clients/http.py +++ b/openml/_api/clients/http.py @@ -12,6 +12,7 @@ from typing import Any, cast from urllib.parse import urlencode, urljoin, urlparse +import arff import requests import xmltodict from requests import Response @@ -98,16 +99,32 @@ def _get_body_filename_from_response(self, response: Response) -> str: if "text/xml" in content_type: return "body.xml" + if response.content.startswith(b"PK\x03\x04"): + return "body.zip" + + try: + arff.loads(response.text) + return "body.arff" + except arff.ArffException: + pass + return "body.txt" def _get_body_filename_from_path(self, path: Path) -> str: - if (path / "body.json").exists(): - return "body.json" + candidates = [] + for p in path.iterdir(): + if p.name.startswith("body.") and len(p.suffixes) == 1: + candidates.append(p) - if (path / "body.xml").exists(): - return "body.xml" + if not candidates: + raise FileNotFoundError(f"No body file found in path: {path}") - return "body.txt" + if len(candidates) > 1: + raise FileNotFoundError( + f"Multiple body files found in path: {path} ({[p.name for p in candidates]})" + ) + + return candidates[0].name def load(self, key: str) -> Response: """ @@ -132,6 +149,9 @@ def load(self, key: str) -> Response: """ path = self._key_to_path(key) + if not path.exists(): + raise FileNotFoundError(f"Cache path not found: {path}") + meta_path = path / "meta.json" meta_raw = meta_path.read_bytes() if meta_path.exists() else "{}" meta = json.loads(meta_raw) @@ -141,8 +161,6 @@ def load(self, key: str) -> Response: headers = json.loads(headers_raw) body_path = path / self._get_body_filename_from_path(path) - if not body_path.exists(): - raise FileNotFoundError(f"Incomplete cache at {body_path}") body = body_path.read_bytes() response = Response() @@ -825,3 +843,9 @@ def write_to_file(response: Response, path: Path, encoding: str) -> None: handler = handler or write_to_file handler(response, file_path, encoding) return file_path + + def cache_path_from_url(self, url: str) -> Path: + full_url = urljoin(self.server, url) + key = self.cache.get_key(full_url, params={}) + path = self.cache._key_to_path(key) + return path / self.cache._get_body_filename_from_path(path) diff --git a/openml/_api/resources/base/resources.py b/openml/_api/resources/base/resources.py index 0c60e69de..301483f25 100644 --- a/openml/_api/resources/base/resources.py +++ b/openml/_api/resources/base/resources.py @@ -10,10 +10,13 @@ from .base import ResourceAPI if TYPE_CHECKING: + import pandas as pd + from openml.estimation_procedures import OpenMLEstimationProcedure - from openml.evaluations import OpenMLEvaluation + from openml.evaluations.evaluation import OpenMLEvaluation from openml.flows.flow import OpenMLFlow from openml.setups.setup import OpenMLSetup + from openml.tasks.task import OpenMLTask, TaskType class DatasetAPI(ResourceAPI): @@ -27,6 +30,49 @@ class TaskAPI(ResourceAPI): resource_type: ResourceType = ResourceType.TASK + @abstractmethod + def get( + self, + task_id: int, + ) -> OpenMLTask: + """ + API v1: + GET /task/{task_id} + + API v2: + GET /tasks/{task_id} + """ + ... + + @abstractmethod + def supports_download_splits(self) -> bool: + """Return whether the task API implementation supports split downloads.""" + ... 
+
+    # Task listing (V1 only)
+    @abstractmethod
+    def list(
+        self,
+        limit: int,
+        offset: int,
+        task_type: TaskType | int | None = None,
+        **kwargs: Any,
+    ) -> pd.DataFrame:
+        """
+        List tasks with filters.
+
+        API v1:
+            GET /task/list
+
+        API v2:
+            Not available.
+
+        Returns
+        -------
+        pandas.DataFrame
+        """
+        ...
+

 class EvaluationMeasureAPI(ResourceAPI):
     """Abstract API interface for evaluation measure resources."""
diff --git a/openml/_api/resources/task.py b/openml/_api/resources/task.py
index 1f62aa3f3..3b6f504b9 100644
--- a/openml/_api/resources/task.py
+++ b/openml/_api/resources/task.py
@@ -1,11 +1,351 @@
 from __future__ import annotations

+import warnings
+from typing import Any
+
+import pandas as pd
+import xmltodict
+
+from openml.tasks.functions import _get_estimation_procedure_list
+from openml.tasks.task import (
+    OpenMLClassificationTask,
+    OpenMLClusteringTask,
+    OpenMLLearningCurveTask,
+    OpenMLRegressionTask,
+    OpenMLTask,
+    TaskType,
+)
+
 from .base import ResourceV1API, ResourceV2API, TaskAPI

+
+def _create_task_from_xml(xml: str) -> OpenMLTask:
+    """Create a task given an xml string.
+
+    Parameters
+    ----------
+    xml : string
+        Task xml representation.
+
+    Returns
+    -------
+    OpenMLTask
+    """
+    dic = xmltodict.parse(xml)["oml:task"]
+    estimation_parameters = {}
+    inputs = {}
+    # Due to the unordered structure we obtain, we first have to extract
+    # the possible keys of oml:input; dic["oml:input"] is a list of
+    # OrderedDicts
+
+    # Check if there is a list of inputs
+    if isinstance(dic["oml:input"], list):
+        for input_ in dic["oml:input"]:
+            name = input_["@name"]
+            inputs[name] = input_
+    # Single input case
+    elif isinstance(dic["oml:input"], dict):
+        name = dic["oml:input"]["@name"]
+        inputs[name] = dic["oml:input"]
+
+    evaluation_measures = None
+    if "evaluation_measures" in inputs:
+        evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][
+            "oml:evaluation_measure"
+        ]
+
+    task_type = TaskType(int(dic["oml:task_type_id"]))
+    common_kwargs = {
+        "task_id": dic["oml:task_id"],
+        "task_type": dic["oml:task_type"],
+        "task_type_id": task_type,
+        "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
+        "evaluation_measure": evaluation_measures,
+    }
+    # TODO: add OpenMLClusteringTask?
+    if task_type in (
+        TaskType.SUPERVISED_CLASSIFICATION,
+        TaskType.SUPERVISED_REGRESSION,
+        TaskType.LEARNING_CURVE,
+    ):
+        # Convert some more parameters
+        for parameter in inputs["estimation_procedure"]["oml:estimation_procedure"][
+            "oml:parameter"
+        ]:
+            name = parameter["@name"]
+            text = parameter.get("#text", "")
+            estimation_parameters[name] = text
+
+        common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][
+            "oml:estimation_procedure"
+        ]["oml:type"]
+        common_kwargs["estimation_procedure_id"] = int(
+            inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"]
+        )
+
+        common_kwargs["estimation_parameters"] = estimation_parameters
+        common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"]["oml:target_feature"]
+        common_kwargs["data_splits_url"] = inputs["estimation_procedure"][
+            "oml:estimation_procedure"
+        ]["oml:data_splits_url"]
+
+    cls = {
+        TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
+        TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
+        TaskType.CLUSTERING: OpenMLClusteringTask,
+        TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
+    }.get(task_type)
+    if cls is None:
+        raise NotImplementedError(f"Task type {common_kwargs['task_type']} not supported.")
+    return cls(**common_kwargs)  # type: ignore
+
+
+def _build_url(
+    limit: int, offset: int, task_type: TaskType | int | None, kwargs: dict[str, Any]
+) -> str:
+    api_call = "task/list"
+    if limit is not None:
+        api_call += f"/limit/{limit}"
+    if offset is not None:
+        api_call += f"/offset/{offset}"
+    if task_type is not None:
+        tvalue = task_type.value if isinstance(task_type, TaskType) else task_type
+        api_call += f"/type/{tvalue}"
+    if kwargs is not None:
+        for operator, value in kwargs.items():
+            if value is not None:
+                if operator == "task_id":
+                    value = ",".join([str(int(i)) for i in value])  # noqa: PLW2901
+                api_call += f"/{operator}/{value}"
+    return api_call
+
+
 class TaskV1API(ResourceV1API, TaskAPI):
-    """Version 1 API implementation for task resources."""
+    def get(self, task_id: int) -> OpenMLTask:
+        """Download OpenML task for a given task ID.
+
+        Downloads the task representation.
+
+        Parameters
+        ----------
+        task_id : int
+            The OpenML task id of the task to download.
+
+        Returns
+        -------
+        task: OpenMLTask
+        """
+        if not isinstance(task_id, int):
+            raise TypeError(f"Task id should be an integer, is {type(task_id)}")
+
+        response = self._http.get(f"task/{task_id}", enable_cache=True)
+        return _create_task_from_xml(response.text)
+
+    def supports_download_splits(self) -> bool:
+        return True
+
+    def list(
+        self,
+        limit: int,
+        offset: int,
+        task_type: TaskType | int | None = None,
+        **kwargs: Any,
+    ) -> pd.DataFrame:
+        """
+        Perform the api call to return a number of tasks having the given filters.
+
+        The task_type filter is separate from the other filters because it is
+        stored as task_type in the task description but is named type when
+        used as a filter in the list-tasks call.
+
+        Parameters
+        ----------
+        limit: int
+        offset: int
+        task_type : TaskType, optional
+            Refers to the type of task.
+        kwargs: dict, optional
+            Legal filter operators: tag, task_id (list), data_tag, status, limit,
+            offset, data_id, data_name, number_instances, number_features,
+            number_classes, number_missing_values.
+
+        Returns
+        -------
+        pandas.DataFrame
+        """
+        api_call = _build_url(limit, offset, task_type, kwargs)
+        return self._parse_list_xml(api_call=api_call)
+
+    def _parse_list_xml(self, api_call: str) -> pd.DataFrame:  # noqa: C901, PLR0912
+        """Returns a Pandas DataFrame with information about OpenML tasks.
+
+        Parameters
+        ----------
+        api_call : str
+            The API call specifying which tasks to return.
+
+        Returns
+        -------
+        A Pandas DataFrame with information about OpenML tasks.
+
+        Raises
+        ------
+        ValueError
+            If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml',
+            or has an incorrect value for '@xmlns:oml'.
+        KeyError
+            If an invalid key is found in the XML for a task.
+        """
+        xml_string = self._http.get(api_call).text
+
+        tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
+        # Minimalistic check if the XML is useful
+        if "oml:tasks" not in tasks_dict:
+            raise ValueError(f'Error in return XML, does not contain "oml:tasks": {tasks_dict}')
+
+        if "@xmlns:oml" not in tasks_dict["oml:tasks"]:
+            raise ValueError(
+                f'Error in return XML, does not contain "oml:tasks"/@xmlns:oml: {tasks_dict}'
+            )
+
+        if tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml":
+            raise ValueError(
+                "Error in return XML, value of "
+                '"oml:tasks"/@xmlns:oml is not '
+                f'"http://openml.org/openml": {tasks_dict!s}',
+            )
+
+        assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"])
+
+        tasks = {}
+        procs = _get_estimation_procedure_list()
+        proc_dict = {x["id"]: x for x in procs}
+
+        for task_ in tasks_dict["oml:tasks"]["oml:task"]:
+            tid = None
+            try:
+                tid = int(task_["oml:task_id"])
+                task_type_int = int(task_["oml:task_type_id"])
+                try:
+                    task_type_id = TaskType(task_type_int)
+                except ValueError as e:
+                    warnings.warn(
+                        f"Could not create task type id for {task_type_int} due to error {e}",
+                        RuntimeWarning,
+                        stacklevel=2,
+                    )
+                    continue
+
+                task = {
+                    "tid": tid,
+                    "ttid": task_type_id,
+                    "did": int(task_["oml:did"]),
+                    "name": task_["oml:name"],
+                    "task_type": task_["oml:task_type"],
+                    "status": task_["oml:status"],
+                }
+
+                # Other task inputs
+                for _input in task_.get("oml:input", []):
+                    if _input["@name"] == "estimation_procedure":
+                        task[_input["@name"]] = proc_dict[int(_input["#text"])]["name"]
+                    else:
+                        value = _input.get("#text")
+                        task[_input["@name"]] = value
+
+                # The number of qualities can range from 0 to infinity
+                for quality in task_.get("oml:quality", []):
+                    if "#text" not in quality:
+                        quality_value = 0.0
+                    else:
+                        quality["#text"] = float(quality["#text"])
+                        if abs(int(quality["#text"]) - quality["#text"]) < 0.0000001:
+                            quality["#text"] = int(quality["#text"])
+                        quality_value = quality["#text"]
+                    task[quality["@name"]] = quality_value
+                tasks[tid] = task
+            except KeyError as e:
+                if tid is not None:
+                    warnings.warn(
+                        f"Invalid xml for task {tid}: {e}\nFrom {task_}",
+                        RuntimeWarning,
+                        stacklevel=2,
+                    )
+                else:
+                    warnings.warn(
+                        f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2
+                    )
+
+        return pd.DataFrame.from_dict(tasks, orient="index")
+
+
+def _create_task_from_json(task_json: dict) -> OpenMLTask:
+    task_type_id = TaskType(int(task_json["task_type_id"]))
+
+    inputs = {i["name"]: i for i in task_json.get("input", [])}
+
+    source = inputs["source_data"]["data_set"]
+
+    common_kwargs = {
+        "task_id": int(task_json["id"]),
+        "task_type": task_json["task_type"],
+        "task_type_id": task_type_id,
+        "data_set_id": int(source["data_set_id"]),
+        "evaluation_measure": None,
+    }
+
+    if task_type_id in (
+        TaskType.SUPERVISED_CLASSIFICATION,
+        TaskType.SUPERVISED_REGRESSION,
+        TaskType.LEARNING_CURVE,
+    ):
+        est = inputs.get("estimation_procedure", {}).get("estimation_procedure")
+
+        if est:
+            common_kwargs["estimation_procedure_id"] = int(est["id"])
+            common_kwargs["estimation_procedure_type"] = est["type"]
+            common_kwargs["estimation_parameters"] = {
+                p["name"]: p.get("value") for p in est.get("parameter", [])
+            }
+
+        common_kwargs["target_name"] = source.get("target_feature")
+
+    cls = {
+        TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
+        TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
+        TaskType.CLUSTERING: OpenMLClusteringTask,
+        TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
+    }[task_type_id]
+
+    return cls(**common_kwargs)  # type: ignore


 class TaskV2API(ResourceV2API, TaskAPI):
-    """Version 2 API implementation for task resources."""
+    def get(self, task_id: int) -> OpenMLTask:
+        """Download OpenML task for a given task ID.
+
+        Downloads the task representation.
+
+        Parameters
+        ----------
+        task_id : int
+            The OpenML task id of the task to download.
+
+        Returns
+        -------
+        task: OpenMLTask
+        """
+        response = self._http.get(f"tasks/{task_id}", enable_cache=True)
+        return _create_task_from_json(response.json())
+
+    def list(
+        self,
+        limit: int,  # noqa: ARG002
+        offset: int,  # noqa: ARG002
+        task_type: TaskType | int | None = None,  # noqa: ARG002
+        **kwargs: Any,  # noqa: ARG002
+    ) -> pd.DataFrame:
+        raise self._not_supported(method="list")
+
+    def supports_download_splits(self) -> bool:
+        return False
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index 22fb26f9b..0b8aaecf0 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -1,19 +1,14 @@
 # License: BSD 3-Clause
 from __future__ import annotations

-import os
-import re
 import warnings
 from functools import partial
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import pandas as pd
-import xmltodict

-import openml._api_calls
 import openml.utils
 from openml.datasets import get_dataset
-from openml.exceptions import OpenMLCacheException

 from .task import (
     OpenMLClassificationTask,
@@ -21,55 +16,16 @@
     OpenMLLearningCurveTask,
     OpenMLRegressionTask,
     OpenMLSupervisedTask,
-    OpenMLTask,
     TaskType,
 )

+if TYPE_CHECKING:
+    from .task import (
+        OpenMLTask,
+    )

 TASKS_CACHE_DIR_NAME = "tasks"


-def _get_cached_tasks() -> dict[int, OpenMLTask]:
-    """Return a dict of all the tasks which are cached locally.
-
-    Returns
-    -------
-    tasks : OrderedDict
-        A dict of all the cached tasks. Each task is an instance of
-        OpenMLTask.
-    """
-    task_cache_dir = openml.utils._create_cache_directory(TASKS_CACHE_DIR_NAME)
-    directory_content = os.listdir(task_cache_dir)  # noqa: PTH208
-    directory_content.sort()
-
-    # Find all dataset ids for which we have downloaded the dataset
-    # description
-    tids = (int(did) for did in directory_content if re.match(r"[0-9]*", did))
-    return {tid: _get_cached_task(tid) for tid in tids}
-
-
-def _get_cached_task(tid: int) -> OpenMLTask:
-    """Return a cached task based on the given id.
-
-    Parameters
-    ----------
-    tid : int
-        Id of the task.
- - Returns - ------- - OpenMLTask - """ - tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, tid) - - task_xml_path = tid_cache_dir / "task.xml" - try: - with task_xml_path.open(encoding="utf8") as fh: - return _create_task_from_xml(fh.read()) - except OSError as e: - openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir) - raise OpenMLCacheException(f"Task file for tid {tid} not cached") from e - - def _get_estimation_procedure_list() -> list[dict[str, Any]]: """Return a list of all estimation procedures which are on OpenML. @@ -133,7 +89,7 @@ def list_tasks( # noqa: PLR0913 calculated for the associated dataset, some of these are also returned. """ listing_call = partial( - _list_tasks, + openml._backend.task.list, task_type=task_type, tag=tag, data_tag=data_tag, @@ -152,151 +108,6 @@ def list_tasks( # noqa: PLR0913 return pd.concat(batches) -def _list_tasks( - limit: int, - offset: int, - task_type: TaskType | int | None = None, - **kwargs: Any, -) -> pd.DataFrame: - """ - Perform the api call to return a number of tasks having the given filters. - - Parameters - ---------- - Filter task_type is separated from the other filters because - it is used as task_type in the task description, but it is named - type when used as a filter in list tasks call. - limit: int - offset: int - task_type : TaskType, optional - Refers to the type of task. - kwargs: dict, optional - Legal filter operators: tag, task_id (list), data_tag, status, limit, - offset, data_id, data_name, number_instances, number_features, - number_classes, number_missing_values. - - Returns - ------- - dataframe - """ - api_call = "task/list" - if limit is not None: - api_call += f"/limit/{limit}" - if offset is not None: - api_call += f"/offset/{offset}" - if task_type is not None: - tvalue = task_type.value if isinstance(task_type, TaskType) else task_type - api_call += f"/type/{tvalue}" - if kwargs is not None: - for operator, value in kwargs.items(): - if value is not None: - if operator == "task_id": - value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901 - api_call += f"/{operator}/{value}" - - return __list_tasks(api_call=api_call) - - -def __list_tasks(api_call: str) -> pd.DataFrame: # noqa: C901, PLR0912 - """Returns a Pandas DataFrame with information about OpenML tasks. - - Parameters - ---------- - api_call : str - The API call specifying which tasks to return. - - Returns - ------- - A Pandas DataFrame with information about OpenML tasks. - - Raises - ------ - ValueError - If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml', - or has an incorrect value for '@xmlns:oml'. - KeyError - If an invalid key is found in the XML for a task. 
-    """
-    xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
-    # Minimalistic check if the XML is useful
-    if "oml:tasks" not in tasks_dict:
-        raise ValueError(f'Error in return XML, does not contain "oml:runs": {tasks_dict}')
-
-    if "@xmlns:oml" not in tasks_dict["oml:tasks"]:
-        raise ValueError(
-            f'Error in return XML, does not contain "oml:runs"/@xmlns:oml: {tasks_dict}'
-        )
-
-    if tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml":
-        raise ValueError(
-            "Error in return XML, value of "
-            '"oml:runs"/@xmlns:oml is not '
-            f'"http://openml.org/openml": {tasks_dict!s}',
-        )
-
-    assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"])
-
-    tasks = {}
-    procs = _get_estimation_procedure_list()
-    proc_dict = {x["id"]: x for x in procs}
-
-    for task_ in tasks_dict["oml:tasks"]["oml:task"]:
-        tid = None
-        try:
-            tid = int(task_["oml:task_id"])
-            task_type_int = int(task_["oml:task_type_id"])
-            try:
-                task_type_id = TaskType(task_type_int)
-            except ValueError as e:
-                warnings.warn(
-                    f"Could not create task type id for {task_type_int} due to error {e}",
-                    RuntimeWarning,
-                    stacklevel=2,
-                )
-                continue
-
-            task = {
-                "tid": tid,
-                "ttid": task_type_id,
-                "did": int(task_["oml:did"]),
-                "name": task_["oml:name"],
-                "task_type": task_["oml:task_type"],
-                "status": task_["oml:status"],
-            }
-
-            # Other task inputs
-            for _input in task_.get("oml:input", []):
-                if _input["@name"] == "estimation_procedure":
-                    task[_input["@name"]] = proc_dict[int(_input["#text"])]["name"]
-                else:
-                    value = _input.get("#text")
-                    task[_input["@name"]] = value
-
-            # The number of qualities can range from 0 to infinity
-            for quality in task_.get("oml:quality", []):
-                if "#text" not in quality:
-                    quality_value = 0.0
-                else:
-                    quality["#text"] = float(quality["#text"])
-                    if abs(int(quality["#text"]) - quality["#text"]) < 0.0000001:
-                        quality["#text"] = int(quality["#text"])
-                    quality_value = quality["#text"]
-                task[quality["@name"]] = quality_value
-            tasks[tid] = task
-        except KeyError as e:
-            if tid is not None:
-                warnings.warn(
-                    f"Invalid xml for task {tid}: {e}\nFrom {task_}",
-                    RuntimeWarning,
-                    stacklevel=2,
-                )
-            else:
-                warnings.warn(f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2)
-
-    return pd.DataFrame.from_dict(tasks, orient="index")
-
-
 def get_tasks(
     task_ids: list[int],
     download_data: bool | None = None,
@@ -304,7 +115,7 @@
 ) -> list[OpenMLTask]:
     """Download tasks.

-    This function iterates :meth:`openml.tasks.get_task`.
+    This function iterates :func:`openml.tasks.get_task`.
Parameters ---------- @@ -338,7 +149,11 @@ def get_tasks( tasks = [] for task_id in task_ids: tasks.append( - get_task(task_id, download_data=download_data, download_qualities=download_qualities) + get_task( + task_id, + download_data=download_data, + download_qualities=download_qualities, + ) ) return tasks @@ -373,133 +188,27 @@ def get_task( if not isinstance(task_id, int): raise TypeError(f"Task id should be integer, is {type(task_id)}") - task_cache_directory = openml.utils._create_cache_directory_for_id( - TASKS_CACHE_DIR_NAME, task_id - ) - task_cache_directory_existed = task_cache_directory.exists() - try: - task = _get_task_description(task_id) - dataset = get_dataset(task.dataset_id, **get_dataset_kwargs) - # List of class labels available in dataset description - # Including class labels as part of task meta data handles - # the case where data download was initially disabled - if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): - assert task.target_name is not None, ( - "Supervised tasks must define a target feature before retrieving class labels." - ) - task.class_labels = dataset.retrieve_class_labels(task.target_name) - # Clustering tasks do not have class labels - # and do not offer download_split - if download_splits and isinstance(task, OpenMLSupervisedTask): - task.download_split() - except Exception as e: - if not task_cache_directory_existed: - openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, task_cache_directory) - raise e - - return task - + task = openml._backend.task.get(task_id) + dataset = get_dataset(task.dataset_id, **get_dataset_kwargs) -def _get_task_description(task_id: int) -> OpenMLTask: - try: - return _get_cached_task(task_id) - except OpenMLCacheException: - _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) - xml_file = _cache_dir / "task.xml" - task_xml = openml._api_calls._perform_api_call(f"task/{task_id}", "get") - - with xml_file.open("w", encoding="utf8") as fh: - fh.write(task_xml) - return _create_task_from_xml(task_xml) - - -def _create_task_from_xml(xml: str) -> OpenMLTask: - """Create a task given a xml string. - - Parameters - ---------- - xml : string - Task xml representation. - - Returns - ------- - OpenMLTask - """ - dic = xmltodict.parse(xml)["oml:task"] - estimation_parameters = {} - inputs = {} - # Due to the unordered structure we obtain, we first have to extract - # the possible keys of oml:input; dic["oml:input"] is a list of - # OrderedDicts - - # Check if there is a list of inputs - if isinstance(dic["oml:input"], list): - for input_ in dic["oml:input"]: - name = input_["@name"] - inputs[name] = input_ - # Single input case - elif isinstance(dic["oml:input"], dict): - name = dic["oml:input"]["@name"] - inputs[name] = dic["oml:input"] - - evaluation_measures = None - if "evaluation_measures" in inputs: - evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][ - "oml:evaluation_measure" - ] - - task_type = TaskType(int(dic["oml:task_type_id"])) - common_kwargs = { - "task_id": dic["oml:task_id"], - "task_type": dic["oml:task_type"], - "task_type_id": task_type, - "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"], - "evaluation_measure": evaluation_measures, - } - # TODO: add OpenMLClusteringTask? 
- if task_type in ( - TaskType.SUPERVISED_CLASSIFICATION, - TaskType.SUPERVISED_REGRESSION, - TaskType.LEARNING_CURVE, + if ( + isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)) + and task.target_name is not None ): - # Convert some more parameters - for parameter in inputs["estimation_procedure"]["oml:estimation_procedure"][ - "oml:parameter" - ]: - name = parameter["@name"] - text = parameter.get("#text", "") - estimation_parameters[name] = text - - common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][ - "oml:estimation_procedure" - ]["oml:type"] - common_kwargs["estimation_procedure_id"] = int( - inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"] - ) + task.class_labels = dataset.retrieve_class_labels(task.target_name) - common_kwargs["estimation_parameters"] = estimation_parameters - common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"]["oml:target_feature"] - common_kwargs["data_splits_url"] = inputs["estimation_procedure"][ - "oml:estimation_procedure" - ]["oml:data_splits_url"] - - cls = { - TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask, - TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask, - TaskType.CLUSTERING: OpenMLClusteringTask, - TaskType.LEARNING_CURVE: OpenMLLearningCurveTask, - }.get(task_type) - if cls is None: - raise NotImplementedError( - f"Task type '{common_kwargs['task_type']}' is not supported. " - f"Supported task types: SUPERVISED_CLASSIFICATION," - f"SUPERVISED_REGRESSION, CLUSTERING, LEARNING_CURVE." - f"Please check the OpenML documentation for available task types." - ) - return cls(**common_kwargs) # type: ignore + if download_splits and isinstance(task, OpenMLSupervisedTask): + if openml._backend.task.supports_download_splits(): + task.download_split() + else: + warnings.warn( + "`download_splits` is not yet supported in the v2 API and will be ignored.", + stacklevel=2, + ) + + return task -# TODO(eddiebergman): overload on `task_type` def create_task( task_type: TaskType, dataset_id: int, @@ -587,4 +296,4 @@ def delete_task(task_id: int) -> bool: bool True if the deletion was successful. False otherwise. """ - return openml.utils._delete_entity("task", task_id) + return openml._backend.task.delete(task_id) diff --git a/openml/tasks/task.py b/openml/tasks/task.py index ab3cb3da4..a709fdb45 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -241,6 +241,48 @@ def _parse_publish_response(self, xml_response: dict) -> None: """Parse the id from the xml_response and assign it to self.""" self.task_id = int(xml_response["oml:upload_task"]["oml:id"]) + def publish(self) -> OpenMLTask: + """Publish this task to OpenML server. + + Returns + ------- + self : OpenMLTask + """ + file_elements = self._get_file_elements() + if "description" not in file_elements: + file_elements["description"] = self._to_xml() + task_id = openml._backend.task.publish(path="task", files=file_elements) + self.task_id = task_id + return self + + def push_tag(self, tag: str) -> None: + """Annotates this task with a tag on the server. + + Parameters + ---------- + tag : str + Tag to attach to the task. + """ + if self.task_id is None: + raise openml.exceptions.ObjectNotPublishedError( + "Please publish the task first before being able to tag it." + ) + openml._backend.task.tag(self.task_id, tag) + + def remove_tag(self, tag: str) -> None: + """Removes a tag from this task on the server. + + Parameters + ---------- + tag : str + Tag to remove from the task. 
+ """ + if self.task_id is None: + raise openml.exceptions.ObjectNotPublishedError( + "Please publish the task first before being able to untag it." + ) + openml._backend.task.untag(self.task_id, tag) + class OpenMLSupervisedTask(OpenMLTask, ABC): """OpenML Supervised Classification object. diff --git a/tests/conftest.py b/tests/conftest.py index 1359e6247..0a663af15 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -205,7 +205,7 @@ def _expected_static_cache_state(root_dir: Path) -> list[Path]: _c_root_dir = root_dir / "org" / "openml" / "test" res_paths = [root_dir, _c_root_dir] - for _d in ["datasets", "tasks", "runs"]: + for _d in ["datasets", "runs"]: res_paths.append(_c_root_dir / _d) for _id in ["-1", "2"]: @@ -222,21 +222,21 @@ def _expected_static_cache_state(root_dir: Path) -> list[Path]: res_paths.append(_c_root_dir / "datasets" / "30" / "dataset_30.pq") res_paths.append(_c_root_dir / "runs" / "1" / "description.xml") - for _id in ["1", "3", "1882"]: - tmp_p = _c_root_dir / "tasks" / _id - res_paths.extend( - [ - tmp_p / "datasplits.arff", - tmp_p / "task.xml", - ] - ) - res_paths.extend([ _c_root_dir / "api" / "v1" / "xml" / "setup", _c_root_dir / "api" / "v1" / "xml" / "setup" / "1", _c_root_dir / "api" / "v1" / "xml" / "setup" / "1" / "body.xml", ]) + res_paths.extend([ + _c_root_dir / "api_splits" / "get" / "1882" / "Task_1882_splits.arff" / "body.arff", + _c_root_dir / "api_splits" / "get" / "3" / "Task_3_splits.arff" / "body.arff", + _c_root_dir / "api_splits" / "get" / "1" / "Task_1_splits.arff" / "body.arff", + _c_root_dir / "api" / "v1" / "xml" / "task" / "1882" / "body.xml", + _c_root_dir / "api" / "v1" / "xml" / "task" / "3" / "body.xml", + _c_root_dir / "api" / "v1" / "xml" / "task" / "1" / "body.xml", + ]) + return res_paths @@ -324,8 +324,8 @@ def with_test_cache(test_files_directory, request): openml.config.set_root_cache_directory(_root_cache_directory) if tmp_cache.exists(): shutil.rmtree(tmp_cache) - + @pytest.fixture def static_cache_dir(): return Path(__file__).parent / "files" diff --git a/tests/files/org/openml/test/tasks/1/task.xml b/tests/files/org/openml/test/api/v1/xml/task/1/body.xml similarity index 100% rename from tests/files/org/openml/test/tasks/1/task.xml rename to tests/files/org/openml/test/api/v1/xml/task/1/body.xml diff --git a/tests/files/org/openml/test/tasks/1882/task.xml b/tests/files/org/openml/test/api/v1/xml/task/1882/body.xml similarity index 100% rename from tests/files/org/openml/test/tasks/1882/task.xml rename to tests/files/org/openml/test/api/v1/xml/task/1882/body.xml diff --git a/tests/files/org/openml/test/tasks/3/task.xml b/tests/files/org/openml/test/api/v1/xml/task/3/body.xml similarity index 100% rename from tests/files/org/openml/test/tasks/3/task.xml rename to tests/files/org/openml/test/api/v1/xml/task/3/body.xml diff --git a/tests/files/org/openml/test/tasks/1/datasplits.arff b/tests/files/org/openml/test/api_splits/get/1/Task_1_splits.arff/body.arff similarity index 100% rename from tests/files/org/openml/test/tasks/1/datasplits.arff rename to tests/files/org/openml/test/api_splits/get/1/Task_1_splits.arff/body.arff diff --git a/tests/files/org/openml/test/tasks/1882/datasplits.arff b/tests/files/org/openml/test/api_splits/get/1882/Task_1882_splits.arff/body.arff similarity index 100% rename from tests/files/org/openml/test/tasks/1882/datasplits.arff rename to tests/files/org/openml/test/api_splits/get/1882/Task_1882_splits.arff/body.arff diff --git a/tests/files/org/openml/test/tasks/3/datasplits.arff 
b/tests/files/org/openml/test/api_splits/get/3/Task_3_splits.arff/body.arff
similarity index 100%
rename from tests/files/org/openml/test/tasks/3/datasplits.arff
rename to tests/files/org/openml/test/api_splits/get/3/Task_3_splits.arff/body.arff
diff --git a/tests/test_api/test_task.py b/tests/test_api/test_task.py
new file mode 100644
index 000000000..6cad784b3
--- /dev/null
+++ b/tests/test_api/test_task.py
@@ -0,0 +1,189 @@
+import pytest
+import pandas as pd
+from requests import Session, Response
+from unittest.mock import patch
+
+import openml
+from openml._api.resources.task import TaskV1API, TaskV2API
+from openml.exceptions import OpenMLNotSupportedError
+
+
+@pytest.fixture
+def task_v1(http_client_v1, minio_client) -> TaskV1API:
+    return TaskV1API(http=http_client_v1, minio=minio_client)
+
+
+@pytest.fixture
+def task_v2(http_client_v2, minio_client) -> TaskV2API:
+    return TaskV2API(http=http_client_v2, minio=minio_client)
+
+
+@pytest.mark.uses_test_server()
+def test_v1_list_tasks(task_v1):
+    """Verify V1 list endpoint returns a populated DataFrame."""
+    tasks_df = task_v1.list(limit=5, offset=0)
+    assert isinstance(tasks_df, pd.DataFrame)
+    assert not tasks_df.empty
+    assert "tid" in tasks_df.columns
+
+
+@pytest.mark.uses_test_server()
+def test_v1_get(task_v1):
+    """Verify V1 get endpoint returns a task."""
+    task = task_v1.get(1)
+    assert task is not None
+    assert task.task_id == 1
+
+
+@pytest.mark.uses_test_server()
+def test_v2_list_tasks(task_v2):
+    """Verify V2 list endpoint raises NotSupported."""
+    with pytest.raises(OpenMLNotSupportedError):
+        task_v2.list(limit=5, offset=0)
+
+
+@pytest.mark.uses_test_server()
+def test_v2_get(task_v2):
+    """Verify V2 get endpoint returns a task."""
+    task = task_v2.get(1)
+    assert task is not None
+    assert task.task_id == 1
+
+
+def test_v1_publish(task_v1):
+    resource_name = task_v1.resource_type.value
+    resource_files = {"description": "Resource Description File"}
+    resource_id = 123
+
+    with patch.object(Session, "request") as mock_request:
+        mock_request.return_value = Response()
+        mock_request.return_value.status_code = 200
+        mock_request.return_value._content = (
+            f'<oml:upload_task xmlns:oml="http://openml.org/openml">\n'
+            f"\t<oml:id>{resource_id}</oml:id>\n"
+            f"</oml:upload_task>\n"
+        ).encode("utf-8")
+
+        published_resource_id = task_v1.publish(
+            resource_name,
+            files=resource_files,
+        )
+
+        assert resource_id == published_resource_id
+
+        mock_request.assert_called_once_with(
+            method="POST",
+            url=openml.config.server + resource_name,
+            params={},
+            data={"api_key": openml.config.apikey},
+            headers=openml.config._HEADERS,
+            files=resource_files,
+        )
+
+
+def test_v1_delete(task_v1):
+    resource_name = task_v1.resource_type.value
+    resource_id = 123
+
+    with patch.object(Session, "request") as mock_request:
+        mock_request.return_value = Response()
+        mock_request.return_value.status_code = 200
+        mock_request.return_value._content = (
+            f'<oml:task_delete xmlns:oml="http://openml.org/openml">\n'
+            f"  <oml:id>{resource_id}</oml:id>\n"
+            f"</oml:task_delete>\n"
+        ).encode("utf-8")
+
+        task_v1.delete(resource_id)
+
+        mock_request.assert_called_once_with(
+            method="DELETE",
+            url=(openml.config.server + resource_name + "/" + str(resource_id)),
+            params={"api_key": openml.config.apikey},
+            data={},
+            headers=openml.config._HEADERS,
+            files=None,
+        )
+
+
+def test_v1_tag(task_v1):
+    resource_id = 123
+    resource_tag = "TAG"
+
+    with patch.object(Session, "request") as mock_request:
+        mock_request.return_value = Response()
+        mock_request.return_value.status_code = 200
+        mock_request.return_value._content = (
+            f'<oml:task_tag xmlns:oml="http://openml.org/openml">'
+            f"<oml:id>{resource_id}</oml:id>"
+            f"<oml:tag>{resource_tag}</oml:tag>"
+            f"</oml:task_tag>"
+        ).encode("utf-8")
+
+        tags = task_v1.tag(resource_id, resource_tag)
+
+        assert resource_tag in tags
+
+        mock_request.assert_called_once_with(
+            method="POST",
+            url=(openml.config.server + task_v1.resource_type.value + "/tag"),
+            params={},
+            data={
+                "api_key": openml.config.apikey,
+                "task_id": resource_id,
+                "tag": resource_tag,
+            },
+            headers=openml.config._HEADERS,
+            files=None,
+        )
+
+
+def test_v1_untag(task_v1):
+    resource_id = 123
+    resource_tag = "TAG"
+
+    with patch.object(Session, "request") as mock_request:
+        mock_request.return_value = Response()
+        mock_request.return_value.status_code = 200
+        mock_request.return_value._content = (
+            f'<oml:task_untag xmlns:oml="http://openml.org/openml">'
+            f"<oml:id>{resource_id}</oml:id>"
+            f"</oml:task_untag>"
+        ).encode("utf-8")
+
+        tags = task_v1.untag(resource_id, resource_tag)
+
+        assert resource_tag not in tags
+
+        mock_request.assert_called_once_with(
+            method="POST",
+            url=(openml.config.server + task_v1.resource_type.value + "/untag"),
+            params={},
+            data={
+                "api_key": openml.config.apikey,
+                "task_id": resource_id,
+                "tag": resource_tag,
+            },
+            headers=openml.config._HEADERS,
+            files=None,
+        )
+
+
+def test_v2_publish(task_v2):
+    with pytest.raises(OpenMLNotSupportedError):
+        task_v2.publish(path=None, files=None)
+
+
+def test_v2_delete(task_v2):
+    with pytest.raises(OpenMLNotSupportedError):
+        task_v2.delete(resource_id=None)
+
+
+def test_v2_tag(task_v2):
+    with pytest.raises(OpenMLNotSupportedError):
+        task_v2.tag(resource_id=None, tag=None)
+
+
+def test_v2_untag(task_v2):
+    with pytest.raises(OpenMLNotSupportedError):
+        task_v2.untag(resource_id=None, tag=None)
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 80b0b4215..f885198f1 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -289,7 +289,9 @@ def test_get_dataset_cannot_access_private_data(self):
     @pytest.mark.skip("Need to find dataset name of private dataset")
     def test_dataset_by_name_cannot_access_private_data(self):
         self.use_production_server()
-        self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE")
+        self.assertRaises(
+            OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE"
+        )

     @pytest.mark.test_server()
     def test_get_dataset_lazy_all_functions(self):
@@ -299,7 +301,9 @@ def test_get_dataset_lazy_all_functions(self):

         def ensure_absence_of_real_data():
             assert not os.path.exists(
-                os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")
+                os.path.join(
+                    openml.config.get_cache_directory(), "datasets", "1", "dataset.arff"
+                )
             )

         tag = "test_lazy_tag_%d" % random.randint(1, 1000000)
@@ -404,7 +408,6 @@ def test__download_minio_file_works_with_bucket_subdirectory(self):
             file_destination
         ), "_download_minio_file can download from subdirectories"
-
     @mock.patch("openml._api_calls._download_minio_file")
     @pytest.mark.test_server()
     def test__get_dataset_parquet_is_cached(self, patch):
@@ -524,13 +527,29 @@ def test_deletion_of_cache_dir(self):
     @pytest.mark.test_server()
     def test_deletion_of_cache_dir_faulty_download(self, patch):
         patch.side_effect = Exception("Boom!")
-        self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1)
-        datasets_cache_dir = os.path.join(openml.config.get_cache_directory(), "datasets")
+        self.assertRaisesRegex(
+            Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1
+        )
+        datasets_cache_dir = 
os.path.join( + openml.config.get_cache_directory(), "datasets" + ) assert len(os.listdir(datasets_cache_dir)) == 0 + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.test_server() def test_publish_dataset(self): - arff_file_path = self.static_cache_dir / "org" / "openml" / "test" / "datasets" / "2" / "dataset.arff" + arff_file_path = ( + self.static_cache_dir + / "org" + / "openml" + / "test" + / "datasets" + / "2" + / "dataset.arff" + ) dataset = OpenMLDataset( "anneal", "test", @@ -561,7 +580,9 @@ def test__retrieve_class_labels(self): # Test workaround for string-typed class labels custom_ds = openml.datasets.get_dataset(2) custom_ds.features[31].data_type = "string" - labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name) + labels = custom_ds.retrieve_class_labels( + target_name=custom_ds.features[31].name + ) assert labels == ["COIL", "SHEET"] @pytest.mark.test_server() @@ -682,11 +703,16 @@ def test_attributes_arff_from_df_unknown_dtype(self): for arr, dt in zip(data, dtype): df = pd.DataFrame(arr) err_msg = ( - f"The dtype '{dt}' of the column '0' is not currently " "supported by liac-arff" + f"The dtype '{dt}' of the column '0' is not currently " + "supported by liac-arff" ) with pytest.raises(ValueError, match=err_msg): attributes_arff_from_df(df) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.test_server() def test_create_dataset_numpy(self): data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T @@ -719,8 +745,14 @@ def test_create_dataset_numpy(self): assert ( _get_online_dataset_arff(dataset.id) == dataset._dataset ), "Uploaded arff does not match original one" - assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" + assert ( + _get_online_dataset_format(dataset.id) == "arff" + ), "Wrong format for dataset" + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.test_server() def test_create_dataset_list(self): data = [ @@ -774,8 +806,14 @@ def test_create_dataset_list(self): assert ( _get_online_dataset_arff(dataset.id) == dataset._dataset ), "Uploaded ARFF does not match original one" - assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" + assert ( + _get_online_dataset_format(dataset.id) == "arff" + ), "Wrong format for dataset" + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.test_server() def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix @@ -924,6 +962,10 @@ def test_get_online_dataset_format(self): dataset_id ), "The format of the ARFF files is different" + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.test_server() def test_create_dataset_pandas(self): data = [ @@ -991,7 +1033,9 @@ def test_create_dataset_pandas(self): column_names = ["input1", "input2", "y"] df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names) # meta-information - description = "Synthetic dataset created from a Pandas DataFrame with Sparse columns" + description = ( + "Synthetic dataset created from a Pandas DataFrame with Sparse columns" + ) dataset = openml.datasets.functions.create_dataset( name=name, description=description, @@ -1016,7 +1060,9 @@ def 
test_create_dataset_pandas(self): assert ( _get_online_dataset_arff(dataset.id) == dataset._dataset ), "Uploaded ARFF does not match original one" - assert _get_online_dataset_format(dataset.id) == "sparse_arff", "Wrong format for dataset" + assert ( + _get_online_dataset_format(dataset.id) == "sparse_arff" + ), "Wrong format for dataset" # Check that we can overwrite the attributes data = [["a"], ["b"], ["c"], ["d"], ["e"]] @@ -1046,7 +1092,9 @@ def test_create_dataset_pandas(self): TestBase._mark_entity_for_removal("data", dataset.id) TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {dataset.id}") downloaded_data = _get_online_dataset_arff(dataset.id) - assert downloaded_data == dataset._dataset, "Uploaded ARFF does not match original one" + assert ( + downloaded_data == dataset._dataset + ), "Uploaded ARFF does not match original one" assert "@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data def test_ignore_attributes_dataset(self): @@ -1149,6 +1197,10 @@ def test_ignore_attributes_dataset(self): paper_url=paper_url, ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.test_server() def test_publish_fetch_ignore_attribute(self): """Test to upload and retrieve dataset and check ignore_attributes""" @@ -1268,6 +1320,10 @@ def test_create_dataset_row_id_attribute_error(self): paper_url=paper_url, ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.test_server() def test_create_dataset_row_id_attribute_inference(self): # meta-information @@ -1396,7 +1452,9 @@ def test_get_dataset_cache_format_feather(self): cache_dir = openml.config.get_cache_directory() cache_dir_for_id = os.path.join(cache_dir, "datasets", "128") feather_file = os.path.join(cache_dir_for_id, "dataset.feather") - pickle_file = os.path.join(cache_dir_for_id, "dataset.feather.attributes.pkl.py3") + pickle_file = os.path.join( + cache_dir_for_id, "dataset.feather.attributes.pkl.py3" + ) data = pd.read_feather(feather_file) assert os.path.isfile(feather_file), "Feather file is missing" assert os.path.isfile(pickle_file), "Attributes pickle file is missing" @@ -1436,6 +1494,10 @@ def test_data_edit_non_critical_field(self): edited_dataset = openml.datasets.get_dataset(did) assert edited_dataset.description == desc + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.test_server() def test_data_edit_critical_field(self): # Case 2 @@ -1443,7 +1505,9 @@ def test_data_edit_critical_field(self): # for this, we need to first clone a dataset to do changes did = fork_dataset(1) self._wait_for_dataset_being_processed(did) - result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil") + result = edit_dataset( + did, default_target_attribute="shape", ignore_attribute="oil" + ) assert did == result n_tries = 10 @@ -1451,7 +1515,9 @@ def test_data_edit_critical_field(self): for i in range(n_tries): edited_dataset = openml.datasets.get_dataset(did) try: - assert edited_dataset.default_target_attribute == "shape", edited_dataset + assert ( + edited_dataset.default_target_attribute == "shape" + ), edited_dataset assert edited_dataset.ignore_attribute == ["oil"], edited_dataset break except AssertionError as e: @@ -1459,9 +1525,11 @@ def test_data_edit_critical_field(self): raise e time.sleep(10) # Delete the cache dir to get the newer version of the 
dataset - + shutil.rmtree( - os.path.join(openml.config.get_cache_directory(), "datasets", str(did)), + os.path.join( + openml.config.get_cache_directory(), "datasets", str(did) + ), ) @pytest.mark.test_server() @@ -1488,6 +1556,10 @@ def test_data_edit_requires_valid_dataset(self): description="xor operation dataset", ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.test_server() def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): # Need to own a dataset to be able to edit meta-data @@ -1540,7 +1612,6 @@ def test_data_fork(self): data_id=999999, ) - @pytest.mark.production_server() def test_list_datasets_with_high_size_parameter(self): # Testing on prod since concurrent deletion of uploded datasets make the test fail @@ -1626,7 +1697,9 @@ def test_invalid_attribute_validations( (None, None, ["outlook", "windy"]), ], ) -def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): +def test_valid_attribute_validations( + default_target_attribute, row_id_attribute, ignore_attribute +): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], ["b", "sunny", 80.0, 90.0, "TRUE", "no"], @@ -1726,7 +1799,10 @@ def test_delete_dataset(self): @mock.patch.object(requests.Session, "delete") def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_server_v1, test_apikey_v1): content_file = ( - test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml" + test_files_directory + / "mock_responses" + / "datasets" + / "data_delete_not_owned.xml" ) mock_delete.return_value = create_request_response( status_code=412, @@ -1747,7 +1823,10 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_server @mock.patch.object(requests.Session, "delete") def test_delete_dataset_with_run(mock_delete, test_files_directory, test_server_v1, test_apikey_v1): content_file = ( - test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml" + test_files_directory + / "mock_responses" + / "datasets" + / "data_delete_has_tasks.xml" ) mock_delete.return_value = create_request_response( status_code=412, @@ -1768,7 +1847,10 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_server_ @mock.patch.object(requests.Session, "delete") def test_delete_dataset_success(mock_delete, test_files_directory, test_server_v1, test_apikey_v1): content_file = ( - test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml" + test_files_directory + / "mock_responses" + / "datasets" + / "data_delete_successful.xml" ) mock_delete.return_value = create_request_response( status_code=200, @@ -1786,7 +1868,10 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_server_v @mock.patch.object(requests.Session, "delete") def test_delete_unknown_dataset(mock_delete, test_files_directory, test_server_v1, test_apikey_v1): content_file = ( - test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml" + test_files_directory + / "mock_responses" + / "datasets" + / "data_delete_not_exist.xml" ) mock_delete.return_value = create_request_response( status_code=412, @@ -1956,9 +2041,15 @@ def test_get_dataset_lazy_behavior( with_features=with_features, with_data=with_data, ) - assert dataset.features, "Features should be downloaded on-demand if not during get_dataset" - assert dataset.qualities, "Qualities should be downloaded on-demand if not during 
get_dataset" - assert dataset.get_data(), "Data should be downloaded on-demand if not during get_dataset" + assert ( + dataset.features + ), "Features should be downloaded on-demand if not during get_dataset" + assert ( + dataset.qualities + ), "Qualities should be downloaded on-demand if not during get_dataset" + assert ( + dataset.get_data() + ), "Data should be downloaded on-demand if not during get_dataset" _assert_datasets_retrieved_successfully( [1], with_qualities=True, with_features=True, with_data=True ) @@ -1977,7 +2068,9 @@ def test__get_dataset_parquet_not_cached(): "oml:parquet_url": "http://data.openml.org/dataset20/dataset_20.pq", "oml:id": "20", } - path = _get_dataset_parquet(description, cache_directory=Path(openml.config.get_cache_directory())) + path = _get_dataset_parquet( + description, cache_directory=Path(openml.config.get_cache_directory()) + ) assert isinstance(path, Path), "_get_dataset_parquet returns a path" assert path.is_file(), "_get_dataset_parquet returns path to real file" @@ -1986,7 +2079,10 @@ def test_read_features_from_xml_with_whitespace() -> None: from openml.datasets.dataset import _read_features features_file = ( - Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml" + Path(__file__).parent.parent + / "files" + / "misc" + / "features_with_whitespaces.xml" ) dict = _read_features(features_file) assert dict[1].nominal_values == [" - 50000.", " 50000+."] @@ -1997,7 +2093,7 @@ def test_get_dataset_parquet(requests_mock, test_files_directory, test_server_v1 # Parquet functionality is disabled on the test server # There is no parquet-copy of the test server yet. content_file = ( - test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml" + test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml" ) # While the mocked example is from production, unit tests by default connect to the test server. 
requests_mock.get(test_server_v1 + "data/61", text=content_file.read_text()) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 4e391fd3b..108a05c3f 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -4,6 +4,7 @@ import collections import copy import hashlib +import os import re import os import time @@ -162,12 +163,16 @@ def test_from_xml_to_xml(self): def test_to_xml_from_xml(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) estimator_name = ( - "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + "base_estimator" + if Version(sklearn.__version__) < Version("1.4") + else "estimator" ) boosting = sklearn.ensemble.AdaBoostClassifier( **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, ) - model = sklearn.pipeline.Pipeline(steps=(("scaler", scaler), ("boosting", boosting))) + model = sklearn.pipeline.Pipeline( + steps=(("scaler", scaler), ("boosting", boosting)) + ) flow = self.extension.model_to_flow(model) flow.flow_id = -234 # end of setup @@ -180,6 +185,10 @@ def test_to_xml_from_xml(self): openml.flows.functions.assert_flows_equal(new_flow, flow) assert new_flow is not flow + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_publish_flow(self): @@ -204,7 +213,9 @@ def test_publish_flow(self): flow.publish() TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}") + TestBase.logger.info( + f"collected from {__file__.split('/')[-1]}: {flow.flow_id}" + ) assert isinstance(flow.flow_id, int) @pytest.mark.sklearn() @@ -214,7 +225,9 @@ def test_publish_existing_flow(self, flow_exists_mock): flow = self.extension.model_to_flow(clf) flow_exists_mock.return_value = 1 - with pytest.raises(openml.exceptions.PyOpenMLError, match="OpenMLFlow already exists"): + with pytest.raises( + openml.exceptions.PyOpenMLError, match="OpenMLFlow already exists" + ): flow.publish(raise_error_if_exists=True) TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) @@ -222,6 +235,10 @@ def test_publish_existing_flow(self, flow_exists_mock): f"collected from {__file__.split('/')[-1]}: {flow.flow_id}", ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_publish_flow_with_similar_components(self): @@ -232,7 +249,9 @@ def test_publish_flow_with_similar_components(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}") + TestBase.logger.info( + f"collected from {__file__.split('/')[-1]}: {flow.flow_id}" + ) # For a flow where both components are published together, the upload # date should be equal assert flow.upload_date == flow.components["lr"].upload_date, ( @@ -247,7 +266,9 @@ def test_publish_flow_with_similar_components(self): flow1, sentinel = self._add_sentinel_to_flow_name(flow1, None) flow1.publish() TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow1.flow_id}") + TestBase.logger.info( + f"collected from {__file__.split('/')[-1]}: {flow1.flow_id}" + ) # In order to assign different 
upload times to the flows! time.sleep(1) @@ -259,20 +280,30 @@ def test_publish_flow_with_similar_components(self): flow2, _ = self._add_sentinel_to_flow_name(flow2, sentinel) flow2.publish() TestBase._mark_entity_for_removal("flow", flow2.flow_id, flow2.name) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow2.flow_id}") + TestBase.logger.info( + f"collected from {__file__.split('/')[-1]}: {flow2.flow_id}" + ) # If one component was published before the other, the components in # the flow should have different upload dates assert flow2.upload_date != flow2.components["dt"].upload_date - clf3 = sklearn.ensemble.AdaBoostClassifier(sklearn.tree.DecisionTreeClassifier(max_depth=3)) + clf3 = sklearn.ensemble.AdaBoostClassifier( + sklearn.tree.DecisionTreeClassifier(max_depth=3) + ) flow3 = self.extension.model_to_flow(clf3) flow3, _ = self._add_sentinel_to_flow_name(flow3, sentinel) # Child flow has different parameter. Check for storing the flow # correctly on the server should thus not check the child's parameters! flow3.publish() TestBase._mark_entity_for_removal("flow", flow3.flow_id, flow3.name) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}") + TestBase.logger.info( + f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}" + ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_semi_legal_flow(self): @@ -280,7 +311,9 @@ def test_semi_legal_flow(self): # should not throw error as it contains two differentiable forms of # Bagging i.e., Bagging(Bagging(J48)) and Bagging(J48) estimator_name = ( - "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + "base_estimator" + if Version(sklearn.__version__) < Version("1.4") + else "estimator" ) semi_legal = sklearn.ensemble.BaggingClassifier( **{ @@ -296,7 +329,9 @@ def test_semi_legal_flow(self): flow.publish() TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}") + TestBase.logger.info( + f"collected from {__file__.split('/')[-1]}: {flow.flow_id}" + ) @pytest.mark.sklearn() @mock.patch("openml.flows.functions.get_flow") @@ -383,13 +418,21 @@ def get_sentinel(): flow_id = openml.flows.flow_exists(name, version) assert not flow_id + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_existing_flow_exists(self): # create a flow nb = sklearn.naive_bayes.GaussianNB() - sparse = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" + sparse = ( + "sparse" + if Version(sklearn.__version__) < Version("1.4") + else "sparse_output" + ) ohe_params = {sparse: False, "handle_unknown": "ignore"} if Version(sklearn.__version__) >= Version("0.20"): ohe_params["categories"] = "auto" @@ -424,6 +467,10 @@ def test_existing_flow_exists(self): ) assert downloaded_flow_id == flow.flow_id + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_sklearn_to_upload_to_flow(self): @@ -444,13 +491,20 @@ def test_sklearn_to_upload_to_flow(self): ) fu = sklearn.pipeline.FeatureUnion(transformer_list=[("pca", pca), ("fs", fs)]) estimator_name = ( - "base_estimator" if Version(sklearn.__version__) < 
Version("1.4") else "estimator" + "base_estimator" + if Version(sklearn.__version__) < Version("1.4") + else "estimator" ) boosting = sklearn.ensemble.AdaBoostClassifier( **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, ) model = sklearn.pipeline.Pipeline( - steps=[("ohe", ohe), ("scaler", scaler), ("fu", fu), ("boosting", boosting)], + steps=[ + ("ohe", ohe), + ("scaler", scaler), + ("fu", fu), + ("boosting", boosting), + ], ) parameter_grid = { "boosting__n_estimators": [1, 5, 10, 100], @@ -477,7 +531,9 @@ def test_sklearn_to_upload_to_flow(self): flow.publish() TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}") + TestBase.logger.info( + f"collected from {__file__.split('/')[-1]}: {flow.flow_id}" + ) assert isinstance(flow.flow_id, int) # Check whether we can load the flow again @@ -560,7 +616,10 @@ def test_extract_tags(self): tags = openml.utils.extract_xml_tags("oml:tag", flow_dict) assert tags == ["study_14"] - flow_xml = "OpenmlWeka\n" "weka" + flow_xml = ( + "OpenmlWeka\n" + "weka" + ) flow_dict = xmltodict.parse(flow_xml) tags = openml.utils.extract_xml_tags("oml:tag", flow_dict["oml:flow"]) assert tags == ["OpenmlWeka", "weka"] diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 7a1331c45..f0709bb45 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -6,7 +6,7 @@ import unittest from collections import OrderedDict from multiprocessing.managers import Value - +import os from openml_sklearn import SklearnExtension from packaging.version import Version from unittest import mock @@ -153,7 +153,9 @@ def test_are_flows_equal(self): openml.flows.functions.assert_flows_equal(flow, flow) new_flow = copy.deepcopy(flow) new_flow.parameters["abc"] = 3.0 - self.assertRaises(ValueError, openml.flows.functions.assert_flows_equal, flow, new_flow) + self.assertRaises( + ValueError, openml.flows.functions.assert_flows_equal, flow, new_flow + ) # Now test for components (subflows) parent_flow = copy.deepcopy(flow) @@ -195,24 +197,28 @@ def test_are_flows_equal_ignore_parameter_values(self): ) openml.flows.functions.assert_flows_equal(flow, flow) - openml.flows.functions.assert_flows_equal(flow, flow, ignore_parameter_values=True) + openml.flows.functions.assert_flows_equal( + flow, flow, ignore_parameter_values=True + ) new_flow = copy.deepcopy(flow) new_flow.parameters["a"] = 7 with pytest.raises(ValueError) as excinfo: openml.flows.functions.assert_flows_equal(flow, new_flow) - assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str( - excinfo.value - ) + assert str(paramaters) in str(excinfo.value) and str( + new_flow.parameters + ) in str(excinfo.value) - openml.flows.functions.assert_flows_equal(flow, new_flow, ignore_parameter_values=True) + openml.flows.functions.assert_flows_equal( + flow, new_flow, ignore_parameter_values=True + ) del new_flow.parameters["a"] with pytest.raises(ValueError) as excinfo: openml.flows.functions.assert_flows_equal(flow, new_flow) - assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str( - excinfo.value - ) + assert str(paramaters) in str(excinfo.value) and str( + new_flow.parameters + ) in str(excinfo.value) self.assertRaisesRegex( ValueError, @@ -246,7 +252,9 @@ def test_are_flows_equal_ignore_if_older(self): upload_date=flow_upload_date, ) - assert_flows_equal(flow, flow, 
ignore_parameter_values_on_older_children=flow_upload_date) + assert_flows_equal( + flow, flow, ignore_parameter_values_on_older_children=flow_upload_date + ) assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None) new_flow = copy.deepcopy(flow) new_flow.parameters["a"] = 7 @@ -296,7 +304,9 @@ def test_sklearn_to_flow_list_of_lists(self): self._add_sentinel_to_flow_name(flow) flow.publish() TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}") + TestBase.logger.info( + f"collected from {__file__.split('/')[-1]}: {flow.flow_id}" + ) # Test deserialization works server_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) assert server_flow.parameters["categories"] == "[[0, 1], [0, 1]]" @@ -310,6 +320,10 @@ def test_get_flow1(self): flow = openml.flows.get_flow(1) assert flow.external_version is None + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_get_flow_reinstantiate_model(self): @@ -318,10 +332,14 @@ def test_get_flow_reinstantiate_model(self): flow = extension.model_to_flow(model) flow.publish(raise_error_if_exists=False) TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}") + TestBase.logger.info( + f"collected from {__file__.split('/')[-1]}: {flow.flow_id}" + ) downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) - assert isinstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier) + assert isinstance( + downloaded_flow.model, sklearn.ensemble.RandomForestClassifier + ) @pytest.mark.test_server() def test_get_flow_reinstantiate_model_no_extension(self): @@ -340,7 +358,9 @@ def test_get_flow_reinstantiate_model_no_extension(self): reason="Requires scikit-learn!=0.19.1, because target flow is from that version.", ) @pytest.mark.production_server() - def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(self): + def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception( + self, + ): self.use_production_server() flow = 8175 expected = "Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied." 
@@ -363,7 +383,9 @@ def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception( @pytest.mark.production_server() def test_get_flow_reinstantiate_flow_not_strict_post_1(self): self.use_production_server() - flow = openml.flows.get_flow(flow_id=19190, reinstantiate=True, strict_version=False) + flow = openml.flows.get_flow( + flow_id=19190, reinstantiate=True, strict_version=False + ) assert flow.flow_id is None assert "sklearn==1.0.0" not in flow.dependencies @@ -377,7 +399,9 @@ def test_get_flow_reinstantiate_flow_not_strict_post_1(self): @pytest.mark.production_server() def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self): self.use_production_server() - flow = openml.flows.get_flow(flow_id=18587, reinstantiate=True, strict_version=False) + flow = openml.flows.get_flow( + flow_id=18587, reinstantiate=True, strict_version=False + ) assert flow.flow_id is None assert "sklearn==0.23.1" not in flow.dependencies @@ -389,10 +413,16 @@ def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self): @pytest.mark.production_server() def test_get_flow_reinstantiate_flow_not_strict_pre_023(self): self.use_production_server() - flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False) + flow = openml.flows.get_flow( + flow_id=8175, reinstantiate=True, strict_version=False + ) assert flow.flow_id is None assert "sklearn==0.19.1" not in flow.dependencies + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_get_flow_id(self): @@ -402,13 +432,19 @@ def test_get_flow_id(self): list_all = functools.lru_cache()(openml.utils._list_all) with patch("openml.utils._list_all", list_all): clf = sklearn.tree.DecisionTreeClassifier() - flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish() + flow = ( + openml.extensions.get_extension_by_model(clf) + .model_to_flow(clf) + .publish() + ) TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) TestBase.logger.info( f"collected from {__file__.split('/')[-1]}: {flow.flow_id}", ) - assert openml.flows.get_flow_id(model=clf, exact_version=True) == flow.flow_id + assert ( + openml.flows.get_flow_id(model=clf, exact_version=True) == flow.flow_id + ) flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) assert flow.flow_id in flow_ids assert len(flow_ids) > 0 @@ -424,9 +460,13 @@ def test_get_flow_id(self): exact_version=False, ) assert flow.flow_id in flow_ids_exact_version_True - assert set(flow_ids_exact_version_True).issubset(set(flow_ids_exact_version_False)) + assert set(flow_ids_exact_version_True).issubset( + set(flow_ids_exact_version_False) + ) # instead of the assertion above, the assertion below used to be used. - pytest.skip(reason="Not sure why there should only be one version of this flow.") + pytest.skip( + reason="Not sure why there should only be one version of this flow." 
+            )
             assert flow_ids_exact_version_True == flow_ids_exact_version_False

     @pytest.mark.test_server()
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
index f2a81be9f..538fbe59f 100644
--- a/tests/test_openml/test_api_calls.py
+++ b/tests/test_openml/test_api_calls.py
@@ -6,6 +6,7 @@
 from unittest import mock

 import minio
+import os
 import pytest
 import os
@@ -20,6 +21,10 @@ def test_too_long_uri(self):
         with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"):
             openml.datasets.list_datasets(data_id=list(range(10000)))

+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @unittest.mock.patch("time.sleep")
     @unittest.mock.patch("requests.Session")
     @pytest.mark.test_server()
@@ -33,11 +38,17 @@ def test_retry_on_database_error(self, Session_class_mock, _):
             "Please wait for N seconds and try again.\n"
             ""
         )
-        Session_class_mock.return_value.__enter__.return_value.get.return_value = response_mock
-        with pytest.raises(openml.exceptions.OpenMLServerException, match="/abc returned code 107"):
+        Session_class_mock.return_value.__enter__.return_value.get.return_value = (
+            response_mock
+        )
+        with pytest.raises(
+            openml.exceptions.OpenMLServerException, match="/abc returned code 107"
+        ):
             openml._api_calls._send_request("get", "/abc", {})

-        assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20
+        assert (
+            Session_class_mock.return_value.__enter__.return_value.get.call_count == 20
+        )


 class FakeObject(NamedTuple):
@@ -124,5 +135,9 @@ def test_authentication_endpoints_requiring_api_key_show_relevant_help_link(
 ) -> None:
     # We need to temporarily disable the API key to test the error message
     with openml.config.overwrite_config_context({"apikey": None}):
-        with pytest.raises(openml.exceptions.OpenMLAuthenticationError, match=API_TOKEN_HELP_LINK):
-            openml._api_calls._perform_api_call(call=endpoint, request_method=method, data=None)
+        with pytest.raises(
+            openml.exceptions.OpenMLAuthenticationError, match=API_TOKEN_HELP_LINK
+        ):
+            openml._api_calls._perform_api_call(
+                call=endpoint, request_method=method, data=None
+            )
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index 22a8bc936..05e8ef1dd 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -48,7 +48,10 @@ def test_tagging(self):
     def _test_prediction_data_equal(run, run_prime):
         # Determine which attributes are numeric and which not
         num_cols = np.array(
-            [d_type == "NUMERIC" for _, d_type in run._generate_arff_dict()["attributes"]],
+            [
+                d_type == "NUMERIC"
+                for _, d_type in run._generate_arff_dict()["attributes"]
+            ],
         )
         # Get run data consistently
         # (For run from server, .data_content does not exist)
@@ -66,7 +69,9 @@ def _test_prediction_data_equal(run, run_prime):
     def _test_run_obj_equals(self, run, run_prime):
         for dictionary in ["evaluations", "fold_evaluations", "sample_evaluations"]:
             if getattr(run, dictionary) is not None:
-                self.assertDictEqual(getattr(run, dictionary), getattr(run_prime, dictionary))
+                self.assertDictEqual(
+                    getattr(run, dictionary), getattr(run_prime, dictionary)
+                )
             else:
                 # should be none or empty
                 other = getattr(run_prime, dictionary)
@@ -76,7 +81,9 @@ def _test_run_obj_equals(self, run, run_prime):
         self._test_prediction_data_equal(run, run_prime)

         # Test trace
-        run_trace_content = run.trace.trace_to_arff()["data"] if run.trace is not None else None
+        run_trace_content = (
+            run.trace.trace_to_arff()["data"] if run.trace
is not None else None + ) if run_prime.trace is not None: run_prime_trace_content = run_prime.trace.trace_to_arff()["data"] @@ -118,6 +125,10 @@ def _check_array(array, type_): else: assert run_prime_trace_content is None + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_to_from_filesystem_vanilla(self): @@ -153,6 +164,10 @@ def test_to_from_filesystem_vanilla(self): f"collected from {__file__.split('/')[-1]}: {run_prime.run_id}", ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.flaky() @pytest.mark.test_server() @@ -189,14 +204,23 @@ def test_to_from_filesystem_search(self): f"collected from {__file__.split('/')[-1]}: {run_prime.run_id}", ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_to_from_filesystem_no_model(self): model = Pipeline( - [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())], + [ + ("imputer", SimpleImputer(strategy="mean")), + ("classifier", DummyClassifier()), + ], ) task = openml.tasks.get_task(119) # diabetes; crossvalidation - run = openml.runs.run_model_on_task(model=model, task=task, add_local_measures=False) + run = openml.runs.run_model_on_task( + model=model, task=task, add_local_measures=False + ) cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128))) run.to_filesystem(cache_path, store_model=False) @@ -265,7 +289,9 @@ def assert_run_prediction_data(task, run, model): # Check correctness of y_true and y_pred in run for fold_id in range(n_folds): # Get data for fold - _, test_indices = task.get_train_test_split_indices(repeat=0, fold=fold_id, sample=0) + _, test_indices = task.get_train_test_split_indices( + repeat=0, fold=fold_id, sample=0 + ) train_mask = np.full(len(X), True) train_mask[test_indices] = False @@ -279,7 +305,9 @@ def assert_run_prediction_data(task, run, model): y_pred = model.fit(X_train, y_train).predict(X_test) # Get stored data for fold - saved_fold_data = run.predictions[run.predictions["fold"] == fold_id].sort_values( + saved_fold_data = run.predictions[ + run.predictions["fold"] == fold_id + ].sort_values( by="row_id", ) saved_y_pred = saved_fold_data["prediction"].values @@ -295,6 +323,10 @@ def assert_run_prediction_data(task, run, model): assert_method(y_pred, saved_y_pred) assert_method(y_test, saved_y_test) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_publish_with_local_loaded_flow(self): @@ -323,7 +355,9 @@ def test_publish_with_local_loaded_flow(self): # Make sure that the prediction data stored in the run is correct. 
self.assert_run_prediction_data(task, run, clone(model)) - cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128))) + cache_path = os.path.join( + self.workdir, "runs", str(random.getrandbits(128)) + ) run.to_filesystem(cache_path) # obtain run from filesystem loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path) @@ -339,6 +373,10 @@ def test_publish_with_local_loaded_flow(self): assert openml.flows.flow_exists(flow.name, flow.external_version) openml.runs.get_run(loaded_run.run_id) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() @pytest.mark.skip(reason="https://github.com/openml/openml-python/issues/1586") @@ -362,7 +400,9 @@ def test_offline_and_online_run_identical(self): assert not openml.flows.flow_exists(flow.name, flow.external_version) # Load from filesystem - cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128))) + cache_path = os.path.join( + self.workdir, "runs", str(random.getrandbits(128)) + ) run.to_filesystem(cache_path) loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path) @@ -396,5 +436,7 @@ def test_run_setup_string_included_in_xml(self): assert "oml:setup_string" in run_dict assert run_dict["oml:setup_string"] == SETUP_STRING - recreated_run = openml.runs.functions._create_run_from_xml(xml, from_server=False) + recreated_run = openml.runs.functions._create_run_from_xml( + xml, from_server=False + ) assert recreated_run.setup_string == SETUP_STRING diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 3728e0d78..3f7cc12e9 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -40,7 +40,8 @@ OpenMLNotAuthorizedError, OpenMLServerException, ) -#from openml.extensions.sklearn import cat, cont + +# from openml.extensions.sklearn import cat, cont from openml.runs.functions import ( _run_task_get_arffcontent, delete_run, @@ -132,9 +133,9 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds): time.sleep(10) continue - assert len(run.evaluations) > 0, ( - "Expect not-None evaluations to always contain elements." - ) + assert ( + len(run.evaluations) > 0 + ), "Expect not-None evaluations to always contain elements." 
return raise RuntimeError( @@ -143,7 +144,10 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds): ) def _assert_predictions_equal(self, predictions, predictions_prime): - assert np.array(predictions_prime["data"]).shape == np.array(predictions["data"]).shape + assert ( + np.array(predictions_prime["data"]).shape + == np.array(predictions["data"]).shape + ) # The original search model does not submit confidence # bounds, so we can not compare the arff line @@ -164,7 +168,9 @@ def _assert_predictions_equal(self, predictions, predictions_prime): else: assert val_1 == val_2 - def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed, create_task_obj): + def _rerun_model_and_compare_predictions( + self, run_id, model_prime, seed, create_task_obj + ): run = openml.runs.get_run(run_id) # TODO: assert holdout task @@ -251,9 +257,13 @@ def _perform_run( "sklearn.pipeline.Pipeline", ] if Version(sklearn.__version__) < Version("0.22"): - classes_without_random_state.append("sklearn.linear_model.base.LinearRegression") + classes_without_random_state.append( + "sklearn.linear_model.base.LinearRegression" + ) else: - classes_without_random_state.append("sklearn.linear_model._base.LinearRegression") + classes_without_random_state.append( + "sklearn.linear_model._base.LinearRegression" + ) def _remove_random_state(flow): if "random_state" in flow.parameters: @@ -305,9 +315,12 @@ def _remove_random_state(flow): flow_server = self.extension.model_to_flow(clf_server) if flow.class_name not in classes_without_random_state: - error_msg = "Flow class %s (id=%d) does not have a random state parameter" % ( - flow.class_name, - flow.flow_id, + error_msg = ( + "Flow class %s (id=%d) does not have a random state parameter" + % ( + flow.class_name, + flow.flow_id, + ) ) assert "random_state" in flow.parameters, error_msg # If the flow is initialized from a model without a random @@ -397,6 +410,10 @@ def _check_sample_evaluations( assert evaluation > 0 assert evaluation < max_time_allowed + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_regression_on_classif_task(self): @@ -407,13 +424,18 @@ def test_run_regression_on_classif_task(self): # internally dataframe is loaded and targets are categorical # which LinearRegression() cannot handle with pytest.raises( - AttributeError, match="'LinearRegression' object has no attribute 'classes_'" + AttributeError, + match="'LinearRegression' object has no attribute 'classes_'", ): openml.runs.run_model_on_task( model=clf, task=task, ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_check_erronous_sklearn_flow_fails(self): @@ -479,7 +501,9 @@ def determine_grid_size(param_grid): grid_iterations += determine_grid_size(sub_grid) return grid_iterations else: - raise TypeError("Param Grid should be of type list (GridSearch only) or dict") + raise TypeError( + "Param Grid should be of type list (GridSearch only) or dict" + ) run = self._perform_run( task_id, @@ -627,6 +651,10 @@ def _run_and_upload_regression( sentinel=sentinel, ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_and_upload_logistic_regression(self): @@ -634,8 +662,14 @@ def 
test_run_and_upload_logistic_regression(self): task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"] n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] - self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501") + self._run_and_upload_classification( + lr, task_id, n_missing_vals, n_test_obs, "62501" + ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_and_upload_linear_regression(self): @@ -656,7 +690,9 @@ def test_run_and_upload_linear_regression(self): if e.code == 614: # Task already exists # the exception message contains the task_id that was matched in the format # 'Task already exists. - matched id(s): [xxxx]' - task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0] + task_id = ast.literal_eval( + e.message.split("matched id(s):")[-1].strip() + )[0] else: raise Exception(repr(e)) # mark to remove the uploaded task @@ -665,8 +701,14 @@ def test_run_and_upload_linear_regression(self): n_missing_vals = self.TEST_SERVER_TASK_REGRESSION["n_missing_vals"] n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"] - self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501") + self._run_and_upload_regression( + lr, task_id, n_missing_vals, n_test_obs, "62501" + ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_and_upload_pipeline_dummy_pipeline(self): @@ -679,8 +721,14 @@ def test_run_and_upload_pipeline_dummy_pipeline(self): task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"] n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] - self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501") + self._run_and_upload_classification( + pipeline1, task_id, n_missing_vals, n_test_obs, "62501" + ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -706,7 +754,9 @@ def get_ct_cf(nominal_indices, numeric_indices): "nominal", make_pipeline( CustomImputer(strategy="most_frequent"), - sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"), + sklearn.preprocessing.OneHotEncoder( + handle_unknown="ignore" + ), ), nominal_indices, ), @@ -782,7 +832,9 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock): task_id = self.TEST_SERVER_TASK_MISSING_VALS["task_id"] n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS["n_missing_vals"] n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS["n_test_obs"] - self._run_and_upload_classification(pipeline2, task_id, n_missing_vals, n_test_obs, "62501") + self._run_and_upload_classification( + pipeline2, task_id, n_missing_vals, n_test_obs, "62501" + ) # The warning raised is: # "The total space of parameters 8 is smaller than n_iter=10. # Running 8 iterations. For exhaustive searches, use GridSearchCV." 
@@ -798,15 +850,24 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock): call_count += 1 assert call_count == 3 + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_and_upload_gridsearch(self): estimator_name = ( - "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + "base_estimator" + if Version(sklearn.__version__) < Version("1.4") + else "estimator" ) gridsearch = GridSearchCV( BaggingClassifier(**{estimator_name: SVC()}), - {f"{estimator_name}__C": [0.01, 0.1, 10], f"{estimator_name}__gamma": [0.01, 0.1, 10]}, + { + f"{estimator_name}__C": [0.01, 0.1, 10], + f"{estimator_name}__gamma": [0.01, 0.1, 10], + }, cv=3, ) task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] @@ -821,6 +882,10 @@ def test_run_and_upload_gridsearch(self): ) assert len(run.trace.trace_iterations) == 9 + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_and_upload_randomsearch(self): @@ -854,6 +919,10 @@ def test_run_and_upload_randomsearch(self): trace = openml.runs.get_run_trace(run.run_id) assert len(trace.trace_iterations) == 5 + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_and_upload_maskedarrays(self): @@ -882,6 +951,10 @@ def test_run_and_upload_maskedarrays(self): ########################################################################## + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_learning_curve_task_1(self): @@ -905,8 +978,14 @@ def test_learning_curve_task_1(self): pipeline1, flow_expected_rsv="62501", ) - self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) + self._check_sample_evaluations( + run.sample_evaluations, num_repeats, num_folds, num_samples + ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_learning_curve_task_2(self): @@ -942,8 +1021,14 @@ def test_learning_curve_task_2(self): pipeline2, flow_expected_rsv="62501", ) - self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) + self._check_sample_evaluations( + run.sample_evaluations, num_repeats, num_folds, num_samples + ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), @@ -1023,6 +1108,10 @@ def _test_local_evaluations(self, run): assert alt_scores[idx] >= 0 assert alt_scores[idx] <= 1 + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_local_run_swapped_parameter_order_model(self): @@ -1039,6 +1128,10 @@ def test_local_run_swapped_parameter_order_model(self): self._test_local_evaluations(run) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.skip("https://github.com/openml/openml-python/issues/1586") 
@unittest.skipIf( @@ -1108,6 +1201,10 @@ def test_online_run_metric_score(self): self._test_local_evaluations(run) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1141,7 +1238,9 @@ def test_initialize_model_from_run(self): if e.code == 614: # Task already exists # the exception message contains the task_id that was matched in the format # 'Task already exists. - matched id(s): [xxxx]' - task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0] + task_id = ast.literal_eval( + e.message.split("matched id(s):")[-1].strip() + )[0] else: raise Exception(repr(e)) # mark to remove the uploaded task @@ -1170,6 +1269,10 @@ def test_initialize_model_from_run(self): assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"' assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05" + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1230,6 +1333,10 @@ def test__run_exists(self): run_ids = run_exists(task.task_id, setup_exists) assert run_ids, (run_ids, clf) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_with_illegal_flow_id(self): @@ -1243,13 +1350,19 @@ def test_run_with_illegal_flow_id(self): expected_message_regex = ( r"Flow does not exist on the server, but 'flow.flow_id' is not None." ) - with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex): + with pytest.raises( + openml.exceptions.PyOpenMLError, match=expected_message_regex + ): openml.runs.run_flow_on_task( task=task, flow=flow, avoid_duplicate_runs=True, ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_with_illegal_flow_id_after_load(self): @@ -1277,11 +1390,19 @@ def test_run_with_illegal_flow_id_after_load(self): expected_message_regex = ( r"Flow does not exist on the server, but 'flow.flow_id' is not None." 
) - with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex): + with pytest.raises( + openml.exceptions.PyOpenMLError, match=expected_message_regex + ): loaded_run.publish() TestBase._mark_entity_for_removal("run", loaded_run.run_id) - TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}") + TestBase.logger.info( + f"collected from test_run_functions: {loaded_run.run_id}" + ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_with_illegal_flow_id_1(self): @@ -1293,21 +1414,31 @@ def test_run_with_illegal_flow_id_1(self): try: flow_orig.publish() # ensures flow exist on server TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name) - TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}") + TestBase.logger.info( + f"collected from test_run_functions: {flow_orig.flow_id}" + ) except openml.exceptions.OpenMLServerException: # flow already exists pass flow_new = self.extension.model_to_flow(clf) flow_new.flow_id = -1 - expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'" - with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex): + expected_message_regex = ( + "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'" + ) + with pytest.raises( + openml.exceptions.PyOpenMLError, match=expected_message_regex + ): openml.runs.run_flow_on_task( task=task, flow=flow_new, avoid_duplicate_runs=True, ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_with_illegal_flow_id_1_after_load(self): @@ -1319,7 +1450,9 @@ def test_run_with_illegal_flow_id_1_after_load(self): try: flow_orig.publish() # ensures flow exist on server TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name) - TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}") + TestBase.logger.info( + f"collected from test_run_functions: {flow_orig.flow_id}" + ) except openml.exceptions.OpenMLServerException: # flow already exists pass @@ -1340,13 +1473,19 @@ def test_run_with_illegal_flow_id_1_after_load(self): run.to_filesystem(cache_path) loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path) - expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'" + expected_message_regex = ( + "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'" + ) self.assertRaisesRegex( openml.exceptions.PyOpenMLError, expected_message_regex, loaded_run.publish, ) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1577,6 +1716,10 @@ def test_get_runs_list_by_tag(self): runs = openml.runs.list_runs(tag="curves", size=2) assert len(runs) >= 1 + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1598,7 +1741,10 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): cont_imp = make_pipeline(CustomImputer(), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) model = Pipeline( - steps=[("preprocess", 
ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], + steps=[ + ("preprocess", ct), + ("estimator", sklearn.tree.DecisionTreeClassifier()), + ], ) # build a sklearn classifier data_content, _, _, _ = _run_task_get_arffcontent( @@ -1614,6 +1760,10 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): # repeat, fold, row_id, 6 confidences, prediction and correct label assert len(row) == 12 + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1642,7 +1792,10 @@ def test_run_on_dataset_with_missing_labels_array(self): cont_imp = make_pipeline(CustomImputer(), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) model = Pipeline( - steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], + steps=[ + ("preprocess", ct), + ("estimator", sklearn.tree.DecisionTreeClassifier()), + ], ) # build a sklearn classifier data_content, _, _, _ = _run_task_get_arffcontent( @@ -1668,6 +1821,10 @@ def test_get_uncached_run(self): with pytest.raises(openml.exceptions.OpenMLCacheException): openml.runs.functions._get_cached_run(10) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_run_flow_on_task_downloaded_flow(self): @@ -1696,7 +1853,8 @@ def test_format_prediction_non_supervised(self): clustering = openml.tasks.get_task(126033, download_data=False) ignored_input = [0] * 5 with pytest.raises( - NotImplementedError, match=r"Formatting for is not supported." + NotImplementedError, + match=r"Formatting for is not supported.", ): format_prediction(clustering, *ignored_input) @@ -1707,7 +1865,9 @@ def test_format_prediction_classification_no_probabilities(self): download_data=False, ) ignored_input = [0] * 5 - with pytest.raises(ValueError, match="`proba` is required for classification task"): + with pytest.raises( + ValueError, match="`proba` is required for classification task" + ): format_prediction(classification, *ignored_input, proba=None) @pytest.mark.test_server() @@ -1718,8 +1878,12 @@ def test_format_prediction_classification_incomplete_probabilities(self): ) ignored_input = [0] * 5 incomplete_probabilities = {c: 0.2 for c in classification.class_labels[1:]} - with pytest.raises(ValueError, match="Each class should have a predicted probability"): - format_prediction(classification, *ignored_input, proba=incomplete_probabilities) + with pytest.raises( + ValueError, match="Each class should have a predicted probability" + ): + format_prediction( + classification, *ignored_input, proba=incomplete_probabilities + ) @pytest.mark.test_server() def test_format_prediction_task_without_classlabels_set(self): @@ -1729,16 +1893,24 @@ def test_format_prediction_task_without_classlabels_set(self): ) classification.class_labels = None ignored_input = [0] * 5 - with pytest.raises(ValueError, match="The classification task must have class labels set"): + with pytest.raises( + ValueError, match="The classification task must have class labels set" + ): format_prediction(classification, *ignored_input, proba={}) @pytest.mark.test_server() def test_format_prediction_task_learning_curve_sample_not_set(self): - learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation + learning_curve = openml.tasks.get_task( + 801, 
download_data=False + ) # diabetes;crossvalidation probabilities = {c: 0.2 for c in learning_curve.class_labels} ignored_input = [0] * 5 - with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"): - format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) + with pytest.raises( + ValueError, match="`sample` can not be none for LearningCurveTask" + ): + format_prediction( + learning_curve, *ignored_input, sample=None, proba=probabilities + ) @pytest.mark.test_server() def test_format_prediction_task_regression(self): @@ -1756,7 +1928,9 @@ def test_format_prediction_task_regression(self): if e.code == 614: # Task already exists # the exception message contains the task_id that was matched in the format # 'Task already exists. - matched id(s): [xxxx]' - task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0] + task_id = ast.literal_eval( + e.message.split("matched id(s):")[-1].strip() + )[0] else: raise Exception(repr(e)) # mark to remove the uploaded task @@ -1786,12 +1960,16 @@ def test_delete_run(self): task = openml.tasks.get_task(32) # diabetes; crossvalidation run = openml.runs.run_model_on_task( - model=clf, task=task, seed=rs, + model=clf, + task=task, + seed=rs, ) run.publish() with pytest.raises(openml.exceptions.OpenMLRunsExistError): - openml.runs.run_model_on_task(model=clf, task=task, seed=rs, avoid_duplicate_runs=True) + openml.runs.run_model_on_task( + model=clf, task=task, seed=rs, avoid_duplicate_runs=True + ) TestBase._mark_entity_for_removal("run", run.run_id) TestBase.logger.info(f"collected from test_run_functions: {run.run_id}") @@ -1799,7 +1977,9 @@ def test_delete_run(self): _run_id = run.run_id assert delete_run(_run_id) - @pytest.mark.skip(reason="run id is in problematic state on test server due to PR#1454") + @pytest.mark.skip( + reason="run id is in problematic state on test server due to PR#1454" + ) @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", @@ -1866,15 +2046,19 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_server_v1, t assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key") +@pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", +) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", - ) +) @unittest.skipIf( Version(sklearn.__version__) >= Version("1.8"), reason="predictions differ significantly", - ) +) @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs") @pytest.mark.test_server() def test__run_task_get_arffcontent_2(parallel_mock): @@ -1903,8 +2087,11 @@ def test__run_task_get_arffcontent_2(parallel_mock): ] ) n_jobs = 2 - backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing" + backend = ( + "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing" + ) from openml_sklearn import SklearnExtension + extension = SklearnExtension() with parallel_backend(backend, n_jobs=n_jobs): res = openml.runs.functions._run_task_get_arffcontent( @@ -1948,11 +2135,15 @@ def test__run_task_get_arffcontent_2(parallel_mock): ) +@pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", +) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < 
Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", - ) +) @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs") @pytest.mark.parametrize( ("n_jobs", "backend", "call_count"), @@ -1961,18 +2152,28 @@ def test__run_task_get_arffcontent_2(parallel_mock): # spawns multiple processes if n_jobs != 1, which means the mock is not applied. (2, None, 0), (-1, None, 0), - (1, None, 10), # with n_jobs=1 the mock *is* applied, since there is no new subprocess + ( + 1, + None, + 10, + ), # with n_jobs=1 the mock *is* applied, since there is no new subprocess (1, "sequential", 10), (1, "threading", 10), - (-1, "threading", 10), # the threading backend does preserve mocks even with parallelizing - ] + ( + -1, + "threading", + 10, + ), # the threading backend does preserve mocks even with parallelizing + ], ) @pytest.mark.test_server() def test_joblib_backends(parallel_mock, n_jobs, backend, call_count): """Tests evaluation of a run using various joblib backends and n_jobs.""" if backend is None: backend = ( - "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing" + "loky" + if Version(joblib.__version__) > Version("0.11") + else "multiprocessing" ) task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 0735925f2..da87c0cc9 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -34,6 +34,10 @@ def setUp(self): self.extension = SklearnExtension() super().setUp() + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_nonexisting_setup_exists(self): @@ -45,7 +49,9 @@ def test_nonexisting_setup_exists(self): flow.name = f"TEST{sentinel}{flow.name}" flow.publish() TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}") + TestBase.logger.info( + f"collected from {__file__.split('/')[-1]}: {flow.flow_id}" + ) # although the flow exists (created as of previous statement), # we can be sure there are no setups (yet) as it was just created @@ -58,7 +64,9 @@ def _existing_setup_exists(self, classif): flow.name = f"TEST{get_sentinel()}{flow.name}" flow.publish() TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}") + TestBase.logger.info( + f"collected from {__file__.split('/')[-1]}: {flow.flow_id}" + ) # although the flow exists, we can be sure there are no # setups (yet) as it hasn't been ran @@ -82,6 +90,10 @@ def _existing_setup_exists(self, classif): setup_id = openml.setups.setup_exists(flow) assert setup_id == run.setup_id + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_existing_setup_exists_1(self): @@ -98,12 +110,20 @@ def side_effect(self): nb = sklearn.naive_bayes.GaussianNB() self._existing_setup_exists(nb) + @pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_exisiting_setup_exists_2(self): # Check a flow with one hyperparameter self._existing_setup_exists(sklearn.naive_bayes.GaussianNB()) + 
@pytest.mark.skipif( + os.getenv("OPENML_USE_LOCAL_SERVICES") == "true", + reason="Pending resolution of #1657", + ) @pytest.mark.sklearn() @pytest.mark.test_server() def test_existing_setup_exists_3(self): @@ -161,10 +181,14 @@ def test_list_setups_output_format(self): flow_id = 6794 setups = openml.setups.list_setups(flow=flow_id, size=10) assert isinstance(setups, dict) - assert isinstance(setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup) + assert isinstance( + setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup + ) assert len(setups) == 10 - setups = openml.setups.list_setups(flow=flow_id, size=10, output_format="dataframe") + setups = openml.setups.list_setups( + flow=flow_id, size=10, output_format="dataframe" + ) assert isinstance(setups, pd.DataFrame) assert len(setups) == 10 diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index bf2fcfeae..931855841 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -3,17 +3,18 @@ import os import unittest -from typing import cast from unittest import mock -import pandas as pd import pytest import requests import openml from openml import OpenMLSplit, OpenMLTask -from openml.exceptions import OpenMLCacheException, OpenMLNotAuthorizedError, OpenMLServerException -from openml.tasks import TaskType +from openml.exceptions import ( + OpenMLNotAuthorizedError, + OpenMLServerException, +) +from openml.tasks import TaskType, task from openml.testing import TestBase, create_request_response @@ -26,29 +27,6 @@ def setUp(self): def tearDown(self): super().tearDown() - @pytest.mark.test_server() - def test__get_cached_tasks(self): - openml.config.set_root_cache_directory(self.static_cache_dir) - tasks = openml.tasks.functions._get_cached_tasks() - assert isinstance(tasks, dict) - assert len(tasks) == 3 - assert isinstance(next(iter(tasks.values())), OpenMLTask) - - @pytest.mark.test_server() - def test__get_cached_task(self): - openml.config.set_root_cache_directory(self.static_cache_dir) - task = openml.tasks.functions._get_cached_task(1) - assert isinstance(task, OpenMLTask) - - def test__get_cached_task_not_cached(self): - openml.config.set_root_cache_directory(self.static_cache_dir) - self.assertRaisesRegex( - OpenMLCacheException, - "Task file for tid 2 not cached", - openml.tasks.functions._get_cached_task, - 2, - ) - @pytest.mark.test_server() def test__get_estimation_procedure_list(self): estimation_procedures = openml.tasks.functions._get_estimation_procedure_list() @@ -141,7 +119,9 @@ def test_list_tasks_per_type_paginate(self): @pytest.mark.test_server() def test__get_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) - openml.tasks.get_task(1882) + with unittest.mock.patch("requests.sessions.Session.request") as mock_request: + openml.tasks.get_task(1882) + mock_request.assert_not_called() @unittest.skip( "Please await outcome of discussion: https://github.com/openml/OpenML/issues/776", @@ -155,21 +135,16 @@ def test__get_task_live(self): @pytest.mark.test_server() def test_get_task(self): - task = openml.tasks.get_task(1, download_data=True) # anneal; crossvalidation - assert isinstance(task, OpenMLTask) - assert os.path.exists( - os.path.join(openml.config.get_cache_directory(), "tasks", "1", "task.xml") - ) - assert not os.path.exists( - os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff") - ) - assert os.path.exists( - 
os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset_1.pq") - ) + with unittest.mock.patch("requests.sessions.Session.request") as mock_request: + openml.tasks.get_task(1) + mock_request.assert_not_called() @pytest.mark.test_server() def test_get_task_lazy(self): - task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation + with unittest.mock.patch("requests.sessions.Session.request") as mock_request: + task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation + mock_request.assert_not_called() + assert isinstance(task, OpenMLTask) assert os.path.exists( os.path.join(openml.config.get_cache_directory(), "tasks", "2", "task.xml") @@ -177,16 +152,25 @@ def test_get_task_lazy(self): assert task.class_labels == ["1", "2", "3", "4", "5", "U"] assert not os.path.exists( - os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff") + os.path.join( + openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff" + ) ) # Since the download_data=False is propagated to get_dataset assert not os.path.exists( - os.path.join(openml.config.get_cache_directory(), "datasets", "2", "dataset.arff") + os.path.join( + openml.config.get_cache_directory(), "datasets", "2", "dataset.arff" + ) ) - task.download_split() + with unittest.mock.patch("requests.sessions.Session.request") as mock_request: + task.download_split() + mock_request.assert_not_called() + assert os.path.exists( - os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff") + os.path.join( + openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff" + ) ) @mock.patch("openml.tasks.functions.get_dataset") @@ -211,7 +195,10 @@ def assert_and_raise(*args, **kwargs): @pytest.mark.test_server() def test_get_task_with_cache(self): openml.config.set_root_cache_directory(self.static_cache_dir) - task = openml.tasks.get_task(1) + with unittest.mock.patch("requests.sessions.Session.request") as mock_request: + task = openml.tasks.get_task(1) + mock_request.assert_not_called() + assert isinstance(task, OpenMLTask) @pytest.mark.production_server() @@ -226,11 +213,15 @@ def test_get_task_different_types(self): @pytest.mark.test_server() def test_download_split(self): - task = openml.tasks.get_task(1) # anneal; crossvalidation - split = task.download_split() + with unittest.mock.patch("requests.sessions.Session.request") as mock_request: + task = openml.tasks.get_task(1) # anneal; crossvalidation + split = task.download_split() + mock_request.assert_not_called() assert type(split) == OpenMLSplit assert os.path.exists( - os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff") + os.path.join( + openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff" + ) ) def test_deletion_of_cache_dir(self): @@ -244,14 +235,13 @@ def test_deletion_of_cache_dir(self): assert not os.path.exists(tid_cache_dir) -@mock.patch.object(requests.Session, "delete") -def test_delete_task_not_owned(mock_delete, test_files_directory, test_server_v1, test_apikey_v1): +@mock.patch.object(requests.Session, "request") +def test_delete_task_not_owned(mock_request, test_files_directory, test_server_v1, test_apikey_v1): content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_owned.xml" - mock_delete.return_value = create_request_response( + mock_request.return_value = create_request_response( status_code=412, content_filepath=content_file, ) - with pytest.raises( OpenMLNotAuthorizedError, match="The task 
can not be deleted because it was not uploaded by you.", @@ -259,14 +249,14 @@ def test_delete_task_not_owned(mock_delete, test_files_directory, test_server_v1 openml.tasks.delete_task(1) task_url = test_server_v1 + "task/1" - assert task_url == mock_delete.call_args.args[0] - assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key") + assert task_url == mock_request.call_args.kwargs.get("url") + assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key") -@mock.patch.object(requests.Session, "delete") -def test_delete_task_with_run(mock_delete, test_files_directory, test_server_v1, test_apikey_v1): +@mock.patch.object(requests.Session, "request") +def test_delete_task_with_run(mock_request, test_files_directory, test_server_v1, test_apikey_v1): content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_has_runs.xml" - mock_delete.return_value = create_request_response( + mock_request.return_value = create_request_response( status_code=412, content_filepath=content_file, ) @@ -278,14 +268,14 @@ def test_delete_task_with_run(mock_delete, test_files_directory, test_server_v1, openml.tasks.delete_task(3496) task_url = test_server_v1 + "task/3496" - assert task_url == mock_delete.call_args.args[0] - assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key") + assert task_url == mock_request.call_args.kwargs.get("url") + assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key") -@mock.patch.object(requests.Session, "delete") -def test_delete_success(mock_delete, test_files_directory, test_server_v1, test_apikey_v1): +@mock.patch.object(requests.Session, "request") +def test_delete_success(mock_request, test_files_directory, test_server_v1, test_apikey_v1): content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_successful.xml" - mock_delete.return_value = create_request_response( + mock_request.return_value = create_request_response( status_code=200, content_filepath=content_file, ) @@ -294,14 +284,14 @@ def test_delete_success(mock_delete, test_files_directory, test_server_v1, test_ assert success task_url = test_server_v1 + "task/361323" - assert task_url == mock_delete.call_args.args[0] - assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key") + assert task_url == mock_request.call_args.kwargs.get("url") + assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key") -@mock.patch.object(requests.Session, "delete") -def test_delete_unknown_task(mock_delete, test_files_directory, test_server_v1, test_apikey_v1): +@mock.patch.object(requests.Session, "request") +def test_delete_unknown_task(mock_request, test_files_directory, test_server_v1, test_apikey_v1): content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_exist.xml" - mock_delete.return_value = create_request_response( + mock_request.return_value = create_request_response( status_code=412, content_filepath=content_file, ) @@ -313,5 +303,5 @@ def test_delete_unknown_task(mock_delete, test_files_directory, test_server_v1, openml.tasks.delete_task(9_999_999) task_url = test_server_v1 + "task/9999999" - assert task_url == mock_delete.call_args.args[0] - assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key") + assert task_url == mock_request.call_args.kwargs.get("url") + assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key") diff --git 
a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py
index 9316d0876..81c133edc 100644
--- a/tests/test_tasks/test_task_methods.py
+++ b/tests/test_tasks/test_task_methods.py
@@ -6,6 +6,7 @@
 import openml
 from openml.testing import TestBase
 import pytest
+import unittest.mock


 # Common methods between tasks
@@ -33,9 +34,13 @@ def test_tagging(self):
         assert len(tasks) == 0

     @pytest.mark.test_server()
-    def test_get_train_and_test_split_indices(self):
+    def test_get_train_and_test_split_indices(self):
         openml.config.set_root_cache_directory(self.static_cache_dir)
-        task = openml.tasks.get_task(1882)
+
+        with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+            task = openml.tasks.get_task(1882)
+            mock_request.assert_not_called()
+
         train_indices, test_indices = task.get_train_test_split_indices(0, 0)
         assert train_indices[0] == 16
         assert train_indices[-1] == 395
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 111ff778c..b74294575 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -50,7 +50,7 @@ def _mocked_perform_api_call(call, request_method):

 @pytest.mark.test_server()
 def test_list_all():
-    openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks)
+    openml.utils._list_all(listing_call=openml.tasks.functions.list_tasks)


 @pytest.mark.test_server()
@@ -65,7 +65,7 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server):
     # batches and at the same time do as few batches (roundtrips) as possible.
     batch_size = min_number_tasks_on_test_server - 1
     batches = openml.utils._list_all(
-        listing_call=openml.tasks.functions._list_tasks,
+        listing_call=openml._backend.task.list,
         batch_size=batch_size,
     )
     assert len(batches) >= 2