diff --git a/openml/_api/clients/http.py b/openml/_api/clients/http.py
index 27eeaac22..2b80023fd 100644
--- a/openml/_api/clients/http.py
+++ b/openml/_api/clients/http.py
@@ -12,6 +12,7 @@
from typing import Any, cast
from urllib.parse import urlencode, urljoin, urlparse
+import arff
import requests
import xmltodict
from requests import Response
@@ -98,16 +99,32 @@ def _get_body_filename_from_response(self, response: Response) -> str:
if "text/xml" in content_type:
return "body.xml"
+ if response.content.startswith(b"PK\x03\x04"):
+ return "body.zip"
+
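+        # Content sniffing fallback: liac-arff raises ArffException on
+        # non-ARFF text, so a successful parse marks the body as ARFF.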
+ try:
+ arff.loads(response.text)
+ return "body.arff"
+ except arff.ArffException:
+ pass
+
return "body.txt"
def _get_body_filename_from_path(self, path: Path) -> str:
- if (path / "body.json").exists():
- return "body.json"
+ candidates = []
+ for p in path.iterdir():
+ if p.name.startswith("body.") and len(p.suffixes) == 1:
+ candidates.append(p)
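+        # Exactly one body.* file is expected per cache entry; zero or
+        # several indicate an incomplete or ambiguous cache directory.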
- if (path / "body.xml").exists():
- return "body.xml"
+ if not candidates:
+ raise FileNotFoundError(f"No body file found in path: {path}")
- return "body.txt"
+ if len(candidates) > 1:
+ raise FileNotFoundError(
+ f"Multiple body files found in path: {path} ({[p.name for p in candidates]})"
+ )
+
+ return candidates[0].name
def load(self, key: str) -> Response:
"""
@@ -132,6 +149,9 @@ def load(self, key: str) -> Response:
"""
path = self._key_to_path(key)
+ if not path.exists():
+ raise FileNotFoundError(f"Cache path not found: {path}")
+
meta_path = path / "meta.json"
meta_raw = meta_path.read_bytes() if meta_path.exists() else "{}"
meta = json.loads(meta_raw)
@@ -141,8 +161,6 @@ def load(self, key: str) -> Response:
headers = json.loads(headers_raw)
body_path = path / self._get_body_filename_from_path(path)
- if not body_path.exists():
- raise FileNotFoundError(f"Incomplete cache at {body_path}")
body = body_path.read_bytes()
response = Response()
@@ -825,3 +843,9 @@ def write_to_file(response: Response, path: Path, encoding: str) -> None:
handler = handler or write_to_file
handler(response, file_path, encoding)
return file_path
+
+ def cache_path_from_url(self, url: str) -> Path:
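+        """Return the local cache file holding the body for ``url``.
+
+        Assumes the response was cached with empty query parameters; the
+        body lookup raises FileNotFoundError if the entry is missing.
+        """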
+ full_url = urljoin(self.server, url)
+ key = self.cache.get_key(full_url, params={})
+ path = self.cache._key_to_path(key)
+ return path / self.cache._get_body_filename_from_path(path)
diff --git a/openml/_api/resources/base/resources.py b/openml/_api/resources/base/resources.py
index 0c60e69de..301483f25 100644
--- a/openml/_api/resources/base/resources.py
+++ b/openml/_api/resources/base/resources.py
@@ -10,10 +10,13 @@
from .base import ResourceAPI
if TYPE_CHECKING:
+ import pandas as pd
+
from openml.estimation_procedures import OpenMLEstimationProcedure
- from openml.evaluations import OpenMLEvaluation
+ from openml.evaluations.evaluation import OpenMLEvaluation
from openml.flows.flow import OpenMLFlow
from openml.setups.setup import OpenMLSetup
+ from openml.tasks.task import OpenMLTask, TaskType
class DatasetAPI(ResourceAPI):
@@ -27,6 +30,49 @@ class TaskAPI(ResourceAPI):
resource_type: ResourceType = ResourceType.TASK
+ @abstractmethod
+ def get(
+ self,
+ task_id: int,
+ ) -> OpenMLTask:
+ """
+ API v1:
+ GET /task/{task_id}
+
+ API v2:
+ GET /tasks/{task_id}
+ """
+ ...
+
+ @abstractmethod
+ def supports_download_splits(self) -> bool:
+ """Return whether the task API implementation supports split downloads."""
+ ...
+
+ # Task listing (V1 only)
+ @abstractmethod
+ def list(
+ self,
+ limit: int,
+ offset: int,
+ task_type: TaskType | int | None = None,
+ **kwargs: Any,
+ ) -> pd.DataFrame:
+ """
+ List tasks with filters.
+
+ API v1:
+ GET /task/list
+
+ API v2:
+ Not available.
+
+ Returns
+ -------
+ pandas.DataFrame
+ """
+ ...
+
class EvaluationMeasureAPI(ResourceAPI):
"""Abstract API interface for evaluation measure resources."""
diff --git a/openml/_api/resources/task.py b/openml/_api/resources/task.py
index 1f62aa3f3..3b6f504b9 100644
--- a/openml/_api/resources/task.py
+++ b/openml/_api/resources/task.py
@@ -1,11 +1,353 @@
from __future__ import annotations
+import warnings
+from typing import Any
+
+import pandas as pd
+import xmltodict
+
+from openml.tasks.functions import _get_estimation_procedure_list
+from openml.tasks.task import (
+ OpenMLClassificationTask,
+ OpenMLClusteringTask,
+ OpenMLLearningCurveTask,
+ OpenMLRegressionTask,
+ OpenMLTask,
+ TaskType,
+)
+
from .base import ResourceV1API, ResourceV2API, TaskAPI
+def _create_task_from_xml(xml: str) -> OpenMLTask:
+ """Create a task given a xml string.
+
+ Parameters
+ ----------
+ xml : string
+        Task XML representation.
+
+ Returns
+ -------
+ OpenMLTask
+ """
+ dic = xmltodict.parse(xml)["oml:task"]
+ estimation_parameters = {}
+ inputs = {}
+ # Due to the unordered structure we obtain, we first have to extract
+ # the possible keys of oml:input; dic["oml:input"] is a list of
+ # OrderedDicts
+
+ # Check if there is a list of inputs
+ if isinstance(dic["oml:input"], list):
+ for input_ in dic["oml:input"]:
+ name = input_["@name"]
+ inputs[name] = input_
+ # Single input case
+ elif isinstance(dic["oml:input"], dict):
+ name = dic["oml:input"]["@name"]
+ inputs[name] = dic["oml:input"]
+
+ evaluation_measures = None
+ if "evaluation_measures" in inputs:
+ evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][
+ "oml:evaluation_measure"
+ ]
+
+ task_type = TaskType(int(dic["oml:task_type_id"]))
+ common_kwargs = {
+ "task_id": dic["oml:task_id"],
+ "task_type": dic["oml:task_type"],
+ "task_type_id": task_type,
+ "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
+ "evaluation_measure": evaluation_measures,
+ }
+ # TODO: add OpenMLClusteringTask?
+ if task_type in (
+ TaskType.SUPERVISED_CLASSIFICATION,
+ TaskType.SUPERVISED_REGRESSION,
+ TaskType.LEARNING_CURVE,
+ ):
+ # Convert some more parameters
+ for parameter in inputs["estimation_procedure"]["oml:estimation_procedure"][
+ "oml:parameter"
+ ]:
+ name = parameter["@name"]
+ text = parameter.get("#text", "")
+ estimation_parameters[name] = text
+
+ common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][
+ "oml:estimation_procedure"
+ ]["oml:type"]
+ common_kwargs["estimation_procedure_id"] = int(
+ inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"]
+ )
+
+ common_kwargs["estimation_parameters"] = estimation_parameters
+ common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"]["oml:target_feature"]
+ common_kwargs["data_splits_url"] = inputs["estimation_procedure"][
+ "oml:estimation_procedure"
+ ]["oml:data_splits_url"]
+
+ cls = {
+ TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
+ TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
+ TaskType.CLUSTERING: OpenMLClusteringTask,
+ TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
+ }.get(task_type)
+ if cls is None:
+ raise NotImplementedError(f"Task type {common_kwargs['task_type']} not supported.")
+ return cls(**common_kwargs) # type: ignore
+
+
+def _build_url(
+ limit: int, offset: int, task_type: TaskType | int | None, kwargs: dict[str, Any]
+) -> str:
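+    """Assemble a v1 listing call, e.g. ``task/list/limit/5/offset/0/type/1``."""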
+ api_call = "task/list"
+ if limit is not None:
+ api_call += f"/limit/{limit}"
+ if offset is not None:
+ api_call += f"/offset/{offset}"
+ if task_type is not None:
+ tvalue = task_type.value if isinstance(task_type, TaskType) else task_type
+ api_call += f"/type/{tvalue}"
+ if kwargs is not None:
+ for operator, value in kwargs.items():
+ if value is not None:
+ if operator == "task_id":
+ value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901
+ api_call += f"/{operator}/{value}"
+ return api_call
+
+
class TaskV1API(ResourceV1API, TaskAPI):
- """Version 1 API implementation for task resources."""
+ def get(self, task_id: int) -> OpenMLTask:
+ """Download OpenML task for a given task ID.
+
+ Downloads the task representation.
+
+ Parameters
+ ----------
+ task_id : int
+ The OpenML task id of the task to download.
+
+ Returns
+ -------
+ task: OpenMLTask
+ """
+ if not isinstance(task_id, int):
+ raise TypeError(f"Task id should be integer, is {type(task_id)}")
+
+ response = self._http.get(f"task/{task_id}", enable_cache=True)
+ return _create_task_from_xml(response.text)
+
+ def supports_download_splits(self) -> bool:
+ return True
+
+ def list(
+ self,
+ limit: int,
+ offset: int,
+ task_type: TaskType | int | None = None,
+ **kwargs: Any,
+ ) -> pd.DataFrame:
+ """
+ Perform the api call to return a number of tasks having the given filters.
+
+ Parameters
+ ----------
+ Filter task_type is separated from the other filters because
+ it is used as task_type in the task description, but it is named
+ type when used as a filter in list tasks call.
+ limit: int
+ offset: int
+ task_type : TaskType, optional
+ Refers to the type of task.
+ kwargs: dict, optional
+ Legal filter operators: tag, task_id (list), data_tag, status, limit,
+ offset, data_id, data_name, number_instances, number_features,
+ number_classes, number_missing_values.
+
+ Returns
+ -------
+ dataframe
+ """
+ api_call = _build_url(limit, offset, task_type, kwargs)
+ return self._parse_list_xml(api_call=api_call)
+
+ def _parse_list_xml(self, api_call: str) -> pd.DataFrame: # noqa: C901, PLR0912
+ """Returns a Pandas DataFrame with information about OpenML tasks.
+
+ Parameters
+ ----------
+ api_call : str
+ The API call specifying which tasks to return.
+
+ Returns
+ -------
+ A Pandas DataFrame with information about OpenML tasks.
+
+ Raises
+ ------
+ ValueError
+ If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml',
+ or has an incorrect value for '@xmlns:oml'.
+ KeyError
+ If an invalid key is found in the XML for a task.
+ """
+ xml_string = self._http.get(api_call).text
+
+ tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
+ # Minimalistic check if the XML is useful
+ if "oml:tasks" not in tasks_dict:
+ raise ValueError(f'Error in return XML, does not contain "oml:runs": {tasks_dict}')
+
+ if "@xmlns:oml" not in tasks_dict["oml:tasks"]:
+ raise ValueError(
+ f'Error in return XML, does not contain "oml:runs"/@xmlns:oml: {tasks_dict}'
+ )
+
+ if tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml":
+ raise ValueError(
+ "Error in return XML, value of "
+ '"oml:runs"/@xmlns:oml is not '
+ f'"http://openml.org/openml": {tasks_dict!s}',
+ )
+
+ assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"])
+
+ tasks = {}
+ procs = _get_estimation_procedure_list()
+ proc_dict = {x["id"]: x for x in procs}
+
+ for task_ in tasks_dict["oml:tasks"]["oml:task"]:
+ tid = None
+ try:
+ tid = int(task_["oml:task_id"])
+ task_type_int = int(task_["oml:task_type_id"])
+ try:
+ task_type_id = TaskType(task_type_int)
+ except ValueError as e:
+ warnings.warn(
+ f"Could not create task type id for {task_type_int} due to error {e}",
+ RuntimeWarning,
+ stacklevel=2,
+ )
+ continue
+
+ task = {
+ "tid": tid,
+ "ttid": task_type_id,
+ "did": int(task_["oml:did"]),
+ "name": task_["oml:name"],
+ "task_type": task_["oml:task_type"],
+ "status": task_["oml:status"],
+ }
+
+ # Other task inputs
+ for _input in task_.get("oml:input", []):
+ if _input["@name"] == "estimation_procedure":
+ task[_input["@name"]] = proc_dict[int(_input["#text"])]["name"]
+ else:
+ value = _input.get("#text")
+ task[_input["@name"]] = value
+
+ # The number of qualities can range from 0 to infinity
+ for quality in task_.get("oml:quality", []):
+ if "#text" not in quality:
+ quality_value = 0.0
+ else:
+ quality["#text"] = float(quality["#text"])
+ if abs(int(quality["#text"]) - quality["#text"]) < 0.0000001:
+ quality["#text"] = int(quality["#text"])
+ quality_value = quality["#text"]
+ task[quality["@name"]] = quality_value
+ tasks[tid] = task
+ except KeyError as e:
+ if tid is not None:
+ warnings.warn(
+ f"Invalid xml for task {tid}: {e}\nFrom {task_}",
+ RuntimeWarning,
+ stacklevel=2,
+ )
+ else:
+ warnings.warn(
+ f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2
+ )
+
+ return pd.DataFrame.from_dict(tasks, orient="index")
+
+
+def _create_task_from_json(task_json: dict) -> OpenMLTask:
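+    """Create a task from a v2 JSON payload.
+
+    Assumes ``task_json["input"]`` is a list of named input blocks that
+    mirrors the v1 XML structure.
+    """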
+ task_type_id = TaskType(int(task_json["task_type_id"]))
+
+ inputs = {i["name"]: i for i in task_json.get("input", [])}
+
+ source = inputs["source_data"]["data_set"]
+
+ common_kwargs = {
+ "task_id": int(task_json["id"]),
+ "task_type": task_json["task_type"],
+ "task_type_id": task_type_id,
+ "data_set_id": int(source["data_set_id"]),
+ "evaluation_measure": None,
+ }
+
+ if task_type_id in (
+ TaskType.SUPERVISED_CLASSIFICATION,
+ TaskType.SUPERVISED_REGRESSION,
+ TaskType.LEARNING_CURVE,
+ ):
+ est = inputs.get("estimation_procedure", {}).get("estimation_procedure")
+
+ if est:
+ common_kwargs["estimation_procedure_id"] = int(est["id"])
+ common_kwargs["estimation_procedure_type"] = est["type"]
+ common_kwargs["estimation_parameters"] = {
+ p["name"]: p.get("value") for p in est.get("parameter", [])
+ }
+
+ common_kwargs["target_name"] = source.get("target_feature")
+
+ cls = {
+ TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
+ TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
+ TaskType.CLUSTERING: OpenMLClusteringTask,
+ TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
+ }[task_type_id]
+
+ return cls(**common_kwargs) # type: ignore
class TaskV2API(ResourceV2API, TaskAPI):
- """Version 2 API implementation for task resources."""
+ def get(self, task_id: int) -> OpenMLTask:
+ """Download OpenML task for a given task ID.
+
+ Downloads the task representation.
+
+ Parameters
+ ----------
+ task_id : int
+ The OpenML task id of the task to download.
+
+ Returns
+ -------
+ task: OpenMLTask
+ """
+ response = self._http.get(f"tasks/{task_id}", enable_cache=True)
+ return _create_task_from_json(response.json())
+
+ def list(
+ self,
+ limit: int, # noqa: ARG002
+ offset: int, # noqa: ARG002
+ task_type: TaskType | int | None = None, # noqa: ARG002
+ **kwargs: Any, # noqa: ARG002
+ ) -> pd.DataFrame:
+ raise self._not_supported(method="list")
+
+ def supports_download_splits(self) -> bool:
+ return False
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index 22fb26f9b..0b8aaecf0 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -1,19 +1,14 @@
# License: BSD 3-Clause
from __future__ import annotations
-import os
-import re
import warnings
from functools import partial
-from typing import Any
+from typing import TYPE_CHECKING, Any
import pandas as pd
-import xmltodict
-import openml._api_calls
import openml.utils
from openml.datasets import get_dataset
-from openml.exceptions import OpenMLCacheException
from .task import (
OpenMLClassificationTask,
@@ -21,55 +16,16 @@
OpenMLLearningCurveTask,
OpenMLRegressionTask,
OpenMLSupervisedTask,
- OpenMLTask,
TaskType,
)
+if TYPE_CHECKING:
+ from .task import (
+ OpenMLTask,
+ )
TASKS_CACHE_DIR_NAME = "tasks"
-def _get_cached_tasks() -> dict[int, OpenMLTask]:
- """Return a dict of all the tasks which are cached locally.
-
- Returns
- -------
- tasks : OrderedDict
- A dict of all the cached tasks. Each task is an instance of
- OpenMLTask.
- """
- task_cache_dir = openml.utils._create_cache_directory(TASKS_CACHE_DIR_NAME)
- directory_content = os.listdir(task_cache_dir) # noqa: PTH208
- directory_content.sort()
-
- # Find all dataset ids for which we have downloaded the dataset
- # description
- tids = (int(did) for did in directory_content if re.match(r"[0-9]*", did))
- return {tid: _get_cached_task(tid) for tid in tids}
-
-
-def _get_cached_task(tid: int) -> OpenMLTask:
- """Return a cached task based on the given id.
-
- Parameters
- ----------
- tid : int
- Id of the task.
-
- Returns
- -------
- OpenMLTask
- """
- tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, tid)
-
- task_xml_path = tid_cache_dir / "task.xml"
- try:
- with task_xml_path.open(encoding="utf8") as fh:
- return _create_task_from_xml(fh.read())
- except OSError as e:
- openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir)
- raise OpenMLCacheException(f"Task file for tid {tid} not cached") from e
-
-
def _get_estimation_procedure_list() -> list[dict[str, Any]]:
"""Return a list of all estimation procedures which are on OpenML.
@@ -133,7 +89,7 @@ def list_tasks( # noqa: PLR0913
calculated for the associated dataset, some of these are also returned.
"""
listing_call = partial(
- _list_tasks,
+ openml._backend.task.list,
task_type=task_type,
tag=tag,
data_tag=data_tag,
@@ -152,151 +108,6 @@ def list_tasks( # noqa: PLR0913
return pd.concat(batches)
-def _list_tasks(
- limit: int,
- offset: int,
- task_type: TaskType | int | None = None,
- **kwargs: Any,
-) -> pd.DataFrame:
- """
- Perform the api call to return a number of tasks having the given filters.
-
- Parameters
- ----------
- Filter task_type is separated from the other filters because
- it is used as task_type in the task description, but it is named
- type when used as a filter in list tasks call.
- limit: int
- offset: int
- task_type : TaskType, optional
- Refers to the type of task.
- kwargs: dict, optional
- Legal filter operators: tag, task_id (list), data_tag, status, limit,
- offset, data_id, data_name, number_instances, number_features,
- number_classes, number_missing_values.
-
- Returns
- -------
- dataframe
- """
- api_call = "task/list"
- if limit is not None:
- api_call += f"/limit/{limit}"
- if offset is not None:
- api_call += f"/offset/{offset}"
- if task_type is not None:
- tvalue = task_type.value if isinstance(task_type, TaskType) else task_type
- api_call += f"/type/{tvalue}"
- if kwargs is not None:
- for operator, value in kwargs.items():
- if value is not None:
- if operator == "task_id":
- value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901
- api_call += f"/{operator}/{value}"
-
- return __list_tasks(api_call=api_call)
-
-
-def __list_tasks(api_call: str) -> pd.DataFrame: # noqa: C901, PLR0912
- """Returns a Pandas DataFrame with information about OpenML tasks.
-
- Parameters
- ----------
- api_call : str
- The API call specifying which tasks to return.
-
- Returns
- -------
- A Pandas DataFrame with information about OpenML tasks.
-
- Raises
- ------
- ValueError
- If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml',
- or has an incorrect value for '@xmlns:oml'.
- KeyError
- If an invalid key is found in the XML for a task.
- """
- xml_string = openml._api_calls._perform_api_call(api_call, "get")
- tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
- # Minimalistic check if the XML is useful
- if "oml:tasks" not in tasks_dict:
- raise ValueError(f'Error in return XML, does not contain "oml:runs": {tasks_dict}')
-
- if "@xmlns:oml" not in tasks_dict["oml:tasks"]:
- raise ValueError(
- f'Error in return XML, does not contain "oml:runs"/@xmlns:oml: {tasks_dict}'
- )
-
- if tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml":
- raise ValueError(
- "Error in return XML, value of "
- '"oml:runs"/@xmlns:oml is not '
- f'"http://openml.org/openml": {tasks_dict!s}',
- )
-
- assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"])
-
- tasks = {}
- procs = _get_estimation_procedure_list()
- proc_dict = {x["id"]: x for x in procs}
-
- for task_ in tasks_dict["oml:tasks"]["oml:task"]:
- tid = None
- try:
- tid = int(task_["oml:task_id"])
- task_type_int = int(task_["oml:task_type_id"])
- try:
- task_type_id = TaskType(task_type_int)
- except ValueError as e:
- warnings.warn(
- f"Could not create task type id for {task_type_int} due to error {e}",
- RuntimeWarning,
- stacklevel=2,
- )
- continue
-
- task = {
- "tid": tid,
- "ttid": task_type_id,
- "did": int(task_["oml:did"]),
- "name": task_["oml:name"],
- "task_type": task_["oml:task_type"],
- "status": task_["oml:status"],
- }
-
- # Other task inputs
- for _input in task_.get("oml:input", []):
- if _input["@name"] == "estimation_procedure":
- task[_input["@name"]] = proc_dict[int(_input["#text"])]["name"]
- else:
- value = _input.get("#text")
- task[_input["@name"]] = value
-
- # The number of qualities can range from 0 to infinity
- for quality in task_.get("oml:quality", []):
- if "#text" not in quality:
- quality_value = 0.0
- else:
- quality["#text"] = float(quality["#text"])
- if abs(int(quality["#text"]) - quality["#text"]) < 0.0000001:
- quality["#text"] = int(quality["#text"])
- quality_value = quality["#text"]
- task[quality["@name"]] = quality_value
- tasks[tid] = task
- except KeyError as e:
- if tid is not None:
- warnings.warn(
- f"Invalid xml for task {tid}: {e}\nFrom {task_}",
- RuntimeWarning,
- stacklevel=2,
- )
- else:
- warnings.warn(f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2)
-
- return pd.DataFrame.from_dict(tasks, orient="index")
-
-
def get_tasks(
task_ids: list[int],
download_data: bool | None = None,
@@ -304,7 +115,7 @@ def get_tasks(
) -> list[OpenMLTask]:
"""Download tasks.
- This function iterates :meth:`openml.tasks.get_task`.
+    This function iterates over :meth:`openml.tasks.get_task`.
Parameters
----------
@@ -338,7 +149,11 @@ def get_tasks(
tasks = []
for task_id in task_ids:
tasks.append(
- get_task(task_id, download_data=download_data, download_qualities=download_qualities)
+ get_task(
+ task_id,
+ download_data=download_data,
+ download_qualities=download_qualities,
+ )
)
return tasks
@@ -373,133 +188,27 @@ def get_task(
if not isinstance(task_id, int):
raise TypeError(f"Task id should be integer, is {type(task_id)}")
- task_cache_directory = openml.utils._create_cache_directory_for_id(
- TASKS_CACHE_DIR_NAME, task_id
- )
- task_cache_directory_existed = task_cache_directory.exists()
- try:
- task = _get_task_description(task_id)
- dataset = get_dataset(task.dataset_id, **get_dataset_kwargs)
- # List of class labels available in dataset description
- # Including class labels as part of task meta data handles
- # the case where data download was initially disabled
- if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
- assert task.target_name is not None, (
- "Supervised tasks must define a target feature before retrieving class labels."
- )
- task.class_labels = dataset.retrieve_class_labels(task.target_name)
- # Clustering tasks do not have class labels
- # and do not offer download_split
- if download_splits and isinstance(task, OpenMLSupervisedTask):
- task.download_split()
- except Exception as e:
- if not task_cache_directory_existed:
- openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, task_cache_directory)
- raise e
-
- return task
-
+ task = openml._backend.task.get(task_id)
+ dataset = get_dataset(task.dataset_id, **get_dataset_kwargs)
-def _get_task_description(task_id: int) -> OpenMLTask:
- try:
- return _get_cached_task(task_id)
- except OpenMLCacheException:
- _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)
- xml_file = _cache_dir / "task.xml"
- task_xml = openml._api_calls._perform_api_call(f"task/{task_id}", "get")
-
- with xml_file.open("w", encoding="utf8") as fh:
- fh.write(task_xml)
- return _create_task_from_xml(task_xml)
-
-
-def _create_task_from_xml(xml: str) -> OpenMLTask:
- """Create a task given a xml string.
-
- Parameters
- ----------
- xml : string
- Task xml representation.
-
- Returns
- -------
- OpenMLTask
- """
- dic = xmltodict.parse(xml)["oml:task"]
- estimation_parameters = {}
- inputs = {}
- # Due to the unordered structure we obtain, we first have to extract
- # the possible keys of oml:input; dic["oml:input"] is a list of
- # OrderedDicts
-
- # Check if there is a list of inputs
- if isinstance(dic["oml:input"], list):
- for input_ in dic["oml:input"]:
- name = input_["@name"]
- inputs[name] = input_
- # Single input case
- elif isinstance(dic["oml:input"], dict):
- name = dic["oml:input"]["@name"]
- inputs[name] = dic["oml:input"]
-
- evaluation_measures = None
- if "evaluation_measures" in inputs:
- evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][
- "oml:evaluation_measure"
- ]
-
- task_type = TaskType(int(dic["oml:task_type_id"]))
- common_kwargs = {
- "task_id": dic["oml:task_id"],
- "task_type": dic["oml:task_type"],
- "task_type_id": task_type,
- "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
- "evaluation_measure": evaluation_measures,
- }
- # TODO: add OpenMLClusteringTask?
- if task_type in (
- TaskType.SUPERVISED_CLASSIFICATION,
- TaskType.SUPERVISED_REGRESSION,
- TaskType.LEARNING_CURVE,
+ if (
+ isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask))
+ and task.target_name is not None
):
- # Convert some more parameters
- for parameter in inputs["estimation_procedure"]["oml:estimation_procedure"][
- "oml:parameter"
- ]:
- name = parameter["@name"]
- text = parameter.get("#text", "")
- estimation_parameters[name] = text
-
- common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][
- "oml:estimation_procedure"
- ]["oml:type"]
- common_kwargs["estimation_procedure_id"] = int(
- inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"]
- )
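+        # Including class labels as part of the task metadata handles the
+        # case where data download was initially disabled.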
+ task.class_labels = dataset.retrieve_class_labels(task.target_name)
- common_kwargs["estimation_parameters"] = estimation_parameters
- common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"]["oml:target_feature"]
- common_kwargs["data_splits_url"] = inputs["estimation_procedure"][
- "oml:estimation_procedure"
- ]["oml:data_splits_url"]
-
- cls = {
- TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
- TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
- TaskType.CLUSTERING: OpenMLClusteringTask,
- TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
- }.get(task_type)
- if cls is None:
- raise NotImplementedError(
- f"Task type '{common_kwargs['task_type']}' is not supported. "
- f"Supported task types: SUPERVISED_CLASSIFICATION,"
- f"SUPERVISED_REGRESSION, CLUSTERING, LEARNING_CURVE."
- f"Please check the OpenML documentation for available task types."
- )
- return cls(**common_kwargs) # type: ignore
+ if download_splits and isinstance(task, OpenMLSupervisedTask):
+ if openml._backend.task.supports_download_splits():
+ task.download_split()
+ else:
+ warnings.warn(
+ "`download_splits` is not yet supported in the v2 API and will be ignored.",
+ stacklevel=2,
+ )
+
+ return task
-# TODO(eddiebergman): overload on `task_type`
def create_task(
task_type: TaskType,
dataset_id: int,
@@ -587,4 +296,4 @@ def delete_task(task_id: int) -> bool:
bool
True if the deletion was successful. False otherwise.
"""
- return openml.utils._delete_entity("task", task_id)
+ return openml._backend.task.delete(task_id)
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
index ab3cb3da4..a709fdb45 100644
--- a/openml/tasks/task.py
+++ b/openml/tasks/task.py
@@ -241,6 +241,48 @@ def _parse_publish_response(self, xml_response: dict) -> None:
"""Parse the id from the xml_response and assign it to self."""
self.task_id = int(xml_response["oml:upload_task"]["oml:id"])
+ def publish(self) -> OpenMLTask:
+ """Publish this task to OpenML server.
+
+ Returns
+ -------
+ self : OpenMLTask
+ """
+ file_elements = self._get_file_elements()
+ if "description" not in file_elements:
+ file_elements["description"] = self._to_xml()
+ task_id = openml._backend.task.publish(path="task", files=file_elements)
+ self.task_id = task_id
+ return self
+
+ def push_tag(self, tag: str) -> None:
+ """Annotates this task with a tag on the server.
+
+ Parameters
+ ----------
+ tag : str
+ Tag to attach to the task.
+ """
+ if self.task_id is None:
+ raise openml.exceptions.ObjectNotPublishedError(
+ "Please publish the task first before being able to tag it."
+ )
+ openml._backend.task.tag(self.task_id, tag)
+
+ def remove_tag(self, tag: str) -> None:
+ """Removes a tag from this task on the server.
+
+ Parameters
+ ----------
+ tag : str
+ Tag to remove from the task.
+ """
+ if self.task_id is None:
+ raise openml.exceptions.ObjectNotPublishedError(
+ "Please publish the task first before being able to untag it."
+ )
+ openml._backend.task.untag(self.task_id, tag)
+
class OpenMLSupervisedTask(OpenMLTask, ABC):
"""OpenML Supervised Classification object.
diff --git a/tests/conftest.py b/tests/conftest.py
index 1359e6247..0a663af15 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -205,7 +205,7 @@ def _expected_static_cache_state(root_dir: Path) -> list[Path]:
_c_root_dir = root_dir / "org" / "openml" / "test"
res_paths = [root_dir, _c_root_dir]
- for _d in ["datasets", "tasks", "runs"]:
+ for _d in ["datasets", "runs"]:
res_paths.append(_c_root_dir / _d)
for _id in ["-1", "2"]:
@@ -222,21 +222,21 @@ def _expected_static_cache_state(root_dir: Path) -> list[Path]:
res_paths.append(_c_root_dir / "datasets" / "30" / "dataset_30.pq")
res_paths.append(_c_root_dir / "runs" / "1" / "description.xml")
- for _id in ["1", "3", "1882"]:
- tmp_p = _c_root_dir / "tasks" / _id
- res_paths.extend(
- [
- tmp_p / "datasplits.arff",
- tmp_p / "task.xml",
- ]
- )
-
res_paths.extend([
_c_root_dir / "api" / "v1" / "xml" / "setup",
_c_root_dir / "api" / "v1" / "xml" / "setup" / "1",
_c_root_dir / "api" / "v1" / "xml" / "setup" / "1" / "body.xml",
])
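+    # New-style HTTP cache layout: task XML bodies live under
+    # api/v1/xml/task/<id>/ and split files under api_splits/get/<id>/<name>/.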
+ res_paths.extend([
+ _c_root_dir / "api_splits" / "get" / "1882" / "Task_1882_splits.arff" / "body.arff",
+ _c_root_dir / "api_splits" / "get" / "3" / "Task_3_splits.arff" / "body.arff",
+ _c_root_dir / "api_splits" / "get" / "1" / "Task_1_splits.arff" / "body.arff",
+ _c_root_dir / "api" / "v1" / "xml" / "task" / "1882" / "body.xml",
+ _c_root_dir / "api" / "v1" / "xml" / "task" / "3" / "body.xml",
+ _c_root_dir / "api" / "v1" / "xml" / "task" / "1" / "body.xml",
+ ])
+
return res_paths
@@ -324,8 +324,8 @@ def with_test_cache(test_files_directory, request):
openml.config.set_root_cache_directory(_root_cache_directory)
if tmp_cache.exists():
shutil.rmtree(tmp_cache)
-
+
@pytest.fixture
def static_cache_dir():
return Path(__file__).parent / "files"
diff --git a/tests/files/org/openml/test/tasks/1/task.xml b/tests/files/org/openml/test/api/v1/xml/task/1/body.xml
similarity index 100%
rename from tests/files/org/openml/test/tasks/1/task.xml
rename to tests/files/org/openml/test/api/v1/xml/task/1/body.xml
diff --git a/tests/files/org/openml/test/tasks/1882/task.xml b/tests/files/org/openml/test/api/v1/xml/task/1882/body.xml
similarity index 100%
rename from tests/files/org/openml/test/tasks/1882/task.xml
rename to tests/files/org/openml/test/api/v1/xml/task/1882/body.xml
diff --git a/tests/files/org/openml/test/tasks/3/task.xml b/tests/files/org/openml/test/api/v1/xml/task/3/body.xml
similarity index 100%
rename from tests/files/org/openml/test/tasks/3/task.xml
rename to tests/files/org/openml/test/api/v1/xml/task/3/body.xml
diff --git a/tests/files/org/openml/test/tasks/1/datasplits.arff b/tests/files/org/openml/test/api_splits/get/1/Task_1_splits.arff/body.arff
similarity index 100%
rename from tests/files/org/openml/test/tasks/1/datasplits.arff
rename to tests/files/org/openml/test/api_splits/get/1/Task_1_splits.arff/body.arff
diff --git a/tests/files/org/openml/test/tasks/1882/datasplits.arff b/tests/files/org/openml/test/api_splits/get/1882/Task_1882_splits.arff/body.arff
similarity index 100%
rename from tests/files/org/openml/test/tasks/1882/datasplits.arff
rename to tests/files/org/openml/test/api_splits/get/1882/Task_1882_splits.arff/body.arff
diff --git a/tests/files/org/openml/test/tasks/3/datasplits.arff b/tests/files/org/openml/test/api_splits/get/3/Task_3_splits.arff/body.arff
similarity index 100%
rename from tests/files/org/openml/test/tasks/3/datasplits.arff
rename to tests/files/org/openml/test/api_splits/get/3/Task_3_splits.arff/body.arff
diff --git a/tests/test_api/test_task.py b/tests/test_api/test_task.py
new file mode 100644
index 000000000..6cad784b3
--- /dev/null
+++ b/tests/test_api/test_task.py
@@ -0,0 +1,191 @@
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+from requests import Response, Session
+
+import openml
+from openml._api.resources.task import TaskV1API, TaskV2API
+from openml.exceptions import OpenMLNotSupportedError
+
+
+@pytest.fixture
+def task_v1(http_client_v1, minio_client) -> TaskV1API:
+ return TaskV1API(http=http_client_v1, minio=minio_client)
+
+
+@pytest.fixture
+def task_v2(http_client_v2, minio_client) -> TaskV2API:
+ return TaskV2API(http=http_client_v2, minio=minio_client)
+
+
+@pytest.mark.uses_test_server()
+def test_v1_list_tasks(task_v1):
+ """Verify V1 list endpoint returns a populated DataFrame."""
+ tasks_df = task_v1.list(limit=5, offset=0)
+ assert isinstance(tasks_df, pd.DataFrame)
+ assert not tasks_df.empty
+ assert "tid" in tasks_df.columns
+
+
+@pytest.mark.uses_test_server()
+def test_v1_get(task_v1):
+ """Verify V1 get endpoint returns a task."""
+ task = task_v1.get(1)
+ assert task is not None
+ assert task.task_id == 1
+
+
+@pytest.mark.uses_test_server()
+def test_v2_list_tasks(task_v2):
+ """Verify V2 list endpoint raises NotSupported."""
+ with pytest.raises(OpenMLNotSupportedError):
+ task_v2.list(limit=5, offset=0)
+
+
+@pytest.mark.uses_test_server()
+def test_v2_get(task_v2):
+ """Verify V2 get endpoint returns a task."""
+ task = task_v2.get(1)
+ assert task is not None
+ assert task.task_id == 1
+
+
+def test_v1_publish(task_v1):
+ resource_name = task_v1.resource_type.value
+ resource_files = {"description": "Resource Description File"}
+ resource_id = 123
+
+ with patch.object(Session, "request") as mock_request:
+ mock_request.return_value = Response()
+ mock_request.return_value.status_code = 200
+        # Mocked v1 publish response; the shape matches what
+        # _parse_publish_response reads (oml:upload_task / oml:id).
+        mock_request.return_value._content = (
+            f'<oml:upload_task xmlns:oml="http://openml.org/openml">\n'
+            f"\t<oml:id>{resource_id}</oml:id>\n"
+            f"</oml:upload_task>\n"
+        ).encode("utf-8")
+
+ published_resource_id = task_v1.publish(
+ resource_name,
+ files=resource_files,
+ )
+
+ assert resource_id == published_resource_id
+
+ mock_request.assert_called_once_with(
+ method="POST",
+ url=openml.config.server + resource_name,
+ params={},
+ data={"api_key": openml.config.apikey},
+ headers=openml.config._HEADERS,
+ files=resource_files,
+ )
+
+
+def test_v1_delete(task_v1):
+ resource_name = task_v1.resource_type.value
+ resource_id = 123
+
+ with patch.object(Session, "request") as mock_request:
+ mock_request.return_value = Response()
+ mock_request.return_value.status_code = 200
+        # Mocked v1 delete response; element names follow the v1
+        # task_delete convention (assumed shape).
+        mock_request.return_value._content = (
+            f'<oml:task_delete xmlns:oml="http://openml.org/openml">\n'
+            f" <oml:id>{resource_id}</oml:id>\n"
+            f"</oml:task_delete>\n"
+        ).encode("utf-8")
+
+ task_v1.delete(resource_id)
+
+ mock_request.assert_called_once_with(
+ method="DELETE",
+ url=(openml.config.server + resource_name + "/" + str(resource_id)),
+ params={"api_key": openml.config.apikey},
+ data={},
+ headers=openml.config._HEADERS,
+ files=None,
+ )
+
+
+def test_v1_tag(task_v1):
+ resource_id = 123
+ resource_tag = "TAG"
+
+ with patch.object(Session, "request") as mock_request:
+ mock_request.return_value = Response()
+ mock_request.return_value.status_code = 200
+        # Mocked v1 tag response; element names follow the v1 task_tag
+        # convention (assumed shape) and echo the tag back.
+        mock_request.return_value._content = (
+            f'<oml:task_tag xmlns:oml="http://openml.org/openml">'
+            f"<oml:id>{resource_id}</oml:id>"
+            f"<oml:tag>{resource_tag}</oml:tag>"
+            f"</oml:task_tag>"
+        ).encode("utf-8")
+
+ tags = task_v1.tag(resource_id, resource_tag)
+
+ assert resource_tag in tags
+
+ mock_request.assert_called_once_with(
+ method="POST",
+ url=(openml.config.server + task_v1.resource_type.value + "/tag"),
+ params={},
+ data={
+ "api_key": openml.config.apikey,
+ "task_id": resource_id,
+ "tag": resource_tag,
+ },
+ headers=openml.config._HEADERS,
+ files=None,
+ )
+
+
+def test_v1_untag(task_v1):
+ resource_id = 123
+ resource_tag = "TAG"
+
+ with patch.object(Session, "request") as mock_request:
+ mock_request.return_value = Response()
+ mock_request.return_value.status_code = 200
+        # Mocked v1 untag response; no oml:tag elements remain after
+        # untagging (assumed shape).
+        mock_request.return_value._content = (
+            f'<oml:task_untag xmlns:oml="http://openml.org/openml">'
+            f"<oml:id>{resource_id}</oml:id>"
+            f"</oml:task_untag>"
+        ).encode("utf-8")
+
+ tags = task_v1.untag(resource_id, resource_tag)
+
+ assert resource_tag not in tags
+
+ mock_request.assert_called_once_with(
+ method="POST",
+ url=(openml.config.server + task_v1.resource_type.value + "/untag"),
+ params={},
+ data={
+ "api_key": openml.config.apikey,
+ "task_id": resource_id,
+ "tag": resource_tag,
+ },
+ headers=openml.config._HEADERS,
+ files=None,
+ )
+
+
+def test_v2_publish(task_v2):
+ with pytest.raises(OpenMLNotSupportedError):
+ task_v2.publish(path=None, files=None)
+
+
+def test_v2_delete(task_v2):
+ with pytest.raises(OpenMLNotSupportedError):
+ task_v2.delete(resource_id=None)
+
+
+def test_v2_tag(task_v2):
+ with pytest.raises(OpenMLNotSupportedError):
+ task_v2.tag(resource_id=None, tag=None)
+
+
+def test_v2_untag(task_v2):
+ with pytest.raises(OpenMLNotSupportedError):
+ task_v2.untag(resource_id=None, tag=None)
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 80b0b4215..f885198f1 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -289,7 +289,9 @@ def test_get_dataset_cannot_access_private_data(self):
@pytest.mark.skip("Need to find dataset name of private dataset")
def test_dataset_by_name_cannot_access_private_data(self):
self.use_production_server()
- self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE")
+ self.assertRaises(
+ OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE"
+ )
@pytest.mark.test_server()
def test_get_dataset_lazy_all_functions(self):
@@ -299,7 +301,9 @@ def test_get_dataset_lazy_all_functions(self):
def ensure_absence_of_real_data():
assert not os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")
+ os.path.join(
+ openml.config.get_cache_directory(), "datasets", "1", "dataset.arff"
+ )
)
tag = "test_lazy_tag_%d" % random.randint(1, 1000000)
@@ -404,7 +408,6 @@ def test__download_minio_file_works_with_bucket_subdirectory(self):
file_destination
), "_download_minio_file can download from subdirectories"
-
@mock.patch("openml._api_calls._download_minio_file")
@pytest.mark.test_server()
def test__get_dataset_parquet_is_cached(self, patch):
@@ -524,13 +527,29 @@ def test_deletion_of_cache_dir(self):
@pytest.mark.test_server()
def test_deletion_of_cache_dir_faulty_download(self, patch):
patch.side_effect = Exception("Boom!")
- self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1)
- datasets_cache_dir = os.path.join(openml.config.get_cache_directory(), "datasets")
+ self.assertRaisesRegex(
+ Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1
+ )
+ datasets_cache_dir = os.path.join(
+ openml.config.get_cache_directory(), "datasets"
+ )
assert len(os.listdir(datasets_cache_dir)) == 0
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_publish_dataset(self):
- arff_file_path = self.static_cache_dir / "org" / "openml" / "test" / "datasets" / "2" / "dataset.arff"
+ arff_file_path = (
+ self.static_cache_dir
+ / "org"
+ / "openml"
+ / "test"
+ / "datasets"
+ / "2"
+ / "dataset.arff"
+ )
dataset = OpenMLDataset(
"anneal",
"test",
@@ -561,7 +580,9 @@ def test__retrieve_class_labels(self):
# Test workaround for string-typed class labels
custom_ds = openml.datasets.get_dataset(2)
custom_ds.features[31].data_type = "string"
- labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name)
+ labels = custom_ds.retrieve_class_labels(
+ target_name=custom_ds.features[31].name
+ )
assert labels == ["COIL", "SHEET"]
@pytest.mark.test_server()
@@ -682,11 +703,16 @@ def test_attributes_arff_from_df_unknown_dtype(self):
for arr, dt in zip(data, dtype):
df = pd.DataFrame(arr)
err_msg = (
- f"The dtype '{dt}' of the column '0' is not currently " "supported by liac-arff"
+ f"The dtype '{dt}' of the column '0' is not currently "
+ "supported by liac-arff"
)
with pytest.raises(ValueError, match=err_msg):
attributes_arff_from_df(df)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_create_dataset_numpy(self):
data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T
@@ -719,8 +745,14 @@ def test_create_dataset_numpy(self):
assert (
_get_online_dataset_arff(dataset.id) == dataset._dataset
), "Uploaded arff does not match original one"
- assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
+ assert (
+ _get_online_dataset_format(dataset.id) == "arff"
+ ), "Wrong format for dataset"
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_create_dataset_list(self):
data = [
@@ -774,8 +806,14 @@ def test_create_dataset_list(self):
assert (
_get_online_dataset_arff(dataset.id) == dataset._dataset
), "Uploaded ARFF does not match original one"
- assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
+ assert (
+ _get_online_dataset_format(dataset.id) == "arff"
+ ), "Wrong format for dataset"
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_create_dataset_sparse(self):
# test the scipy.sparse.coo_matrix
@@ -924,6 +962,10 @@ def test_get_online_dataset_format(self):
dataset_id
), "The format of the ARFF files is different"
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_create_dataset_pandas(self):
data = [
@@ -991,7 +1033,9 @@ def test_create_dataset_pandas(self):
column_names = ["input1", "input2", "y"]
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
# meta-information
- description = "Synthetic dataset created from a Pandas DataFrame with Sparse columns"
+ description = (
+ "Synthetic dataset created from a Pandas DataFrame with Sparse columns"
+ )
dataset = openml.datasets.functions.create_dataset(
name=name,
description=description,
@@ -1016,7 +1060,9 @@ def test_create_dataset_pandas(self):
assert (
_get_online_dataset_arff(dataset.id) == dataset._dataset
), "Uploaded ARFF does not match original one"
- assert _get_online_dataset_format(dataset.id) == "sparse_arff", "Wrong format for dataset"
+ assert (
+ _get_online_dataset_format(dataset.id) == "sparse_arff"
+ ), "Wrong format for dataset"
# Check that we can overwrite the attributes
data = [["a"], ["b"], ["c"], ["d"], ["e"]]
@@ -1046,7 +1092,9 @@ def test_create_dataset_pandas(self):
TestBase._mark_entity_for_removal("data", dataset.id)
TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {dataset.id}")
downloaded_data = _get_online_dataset_arff(dataset.id)
- assert downloaded_data == dataset._dataset, "Uploaded ARFF does not match original one"
+ assert (
+ downloaded_data == dataset._dataset
+ ), "Uploaded ARFF does not match original one"
assert "@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data
def test_ignore_attributes_dataset(self):
@@ -1149,6 +1197,10 @@ def test_ignore_attributes_dataset(self):
paper_url=paper_url,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_publish_fetch_ignore_attribute(self):
"""Test to upload and retrieve dataset and check ignore_attributes"""
@@ -1268,6 +1320,10 @@ def test_create_dataset_row_id_attribute_error(self):
paper_url=paper_url,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_create_dataset_row_id_attribute_inference(self):
# meta-information
@@ -1396,7 +1452,9 @@ def test_get_dataset_cache_format_feather(self):
cache_dir = openml.config.get_cache_directory()
cache_dir_for_id = os.path.join(cache_dir, "datasets", "128")
feather_file = os.path.join(cache_dir_for_id, "dataset.feather")
- pickle_file = os.path.join(cache_dir_for_id, "dataset.feather.attributes.pkl.py3")
+ pickle_file = os.path.join(
+ cache_dir_for_id, "dataset.feather.attributes.pkl.py3"
+ )
data = pd.read_feather(feather_file)
assert os.path.isfile(feather_file), "Feather file is missing"
assert os.path.isfile(pickle_file), "Attributes pickle file is missing"
@@ -1436,6 +1494,10 @@ def test_data_edit_non_critical_field(self):
edited_dataset = openml.datasets.get_dataset(did)
assert edited_dataset.description == desc
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_data_edit_critical_field(self):
# Case 2
@@ -1443,7 +1505,9 @@ def test_data_edit_critical_field(self):
# for this, we need to first clone a dataset to do changes
did = fork_dataset(1)
self._wait_for_dataset_being_processed(did)
- result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil")
+ result = edit_dataset(
+ did, default_target_attribute="shape", ignore_attribute="oil"
+ )
assert did == result
n_tries = 10
@@ -1451,7 +1515,9 @@ def test_data_edit_critical_field(self):
for i in range(n_tries):
edited_dataset = openml.datasets.get_dataset(did)
try:
- assert edited_dataset.default_target_attribute == "shape", edited_dataset
+ assert (
+ edited_dataset.default_target_attribute == "shape"
+ ), edited_dataset
assert edited_dataset.ignore_attribute == ["oil"], edited_dataset
break
except AssertionError as e:
@@ -1459,9 +1525,11 @@ def test_data_edit_critical_field(self):
raise e
time.sleep(10)
# Delete the cache dir to get the newer version of the dataset
-
+
shutil.rmtree(
- os.path.join(openml.config.get_cache_directory(), "datasets", str(did)),
+ os.path.join(
+ openml.config.get_cache_directory(), "datasets", str(did)
+ ),
)
@pytest.mark.test_server()
@@ -1488,6 +1556,10 @@ def test_data_edit_requires_valid_dataset(self):
description="xor operation dataset",
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.test_server()
def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self):
# Need to own a dataset to be able to edit meta-data
@@ -1540,7 +1612,6 @@ def test_data_fork(self):
data_id=999999,
)
-
@pytest.mark.production_server()
def test_list_datasets_with_high_size_parameter(self):
# Testing on prod since concurrent deletion of uploded datasets make the test fail
@@ -1626,7 +1697,9 @@ def test_invalid_attribute_validations(
(None, None, ["outlook", "windy"]),
],
)
-def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute):
+def test_valid_attribute_validations(
+ default_target_attribute, row_id_attribute, ignore_attribute
+):
data = [
["a", "sunny", 85.0, 85.0, "FALSE", "no"],
["b", "sunny", 80.0, 90.0, "TRUE", "no"],
@@ -1726,7 +1799,10 @@ def test_delete_dataset(self):
@mock.patch.object(requests.Session, "delete")
def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
content_file = (
- test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml"
+ test_files_directory
+ / "mock_responses"
+ / "datasets"
+ / "data_delete_not_owned.xml"
)
mock_delete.return_value = create_request_response(
status_code=412,
@@ -1747,7 +1823,10 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_server
@mock.patch.object(requests.Session, "delete")
def test_delete_dataset_with_run(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
content_file = (
- test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml"
+ test_files_directory
+ / "mock_responses"
+ / "datasets"
+ / "data_delete_has_tasks.xml"
)
mock_delete.return_value = create_request_response(
status_code=412,
@@ -1768,7 +1847,10 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_server_
@mock.patch.object(requests.Session, "delete")
def test_delete_dataset_success(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
content_file = (
- test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml"
+ test_files_directory
+ / "mock_responses"
+ / "datasets"
+ / "data_delete_successful.xml"
)
mock_delete.return_value = create_request_response(
status_code=200,
@@ -1786,7 +1868,10 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_server_v
@mock.patch.object(requests.Session, "delete")
def test_delete_unknown_dataset(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
content_file = (
- test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml"
+ test_files_directory
+ / "mock_responses"
+ / "datasets"
+ / "data_delete_not_exist.xml"
)
mock_delete.return_value = create_request_response(
status_code=412,
@@ -1956,9 +2041,15 @@ def test_get_dataset_lazy_behavior(
with_features=with_features,
with_data=with_data,
)
- assert dataset.features, "Features should be downloaded on-demand if not during get_dataset"
- assert dataset.qualities, "Qualities should be downloaded on-demand if not during get_dataset"
- assert dataset.get_data(), "Data should be downloaded on-demand if not during get_dataset"
+ assert (
+ dataset.features
+ ), "Features should be downloaded on-demand if not during get_dataset"
+ assert (
+ dataset.qualities
+ ), "Qualities should be downloaded on-demand if not during get_dataset"
+ assert (
+ dataset.get_data()
+ ), "Data should be downloaded on-demand if not during get_dataset"
_assert_datasets_retrieved_successfully(
[1], with_qualities=True, with_features=True, with_data=True
)
@@ -1977,7 +2068,9 @@ def test__get_dataset_parquet_not_cached():
"oml:parquet_url": "http://data.openml.org/dataset20/dataset_20.pq",
"oml:id": "20",
}
- path = _get_dataset_parquet(description, cache_directory=Path(openml.config.get_cache_directory()))
+ path = _get_dataset_parquet(
+ description, cache_directory=Path(openml.config.get_cache_directory())
+ )
assert isinstance(path, Path), "_get_dataset_parquet returns a path"
assert path.is_file(), "_get_dataset_parquet returns path to real file"
@@ -1986,7 +2079,10 @@ def test_read_features_from_xml_with_whitespace() -> None:
from openml.datasets.dataset import _read_features
features_file = (
- Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml"
+ Path(__file__).parent.parent
+ / "files"
+ / "misc"
+ / "features_with_whitespaces.xml"
)
dict = _read_features(features_file)
assert dict[1].nominal_values == [" - 50000.", " 50000+."]
@@ -1997,7 +2093,7 @@ def test_get_dataset_parquet(requests_mock, test_files_directory, test_server_v1
# Parquet functionality is disabled on the test server
# There is no parquet-copy of the test server yet.
content_file = (
- test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
+ test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
)
# While the mocked example is from production, unit tests by default connect to the test server.
requests_mock.get(test_server_v1 + "data/61", text=content_file.read_text())
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
index 4e391fd3b..108a05c3f 100644
--- a/tests/test_flows/test_flow.py
+++ b/tests/test_flows/test_flow.py
@@ -4,7 +4,7 @@
import collections
import copy
import hashlib
+import os
import re
-import os
import time
@@ -162,12 +163,16 @@ def test_from_xml_to_xml(self):
def test_to_xml_from_xml(self):
scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ "base_estimator"
+ if Version(sklearn.__version__) < Version("1.4")
+ else "estimator"
)
boosting = sklearn.ensemble.AdaBoostClassifier(
**{estimator_name: sklearn.tree.DecisionTreeClassifier()},
)
- model = sklearn.pipeline.Pipeline(steps=(("scaler", scaler), ("boosting", boosting)))
+ model = sklearn.pipeline.Pipeline(
+ steps=(("scaler", scaler), ("boosting", boosting))
+ )
flow = self.extension.model_to_flow(model)
flow.flow_id = -234
# end of setup
@@ -180,6 +185,10 @@ def test_to_xml_from_xml(self):
openml.flows.functions.assert_flows_equal(new_flow, flow)
assert new_flow is not flow
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_publish_flow(self):
@@ -204,7 +213,9 @@ def test_publish_flow(self):
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
assert isinstance(flow.flow_id, int)
@pytest.mark.sklearn()
@@ -214,7 +225,9 @@ def test_publish_existing_flow(self, flow_exists_mock):
flow = self.extension.model_to_flow(clf)
flow_exists_mock.return_value = 1
- with pytest.raises(openml.exceptions.PyOpenMLError, match="OpenMLFlow already exists"):
+ with pytest.raises(
+ openml.exceptions.PyOpenMLError, match="OpenMLFlow already exists"
+ ):
flow.publish(raise_error_if_exists=True)
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
@@ -222,6 +235,10 @@ def test_publish_existing_flow(self, flow_exists_mock):
f"collected from {__file__.split('/')[-1]}: {flow.flow_id}",
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_publish_flow_with_similar_components(self):
@@ -232,7 +249,9 @@ def test_publish_flow_with_similar_components(self):
flow, _ = self._add_sentinel_to_flow_name(flow, None)
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
# For a flow where both components are published together, the upload
# date should be equal
assert flow.upload_date == flow.components["lr"].upload_date, (
@@ -247,7 +266,9 @@ def test_publish_flow_with_similar_components(self):
flow1, sentinel = self._add_sentinel_to_flow_name(flow1, None)
flow1.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow1.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow1.flow_id}"
+ )
# In order to assign different upload times to the flows!
time.sleep(1)
@@ -259,20 +280,30 @@ def test_publish_flow_with_similar_components(self):
flow2, _ = self._add_sentinel_to_flow_name(flow2, sentinel)
flow2.publish()
TestBase._mark_entity_for_removal("flow", flow2.flow_id, flow2.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow2.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow2.flow_id}"
+ )
# If one component was published before the other, the components in
# the flow should have different upload dates
assert flow2.upload_date != flow2.components["dt"].upload_date
- clf3 = sklearn.ensemble.AdaBoostClassifier(sklearn.tree.DecisionTreeClassifier(max_depth=3))
+ clf3 = sklearn.ensemble.AdaBoostClassifier(
+ sklearn.tree.DecisionTreeClassifier(max_depth=3)
+ )
flow3 = self.extension.model_to_flow(clf3)
flow3, _ = self._add_sentinel_to_flow_name(flow3, sentinel)
# Child flow has different parameter. Check for storing the flow
# correctly on the server should thus not check the child's parameters!
flow3.publish()
TestBase._mark_entity_for_removal("flow", flow3.flow_id, flow3.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}"
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_semi_legal_flow(self):
@@ -280,7 +311,9 @@ def test_semi_legal_flow(self):
# should not throw error as it contains two differentiable forms of
# Bagging i.e., Bagging(Bagging(J48)) and Bagging(J48)
estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ "base_estimator"
+ if Version(sklearn.__version__) < Version("1.4")
+ else "estimator"
)
semi_legal = sklearn.ensemble.BaggingClassifier(
**{
@@ -296,7 +329,9 @@ def test_semi_legal_flow(self):
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
@pytest.mark.sklearn()
@mock.patch("openml.flows.functions.get_flow")
@@ -383,13 +418,21 @@ def get_sentinel():
flow_id = openml.flows.flow_exists(name, version)
assert not flow_id
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_existing_flow_exists(self):
# create a flow
nb = sklearn.naive_bayes.GaussianNB()
- sparse = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output"
+ sparse = (
+ "sparse"
+ if Version(sklearn.__version__) < Version("1.4")
+ else "sparse_output"
+ )
ohe_params = {sparse: False, "handle_unknown": "ignore"}
if Version(sklearn.__version__) >= Version("0.20"):
ohe_params["categories"] = "auto"
@@ -424,6 +467,10 @@ def test_existing_flow_exists(self):
)
assert downloaded_flow_id == flow.flow_id
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_sklearn_to_upload_to_flow(self):
@@ -444,13 +491,20 @@ def test_sklearn_to_upload_to_flow(self):
)
fu = sklearn.pipeline.FeatureUnion(transformer_list=[("pca", pca), ("fs", fs)])
estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ "base_estimator"
+ if Version(sklearn.__version__) < Version("1.4")
+ else "estimator"
)
boosting = sklearn.ensemble.AdaBoostClassifier(
**{estimator_name: sklearn.tree.DecisionTreeClassifier()},
)
model = sklearn.pipeline.Pipeline(
- steps=[("ohe", ohe), ("scaler", scaler), ("fu", fu), ("boosting", boosting)],
+ steps=[
+ ("ohe", ohe),
+ ("scaler", scaler),
+ ("fu", fu),
+ ("boosting", boosting),
+ ],
)
parameter_grid = {
"boosting__n_estimators": [1, 5, 10, 100],
@@ -477,7 +531,9 @@ def test_sklearn_to_upload_to_flow(self):
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
assert isinstance(flow.flow_id, int)
# Check whether we can load the flow again
@@ -560,7 +616,10 @@ def test_extract_tags(self):
tags = openml.utils.extract_xml_tags("oml:tag", flow_dict)
assert tags == ["study_14"]
- flow_xml = "OpenmlWeka\n" "weka"
+ flow_xml = (
+ "OpenmlWeka\n"
+ "weka"
+ )
flow_dict = xmltodict.parse(flow_xml)
tags = openml.utils.extract_xml_tags("oml:tag", flow_dict["oml:flow"])
assert tags == ["OpenmlWeka", "weka"]
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index 7a1331c45..f0709bb45 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -6,7 +6,7 @@
import unittest
from collections import OrderedDict
from multiprocessing.managers import Value
-
+import os
from openml_sklearn import SklearnExtension
from packaging.version import Version
from unittest import mock
@@ -153,7 +153,9 @@ def test_are_flows_equal(self):
openml.flows.functions.assert_flows_equal(flow, flow)
new_flow = copy.deepcopy(flow)
new_flow.parameters["abc"] = 3.0
- self.assertRaises(ValueError, openml.flows.functions.assert_flows_equal, flow, new_flow)
+ self.assertRaises(
+ ValueError, openml.flows.functions.assert_flows_equal, flow, new_flow
+ )
# Now test for components (subflows)
parent_flow = copy.deepcopy(flow)
@@ -195,24 +197,28 @@ def test_are_flows_equal_ignore_parameter_values(self):
)
openml.flows.functions.assert_flows_equal(flow, flow)
- openml.flows.functions.assert_flows_equal(flow, flow, ignore_parameter_values=True)
+ openml.flows.functions.assert_flows_equal(
+ flow, flow, ignore_parameter_values=True
+ )
new_flow = copy.deepcopy(flow)
new_flow.parameters["a"] = 7
with pytest.raises(ValueError) as excinfo:
openml.flows.functions.assert_flows_equal(flow, new_flow)
- assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(
- excinfo.value
- )
+ assert str(paramaters) in str(excinfo.value) and str(
+ new_flow.parameters
+ ) in str(excinfo.value)
- openml.flows.functions.assert_flows_equal(flow, new_flow, ignore_parameter_values=True)
+ openml.flows.functions.assert_flows_equal(
+ flow, new_flow, ignore_parameter_values=True
+ )
del new_flow.parameters["a"]
with pytest.raises(ValueError) as excinfo:
openml.flows.functions.assert_flows_equal(flow, new_flow)
- assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(
- excinfo.value
- )
+ assert str(paramaters) in str(excinfo.value) and str(
+ new_flow.parameters
+ ) in str(excinfo.value)
self.assertRaisesRegex(
ValueError,
@@ -246,7 +252,9 @@ def test_are_flows_equal_ignore_if_older(self):
upload_date=flow_upload_date,
)
- assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=flow_upload_date)
+ assert_flows_equal(
+ flow, flow, ignore_parameter_values_on_older_children=flow_upload_date
+ )
assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None)
new_flow = copy.deepcopy(flow)
new_flow.parameters["a"] = 7
@@ -296,7 +304,9 @@ def test_sklearn_to_flow_list_of_lists(self):
self._add_sentinel_to_flow_name(flow)
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
# Test deserialization works
server_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True)
assert server_flow.parameters["categories"] == "[[0, 1], [0, 1]]"
@@ -310,6 +320,10 @@ def test_get_flow1(self):
flow = openml.flows.get_flow(1)
assert flow.external_version is None
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_get_flow_reinstantiate_model(self):
@@ -318,10 +332,14 @@ def test_get_flow_reinstantiate_model(self):
flow = extension.model_to_flow(model)
flow.publish(raise_error_if_exists=False)
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True)
- assert isinstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier)
+ assert isinstance(
+ downloaded_flow.model, sklearn.ensemble.RandomForestClassifier
+ )
@pytest.mark.test_server()
def test_get_flow_reinstantiate_model_no_extension(self):
@@ -340,7 +358,9 @@ def test_get_flow_reinstantiate_model_no_extension(self):
reason="Requires scikit-learn!=0.19.1, because target flow is from that version.",
)
@pytest.mark.production_server()
- def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(self):
+ def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(
+ self,
+ ):
self.use_production_server()
flow = 8175
expected = "Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied."
@@ -363,7 +383,9 @@ def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(
@pytest.mark.production_server()
def test_get_flow_reinstantiate_flow_not_strict_post_1(self):
self.use_production_server()
- flow = openml.flows.get_flow(flow_id=19190, reinstantiate=True, strict_version=False)
+ flow = openml.flows.get_flow(
+ flow_id=19190, reinstantiate=True, strict_version=False
+ )
assert flow.flow_id is None
assert "sklearn==1.0.0" not in flow.dependencies
@@ -377,7 +399,9 @@ def test_get_flow_reinstantiate_flow_not_strict_post_1(self):
@pytest.mark.production_server()
def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self):
self.use_production_server()
- flow = openml.flows.get_flow(flow_id=18587, reinstantiate=True, strict_version=False)
+ flow = openml.flows.get_flow(
+ flow_id=18587, reinstantiate=True, strict_version=False
+ )
assert flow.flow_id is None
assert "sklearn==0.23.1" not in flow.dependencies
@@ -389,10 +413,16 @@ def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self):
@pytest.mark.production_server()
def test_get_flow_reinstantiate_flow_not_strict_pre_023(self):
self.use_production_server()
- flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False)
+ flow = openml.flows.get_flow(
+ flow_id=8175, reinstantiate=True, strict_version=False
+ )
assert flow.flow_id is None
assert "sklearn==0.19.1" not in flow.dependencies
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_get_flow_id(self):
@@ -402,13 +432,19 @@ def test_get_flow_id(self):
list_all = functools.lru_cache()(openml.utils._list_all)
with patch("openml.utils._list_all", list_all):
clf = sklearn.tree.DecisionTreeClassifier()
- flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish()
+ flow = (
+ openml.extensions.get_extension_by_model(clf)
+ .model_to_flow(clf)
+ .publish()
+ )
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
TestBase.logger.info(
f"collected from {__file__.split('/')[-1]}: {flow.flow_id}",
)
- assert openml.flows.get_flow_id(model=clf, exact_version=True) == flow.flow_id
+ assert (
+ openml.flows.get_flow_id(model=clf, exact_version=True) == flow.flow_id
+ )
flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
assert flow.flow_id in flow_ids
assert len(flow_ids) > 0
@@ -424,9 +460,13 @@ def test_get_flow_id(self):
exact_version=False,
)
assert flow.flow_id in flow_ids_exact_version_True
- assert set(flow_ids_exact_version_True).issubset(set(flow_ids_exact_version_False))
+ assert set(flow_ids_exact_version_True).issubset(
+ set(flow_ids_exact_version_False)
+ )
# The assertion below was previously used instead of the assertion above.
- pytest.skip(reason="Not sure why there should only be one version of this flow.")
+ pytest.skip(
+ reason="Not sure why there should only be one version of this flow."
+ )
assert flow_ids_exact_version_True == flow_ids_exact_version_False
@pytest.mark.test_server()
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
index f2a81be9f..538fbe59f 100644
--- a/tests/test_openml/test_api_calls.py
+++ b/tests/test_openml/test_api_calls.py
@@ -20,6 +20,10 @@ def test_too_long_uri(self):
with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"):
openml.datasets.list_datasets(data_id=list(range(10000)))
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@unittest.mock.patch("time.sleep")
@unittest.mock.patch("requests.Session")
@pytest.mark.test_server()
@@ -33,11 +38,17 @@ def test_retry_on_database_error(self, Session_class_mock, _):
"Please wait for N seconds and try again.\n"
""
)
- Session_class_mock.return_value.__enter__.return_value.get.return_value = response_mock
- with pytest.raises(openml.exceptions.OpenMLServerException, match="/abc returned code 107"):
+ Session_class_mock.return_value.__enter__.return_value.get.return_value = (
+ response_mock
+ )
+ with pytest.raises(
+ openml.exceptions.OpenMLServerException, match="/abc returned code 107"
+ ):
openml._api_calls._send_request("get", "/abc", {})
- assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20
+ assert (
+ Session_class_mock.return_value.__enter__.return_value.get.call_count == 20
+ )
class FakeObject(NamedTuple):
@@ -124,5 +135,9 @@ def test_authentication_endpoints_requiring_api_key_show_relevant_help_link(
) -> None:
# We need to temporarily disable the API key to test the error message
with openml.config.overwrite_config_context({"apikey": None}):
- with pytest.raises(openml.exceptions.OpenMLAuthenticationError, match=API_TOKEN_HELP_LINK):
- openml._api_calls._perform_api_call(call=endpoint, request_method=method, data=None)
+ with pytest.raises(
+ openml.exceptions.OpenMLAuthenticationError, match=API_TOKEN_HELP_LINK
+ ):
+ openml._api_calls._perform_api_call(
+ call=endpoint, request_method=method, data=None
+ )
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index 22a8bc936..05e8ef1dd 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -48,7 +48,10 @@ def test_tagging(self):
def _test_prediction_data_equal(run, run_prime):
# Determine which attributes are numeric and which not
num_cols = np.array(
- [d_type == "NUMERIC" for _, d_type in run._generate_arff_dict()["attributes"]],
+ [
+ d_type == "NUMERIC"
+ for _, d_type in run._generate_arff_dict()["attributes"]
+ ],
)
# Get run data consistently
# (For run from server, .data_content does not exist)
@@ -66,7 +69,9 @@ def _test_prediction_data_equal(run, run_prime):
def _test_run_obj_equals(self, run, run_prime):
for dictionary in ["evaluations", "fold_evaluations", "sample_evaluations"]:
if getattr(run, dictionary) is not None:
- self.assertDictEqual(getattr(run, dictionary), getattr(run_prime, dictionary))
+ self.assertDictEqual(
+ getattr(run, dictionary), getattr(run_prime, dictionary)
+ )
else:
# should be none or empty
other = getattr(run_prime, dictionary)
@@ -76,7 +81,9 @@ def _test_run_obj_equals(self, run, run_prime):
self._test_prediction_data_equal(run, run_prime)
# Test trace
- run_trace_content = run.trace.trace_to_arff()["data"] if run.trace is not None else None
+ run_trace_content = (
+ run.trace.trace_to_arff()["data"] if run.trace is not None else None
+ )
if run_prime.trace is not None:
run_prime_trace_content = run_prime.trace.trace_to_arff()["data"]
@@ -118,6 +125,10 @@ def _check_array(array, type_):
else:
assert run_prime_trace_content is None
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_to_from_filesystem_vanilla(self):
@@ -153,6 +164,10 @@ def test_to_from_filesystem_vanilla(self):
f"collected from {__file__.split('/')[-1]}: {run_prime.run_id}",
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.flaky()
@pytest.mark.test_server()
@@ -189,14 +204,23 @@ def test_to_from_filesystem_search(self):
f"collected from {__file__.split('/')[-1]}: {run_prime.run_id}",
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_to_from_filesystem_no_model(self):
model = Pipeline(
- [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())],
+ [
+ ("imputer", SimpleImputer(strategy="mean")),
+ ("classifier", DummyClassifier()),
+ ],
)
task = openml.tasks.get_task(119) # diabetes; crossvalidation
- run = openml.runs.run_model_on_task(model=model, task=task, add_local_measures=False)
+ run = openml.runs.run_model_on_task(
+ model=model, task=task, add_local_measures=False
+ )
cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
run.to_filesystem(cache_path, store_model=False)
@@ -265,7 +289,9 @@ def assert_run_prediction_data(task, run, model):
# Check correctness of y_true and y_pred in run
for fold_id in range(n_folds):
# Get data for fold
- _, test_indices = task.get_train_test_split_indices(repeat=0, fold=fold_id, sample=0)
+ _, test_indices = task.get_train_test_split_indices(
+ repeat=0, fold=fold_id, sample=0
+ )
train_mask = np.full(len(X), True)
train_mask[test_indices] = False
@@ -279,7 +305,9 @@ def assert_run_prediction_data(task, run, model):
y_pred = model.fit(X_train, y_train).predict(X_test)
# Get stored data for fold
- saved_fold_data = run.predictions[run.predictions["fold"] == fold_id].sort_values(
+ saved_fold_data = run.predictions[
+ run.predictions["fold"] == fold_id
+ ].sort_values(
by="row_id",
)
saved_y_pred = saved_fold_data["prediction"].values
@@ -295,6 +323,10 @@ def assert_run_prediction_data(task, run, model):
assert_method(y_pred, saved_y_pred)
assert_method(y_test, saved_y_test)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_publish_with_local_loaded_flow(self):
@@ -323,7 +355,9 @@ def test_publish_with_local_loaded_flow(self):
# Make sure that the prediction data stored in the run is correct.
self.assert_run_prediction_data(task, run, clone(model))
- cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
+ cache_path = os.path.join(
+ self.workdir, "runs", str(random.getrandbits(128))
+ )
run.to_filesystem(cache_path)
# obtain run from filesystem
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
@@ -339,6 +373,10 @@ def test_publish_with_local_loaded_flow(self):
assert openml.flows.flow_exists(flow.name, flow.external_version)
openml.runs.get_run(loaded_run.run_id)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
@pytest.mark.skip(reason="https://github.com/openml/openml-python/issues/1586")
@@ -362,7 +400,9 @@ def test_offline_and_online_run_identical(self):
assert not openml.flows.flow_exists(flow.name, flow.external_version)
# Load from filesystem
- cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
+ cache_path = os.path.join(
+ self.workdir, "runs", str(random.getrandbits(128))
+ )
run.to_filesystem(cache_path)
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
@@ -396,5 +436,7 @@ def test_run_setup_string_included_in_xml(self):
assert "oml:setup_string" in run_dict
assert run_dict["oml:setup_string"] == SETUP_STRING
- recreated_run = openml.runs.functions._create_run_from_xml(xml, from_server=False)
+ recreated_run = openml.runs.functions._create_run_from_xml(
+ xml, from_server=False
+ )
assert recreated_run.setup_string == SETUP_STRING
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 3728e0d78..3f7cc12e9 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -40,7 +40,8 @@
OpenMLNotAuthorizedError,
OpenMLServerException,
)
-#from openml.extensions.sklearn import cat, cont
+
+# from openml.extensions.sklearn import cat, cont
from openml.runs.functions import (
_run_task_get_arffcontent,
delete_run,
@@ -132,9 +133,9 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
time.sleep(10)
continue
- assert len(run.evaluations) > 0, (
- "Expect not-None evaluations to always contain elements."
- )
+ assert (
+ len(run.evaluations) > 0
+ ), "Expect not-None evaluations to always contain elements."
return
raise RuntimeError(
@@ -143,7 +144,10 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
)
def _assert_predictions_equal(self, predictions, predictions_prime):
- assert np.array(predictions_prime["data"]).shape == np.array(predictions["data"]).shape
+ assert (
+ np.array(predictions_prime["data"]).shape
+ == np.array(predictions["data"]).shape
+ )
# The original search model does not submit confidence
# bounds, so we cannot compare the arff line
@@ -164,7 +168,9 @@ def _assert_predictions_equal(self, predictions, predictions_prime):
else:
assert val_1 == val_2
- def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed, create_task_obj):
+ def _rerun_model_and_compare_predictions(
+ self, run_id, model_prime, seed, create_task_obj
+ ):
run = openml.runs.get_run(run_id)
# TODO: assert holdout task
@@ -251,9 +257,13 @@ def _perform_run(
"sklearn.pipeline.Pipeline",
]
if Version(sklearn.__version__) < Version("0.22"):
- classes_without_random_state.append("sklearn.linear_model.base.LinearRegression")
+ classes_without_random_state.append(
+ "sklearn.linear_model.base.LinearRegression"
+ )
else:
- classes_without_random_state.append("sklearn.linear_model._base.LinearRegression")
+ classes_without_random_state.append(
+ "sklearn.linear_model._base.LinearRegression"
+ )
def _remove_random_state(flow):
if "random_state" in flow.parameters:
@@ -305,9 +315,12 @@ def _remove_random_state(flow):
flow_server = self.extension.model_to_flow(clf_server)
if flow.class_name not in classes_without_random_state:
- error_msg = "Flow class %s (id=%d) does not have a random state parameter" % (
- flow.class_name,
- flow.flow_id,
+ error_msg = (
+ "Flow class %s (id=%d) does not have a random state parameter"
+ % (
+ flow.class_name,
+ flow.flow_id,
+ )
)
assert "random_state" in flow.parameters, error_msg
# If the flow is initialized from a model without a random
@@ -397,6 +410,10 @@ def _check_sample_evaluations(
assert evaluation > 0
assert evaluation < max_time_allowed
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_regression_on_classif_task(self):
@@ -407,13 +424,18 @@ def test_run_regression_on_classif_task(self):
# internally dataframe is loaded and targets are categorical
# which LinearRegression() cannot handle
with pytest.raises(
- AttributeError, match="'LinearRegression' object has no attribute 'classes_'"
+ AttributeError,
+ match="'LinearRegression' object has no attribute 'classes_'",
):
openml.runs.run_model_on_task(
model=clf,
task=task,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_check_erronous_sklearn_flow_fails(self):
@@ -479,7 +501,9 @@ def determine_grid_size(param_grid):
grid_iterations += determine_grid_size(sub_grid)
return grid_iterations
else:
- raise TypeError("Param Grid should be of type list (GridSearch only) or dict")
+ raise TypeError(
+ "Param Grid should be of type list (GridSearch only) or dict"
+ )
run = self._perform_run(
task_id,
@@ -627,6 +651,10 @@ def _run_and_upload_regression(
sentinel=sentinel,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_logistic_regression(self):
@@ -634,8 +662,14 @@ def test_run_and_upload_logistic_regression(self):
task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
- self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501")
+ self._run_and_upload_classification(
+ lr, task_id, n_missing_vals, n_test_obs, "62501"
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_linear_regression(self):
@@ -656,7 +690,9 @@ def test_run_and_upload_linear_regression(self):
if e.code == 614: # Task already exists
# the exception message contains the task_id that was matched in the format
# 'Task already exists. - matched id(s): [xxxx]'
- task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+ task_id = ast.literal_eval(
+ e.message.split("matched id(s):")[-1].strip()
+ )[0]
else:
raise Exception(repr(e))
# mark to remove the uploaded task
@@ -665,8 +701,14 @@ def test_run_and_upload_linear_regression(self):
n_missing_vals = self.TEST_SERVER_TASK_REGRESSION["n_missing_vals"]
n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"]
- self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501")
+ self._run_and_upload_regression(
+ lr, task_id, n_missing_vals, n_test_obs, "62501"
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_pipeline_dummy_pipeline(self):
@@ -679,8 +721,14 @@ def test_run_and_upload_pipeline_dummy_pipeline(self):
task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
- self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501")
+ self._run_and_upload_classification(
+ pipeline1, task_id, n_missing_vals, n_test_obs, "62501"
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -706,7 +754,9 @@ def get_ct_cf(nominal_indices, numeric_indices):
"nominal",
make_pipeline(
CustomImputer(strategy="most_frequent"),
- sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"),
+ sklearn.preprocessing.OneHotEncoder(
+ handle_unknown="ignore"
+ ),
),
nominal_indices,
),
@@ -782,7 +832,9 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
task_id = self.TEST_SERVER_TASK_MISSING_VALS["task_id"]
n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS["n_missing_vals"]
n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS["n_test_obs"]
- self._run_and_upload_classification(pipeline2, task_id, n_missing_vals, n_test_obs, "62501")
+ self._run_and_upload_classification(
+ pipeline2, task_id, n_missing_vals, n_test_obs, "62501"
+ )
# The warning raised is:
# "The total space of parameters 8 is smaller than n_iter=10.
# Running 8 iterations. For exhaustive searches, use GridSearchCV."
@@ -798,15 +850,24 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
call_count += 1
assert call_count == 3
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_gridsearch(self):
estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ "base_estimator"
+ if Version(sklearn.__version__) < Version("1.4")
+ else "estimator"
)
gridsearch = GridSearchCV(
BaggingClassifier(**{estimator_name: SVC()}),
- {f"{estimator_name}__C": [0.01, 0.1, 10], f"{estimator_name}__gamma": [0.01, 0.1, 10]},
+ {
+ f"{estimator_name}__C": [0.01, 0.1, 10],
+ f"{estimator_name}__gamma": [0.01, 0.1, 10],
+ },
cv=3,
)
task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
@@ -821,6 +882,10 @@ def test_run_and_upload_gridsearch(self):
)
assert len(run.trace.trace_iterations) == 9
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_randomsearch(self):
@@ -854,6 +919,10 @@ def test_run_and_upload_randomsearch(self):
trace = openml.runs.get_run_trace(run.run_id)
assert len(trace.trace_iterations) == 5
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_and_upload_maskedarrays(self):
@@ -882,6 +951,10 @@ def test_run_and_upload_maskedarrays(self):
##########################################################################
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_learning_curve_task_1(self):
@@ -905,8 +978,14 @@ def test_learning_curve_task_1(self):
pipeline1,
flow_expected_rsv="62501",
)
- self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)
+ self._check_sample_evaluations(
+ run.sample_evaluations, num_repeats, num_folds, num_samples
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_learning_curve_task_2(self):
@@ -942,8 +1021,14 @@ def test_learning_curve_task_2(self):
pipeline2,
flow_expected_rsv="62501",
)
- self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)
+ self._check_sample_evaluations(
+ run.sample_evaluations, num_repeats, num_folds, num_samples
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.21"),
@@ -1023,6 +1108,10 @@ def _test_local_evaluations(self, run):
assert alt_scores[idx] >= 0
assert alt_scores[idx] <= 1
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_local_run_swapped_parameter_order_model(self):
@@ -1039,6 +1128,10 @@ def test_local_run_swapped_parameter_order_model(self):
self._test_local_evaluations(run)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.skip("https://github.com/openml/openml-python/issues/1586")
@unittest.skipIf(
@@ -1108,6 +1201,10 @@ def test_online_run_metric_score(self):
self._test_local_evaluations(run)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1141,7 +1238,9 @@ def test_initialize_model_from_run(self):
if e.code == 614: # Task already exists
# the exception message contains the task_id that was matched in the format
# 'Task already exists. - matched id(s): [xxxx]'
- task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+ task_id = ast.literal_eval(
+ e.message.split("matched id(s):")[-1].strip()
+ )[0]
else:
raise Exception(repr(e))
# mark to remove the uploaded task
@@ -1170,6 +1269,10 @@ def test_initialize_model_from_run(self):
assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"'
assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05"
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1230,6 +1333,10 @@ def test__run_exists(self):
run_ids = run_exists(task.task_id, setup_exists)
assert run_ids, (run_ids, clf)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_with_illegal_flow_id(self):
@@ -1243,13 +1350,19 @@ def test_run_with_illegal_flow_id(self):
expected_message_regex = (
r"Flow does not exist on the server, but 'flow.flow_id' is not None."
)
- with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex):
+ with pytest.raises(
+ openml.exceptions.PyOpenMLError, match=expected_message_regex
+ ):
openml.runs.run_flow_on_task(
task=task,
flow=flow,
avoid_duplicate_runs=True,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_with_illegal_flow_id_after_load(self):
@@ -1277,11 +1390,19 @@ def test_run_with_illegal_flow_id_after_load(self):
expected_message_regex = (
r"Flow does not exist on the server, but 'flow.flow_id' is not None."
)
- with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex):
+ with pytest.raises(
+ openml.exceptions.PyOpenMLError, match=expected_message_regex
+ ):
loaded_run.publish()
TestBase._mark_entity_for_removal("run", loaded_run.run_id)
- TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}")
+ TestBase.logger.info(
+ f"collected from test_run_functions: {loaded_run.run_id}"
+ )
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_with_illegal_flow_id_1(self):
@@ -1293,21 +1414,31 @@ def test_run_with_illegal_flow_id_1(self):
try:
flow_orig.publish() # ensures flow exist on server
TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name)
- TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}")
+ TestBase.logger.info(
+ f"collected from test_run_functions: {flow_orig.flow_id}"
+ )
except openml.exceptions.OpenMLServerException:
# flow already exists
pass
flow_new = self.extension.model_to_flow(clf)
flow_new.flow_id = -1
- expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
- with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex):
+ expected_message_regex = (
+ "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
+ )
+ with pytest.raises(
+ openml.exceptions.PyOpenMLError, match=expected_message_regex
+ ):
openml.runs.run_flow_on_task(
task=task,
flow=flow_new,
avoid_duplicate_runs=True,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_with_illegal_flow_id_1_after_load(self):
@@ -1319,7 +1450,9 @@ def test_run_with_illegal_flow_id_1_after_load(self):
try:
flow_orig.publish() # ensures flow exist on server
TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name)
- TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}")
+ TestBase.logger.info(
+ f"collected from test_run_functions: {flow_orig.flow_id}"
+ )
except openml.exceptions.OpenMLServerException:
# flow already exists
pass
@@ -1340,13 +1473,19 @@ def test_run_with_illegal_flow_id_1_after_load(self):
run.to_filesystem(cache_path)
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
- expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
+ expected_message_regex = (
+ "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
+ )
self.assertRaisesRegex(
openml.exceptions.PyOpenMLError,
expected_message_regex,
loaded_run.publish,
)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1577,6 +1716,10 @@ def test_get_runs_list_by_tag(self):
runs = openml.runs.list_runs(tag="curves", size=2)
assert len(runs) >= 1
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1598,7 +1741,10 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
cont_imp = make_pipeline(CustomImputer(), StandardScaler())
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
model = Pipeline(
- steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())],
+ steps=[
+ ("preprocess", ct),
+ ("estimator", sklearn.tree.DecisionTreeClassifier()),
+ ],
) # build a sklearn classifier
data_content, _, _, _ = _run_task_get_arffcontent(
@@ -1614,6 +1760,10 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
# repeat, fold, row_id, 6 confidences, prediction and correct label
assert len(row) == 12
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1642,7 +1792,10 @@ def test_run_on_dataset_with_missing_labels_array(self):
cont_imp = make_pipeline(CustomImputer(), StandardScaler())
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
model = Pipeline(
- steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())],
+ steps=[
+ ("preprocess", ct),
+ ("estimator", sklearn.tree.DecisionTreeClassifier()),
+ ],
) # build a sklearn classifier
data_content, _, _, _ = _run_task_get_arffcontent(
@@ -1668,6 +1821,10 @@ def test_get_uncached_run(self):
with pytest.raises(openml.exceptions.OpenMLCacheException):
openml.runs.functions._get_cached_run(10)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_run_flow_on_task_downloaded_flow(self):
@@ -1696,7 +1853,8 @@ def test_format_prediction_non_supervised(self):
clustering = openml.tasks.get_task(126033, download_data=False)
ignored_input = [0] * 5
with pytest.raises(
- NotImplementedError, match=r"Formatting for is not supported."
+ NotImplementedError,
+ match=r"Formatting for is not supported.",
):
format_prediction(clustering, *ignored_input)
@@ -1707,7 +1865,9 @@ def test_format_prediction_classification_no_probabilities(self):
download_data=False,
)
ignored_input = [0] * 5
- with pytest.raises(ValueError, match="`proba` is required for classification task"):
+ with pytest.raises(
+ ValueError, match="`proba` is required for classification task"
+ ):
format_prediction(classification, *ignored_input, proba=None)
@pytest.mark.test_server()
@@ -1718,8 +1878,12 @@ def test_format_prediction_classification_incomplete_probabilities(self):
)
ignored_input = [0] * 5
incomplete_probabilities = {c: 0.2 for c in classification.class_labels[1:]}
- with pytest.raises(ValueError, match="Each class should have a predicted probability"):
- format_prediction(classification, *ignored_input, proba=incomplete_probabilities)
+ with pytest.raises(
+ ValueError, match="Each class should have a predicted probability"
+ ):
+ format_prediction(
+ classification, *ignored_input, proba=incomplete_probabilities
+ )
@pytest.mark.test_server()
def test_format_prediction_task_without_classlabels_set(self):
@@ -1729,16 +1893,24 @@ def test_format_prediction_task_without_classlabels_set(self):
)
classification.class_labels = None
ignored_input = [0] * 5
- with pytest.raises(ValueError, match="The classification task must have class labels set"):
+ with pytest.raises(
+ ValueError, match="The classification task must have class labels set"
+ ):
format_prediction(classification, *ignored_input, proba={})
@pytest.mark.test_server()
def test_format_prediction_task_learning_curve_sample_not_set(self):
- learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation
+ learning_curve = openml.tasks.get_task(
+ 801, download_data=False
+ ) # diabetes;crossvalidation
probabilities = {c: 0.2 for c in learning_curve.class_labels}
ignored_input = [0] * 5
- with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"):
- format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities)
+ with pytest.raises(
+ ValueError, match="`sample` can not be none for LearningCurveTask"
+ ):
+ format_prediction(
+ learning_curve, *ignored_input, sample=None, proba=probabilities
+ )
@pytest.mark.test_server()
def test_format_prediction_task_regression(self):
@@ -1756,7 +1928,9 @@ def test_format_prediction_task_regression(self):
if e.code == 614: # Task already exists
# the exception message contains the task_id that was matched in the format
# 'Task already exists. - matched id(s): [xxxx]'
- task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+ task_id = ast.literal_eval(
+ e.message.split("matched id(s):")[-1].strip()
+ )[0]
else:
raise Exception(repr(e))
# mark to remove the uploaded task
@@ -1786,12 +1960,16 @@ def test_delete_run(self):
task = openml.tasks.get_task(32) # diabetes; crossvalidation
run = openml.runs.run_model_on_task(
- model=clf, task=task, seed=rs,
+ model=clf,
+ task=task,
+ seed=rs,
)
run.publish()
with pytest.raises(openml.exceptions.OpenMLRunsExistError):
- openml.runs.run_model_on_task(model=clf, task=task, seed=rs, avoid_duplicate_runs=True)
+ openml.runs.run_model_on_task(
+ model=clf, task=task, seed=rs, avoid_duplicate_runs=True
+ )
TestBase._mark_entity_for_removal("run", run.run_id)
TestBase.logger.info(f"collected from test_run_functions: {run.run_id}")
@@ -1799,7 +1977,9 @@ def test_delete_run(self):
_run_id = run.run_id
assert delete_run(_run_id)
- @pytest.mark.skip(reason="run id is in problematic state on test server due to PR#1454")
+ @pytest.mark.skip(
+ reason="run id is in problematic state on test server due to PR#1454"
+ )
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
reason="SimpleImputer doesn't handle mixed type DataFrame as input",
@@ -1866,15 +2046,19 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_server_v1, t
assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+@pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.21"),
reason="couldn't perform local tests successfully w/o bloating RAM",
- )
+)
@unittest.skipIf(
Version(sklearn.__version__) >= Version("1.8"),
reason="predictions differ significantly",
- )
+)
@mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
@pytest.mark.test_server()
def test__run_task_get_arffcontent_2(parallel_mock):
@@ -1903,8 +2087,11 @@ def test__run_task_get_arffcontent_2(parallel_mock):
]
)
n_jobs = 2
- backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+ backend = (
+ "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+ )
from openml_sklearn import SklearnExtension
+
extension = SklearnExtension()
with parallel_backend(backend, n_jobs=n_jobs):
res = openml.runs.functions._run_task_get_arffcontent(
@@ -1948,11 +2135,15 @@ def test__run_task_get_arffcontent_2(parallel_mock):
)
+@pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.21"),
reason="couldn't perform local tests successfully w/o bloating RAM",
- )
+)
@mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
@pytest.mark.parametrize(
("n_jobs", "backend", "call_count"),
@@ -1961,18 +2152,28 @@ def test__run_task_get_arffcontent_2(parallel_mock):
# spawns multiple processes if n_jobs != 1, which means the mock is not applied.
(2, None, 0),
(-1, None, 0),
- (1, None, 10), # with n_jobs=1 the mock *is* applied, since there is no new subprocess
+ (
+ 1,
+ None,
+ 10,
+ ), # with n_jobs=1 the mock *is* applied, since there is no new subprocess
(1, "sequential", 10),
(1, "threading", 10),
- (-1, "threading", 10), # the threading backend does preserve mocks even with parallelizing
- ]
+ (
+ -1,
+ "threading",
+ 10,
+ ), # the threading backend does preserve mocks even with parallelizing
+ ],
)
@pytest.mark.test_server()
def test_joblib_backends(parallel_mock, n_jobs, backend, call_count):
"""Tests evaluation of a run using various joblib backends and n_jobs."""
if backend is None:
backend = (
- "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+ "loky"
+ if Version(joblib.__version__) > Version("0.11")
+ else "multiprocessing"
)
task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
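Aside: the `test_joblib_backends` parametrization above encodes the rule that `unittest.mock` patches are only visible to workers sharing the parent interpreter. A standalone sketch of that behaviour, with `math.sqrt` as an illustrative patch target (not part of this patch):

import math
from unittest import mock

import joblib

def calls_sqrt(x):
    # Runs inside a joblib worker; it sees the patched math.sqrt only if
    # the worker lives in the parent process (sequential or threading).
    return math.sqrt(x)

with mock.patch("math.sqrt", return_value=-1.0):
    with joblib.parallel_backend("threading", n_jobs=2):
        results = joblib.Parallel()(joblib.delayed(calls_sqrt)(4) for _ in range(2))

assert results == [-1.0, -1.0]  # the mock survives into threads
# A process-based backend such as "loky" re-imports math in fresh worker
# processes, so the real sqrt would run there and return 2.0 instead.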
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index 0735925f2..da87c0cc9 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -34,6 +34,10 @@ def setUp(self):
self.extension = SklearnExtension()
super().setUp()
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_nonexisting_setup_exists(self):
@@ -45,7 +49,9 @@ def test_nonexisting_setup_exists(self):
flow.name = f"TEST{sentinel}{flow.name}"
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
# although the flow exists (created by the previous statement),
# we can be sure there are no setups (yet) as it was just created
@@ -58,7 +64,9 @@ def _existing_setup_exists(self, classif):
flow.name = f"TEST{get_sentinel()}{flow.name}"
flow.publish()
TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+ TestBase.logger.info(
+ f"collected from {__file__.split('/')[-1]}: {flow.flow_id}"
+ )
# although the flow exists, we can be sure there are no
# setups (yet) as it hasn't been run
@@ -82,6 +90,10 @@ def _existing_setup_exists(self, classif):
setup_id = openml.setups.setup_exists(flow)
assert setup_id == run.setup_id
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_existing_setup_exists_1(self):
@@ -98,12 +110,20 @@ def side_effect(self):
nb = sklearn.naive_bayes.GaussianNB()
self._existing_setup_exists(nb)
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_exisiting_setup_exists_2(self):
# Check a flow with one hyperparameter
self._existing_setup_exists(sklearn.naive_bayes.GaussianNB())
+ @pytest.mark.skipif(
+ os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+ reason="Pending resolution of #1657",
+ )
@pytest.mark.sklearn()
@pytest.mark.test_server()
def test_existing_setup_exists_3(self):
@@ -161,10 +181,14 @@ def test_list_setups_output_format(self):
flow_id = 6794
setups = openml.setups.list_setups(flow=flow_id, size=10)
assert isinstance(setups, dict)
- assert isinstance(setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup)
+ assert isinstance(
+ setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup
+ )
assert len(setups) == 10
- setups = openml.setups.list_setups(flow=flow_id, size=10, output_format="dataframe")
+ setups = openml.setups.list_setups(
+ flow=flow_id, size=10, output_format="dataframe"
+ )
assert isinstance(setups, pd.DataFrame)
assert len(setups) == 10
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
index bf2fcfeae..931855841 100644
--- a/tests/test_tasks/test_task_functions.py
+++ b/tests/test_tasks/test_task_functions.py
@@ -3,17 +3,18 @@
import os
import unittest
-from typing import cast
from unittest import mock
-import pandas as pd
import pytest
import requests
import openml
from openml import OpenMLSplit, OpenMLTask
-from openml.exceptions import OpenMLCacheException, OpenMLNotAuthorizedError, OpenMLServerException
-from openml.tasks import TaskType
+from openml.exceptions import (
+ OpenMLNotAuthorizedError,
+ OpenMLServerException,
+)
+from openml.tasks import TaskType, task
from openml.testing import TestBase, create_request_response
@@ -26,29 +27,6 @@ def setUp(self):
def tearDown(self):
super().tearDown()
- @pytest.mark.test_server()
- def test__get_cached_tasks(self):
- openml.config.set_root_cache_directory(self.static_cache_dir)
- tasks = openml.tasks.functions._get_cached_tasks()
- assert isinstance(tasks, dict)
- assert len(tasks) == 3
- assert isinstance(next(iter(tasks.values())), OpenMLTask)
-
- @pytest.mark.test_server()
- def test__get_cached_task(self):
- openml.config.set_root_cache_directory(self.static_cache_dir)
- task = openml.tasks.functions._get_cached_task(1)
- assert isinstance(task, OpenMLTask)
-
- def test__get_cached_task_not_cached(self):
- openml.config.set_root_cache_directory(self.static_cache_dir)
- self.assertRaisesRegex(
- OpenMLCacheException,
- "Task file for tid 2 not cached",
- openml.tasks.functions._get_cached_task,
- 2,
- )
-
@pytest.mark.test_server()
def test__get_estimation_procedure_list(self):
estimation_procedures = openml.tasks.functions._get_estimation_procedure_list()
@@ -141,7 +119,9 @@ def test_list_tasks_per_type_paginate(self):
@pytest.mark.test_server()
def test__get_task(self):
openml.config.set_root_cache_directory(self.static_cache_dir)
- openml.tasks.get_task(1882)
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ openml.tasks.get_task(1882)
+ mock_request.assert_not_called()
@unittest.skip(
"Please await outcome of discussion: https://github.com/openml/OpenML/issues/776",
@@ -155,21 +135,16 @@ def test__get_task_live(self):
@pytest.mark.test_server()
def test_get_task(self):
- task = openml.tasks.get_task(1, download_data=True) # anneal; crossvalidation
- assert isinstance(task, OpenMLTask)
- assert os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "tasks", "1", "task.xml")
- )
- assert not os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff")
- )
- assert os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset_1.pq")
- )
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ openml.tasks.get_task(1)
+ mock_request.assert_not_called()
@pytest.mark.test_server()
def test_get_task_lazy(self):
- task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation
+ mock_request.assert_not_called()
+
assert isinstance(task, OpenMLTask)
assert os.path.exists(
os.path.join(openml.config.get_cache_directory(), "tasks", "2", "task.xml")
@@ -177,16 +152,25 @@ def test_get_task_lazy(self):
assert task.class_labels == ["1", "2", "3", "4", "5", "U"]
assert not os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff")
+ os.path.join(
+ openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff"
+ )
)
# Since download_data=False is propagated to get_dataset
assert not os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "datasets", "2", "dataset.arff")
+ os.path.join(
+ openml.config.get_cache_directory(), "datasets", "2", "dataset.arff"
+ )
)
- task.download_split()
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ task.download_split()
+ mock_request.assert_not_called()
+
assert os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff")
+ os.path.join(
+ openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff"
+ )
)
@mock.patch("openml.tasks.functions.get_dataset")
@@ -211,7 +195,10 @@ def assert_and_raise(*args, **kwargs):
@pytest.mark.test_server()
def test_get_task_with_cache(self):
openml.config.set_root_cache_directory(self.static_cache_dir)
- task = openml.tasks.get_task(1)
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ task = openml.tasks.get_task(1)
+ mock_request.assert_not_called()
+
assert isinstance(task, OpenMLTask)
@pytest.mark.production_server()
@@ -226,11 +213,15 @@ def test_get_task_different_types(self):
@pytest.mark.test_server()
def test_download_split(self):
- task = openml.tasks.get_task(1) # anneal; crossvalidation
- split = task.download_split()
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ task = openml.tasks.get_task(1) # anneal; crossvalidation
+ split = task.download_split()
+ mock_request.assert_not_called()
assert type(split) == OpenMLSplit
assert os.path.exists(
- os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff")
+ os.path.join(
+ openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff"
+ )
)
def test_deletion_of_cache_dir(self):
@@ -244,14 +235,13 @@ def test_deletion_of_cache_dir(self):
assert not os.path.exists(tid_cache_dir)
-@mock.patch.object(requests.Session, "delete")
-def test_delete_task_not_owned(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
+@mock.patch.object(requests.Session, "request")
+def test_delete_task_not_owned(mock_request, test_files_directory, test_server_v1, test_apikey_v1):
content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_owned.xml"
- mock_delete.return_value = create_request_response(
+ mock_request.return_value = create_request_response(
status_code=412,
content_filepath=content_file,
)
-
with pytest.raises(
OpenMLNotAuthorizedError,
match="The task can not be deleted because it was not uploaded by you.",
@@ -259,14 +249,14 @@ def test_delete_task_not_owned(mock_delete, test_files_directory, test_server_v1
openml.tasks.delete_task(1)
task_url = test_server_v1 + "task/1"
- assert task_url == mock_delete.call_args.args[0]
- assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+ assert task_url == mock_request.call_args.kwargs.get("url")
+ assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key")
-@mock.patch.object(requests.Session, "delete")
-def test_delete_task_with_run(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
+@mock.patch.object(requests.Session, "request")
+def test_delete_task_with_run(mock_request, test_files_directory, test_server_v1, test_apikey_v1):
content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_has_runs.xml"
- mock_delete.return_value = create_request_response(
+ mock_request.return_value = create_request_response(
status_code=412,
content_filepath=content_file,
)
@@ -278,14 +268,14 @@ def test_delete_task_with_run(mock_delete, test_files_directory, test_server_v1,
openml.tasks.delete_task(3496)
task_url = test_server_v1 + "task/3496"
- assert task_url == mock_delete.call_args.args[0]
- assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+ assert task_url == mock_request.call_args.kwargs.get("url")
+ assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key")
-@mock.patch.object(requests.Session, "delete")
-def test_delete_success(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
+@mock.patch.object(requests.Session, "request")
+def test_delete_success(mock_request, test_files_directory, test_server_v1, test_apikey_v1):
content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_successful.xml"
- mock_delete.return_value = create_request_response(
+ mock_request.return_value = create_request_response(
status_code=200,
content_filepath=content_file,
)
@@ -294,14 +284,14 @@ def test_delete_success(mock_delete, test_files_directory, test_server_v1, test_
assert success
task_url = test_server_v1 + "task/361323"
- assert task_url == mock_delete.call_args.args[0]
- assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+ assert task_url == mock_request.call_args.kwargs.get("url")
+ assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key")
-@mock.patch.object(requests.Session, "delete")
-def test_delete_unknown_task(mock_delete, test_files_directory, test_server_v1, test_apikey_v1):
+@mock.patch.object(requests.Session, "request")
+def test_delete_unknown_task(mock_request, test_files_directory, test_server_v1, test_apikey_v1):
content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_exist.xml"
- mock_delete.return_value = create_request_response(
+ mock_request.return_value = create_request_response(
status_code=412,
content_filepath=content_file,
)
@@ -313,5 +303,5 @@ def test_delete_unknown_task(mock_delete, test_files_directory, test_server_v1,
openml.tasks.delete_task(9_999_999)
task_url = test_server_v1 + "task/9999999"
- assert task_url == mock_delete.call_args.args[0]
- assert test_apikey_v1 == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+ assert task_url == mock_request.call_args.kwargs.get("url")
+ assert test_apikey_v1 == mock_request.call_args.kwargs.get("params", {}).get("api_key")
diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py
index 9316d0876..81c133edc 100644
--- a/tests/test_tasks/test_task_methods.py
+++ b/tests/test_tasks/test_task_methods.py
@@ -6,6 +6,7 @@
import openml
from openml.testing import TestBase
import pytest
+import unittest.mock
# Common methods between tasks
@@ -33,9 +34,13 @@ def test_tagging(self):
assert len(tasks) == 0
@pytest.mark.test_server()
- def test_get_train_and_test_split_indices(self):
+ def test_get_train_and_test_split_indices(self):
openml.config.set_root_cache_directory(self.static_cache_dir)
- task = openml.tasks.get_task(1882)
+
+ with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
+ task = openml.tasks.get_task(1882)
+ mock_request.assert_not_called()
+
train_indices, test_indices = task.get_train_test_split_indices(0, 0)
assert train_indices[0] == 16
assert train_indices[-1] == 395
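Aside: the cache assertions introduced above share one pattern: patch `requests.sessions.Session.request`, run the call that should be served from the static cache, and verify that no HTTP traffic occurred. A minimal sketch of the pattern in isolation (illustrative task id, assuming the task is already in the local cache):

import unittest.mock

import openml

def assert_served_from_cache(task_id: int) -> None:
    # Session.request underlies every HTTP verb, so patching it intercepts
    # GET and POST alike; any network access would register on the mock.
    with unittest.mock.patch("requests.sessions.Session.request") as mock_request:
        openml.tasks.get_task(task_id)
    mock_request.assert_not_called()

assert_served_from_cache(1882)  # id taken from the cached-task tests above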
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 111ff778c..b74294575 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -50,7 +50,7 @@ def _mocked_perform_api_call(call, request_method):
@pytest.mark.test_server()
def test_list_all():
- openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks)
+ openml.utils._list_all(listing_call=openml.tasks.functions.list_tasks)
@pytest.mark.test_server()
@@ -65,7 +65,7 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server):
# batches and at the same time do as few batches (roundtrips) as possible.
batch_size = min_number_tasks_on_test_server - 1
batches = openml.utils._list_all(
- listing_call=openml.tasks.functions._list_tasks,
+ listing_call=openml._backend.task.list,
batch_size=batch_size,
)
assert len(batches) >= 2
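Aside: the `OPENML_USE_LOCAL_SERVICES` guard added throughout this patch repeats the same four lines in every module. A sketch of defining it once in a shared helper (hypothetical module name, not part of this patch):

# tests/_markers.py (hypothetical)
import os

import pytest

# One definition of the guard; see issue #1657 for the tracked failures.
skip_on_local_services = pytest.mark.skipif(
    os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
    reason="Pending resolution of #1657",
)

Each test module could then apply `@skip_on_local_services` as a plain decorator instead of restating the condition.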