diff --git a/src/dstack/_internal/core/models/routers.py b/src/dstack/_internal/core/models/routers.py index b7950369c2..b1f189c522 100644 --- a/src/dstack/_internal/core/models/routers.py +++ b/src/dstack/_internal/core/models/routers.py @@ -9,6 +9,7 @@ class RouterType(str, Enum): SGLANG = "sglang" + DYNAMO = "dynamo" class SGLangGatewayRouterConfig(CoreModel): @@ -45,8 +46,15 @@ class SGLangServiceRouterConfig(CoreModel): class ReplicaGroupRouterConfig(CoreModel): type: Annotated[ - Literal["sglang"], - Field(description="The router implementation for this replica group."), + Literal["sglang", "dynamo"], + Field( + description=( + "The router implementation for this replica group. " + "`sglang` runs the SGLang router and dstack syncs worker URLs to it. " + "`dynamo` runs the NVIDIA Dynamo frontend, which discovers workers " + "itself via etcd/NATS." + ), + ), ] = "sglang" diff --git a/src/dstack/_internal/core/models/runs.py b/src/dstack/_internal/core/models/runs.py index 9748aa46f8..e7b40c8a0b 100644 --- a/src/dstack/_internal/core/models/runs.py +++ b/src/dstack/_internal/core/models/runs.py @@ -43,6 +43,7 @@ ) from dstack._internal.core.models.repos import AnyRunRepoData from dstack._internal.core.models.resources import Memory, ResourcesSpec +from dstack._internal.core.models.routers import RouterType from dstack._internal.core.models.unix import UnixUser from dstack._internal.core.models.volumes import MountPoint from dstack._internal.utils import common as common_utils @@ -603,6 +604,31 @@ def _merged_profile(cls, values) -> Dict: values["merged_profile"] = merged_profile return values + @root_validator + def _validate_dynamo_no_retry(cls, values) -> Dict: + """Reject `retry` for services with a Dynamo router replica group. + Dynamo workers cache the router's internal IP at provisioning time. A + retry would produce a new router and likely a new internal_ip, leaving workers bound + to a router that no longer exists. 
+ """ + merged_profile = values.get("merged_profile") + cfg = values.get("configuration") + if merged_profile is None or merged_profile.retry is None: + return values + if not isinstance(cfg, ServiceConfiguration): + return values + for g in cfg.replica_groups: + if g.router is not None and g.router.type == RouterType.DYNAMO: + raise ValueError( + "Retry cannot be configured for services with a Dynamo " + "router replica group. The router's address must remain " + "stable for the life of the run; allowing retry would " + "leave workers bound to a router that no longer exists. " + "Remove `retry` from the profile/configuration and " + "re-apply." + ) + return values + class ServiceModelSpec(CoreModel): name: str diff --git a/src/dstack/_internal/proxy/gateway/services/registry.py b/src/dstack/_internal/proxy/gateway/services/registry.py index 84fbce8711..f190523a39 100644 --- a/src/dstack/_internal/proxy/gateway/services/registry.py +++ b/src/dstack/_internal/proxy/gateway/services/registry.py @@ -20,7 +20,7 @@ ServiceConfig, ) from dstack._internal.proxy.lib import models -from dstack._internal.proxy.lib.const import SGLANG_WHITELISTED_PATHS +from dstack._internal.proxy.lib.const import ROUTER_WHITELISTED_PATHS from dstack._internal.proxy.lib.errors import ProxyError, UnexpectedProxyError from dstack._internal.proxy.lib.repo import BaseProxyRepo from dstack._internal.proxy.lib.services.service_connection import ( @@ -344,7 +344,7 @@ async def get_nginx_service_config( ) -> ServiceConfig: limit_req_zones: list[LimitReqZoneConfig] = [] locations: list[LocationConfig] = [] - is_sglang = ( + is_router = ( service.router is not None and service.router.type == RouterType.SGLANG ) or service.has_router_replica sglang_limits: dict[str, LimitReqConfig] = {} @@ -361,8 +361,8 @@ async def get_nginx_service_config( limit_req_zones.append( LimitReqZoneConfig(name=zone_name, key=key, rpm=round(rate_limit.rps * 60)) ) - if is_sglang: - for path in SGLANG_WHITELISTED_PATHS: + if 
is_router: + for path in ROUTER_WHITELISTED_PATHS: if rate_limit.prefix == path or path.startswith(rate_limit.prefix): # Use the longest prefix if multiple prefixes match the same path current_prefix_len = len(rate_limit.prefix) @@ -381,9 +381,9 @@ async def get_nginx_service_config( ) ) - # Add SGLang whitelisted paths as locations - if is_sglang: - for path in SGLANG_WHITELISTED_PATHS: + # Add router whitelisted paths as locations + if is_router: + for path in ROUTER_WHITELISTED_PATHS: # Use prefix match for paths that end with a slash and exact match for paths that don't if path.endswith("/"): locations.append(LocationConfig(prefix=path, limit_req=sglang_limits.get(path))) @@ -392,8 +392,8 @@ async def get_nginx_service_config( LocationConfig(prefix=f"= {path}", limit_req=sglang_limits.get(path)) ) - # Don't auto-add / location for SGLang routers (catch-all 403 handles it) - if not any(location.prefix == "/" for location in locations) and not is_sglang: + # Don't auto-add / location for router-based services (catch-all 403 handles it) + if not any(location.prefix == "/" for location in locations) and not is_router: locations.append(LocationConfig(prefix="/", limit_req=None)) return ServiceConfig( domain=service.domain_safe, diff --git a/src/dstack/_internal/proxy/lib/const.py b/src/dstack/_internal/proxy/lib/const.py index a307c22756..43ede03ac8 100644 --- a/src/dstack/_internal/proxy/lib/const.py +++ b/src/dstack/_internal/proxy/lib/const.py @@ -2,7 +2,10 @@ Shared constants for proxy components (gateway + in-server proxy). """ -SGLANG_WHITELISTED_PATHS: tuple[str, ...] = ( +# Inference endpoints exposed by the in-replica HTTP router. Applies to both +# SGLang's router and Dynamo's `dynamo.frontend` — they share the +# OpenAI-compatible endpoint surface. +ROUTER_WHITELISTED_PATHS: tuple[str, ...] 
= ( "/generate", "/v1/", "/chat/completions", diff --git a/src/dstack/_internal/server/background/pipeline_tasks/jobs_running.py b/src/dstack/_internal/server/background/pipeline_tasks/jobs_running.py index d441a9e2d3..0d818d3a4f 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/jobs_running.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/jobs_running.py @@ -23,6 +23,7 @@ from dstack._internal.core.models.metrics import Metric from dstack._internal.core.models.profiles import StartupOrder from dstack._internal.core.models.repos import RemoteRepoCreds +from dstack._internal.core.models.routers import RouterType from dstack._internal.core.models.runs import ( ClusterInfo, ImagePullProgress, @@ -102,6 +103,13 @@ from dstack._internal.server.services.runner import client from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel from dstack._internal.server.services.runs import is_job_ready, run_model_to_run +from dstack._internal.server.services.runs.replicas import ( + ROUTER_FAILED, + ROUTER_NOT_PROVISIONED, + get_router_env_for_job, + get_router_replica_group, + get_router_replica_num, +) from dstack._internal.server.services.secrets import get_project_secrets_mapping from dstack._internal.server.services.storage import get_default_storage from dstack._internal.server.utils import sentry_utils @@ -114,6 +122,8 @@ JOB_STATUSES_WITH_MIN_PROCESSING_INTERVAL = [JobStatus.PROVISIONING, JobStatus.PULLING] +ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS = 30 * 60 + JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2) """`The minimum time before terminating active job in case of connectivity issues.""" @@ -384,8 +394,12 @@ async def _load_process_context(item: JobRunningPipelineItem) -> Optional[_Proce job_submissions=[job_model_to_job_submission(job_model)], ) else: - # PROVISIONING/PULLING jobs need same-replica siblings for cluster coordination. - # All sibling access is replica-scoped, so only load jobs for this replica. 
+ # PROVISIONING/PULLING jobs need same-replica siblings for cluster + # coordination, plus — when the run has a router replica group — + # the router replica's job (cross-replica) so the env-injection + # gate in _prepare_startup_context can read its status / IP. + # _fetch_run_model handles both: same-replica jobs always, plus + # the router replica's job when one exists. run_model = await _fetch_run_model( session=session, run_id=job_model.run_id, replica_num=item.replica_num ) @@ -477,6 +491,54 @@ async def _prepare_startup_context( ) return None + # If this run has a router replica group and this job is a worker, gate + # startup on the router replica's state. The helper returns None for the + # router itself and for runs without a router group, so this whole block + # is a no-op in those cases. + router_env = get_router_env_for_job( + run_model=context.run_model, + run_spec=context.run.run_spec, + job_model=context.job_model, + ) + if router_env is ROUTER_FAILED: + # Router has reached a terminal state — the worker cannot recover by + # waiting. Terminate it now with a clear reason instead of letting it + # idle until the run-level reconciler tears the whole run down. + _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.TERMINATED_BY_SERVER, + termination_reason_message=( + "Router replica is in a terminal state; cannot provision worker " + "without a running router." + ), + ) + return None + if router_env is ROUTER_NOT_PROVISIONED: + # Router is alive but its internal_ip is not yet known. Defer this + # worker — the next pipeline tick will re-check. Bound the wait so a + # router that is genuinely stuck can't burn worker instance-hours + # forever; see ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS. 
+ waited_seconds = (get_current_datetime() - context.job_model.submitted_at).total_seconds() + if waited_seconds > ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS: + _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.TERMINATED_BY_SERVER, + termination_reason_message=( + f"Router replica did not acquire an internal IP within " + f"{ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS}s; terminating worker." + ), + ) + return None + logger.debug( + "%s: waiting for router replica to be provisioned", + fmt(context.job_model), + ) + return None + if router_env: + context.job.job_spec.env.update(router_env) + cluster_info = _get_cluster_info( jobs=context.run.jobs, replica_num=context.job.job_spec.replica_num, @@ -549,7 +611,10 @@ async def _fetch_run_model( Args: replica_num: If None, skip loading jobs (for RUNNING jobs that don't need siblings). If set, load only latest-submission jobs for that replica (for PROVISIONING/PULLING - jobs that need same-replica siblings for cluster coordination). + jobs that need same-replica siblings for cluster coordination). When the run has + a Dynamo router replica group whose replica_num differs from this one, that replica's + jobs are also loaded so cross-replica router lookups (see get_router_env_for_job + in services/runs/replicas.py) can find it. """ query = ( select(RunModel) @@ -560,6 +625,31 @@ async def _fetch_run_model( .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name)) ) if replica_num is not None: + # Pre-fetch the bare run_spec to discover whether the run has a + # router replica group, and if so at which replica_num. The query + # below then includes both this replica AND the router replica. + # For runs without a router group (services without one, plus + # all tasks and dev-environments), the helper returns None and we + # fall through to the original single-replica behavior.
+ spec_res = await session.execute(select(RunModel.run_spec).where(RunModel.id == run_id)) + run_spec_str = spec_res.scalar_one() + run_spec = RunSpec.__response__.parse_raw(run_spec_str) + # The router pre-fetch only exists to feed get_router_env_for_job, + # which is gated to Dynamo. Skip it for SGLang and non-router runs. + router_group = get_router_replica_group(run_spec) + if ( + router_group is not None + and router_group.router is not None + and router_group.router.type == RouterType.DYNAMO + ): + router_replica_num = get_router_replica_num(run_spec) + else: + router_replica_num = None + + replica_nums: list[int] = [replica_num] + if router_replica_num is not None and router_replica_num != replica_num: + replica_nums.append(router_replica_num) + latest_submissions_sq = ( select( JobModel.run_id.label("run_id"), @@ -567,7 +657,7 @@ async def _fetch_run_model( JobModel.job_num.label("job_num"), func.max(JobModel.submission_num).label("max_submission_num"), ) - .where(JobModel.run_id == run_id, JobModel.replica_num == replica_num) + .where(JobModel.run_id == run_id, JobModel.replica_num.in_(replica_nums)) .group_by(JobModel.run_id, JobModel.replica_num, JobModel.job_num) .subquery() ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/service_router_worker_sync.py b/src/dstack/_internal/server/background/pipeline_tasks/service_router_worker_sync.py index 2fa80fc43a..2b416fb823 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/service_router_worker_sync.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/service_router_worker_sync.py @@ -33,7 +33,7 @@ from dstack._internal.server.services.locking import get_locker from dstack._internal.server.services.pipelines import PipelineHinterProtocol from dstack._internal.server.services.runs.router_worker_sync import ( - run_model_has_router_replica_group, + run_model_has_sglang_router_replica_group, sync_router_workers_for_run_model, ) from dstack._internal.server.utils import 
sentry_utils @@ -212,7 +212,7 @@ async def process(self, item: ServiceRouterWorkerSyncPipelineItem) -> None: run_model.deleted or run_model.status.is_finished() or run_model.status != RunStatus.RUNNING - or not run_model_has_router_replica_group(run_model) + or not run_model_has_sglang_router_replica_group(run_model) ): early_cleanup_update_map: _SyncRowUpdateMap = {"deleted": True} set_processed_update_map_fields(early_cleanup_update_map) diff --git a/src/dstack/_internal/server/services/proxy/services/service_proxy.py b/src/dstack/_internal/server/services/proxy/services/service_proxy.py index c75fb23542..85b7150e8c 100644 --- a/src/dstack/_internal/server/services/proxy/services/service_proxy.py +++ b/src/dstack/_internal/server/services/proxy/services/service_proxy.py @@ -6,7 +6,7 @@ from starlette.requests import ClientDisconnect from dstack._internal.core.models.routers import RouterType -from dstack._internal.proxy.lib.const import SGLANG_WHITELISTED_PATHS +from dstack._internal.proxy.lib.const import ROUTER_WHITELISTED_PATHS from dstack._internal.proxy.lib.deps import ProxyAuthContext from dstack._internal.proxy.lib.errors import ProxyError from dstack._internal.proxy.lib.repo import BaseProxyRepo @@ -45,7 +45,7 @@ async def proxy( service.router is not None and service.router.type == RouterType.SGLANG ) or service.has_router_replica: path_for_match = path if path.startswith("/") else f"/{path}" - if not _is_whitelisted_path(path_for_match, SGLANG_WHITELISTED_PATHS): + if not _is_whitelisted_path(path_for_match, ROUTER_WHITELISTED_PATHS): raise ProxyError("Path is not allowed for this service", status.HTTP_403_FORBIDDEN) client = await get_service_replica_client(service, repo, service_conn_pool) diff --git a/src/dstack/_internal/server/services/runs/replicas.py b/src/dstack/_internal/server/services/runs/replicas.py index 53c26ca738..1f414cb685 100644 --- a/src/dstack/_internal/server/services/runs/replicas.py +++ 
b/src/dstack/_internal/server/services/runs/replicas.py @@ -1,14 +1,33 @@ from dataclasses import dataclass -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple -from dstack._internal.core.models.configurations import ReplicaGroup -from dstack._internal.core.models.runs import JobStatus, JobTerminationReason +from dstack._internal.core.models.configurations import ReplicaGroup, ServiceConfiguration +from dstack._internal.core.models.routers import RouterType +from dstack._internal.core.models.runs import JobStatus, JobTerminationReason, RunSpec from dstack._internal.server.models import JobModel, RunModel from dstack._internal.server.services.jobs import ( + get_job_provisioning_data, get_job_spec, group_jobs_by_replica_latest, ) +# ROUTER_NOT_PROVISIONED — router job exists but its internal_ip is not yet +# known. The condition is transient; the caller +# should defer this worker and retry on the next +# pipeline tick (subject to a wait timeout — see +# ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS in +# jobs_running.py). +# +# ROUTER_FAILED — router job has reached a terminal state +# (TERMINATING/TERMINATED/FAILED/ABORTED/DONE). +# The condition is permanent; the caller should +# stop deferring and terminate this worker with a +# clear reason — waiting longer cannot recover the +# run because the router will not come back with a +# fresh internal_ip. +ROUTER_NOT_PROVISIONED: Dict[str, str] = {} +ROUTER_FAILED: Dict[str, str] = {} + @dataclass class GroupRolloutState: @@ -124,3 +143,95 @@ def has_out_of_date_replicas(run: RunModel, group_filter: Optional[str] = None) def is_replica_registered(jobs: list[JobModel]) -> bool: # Only job_num=0 is supposed to receive service requests return jobs[0].registered + + +def get_router_replica_group(run_spec: RunSpec) -> Optional[ReplicaGroup]: + """Return the (single) replica group with a `router:` field, or None. 
+ + `validate_at_most_one_router_replica_group` guarantees at most one such + group exists, so we can safely return on the first match. + """ + cfg = run_spec.configuration + if not isinstance(cfg, ServiceConfiguration): + return None + for g in cfg.replica_groups: + if g.router is not None: + return g + return None + + +def get_router_replica_num(run_spec: RunSpec) -> Optional[int]: + """Return the global replica_num assigned to the router replica group, or + None if the run has no router replica group. Used by _fetch_run_model in + pipeline_tasks/jobs_running.py to load the router replica's job alongside + the worker's own same-replica siblings, so get_router_env_for_job can see the + router's status / internal_ip. + """ + cfg = run_spec.configuration + if not isinstance(cfg, ServiceConfiguration): + return None + global_replica_num = 0 + for group in cfg.replica_groups: + if group.router is not None: + return global_replica_num + assert group.count.min is not None + global_replica_num += group.count.min + return None + + +def find_router_job(run_model: RunModel, router_group_name: str) -> Optional[JobModel]: + for j in run_model.jobs: + if job_belongs_to_group(j, router_group_name): + return j + return None + + +def get_router_env_for_job( + run_model: RunModel, run_spec: RunSpec, job_model: JobModel +) -> Optional[Dict[str, str]]: + """Compute env vars exposing the router replica's address to a worker job. + + Returns one of four values, each communicating a distinct outcome: + + None -> not applicable. Either the run has no Dynamo router + replica group, or this job IS the router + replica. Caller does nothing. + ROUTER_NOT_PROVISIONED -> router job is missing or has no internal_ip yet. + Caller defers this worker and retries later. + ROUTER_FAILED -> router job has reached a terminal state and + can never expose an internal_ip. Caller terminates + this worker; waiting cannot + recover. + {"DSTACK_ROUTER_..."} -> ready-to-merge env dict containing the + router replica's internal IP.
+ """ + router_group = get_router_replica_group(run_spec) + if router_group is None or router_group.name is None: + return None + # DSTACK_ROUTER_INTERNAL_IP is Dynamo-specific. SGLang workers + # are registered via the worker-sync pipeline (ServiceRouterWorkerSyncModel) + if router_group.router is None or router_group.router.type != RouterType.DYNAMO: + return None + if job_belongs_to_group(job_model, router_group.name): + # Router replica itself doesn't need to be told its own IP. + return None + + router_job = find_router_job(run_model, router_group.name) + if router_job is None: + # No router job yet — the run was just submitted and jobs haven't + # been materialized. Treat as "not provisioned" so the caller defers. + return ROUTER_NOT_PROVISIONED + + # If the router has reached a terminal state, the worker cannot recover + # by waiting — the router will not come back with a fresh internal_ip + # under the same job. Surface this as ROUTER_FAILED so the caller can + # stop the wait loop and terminate the worker with a clear reason. + if router_job.status == JobStatus.TERMINATING or router_job.status.is_finished(): + return ROUTER_FAILED + + # Router is alive but may not yet have been assigned a machine. 
+ jpd = get_job_provisioning_data(router_job) + if jpd is None or not jpd.internal_ip: + return ROUTER_NOT_PROVISIONED + + return {"DSTACK_ROUTER_INTERNAL_IP": jpd.internal_ip} diff --git a/src/dstack/_internal/server/services/runs/router_worker_sync.py b/src/dstack/_internal/server/services/runs/router_worker_sync.py index 5087749acf..2fc9add74b 100644 --- a/src/dstack/_internal/server/services/runs/router_worker_sync.py +++ b/src/dstack/_internal/server/services/runs/router_worker_sync.py @@ -19,7 +19,7 @@ from dstack._internal.utils.logging import get_logger from .replicas import job_belongs_to_group -from .service_router_worker_sync import run_spec_has_router_replica_group +from .service_router_worker_sync import run_spec_has_sglang_router_replica_group logger = get_logger(__name__) @@ -93,9 +93,9 @@ class _TargetWorker(TypedDict): bootstrap_port: NotRequired[Optional[int]] -def run_model_has_router_replica_group(run_model: RunModel) -> bool: +def run_model_has_sglang_router_replica_group(run_model: RunModel) -> bool: run_spec = RunSpec.__response__.parse_raw(run_model.run_spec) - return run_spec_has_router_replica_group(run_spec) + return run_spec_has_sglang_router_replica_group(run_spec) def _get_router_job(run_model: RunModel, router_group: ReplicaGroup) -> Optional[JobModel]: diff --git a/src/dstack/_internal/server/services/runs/service_router_worker_sync.py b/src/dstack/_internal/server/services/runs/service_router_worker_sync.py index f23a0d44cd..b251e76f92 100644 --- a/src/dstack/_internal/server/services/runs/service_router_worker_sync.py +++ b/src/dstack/_internal/server/services/runs/service_router_worker_sync.py @@ -9,6 +9,7 @@ import dstack._internal.utils.common as common_utils from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.routers import RouterType from dstack._internal.core.models.runs import RunSpec from dstack._internal.server.models import RunModel, ServiceRouterWorkerSyncModel 
@@ -31,13 +32,15 @@ def _reactivate_sync_row_update_map(*, now: datetime) -> _SyncRowUpdateMap: } -def run_spec_has_router_replica_group(run_spec: RunSpec) -> bool: +def run_spec_has_sglang_router_replica_group(run_spec: RunSpec) -> bool: if run_spec.configuration.type != "service": return False cfg = run_spec.configuration if not isinstance(cfg, ServiceConfiguration): return False - return any(g.router is not None for g in cfg.replica_groups) + return any( + g.router is not None and g.router.type == RouterType.SGLANG for g in cfg.replica_groups + ) async def ensure_service_router_worker_sync_row( @@ -45,7 +48,7 @@ async def ensure_service_router_worker_sync_row( run_model: RunModel, run_spec: RunSpec, ) -> None: - if not run_spec_has_router_replica_group(run_spec): + if not run_spec_has_sglang_router_replica_group(run_spec): return res = await session.execute( select(ServiceRouterWorkerSyncModel).where( diff --git a/src/dstack/_internal/server/services/runs/spec.py b/src/dstack/_internal/server/services/runs/spec.py index 403437964b..86ed1113d3 100644 --- a/src/dstack/_internal/server/services/runs/spec.py +++ b/src/dstack/_internal/server/services/runs/spec.py @@ -5,6 +5,7 @@ ServiceConfiguration, ) from dstack._internal.core.models.repos.virtual import DEFAULT_VIRTUAL_REPO_ID, VirtualRunRepoData +from dstack._internal.core.models.routers import RouterType from dstack._internal.core.models.runs import LEGACY_REPO_DIR, AnyRunConfiguration, RunSpec from dstack._internal.core.models.volumes import InstanceMountPoint from dstack._internal.core.services import validate_dstack_resource_name @@ -214,6 +215,19 @@ def _check_can_update_configuration( "Cannot update router replica groups in-place (adding/removing `router` or changing " "which replica group is the router is not supported). Stop the run and apply again." ) + # Dynamo: any change to the router replica group requires stop+re-apply. 
+ # The router's replica_num and address must remain stable so workers' + # cached DSTACK_ROUTER_INTERNAL_IP stays valid for the life of the run. + if ( + current_router_group is not None + and current_router_group.router is not None + and current_router_group.router.type == RouterType.DYNAMO + and current_router_group != new_router_group + ): + raise ServerClientError( + "Cannot update a Dynamo router replica group in place. " + "Stop the run with `dstack stop` and re-apply." + ) updatable_fields = _CONF_UPDATABLE_FIELDS + _TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS.get( new.type, [] )