From d0257ec89c12980356b2eb567e1ddf07ec3a8dc1 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 8 May 2026 14:18:49 +0500 Subject: [PATCH 1/4] Support multi-EFA instances with public IPs --- mkdocs/docs/concepts/backends.md | 14 ++ mkdocs/docs/examples/clusters/aws.md | 6 - .../_internal/core/backends/aws/compute.py | 151 +++++++++++++++++- .../_internal/core/backends/aws/resources.py | 22 ++- 4 files changed, 172 insertions(+), 21 deletions(-) diff --git a/mkdocs/docs/concepts/backends.md b/mkdocs/docs/concepts/backends.md index 2f6186c6be..7f1964b2ad 100644 --- a/mkdocs/docs/concepts/backends.md +++ b/mkdocs/docs/concepts/backends.md @@ -154,6 +154,20 @@ There are two ways to configure AWS: using an access key or using the default cr The `iam:*` permissions are only needed if you specify `iam_instance_profile` to assign to EC2 instances. + The following additional permissions are required when running [multi-EFA instance types](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html#network-cards) with `public_ips: true`: + + ``` + { + "Effect": "Allow", + "Action": [ + "ec2:AllocateAddress", + "ec2:AssociateAddress", + "ec2:ReleaseAddress" + ], + "Resource": "*" + } + ``` + You can also limit permissions to specific resources in your account: ``` diff --git a/mkdocs/docs/examples/clusters/aws.md b/mkdocs/docs/examples/clusters/aws.md index 54e1cd667d..b35e59d6c7 100644 --- a/mkdocs/docs/examples/clusters/aws.md +++ b/mkdocs/docs/examples/clusters/aws.md @@ -25,16 +25,10 @@ projects: creds: type: default regions: ["us-west-2"] - - public_ips: false - vpc_name: my-custom-vpc ``` -!!! info "Multiple network interfaces" - To use P4, P5, or P6 instances, set `public_ips` to `false` — this allows AWS to attach multiple network interfaces for EFA. In this case, the `dstack` server can reach your VPC’s private subnets. - !!! 
info "VPC" If you use a custom VPC, verify that it permits all internal traffic between nodes for EFA to function properly diff --git a/src/dstack/_internal/core/backends/aws/compute.py b/src/dstack/_internal/core/backends/aws/compute.py index a5ec987226..b65ea88c2c 100644 --- a/src/dstack/_internal/core/backends/aws/compute.py +++ b/src/dstack/_internal/core/backends/aws/compute.py @@ -98,6 +98,13 @@ class AWSVolumeBackendData(CoreModel): iops: int +class AWSInstanceBackendData(CoreModel): + eip_allocation_id: Optional[str] = None + """Elastic IP allocated for multi-ENI instances launched with `public_ips: true`. + See `_allocate_and_associate_eip` for context. + """ + + def _ec2client_cache_methodkey(self, ec2_client, *args, **kwargs): return hashkey(*args, **kwargs) @@ -227,6 +234,13 @@ def terminate_instance( logger.debug("Skipping instance %s termination. Instance not found.", instance_id) else: raise e + # AWS auto-disassociates EIP on instance termination, so we only need to release. + instance_backend_data = _parse_instance_backend_data(backend_data) + if instance_backend_data.eip_allocation_id is not None: + _release_eip( + ec2_client=ec2_client, + allocation_id=instance_backend_data.eip_allocation_id, + ) def create_instance( self, @@ -395,6 +409,7 @@ def update_provisioning_data( project_ssh_private_key: str, ): ec2_resource = self.session.resource("ec2", region_name=provisioning_data.region) + ec2_client = self.session.client("ec2", region_name=provisioning_data.region) instance = ec2_resource.Instance(provisioning_data.instance_id) # pyright: ignore[reportAttributeAccessIssue] try: instance.load() @@ -422,8 +437,23 @@ def update_provisioning_data( f"Failed to get instance IP address. Unknown instance state {state}." 
) - hostname = _get_instance_ip(instance, self.config.allocate_public_ips) - provisioning_data.hostname = hostname + if self.config.allocate_public_ips and instance.public_ip_address is None: + # AWS can't auto-assign a public IPv4 to multi-ENI instances (multi-EFA instances). + # When `public_ips: true` and no public IP is present after launch, attach an Elastic IP to the primary ENI. + public_ip, allocation_id = _allocate_and_associate_eip( + ec2_client=ec2_client, + instance=instance, + project_name=_get_project_name_from_instance_tags(instance), + backend_tags=self.config.tags, + ) + provisioning_data.backend_data = AWSInstanceBackendData( + eip_allocation_id=allocation_id + ).json() + provisioning_data.hostname = public_ip + else: + provisioning_data.hostname = _get_instance_ip( + instance, self.config.allocate_public_ips + ) provisioning_data.internal_ip = instance.private_ip_address provisioning_data.ssh_port = 22 @@ -1263,3 +1293,120 @@ def _get_instance_ip(instance: Any, public_ip: bool) -> str: def _get_volume_price(size: int, iops: int) -> float: # https://aws.amazon.com/ebs/pricing/ return size * 0.08 + (iops - 3000) * 0.005 + + +def _parse_instance_backend_data(backend_data: Optional[str]) -> "AWSInstanceBackendData": + if backend_data is None: + return AWSInstanceBackendData() + try: + return AWSInstanceBackendData.parse_raw(backend_data) + except ValidationError: + logger.exception("Failed to parse AWS instance backend_data; treating as empty") + return AWSInstanceBackendData() + + +def _get_project_name_from_instance_tags(instance: Any) -> Optional[str]: + for tag in instance.tags or []: + if tag.get("Key") == "dstack_project": + return tag.get("Value") + return None + + +def _allocate_and_associate_eip( + ec2_client: botocore.client.BaseClient, + instance: Any, + project_name: Optional[str], + backend_tags: Optional[Dict[str, str]], +) -> Tuple[str, str]: + """ + Allocates an Elastic IP and associates it with the primary ENI of `instance`. 
+ Returns `(public_ip, allocation_id)`. + """ + primary_nic_id = _get_primary_network_interface_id(instance) + tags = { + "owner": "dstack", + "dstack_instance": instance.instance_id, + } + if project_name is not None: + tags["dstack_project"] = project_name + if backend_tags: + for k, v in backend_tags.items(): + tags.setdefault(k, v) + tags = aws_resources.filter_invalid_tags(tags) + + try: + allocate_response = ec2_client.allocate_address( + Domain="vpc", + TagSpecifications=[ + { + "ResourceType": "elastic-ip", + "Tags": aws_resources.make_tags(tags), + } + ], + ) + except botocore.exceptions.ClientError as e: + code = e.response.get("Error", {}).get("Code", "") + region = ec2_client.meta.region_name + if code == "AddressLimitExceeded": + raise ProvisioningError( + f"Elastic IP quota exceeded in {region}. " + "Raise the EC2 'EC2-VPC Elastic IPs' quota in Service Quotas, " + "or reduce concurrent multi-EFA instances." + ) + raise ProvisioningError(f"Failed to allocate Elastic IP in {region}: {e}") + + allocation_id = allocate_response["AllocationId"] + public_ip = allocate_response["PublicIp"] + try: + ec2_client.associate_address( + AllocationId=allocation_id, + NetworkInterfaceId=primary_nic_id, + AllowReassociation=False, + ) + except botocore.exceptions.ClientError as e: + # Best-effort release; on failure the EIP leaks until manually released. 
+ logger.warning( + "Failed to associate EIP %s to instance %s; releasing.", + allocation_id, + instance.instance_id, + ) + try: + ec2_client.release_address(AllocationId=allocation_id) + except botocore.exceptions.ClientError: + logger.exception( + "Failed to release just-allocated EIP %s; release it manually.", + allocation_id, + ) + raise ProvisioningError( + f"Failed to associate Elastic IP {allocation_id} to instance " + f"{instance.instance_id}: {e}" + ) + return public_ip, allocation_id + + +def _get_primary_network_interface_id(instance: Any) -> str: + for nic in instance.network_interfaces_attribute or []: + attachment = nic.get("Attachment") or {} + if attachment.get("DeviceIndex") == 0: + return nic["NetworkInterfaceId"] + raise ProvisioningError( + f"Instance {instance.instance_id} has no primary network interface (DeviceIndex=0)" + ) + + +def _release_eip(ec2_client: botocore.client.BaseClient, allocation_id: str) -> None: + """ + Releases an Elastic IP by allocation ID. Tolerates "not found" (already released). + AWS auto-disassociates EIPs once the instance reaches the `terminated` state, but + `TerminateInstances` only initiates shutdown — `ReleaseAddress` may briefly fail + with `InvalidIPAddress.InUse`. Surface that error so the termination pipeline + retries. + """ + try: + ec2_client.release_address(AllocationId=allocation_id) + except botocore.exceptions.ClientError as e: + code = e.response.get("Error", {}).get("Code", "") + if code in ("InvalidAllocationID.NotFound", "InvalidAddress.NotFound"): + logger.debug("Skipping EIP %s release. 
Already released.", allocation_id) + return + raise diff --git a/src/dstack/_internal/core/backends/aws/resources.py b/src/dstack/_internal/core/backends/aws/resources.py index 17e6cf08de..7b53606a26 100644 --- a/src/dstack/_internal/core/backends/aws/resources.py +++ b/src/dstack/_internal/core/backends/aws/resources.py @@ -196,19 +196,15 @@ def create_instances_struct( # AWS allows specifying either NetworkInterfaces for specific subnet_id # or instance-level SecurityGroupIds in case of no specific subnet_id, not both. if subnet_id is not None: - # If the instance type supports multiple cards, we request multiple interfaces only if not allocate_public_ip - # due to the limitation: "AssociatePublicIpAddress [...] You cannot specify more than one - # network interface in the request". - # Error message: "(InvalidParameterCombination) when calling the RunInstances operation: - # The associatePublicIPAddress parameter cannot be specified when launching with - # multiple network interfaces". - # See: https://stackoverflow.com/questions/49882121 - # If we need more than one card, we should either use Elastic IP (AWS-recommended way) or - # create the instance with one interface and add the rest later (the latter is not tested - # and may or may not work). + # AWS does not auto-assign a public IPv4 to instances launched with multiple network + # interfaces ("AssociatePublicIpAddress [...] You cannot specify more than one network + # interface in the request"). For multi-EFA instance types (e.g. p4d, p5, trn1), we + # therefore launch all EFA NICs without `AssociatePublicIpAddress` and, when + # `public_ips: true`, attach an Elastic IP after launch in `update_provisioning_data`. 
+ multi_eni = max_efa_interfaces > 1 struct["NetworkInterfaces"] = [ { - "AssociatePublicIpAddress": allocate_public_ip, + "AssociatePublicIpAddress": allocate_public_ip and not multi_eni, "DeviceIndex": 0, "SubnetId": subnet_id, "Groups": [security_group_id], @@ -216,7 +212,7 @@ def create_instances_struct( }, ] - if max_efa_interfaces > 1 and allocate_public_ip is False: + if multi_eni: for i in range(1, max_efa_interfaces): # Set to efa-only to use interfaces exclusively for GPU-to-GPU communication interface_type = "efa-only" @@ -226,7 +222,7 @@ def create_instances_struct( interface_type = "efa" if i % 4 == 0 else "efa-only" struct["NetworkInterfaces"].append( { - "AssociatePublicIpAddress": allocate_public_ip, + "AssociatePublicIpAddress": False, "NetworkCardIndex": i, "DeviceIndex": 1, "SubnetId": subnet_id, From 3a3d1620427c185ea81aea9c73d38f02b03eb4cd Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 8 May 2026 14:46:05 +0500 Subject: [PATCH 2/4] Fix comments --- src/dstack/_internal/core/backends/aws/compute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dstack/_internal/core/backends/aws/compute.py b/src/dstack/_internal/core/backends/aws/compute.py index b65ea88c2c..dc850abe4a 100644 --- a/src/dstack/_internal/core/backends/aws/compute.py +++ b/src/dstack/_internal/core/backends/aws/compute.py @@ -101,7 +101,6 @@ class AWSVolumeBackendData(CoreModel): class AWSInstanceBackendData(CoreModel): eip_allocation_id: Optional[str] = None """Elastic IP allocated for multi-ENI instances launched with `public_ips: true`. - See `_allocate_and_associate_eip` for context. """ @@ -440,6 +439,7 @@ def update_provisioning_data( if self.config.allocate_public_ips and instance.public_ip_address is None: # AWS can't auto-assign a public IPv4 to multi-ENI instances (multi-EFA instances). # When `public_ips: true` and no public IP is present after launch, attach an Elastic IP to the primary ENI. 
+ # The check relies on a running instance already having its public IP assigned, if it is ever going to get one. public_ip, allocation_id = _allocate_and_associate_eip( ec2_client=ec2_client, instance=instance, From ab9c04e1679c3d0385527e662cb73fbbbb641425 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 8 May 2026 14:51:13 +0500 Subject: [PATCH 3/4] Fix docs --- mkdocs/docs/concepts/fleets.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mkdocs/docs/concepts/fleets.md b/mkdocs/docs/concepts/fleets.md index 22057c0709..d39899bd4a 100644 --- a/mkdocs/docs/concepts/fleets.md +++ b/mkdocs/docs/concepts/fleets.md @@ -162,10 +162,6 @@ This property ensures that instances are interconnected. This is required for ru Fast interconnect is supported on the `aws`, `gcp`, `nebius`, `crusoe`, and `kubernetes` backends. Some backends may require additional configuration. - === "AWS" - On AWS, `dstack` requires `public_ips` to be set to `false` in the backend configuration. - Refer to the [AWS](../examples/clusters/aws.md) example for more details. - === "GCP" On GCP, you may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration. Refer to the [GCP](../examples/clusters/gcp.md) examples for more details.
From 831aa4623577dd9f5dcd2d2588753310a0b2b5b0 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 8 May 2026 15:49:44 +0500 Subject: [PATCH 4/4] Explicitly disassociate EIP --- mkdocs/docs/concepts/backends.md | 2 ++ .../_internal/core/backends/aws/compute.py | 32 +++++++++++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/mkdocs/docs/concepts/backends.md b/mkdocs/docs/concepts/backends.md index 7f1964b2ad..4879f5f184 100644 --- a/mkdocs/docs/concepts/backends.md +++ b/mkdocs/docs/concepts/backends.md @@ -162,6 +162,8 @@ There are two ways to configure AWS: using an access key or using the default cr "Action": [ "ec2:AllocateAddress", "ec2:AssociateAddress", + "ec2:DescribeAddresses", + "ec2:DisassociateAddress", "ec2:ReleaseAddress" ], "Resource": "*" diff --git a/src/dstack/_internal/core/backends/aws/compute.py b/src/dstack/_internal/core/backends/aws/compute.py index dc850abe4a..037ec1c160 100644 --- a/src/dstack/_internal/core/backends/aws/compute.py +++ b/src/dstack/_internal/core/backends/aws/compute.py @@ -233,7 +233,6 @@ def terminate_instance( logger.debug("Skipping instance %s termination. Instance not found.", instance_id) else: raise e - # AWS auto-disassociates EIP on instance termination, so we only need to release. instance_backend_data = _parse_instance_backend_data(backend_data) if instance_backend_data.eip_allocation_id is not None: _release_eip( @@ -1396,17 +1395,36 @@ def _get_primary_network_interface_id(instance: Any) -> str: def _release_eip(ec2_client: botocore.client.BaseClient, allocation_id: str) -> None: """ - Releases an Elastic IP by allocation ID. Tolerates "not found" (already released). - AWS auto-disassociates EIPs once the instance reaches the `terminated` state, but - `TerminateInstances` only initiates shutdown — `ReleaseAddress` may briefly fail - with `InvalidIPAddress.InUse`. Surface that error so the termination pipeline - retries. + Releases an Elastic IP by allocation ID. 
Disassociates first if the EIP is still + bound to an instance — `TerminateInstances` only initiates shutdown, and AWS + auto-disassociates only once the instance reaches `terminated`. Disassociating + explicitly avoids the `InvalidIPAddress.InUse` race and the retry loop. """ try: - ec2_client.release_address(AllocationId=allocation_id) + response = ec2_client.describe_addresses(AllocationIds=[allocation_id]) except botocore.exceptions.ClientError as e: code = e.response.get("Error", {}).get("Code", "") if code in ("InvalidAllocationID.NotFound", "InvalidAddress.NotFound"): logger.debug("Skipping EIP %s release. Already released.", allocation_id) return raise + addresses = response.get("Addresses", []) + if not addresses: + return + association_id = addresses[0].get("AssociationId") + if association_id is not None: + try: + ec2_client.disassociate_address(AssociationId=association_id) + except botocore.exceptions.ClientError as e: + code = e.response.get("Error", {}).get("Code", "") + # AWS may have auto-disassociated between our Describe and Disassociate + # if the instance just reached `terminated`. Tolerated. + if code != "InvalidAssociationID.NotFound": + raise + try: + ec2_client.release_address(AllocationId=allocation_id) + except botocore.exceptions.ClientError as e: + code = e.response.get("Error", {}).get("Code", "") + if code in ("InvalidAllocationID.NotFound", "InvalidAddress.NotFound"): + return + raise