Skip to content

Commit de889f9

Browse files
authored
Optimize GCP offers (#3793)
1 parent 7bc818e commit de889f9

2 files changed

Lines changed: 74 additions & 49 deletions

File tree

src/dstack/_internal/core/backends/base/offers.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,14 @@ def get_catalog_offers(
3838
configurable_disk_size: Range[Memory] = Range[Memory](min=Memory.parse("1GB"), max=None),
3939
extra_filter: Optional[Callable[[InstanceOffer], bool]] = None,
4040
catalog: Optional[gpuhunt.Catalog] = None,
41+
catalog_item_filter: Optional[Callable[[gpuhunt.CatalogItem], bool]] = None,
4142
) -> List[InstanceOffer]:
43+
"""
44+
Args:
45+
catalog_item_filter: applied to raw catalog items before the conversion to
46+
`InstanceOffer` models. Use it for filtering that can be done on raw catalog fields
47+
to avoid expensive model construction for items that will be discarded.
48+
"""
4249
provider = backend.value
4350
if backend == BackendType.DATACRUNCH:
4451
provider = BackendType.VERDA.value # Backward compatibility
@@ -54,6 +61,8 @@ def get_catalog_offers(
5461
for item in catalog.query(**asdict(q)):
5562
if locations is not None and item.location not in locations:
5663
continue
64+
if catalog_item_filter is not None and not catalog_item_filter(item):
65+
continue
5766
offer = catalog_item_to_offer(backend, item, requirements, configurable_disk_size)
5867
if offer is None:
5968
continue

src/dstack/_internal/core/backends/gcp/compute.py

Lines changed: 65 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import google.api_core.exceptions
1010
import google.cloud.compute_v1 as compute_v1
11+
import gpuhunt
1112
from cachetools import TTLCache, cachedmethod
1213
from google.cloud import tpu_v2
1314
from google.cloud.compute_v1.types.compute import Instance
@@ -64,7 +65,6 @@
6465
InstanceConfiguration,
6566
InstanceOffer,
6667
InstanceOfferWithAvailability,
67-
InstanceType,
6868
Resources,
6969
)
7070
from dstack._internal.core.models.placement import PlacementGroup, PlacementGroupProvisioningData
@@ -136,35 +136,37 @@ def __init__(self, config: GCPConfig):
136136

137137
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
    """
    Return one offer per (instance, spot, GPU, region) with quota-based availability.

    Catalog items are filtered and de-duplicated at the raw-item level by the
    filter built in `_make_catalog_item_filter`, which also records every zone
    seen for each dedup key in `zones_by_key`. Each surviving offer is then
    annotated with those zones and with `UNKNOWN` availability if the region has
    GPU quota for it, `NO_QUOTA` otherwise.
    """
    configured_regions = get_or_error(self.config.regions)
    # Filled in as a side effect of the catalog item filter while offers are built.
    zones_by_key: Dict[Tuple, List[str]] = {}
    offers = get_catalog_offers(
        backend=BackendType.GCP,
        catalog_item_filter=_make_catalog_item_filter(configured_regions, zones_by_key),
    )

    # Remaining quota (limit - usage) per region name and quota metric.
    quotas: Dict[str, Dict[str, float]] = defaultdict(dict)
    for gcp_region in self.regions_client.list(project=self.config.project_id):
        for quota in gcp_region.quotas:
            quotas[gcp_region.name][quota.metric] = quota.limit - quota.usage

    offers_with_availability = []
    for offer in offers:
        region = offer.region[:-2]  # strip zone
        resources = offer.instance.resources
        gpu_name = resources.gpus[0].name if resources.gpus else None
        key = _offer_dedup_key(offer.instance.name, resources.spot, gpu_name, region)
        if _has_gpu_quota(quotas[region], resources):
            availability = InstanceAvailability.UNKNOWN
        else:
            availability = InstanceAvailability.NO_QUOTA
        # todo quotas: cpu, memory, global gpu, tpu
        offer_with_availability = InstanceOfferWithAvailability(
            **offer.dict(),
            availability=availability,
            availability_zones=zones_by_key.get(key, []),
        )
        # The catalog reports a zone in `region`; expose the region itself.
        offer_with_availability.region = region
        offers_with_availability.append(offer_with_availability)
    return offers_with_availability
169171

170172
def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
@@ -992,37 +994,62 @@ def _find_reservation(self, configured_name: str) -> dict[str, compute_v1.Reserv
992994
)
993995

994996

995-
def _supported_instances_and_zones(
997+
def _is_supported_gcp_instance(instance_name: str, gpu_name: Optional[str]) -> bool:
    """
    Check if the instance is supported by dstack.

    Args:
        instance_name: GCP instance (machine) type name, e.g. one starting with "a2-".
        gpu_name: name of the attached GPU, or `None` if the instance has no GPU.

    Returns:
        `False` for TPUs that are not single-host. Otherwise `True` if the name
        starts with one of the supported family prefixes, or if the instance has
        a GPU other than K80 or P4; `False` in all remaining cases.
    """
    # Multi-host TPUs are rejected before any family/GPU check.
    if _is_tpu(instance_name) and not _is_single_host_tpu(instance_name):
        return False
    supported_prefixes = (
        "m4-",
        "c4-",
        "n4-",
        "h3-",
        "n2-",
        "e2-medium",
        "e2-standard-",
        "e2-highmem-",
        "e2-highcpu-",
        "m1-",
        "a2-",
        "a3-",
        "g2-",
    )
    # `str.startswith` accepts a tuple, so one call replaces the per-family loop.
    if instance_name.startswith(supported_prefixes):
        return True
    # Any other instance is supported only if it carries a non-excluded GPU.
    return gpu_name is not None and gpu_name not in {"K80", "P4"}
1021+
1022+
1023+
def _offer_dedup_key(
1024+
instance_name: str, spot: bool, gpu_name: Optional[str], region: str
1025+
) -> Tuple[str, bool, Optional[str], str]:
1026+
"""Key for deduplicating GCP per-zone items into per-region offers."""
1027+
return (instance_name, spot, gpu_name, region)
1028+
1029+
1030+
def _make_catalog_item_filter(
    regions: List[str],
    zones_by_key: Dict[Tuple, List[str]],
) -> Callable[[gpuhunt.CatalogItem], bool]:
    """
    Build a raw catalog item filter for GCP offers.

    The returned filter accepts an item only if its region is in `regions`, the
    instance is supported by dstack, and no earlier item with the same dedup key
    (instance name, spot, GPU name, region) has been accepted. This collapses
    per-zone items into per-region offers.

    Args:
        regions: region names to keep.
        zones_by_key: mutated by the filter. For every item that passes the
            region and support checks (including duplicates that are then
            rejected), the item's zone is appended to the list stored under its
            dedup key, so the caller can attach the zones to offers later.

    Returns:
        A stateful predicate over catalog items. It remembers which keys it has
        already accepted, so a filter instance should be used for a single pass
        over the catalog.
    """
    # Built once so the per-item region check is a hash lookup, not a list scan.
    allowed_regions = frozenset(regions)
    seen: set = set()

    def _filter(item: gpuhunt.CatalogItem) -> bool:
        region = item.location[:-2]  # strip zone
        if region not in allowed_regions:
            return False
        if not _is_supported_gcp_instance(item.instance_name, item.gpu_name):
            return False
        key = _offer_dedup_key(item.instance_name, item.spot, item.gpu_name, region)
        # Record the zone even for duplicates: only the first item per key is
        # kept, but all of its zones must be reported.
        zones_by_key.setdefault(key, []).append(item.location)
        if key in seen:
            return False
        seen.add(key)
        return True

    return _filter
10281055

@@ -1090,17 +1117,6 @@ def _reservation_has_capacity(reservation: compute_v1.Reservation) -> bool:
10901117
)
10911118

10921119

1093-
def _unique_instance_name(instance: InstanceType) -> str:
1094-
if instance.resources.spot:
1095-
name = f"{instance.name}-spot"
1096-
else:
1097-
name = instance.name
1098-
if not instance.resources.gpus:
1099-
return name
1100-
gpu = instance.resources.gpus[0]
1101-
return f"{name}-{gpu.name}-{gpu.memory_mib}"
1102-
1103-
11041120
@dataclass
11051121
class GCPImage:
11061122
id: str

0 commit comments

Comments
 (0)