Skip to content

[RunPod] Use zone to provision in a specific data center ID #5166

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions sky/clouds/runpod.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,18 @@ def regions_with_offering(cls, instance_type: str,
accelerators: Optional[Dict[str, int]],
use_spot: bool, region: Optional[str],
zone: Optional[str]) -> List[clouds.Region]:
assert zone is None, 'RunPod does not support zones.'
del accelerators, zone # unused
del accelerators # unused
regions = service_catalog.get_region_zones_for_instance_type(
instance_type, use_spot, 'runpod')

if region is not None:
regions = [r for r in regions if r.name == region]

if zone is not None:
for r in regions:
assert r.zones is not None, r
r.set_zones([z for z in r.zones if z.name == zone])
regions = [r for r in regions if r.zones]
return regions

@classmethod
Expand All @@ -93,15 +98,15 @@ def zones_provision_loop(
instance_type: str,
accelerators: Optional[Dict[str, int]] = None,
use_spot: bool = False,
) -> Iterator[None]:
) -> Iterator[Optional[List['clouds.Zone']]]:
del num_nodes # unused
regions = cls.regions_with_offering(instance_type,
accelerators,
use_spot,
region=region,
zone=None)
for r in regions:
assert r.zones is None, r
assert r
yield r.zones

def instance_type_to_hourly_cost(self,
Expand Down Expand Up @@ -158,7 +163,10 @@ def make_deploy_resources_variables(
zones: Optional[List['clouds.Zone']],
num_nodes: int,
dryrun: bool = False) -> Dict[str, Optional[str]]:
del zones, dryrun, cluster_name # unused
del dryrun, cluster_name # unused
assert zones is not None, (region, zones)

zone_name = zones[0].name

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
Expand Down Expand Up @@ -187,6 +195,7 @@ def make_deploy_resources_variables(
'instance_type': instance_type,
'custom_resources': custom_resources,
'region': region.name,
'availability_zone': zone_name,
'image_id': image_id,
'use_spot': use_spot,
'bid_per_gpu': str(hourly_cost),
Expand Down
9 changes: 0 additions & 9 deletions sky/clouds/service_catalog/runpod_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@ def instance_type_exists(instance_type: str) -> bool:
def validate_region_zone(
region: Optional[str],
zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
if zone is not None:
with ux_utils.print_exception_no_traceback():
raise ValueError('RunPod does not support zones.')
return common.validate_region_zone_impl('runpod', _df, region, zone)


Expand All @@ -34,9 +31,6 @@ def get_hourly_cost(instance_type: str,
region: Optional[str] = None,
zone: Optional[str] = None) -> float:
"""Returns the cost, or the cheapest cost among all zones for spot."""
if zone is not None:
with ux_utils.print_exception_no_traceback():
raise ValueError('RunPod does not support zones.')
return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
zone)

Expand Down Expand Up @@ -69,9 +63,6 @@ def get_instance_type_for_accelerator(
region: Optional[str] = None,
zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
"""Returns a list of instance types that have the given accelerator."""
if zone is not None:
with ux_utils.print_exception_no_traceback():
raise ValueError('RunPod does not support zones.')
return common.get_instance_type_for_accelerator_impl(df=_df,
acc_name=acc_name,
acc_count=acc_count,
Expand Down
31 changes: 17 additions & 14 deletions sky/provision/runpod/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,14 @@ def run_instances(region: str, cluster_name_on_cloud: str,
f'Cluster {cluster_name_on_cloud} has no head node.')
logger.info(f'Cluster {cluster_name_on_cloud} already has '
f'{len(exist_instances)} nodes, no need to start more.')
return common.ProvisionRecord(provider_name='runpod',
cluster_name=cluster_name_on_cloud,
region=region,
zone=None,
head_instance_id=head_instance_id,
resumed_instance_ids=[],
created_instance_ids=[])
return common.ProvisionRecord(
provider_name='runpod',
cluster_name=cluster_name_on_cloud,
region=region,
zone=config.provider_config['availability_zone'],
head_instance_id=head_instance_id,
resumed_instance_ids=[],
created_instance_ids=[])

created_instance_ids = []
for _ in range(to_start_count):
Expand All @@ -87,6 +88,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
node_type=node_type,
instance_type=config.node_config['InstanceType'],
region=region,
zone=config.provider_config['availability_zone'],
disk_size=config.node_config['DiskSize'],
image_name=config.node_config['ImageId'],
ports=config.ports_to_open_on_launch,
Expand Down Expand Up @@ -118,13 +120,14 @@ def run_instances(region: str, cluster_name_on_cloud: str,

time.sleep(POLL_INTERVAL)
assert head_instance_id is not None, 'head_instance_id should not be None'
return common.ProvisionRecord(provider_name='runpod',
cluster_name=cluster_name_on_cloud,
region=region,
zone=None,
head_instance_id=head_instance_id,
resumed_instance_ids=[],
created_instance_ids=created_instance_ids)
return common.ProvisionRecord(
provider_name='runpod',
cluster_name=cluster_name_on_cloud,
region=region,
zone=config.provider_config['availability_zone'],
head_instance_id=head_instance_id,
resumed_instance_ids=[],
created_instance_ids=created_instance_ids)


def wait_instances(region: str, cluster_name_on_cloud: str,
Expand Down
6 changes: 4 additions & 2 deletions sky/provision/runpod/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,9 @@ def _create_template_for_docker_login(


def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
disk_size: int, image_name: str, ports: Optional[List[int]],
public_key: str, preemptible: Optional[bool], bid_per_gpu: float,
zone: str, disk_size: int, image_name: str,
ports: Optional[List[int]], public_key: str,
preemptible: Optional[bool], bid_per_gpu: float,
docker_login_config: Optional[Dict[str, str]]) -> str:
"""Launches an instance with the given parameters.

Expand Down Expand Up @@ -332,6 +333,7 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
'gpu_count': gpu_quantity,
'country_code': region,
'data_center_id': zone,
'ports': ports_str,
'support_public_ip': True,
'docker_args': docker_args,
Expand Down
1 change: 1 addition & 0 deletions sky/templates/runpod-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ provider:
type: external
module: sky.provision.runpod
region: "{{region}}"
availability_zone: "{{availability_zone}}"
disable_launch_config_check: true
# For RunPod, we directly set the image id for the docker as runtime environment
# support, thus we need to avoid the DockerInitializer detects the docker field
Expand Down
Loading