
Commit 0e2bdbf

Fix issue #390: support different machine types on GCP (#451)
* Adding support for different machine types on GCP
* Making GPUs present only on worker instances
* Fixing the GPU instance on scheduler
* Cleanup
* Adjusting to feedback:
  - added info on GPU logic in docs
  - adjusted scheduler GPU logic
  - fixed the machine type checker
* Adjusting to feedback:
  - Maintain existing GPU logic
  - Add ability to specify different GPU configurations for workers and scheduler
* Adjusting to feedback:
  - Updating the documentation to get rid of old GPUs
1 parent 9f07eab commit 0e2bdbf

File tree: 2 files changed (+93 −15 lines)

dask_cloudprovider/cloudprovider.yaml

Lines changed: 9 additions & 3 deletions
```diff
@@ -101,10 +101,16 @@ cloudprovider:
     network_projectid: null # GCP project id where the network exists
     projectid: "" # name of the google cloud project
     on_host_maintenance: "TERMINATE"
-    machine_type: "n1-standard-1" # size of the machine type to use
+    machine_type: "n1-standard-1" # size of the machine type to use for the scheduler and all workers
+    scheduler_machine_type: "n1-standard-1" # size of the machine type to use for the scheduler
+    worker_machine_type: "n1-standard-1" # size of the machine type to use for all workers
     filesystem_size: 50 # amount in GBs of hard drive space to allocate
-    ngpus: "" # number of GPUs to use
-    gpu_type: "" # type of gpus to use: nvidia-tesla-k80, nvidia-tesla-p100, nvidia-tesla-t4
+    ngpus: "" # number of GPUs to use. If provided, will be used for both scheduler and worker
+    gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``. If provided, will be used for both scheduler and worker
+    scheduler_ngpus: "" # number of GPUs to use on scheduler
+    scheduler_gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``.
+    worker_ngpus: "" # number of GPUs to use on worker
+    worker_gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``.
     disk_type: "pd-standard" # type of disk to use: pd-standard, pd-ssd
     docker_image: "daskdev/dask:latest" # docker image to use
     auto_shutdown: true # Shutdown instances automatically if the scheduler or worker services time out.
```
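
For reference, a minimal sketch of how a user might override these new keys programmatically instead of editing the YAML file. The ``cloudprovider.gcp`` namespace and the specific values are assumptions for illustration, not something confirmed by this diff:

```python
import dask.config

# Hypothetical override of the new per-role settings; the
# "cloudprovider.gcp" prefix is assumed -- adjust to match your config tree.
dask.config.set(
    {
        "cloudprovider.gcp.scheduler_machine_type": "n1-standard-2",
        "cloudprovider.gcp.worker_machine_type": "n1-highmem-8",
        "cloudprovider.gcp.worker_ngpus": 1,
        "cloudprovider.gcp.worker_gpu_type": "nvidia-tesla-t4",
    }
)
```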

dask_cloudprovider/gcp/instances.py

Lines changed: 84 additions & 12 deletions
```diff
@@ -302,7 +302,8 @@ async def start_scheduler(self):
             f"\n Machine Type: {self.machine_type} "
             f"\n Filesystem Size: {self.filesystem_size} "
             f"\n Disk Type: {self.disk_type} "
-            f"\n N-GPU Type: {self.ngpus} {self.gpu_type}"
+            f"\n Scheduler GPU Count: {self.ngpus}"
+            f"\n Scheduler GPU Type: {self.gpu_type}"
             f"\n Zone: {self.zone} "
         )
         self.cluster._log("Creating scheduler instance")
@@ -375,6 +376,8 @@ async def start(self):
 
     async def start_worker(self):
         self.cluster._log("Creating worker instance")
+        self.cluster._log(f"Worker GPU Count: {self.ngpus}")
+        self.cluster._log(f"Worker GPU Type: {self.gpu_type}")
         self.internal_ip, self.external_ip = await self.create_vm()
         if self.config.get("public_ingress", True):
             # scheduler is publicly available
```
```diff
@@ -407,7 +410,7 @@ class GCPCluster(VMCluster):
         The GCP zone to launch you cluster in. A full list can be obtained with ``gcloud compute zones list``.
     network: str
         The GCP VPC network/subnetwork to use. The default is `default`. If using firewall rules,
-        please ensure the follwing accesses are configured:
+        please ensure the following accesses are configured:
         - egress 0.0.0.0/0 on all ports for downloading docker images and general data access
         - ingress 10.0.0.0/8 on all ports for internal communication of workers
         - ingress 0.0.0.0/0 on 8786-8787 for external accessibility of the dashboard/scheduler
@@ -417,9 +420,17 @@ class GCPCluster(VMCluster):
         be cases (i.e. Shared VPC) when network configurations from a different GCP project are used.
     machine_type: str
         The VM machine_type. You can get a full list with ``gcloud compute machine-types list``.
-        The default is ``n1-standard-1`` which is 3.75GB RAM and 1 vCPU
+        The default is ``n1-standard-1`` which is 3.75GB RAM and 1 vCPU.
+        This will determine the resources available to both the scheduler and all workers.
+        If supplied, you may not specify ``scheduler_machine_type`` or ``worker_machine_type``.
+    scheduler_machine_type: str
+        The VM machine_type. This will determine the resources available to the scheduler.
+        The default is ``n1-standard-1`` which is 3.75GB RAM and 1 vCPU.
+    worker_machine_type: str
+        The VM machine_type. This will determine the resources available to all workers.
+        The default is ``n1-standard-1`` which is 3.75GB RAM and 1 vCPU.
     source_image: str
-        The OS image to use for the VM. Dask Cloudprovider will boostrap Ubuntu based images automatically.
+        The OS image to use for the VM. Dask Cloudprovider will bootstrap Ubuntu based images automatically.
         Other images require Docker and for GPUs the NVIDIA Drivers and NVIDIA Docker.
 
         A list of available images can be found with ``gcloud compute images list``
@@ -445,11 +456,25 @@ class GCPCluster(VMCluster):
     extra_bootstrap: list[str] (optional)
         Extra commands to be run during the bootstrap phase.
     ngpus: int (optional)
-        The number of GPUs to atatch to the instance.
-        Default is ``0``.
+        The number of GPUs to attach to both the worker and scheduler instances. If specified,
+        you cannot use ``scheduler_ngpus`` or ``worker_ngpus`` (they must be None). Default is None.
+
+        Note: Due to the way that Dask uses pickle to move things around there are cases where the scheduler might
+        deserialize a meta object which may try and allocate a small amount of GPU memory. It's always recommended
+        to have a matching GPU configuration on the scheduler and workers.
     gpu_type: str (optional)
-        The name of the GPU to use. This must be set if ``ngpus>0``.
+        The name of the GPU to use on both the worker and scheduler. This must be set if ``ngpus>0``.
         You can see a list of GPUs available in each zone with ``gcloud compute accelerator-types list``.
+    scheduler_ngpus: int (optional)
+        The number of GPUs to attach to the scheduler instance. If you specified this,
+        nothing should be specified for the ``ngpus`` and ``gpu_type``. Default is ``0``.
+    scheduler_gpu_type: str (optional)
+        The name of the GPU to use on the scheduler. This must be set if ``scheduler_ngpus>0``.
+    worker_ngpus: int (optional)
+        The number of GPUs to attach to the worker instance. If you specified this,
+        nothing should be specified for the ``ngpus`` and ``gpu_type``. Default is ``0``.
+    worker_gpu_type: str (optional)
+        The name of the GPU to use on the worker. This must be set if ``worker_ngpus>0``.
     filesystem_size: int (optional)
         The VM filesystem size in GB. Defaults to ``50``.
     disk_type: str (optional)
```
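
Taken together, the new parameters suggest a usage pattern along these lines. This is an illustrative sketch only: the project id, zone, machine types and GPU choices are placeholders, not values from the commit:

```python
from dask_cloudprovider.gcp import GCPCluster

# Illustrative only: a modest scheduler VM plus larger, GPU-equipped workers.
# "your-project" and "us-east1-c" are placeholders.
cluster = GCPCluster(
    projectid="your-project",
    zone="us-east1-c",
    scheduler_machine_type="n1-standard-2",   # small scheduler
    worker_machine_type="n1-standard-16",     # bigger workers
    worker_ngpus=1,                           # GPUs on workers only;
    worker_gpu_type="nvidia-tesla-t4",        # see the docstring note on matching scheduler GPUs
    n_workers=2,
)
```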
```diff
@@ -573,11 +598,17 @@ def __init__(
         network=None,
         network_projectid=None,
         machine_type=None,
+        scheduler_machine_type=None,
+        worker_machine_type=None,
         on_host_maintenance=None,
         source_image=None,
         docker_image=None,
         ngpus=None,
         gpu_type=None,
+        scheduler_ngpus=None,
+        scheduler_gpu_type=None,
+        worker_ngpus=None,
+        worker_gpu_type=None,
         filesystem_size=None,
         disk_type=None,
         auto_shutdown=None,
@@ -603,7 +634,43 @@ def __init__(
             bootstrap if bootstrap is not None else self.config.get("bootstrap")
         )
         self.machine_type = machine_type or self.config.get("machine_type")
-        self.gpu_instance = "gpu" in self.machine_type or bool(ngpus)
+        if machine_type is None:
+            self.scheduler_machine_type = scheduler_machine_type or self.config.get("scheduler_machine_type")
+            self.worker_machine_type = worker_machine_type or self.config.get("worker_machine_type")
+            if self.scheduler_machine_type is None or self.worker_machine_type is None:
+                raise ValueError("machine_type and scheduler_machine_type must be set")
+        else:
+            if scheduler_machine_type is not None or worker_machine_type is not None:
+                raise ValueError("If you specify machine_type, you may not specify scheduler_machine_type or worker_machine_type")
+            self.scheduler_machine_type = machine_type
+            self.worker_machine_type = machine_type
+
+        self.ngpus = ngpus or self.config.get("ngpus")
+        if not self.ngpus:
+            self.scheduler_ngpus = scheduler_ngpus if scheduler_ngpus is not None else self.config.get("scheduler_ngpus", 0)
+            self.worker_ngpus = worker_ngpus if worker_ngpus is not None else self.config.get("worker_ngpus", 0)
+            if self.scheduler_ngpus == 0 and self.worker_ngpus == 0:
+                self._log("No GPU instances configured")
+        else:
+            if scheduler_ngpus is not None or worker_ngpus is not None:
+                raise ValueError("If you specify ngpus, you may not specify scheduler_ngpus or worker_ngpus")
+            self.scheduler_ngpus = self.ngpus
+            self.worker_ngpus = self.ngpus
+
+        self.gpu_type = gpu_type or self.config.get("gpu_type")
+        if not self.gpu_type:
+            self.scheduler_gpu_type = scheduler_gpu_type or self.config.get("scheduler_gpu_type")
+            self.worker_gpu_type = worker_gpu_type or self.config.get("worker_gpu_type")
+            if self.scheduler_ngpus > 0 and self.scheduler_gpu_type is None:
+                raise ValueError("scheduler_gpu_type must be specified when scheduler_ngpus > 0")
+            if self.worker_ngpus > 0 and self.worker_gpu_type is None:
+                raise ValueError("worker_gpu_type must be specified when worker_ngpus > 0")
+        else:
+            if scheduler_gpu_type is not None or worker_gpu_type is not None:
+                raise ValueError("If you specify gpu_type, you may not specify scheduler_gpu_type or worker_gpu_type")
+            self.scheduler_gpu_type = self.gpu_type
+            self.worker_gpu_type = self.gpu_type
+
         self.debug = debug
         self.options = {
             "cluster": self,
```
```diff
@@ -616,13 +683,9 @@ def __init__(
             "on_host_maintenance": on_host_maintenance
             or self.config.get("on_host_maintenance"),
             "zone": zone or self.config.get("zone"),
-            "machine_type": self.machine_type,
-            "ngpus": ngpus or self.config.get("ngpus"),
             "network": network or self.config.get("network"),
             "network_projectid": network_projectid
             or self.config.get("network_projectid"),
-            "gpu_type": gpu_type or self.config.get("gpu_type"),
-            "gpu_instance": self.gpu_instance,
             "bootstrap": self.bootstrap,
             "auto_shutdown": self.auto_shutdown,
             "preemptible": (
@@ -634,7 +697,16 @@ def __init__(
             "service_account": service_account or self.config.get("service_account"),
         }
         self.scheduler_options = {**self.options}
+        self.scheduler_options["machine_type"] = self.scheduler_machine_type
+        self.scheduler_options["ngpus"] = self.scheduler_ngpus
+        self.scheduler_options["gpu_type"] = self.scheduler_gpu_type
+        self.scheduler_options["gpu_instance"] = bool(self.scheduler_ngpus)
+
         self.worker_options = {**self.options}
+        self.worker_options["machine_type"] = self.worker_machine_type
+        self.worker_options["ngpus"] = self.worker_ngpus
+        self.worker_options["gpu_type"] = self.worker_gpu_type
+        self.worker_options["gpu_instance"] = bool(self.worker_ngpus)
 
         if "extra_bootstrap" not in kwargs:
             kwargs["extra_bootstrap"] = self.config.get("extra_bootstrap")
```
