Merged
4 changes: 3 additions & 1 deletion dask_cloudprovider/cloudprovider.yaml
@@ -101,7 +101,9 @@ cloudprovider:
    network_projectid: null # GCP project id where the network exists
    projectid: "" # name of the google cloud project
    on_host_maintenance: "TERMINATE"
-    machine_type: "n1-standard-1" # size of the machine type to use
+    machine_type: "n1-standard-1" # size of the machine type to use for the scheduler and all workers
+    scheduler_machine_type: "n1-standard-1" # size of the machine type to use for the scheduler
+    worker_machine_type: "n1-standard-1" # size of the machine type to use for all workers
    filesystem_size: 50 # amount in GBs of hard drive space to allocate
    ngpus: "" # number of GPUs to use
    gpu_type: "" # type of gpus to use: nvidia-tesla-k80, nvidia-tesla-p100, nvidia-tesla-t4
35 changes: 33 additions & 2 deletions dask_cloudprovider/gcp/instances.py
@@ -417,7 +417,15 @@ class GCPCluster(VMCluster):
        be cases (i.e. Shared VPC) when network configurations from a different GCP project are used.
    machine_type: str
        The VM machine_type. You can get a full list with ``gcloud compute machine-types list``.
-        The default is ``n1-standard-1`` which is 3.75GB RAM and 1 vCPU
+        The default is ``n1-standard-1`` which is 3.75GB RAM and 1 vCPU.
+        This will determine the resources available to both the scheduler and all workers.
+        If supplied, you may not specify ``scheduler_machine_type`` or ``worker_machine_type``.
+    scheduler_machine_type: str
+        The VM machine_type. This will determine the resources available to the scheduler.
+        The default is ``n1-standard-1`` which is 3.75GB RAM and 1 vCPU.
+    worker_machine_type: str
+        The VM machine_type. This will determine the resources available to all workers.
+        The default is ``n1-standard-1`` which is 3.75GB RAM and 1 vCPU.
    source_image: str
        The OS image to use for the VM. Dask Cloudprovider will bootstrap Ubuntu based images automatically.
        Other images require Docker and for GPUs the NVIDIA Drivers and NVIDIA Docker.
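As a usage sketch of the parameters documented above (project id, zone, worker count and machine types are placeholders, not values from this PR):

from dask_cloudprovider.gcp import GCPCluster

# Placeholder values throughout; scheduler_machine_type / worker_machine_type are
# the new keyword arguments described in the docstring above.
cluster = GCPCluster(
    projectid="my-gcp-project",
    zone="us-east1-c",
    scheduler_machine_type="n1-standard-2",  # small, CPU-only scheduler
    worker_machine_type="n1-highmem-8",      # larger workers
    n_workers=2,
)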
@@ -573,6 +581,8 @@ def __init__(
        network=None,
        network_projectid=None,
        machine_type=None,
+        scheduler_machine_type=None,
+        worker_machine_type=None,
        on_host_maintenance=None,
        source_image=None,
        docker_image=None,
@@ -603,7 +613,14 @@ def __init__(
            bootstrap if bootstrap is not None else self.config.get("bootstrap")
        )
        self.machine_type = machine_type or self.config.get("machine_type")
-        self.gpu_instance = "gpu" in self.machine_type or bool(ngpus)
+        if machine_type is None:
Collaborator:
@gmiasnychenko it would be great if we could check that machine_type is set XOR scheduler/worker_machine_type; otherwise, we should throw an error. It should be a BC safe check.

+            self.scheduler_machine_type = scheduler_machine_type or self.config.get("scheduler_machine_type")
+            self.worker_machine_type = worker_machine_type or self.config.get("worker_machine_type")
+        else:
+            if scheduler_machine_type is not None or worker_machine_type is not None:
+                raise ValueError("If you specify machine_type, you may not specify scheduler_machine_type or worker_machine_type")
+            self.scheduler_machine_type = machine_type
+            self.worker_machine_type = machine_type
        self.debug = debug
        self.options = {
            "cluster": self,
@@ -617,6 +634,8 @@
            or self.config.get("on_host_maintenance"),
            "zone": zone or self.config.get("zone"),
            "machine_type": self.machine_type,
+            "scheduler_machine_type": self.scheduler_machine_type,
+            "worker_machine_type": self.worker_machine_type,
            "ngpus": ngpus or self.config.get("ngpus"),
            "network": network or self.config.get("network"),
            "network_projectid": network_projectid
@@ -635,6 +654,18 @@
        }
        self.scheduler_options = {**self.options}
        self.worker_options = {**self.options}
+        self.scheduler_options["machine_type"] = self.scheduler_machine_type
+        self.worker_options["machine_type"] = self.worker_machine_type
+
+        if ngpus is not None:
+            self.scheduler_options["ngpus"] = 0
+            self.scheduler_options["gpu_type"] = None
+            self.scheduler_options["gpu_instance"] = False
Collaborator:
@gmiasnychenko should we set the scheduler GPU settings always, whatever the number of GPUs?

Also, please leave a comment that we don't run tasks on the scheduler, so we don't need a GPU there.

Contributor Author:
As for setting the GPU settings, I believe the answer is yes. All the settings go into self.options, which is the base for the later self.scheduler_options and self.worker_options. If we don't override the scheduler GPU settings, they will keep the values from above, and we will end up with the same configuration for both the scheduler and the workers.

I can move the overriding outside the if statement, if that's what you mean. It provides more clarity, but it should be functionally the same.

I agree with providing more documentation. I will add it to the ngpus and gpu_type argument descriptions.
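A sketch of what moving the override outside the if statement could look like, written as a standalone function over plain dicts (names and structure are illustrative, not the PR's code):

def split_options(options, scheduler_machine_type, worker_machine_type,
                  ngpus=None, gpu_type=None, config=None):
    """Illustrative only: build scheduler/worker option dicts from a shared base.
    The scheduler never runs tasks, so its GPU settings are cleared unconditionally,
    independent of how many GPUs the workers request."""
    config = config or {}
    scheduler_options = {**options, "machine_type": scheduler_machine_type}
    worker_options = {**options, "machine_type": worker_machine_type}

    # Scheduler: always CPU-only, outside any ngpus check.
    scheduler_options.update(ngpus=0, gpu_type=None, gpu_instance=False)

    # Workers: attach GPUs only when requested (directly or via config defaults).
    if ngpus or config.get("ngpus"):
        worker_options.update(
            ngpus=ngpus or config.get("ngpus"),
            gpu_type=gpu_type or config.get("gpu_type"),
            gpu_instance=True,
        )
    return scheduler_options, worker_options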


+            self.worker_ngpus = ngpus
+            self.worker_options["ngpus"] = ngpus or self.config.get("ngpus")
+            self.worker_options["gpu_type"] = gpu_type or self.config.get("gpu_type")
+            self.worker_options["gpu_instance"] = True

        if "extra_bootstrap" not in kwargs:
            kwargs["extra_bootstrap"] = self.config.get("extra_bootstrap")