Open
Description
Describe the issue:
I am trying to spawn a cluster via dask-gateway with the following config:
# Configure the gateway to use slurm
c.Proxy.address = '0.0.0.0:8000'
c.Proxy.tcp_address = '0.0.0.0:8001'
c.DaskGateway.backend_class = (
"dask_gateway_server.backends.jobqueue.slurm.SlurmBackend"
)
#c.SlurmClusterConfig.scheduler_cmd = "~/miniconda/bin/dask-scheduler"
#c.SlurmClusterConfig.worker_cmd = "~/miniconda/bin/dask-worker"
c.SlurmClusterConfig.partition = "heavy"
c.SlurmClusterConfig.staging_directory = '/tmp/.dask-gateway/'
from dask_gateway_server.options import Options, String
c.JobQueueClusterConfig.scheduler_setup = 'source /opt/dask/MyEnv/bin/activate'
c.JobQueueClusterConfig.worker_setup = 'source /opt/dask/MyEnv/bin/activate'
c.SlurmClusterConfig.scheduler_cmd = "/opt/dask/MyEnv/bin/dask-scheduler"
c.SlurmClusterConfig.worker_cmd = "/opt/dask/MyEnv/bin/dask-worker"
#resource limits for a worker
c.JobQueueClusterConfig.worker_memory = '100G'
c.JobQueueClusterConfig.worker_cores = 16
Slurm accepts the submission and tries to execute the job, which fails with:
---------------------------------------------------------------------------
GatewayClusterError Traceback (most recent call last)
Cell In[176], line 1
----> 1 cluster = gateway.new_cluster(options)
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:641, in Gateway.new_cluster(self, cluster_options, shutdown_on_close, **kwargs)
618 def new_cluster(self, cluster_options=None, shutdown_on_close=True, **kwargs):
619 """Submit a new cluster to the gateway, and wait for it to be started.
620
621 Same as calling ``submit`` and ``connect`` in one go.
(...)
639 cluster : GatewayCluster
640 """
--> 641 return GatewayCluster(
642 address=self.address,
643 proxy_address=self.proxy_address,
644 public_address=self._public_address,
645 auth=self.auth,
646 asynchronous=self.asynchronous,
647 loop=self.loop,
648 cluster_options=cluster_options,
649 shutdown_on_close=shutdown_on_close,
650 **kwargs,
651 )
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:816, in GatewayCluster.__init__(self, address, proxy_address, public_address, auth, cluster_options, shutdown_on_close, asynchronous, loop, **kwargs)
804 def __init__(
805 self,
806 address=None,
(...)
814 **kwargs,
815 ):
--> 816 self._init_internal(
817 address=address,
818 proxy_address=proxy_address,
819 public_address=public_address,
820 auth=auth,
821 cluster_options=cluster_options,
822 cluster_kwargs=kwargs,
823 shutdown_on_close=shutdown_on_close,
824 asynchronous=asynchronous,
825 loop=loop,
826 )
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:921, in GatewayCluster._init_internal(self, address, proxy_address, public_address, auth, cluster_options, cluster_kwargs, shutdown_on_close, asynchronous, loop, name)
919 self.status = "starting"
920 if not self.asynchronous:
--> 921 self.gateway.sync(self._start_internal)
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:344, in Gateway.sync(self, func, *args, **kwargs)
340 future = asyncio.run_coroutine_threadsafe(
341 func(*args, **kwargs), self.loop.asyncio_loop
342 )
343 try:
--> 344 return future.result()
345 except BaseException:
346 future.cancel()
File /opt/conda/lib/python3.10/concurrent/futures/_base.py:458, in Future.result(self, timeout)
456 raise CancelledError()
457 elif self._state == FINISHED:
--> 458 return self.__get_result()
459 else:
460 raise TimeoutError()
File /opt/conda/lib/python3.10/concurrent/futures/_base.py:403, in Future.__get_result(self)
401 if self._exception:
402 try:
--> 403 raise self._exception
404 finally:
405 # Break a reference cycle with the exception in self._exception
406 self = None
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:935, in GatewayCluster._start_internal(self)
933 self._start_task = asyncio.ensure_future(self._start_async())
934 try:
--> 935 await self._start_task
936 except BaseException:
937 # On exception, cleanup
938 await self._stop_internal()
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:953, in GatewayCluster._start_async(self)
951 # Connect to cluster
952 try:
--> 953 report = await self.gateway._wait_for_start(self.name)
954 except GatewayClusterError:
955 raise
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:581, in Gateway._wait_for_start(self, cluster_name)
579 return report
580 elif report.status is ClusterStatus.FAILED:
--> 581 raise GatewayClusterError(
582 "Cluster %r failed to start, see logs for "
583 "more information" % cluster_name
584 )
585 elif report.status is ClusterStatus.STOPPED:
586 raise GatewayClusterError(
587 "Cluster %r is already stopped" % cluster_name
588 )
GatewayClusterError: Cluster '904e0c326cd5484cacf9d343bd1c5226' failed to start, see logs for more information
The logs from that submission contain:
/opt/slurm/slurmd/job00068/slurm_script: 3: source: not found
/opt/slurm/slurmd/job00068/slurm_script: 5: dask-scheduler: not found
If I check how the script was put together by the gateway, using:
$ scontrol write batch_script 68
$ cat slurm-68.sh
#!/bin/sh
source /opt/dask/MyEnv/bin/activate
/opt/dask/MyEnv/bin/dask-scheduler --protocol tls --port 0 --host 0.0.0.0 --dashboard-address 0.0.0.0:0 --preload dask_gateway.scheduler_preload --dg-api-address 0.0.0.0:0 --dg-heartbeat-period 15 --dg-adaptive-period 3.0 --dg-idle-timeout 0.0
`/bin/sh` does not know the command `source`, and changing it to `.` results in `Unrecognized Shell`, which I think means conda is not happy with `sh`.
So the example at https://gateway.dask.org/install-jobqueue.html does not work for me.
Minimal Complete Verifiable Example:
from dask_gateway import Gateway,BasicAuth
auth = BasicAuth(username="dask")
gateway = Gateway("http://IP.OF.OUR.GATEWAY:8000",auth=auth)
cluster = gateway.new_cluster(options)
Anything else we need to know?:
The suggested change is to expose a config option for the Slurm script shebang, as is already done for dask_jobqueue.SLURMCluster.
Affected lines are:
Environment:
- Dask version: 2024.1.0
- Slurm version: 2024.05.0
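- Python version: 3.10 (per the traceback paths above; the value originally entered here, 2024.1.0, duplicates the Dask version)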
- Operating System: Ubuntu 22.04.4
- Install method (conda, pip, source): conda
Metadata
Metadata
Assignees
Labels
No labels