Skip to content

Commit e905b27

Browse files
[Slurm] Set reasonable default CPU and memory for GPU instances (#8365)
[Slurm] Reasonable default CPU and memory for GPU instances
1 parent 5d7b414 commit e905b27

File tree

2 files changed

+121
-7
lines changed

2 files changed

+121
-7
lines changed

sky/clouds/slurm.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ class Slurm(clouds.Cloud):
5555
_regions: List[clouds.Region] = []
5656
_INDENT_PREFIX = ' '
5757

58+
# Same as Kubernetes.
59+
_DEFAULT_NUM_VCPUS_WITH_GPU = 4
60+
_DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4
61+
5862
# Using the latest SkyPilot provisioner API to provision and check status.
5963
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
6064
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
@@ -436,13 +440,12 @@ def _make(instance_list):
436440
from_instance_type(default_instance_type))
437441

438442
gpu_task_cpus = slurm_instance_type.cpus
439-
gpu_task_memory = slurm_instance_type.memory
440-
# if resources.cpus is None:
441-
# gpu_task_cpus = self._DEFAULT_NUM_VCPUS_WITH_GPU * acc_count
442-
# gpu_task_memory = (float(resources.memory.strip('+')) if
443-
# resources.memory is not None else
444-
# gpu_task_cpus *
445-
# self._DEFAULT_MEMORY_CPU_RATIO_WITH_GPU)
443+
if resources.cpus is None:
444+
gpu_task_cpus = self._DEFAULT_NUM_VCPUS_WITH_GPU * acc_count
445+
# Special handling to bump up memory multiplier for GPU instances
446+
gpu_task_memory = (float(resources.memory.strip('+')) if
447+
resources.memory is not None else gpu_task_cpus *
448+
self._DEFAULT_MEMORY_CPU_RATIO_WITH_GPU)
446449

447450
chosen_instance_type = (
448451
slurm_utils.SlurmInstanceType.from_resources(

tests/unit_tests/test_sky/clouds/test_slurm.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55

66
import pytest
77

8+
from sky import resources as resources_lib
89
from sky.adaptors import slurm
10+
from sky.clouds import slurm as slurm_cloud
911
from sky.provision.slurm import instance as slurm_instance
1012
from sky.provision.slurm import utils as slurm_utils
1113

@@ -193,3 +195,112 @@ def test_terminate_instances_no_jobs_found(self, mock_slurm_client_class,
193195

194196
# Should return early without canceling
195197
mock_client.cancel_jobs_by_name.assert_not_called()
198+
199+
200+
class TestSlurmGPUDefaults:
201+
"""Test Slurm GPU default CPU and memory allocation.
202+
203+
These tests verify that when GPU instances are requested without explicit
204+
CPU/memory specifications, Slurm allocates reasonable defaults matching
205+
Kubernetes behavior (4 CPUs and 16GB memory per GPU).
206+
"""
207+
208+
@pytest.mark.parametrize(
209+
'gpu_count,expected_cpus,expected_memory',
210+
[
211+
(1, 4, 16.0), # 1 GPU: 4 CPUs, 16GB
212+
(2, 8, 32.0), # 2 GPUs: 8 CPUs, 32GB
213+
(4, 16, 64.0), # 4 GPUs: 16 CPUs, 64GB
214+
(8, 32, 128.0), # 8 GPUs: 32 CPUs, 128GB
215+
])
216+
@patch('sky.clouds.slurm.Slurm.regions_with_offering')
217+
def test_gpu_defaults_without_explicit_cpu_memory(self, mock_regions,
218+
gpu_count, expected_cpus,
219+
expected_memory):
220+
"""Test GPU instances get correct default CPU and memory allocation."""
221+
mock_region = mock.MagicMock()
222+
mock_region.name = 'test-cluster'
223+
mock_regions.return_value = [mock_region]
224+
225+
# Create resources with GPU but no explicit CPU/memory
226+
resources = resources_lib.Resources(
227+
cloud=slurm_cloud.Slurm(),
228+
accelerators={f'H200': gpu_count},
229+
# No cpus or memory specified - should use defaults
230+
)
231+
232+
cloud = slurm_cloud.Slurm()
233+
feasible = cloud._get_feasible_launchable_resources(resources)
234+
235+
assert len(feasible.resources_list) == 1
236+
resource = feasible.resources_list[0]
237+
238+
instance_type = slurm_utils.SlurmInstanceType.from_instance_type(
239+
resource.instance_type)
240+
assert instance_type.cpus == expected_cpus
241+
assert instance_type.memory == expected_memory
242+
assert instance_type.accelerator_count == gpu_count
243+
assert instance_type.accelerator_type == 'H200'
244+
245+
@pytest.mark.parametrize(
246+
'accelerators,cpus,memory,expected_cpus,expected_memory',
247+
[
248+
# Various GPU types with defaults
249+
({
250+
'H200': 2
251+
}, None, None, 8, 32.0),
252+
({
253+
'A100': 2
254+
}, None, None, 8, 32.0),
255+
({
256+
'H100': 2
257+
}, None, None, 8, 32.0),
258+
({
259+
'A10G': 2
260+
}, None, None, 8, 32.0),
261+
# Explicit CPU override (memory scales)
262+
({
263+
'H200': 2
264+
}, '16', None, 16, 64.0),
265+
# Explicit memory override (CPU uses default)
266+
({
267+
'H200': 1
268+
}, None, '32', 4, 32.0),
269+
# Both CPU and memory override
270+
({
271+
'H200': 2
272+
}, '32', '64', 32, 64.0),
273+
# Memory with '+' suffix
274+
({
275+
'H200': 1
276+
}, None, '32+', 4, 32.0),
277+
# CPU-only instance (basic defaults)
278+
(None, None, None, 2, 2.0),
279+
])
280+
@patch('sky.clouds.slurm.Slurm.regions_with_offering')
281+
def test_resource_allocation_scenarios(self, mock_regions, accelerators,
282+
cpus, memory, expected_cpus,
283+
expected_memory):
284+
"""Test various resource allocation scenarios including GPU types and overrides."""
285+
mock_region = mock.MagicMock()
286+
mock_region.name = 'test-cluster'
287+
mock_regions.return_value = [mock_region]
288+
289+
kwargs = {'cloud': slurm_cloud.Slurm()}
290+
if accelerators:
291+
kwargs['accelerators'] = accelerators
292+
if cpus:
293+
kwargs['cpus'] = cpus
294+
if memory:
295+
kwargs['memory'] = memory
296+
297+
resources = resources_lib.Resources(**kwargs)
298+
cloud = slurm_cloud.Slurm()
299+
feasible = cloud._get_feasible_launchable_resources(resources)
300+
301+
resource = feasible.resources_list[0]
302+
instance_type = slurm_utils.SlurmInstanceType.from_instance_type(
303+
resource.instance_type)
304+
305+
assert instance_type.cpus == expected_cpus
306+
assert instance_type.memory == expected_memory

0 commit comments

Comments
 (0)