[Slurm] Set reasonable default CPU and memory for GPU instances (#8365)

kevinmingtarja · web-flow · commit e905b279ee3f · 2025-12-23T09:13:06.000+07:00
[Slurm] Reasonable default CPU and memory for GPU instances
diff --git a/sky/clouds/slurm.py b/sky/clouds/slurm.py
@@ -55,6 +55,10 @@ class Slurm(clouds.Cloud):
     _regions: List[clouds.Region] = []
     _INDENT_PREFIX = '    '
 
+    # Same as Kubernetes: vCPUs per GPU, and memory (GB) per vCPU.
+    _DEFAULT_NUM_VCPUS_WITH_GPU = 4
+    _DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4
+
     # Using the latest SkyPilot provisioner API to provision and check status.
     PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
     STATUS_VERSION = clouds.StatusVersion.SKYPILOT
@@ -436,13 +440,12 @@ def _make(instance_list):
                                    from_instance_type(default_instance_type))
 
             gpu_task_cpus = slurm_instance_type.cpus
-            gpu_task_memory = slurm_instance_type.memory
-            # if resources.cpus is None:
-            #     gpu_task_cpus = self._DEFAULT_NUM_VCPUS_WITH_GPU * acc_count
-            # gpu_task_memory = (float(resources.memory.strip('+')) if
-            #                    resources.memory is not None else
-            #                    gpu_task_cpus *
-            #                    self._DEFAULT_MEMORY_CPU_RATIO_WITH_GPU)
+            if resources.cpus is None:
+                gpu_task_cpus = self._DEFAULT_NUM_VCPUS_WITH_GPU * acc_count
+            # Special handling to bump up memory multiplier for GPU instances
+            gpu_task_memory = (float(resources.memory.strip('+')) if
+                               resources.memory is not None else gpu_task_cpus *
+                               self._DEFAULT_MEMORY_CPU_RATIO_WITH_GPU)
 
             chosen_instance_type = (
                 slurm_utils.SlurmInstanceType.from_resources(
diff --git a/tests/unit_tests/test_sky/clouds/test_slurm.py b/tests/unit_tests/test_sky/clouds/test_slurm.py
@@ -5,7 +5,9 @@
 
 import pytest
 
+from sky import resources as resources_lib
 from sky.adaptors import slurm
+from sky.clouds import slurm as slurm_cloud
 from sky.provision.slurm import instance as slurm_instance
 from sky.provision.slurm import utils as slurm_utils
 
@@ -193,3 +195,112 @@ def test_terminate_instances_no_jobs_found(self, mock_slurm_client_class,
 
         # Should return early without canceling
         mock_client.cancel_jobs_by_name.assert_not_called()
+
+
+class TestSlurmGPUDefaults:
+    """Test Slurm GPU default CPU and memory allocation.
+
+    These tests verify that when GPU instances are requested without explicit
+    CPU/memory specifications, Slurm allocates reasonable defaults matching
+    Kubernetes behavior (4 CPUs and 16GB memory per GPU).
+    """
+
+    @pytest.mark.parametrize(
+        'gpu_count,expected_cpus,expected_memory',
+        [
+            (1, 4, 16.0),  # 1 GPU: 4 CPUs, 16GB
+            (2, 8, 32.0),  # 2 GPUs: 8 CPUs, 32GB
+            (4, 16, 64.0),  # 4 GPUs: 16 CPUs, 64GB
+            (8, 32, 128.0),  # 8 GPUs: 32 CPUs, 128GB
+        ])
+    @patch('sky.clouds.slurm.Slurm.regions_with_offering')
+    def test_gpu_defaults_without_explicit_cpu_memory(self, mock_regions,
+                                                      gpu_count, expected_cpus,
+                                                      expected_memory):
+        """Test GPU instances get correct default CPU and memory allocation."""
+        mock_region = mock.MagicMock()
+        mock_region.name = 'test-cluster'
+        mock_regions.return_value = [mock_region]
+
+        # Create resources with GPU but no explicit CPU/memory
+        resources = resources_lib.Resources(
+            cloud=slurm_cloud.Slurm(),
+            accelerators={'H200': gpu_count},
+            # No cpus or memory specified - should use defaults
+        )
+
+        cloud = slurm_cloud.Slurm()
+        feasible = cloud._get_feasible_launchable_resources(resources)
+
+        assert len(feasible.resources_list) == 1
+        resource = feasible.resources_list[0]
+
+        instance_type = slurm_utils.SlurmInstanceType.from_instance_type(
+            resource.instance_type)
+        assert instance_type.cpus == expected_cpus
+        assert instance_type.memory == expected_memory
+        assert instance_type.accelerator_count == gpu_count
+        assert instance_type.accelerator_type == 'H200'
+
+    @pytest.mark.parametrize(
+        'accelerators,cpus,memory,expected_cpus,expected_memory',
+        [
+            # Various GPU types with defaults
+            ({
+                'H200': 2
+            }, None, None, 8, 32.0),
+            ({
+                'A100': 2
+            }, None, None, 8, 32.0),
+            ({
+                'H100': 2
+            }, None, None, 8, 32.0),
+            ({
+                'A10G': 2
+            }, None, None, 8, 32.0),
+            # Explicit CPU override (memory scales)
+            ({
+                'H200': 2
+            }, '16', None, 16, 64.0),
+            # Explicit memory override (CPU uses default)
+            ({
+                'H200': 1
+            }, None, '32', 4, 32.0),
+            # Both CPU and memory override
+            ({
+                'H200': 2
+            }, '32', '64', 32, 64.0),
+            # Memory with '+' suffix
+            ({
+                'H200': 1
+            }, None, '32+', 4, 32.0),
+            # CPU-only instance (basic defaults)
+            (None, None, None, 2, 2.0),
+        ])
+    @patch('sky.clouds.slurm.Slurm.regions_with_offering')
+    def test_resource_allocation_scenarios(self, mock_regions, accelerators,
+                                           cpus, memory, expected_cpus,
+                                           expected_memory):
+        """Test allocation across GPU types and CPU/memory overrides."""
+        mock_region = mock.MagicMock()
+        mock_region.name = 'test-cluster'
+        mock_regions.return_value = [mock_region]
+
+        kwargs = {'cloud': slurm_cloud.Slurm()}
+        if accelerators is not None:
+            kwargs['accelerators'] = accelerators
+        if cpus is not None:
+            kwargs['cpus'] = cpus
+        if memory is not None:
+            kwargs['memory'] = memory
+
+        resources = resources_lib.Resources(**kwargs)
+        cloud = slurm_cloud.Slurm()
+        feasible = cloud._get_feasible_launchable_resources(resources)
+        assert feasible.resources_list, 'no feasible resources returned'
+        resource = feasible.resources_list[0]
+        instance_type = slurm_utils.SlurmInstanceType.from_instance_type(
+            resource.instance_type)
+
+        assert instance_type.cpus == expected_cpus
+        assert instance_type.memory == expected_memory