@@ -5,7 +5,9 @@

 import pytest

+from sky import resources as resources_lib
 from sky.adaptors import slurm
+from sky.clouds import slurm as slurm_cloud
 from sky.provision.slurm import instance as slurm_instance
 from sky.provision.slurm import utils as slurm_utils

@@ -193,3 +195,112 @@ def test_terminate_instances_no_jobs_found(self, mock_slurm_client_class, |

         # Should return early without canceling
         mock_client.cancel_jobs_by_name.assert_not_called()
+
+
+class TestSlurmGPUDefaults:
+    """Test Slurm GPU default CPU and memory allocation.
+
+    These tests verify that when GPU instances are requested without explicit
+    CPU/memory specifications, Slurm allocates reasonable defaults matching
+    Kubernetes behavior (4 CPUs and 16GB memory per GPU).
+    """
+
+    @pytest.mark.parametrize(
+        'gpu_count,expected_cpus,expected_memory',
+        [
+            (1, 4, 16.0),  # 1 GPU: 4 CPUs, 16GB
+            (2, 8, 32.0),  # 2 GPUs: 8 CPUs, 32GB
+            (4, 16, 64.0),  # 4 GPUs: 16 CPUs, 64GB
+            (8, 32, 128.0),  # 8 GPUs: 32 CPUs, 128GB
+        ])
+    @patch('sky.clouds.slurm.Slurm.regions_with_offering')
+    def test_gpu_defaults_without_explicit_cpu_memory(self, mock_regions,
+                                                      gpu_count, expected_cpus,
+                                                      expected_memory):
+        """Test GPU instances get correct default CPU and memory allocation."""
+        mock_region = mock.MagicMock()
+        mock_region.name = 'test-cluster'
+        mock_regions.return_value = [mock_region]
+
+        # Create resources with GPU but no explicit CPU/memory
+        resources = resources_lib.Resources(
+            cloud=slurm_cloud.Slurm(),
+            accelerators={'H200': gpu_count},
+            # No cpus or memory specified - should use defaults
+        )
+
+        cloud = slurm_cloud.Slurm()
+        feasible = cloud._get_feasible_launchable_resources(resources)
+
+        assert len(feasible.resources_list) == 1
+        resource = feasible.resources_list[0]
+
+        instance_type = slurm_utils.SlurmInstanceType.from_instance_type(
+            resource.instance_type)
+        assert instance_type.cpus == expected_cpus
+        assert instance_type.memory == expected_memory
+        assert instance_type.accelerator_count == gpu_count
+        assert instance_type.accelerator_type == 'H200'
+
+    @pytest.mark.parametrize(
+        'accelerators,cpus,memory,expected_cpus,expected_memory',
+        [
+            # Various GPU types with defaults
+            ({
+                'H200': 2
+            }, None, None, 8, 32.0),
+            ({
+                'A100': 2
+            }, None, None, 8, 32.0),
+            ({
+                'H100': 2
+            }, None, None, 8, 32.0),
+            ({
+                'A10G': 2
+            }, None, None, 8, 32.0),
+            # Explicit CPU override (memory scales)
+            ({
+                'H200': 2
+            }, '16', None, 16, 64.0),
+            # Explicit memory override (CPU uses default)
+            ({
+                'H200': 1
+            }, None, '32', 4, 32.0),
+            # Both CPU and memory override
+            ({
+                'H200': 2
+            }, '32', '64', 32, 64.0),
+            # Memory with '+' suffix
+            ({
+                'H200': 1
+            }, None, '32+', 4, 32.0),
+            # CPU-only instance (basic defaults)
+            (None, None, None, 2, 2.0),
+        ])
+    @patch('sky.clouds.slurm.Slurm.regions_with_offering')
+    def test_resource_allocation_scenarios(self, mock_regions, accelerators,
+                                           cpus, memory, expected_cpus,
+                                           expected_memory):
+        """Test resource allocation across GPU types, defaults, and overrides."""
+        mock_region = mock.MagicMock()
+        mock_region.name = 'test-cluster'
+        mock_regions.return_value = [mock_region]
+
+        kwargs = {'cloud': slurm_cloud.Slurm()}
+        if accelerators:
+            kwargs['accelerators'] = accelerators
+        if cpus:
+            kwargs['cpus'] = cpus
+        if memory:
+            kwargs['memory'] = memory
+
+        resources = resources_lib.Resources(**kwargs)
+        cloud = slurm_cloud.Slurm()
+        feasible = cloud._get_feasible_launchable_resources(resources)
+
+        resource = feasible.resources_list[0]
+        instance_type = slurm_utils.SlurmInstanceType.from_instance_type(
+            resource.instance_type)
+
+        assert instance_type.cpus == expected_cpus
+        assert instance_type.memory == expected_memory
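
For reference, the allocation rule these tests pin down is: 4 CPUs and 16 GB of memory per requested GPU, with memory scaling off the effective CPU count when only CPUs are overridden, and explicit CPU/memory requests taking precedence. The sketch below is illustrative only; the helper name, signature, and constants are assumptions made for this note, not SkyPilot's actual implementation, which is exercised through _get_feasible_launchable_resources in the tests above.

    # Illustrative sketch of the default GPU -> CPU/memory rule under test.
    # Hypothetical helper; not part of the SkyPilot codebase.
    from typing import Optional, Tuple

    _DEFAULT_CPUS_PER_GPU = 4
    _MEMORY_GB_PER_CPU = 4.0  # 4 GB per CPU => 16 GB per GPU at the default.


    def default_gpu_resources(
            gpu_count: int,
            cpus: Optional[int] = None,
            memory_gb: Optional[float] = None) -> Tuple[int, float]:
        """Return (cpus, memory_gb) for a GPU request with optional overrides."""
        if cpus is None:
            cpus = _DEFAULT_CPUS_PER_GPU * gpu_count
        if memory_gb is None:
            # Memory follows the (possibly overridden) CPU count.
            memory_gb = _MEMORY_GB_PER_CPU * cpus
        return cpus, memory_gb


    assert default_gpu_resources(2) == (8, 32.0)  # defaults only
    assert default_gpu_resources(2, cpus=16) == (16, 64.0)  # CPU override, memory scales
    assert default_gpu_resources(1, memory_gb=32.0) == (4, 32.0)  # memory override

This matches the parametrized rows above, e.g. ({'H200': 2}, '16', None, 16, 64.0); the '32+' memory suffix and the CPU-only case go through separate parsing and defaulting that the sketch does not model.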