Merge pull request #6313 from yuvipanda/resource-alloc-wiggle

yuvipanda · web-flow · commit effe7779a673 · 2025-08-06T13:48:38.000-07:00
Improve resource allocation choice generator
diff --git a/deployer/commands/generate/resource_allocation/generate_choices.py b/deployer/commands/generate/resource_allocation/generate_choices.py
@@ -1,8 +1,8 @@
 import json
-import math
 import sys
 from enum import Enum
 from pathlib import Path
+from typing import List
 
 import typer
 from ruamel.yaml import YAML
@@ -48,15 +48,17 @@ def proportional_memory_strategy(
     # We operate on *available* memory, which already accounts for system components (like kubelet & systemd)
     # as well as daemonsets we run on every node. This represents the resources that are available
     # for user pods.
-
-    # FIXME: Add some more more wiggle room here
-    available_node_mem = nodeinfo["available"]["memory"]
-    available_node_cpu = nodeinfo["available"]["cpu"]
-
-    # Only show one digit after . for CPU, but round *down* not up so we never
-    # say they are getting more CPU than our limit is set to. We multiply & divide
-    # with a floor, as otherwise 3.75 gets rounded to 3.8, not 3.7
-    cpu_display = math.floor(available_node_cpu * 10) / 10
+    # In addition, we provide some wiggle room to account for additional daemonset requests or other
+    # issues that may pop up due to changes outside our control (like k8s upgrades). This is either
+    # 2% of the available capacity, or 2GB / 1 CPU (whichever is smaller)
+    WIGGLE_PERCENTAGE = 0.02
+    mem_overhead_wiggle = min(
+        nodeinfo["available"]["memory"] * WIGGLE_PERCENTAGE, 2 * 1024 * 1024 * 1024
+    )
+    cpu_overhead_wiggle = min(nodeinfo["available"]["cpu"] * WIGGLE_PERCENTAGE, 1)
+
+    available_node_mem = nodeinfo["available"]["memory"] - mem_overhead_wiggle
+    available_node_cpu = nodeinfo["available"]["cpu"] - cpu_overhead_wiggle
 
     # We always start from the top, and provide a choice that takes up the whole node.
     mem_limit = available_node_mem
@@ -67,12 +69,26 @@ def proportional_memory_strategy(
         # This makes sure we utilize all the memory on a node all the time.
         cpu_guarantee = (mem_limit / available_node_mem) * available_node_cpu
 
-        # Memory is in bytes, let's convert it to GB (with only 1 digit after .) to display
-        mem_display = f"{mem_limit / 1024 / 1024 / 1024:.1f}"
-        display_name = f"{mem_display} GB RAM, upto {cpu_display} CPUs"
+        # Memory is in bytes, let's convert it to GB or MB (with no digits after 0) to display
+        if mem_limit < 1024 * 1024 * 1024:
+            mem_display = f"{mem_limit / 1024 / 1024:.0f} MB"
+        else:
+            mem_display = f"{mem_limit / 1024 / 1024 / 1024:.0f} GB"
+
+        if cpu_guarantee < 2:
+            cpu_guarantee_display = f"~{cpu_guarantee:0.1f}"
+        else:
+            cpu_guarantee_display = f"~{cpu_guarantee:0.0f}"
+
+        display_name = f"~{mem_display} RAM, {cpu_guarantee_display} CPUs"
+        if cpu_guarantee != available_node_cpu:
+            description = f"Up to ~{available_node_cpu:.0f} CPUs when available"
+        else:
+            description = f"~{available_node_cpu:.0f} CPUs always available"
 
         choice = {
             "display_name": display_name,
+            "description": description,
             "kubespawner_override": {
                 # Guarantee and Limit are the same - this strategy has no oversubscription
                 "mem_guarantee": int(mem_limit),
@@ -91,26 +107,24 @@ def proportional_memory_strategy(
         # Use the amount of RAM made available as a slug, to allow combining choices from
         # multiple instance types in the same profile. This does mean you can not have
         # the same RAM allocation from multiple node selectors. But that's a feature, not a bug.
-        choices[f"mem_{mem_display.replace('.', '_')}"] = choice
+        choice_key = f"mem_{mem_display.replace('.', '_').replace(' ', '_')}".lower()
+        choices[choice_key] = choice
 
         # Halve the mem_limit for the next choice
         mem_limit = mem_limit / 2
 
     # Reverse the choices so the smallest one is first
     choices = dict(reversed(choices.items()))
 
-    # Make the smallest choice the default explicitly
-    choices[list(choices.keys())[0]]["default"] = True
-
     return choices
 
 
 @resource_allocation_app.command()
 def choices(
-    instance_type: str = typer.Argument(
-        ..., help="Instance type to generate Resource Allocation options for"
+    instance_specification: List[str] = typer.Argument(
+        ...,
+        help="Instance type and number of choices to generate Resource Allocation options for. Specify as instance_type:count.",
     ),
-    num_allocations: int = typer.Option(5, help="Number of choices to generate"),
     strategy: ResourceAllocationStrategies = typer.Option(
         ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY,
         help="Strategy to use for generating resource allocation choices choices",
@@ -122,19 +136,25 @@ def choices(
     """
     with open(HERE / "node-capacity-info.json") as f:
         nodeinfo = json.load(f)
-
-    if instance_type not in nodeinfo:
-        print(
-            f"Capacity information about {instance_type} not available", file=sys.stderr
-        )
-        print("TODO: Provide information on how to update it", file=sys.stderr)
-        sys.exit(1)
-
-    # Call appropriate function based on what strategy we want to use
-    if strategy == ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY:
-        choices = proportional_memory_strategy(
-            instance_type, nodeinfo[instance_type], num_allocations
-        )
-    else:
-        raise ValueError(f"Strategy {strategy} is not currently supported")
+    choices = {}
+    for instance_spec in instance_specification:
+        instance_type, num_allocations = instance_spec.split(":", 2)
+
+        if instance_type not in nodeinfo:
+            print(
+                f"Capacity information about {instance_type} not available",
+                file=sys.stderr,
+            )
+            print("TODO: Provide information on how to update it", file=sys.stderr)
+            sys.exit(1)
+
+        # Call appropriate function based on what strategy we want to use
+        if strategy == ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY:
+            choices.update(
+                proportional_memory_strategy(
+                    instance_type, nodeinfo[instance_type], int(num_allocations)
+                )
+            )
+        else:
+            raise ValueError(f"Strategy {strategy} is not currently supported")
     yaml.dump(choices, sys.stdout)