Skip to content

Commit effe777

Browse files
authored
Merge pull request #6313 from yuvipanda/resource-alloc-wiggle
Improve resource allocation choice generator
2 parents db411b7 + f8d3ddf commit effe777

File tree

1 file changed

+55
-35
lines changed

1 file changed

+55
-35
lines changed

deployer/commands/generate/resource_allocation/generate_choices.py

Lines changed: 55 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import json
2-
import math
32
import sys
43
from enum import Enum
54
from pathlib import Path
5+
from typing import List
66

77
import typer
88
from ruamel.yaml import YAML
@@ -48,15 +48,17 @@ def proportional_memory_strategy(
4848
# We operate on *available* memory, which already accounts for system components (like kubelet & systemd)
4949
# as well as daemonsets we run on every node. This represents the resources that are available
5050
# for user pods.
51-
52-
# FIXME: Add some more wiggle room here
53-
available_node_mem = nodeinfo["available"]["memory"]
54-
available_node_cpu = nodeinfo["available"]["cpu"]
55-
56-
# Only show one digit after . for CPU, but round *down* not up so we never
57-
# say they are getting more CPU than our limit is set to. We multiply & divide
58-
# with a floor, as otherwise 3.75 gets rounded to 3.8, not 3.7
59-
cpu_display = math.floor(available_node_cpu * 10) / 10
51+
# In addition, we provide some wiggle room to account for additional daemonset requests or other
52+
# issues that may pop up due to changes outside our control (like k8s upgrades). This is either
53+
# 2% of the available capacity, or 2GB / 1 CPU (whichever is smaller)
54+
WIGGLE_PERCENTAGE = 0.02
55+
mem_overhead_wiggle = min(
56+
nodeinfo["available"]["memory"] * WIGGLE_PERCENTAGE, 2 * 1024 * 1024 * 1024
57+
)
58+
cpu_overhead_wiggle = min(nodeinfo["available"]["cpu"] * WIGGLE_PERCENTAGE, 1)
59+
60+
available_node_mem = nodeinfo["available"]["memory"] - mem_overhead_wiggle
61+
available_node_cpu = nodeinfo["available"]["cpu"] - cpu_overhead_wiggle
6062

6163
# We always start from the top, and provide a choice that takes up the whole node.
6264
mem_limit = available_node_mem
@@ -67,12 +69,26 @@ def proportional_memory_strategy(
6769
# This makes sure we utilize all the memory on a node all the time.
6870
cpu_guarantee = (mem_limit / available_node_mem) * available_node_cpu
6971

70-
# Memory is in bytes, let's convert it to GB (with only 1 digit after .) to display
71-
mem_display = f"{mem_limit / 1024 / 1024 / 1024:.1f}"
72-
display_name = f"{mem_display} GB RAM, upto {cpu_display} CPUs"
72+
# Memory is in bytes, let's convert it to GB or MB (with no digits after the decimal point) to display
73+
if mem_limit < 1024 * 1024 * 1024:
74+
mem_display = f"{mem_limit / 1024 / 1024:.0f} MB"
75+
else:
76+
mem_display = f"{mem_limit / 1024 / 1024 / 1024:.0f} GB"
77+
78+
if cpu_guarantee < 2:
79+
cpu_guarantee_display = f"~{cpu_guarantee:0.1f}"
80+
else:
81+
cpu_guarantee_display = f"~{cpu_guarantee:0.0f}"
82+
83+
display_name = f"~{mem_display} RAM, {cpu_guarantee_display} CPUs"
84+
if cpu_guarantee != available_node_cpu:
85+
description = f"Up to ~{available_node_cpu:.0f} CPUs when available"
86+
else:
87+
description = f"~{available_node_cpu:.0f} CPUs always available"
7388

7489
choice = {
7590
"display_name": display_name,
91+
"description": description,
7692
"kubespawner_override": {
7793
# Guarantee and Limit are the same - this strategy has no oversubscription
7894
"mem_guarantee": int(mem_limit),
@@ -91,26 +107,24 @@ def proportional_memory_strategy(
91107
# Use the amount of RAM made available as a slug, to allow combining choices from
92108
# multiple instance types in the same profile. This does mean you can not have
93109
# the same RAM allocation from multiple node selectors. But that's a feature, not a bug.
94-
choices[f"mem_{mem_display.replace('.', '_')}"] = choice
110+
choice_key = f"mem_{mem_display.replace('.', '_').replace(' ', '_')}".lower()
111+
choices[choice_key] = choice
95112

96113
# Halve the mem_limit for the next choice
97114
mem_limit = mem_limit / 2
98115

99116
# Reverse the choices so the smallest one is first
100117
choices = dict(reversed(choices.items()))
101118

102-
# Make the smallest choice the default explicitly
103-
choices[list(choices.keys())[0]]["default"] = True
104-
105119
return choices
106120

107121

108122
@resource_allocation_app.command()
109123
def choices(
110-
instance_type: str = typer.Argument(
111-
..., help="Instance type to generate Resource Allocation options for"
124+
instance_specification: List[str] = typer.Argument(
125+
...,
126+
help="Instance type and number of choices to generate Resource Allocation options for. Specify as instance_type:count.",
112127
),
113-
num_allocations: int = typer.Option(5, help="Number of choices to generate"),
114128
strategy: ResourceAllocationStrategies = typer.Option(
115129
ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY,
116130
help="Strategy to use for generating resource allocation choices choices",
@@ -122,19 +136,25 @@ def choices(
122136
"""
123137
with open(HERE / "node-capacity-info.json") as f:
124138
nodeinfo = json.load(f)
125-
126-
if instance_type not in nodeinfo:
127-
print(
128-
f"Capacity information about {instance_type} not available", file=sys.stderr
129-
)
130-
print("TODO: Provide information on how to update it", file=sys.stderr)
131-
sys.exit(1)
132-
133-
# Call appropriate function based on what strategy we want to use
134-
if strategy == ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY:
135-
choices = proportional_memory_strategy(
136-
instance_type, nodeinfo[instance_type], num_allocations
137-
)
138-
else:
139-
raise ValueError(f"Strategy {strategy} is not currently supported")
139+
choices = {}
140+
for instance_spec in instance_specification:
141+
instance_type, num_allocations = instance_spec.split(":", 2)
142+
143+
if instance_type not in nodeinfo:
144+
print(
145+
f"Capacity information about {instance_type} not available",
146+
file=sys.stderr,
147+
)
148+
print("TODO: Provide information on how to update it", file=sys.stderr)
149+
sys.exit(1)
150+
151+
# Call appropriate function based on what strategy we want to use
152+
if strategy == ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY:
153+
choices.update(
154+
proportional_memory_strategy(
155+
instance_type, nodeinfo[instance_type], int(num_allocations)
156+
)
157+
)
158+
else:
159+
raise ValueError(f"Strategy {strategy} is not currently supported")
140160
yaml.dump(choices, sys.stdout)

0 commit comments

Comments
 (0)