Skip to content

Commit 2253e91

Browse files
committed
ConfigMap-based GPU mock for the launcher
Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
1 parent 63537f5 commit 2253e91

File tree

8 files changed

+144
-22
lines changed

8 files changed

+144
-22
lines changed

dockerfiles/Dockerfile.launcher.cpu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,6 @@ WORKDIR /app
1717
COPY inference_server/launcher/launcher.py inference_server/launcher/gputranslator.py /app/
1818

1919
# Install uvicorn for serving the launcher API, plus nvidia-ml-py and kubernetes for gputranslator
20-
RUN pip install --root-user-action=ignore --no-cache-dir uvicorn nvidia-ml-py
20+
RUN pip install --root-user-action=ignore --no-cache-dir uvicorn nvidia-ml-py kubernetes
2121

2222
ENTRYPOINT ["uvicorn", "--app-dir", "/app", "launcher:app"]

inference_server/launcher/gputranslator.py

Lines changed: 89 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,29 +18,39 @@
1818
"""
1919

2020
import importlib.metadata
21+
import json
2122
import logging
22-
from typing import Dict
23+
import os
24+
from typing import Dict, Optional
2325

2426
import pynvml
27+
from kubernetes import client, config
2528

2629
logger = logging.getLogger(__name__)
2730

2831

2932
# VLLM process manager
3033
class GpuTranslator:
31-
def __init__(self, mock_gpus: bool = False, mock_gpu_count: int = 8):
34+
def __init__(
35+
self,
36+
mock_gpus: bool = False,
37+
mock_gpu_count: int = 8,
38+
node_name: Optional[str] = None,
39+
):
3240
"""
3341
Initialize GPU Translator
3442
3543
Args:
3644
mock_gpus: If True, skip pynvml and use mock mode for testing
3745
mock_gpu_count: Number of mock GPUs to create (default: 8)
46+
node_name: Kubernetes node name for ConfigMap-based GPU discovery
3847
"""
3948
self.mapping = {}
4049
self.reverse_mapping = {}
4150
self.device_count = 0
4251
self.mock_mode = mock_gpus
4352
self.mock_gpu_count = mock_gpu_count
53+
self.node_name = node_name or os.getenv("NODE_NAME")
4454
if not self.mock_mode:
4555
self._check_library()
4656
self._populate_mapping()
@@ -62,21 +72,93 @@ def _check_library(self):
6272
f"package {package_name} not found. Please install it."
6373
)
6474

75+
def _load_gpu_map_from_configmap(self) -> Optional[Dict[str, int]]:
76+
"""
77+
Load GPU mapping from Kubernetes ConfigMap 'gpu-map'.
78+
79+
Returns:
80+
Dict[str, int]: GPU UUID to index mapping, or None if ConfigMap not available
81+
"""
82+
if not self.node_name:
83+
logger.info("No node name provided, skipping ConfigMap GPU discovery")
84+
return None
85+
86+
try:
87+
# Try to load in-cluster config first, fall back to kubeconfig
88+
try:
89+
config.load_incluster_config()
90+
except config.ConfigException:
91+
config.load_kube_config()
92+
93+
v1 = client.CoreV1Api()
94+
95+
# Read the ConfigMap
96+
namespace = os.getenv("NAMESPACE", "default")
97+
cm = v1.read_namespaced_config_map(name="gpu-map", namespace=namespace)
98+
99+
if not cm.data or self.node_name not in cm.data:
100+
logger.warning(
101+
"Node '%s' not found in ConfigMap 'gpu-map' in namespace '%s'",
102+
self.node_name,
103+
namespace,
104+
)
105+
return None
106+
107+
# Parse the JSON mapping for this node
108+
node_gpu_data = cm.data[self.node_name]
109+
gpu_mapping = json.loads(node_gpu_data)
110+
111+
logger.info(
112+
"Loaded GPU mapping from ConfigMap for node '%s': %s",
113+
self.node_name,
114+
gpu_mapping,
115+
)
116+
return gpu_mapping
117+
118+
except Exception as e:
119+
logger.warning(
120+
"Failed to load GPU mapping from ConfigMap: %s. Falling back to mock mode.",
121+
e,
122+
)
123+
return None
124+
65125
def _populate_mapping(self):
66126
"""
67127
Creates mapping and reverse_mapping for the GPU Translator.
68-
In mock mode, pre-populates with mock GPU UUIDs following the pattern GPU-{index}.
128+
Priority order:
129+
1. ConfigMap 'gpu-map' based mock if mock mode is enabled and node_name is available
130+
2. Naive mock with GPU-0, GPU-1, etc. if mock mode is enabled
131+
3. Real GPUs via pynvml
69132
"""
133+
# Try ConfigMap first if in mock mode and node_name is available
134+
if self.mock_mode and self.node_name:
135+
configmap_mapping = self._load_gpu_map_from_configmap()
136+
if configmap_mapping:
137+
self.mapping = configmap_mapping
138+
self.reverse_mapping = {v: k for k, v in self.mapping.items()}
139+
self.device_count = len(self.mapping)
140+
logger.info(
141+
"GPU Translator initialized from ConfigMap with %d GPUs for node '%s'",
142+
self.device_count,
143+
self.node_name,
144+
)
145+
return
146+
147+
# Fall back to hardcoded mock mode
70148
if self.mock_mode:
71149
# Pre-populate with mock GPUs following the test pattern: GPU-0, GPU-1, etc.
72150
for index in range(self.mock_gpu_count):
73151
uuid = f"GPU-{index}"
74152
self.mapping[uuid] = index
75153
self.reverse_mapping[index] = uuid
76154
self.device_count = self.mock_gpu_count
77-
logger.info("GPU Translator initialized in mock mode with %d mock GPUs", self.mock_gpu_count)
155+
logger.info(
156+
"GPU Translator initialized in mock mode with %d mock GPUs",
157+
self.mock_gpu_count,
158+
)
78159
return
79160

161+
# Use real GPUs via pynvml
80162
try:
81163
pynvml.nvmlInit()
82164
self.device_count = pynvml.nvmlDeviceGetCount()
@@ -90,7 +172,9 @@ def _populate_mapping(self):
90172
)
91173
self.mapping[uuid] = index
92174
pynvml.nvmlShutdown()
93-
logger.info("GPU Translator initialized with %d real GPUs", self.device_count)
175+
logger.info(
176+
"GPU Translator initialized with %d real GPUs", self.device_count
177+
)
94178

95179
except pynvml.NVMLError as error:
96180
logger.error("Failed to initialize pynvml: %s", error)

inference_server/launcher/launcher.py

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ def __init__(
8181
for uuid_str in config.gpu_uuids:
8282
index = gpu_translator.uuid_to_index(uuid_str)
8383
cuda_indices.append(str(index))
84+
logger.info(
85+
f"Translated GPU UUIDs {config.gpu_uuids} to indices {cuda_indices}."
86+
)
8487

8588
if config.env_vars is None:
8689
config.env_vars = {}
@@ -193,9 +196,16 @@ def get_logs(
193196

194197
# Multi-instance vLLM process manager
195198
class VllmMultiProcessManager:
196-
def __init__(self, mock_gpus: bool = False, mock_gpu_count: int = 8):
199+
def __init__(
200+
self,
201+
mock_gpus: bool = False,
202+
mock_gpu_count: int = 8,
203+
node_name: Optional[str] = None,
204+
):
197205
self.instances: Dict[str, VllmInstance] = {}
198-
self.gpu_translator = GpuTranslator(mock_gpus=mock_gpus, mock_gpu_count=mock_gpu_count)
206+
self.gpu_translator = GpuTranslator(
207+
mock_gpus=mock_gpus, mock_gpu_count=mock_gpu_count, node_name=node_name
208+
)
199209

200210
def create_instance(
201211
self, vllm_config: VllmConfig, instance_id: Optional[str] = None
@@ -575,45 +585,44 @@ def set_env_vars(env_vars: Dict[str, Any]):
575585
parser.add_argument(
576586
"--mock-gpus",
577587
action="store_true",
578-
help="Enable mock GPU mode for CPU-only testing environments"
588+
help="Enable mock GPU mode for CPU-only testing environments",
579589
)
580590
parser.add_argument(
581591
"--mock-gpu-count",
582592
type=int,
583593
default=8,
584-
help="Number of mock GPUs to create in mock mode (default: 8)"
594+
help="Number of mock GPUs to create in mock mode (default: 8)",
585595
)
586596
parser.add_argument(
587597
"--host",
588598
type=str,
589599
default="0.0.0.0",
590-
help="Host to bind the server to (default: 0.0.0.0)"
600+
help="Host to bind the server to (default: 0.0.0.0)",
591601
)
592602
parser.add_argument(
593603
"--port",
594604
type=int,
595605
default=8001,
596-
help="Port to bind the server to (default: 8001)"
606+
help="Port to bind the server to (default: 8001)",
597607
)
598608
parser.add_argument(
599609
"--log-level",
600610
type=str,
601611
default="info",
602612
choices=["critical", "error", "warning", "info", "debug"],
603-
help="Logging level (default: info)"
613+
help="Logging level (default: info)",
604614
)
605615

606616
args = parser.parse_args()
607617

618+
# Get node name from environment variable
619+
node_name = os.getenv("NODE_NAME")
620+
608621
# Reinitialize the global manager with mock mode settings
609622
vllm_manager = VllmMultiProcessManager(
610623
mock_gpus=args.mock_gpus,
611-
mock_gpu_count=args.mock_gpu_count
624+
mock_gpu_count=args.mock_gpu_count,
625+
node_name=node_name,
612626
)
613627

614-
uvicorn.run(
615-
app,
616-
host=args.host,
617-
port=args.port,
618-
log_level=args.log_level
619-
)
628+
uvicorn.run(app, host=args.host, port=args.port, log_level=args.log_level)

inference_server/launcher/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,6 @@ pydantic
33
uvicorn
44
uvloop
55
nvidia-ml-py
6+
kubernetes
67
# WARNING: vllm must be built from source on a macOS Silicon
78
vllm; sys_platform != "darwin" or platform_machine != "arm64"

inference_server/launcher/tests/test_launcher.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
Run as:
1818
python -m pytest tests/test_launcher.py -v
1919
"""
20+
2021
import signal
2122
import sys
2223
from unittest.mock import MagicMock, patch

pkg/controller/dual-pods/inference-server.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,7 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
462462

463463
cfg, iscHash, err := ctl.configInferenceServer(isc, serverDat.GPUIDs)
464464
if err != nil {
465-
return fmt.Errorf("parse inference server config: %w", err), true
465+
return fmt.Errorf("failed to configure inference server config: %w", err), true
466466
}
467467
logger.V(5).Info("Nominal hash of InferenceServerConfig", "hash", iscHash)
468468

test/e2e/mkobjs.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#!/usr/bin/env bash
22

33
inst=$(date +%d-%H-%M-%S)
4-
server_img=$(make echo-var VAR=TEST_SERVER_IMG)
54
requester_img=$(make echo-var VAR=TEST_REQUESTER_IMG)
65
launcher_img=$(make echo-var VAR=TEST_LAUNCHER_IMG)
76
if out=$(kubectl apply -f - 2>&1 <<EOF
@@ -78,6 +77,7 @@ spec:
7877
maxSleepingInstances: 1
7978
podTemplate:
8079
spec:
80+
serviceAccount: testlauncher
8181
containers:
8282
- name: inference-server
8383
image: $launcher_img
@@ -92,6 +92,13 @@ spec:
9292
--host 0.0.0.0 \
9393
--port 8001 \
9494
--log-level info
95+
env:
96+
- name: NODE_NAME
97+
valueFrom:
98+
fieldRef: { fieldPath: spec.nodeName }
99+
- name: NAMESPACE
100+
valueFrom:
101+
fieldRef: { fieldPath: metadata.namespace }
95102
---
96103
apiVersion: apps/v1
97104
kind: ReplicaSet

test/e2e/run-launcher-based.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,26 @@ kubectl create rolebinding testreq --role=testreq --serviceaccount=$(kubectl get
114114
kubectl create clusterrolebinding testreq-view --clusterrole=view --serviceaccount=$(kubectl get sa default -o jsonpath={.metadata.namespace}):testreq
115115

116116
kubectl create sa testreq
117+
118+
kubectl apply -f - <<EOF
119+
apiVersion: rbac.authorization.k8s.io/v1
120+
kind: Role
121+
metadata:
122+
name: testlauncher
123+
rules:
124+
- apiGroups:
125+
- ""
126+
resources:
127+
- configmaps
128+
verbs:
129+
- get
130+
- list
131+
- watch
132+
EOF
133+
134+
kubectl create rolebinding testlauncher --role=testlauncher --serviceaccount=$(kubectl get sa default -o jsonpath={.metadata.namespace}):testlauncher
135+
136+
kubectl create sa testlauncher
117137
kubectl create cm gpu-map
118138
kubectl get nodes -o name | sed 's%^node/%%' | while read node; do
119139
kubectl label node $node nvidia.com/gpu.present=true nvidia.com/gpu.product=NVIDIA-L40S nvidia.com/gpu.count=2 --overwrite=true

0 commit comments

Comments
 (0)