1818"""
1919
2020import importlib .metadata
21+ import json
2122import logging
22- from typing import Dict
23+ import os
24+ from typing import Dict , Optional
2325
2426import pynvml
27+ from kubernetes import client , config
2528
2629logger = logging .getLogger (__name__ )
2730
2831
2932# Translates between GPU UUIDs and GPU indices
3033class GpuTranslator :
31- def __init__ (self , mock_gpus : bool = False , mock_gpu_count : int = 8 ):
34+ def __init__ (
35+ self ,
36+ mock_gpus : bool = False ,
37+ mock_gpu_count : int = 8 ,
38+ node_name : Optional [str ] = None ,
39+ ):
3240 """
3341 Initialize GPU Translator
3442
3543 Args:
3644 mock_gpus: If True, skip pynvml and use mock mode for testing
3745 mock_gpu_count: Number of mock GPUs to create (default: 8)
46+ node_name: Kubernetes node name for ConfigMap-based GPU discovery
3847 """
3948 self .mapping = {}
4049 self .reverse_mapping = {}
4150 self .device_count = 0
4251 self .mock_mode = mock_gpus
4352 self .mock_gpu_count = mock_gpu_count
53+ self .node_name = node_name or os .getenv ("NODE_NAME" )
4454 if not self .mock_mode :
4555 self ._check_library ()
4656 self ._populate_mapping ()
@@ -62,21 +72,93 @@ def _check_library(self):
6272 f"package { package_name } not found. Please install it."
6373 )
6474
75+ def _load_gpu_map_from_configmap (self ) -> Optional [Dict [str , int ]]:
76+ """
77+ Load GPU mapping from Kubernetes ConfigMap 'gpu-map'.
78+
79+ Returns:
80+ Dict[str, int]: GPU UUID to index mapping, or None if ConfigMap not available
81+ """
82+ if not self .node_name :
83+ logger .info ("No node name provided, skipping ConfigMap GPU discovery" )
84+ return None
85+
86+ try :
87+ # Try to load in-cluster config first, fall back to kubeconfig
88+ try :
89+ config .load_incluster_config ()
90+ except config .ConfigException :
91+ config .load_kube_config ()
92+
93+ v1 = client .CoreV1Api ()
94+
95+ # Read the ConfigMap
96+ namespace = os .getenv ("NAMESPACE" , "default" )
97+ cm = v1 .read_namespaced_config_map (name = "gpu-map" , namespace = namespace )
98+
99+ if not cm .data or self .node_name not in cm .data :
100+ logger .warning (
101+ "Node '%s' not found in ConfigMap 'gpu-map' in namespace '%s'" ,
102+ self .node_name ,
103+ namespace ,
104+ )
105+ return None
106+
107+ # Parse the JSON mapping for this node
108+ node_gpu_data = cm .data [self .node_name ]
109+ gpu_mapping = json .loads (node_gpu_data )
110+
111+ logger .info (
112+ "Loaded GPU mapping from ConfigMap for node '%s': %s" ,
113+ self .node_name ,
114+ gpu_mapping ,
115+ )
116+ return gpu_mapping
117+
118+ except Exception as e :
119+ logger .warning (
120+ "Failed to load GPU mapping from ConfigMap: %s. Falling back to mock mode." ,
121+ e ,
122+ )
123+ return None
124+
65125 def _populate_mapping (self ):
66126 """
67127 Creates mapping and reverse_mapping for the GPU Translator.
68- In mock mode, pre-populates with mock GPU UUIDs following the pattern GPU-{index}.
128+ Priority order:
129+ 1. ConfigMap 'gpu-map' based mock if mock mode is enabled and node_name is available
130+ 2. Naive mock with GPU-0, GPU-1, etc. if mock mode is enabled
131+ 3. Real GPUs via pynvml
69132 """
133+ # Try ConfigMap first if in mock mode and node_name is available
134+ if self .mock_mode and self .node_name :
135+ configmap_mapping = self ._load_gpu_map_from_configmap ()
136+ if configmap_mapping :
137+ self .mapping = configmap_mapping
138+ self .reverse_mapping = {v : k for k , v in self .mapping .items ()}
139+ self .device_count = len (self .mapping )
140+ logger .info (
141+ "GPU Translator initialized from ConfigMap with %d GPUs for node '%s'" ,
142+ self .device_count ,
143+ self .node_name ,
144+ )
145+ return
146+
147+ # Fall back to hardcoded mock mode
70148 if self .mock_mode :
71149 # Pre-populate with mock GPUs following the test pattern: GPU-0, GPU-1, etc.
72150 for index in range (self .mock_gpu_count ):
73151 uuid = f"GPU-{ index } "
74152 self .mapping [uuid ] = index
75153 self .reverse_mapping [index ] = uuid
76154 self .device_count = self .mock_gpu_count
77- logger .info ("GPU Translator initialized in mock mode with %d mock GPUs" , self .mock_gpu_count )
155+ logger .info (
156+ "GPU Translator initialized in mock mode with %d mock GPUs" ,
157+ self .mock_gpu_count ,
158+ )
78159 return
79160
161+ # Use real GPUs via pynvml
80162 try :
81163 pynvml .nvmlInit ()
82164 self .device_count = pynvml .nvmlDeviceGetCount ()
@@ -90,7 +172,9 @@ def _populate_mapping(self):
90172 )
91173 self .mapping [uuid ] = index
92174 pynvml .nvmlShutdown ()
93- logger .info ("GPU Translator initialized with %d real GPUs" , self .device_count )
175+ logger .info (
176+ "GPU Translator initialized with %d real GPUs" , self .device_count
177+ )
94178
95179 except pynvml .NVMLError as error :
96180 logger .error ("Failed to initialize pynvml: %s" , error )
0 commit comments