Skip to content

Commit 3a92093

Browse files
authored
Re-enable memory profiler in new architecture (#5)
Also added CI test and updated documentation
1 parent e59d8f2 commit 3a92093

File tree

7 files changed

+124
-28
lines changed

7 files changed

+124
-28
lines changed

.github/workflows/test_lemonade_eval.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ jobs:
122122
$serverExe = "$installPath\bin\lemonade-server.exe"
123123
$logFile = "${{ github.workspace }}\lemonade-server.log"
124124
$venvPython = ".venv\Scripts\python.exe"
125+
$venvLemonade = ".venv\Scripts\lemonade-eval"
125126
126127
Write-Host "Starting Lemonade Server..." -ForegroundColor Cyan
127128
@@ -162,6 +163,11 @@ jobs:
162163
exit 1
163164
}
164165
166+
# Test CLI
167+
Write-Host "Testing lemonade-eval CLI..."
168+
& $venvLemonade -m -i Llama-3.2-1B-Instruct-GGUF load bench -w 0 -i 5
169+
if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE }
170+
165171
try {
166172
Write-Host "Running server integration tests with Qwen3-4B-Instruct-2507-GGUF..." -ForegroundColor Cyan
167173
& $venvPython test/llm_api.py

README.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Contents:
1313
- [Export a Finetuned Model](#exporting-a-finetuned-model)
1414
- [LLM Report](#llm-report)
1515
- [Memory Usage](#memory-usage)
16+
- [Power Profiling](#power-profiling)
1617
- [System Information](#system-information)
1718

1819
## Overview
@@ -250,7 +251,7 @@ lemonade-eval -i Qwen3-4B-Instruct-2507-GGUF load bench
250251
The benchmark measures:
251252
- **Time to First Token (TTFT)**: Latency before first token is generated
252253
- **Tokens per Second**: Generation throughput
253-
- **Memory Usage**: Peak memory consumption (with `--memory` flag)
254+
- **Memory Usage**: Peak memory consumption (on Windows)
254255

255256
#### Options
256257

@@ -281,6 +282,18 @@ Results can be filtered by model name, device type, and data type:
281282
lemonade-eval report --perf --filter-model "Qwen"
282283
```
283284

285+
## Memory Usage
286+
287+
On Windows, memory usage of the inference server backend can be tracked with the `--memory` flag.
288+
For example:
289+
290+
```bash
291+
lemonade-eval --memory -i Llama-3.2-1B-Instruct-GGUF load bench
292+
```
293+
294+
This generates a PNG file that is stored in the current folder and the build folder. This file
295+
contains a figure plotting the memory usage of the inference backend over the `lemonade-eval`
296+
tool sequence. Learn more by running `lemonade-eval -h`.
284297

285298
## Power Profiling
286299

src/lemonade/cli.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
import os
2+
import platform
23

34
# pylint: disable=C0413
45
# Prevent HF warnings from showing on every import
56
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
67
from lemonade.version import __version__ as version_number
78
from lemonade.tools import FirstTool, NiceHelpFormatter
89

9-
# from lemonade.profilers.memory_tracker import MemoryTracker
10+
from lemonade.profilers.memory_tracker import MemoryTracker
1011
import lemonade.common.filesystem as fs
1112
import lemonade.common.cli_helpers as cli
1213
from lemonade.sequence import Sequence
@@ -21,9 +22,11 @@ def get_available_profilers(warn_missing=False):
2122
warn_missing: If True, print warnings for missing profilers. If False, fail silently.
2223
"""
2324

24-
# Temporarily disable memory profiling due to changes in lemonade architecture
25-
# profilers = [MemoryTracker]
26-
profilers = []
25+
# Allow memory profiling on Windows
26+
if platform.system() == "Windows":
27+
profilers = [MemoryTracker]
28+
else:
29+
profilers = []
2730

2831
try:
2932
from lemonade.profilers.hwinfo_power import HWINFOPowerProfiler

src/lemonade/profilers/memory_tracker.py

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -33,19 +33,23 @@ def add_arguments_to_parser(parser):
3333
)
3434

3535
@staticmethod
36-
def get_time_mem_list(process):
37-
return [time.time(), process.memory_info().rss]
36+
def get_time_mem_list(processes):
37+
if processes is None:
38+
return [time.time(), float("nan")]
39+
if len(processes) == 0:
40+
return [time.time(), 0]
41+
return [time.time(), sum([process.memory_info().rss for process in processes])]
3842

3943
def __init__(self, parser_arg_value):
4044
super().__init__()
4145
self.status_stats += [fs.Keys.MEMORY_USAGE_PLOT]
4246
self.track_memory_interval = parser_arg_value
43-
self.process_being_tracked = None
4447
self.build_dir = None
4548
self.queue = None
4649
self.tracker_process = None
4750
self.tracking_active = False
4851
self.yaml_path = None
52+
self.processes_being_tracked = None
4953

5054
def start(self, build_dir):
5155
if self.tracking_active:
@@ -54,10 +58,6 @@ def start(self, build_dir):
5458
# Save the folder where data and plot will be stored
5559
self.build_dir = build_dir
5660

57-
# Get the process being tracked
58-
track_pid = os.getpid()
59-
self.process_being_tracked = psutil.Process(track_pid)
60-
6161
# Create queue for passing messages to the tracker
6262
self.queue = Queue()
6363

@@ -68,21 +68,31 @@ def start(self, build_dir):
6868
self.tracker_process = Process(
6969
target=self._memory_tracker_,
7070
args=(
71-
track_pid,
7271
self.queue,
7372
self.yaml_path,
7473
self.track_memory_interval,
7574
),
7675
)
7776
self.tracker_process.start()
7877
self.tracking_active = True
78+
# Mark the start of the track and log a zero memory-usage sample
7979
self.set_label("start")
80-
self.sample()
80+
self.queue.put(MemoryTracker.get_time_mem_list([]))
81+
82+
def add_pid_to_track(self, pid):
83+
if self.tracking_active:
84+
self.processes_being_tracked.append(psutil.Process(pid))
85+
self.queue.put(pid)
8186

8287
def tool_starting(self, tool_name):
8388
self.set_label(tool_name)
8489

85-
def tool_stopping(self):
90+
def tool_stopping(self, state):
91+
# Check if the tool has added the inference_pids attribute to state
92+
if self.processes_being_tracked is None and hasattr(state, "inference_pids"):
93+
self.processes_being_tracked = []
94+
for pid in state.inference_pids:
95+
self.add_pid_to_track(pid)
8696
self.sample()
8797

8898
def set_label(self, label):
@@ -91,7 +101,12 @@ def set_label(self, label):
91101

92102
def sample(self):
93103
if self.tracking_active:
94-
self.queue.put(MemoryTracker.get_time_mem_list(self.process_being_tracked))
104+
if self.processes_being_tracked is None:
105+
self.queue.put(MemoryTracker.get_time_mem_list([]))
106+
else:
107+
self.queue.put(
108+
MemoryTracker.get_time_mem_list(self.processes_being_tracked)
109+
)
95110

96111
def stop(self):
97112
if self.tracking_active:
@@ -136,8 +151,8 @@ def generate_results(self, state, timestamp, _):
136151

137152
# last_t and last_y are used to draw a line between the last point of the prior
138153
# track and the first point of the current track
139-
last_t = None
140-
last_y = None
154+
last_t = 0
155+
last_y = track[-1][1]
141156

142157
plt.figure()
143158
for k, v in memory_tracks[1:]:
@@ -174,7 +189,6 @@ def generate_results(self, state, timestamp, _):
174189

175190
@staticmethod
176191
def _memory_tracker_(
177-
tracked_pid,
178192
input_queue: Queue,
179193
yaml_path: str,
180194
track_memory_interval: float,
@@ -191,17 +205,14 @@ def _memory_tracker_(
191205
3) None - This indicates that the tracker should stop tracking, save its data to a file
192206
and end
193207
"""
208+
tracked_processes = None
194209
memory_tracks = []
195210
current_track = []
196211
track_name = None
197212
tracker_exit = False
198213

199214
try:
200-
tracked_process = psutil.Process(tracked_pid)
201-
while (
202-
not tracker_exit and tracked_process.status() == psutil.STATUS_RUNNING
203-
):
204-
215+
while not tracker_exit:
205216
time.sleep(track_memory_interval)
206217

207218
# Read any messages from the parent process
@@ -227,6 +238,10 @@ def _memory_tracker_(
227238
"Track name must be passed to memory tracker prior to "
228239
"sending data"
229240
)
241+
elif isinstance(message, int):
242+
if tracked_processes is None:
243+
tracked_processes = []
244+
tracked_processes.append(psutil.Process(message))
230245
else:
231246
raise TypeError(
232247
"Unrecognized message type in memory_tracker input queue: "
@@ -240,7 +255,7 @@ def _memory_tracker_(
240255
if not tracker_exit and track_name is not None:
241256
# Save current time and memory usage
242257
current_track.append(
243-
MemoryTracker.get_time_mem_list(tracked_process)
258+
MemoryTracker.get_time_mem_list(tracked_processes)
244259
)
245260

246261
# Save the collected memory tracks

src/lemonade/profilers/profiler.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,10 @@ def tool_starting(self, tool_name):
3131
This method is called to inform the profiler of the name of the tool that is about to start.
3232
"""
3333

34-
def tool_stopping(self):
34+
def tool_stopping(self, state):
3535
"""
36-
This method is called to inform the profiler that the tool has finished.
36+
This method is called to inform the profiler that the tool has finished. The state is
37+
passed for the tool to gather any relevant data.
3738
"""
3839

3940
def stop(self):

src/lemonade/sequence.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ def launch(
288288

289289
# Inform profilers that tool has finished
290290
for profiler in self.profilers:
291-
profiler.tool_stopping()
291+
profiler.tool_stopping(state)
292292

293293
start_times["cool down"] = time.time()
294294

src/lemonade/tools/server_load.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,10 @@ def run(
364364
backend_str = self._get_backend_string(server_url, model_name)
365365
state.save_stat(Keys.BACKEND, backend_str)
366366

367+
# Look up the inference processes
368+
if platform.system() == "Windows":
369+
state.inference_pids = self._get_inference_pids(server_url)
370+
367371
# Create adapters for the loaded model
368372
state.model = ServerAdapter(
369373
server_url=server_url,
@@ -446,6 +450,60 @@ def _get_backend_string(self, server_url: str, model_name: str) -> str:
446450
printing.log_warning(f"Could not determine backend: {e}")
447451
return "Lemonade Server"
448452

453+
def _get_inference_pids(self, server_url):
454+
"""
455+
Extract the inference process ids from the server health response.
456+
457+
Returns:
458+
List of pids for the inference processes, or None if not applicable.
459+
"""
460+
try:
461+
health_response = requests.get(
462+
f"{server_url}/api/v1/health",
463+
timeout=10,
464+
)
465+
health_response.raise_for_status()
466+
health_result = health_response.json()
467+
except requests.exceptions.ConnectionError:
468+
raise ConnectionError(
469+
f"Cannot connect to Lemonade Server at {server_url}. "
470+
"Make sure the server is running with 'lemonade-server serve'."
471+
)
472+
except requests.exceptions.RequestException as e:
473+
raise ConnectionError(f"Error connecting to Lemonade Server: {e}")
474+
475+
# Extract the model_loaded info and find the associated backend_url
476+
ports = []
477+
for model_loaded in health_result.get("all_models_loaded", []):
478+
if model_loaded.get("model_name") == health_result.get("model_loaded"):
479+
backend_url = model_loaded.get("backend_url", "")
480+
if backend_url.startswith("http://127.0.0.1:"):
481+
# Local backend, extract port from backend_url, e.g., http://127.0.0.1:PORT/v1
482+
port = backend_url.split(":")[2].split("/")[0]
483+
ports.append(int(port))
484+
printing.log_info(
485+
f"Identified inference backend port {port} "
486+
f"for {model_loaded.get('model_name')}"
487+
)
488+
if not ports:
489+
return []
490+
inference_pids = []
491+
try:
492+
import psutil
493+
494+
connections = psutil.net_connections(kind="tcp4")
495+
for conn in connections:
496+
if conn.status == "LISTEN" and conn.laddr and conn.laddr.port in ports:
497+
inference_pids.append(conn.pid)
498+
printing.log_info(
499+
f"Identified process listening on port "
500+
f"{conn.laddr.port}: {conn.pid}"
501+
)
502+
except Exception: # pylint: disable=broad-exception-caught
503+
pass
504+
505+
return inference_pids
506+
449507

450508
# This file was originally licensed under Apache 2.0. It has been modified.
451509
# Modifications Copyright (c) 2025 AMD

0 commit comments

Comments
 (0)