Skip to content

Commit 3a92093

Browse files
authored
Re-enable memory profiler in new architecture (#5)
Also added CI test and updated documentation
1 parent e59d8f2 commit 3a92093

File tree

7 files changed

+124
-28
lines changed

7 files changed

+124
-28
lines changed

.github/workflows/test_lemonade_eval.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ jobs:
122122
$serverExe = "$installPath\bin\lemonade-server.exe"
123123
$logFile = "${{ github.workspace }}\lemonade-server.log"
124124
$venvPython = ".venv\Scripts\python.exe"
125+
$venvLemonade = ".venv\Scripts\lemonade-eval"
125126
126127
Write-Host "Starting Lemonade Server..." -ForegroundColor Cyan
127128
@@ -162,6 +163,11 @@ jobs:
162163
exit 1
163164
}
164165
166+
# Test CLI
167+
Write-Host "Testing lemonade-eval CLI..."
168+
& $venvLemonade -m -i Llama-3.2-1B-Instruct-GGUF load bench -w 0 -i 5
169+
if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE }
170+
165171
try {
166172
Write-Host "Running server integration tests with Qwen3-4B-Instruct-2507-GGUF..." -ForegroundColor Cyan
167173
& $venvPython test/llm_api.py

README.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Contents:
1313
- [Export a Finetuned Model](#exporting-a-finetuned-model)
1414
- [LLM Report](#llm-report)
1515
- [Memory Usage](#memory-usage)
16+
- [Power Profiling](#power-profiling)
1617
- [System Information](#system-information)
1718

1819
## Overview
@@ -250,7 +251,7 @@ lemonade-eval -i Qwen3-4B-Instruct-2507-GGUF load bench
250251
The benchmark measures:
251252
- **Time to First Token (TTFT)**: Latency before first token is generated
252253
- **Tokens per Second**: Generation throughput
253-
- **Memory Usage**: Peak memory consumption (with `--memory` flag)
254+
- **Memory Usage**: Peak memory consumption (on Windows)
254255

255256
#### Options
256257

@@ -281,6 +282,18 @@ Results can be filtered by model name, device type, and data type:
281282
lemonade-eval report --perf --filter-model "Qwen"
282283
```
283284

285+
## Memory Usage
286+
287+
On Windows, memory usage of the inference server backend can be tracked with the `--memory` flag.
288+
For example:
289+
290+
```bash
291+
lemonade-eval --memory -i Llama-3.2-1B-Instruct-GGUF load bench
292+
```
293+
294+
This generates a PNG file that is stored in the current folder and the build folder. This file
295+
contains a figure plotting the memory usage of the inference backend over the `lemonade-eval`
296+
tool sequence. Learn more by running `lemonade-eval -h`.
284297

285298
## Power Profiling
286299

src/lemonade/cli.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
import os
2+
import platform
23

34
# pylint: disable=C0413
45
# Prevent HF warnings from showing on every import
56
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
67
from lemonade.version import __version__ as version_number
78
from lemonade.tools import FirstTool, NiceHelpFormatter
89

9-
# from lemonade.profilers.memory_tracker import MemoryTracker
10+
from lemonade.profilers.memory_tracker import MemoryTracker
1011
import lemonade.common.filesystem as fs
1112
import lemonade.common.cli_helpers as cli
1213
from lemonade.sequence import Sequence
@@ -21,9 +22,11 @@ def get_available_profilers(warn_missing=False):
2122
warn_missing: If True, print warnings for missing profilers. If False, fail silently.
2223
"""
2324

24-
# Temporarily disable memory profiling due to changes in lemonade architecture
25-
# profilers = [MemoryTracker]
26-
profilers = []
25+
# Allow memory profiling on Windows
26+
if platform.system() == "Windows":
27+
profilers = [MemoryTracker]
28+
else:
29+
profilers = []
2730

2831
try:
2932
from lemonade.profilers.hwinfo_power import HWINFOPowerProfiler

src/lemonade/profilers/memory_tracker.py

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -33,19 +33,23 @@ def add_arguments_to_parser(parser):
3333
)
3434

3535
@staticmethod
36-
def get_time_mem_list(process):
37-
return [time.time(), process.memory_info().rss]
36+
def get_time_mem_list(processes):
37+
if processes is None:
38+
return [time.time(), float("nan")]
39+
if len(processes) == 0:
40+
return [time.time(), 0]
41+
return [time.time(), sum([process.memory_info().rss for process in processes])]
3842

3943
def __init__(self, parser_arg_value):
4044
super().__init__()
4145
self.status_stats += [fs.Keys.MEMORY_USAGE_PLOT]
4246
self.track_memory_interval = parser_arg_value
43-
self.process_being_tracked = None
4447
self.build_dir = None
4548
self.queue = None
4649
self.tracker_process = None
4750
self.tracking_active = False
4851
self.yaml_path = None
52+
self.processes_being_tracked = None
4953

5054
def start(self, build_dir):
5155
if self.tracking_active:
@@ -54,10 +58,6 @@ def start(self, build_dir):
5458
# Save the folder where data and plot will be stored
5559
self.build_dir = build_dir
5660

57-
# Get the process being tracked
58-
track_pid = os.getpid()
59-
self.process_being_tracked = psutil.Process(track_pid)
60-
6161
# Create queue for passing messages to the tracker
6262
self.queue = Queue()
6363

@@ -68,21 +68,31 @@ def start(self, build_dir):
6868
self.tracker_process = Process(
6969
target=self._memory_tracker_,
7070
args=(
71-
track_pid,
7271
self.queue,
7372
self.yaml_path,
7473
self.track_memory_interval,
7574
),
7675
)
7776
self.tracker_process.start()
7877
self.tracking_active = True
78+
# Mark the start of the track and log a zero memory-usage sample
7979
self.set_label("start")
80-
self.sample()
80+
self.queue.put(MemoryTracker.get_time_mem_list([]))
81+
82+
def add_pid_to_track(self, pid):
83+
if self.tracking_active:
84+
self.processes_being_tracked.append(psutil.Process(pid))
85+
self.queue.put(pid)
8186

8287
def tool_starting(self, tool_name):
8388
self.set_label(tool_name)
8489

85-
def tool_stopping(self):
90+
def tool_stopping(self, state):
91+
# Check if the tool has added the inference_pids attribute to state
92+
if self.processes_being_tracked is None and hasattr(state, "inference_pids"):
93+
self.processes_being_tracked = []
94+
for pid in state.inference_pids:
95+
self.add_pid_to_track(pid)
8696
self.sample()
8797

8898
def set_label(self, label):
@@ -91,7 +101,12 @@ def set_label(self, label):
91101

92102
def sample(self):
93103
if self.tracking_active:
94-
self.queue.put(MemoryTracker.get_time_mem_list(self.process_being_tracked))
104+
if self.processes_being_tracked is None:
105+
self.queue.put(MemoryTracker.get_time_mem_list([]))
106+
else:
107+
self.queue.put(
108+
MemoryTracker.get_time_mem_list(self.processes_being_tracked)
109+
)
95110

96111
def stop(self):
97112
if self.tracking_active:
@@ -136,8 +151,8 @@ def generate_results(self, state, timestamp, _):
136151

137152
# last_t and last_y are used to draw a line between the last point of the prior
138153
# track and the first point of the current track
139-
last_t = None
140-
last_y = None
154+
last_t = 0
155+
last_y = track[-1][1]
141156

142157
plt.figure()
143158
for k, v in memory_tracks[1:]:
@@ -174,7 +189,6 @@ def generate_results(self, state, timestamp, _):
174189

175190
@staticmethod
176191
def _memory_tracker_(
177-
tracked_pid,
178192
input_queue: Queue,
179193
yaml_path: str,
180194
track_memory_interval: float,
@@ -191,17 +205,14 @@ def _memory_tracker_(
191205
3) None - This indicates that the tracker should stop tracking, save its data to a file
192206
and end
193207
"""
208+
tracked_processes = None
194209
memory_tracks = []
195210
current_track = []
196211
track_name = None
197212
tracker_exit = False
198213

199214
try:
200-
tracked_process = psutil.Process(tracked_pid)
201-
while (
202-
not tracker_exit and tracked_process.status() == psutil.STATUS_RUNNING
203-
):
204-
215+
while not tracker_exit:
205216
time.sleep(track_memory_interval)
206217

207218
# Read any messages from the parent process
@@ -227,6 +238,10 @@ def _memory_tracker_(
227238
"Track name must be passed to memory tracker prior to "
228239
"sending data"
229240
)
241+
elif isinstance(message, int):
242+
if tracked_processes is None:
243+
tracked_processes = []
244+
tracked_processes.append(psutil.Process(message))
230245
else:
231246
raise TypeError(
232247
"Unrecognized message type in memory_tracker input queue: "
@@ -240,7 +255,7 @@ def _memory_tracker_(
240255
if not tracker_exit and track_name is not None:
241256
# Save current time and memory usage
242257
current_track.append(
243-
MemoryTracker.get_time_mem_list(tracked_process)
258+
MemoryTracker.get_time_mem_list(tracked_processes)
244259
)
245260

246261
# Save the collected memory tracks

src/lemonade/profilers/profiler.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,10 @@ def tool_starting(self, tool_name):
3131
This method is called to inform the profiler of the name of the tool that is about to start.
3232
"""
3333

34-
def tool_stopping(self):
34+
def tool_stopping(self, state):
3535
"""
36-
This method is called to inform the profiler that the tool has finished.
36+
This method is called to inform the profiler that the tool has finished. The state is
37+
passed for the tool to gather any relevant data.
3738
"""
3839

3940
def stop(self):

src/lemonade/sequence.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ def launch(
288288

289289
# Inform profilers that tool has finished
290290
for profiler in self.profilers:
291-
profiler.tool_stopping()
291+
profiler.tool_stopping(state)
292292

293293
start_times["cool down"] = time.time()
294294

src/lemonade/tools/server_load.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,10 @@ def run(
364364
backend_str = self._get_backend_string(server_url, model_name)
365365
state.save_stat(Keys.BACKEND, backend_str)
366366

367+
# Look up the inference processes
368+
if platform.system() == "Windows":
369+
state.inference_pids = self._get_inference_pids(server_url)
370+
367371
# Create adapters for the loaded model
368372
state.model = ServerAdapter(
369373
server_url=server_url,
@@ -446,6 +450,60 @@ def _get_backend_string(self, server_url: str, model_name: str) -> str:
446450
printing.log_warning(f"Could not determine backend: {e}")
447451
return "Lemonade Server"
448452

453+
def _get_inference_pids(self, server_url):
454+
"""
455+
Extract the inference process ids from the server health response.
456+
457+
Returns:
458+
List of pids for the inference processes, or None if not applicable.
459+
"""
460+
try:
461+
health_response = requests.get(
462+
f"{server_url}/api/v1/health",
463+
timeout=10,
464+
)
465+
health_response.raise_for_status()
466+
health_result = health_response.json()
467+
except requests.exceptions.ConnectionError:
468+
raise ConnectionError(
469+
f"Cannot connect to Lemonade Server at {server_url}. "
470+
"Make sure the server is running with 'lemonade-server serve'."
471+
)
472+
except requests.exceptions.RequestException as e:
473+
raise ConnectionError(f"Error connecting to Lemonade Server: {e}")
474+
475+
# Extract the model_loaded info and find the associated backend_url
476+
ports = []
477+
for model_loaded in health_result.get("all_models_loaded", []):
478+
if model_loaded.get("model_name") == health_result.get("model_loaded"):
479+
backend_url = model_loaded.get("backend_url", "")
480+
if backend_url.startswith("http://127.0.0.1:"):
481+
# Local backend, extract port from backend_url, e.g., http://127.0.0.1:PORT/v1
482+
port = backend_url.split(":")[2].split("/")[0]
483+
ports.append(int(port))
484+
printing.log_info(
485+
f"Identified inference backend port {port} "
486+
f"for {model_loaded.get('model_name')}"
487+
)
488+
if not ports:
489+
return []
490+
inference_pids = []
491+
try:
492+
import psutil
493+
494+
connections = psutil.net_connections(kind="tcp4")
495+
for conn in connections:
496+
if conn.status == "LISTEN" and conn.laddr and conn.laddr.port in ports:
497+
inference_pids.append(conn.pid)
498+
printing.log_info(
499+
f"Identified process listening on port "
500+
f"{conn.laddr.port}: {conn.pid}"
501+
)
502+
except Exception: # pylint: disable=broad-exception-caught
503+
pass
504+
505+
return inference_pids
506+
449507

450508
# This file was originally licensed under Apache 2.0. It has been modified.
451509
# Modifications Copyright (c) 2025 AMD

0 commit comments

Comments
 (0)