-
Notifications
You must be signed in to change notification settings - Fork 77
Initial POC for perfspect integration #959
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 3 commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
43d8eb6
Initial POC for perfspect integration
mlim19 6177fd1
Fix python linter issues
mlim19 ddbb9c8
Remove the existing files when collection starts
mlim19 e29cc6f
Revised based on review comments
mlim19 e47123c
Merge branch 'master' into enable_perfspect_html
mlim19 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,194 @@ | ||
| import base64 | ||
| import gzip | ||
| import os | ||
| import platform | ||
| import shutil | ||
| import subprocess | ||
| from abc import ABCMeta, abstractmethod | ||
| from dataclasses import dataclass | ||
| from pathlib import Path | ||
| from threading import Event, RLock, Thread | ||
| from typing import Optional | ||
|
|
||
| DEFAULT_POLLING_INTERVAL_SECONDS = 5 | ||
| STOP_TIMEOUT_SECONDS = 2 | ||
| PERFSPECT_DATA_DIRECTORY = "/tmp/perfspect_data" | ||
|
|
||
|
|
||
| @dataclass | ||
| class HWMetrics: | ||
| # HW metrics data in json format | ||
| metrics_data: Optional[dict] | ||
| # base64 encoded HTML data | ||
| metrics_html: Optional[str] | ||
|
|
||
|
|
||
| class HWMetricsMonitorBase(metaclass=ABCMeta): | ||
| @abstractmethod | ||
| def start(self) -> None: | ||
| pass | ||
|
|
||
| @abstractmethod | ||
| def stop(self) -> None: | ||
| pass | ||
|
|
||
| @abstractmethod | ||
| def _get_hw_metrics_dict(self) -> Optional[dict]: | ||
| """ | ||
| Returns the HW metrics in dictionary data structure | ||
| """ | ||
| raise NotImplementedError | ||
|
|
||
| @abstractmethod | ||
| def _get_hw_metrics_html(self) -> Optional[str]: | ||
| """ | ||
| Returns the base64 encoded string with HW metrics in HTML format | ||
| """ | ||
| raise NotImplementedError | ||
|
|
||
| def get_hw_metrics(self) -> HWMetrics: | ||
| return HWMetrics(self._get_hw_metrics_dict(), self._get_hw_metrics_html()) | ||
|
|
||
|
|
||
| class HWMetricsMonitor(HWMetricsMonitorBase): | ||
| def __init__( | ||
| self, | ||
| stop_event: Event, | ||
| perfspect_path: Optional[Path] = None, | ||
| perfspect_duration: int = 60, | ||
| polling_rate_seconds: int = DEFAULT_POLLING_INTERVAL_SECONDS, | ||
| ): | ||
| self._polling_rate_seconds = polling_rate_seconds | ||
| self._stop_event = stop_event | ||
| self._thread: Optional[Thread] = None | ||
| self._lock = RLock() | ||
| self._ps_process: Optional[subprocess.Popen[bytes]] = None | ||
| self._perfspect_path: Optional[Path] = perfspect_path | ||
| self._perfspect_duration = perfspect_duration | ||
|
|
||
| self._ps_raw_csv_filename = PERFSPECT_DATA_DIRECTORY + "/" + platform.node() + "_metrics.csv" | ||
| self._ps_summary_csv_filename = PERFSPECT_DATA_DIRECTORY + "/" + platform.node() + "_metrics_summary.csv" | ||
| self._ps_summary_html_filename = PERFSPECT_DATA_DIRECTORY + "/" + platform.node() + "_metrics_summary.html" | ||
| self._ps_latest_csv_filename = PERFSPECT_DATA_DIRECTORY + "/" + platform.node() + "_metrics_summary_latest.csv" | ||
| self._ps_latest_html_filename = ( | ||
| PERFSPECT_DATA_DIRECTORY + "/" + platform.node() + "_metrics_summary_latest.html" | ||
| ) | ||
|
|
||
| if not os.path.exists(PERFSPECT_DATA_DIRECTORY): | ||
| os.makedirs(PERFSPECT_DATA_DIRECTORY) | ||
| else: | ||
| if os.path.exists(self._ps_raw_csv_filename): | ||
| os.remove(self._ps_raw_csv_filename) | ||
| if os.path.exists(self._ps_summary_csv_filename): | ||
| os.remove(self._ps_summary_csv_filename) | ||
| if os.path.exists(self._ps_summary_html_filename): | ||
| os.remove(self._ps_summary_html_filename) | ||
|
|
||
| def start(self) -> None: | ||
| # assert self._thread is None, "HWMetricsMonitor is already running" | ||
| # assert not self._stop_event.is_set(), "Stop condition is already set (perhaps gProfiler was already stopped?)" | ||
|
|
||
| if self._perfspect_path is None: | ||
| return None | ||
|
|
||
| ps_cmd = [ | ||
| str(self._perfspect_path), | ||
| "metrics", | ||
| # "--metrics", | ||
|
mlim19 marked this conversation as resolved.
Outdated
|
||
| # '"CPU operating frequency (in GHz)","CPI","TMA_Frontend_Bound(%)","TMA_Bad_Speculation(%)",' | ||
| # '"TMA_Backend_Bound(%)","TMA_Retiring(%)"', | ||
| "--duration", | ||
| str(self._perfspect_duration), | ||
| # "--live", | ||
| # "--format", | ||
| # "csv", | ||
| # "--interval", | ||
| # "10", | ||
| "--output", | ||
| PERFSPECT_DATA_DIRECTORY, | ||
| ] | ||
|
|
||
| self._ps_process = subprocess.Popen(ps_cmd, stdout=subprocess.PIPE) | ||
| # ps_stdout, ps_stderr = ps_process.communicate() | ||
|
mlim19 marked this conversation as resolved.
Outdated
|
||
| # try: | ||
| # # wait 2 seconds to ensure it starts | ||
| # ps_process.wait(2) | ||
| # except subprocess.TimeoutExpired: | ||
| # pass | ||
| # else: | ||
| # raise Exception(f"Command {ps_cmd} exited unexpectedly with {ps_process.returncode}") | ||
| # self._thread = Thread(target=self._continuously_poll_perfspect, args=(self._polling_rate_seconds,)) | ||
| # self._thread.start() | ||
|
|
||
| def stop(self) -> None: | ||
|
mlim19 marked this conversation as resolved.
|
||
| # assert self._thread is not None, "HWMetricsMonitor is not running" | ||
| # assert self._stop_event.is_set(), "Stop event was not set before stopping the HWMetricsMonitor" | ||
| if self._ps_process: | ||
| self._ps_process.terminate() | ||
| # self._thread.join(STOP_TIMEOUT_SECONDS) | ||
|
mlim19 marked this conversation as resolved.
Outdated
|
||
| # if self._thread.is_alive(): | ||
| # raise ThreadStopTimeoutError("Timed out while waiting for the HWMetricsMonitor internal thread to stop") | ||
| self._thread = None | ||
|
|
||
| def _get_hw_metrics_dict(self) -> Optional[dict]: | ||
| summary_dict = {} | ||
| if os.path.exists(self._ps_summary_csv_filename) and os.path.isfile(self._ps_summary_csv_filename): | ||
| shutil.copy(self._ps_summary_csv_filename, self._ps_latest_csv_filename) | ||
| with open(self._ps_latest_csv_filename, "r") as f: | ||
| next(f) # Skip the first line | ||
| for line in f: | ||
| csv_data = line.split(",") | ||
| summary_dict[csv_data[0]] = csv_data[1] | ||
|
|
||
| # For debug, print the json string here | ||
|
mlim19 marked this conversation as resolved.
Outdated
|
||
| # summary_json_string = json.dumps(summary_dict) | ||
| # print(summary_json_string, file=sys.stderr) | ||
| os.remove(self._ps_latest_csv_filename) | ||
| return summary_dict | ||
|
|
||
| else: | ||
| return None | ||
|
|
||
| def _get_hw_metrics_html(self) -> Optional[str]: | ||
| if os.path.exists(self._ps_summary_html_filename) and os.path.isfile(self._ps_summary_html_filename): | ||
| encoded_html_data = None | ||
| shutil.copy(self._ps_summary_html_filename, self._ps_latest_html_filename) | ||
| with open(self._ps_latest_html_filename, "rb") as f: | ||
| html_data = f.read() | ||
| # Compress the HTML data using gzip | ||
| compressed_html_data = gzip.compress(html_data) | ||
|
|
||
| # For debug, save the compressed HTML data to a file | ||
| # compressed_html_filename = self._ps_latest_html_filename + ".gz" | ||
| # with open(compressed_html_filename, "wb") as compressed_html_file: | ||
| # compressed_html_file.write(compressed_html_data) | ||
| # compressed_html_file.close() | ||
|
|
||
| # Encode the compressed HTML data to base64 | ||
| encoded_html_data = base64.b64encode(compressed_html_data).decode("utf-8") | ||
|
|
||
| # For debug, save the base64 encoded HTML data to a file | ||
| # encoded_html_filename = self._ps_latest_html_filename + ".b64" | ||
| # with open(encoded_html_filename, "w") as encoded_html_file: | ||
| # encoded_html_file.write(encoded_html_data) | ||
| # encoded_html_file.close() | ||
|
|
||
| os.remove(self._ps_latest_html_filename) | ||
| return encoded_html_data | ||
|
|
||
| else: | ||
| return None | ||
|
|
||
|
|
||
| class NoopHWMetricsMonitor(HWMetricsMonitorBase): | ||
| def start(self) -> None: | ||
| pass | ||
|
|
||
| def stop(self) -> None: | ||
| pass | ||
|
|
||
| def _get_hw_metrics_dict(self) -> Optional[dict]: | ||
| return None | ||
|
|
||
| def _get_hw_metrics_html(self) -> Optional[str]: | ||
| return None | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.