Skip to content

Commit 1249d76

Browse files
jamesjwu authored and facebook-github-bot committed
Log PT2 chromium events to scuba
Summary:
X-link: pytorch/pytorch#133859

This diff implements a set of views for internal Scuba viewing.

TODOs that I might defer to a follow-up diff:
- Saving cache stats via the global counters is questionable here, but there is currently no good way to track "fx graph cache hit for this compile phase". Will think about this more.
- We should definitely log the frame id, compile id, etc.
- We should definitely log configs, so that we can A/B test based on whether a config is turned on.
- It is not yet clear what compile_uuid should ultimately be, but it is useful when you want to look at samples for a single run. If we had MAST job info this field would not be needed, but it is nice to be able to drill down to a single run and get its chrome trace view or icicle view.

Differential Revision: D61392607
1 parent 64130f1 commit 1249d76

File tree

1 file changed

+72
-10
lines changed
  • userbenchmark/dynamo/dynamobench/_dynamo

1 file changed

+72
-10
lines changed

userbenchmark/dynamo/dynamobench/_dynamo/utils.py

+72-10
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import enum
1212
import functools
1313
import gc
14+
import uuid
1415
import importlib
1516
import inspect
1617
import itertools
@@ -64,7 +65,7 @@
6465
from torch._dispatch.python import enable_python_dispatcher
6566
from torch._guards import TracingContext
6667
from torch._subclasses.meta_utils import is_sparse_compressed
67-
from torch._utils_internal import log_compilation_event
68+
from torch._utils_internal import log_compilation_event, log_chromium_event_internal
6869
from torch.fx._utils import _format_graph_code, lazy_format_graph_code
6970
from torch.nn.modules.lazy import LazyModuleMixin
7071
from torch.utils._triton import has_triton, has_triton_package
@@ -212,6 +213,15 @@ def _add_time_spent(key: str, phase_name: str, time_spent: float) -> None:
212213
frame_phase_timing[key][phase_name] += time_spent
213214

214215

216+
def get_cache_stats() -> Dict[str, Any]:
    """Collect the FX graph cache counters for use as chromium event metadata.

    Returns:
        A dict holding the values of the "fxgraph_cache_hit",
        "fxgraph_cache_miss" and "fxgraph_cache_bypass" entries of
        ``counters["inductor"]`` as read at call time.
    """
    inductor_counters = counters["inductor"]
    stat_names = (
        "fxgraph_cache_hit",
        "fxgraph_cache_miss",
        "fxgraph_cache_bypass",
    )
    return {stat: inductor_counters[stat] for stat in stat_names}
224+
215225
# dynamo_timed is a context manager
216226
# By wrapping a function in dynamo_timed, we can store a record in compilation_time_metrics
217227
# where the key is the functions name.
@@ -251,16 +261,20 @@ def dynamo_timed(
251261
fail_type: Optional[str] = None
252262
fail_reason: Optional[str] = None
253263
time_spent = float("-inf")
264+
if phase_name == "entire_frame_compile":
265+
reset_chromium_events()
254266
try:
255267
with torch.profiler.record_function(f"{key} (dynamo_timed)"):
256268
t0 = time.time()
257-
ChromiumEventLogger.log_event_start(key, time.time_ns())
269+
start = time.time_ns()
270+
ChromiumEventLogger.log_event_start(key, start, None)
258271
if phase_name:
259-
ChromiumEventLogger.log_event_start(phase_name, time.time_ns())
272+
ChromiumEventLogger.log_event_start(phase_name, start)
260273
yield
274+
261275
if phase_name:
262-
ChromiumEventLogger.log_event_end(phase_name, time.time_ns())
263-
ChromiumEventLogger.log_event_end(key, time.time_ns())
276+
ChromiumEventLogger.log_event_end(phase_name, time.time_ns(), {"cache_stats": get_cache_stats()}, start)
277+
ChromiumEventLogger.log_event_end(key, time.time_ns(), {"cache_stats": get_cache_stats()}, start)
264278
time_spent = time.time() - t0
265279
compilation_time_metrics[key].append(time_spent)
266280
except Exception as e:
@@ -807,6 +821,18 @@ def get_compilation_metrics() -> List[Union[CompilationMetrics, BwdCompilationMe
807821
return list(_compilation_metrics)
808822

809823

824+
chromium_event_stack = ["__start__"]
825+
# Generate a unique id for this process, which we can use in scuba to filter down
826+
# to a single python run.
827+
# TODO: figure out when this should actually be reset
828+
compile_unique_id = str(uuid.uuid4())
829+
830+
def reset_chromium_events() -> None:
    """Rebind the module-level chromium event stack to a fresh list.

    After the call, ``chromium_event_stack`` is a new list containing only
    the "__start__" entry; the previously bound list object is left untouched.
    """
    global chromium_event_stack
    fresh_stack = ["__start__"]
    chromium_event_stack = fresh_stack
833+
834+
835+
810836
class ChromiumEventLogger:
811837
"""Logs chromium events to structured logs. tlparse will concatenate these into a perfetto UI link.
812838
@@ -826,18 +852,22 @@ def log_event_start(
826852
:param time_ns Timestamp in nanoseconds
827853
:param metadata: Any extra metadata associated with this event
828854
"""
829-
ChromiumEventLogger._log_timed_event(
855+
global chromium_event_stack
856+
event = ChromiumEventLogger._log_timed_event(
830857
event_name,
831858
time_ns,
832859
"B",
833860
metadata,
834861
)
862+
log_chromium_event_internal(event, chromium_event_stack, compile_unique_id)
863+
chromium_event_stack.append(event_name)
835864

836865
@staticmethod
837866
def log_event_end(
838867
event_name: str,
839868
time_ns: int,
840869
metadata: Optional[Dict[str, Any]] = None,
870+
start_time_ns: Optional[int] = None,
841871
) -> None:
842872
"""
843873
Logs the end of a single event. This function should only be
@@ -846,28 +876,53 @@ def log_event_end(
846876
:param time_ns: Timestamp in nanoseconds
847877
:param metadata: Any extra metadata associated with this event
848878
"""
849-
ChromiumEventLogger._log_timed_event(
879+
global chromium_event_stack
880+
# These stack health checks currently never happen,
881+
# but they're written this way to future proof any weird event
882+
# overlaps in the future.
883+
if (event_name not in chromium_event_stack):
884+
# Something went wrong, we never called start on this event,
885+
# or it was skipped due to overlapping events below
886+
log.warn("Start event not in stack, ignoring")
887+
return
888+
889+
event = ChromiumEventLogger._log_timed_event(
850890
event_name,
851891
time_ns,
852892
"E",
853893
metadata,
854894
)
855895

896+
while event_name != chromium_event_stack[-1]:
897+
# If the event isn't the most recent one to end, pop
898+
# off the stack until it is.
899+
# Since event_name in chromium_event_stack, this pop is always safe
900+
log.warn("Detected overlapping events, fixing stack")
901+
chromium_event_stack.pop()
902+
903+
log_chromium_event_internal(event, chromium_event_stack, compile_unique_id, start_time_ns)
904+
# Finally pop the actual event off the stack
905+
chromium_event_stack.pop()
906+
907+
856908
@staticmethod
857909
def _log_timed_event(
858910
event_name: str,
859911
time_ns: int,
860912
phase: str,
861913
metadata: Optional[Dict[str, Any]] = None,
862-
) -> None:
914+
) -> Dict[str, Any]:
863915
"""
864916
Logs a timed event in chromium format. See log_event_start, log_event_end, etc.
865917
"""
866918
event = {
867919
"name": event_name,
868-
"ts": time_ns / 1000, # Chromium events are in ms
920+
"ts": time_ns / 1000, # Chromium events are in microseconds
869921
"args": metadata,
870922
"ph": phase,
923+
# These categories are needed in all chromium traces
924+
"cat": "dynamo_timed",
925+
"tid": 0,
871926
"pid": 0, # pid should be specified on all logs, we don't personally care about the actual process id
872927
}
873928
torch._logging.trace_structured(
@@ -876,6 +931,7 @@ def _log_timed_event(
876931
suppress_context=False,
877932
expect_trace_id=False, # Not every chromium event will have a trace_id
878933
)
934+
return event
879935

880936
@staticmethod
881937
def log_instant_event(
@@ -895,7 +951,10 @@ def log_instant_event(
895951
"ts": time_ns / 1000,
896952
"args": metadata,
897953
"ph": "i",
898-
"pid": 0, # pid should be specified on all logs, we don't personally care about the actual process id
954+
# These categories are needed in all chromium traces
955+
"cat": "dynamo_timed",
956+
"tid": 0,
957+
"pid": 0,
899958
"s": "p", # We use "process" level instant events so they all appear on the same row in the trace.
900959
}
901960
torch._logging.trace_structured(
@@ -904,6 +963,9 @@ def log_instant_event(
904963
suppress_context=False,
905964
expect_trace_id=True,
906965
)
966+
# Log an instant event with the same start and end time
967+
log_chromium_event_internal(event, chromium_event_stack, compile_unique_id)
968+
907969

908970

909971
@dataclasses.dataclass

0 commit comments

Comments
 (0)