-
Notifications
You must be signed in to change notification settings - Fork 82
Open
Labels
Description
🐛 Describe the bug
Run:
analyzer = TraceAnalysis(trace_dir = "./output/profiler/tf_tracing_logs")
cuda_sequences_df = analyzer.get_frequent_cuda_kernel_sequences(
operator_name = "aten::linear",
output_dir = "/tmp/"
)
The call fails and reports:
File "/home/vscode/project/HolisticTraceAnalysis/hta/trace_analysis.py", line 282, in get_frequent_cuda_kernel_sequences
return CudaKernelAnalysis.get_frequent_cuda_kernel_sequences(
File "/home/vscode/project/HolisticTraceAnalysis/hta/analyzers/cuda_kernel_analysis.py", line 73, in get_frequent_cuda_kernel_sequences
cg = CallGraph(t, ranks=[rank])
File "/home/vscode/project/HolisticTraceAnalysis/hta/common/trace_call_graph.py", line 90, in __init__
self._construct_call_graph()
File "/home/vscode/project/HolisticTraceAnalysis/hta/common/trace_call_graph.py", line 143, in _construct_call_graph
self._build_call_stacks(df, self.trace_data.symbol_table, rank)
File "/home/vscode/project/HolisticTraceAnalysis/hta/common/trace_call_graph.py", line 209, in _build_call_stacks
csg = CallStackGraph(
File "/home/vscode/project/HolisticTraceAnalysis/hta/common/trace_call_stack.py", line 256, in __init__
self._construct_call_stack_graph(df)
File "/home/vscode/project/HolisticTraceAnalysis/hta/common/trace_call_stack.py", line 364, in _construct_call_stack_graph
sort_events(events)
File "/home/vscode/project/HolisticTraceAnalysis/hta/common/trace_call_stack.py", line 110, in sort_events
a[:] = sorted(a.tolist(), key=cmp_to_key(_less_than_cmp))
File "/home/vscode/project/HolisticTraceAnalysis/hta/common/trace_call_stack.py", line 108, in _less_than_cmp
return -1 if _less_than(x, y) else 1
File "/home/vscode/project/HolisticTraceAnalysis/hta/common/trace_call_stack.py", line 75, in _less_than
return _cmp_events_with_zero_duration(x, y)
File "/home/vscode/project/HolisticTraceAnalysis/hta/common/trace_call_stack.py", line 56, in _cmp_events_with_zero_duration
raise ValueError(f"Unexpected case: {x} {y}")
ValueError: Unexpected case: [2123.0, -1.0, 1.0, 357.0] [2122.0, 0.0, 1.0, 357.0]
Steps to reproduce
Use the trace file:
ae9027adea69_161834.1748945322925280553.pt.trace.zip
Expected behavior
I expect the call to succeed. If the trace file itself is the problem, please let me know what is wrong with it and what may have caused it.
Environment
Ubuntu 20.04.6 LTS
Python 3.9.21
HTA version: v0.5.0 (from source)
Additional Info
The trace file was created by the following script:
import torch
import torch.nn as nn
import torch.optim as optim
x = torch.randn(10000, 1)
y = 2 * x + 1 + 0.1 * torch.randn(10000, 1)
model = nn.Sequential(
nn.Linear(1, 64),
nn.ReLU(),
nn.Linear(64, 64),
nn.ReLU(),
nn.Linear(64, 1)
)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.device_count() > 1:
model = nn.DataParallel(model)
x = x.to(device)
y = y.to(device)
model = model.to(device)
with torch.profiler.profile(
schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
on_trace_ready=torch.profiler.tensorboard_trace_handler('./output/profiler/tf_tracing_logs'),
record_shapes=True,
profile_memory=True,
with_stack=True
) as prof:
for epoch in range(100):
prof.step()
optimizer.zero_grad()
outputs = model(x)
loss = criterion(outputs, y)
loss.backward()
optimizer.step()
if (epoch+1) % 10 == 0:
print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')
if isinstance(model, nn.DataParallel):
first_layer = model.module[0]
else:
first_layer = model[0]
print('Trained weight:', first_layer.weight.data.cpu().numpy())
print('Trained bias:', first_layer.bias.data.cpu().numpy())
Reactions are currently unavailable