Skip to content
Merged
Show file tree
Hide file tree
Changes from 51 commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
41b319e
Refactor CuptiProfiler and related components for improved metric han…
Jokeren Feb 8, 2026
fbad03a
Fix call stack resizing in convertToTimelineTrace to include all cont…
Jokeren Feb 8, 2026
756cce7
Update parentContextId logic in addOp to use event contextId when not…
Jokeren Feb 9, 2026
5cc4984
Fix call stack resizing in convertToTimelineTrace to exclude the last…
Jokeren Feb 9, 2026
83df444
Refactor code for improved readability and consistency in Data and Tr…
Jokeren Feb 9, 2026
e76182f
Remove linkedId from DataEntry and update linkOp methods in TraceData…
Jokeren Feb 9, 2026
a170f0f
Change linkedTargetMetrics and linkedTargetFlexibleMetrics to use uno…
Jokeren Feb 9, 2026
7f95941
Refactor linkOp method to accept multiple targetEntryIds and a callba…
Jokeren Feb 9, 2026
0d43850
Refactor DataLaunchState to use pointers for node states and update r…
Jokeren Feb 9, 2026
86b6709
Refactor dumpHatchet and related methods to eliminate unnecessary clo…
Jokeren Feb 9, 2026
a7efde4
Refactor linkOp methods in Data, TraceData, and TreeData classes to a…
Jokeren Feb 9, 2026
97f9742
Refactor GraphState and CuptiProfiler to replace NodeState with NodeL…
Jokeren Feb 9, 2026
424f48f
Optimize linkOp method in TreeData to reserve space for linked target…
Jokeren Feb 10, 2026
75ff32d
Refactor linkOp method signatures in Data, TraceData, and TreeData cl…
Jokeren Feb 10, 2026
dbb43e8
Merge branch 'main' into keren/proton-shadow-tree
Jokeren Feb 10, 2026
8b1502e
Refactor metric handling in Graph and CuptiProfiler to improve clarit…
Jokeren Feb 10, 2026
c80e1f8
Refactor DataEntry handling in multiple profilers to use std::vector …
Jokeren Feb 10, 2026
68d06b8
Refactor GPU and Graph profilers to streamline DataEntry handling and…
Jokeren Feb 10, 2026
07405b4
Refactor function signatures and formatting in GPUProfiler, CuptiPCSa…
Jokeren Feb 10, 2026
ae17872
Refactor linkOp method signatures in Data, TraceData, and TreeData cl…
Jokeren Feb 10, 2026
d5d4088
Refactor formatting in Graph, CuptiProfiler, and RoctracerProfiler fo…
Jokeren Feb 10, 2026
6cac952
Refactor dataToNodeStates structure in GraphState to use unordered_ma…
Jokeren Feb 10, 2026
b8e4435
Refactor GraphState and CuptiProfiler to simplify NodeState structure…
Jokeren Feb 10, 2026
824eec9
Refactor GraphState NodeState representation and update CuptiProfiler…
Jokeren Feb 10, 2026
be908ae
Refactor NodeState and GraphState handling in CuptiProfiler for impro…
Jokeren Feb 10, 2026
a87f566
Refactor DataEntry structure and update metric handling for improved …
Jokeren Feb 10, 2026
857d670
Refactor DataEntry constructor calls for improved readability and con…
Jokeren Feb 10, 2026
526263f
Refactor upsertMetric calls to remove unnecessary withLock parameter …
Jokeren Feb 10, 2026
f52e81a
Refactor DataEntry and related classes to consolidate metric handling…
Jokeren Feb 10, 2026
0bc81d5
Refactor InstrumentationProfiler to unify data entry handling by rena…
Jokeren Feb 10, 2026
a53a59c
Remove unnecessary lock guard from handle function in DataEntry for i…
Jokeren Feb 10, 2026
840210e
Try
Jokeren Feb 10, 2026
ad72352
Refactor DataEntry handling in Graph.cpp to simplify upsertFlexibleMe…
Jokeren Feb 11, 2026
028ae42
Try
Jokeren Feb 11, 2026
69b91c8
Refactor DataEntry and Graph handling in Profiler to improve clarity …
Jokeren Feb 11, 2026
2303c91
Remove redundant variable declaration in handleGraphResourceCallbacks…
Jokeren Feb 11, 2026
6eba17c
Update
Jokeren Feb 11, 2026
a67b178
Update
Jokeren Feb 11, 2026
cda2bd6
Update
Jokeren Feb 11, 2026
414664b
Update
Jokeren Feb 11, 2026
7f03251
Update
Jokeren Feb 11, 2026
ddbe753
Update
Jokeren Feb 11, 2026
93b7a9e
Refactors DataEntry upserts and virtual-phase linking
Jokeren Feb 11, 2026
46b6d4d
Improve code formatting and line wrapping
Jokeren Feb 11, 2026
b47ba2a
Fix phase attribution for child and graph entries
Jokeren Feb 11, 2026
184e35e
Rename target metric API to 'linked'
Jokeren Feb 11, 2026
7ed69f7
Add upsertLinkedFlexibleMetric methods to DataEntry for linked metric…
Jokeren Feb 11, 2026
780f4d7
Refactor upsertLinkedFlexibleMetric and upsertLinkedFlexibleMetrics f…
Jokeren Feb 11, 2026
45746f2
Enhance DataEntry and metric structures with linked metrics support a…
Jokeren Feb 11, 2026
ed716b3
Refactor DataEntry and NodeStatus structures for improved clarity and…
Jokeren Feb 11, 2026
b72a092
Merge branch 'main' into keren/proton-shadow-tree
Jokeren Feb 11, 2026
bf0f438
Refactor DataEntry handling to use DataToEntryMap for improved organi…
Jokeren Feb 12, 2026
80931e5
Refactor CuptiProfiler to modularize graph node entry processing and …
Jokeren Feb 12, 2026
dfa32a3
Refactor CuptiPCSampling and CuptiProfiler for improved readability a…
Jokeren Feb 12, 2026
a1c42c1
Merge branch 'keren/proton-shadow-tree' of github.com:triton-lang/tri…
Jokeren Feb 13, 2026
351d427
Merge branch 'main' into keren/proton-shadow-tree
Jokeren Feb 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 65 additions & 39 deletions third_party/proton/csrc/include/Data/Data.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "PhaseStore.h"
#include <atomic>
#include <cstdint>
#include <functional>
#include <limits>
#include <map>
#include <memory>
Expand All @@ -15,48 +16,76 @@
#include <shared_mutex>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace proton {

/// Identifies an output format for profile data (see `parseOutputFormat` and
/// `outputFormatToString`).
/// NOTE(review): `Count` looks like a sentinel for the number of real formats
/// rather than a format itself — confirm before relying on it.
enum class OutputFormat { Hatchet, HatchetMsgPack, ChromeTrace, Count };

class Data;

/// An "entry" is a data specific unit of operation, e.g., a node in a tree
/// data structure or an event in a trace data structure.
struct DataEntry {
/// `entryId` is a unique identifier for the entry in the data.
using MetricMap = std::map<MetricKind, std::unique_ptr<Metric>>;
using FlexibleMetricMap = std::map<std::string, FlexibleMetric>;
using LinkedMetricMap = std::unordered_map<size_t, MetricMap>;
using LinkedFlexibleMetricMap = std::unordered_map<size_t, FlexibleMetricMap>;
struct MetricSet {
// Direct metrics associated with this entry.
MetricMap metrics{};
// Direct flexible metrics associated with this entry.
FlexibleMetricMap flexibleMetrics{};
// Metrics associated with linked entries.
LinkedMetricMap linkedMetrics{};
// Flexible metrics associated with linked entries.
LinkedFlexibleMetricMap linkedFlexibleMetrics{};
};

/// `id` is a unique identifier for the entry in the data.
/// When `phase` is a virtual phase, `id` refers to the linked entry id
/// for the node entry.
size_t id{Scope::DummyScopeId};
/// `phase` indicates which phase the entry belongs to.
size_t phase{0};
/// `metrics` is a map from metric kind to metric accumulator associated
/// with the entry.
/// Flexible metrics cannot be stored here directly since they may be added by
/// both the frontend and the backend.
/// Use the `Data::addMetrics` overloads to add flexible
/// metrics.
std::reference_wrapper<std::map<MetricKind, std::unique_ptr<Metric>>> metrics;

explicit DataEntry(size_t id, size_t phase,
std::map<MetricKind, std::unique_ptr<Metric>> &metrics)
: id(id), phase(phase), metrics(metrics) {}

void upsertMetric(std::unique_ptr<Metric> metric) {
if (!metric)
return;
auto &metricsMap = metrics.get();
auto it = metricsMap.find(metric->getKind());
if (it == metricsMap.end()) {
metricsMap.emplace(metric->getKind(), std::move(metric));
} else {
it->second->updateMetric(*metric);
}
}
/// `data` points to the owning data object for this entry.
Data *data{nullptr};
/// Per-entry storage for direct and linked metric maps.
std::reference_wrapper<MetricSet> metricSet;

explicit DataEntry(size_t id, size_t phase, Data *data, MetricSet &metricSet)
: id(id), phase(phase), data(data), metricSet(metricSet) {}

void upsertMetric(std::unique_ptr<Metric> metric) const;

void upsertLinkedMetric(std::unique_ptr<Metric> metric,
size_t linkedId) const;

void upsertFlexibleMetric(const std::string &metricName,
const MetricValueType &metricValue) const;

void upsertFlexibleMetrics(
const std::map<std::string, MetricValueType> &metrics) const;

void upsertLinkedFlexibleMetric(const std::string &metricName,
const MetricValueType &metricValue,
size_t linkedId) const;

void upsertLinkedFlexibleMetrics(
const std::map<std::string, MetricValueType> &metrics,
size_t linkedId) const;
};

class Data : public ScopeInterface {
public:
static constexpr size_t kNoCompletePhase = std::numeric_limits<size_t>::max();
// A special phase used for static/captured graph metadata.
static constexpr size_t kVirtualPhase =
std::numeric_limits<size_t>::max() - 1;
// Sentinel root id used when adding an op from the root.
static constexpr size_t kRootEntryId = Scope::DummyScopeId;

struct PhaseInfo {
size_t current{0};
Expand All @@ -67,7 +96,7 @@ class Data : public ScopeInterface {
}
};

Data(const std::string &path, ContextSource *contextSource = nullptr)
Data(const std::string &path, ContextSource *contextSource)
: path(path), contextSource(contextSource) {}
virtual ~Data() = default;

Expand Down Expand Up @@ -100,7 +129,7 @@ class Data : public ScopeInterface {
/// If `opName` is empty, just use the current context as is.
/// Otherwise obtain the current context and append `opName` to it. Return the
/// `DataEntry` of the added op.
virtual DataEntry addOp(const std::string &opName = {}) = 0;
DataEntry addOp(const std::string &opName = {});

/// Add an op with custom contexts to the data.
/// This is often used when context source is not available or when
Expand All @@ -124,17 +153,6 @@ class Data : public ScopeInterface {
addMetrics(size_t scopeId,
const std::map<std::string, MetricValueType> &metrics) = 0;

/// Record a batch of named metrics for an entry.
///
/// This is primarily intended for user-defined metrics defined in Python and
/// added lazily by the backend profiler.
/// `metrics` is a map from metric name to value to be applied to `entryId`.
///
/// The same as `addOp`, `phase` is important for asynchronous profilers.
virtual void
addMetrics(size_t phase, size_t entryId,
const std::map<std::string, MetricValueType> &metrics) = 0;

/// To Json
virtual std::string toJsonString(size_t phase) const = 0;

Expand Down Expand Up @@ -172,6 +190,16 @@ class Data : public ScopeInterface {
return lock;
}

[[nodiscard]] std::unique_lock<std::shared_mutex>
lockIfCurrentOrStaticPhase(size_t phase) {
std::unique_lock<std::shared_mutex> lock(mutex, std::defer_lock);
const auto currentPhaseValue = currentPhase.load(std::memory_order_relaxed);
if (phase == currentPhaseValue || phase == kVirtualPhase) {
lock.lock();
}
return lock;
}

std::atomic<std::size_t> currentPhase{0};
std::size_t completeUpToPhase{kNoCompletePhase};
std::set<size_t> activePhases{};
Expand All @@ -185,8 +213,6 @@ class Data : public ScopeInterface {
void *currentPhasePtr{};
};

typedef std::map<Data *, DataEntry> DataToEntryMap;

OutputFormat parseOutputFormat(const std::string &outputFormat);

const std::string outputFormatToString(OutputFormat outputFormat);
Expand Down
6 changes: 0 additions & 6 deletions third_party/proton/csrc/include/Data/TraceData.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,13 @@ class TraceData : public Data {

std::vector<uint8_t> toMsgPack(size_t phase) const override;

DataEntry addOp(const std::string &name) override;

DataEntry addOp(size_t phase, size_t eventId,
const std::vector<Context> &contexts) override;

void
addMetrics(size_t scopeId,
const std::map<std::string, MetricValueType> &metrics) override;

void
addMetrics(size_t phase, size_t entryId,
const std::map<std::string, MetricValueType> &metrics) override;

class Trace;

protected:
Expand Down
11 changes: 3 additions & 8 deletions third_party/proton/csrc/include/Data/TreeData.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,13 @@ class TreeData : public Data {

std::vector<uint8_t> toMsgPack(size_t phase) const override;

DataEntry addOp(const std::string &name) override;

DataEntry addOp(size_t phase, size_t contextId,
const std::vector<Context> &contexts) override;

void
addMetrics(size_t scopeId,
const std::map<std::string, MetricValueType> &metrics) override;

void
addMetrics(size_t phase, size_t entryId,
const std::map<std::string, MetricValueType> &metrics) override;

protected:
// ScopeInterface
void enterScope(const Scope &scope) override;
Expand All @@ -48,8 +42,9 @@ class TreeData : public Data {
// the background threads concurrently, so methods that access them should be
// protected by a (shared) mutex.
class Tree;
json buildHatchetJson(TreeData::Tree *tree) const;
std::vector<uint8_t> buildHatchetMsgPack(TreeData::Tree *tree) const;
json buildHatchetJson(TreeData::Tree *tree, TreeData::Tree *staticTree) const;
std::vector<uint8_t> buildHatchetMsgPack(TreeData::Tree *tree,
TreeData::Tree *staticTree) const;

// Data
void doDump(std::ostream &os, OutputFormat outputFormat,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ class CuptiPCSampling : public Singleton<CuptiPCSampling> {

void start(CUcontext context);

void stop(CUcontext context, const DataToEntryMap &dataToEntry);
void stop(CUcontext context, const std::vector<DataEntry> &dataEntries);

void finalize(CUcontext context);

Expand All @@ -123,7 +123,7 @@ class CuptiPCSampling : public Singleton<CuptiPCSampling> {
CubinData *getCubinData(uint64_t cubinCrc);

void processPCSamplingData(ConfigureData *configureData,
const DataToEntryMap &dataToEntry);
const std::vector<DataEntry> &dataEntries);

ThreadSafeMap<uint32_t, ConfigureData> contextIdToConfigureData;
// In case the same cubin is loaded multiple times, we need to keep track of
Expand Down
53 changes: 26 additions & 27 deletions third_party/proton/csrc/include/Profiler/GPUProfiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ class GPUProfiler : public Profiler,

struct ExternIdState {
// ----non-graph launch fields----
DataToEntryMap dataToEntry;
// Active entries for each data sink associated with this extern launch.
std::vector<DataEntry> dataEntries;
// Sometimes the kernel name cannot be retrieved in application threads
// for reasons like an uninitialized CUDA context.
bool isMissingName{true};
Expand All @@ -71,32 +72,29 @@ class GPUProfiler : public Profiler,
size_t numNodes{1};

struct GraphNodeState {
// If the node is launched as a metric kernel, ignore its timing data.
bool isMetricNode{false};
bool isMissingName{true};
// Per-node launch status bits (missing-name / metric-node).
NodeStatus status{};

void setEntry(Data *data, const DataEntry &entry) {
dataToEntry.insert_or_assign(data, entry);
}
// If the node is launched as a metric kernel, ignore its timing data.
bool isMetricNode() const { return status.isMetricNode(); }
bool isMissingName() const { return status.isMissingName(); }

const DataEntry *findEntry(Data *data) const {
auto it = dataToEntry.find(data);
if (it == dataToEntry.end())
return nullptr;
return &it->second;
void addEntry(DataEntry &&entry) {
dataEntries.emplace_back(std::move(entry));
}

template <typename FnT> void forEachEntry(FnT &&fn) {
for (auto &[data, entry] : dataToEntry)
fn(data, entry);
for (auto &entry : dataEntries)
fn(entry);
}

DataToEntryMap dataToEntry;
// Entries in the launched graph phase for each data sink.
std::vector<DataEntry> dataEntries;
};

using GraphNodeStateTable = RangeTable<GraphNodeState>;

// graphNodeId -> (per-Data entry)
// graphNodeId -> per-node entries across active data sinks
GraphNodeStateTable graphNodeIdToState;
};

Expand All @@ -109,14 +107,13 @@ class GPUProfiler : public Profiler,
void startOp(const Scope &scope) override {
this->threadState.scopeStack.push_back(scope);
for (auto *data : dataSet) {
auto entry = data->addOp(scope.name);
threadState.dataToEntry.insert_or_assign(data, entry);
threadState.dataEntries.push_back(data->addOp(scope.name));
}
}

void stopOp(const Scope &scope) override {
this->threadState.scopeStack.pop_back();
threadState.dataToEntry.clear();
threadState.dataEntries.clear();
}

void flushDataPhases(
Expand Down Expand Up @@ -145,7 +142,8 @@ class GPUProfiler : public Profiler,
ConcreteProfilerT &profiler;
SessionManager &sessionManager = SessionManager::instance();
std::vector<Scope> scopeStack; // Used for nvtx range or triton op tracking
DataToEntryMap dataToEntry;
// Active entries for the currently open op, one per data sink.
std::vector<DataEntry> dataEntries;
bool isApiExternOp{false};
bool isStreamCapturing{false};
bool isMetricKernelLaunching{false};
Expand Down Expand Up @@ -200,11 +198,12 @@ class GPUProfiler : public Profiler,

// Correlate the correlationId with the last externId
void correlate(uint64_t correlationId, size_t externId, size_t numNodes,
bool isMissingName, const DataToEntryMap &dataToEntry) {
bool isMissingName,
const std::vector<DataEntry> &dataEntries) {
corrIdToExternId.insert(correlationId, externId);
externIdToState.upsert(externId, [&](ExternIdState &state) {
state.numNodes = numNodes;
state.dataToEntry = dataToEntry;
state.dataEntries = dataEntries;
state.isMissingName = isMissingName;
});
}
Expand Down Expand Up @@ -269,18 +268,18 @@ class GPUProfiler : public Profiler,
auto tensorMetricsHost =
collectTensorMetrics(profiler.metricBuffer->getRuntime(),
tensorMetrics, profiler.metricKernelStream);
auto &dataToEntry = threadState.dataToEntry;
if (dataToEntry.empty()) {
auto &dataEntries = threadState.dataEntries;
if (dataEntries.empty()) {
// Add metrics to a specific scope
for (auto *data : profiler.dataSet) {
data->addMetrics(scopeId, scalarMetrics);
data->addMetrics(scopeId, tensorMetricsHost);
}
} else {
// Add metrics to the current op
for (auto [data, entry] : dataToEntry) {
data->addMetrics(entry.phase, entry.id, scalarMetrics);
data->addMetrics(entry.phase, entry.id, tensorMetricsHost);
for (const auto &entry : dataEntries) {
entry.upsertFlexibleMetrics(scalarMetrics);
entry.upsertFlexibleMetrics(tensorMetricsHost);
}
}
}
Expand Down
Loading
Loading