Skip to content

Commit ca1eedb

Browse files
sraikund16 authored and facebook-github-bot committed
Add CUPTI/ROCm versions to traces (#985)
Summary: Pull Request resolved: #985. Because of the differences that are emerging between different versions, it would be useful if the trace metadata showed which third-party library versions we are using. This diff adds those versions (CUPTI, CUDA runtime, and CUDA driver; or roctracer, HIP runtime, and HIP driver) to our Kineto traces. Reviewed By: aaronenyeshi. Differential Revision: D62538511. fbshipit-source-id: 813af45c1d2e82002ca7b4b7f3788407f13c254c
1 parent 76f2334 commit ca1eedb

File tree

2 files changed

+37
-13
lines changed

2 files changed

+37
-13
lines changed

libkineto/src/CuptiActivityProfiler.cpp

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ std::ostream& operator<<(std::ostream& oss, const CuptiActivityProfiler::ErrorCo
193193

194194
void CuptiActivityProfiler::transferCpuTrace(
195195
std::unique_ptr<libkineto::CpuTraceBuffer> cpuTrace) {
196-
std::lock_guard<std::mutex> guard(mutex_);
196+
std::lock_guard<std::recursive_mutex> guard(mutex_);
197197
const string& trace_name = cpuTrace->span.name;
198198
if (currentRunloopState_ != RunloopState::CollectTrace &&
199199
currentRunloopState_ != RunloopState::ProcessTrace) {
@@ -248,6 +248,12 @@ void CuptiActivityProfiler::logGpuVersions() {
248248
"cuda_runtime_version", std::to_string(cudaRuntimeVersion));
249249
LOGGER_OBSERVER_ADD_METADATA(
250250
"cuda_driver_version", std::to_string(cudaDriverVersion));
251+
addVersionMetadata(
252+
"cupti_version", std::to_string(cuptiVersion));
253+
addVersionMetadata(
254+
"cuda_runtime_version", std::to_string(cudaRuntimeVersion));
255+
addVersionMetadata(
256+
"cuda_driver_version", std::to_string(cudaDriverVersion));
251257

252258
#elif defined(HAS_ROCTRACER)
253259
uint32_t majorVersion = roctracer_version_major();
@@ -267,13 +273,23 @@ void CuptiActivityProfiler::logGpuVersions() {
267273
"hip_runtime_version", std::to_string(hipRuntimeVersion));
268274
LOGGER_OBSERVER_ADD_METADATA(
269275
"hip_driver_version", std::to_string(hipDriverVersion));
276+
addVersionMetadata(
277+
"roctracer_version", roctracerVersion);
278+
addVersionMetadata(
279+
"hip_runtime_version", std::to_string(hipRuntimeVersion));
280+
addVersionMetadata(
281+
"hip_driver_version", std::to_string(hipDriverVersion));
282+
270283
#endif
271284
}
272285

273286
void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) {
274287
LOG(INFO) << "Processing " << traceBuffers_->cpu.size() << " CPU buffers";
275288
VLOG(0) << "Profile time range: " << captureWindowStartTime_ << " - "
276289
<< captureWindowEndTime_;
290+
for (auto& pair : versionMetadata_) {
291+
addMetadata(pair.first, pair.second);
292+
}
277293
logger.handleTraceStart(metadata_);
278294
setCpuActivityPresent(false);
279295
setGpuActivityPresent(false);
@@ -948,7 +964,7 @@ void CuptiActivityProfiler::configureChildProfilers() {
948964
void CuptiActivityProfiler::configure(
949965
const Config& config,
950966
const time_point<system_clock>& now) {
951-
std::lock_guard<std::mutex> guard(mutex_);
967+
std::lock_guard<std::recursive_mutex> guard(mutex_);
952968
if (isActive()) {
953969
LOG(WARNING) << "CuptiActivityProfiler already busy, terminating";
954970
return;
@@ -1171,7 +1187,7 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
11711187

11721188
if (cupti_.stopCollection) {
11731189
// Go to process trace to clear any outstanding buffers etc
1174-
std::lock_guard<std::mutex> guard(mutex_);
1190+
std::lock_guard<std::recursive_mutex> guard(mutex_);
11751191
stopTraceInternal(now);
11761192
resetInternal();
11771193
LOG(ERROR) << "State: Warmup stopped by CUPTI. (Buffer size configured is " << config_->activitiesMaxGpuBufferSize() / 1024 / 1024 << "MB)";
@@ -1230,7 +1246,7 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
12301246
}
12311247
#endif // HAS_CUPTI || HAS_ROCTRACER
12321248

1233-
std::lock_guard<std::mutex> guard(mutex_);
1249+
std::lock_guard<std::recursive_mutex> guard(mutex_);
12341250
stopTraceInternal(now);
12351251
VLOG_IF(0, collection_done) << "Reached profile end time";
12361252
UST_LOGGER_MARK_COMPLETED(kCollectionStage);
@@ -1254,7 +1270,7 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
12541270
}
12551271
// FIXME: Probably want to allow interruption here
12561272
// for quickly handling trace request via synchronous API
1257-
std::lock_guard<std::mutex> guard(mutex_);
1273+
std::lock_guard<std::recursive_mutex> guard(mutex_);
12581274
processTraceInternal(*logger_);
12591275
UST_LOGGER_MARK_COMPLETED(kPostProcessingStage);
12601276
resetInternal();

libkineto/src/CuptiActivityProfiler.h

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -155,23 +155,23 @@ class CuptiActivityProfiler {
155155
// Synchronous control API
156156
void startTrace(
157157
const std::chrono::time_point<std::chrono::system_clock>& now) {
158-
std::lock_guard<std::mutex> guard(mutex_);
158+
std::lock_guard<std::recursive_mutex> guard(mutex_);
159159
startTraceInternal(now);
160160
}
161161

162162
void stopTrace(const std::chrono::time_point<std::chrono::system_clock>& now) {
163-
std::lock_guard<std::mutex> guard(mutex_);
163+
std::lock_guard<std::recursive_mutex> guard(mutex_);
164164
stopTraceInternal(now);
165165
}
166166

167167
// Process CPU and GPU traces
168168
void processTrace(ActivityLogger& logger) {
169-
std::lock_guard<std::mutex> guard(mutex_);
169+
std::lock_guard<std::recursive_mutex> guard(mutex_);
170170
processTraceInternal(logger);
171171
}
172172

173173
void reset() {
174-
std::lock_guard<std::mutex> guard(mutex_);
174+
std::lock_guard<std::recursive_mutex> guard(mutex_);
175175
resetInternal();
176176
}
177177

@@ -197,7 +197,7 @@ class CuptiActivityProfiler {
197197
// as key, because that's what CUPTI records.
198198
int32_t tid = threadId();
199199
int32_t pid = processId();
200-
std::lock_guard<std::mutex> guard(mutex_);
200+
std::lock_guard<std::recursive_mutex> guard(mutex_);
201201
recordThreadInfo(sysTid, tid, pid);
202202
}
203203

@@ -215,13 +215,18 @@ class CuptiActivityProfiler {
215215
}
216216

217217
void addMetadata(const std::string& key, const std::string& value) {
218-
std::lock_guard<std::mutex> guard(mutex_);
218+
std::lock_guard<std::recursive_mutex> guard(mutex_);
219219
metadata_[key] = value;
220220
}
221221

222+
void addVersionMetadata(const std::string& key, const std::string& value) {
223+
std::lock_guard<std::recursive_mutex> guard(mutex_);
224+
versionMetadata_[key] = value;
225+
}
226+
222227
void addChildActivityProfiler(
223228
std::unique_ptr<IActivityProfiler> profiler) {
224-
std::lock_guard<std::mutex> guard(mutex_);
229+
std::lock_guard<std::recursive_mutex> guard(mutex_);
225230
profilers_.push_back(std::move(profiler));
226231
}
227232

@@ -472,7 +477,7 @@ class CuptiActivityProfiler {
472477
// ***************************************************************************
473478

474479
// Mutex to protect non-atomic access to below state
475-
std::mutex mutex_;
480+
std::recursive_mutex mutex_;
476481

477482
// Runloop phase
478483
std::atomic<RunloopState> currentRunloopState_{RunloopState::WaitForRequest};
@@ -528,6 +533,9 @@ class CuptiActivityProfiler {
528533
// Trace metadata
529534
std::unordered_map<std::string, std::string> metadata_;
530535

536+
// Version metadata
537+
std::unordered_map<std::string, std::string> versionMetadata_;
538+
531539
// child activity profilers
532540
std::vector<std::unique_ptr<IActivityProfiler>> profilers_;
533541

0 commit comments

Comments
 (0)