diff --git a/libkineto/include/Config.h b/libkineto/include/Config.h index aabe5fafb..e99ff824f 100644 --- a/libkineto/include/Config.h +++ b/libkineto/include/Config.h @@ -174,6 +174,11 @@ class Config : public AbstractConfig { return selectedActivityTypes_; } + // Returns whether per-thread CUPTI activity buffers were requested + bool perThreadBufferEnabled() const { + return perThreadBufferEnabled_; + } + void setSelectedActivityTypes(const std::set& types) { selectedActivityTypes_ = types; } @@ -431,6 +436,9 @@ class Config : public AbstractConfig { // Activity profiler bool activityProfilerEnabled_; + + // Enable per-thread buffer + bool perThreadBufferEnabled_; std::set selectedActivityTypes_; // The activity profiler settings are all on-demand diff --git a/libkineto/src/Config.cpp b/libkineto/src/Config.cpp index 83d57ad42..5875342a5 100644 --- a/libkineto/src/Config.cpp +++ b/libkineto/src/Config.cpp @@ -65,6 +65,8 @@ constexpr char kHeartbeatMonitorPeriodKey[] = // Activity Profiler constexpr char kActivitiesEnabledKey[] = "ACTIVITIES_ENABLED"; +constexpr char kCuptiPerThreadBufferEnabledKey[] = + "CUPTI_PER_THREAD_BUFFER_ENABLED"; constexpr char kActivityTypesKey[] = "ACTIVITY_TYPES"; constexpr char kActivitiesLogFileKey[] = "ACTIVITIES_LOG_FILE"; constexpr char kActivitiesDurationKey[] = "ACTIVITIES_DURATION_SECS"; @@ -219,6 +221,7 @@ Config::Config() kDefaultEventProfilerHearbeatMonitorPeriod), multiplexPeriod_(kDefaultMultiplexPeriodMsecs), activityProfilerEnabled_(true), + perThreadBufferEnabled_(false), activitiesLogFile_(defaultTraceFileName()), activitiesLogUrl_(fmt::format("file://{}", activitiesLogFile_)), activitiesMaxGpuBufferSize_(kDefaultActivitiesMaxGpuBufferSize), @@ -379,6 +382,8 @@ bool Config::handleOption(const std::string& name, std::string& val) { verboseLogModules_ = splitAndTrim(val, ','); } else if (!name.compare(kActivitiesEnabledKey)) { activityProfilerEnabled_ = toBool(val); + } else if (!name.compare(kCuptiPerThreadBufferEnabledKey)) { + 
perThreadBufferEnabled_ = toBool(val); } else if (!name.compare(kActivitiesLogFileKey)) { activitiesLogFile_ = val; activitiesLogUrl_ = fmt::format("file://{}", val); diff --git a/libkineto/src/CuptiActivityApi.cpp b/libkineto/src/CuptiActivityApi.cpp index 425be9bf2..58df6a9ee 100644 --- a/libkineto/src/CuptiActivityApi.cpp +++ b/libkineto/src/CuptiActivityApi.cpp @@ -167,8 +167,8 @@ void CuptiActivityApi::bufferRequested( if (allocatedGpuTraceBuffers_.size() >= maxGpuBufferCount_) { stopCollection = true; LOG(WARNING) << "Exceeded max GPU buffer count (" - << allocatedGpuTraceBuffers_.size() << " > " - << maxGpuBufferCount_ << ") - terminating tracing"; + << allocatedGpuTraceBuffers_.size() + << " >= " << maxGpuBufferCount_ << ") - terminating tracing"; } auto buf = std::make_unique(kBufSize); @@ -307,7 +307,8 @@ void CuptiActivityApi::bufferCompleted( #endif void CuptiActivityApi::enableCuptiActivities( - const std::set& selected_activities) { + const std::set& selected_activities, + bool enablePerThreadBuffers) { #ifdef HAS_CUPTI // Lazily support re-init of CUPTI Callbacks, if they were finalized before. 
auto cbapi_ = CuptiCallbackApi::singleton(); @@ -316,6 +317,20 @@ void CuptiActivityApi::enableCuptiActivities( } cbapi_.reset(); + if (enablePerThreadBuffers) { +#if (CUDART_VERSION >= 12030) + uint8_t value = 1; + size_t sizeof_value = sizeof(value); + LOG(INFO) << "Enabling per-thread activity buffer"; + CUPTI_CALL(cuptiActivitySetAttribute( + CUPTI_ACTIVITY_ATTR_PER_THREAD_ACTIVITY_BUFFER, &sizeof_value, &value)); +#else + LOG(WARNING) << "Per-thread activity buffer requires CUDA >= 12.3"; +#endif // (CUDART_VERSION >= 12030) + } else { + LOG(VERBOSE) << "Not enabling per-thread activity buffer"; + } + CUPTI_CALL(cuptiActivityRegisterCallbacks( bufferRequestedTrampoline, bufferCompletedTrampoline)); @@ -398,7 +413,15 @@ void CuptiActivityApi::disableCuptiActivities( } } externalCorrelationEnabled_ = false; -#endif + // Clear out per-thread buffer flag in case it was set +#if (CUDART_VERSION >= 12030) + uint8_t value = 0; + size_t sizeof_value = sizeof(value); + + CUPTI_CALL(cuptiActivitySetAttribute( + CUPTI_ACTIVITY_ATTR_PER_THREAD_ACTIVITY_BUFFER, &sizeof_value, &value)); +#endif // (CUDART_VERSION >= 12030) +#endif // HAS_CUPTI } void CuptiActivityApi::teardownContext() { diff --git a/libkineto/src/CuptiActivityApi.h b/libkineto/src/CuptiActivityApi.h index 448cc5415..afa17446a 100644 --- a/libkineto/src/CuptiActivityApi.h +++ b/libkineto/src/CuptiActivityApi.h @@ -54,7 +54,9 @@ class CuptiActivityApi { static void pushCorrelationID(int id, CorrelationFlowType type); static void popCorrelationID(CorrelationFlowType type); - void enableCuptiActivities(const std::set& selected_activities); + void enableCuptiActivities( + const std::set& selected_activities, + bool enablePerThreadBuffers = false); void disableCuptiActivities( const std::set& selected_activities); void clearActivities(); diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index e40b90964..07b16f9ec 100644 --- 
a/libkineto/src/CuptiActivityProfiler.cpp +++ b/libkineto/src/CuptiActivityProfiler.cpp @@ -131,6 +131,7 @@ ConfigDerivedState::ConfigDerivedState(const Config& config) { profileDuration_ = config.activitiesDuration(); profileWarmupDuration_ = config.activitiesWarmupDuration(); profilingByIter_ = config.hasProfileStartIteration(); + perThreadBufferEnabled_ = config.perThreadBufferEnabled(); if (profilingByIter_) { profileStartIter_ = config.profileStartIteration(); profileEndIter_ = profileStartIter_ + config.activitiesRunIterations(); @@ -1095,7 +1096,8 @@ void CuptiActivityProfiler::configure( } #endif // CUDA_VERSION >= 11060 #endif // _WIN32 - cupti_.enableCuptiActivities(config_->selectedActivityTypes()); + cupti_.enableCuptiActivities( + config_->selectedActivityTypes(), config_->perThreadBufferEnabled()); #else cupti_.enableActivities(config_->selectedActivityTypes()); #endif @@ -1177,7 +1179,9 @@ void CuptiActivityProfiler::ensureCollectTraceDone() { void CuptiActivityProfiler::toggleCollectionDynamic(const bool enable) { #ifdef HAS_CUPTI if (enable) { - cupti_.enableCuptiActivities(derivedConfig_->profileActivityTypes()); + cupti_.enableCuptiActivities( + derivedConfig_->profileActivityTypes(), + derivedConfig_->isPerThreadBufferEnabled()); } else { cupti_.disableCuptiActivities(derivedConfig_->profileActivityTypes()); } diff --git a/libkineto/src/CuptiActivityProfiler.h b/libkineto/src/CuptiActivityProfiler.h index 30d2203d4..a70cd81a4 100644 --- a/libkineto/src/CuptiActivityProfiler.h +++ b/libkineto/src/CuptiActivityProfiler.h @@ -96,6 +96,10 @@ struct ConfigDerivedState final { return profilingByIter_; } + bool isPerThreadBufferEnabled() const { + return perThreadBufferEnabled_; + } + private: std::set profileActivityTypes_; // Start and end time used for triggering and stopping profiling @@ -106,6 +110,7 @@ struct ConfigDerivedState final { int64_t profileStartIter_{-1}; int64_t profileEndIter_{-1}; bool profilingByIter_{false}; + bool 
perThreadBufferEnabled_{false}; }; namespace detail {