Skip to content

Commit

Permalink
Add Per Thread Buffer Option
Browse files Browse the repository at this point in the history
Summary: We have noticed that vanguard jobs now have very high overhead during Kineto traces due to their large number of threads. The issue is that all the events triggered on these threads funnel into the same buffer within CUPTI, which causes contention during synchronization. When we reported this to NV, they suggested setting CUPTI_ACTIVITY_ATTR_PER_THREAD_ACTIVITY_BUFFER to a nonzero value. This resulted in significantly lower overhead.

Differential Revision: D69421331
  • Loading branch information
sraikund16 authored and facebook-github-bot committed Feb 12, 2025
1 parent 3c3fa42 commit 8055ecf
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 5 deletions.
8 changes: 8 additions & 0 deletions libkineto/include/Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ class Config : public AbstractConfig {
return selectedActivityTypes_;
}

// Whether per-thread activity buffers were requested via the
// PER_THREAD_BUFFER_ENABLED option (defaults to false).
bool perThreadBufferEnabled() const {
return perThreadBufferEnabled_;
}

// Set the types of activities to be traced
void setSelectedActivityTypes(const std::set<ActivityType>& types) {
selectedActivityTypes_ = types;
}
Expand Down Expand Up @@ -431,6 +436,9 @@ class Config : public AbstractConfig {

// Activity profiler
bool activityProfilerEnabled_;

// Enable per-thread buffer
bool perThreadBufferEnabled_;
std::set<ActivityType> selectedActivityTypes_;

// The activity profiler settings are all on-demand
Expand Down
4 changes: 4 additions & 0 deletions libkineto/src/Config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ constexpr char kHeartbeatMonitorPeriodKey[] =

// Activity Profiler
constexpr char kActivitiesEnabledKey[] = "ACTIVITIES_ENABLED";
constexpr char kPerThreadBufferEnabledKey[] = "PER_THREAD_BUFFER_ENABLED";
constexpr char kActivityTypesKey[] = "ACTIVITY_TYPES";
constexpr char kActivitiesLogFileKey[] = "ACTIVITIES_LOG_FILE";
constexpr char kActivitiesDurationKey[] = "ACTIVITIES_DURATION_SECS";
Expand Down Expand Up @@ -219,6 +220,7 @@ Config::Config()
kDefaultEventProfilerHearbeatMonitorPeriod),
multiplexPeriod_(kDefaultMultiplexPeriodMsecs),
activityProfilerEnabled_(true),
perThreadBufferEnabled_(false),
activitiesLogFile_(defaultTraceFileName()),
activitiesLogUrl_(fmt::format("file://{}", activitiesLogFile_)),
activitiesMaxGpuBufferSize_(kDefaultActivitiesMaxGpuBufferSize),
Expand Down Expand Up @@ -379,6 +381,8 @@ bool Config::handleOption(const std::string& name, std::string& val) {
verboseLogModules_ = splitAndTrim(val, ',');
} else if (!name.compare(kActivitiesEnabledKey)) {
activityProfilerEnabled_ = toBool(val);
} else if (!name.compare(kPerThreadBufferEnabledKey)) {
perThreadBufferEnabled_ = toBool(val);
} else if (!name.compare(kActivitiesLogFileKey)) {
activitiesLogFile_ = val;
activitiesLogUrl_ = fmt::format("file://{}", val);
Expand Down
23 changes: 21 additions & 2 deletions libkineto/src/CuptiActivityApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,8 @@ void CuptiActivityApi::bufferCompleted(
#endif

void CuptiActivityApi::enableCuptiActivities(
const std::set<ActivityType>& selected_activities) {
const std::set<ActivityType>& selected_activities,
bool enablePerThreadBuffers) {
#ifdef HAS_CUPTI
// Lazily support re-init of CUPTI Callbacks, if they were finalized before.
auto cbapi_ = CuptiCallbackApi::singleton();
Expand All @@ -316,6 +317,16 @@ void CuptiActivityApi::enableCuptiActivities(
}
cbapi_.reset();

#if (CUDART_VERSION >= 12030)
if (enablePerThreadBuffers) {
uint8_t value = 1;
size_t sizeof_value = sizeof(value);
LOG(WARNING) << ("Enabling per-thread activity buffer");
CUPTI_CALL(cuptiActivitySetAttribute(
CUPTI_ACTIVITY_ATTR_PER_THREAD_ACTIVITY_BUFFER, &sizeof_value, &value));
}
#endif

CUPTI_CALL(cuptiActivityRegisterCallbacks(
bufferRequestedTrampoline, bufferCompletedTrampoline));

Expand Down Expand Up @@ -398,7 +409,15 @@ void CuptiActivityApi::disableCuptiActivities(
}
}
externalCorrelationEnabled_ = false;
#endif
// Clear out per-thread buffer flag in case it was set
#if (CUDART_VERSION >= 12030)
uint8_t value = 0;
size_t sizeof_value = sizeof(value);

CUPTI_CALL(cuptiActivitySetAttribute(
CUPTI_ACTIVITY_ATTR_PER_THREAD_ACTIVITY_BUFFER, &sizeof_value, &value));
#endif // (CUDART_VERSION >= 12030)
#endif // HAS_CUPTI
}

void CuptiActivityApi::teardownContext() {
Expand Down
4 changes: 3 additions & 1 deletion libkineto/src/CuptiActivityApi.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ class CuptiActivityApi {
static void pushCorrelationID(int id, CorrelationFlowType type);
static void popCorrelationID(CorrelationFlowType type);

void enableCuptiActivities(const std::set<ActivityType>& selected_activities);
void enableCuptiActivities(
const std::set<ActivityType>& selected_activities,
bool enablePerThreadBuffers = false);
void disableCuptiActivities(
const std::set<ActivityType>& selected_activities);
void clearActivities();
Expand Down
8 changes: 6 additions & 2 deletions libkineto/src/CuptiActivityProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ ConfigDerivedState::ConfigDerivedState(const Config& config) {
profileDuration_ = config.activitiesDuration();
profileWarmupDuration_ = config.activitiesWarmupDuration();
profilingByIter_ = config.hasProfileStartIteration();
perThreadBufferEnabled_ = config.perThreadBufferEnabled();
if (profilingByIter_) {
profileStartIter_ = config.profileStartIteration();
profileEndIter_ = profileStartIter_ + config.activitiesRunIterations();
Expand Down Expand Up @@ -1095,7 +1096,8 @@ void CuptiActivityProfiler::configure(
}
#endif // CUDA_VERSION >= 11060
#endif // _WIN32
cupti_.enableCuptiActivities(config_->selectedActivityTypes());
cupti_.enableCuptiActivities(
config_->selectedActivityTypes(), config_->perThreadBufferEnabled());
#else
cupti_.enableActivities(config_->selectedActivityTypes());
#endif
Expand Down Expand Up @@ -1177,7 +1179,9 @@ void CuptiActivityProfiler::ensureCollectTraceDone() {
void CuptiActivityProfiler::toggleCollectionDynamic(const bool enable) {
#ifdef HAS_CUPTI
if (enable) {
cupti_.enableCuptiActivities(derivedConfig_->profileActivityTypes());
cupti_.enableCuptiActivities(
derivedConfig_->profileActivityTypes(),
derivedConfig_->isPerThreadBufferEnabled());
} else {
cupti_.disableCuptiActivities(derivedConfig_->profileActivityTypes());
}
Expand Down
5 changes: 5 additions & 0 deletions libkineto/src/CuptiActivityProfiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ struct ConfigDerivedState final {
return profilingByIter_;
}

// Whether per-thread buffers are enabled; copied from
// Config::perThreadBufferEnabled() when this state is constructed.
bool isPerThreadBufferEnabled() const {
return perThreadBufferEnabled_;
}

private:
std::set<ActivityType> profileActivityTypes_;
// Start and end time used for triggering and stopping profiling
Expand All @@ -106,6 +110,7 @@ struct ConfigDerivedState final {
int64_t profileStartIter_{-1};
int64_t profileEndIter_{-1};
bool profilingByIter_{false};
bool perThreadBufferEnabled_{false};
};

namespace detail {
Expand Down

0 comments on commit 8055ecf

Please sign in to comment.