-
Notifications
You must be signed in to change notification settings - Fork 62
update flight recorder usage #2292
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -483,7 +483,7 @@ c10::intrusive_ptr<ProcessGroupXCCL::WorkXCCL> ProcessGroupXCCL::initWork( | |||||
| : std::nullopt); | ||||||
|
|
||||||
| if (record) { | ||||||
| r->trace_id_ = FlightRecorderXCCL::get()->record( | ||||||
| auto traceId = FlightRecorderXCCL::get()->record( | ||||||
| local_id_, | ||||||
| std::make_tuple(pg_uid_, pg_desc_), // PG name tuple | ||||||
| seqCollective_, | ||||||
|
|
@@ -497,6 +497,9 @@ c10::intrusive_ptr<ProcessGroupXCCL::WorkXCCL> ProcessGroupXCCL::initWork( | |||||
| options_->timeout, | ||||||
| pgStatus_, | ||||||
| isP2P); | ||||||
|
|
||||||
| r->trace_id_ = traceId.id; | ||||||
| r->trace_reset_epoch_ = traceId.reset_epoch; | ||||||
| } | ||||||
| return r; | ||||||
| } | ||||||
|
|
@@ -803,9 +806,11 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::collective( | |||||
| c10::ListType::create(c10::TensorType::get()), devices); | ||||||
| work->future_->markCompleted(at::IValue(*work->outputs_)); | ||||||
| auto id = work->trace_id_; | ||||||
| auto reset_epoch = work->trace_reset_epoch_; | ||||||
| work->future_->addCallback( | ||||||
| [id](at::ivalue::Future&) { | ||||||
| FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); | ||||||
| [id, reset_epoch](at::ivalue::Future&) { | ||||||
| FlightRecorderXCCL::get()->retire_id( | ||||||
| id, reset_epoch, /*compute_duration*/ false); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let’s not pass reset_epoch to retire_id for now, so that the code stays compatible with PyTorch both before and after the 166970 merge. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| }, | ||||||
| /*use_future*/ false); | ||||||
| work->blockingWait_ = blockingWait_; | ||||||
|
|
@@ -891,7 +896,7 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::pointToPoint( | |||||
| work->outputs_ = std::make_shared<std::vector<at::Tensor>>(); | ||||||
| work->outputs_->push_back(tensor); | ||||||
|
|
||||||
| work->trace_id_ = FlightRecorderXCCL::get()->record( | ||||||
| auto traceId = FlightRecorderXCCL::get()->record( | ||||||
| local_id_, | ||||||
| std::make_tuple(pg_uid_, pg_desc_), // PG name tuple | ||||||
| seqCollective_, | ||||||
|
|
@@ -905,6 +910,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::pointToPoint( | |||||
| options_->timeout, | ||||||
| pgStatus_, | ||||||
| true); | ||||||
| work->trace_id_ = traceId.id; | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as above. Could you please add a check to see if traceId is of type uint64_t? If it is, assign traceId directly to work->trace_id_; otherwise, read the id from traceId.id. |
||||||
| work->trace_reset_epoch_ = traceId.reset_epoch; | ||||||
|
|
||||||
| c10::OptionalDeviceGuard gpuGuard(device); | ||||||
|
|
||||||
|
|
@@ -922,9 +929,11 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::pointToPoint( | |||||
| c10::ListType::create(c10::TensorType::get()), devices); | ||||||
| work->future_->markCompleted(at::IValue(*work->outputs_)); | ||||||
| auto id = work->trace_id_; | ||||||
| auto reset_epoch = work->trace_reset_epoch_; | ||||||
| work->future_->addCallback( | ||||||
| [id](at::ivalue::Future&) { | ||||||
| FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); | ||||||
| [id, reset_epoch](at::ivalue::Future&) { | ||||||
| FlightRecorderXCCL::get()->retire_id( | ||||||
| id, reset_epoch, /*compute_duration*/ false); | ||||||
| }, | ||||||
| /*use_future*/ false); | ||||||
|
|
||||||
|
|
@@ -2059,8 +2068,8 @@ c10::DeviceIndex ProcessGroupXCCL::guessDeviceId() const { | |||||
| } else if (!usedDeviceIdxs_.empty()) { | ||||||
| return *usedDeviceIdxs_.begin(); | ||||||
| } | ||||||
| int devIdx = | ||||||
| static_cast<int16_t>(globalRank() % at::detail::getXPUHooks().getNumGPUs()); | ||||||
| int devIdx = static_cast<int16_t>( | ||||||
| globalRank() % at::detail::getXPUHooks().getNumGPUs()); | ||||||
| LOG(WARNING) | ||||||
| << logPrefix() | ||||||
| << c10::str( | ||||||
|
|
||||||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change depends on PyTorch PR pytorch/pytorch#166970, and 166970 in turn depends on this PR being merged, creating a circular dependency. Could you please add a check to see if traceId is of type uint64_t? If it is, assign traceId directly to r->trace_id_; otherwise, read the id from traceId.id.