@@ -34,7 +34,6 @@ limitations under the License.
3434#include " xla/core/collectives/communicator.h"
3535#include " xla/core/collectives/rank_id.h"
3636#include " xla/service/lockable.h"
37- #include " xla/service/rendezvous.h"
3837#include " xla/tsl/util/tied_ref.h"
3938
4039namespace xla ::gpu {
@@ -91,11 +90,6 @@ class GpuClique : public Clique {
9190 // Returns a parent clique iff *this one was created by clique splitting.
9291 const GpuClique* parent () const { return parent_; }
9392
94- std::pair<RendezvousFlag*, RendezvousFlag*> GetFirstRendezvousFlags () {
95- return std::make_pair (&pre_call_rendezvous_flag_,
96- &post_call_rendezvous_flag_);
97- }
98-
9993 private:
10094 friend LockableGpuClique;
10195
@@ -117,15 +111,6 @@ class GpuClique : public Clique {
117111 // A parent GPU clique iff *this clique was constructed by split operation.
118112 const GpuClique* parent_;
119113
120- // Before and after a first call to this particular instance of a collective
121- // thunk we do a round of rendezvous to make sure that all participants are
122- // ready to execute the collective operation and that all of them successfully
123- // allocated on-device state required for it. This is required to avoid
124- // deadlocks when one device goes too far ahead and causes a deadlock in CUDA
125- // driver (root cause rumored to be fixed in 590 driver series).
126- RendezvousFlag pre_call_rendezvous_flag_;
127- RendezvousFlag post_call_rendezvous_flag_;
128-
129114 // We keep device communicators in a sorted container to guarantee that they
130115 // are destroyed in deterministic order.
131116 mutable absl::Mutex mu_;
0 commit comments