@@ -549,6 +549,12 @@ absl::StatusOr<PreparedSend> PrepareSend(
549549 GlobalDeviceId src_device (buffer->device ()->global_device_id ().value ());
550550 GlobalDeviceId dst_device (dst_global_device_id.value ());
551551
552+ tsl::profiler::TraceMe trace ([&] {
553+ return tsl::profiler::TraceMeEncode (
554+ absl::StrFormat (" PrepareSend: src=%v dst=%v" , src_device, dst_device),
555+ {{" transfer_key" , transfer_key}});
556+ });
557+
552558 // Form the GPU clique key.
553559 // TODO(asrao, mwhittaker): Supply correct incarnations when creating the
554560 // clique key.
@@ -601,6 +607,13 @@ absl::StatusOr<PreparedReceive> PrepareReceive(
601607 GlobalDeviceId src_device (src_global_device_id.value ());
602608 GlobalDeviceId dst_device (device->global_device_id ().value ());
603609
610+ tsl::profiler::TraceMe trace ([&] {
611+ return tsl::profiler::TraceMeEncode (
612+ absl::StrFormat (" PrepareReceive: src=%v dst=%v" , src_device,
613+ dst_device),
614+ {{" transfer_key" , transfer_key}});
615+ });
616+
604617 // Form the GPU clique key.
605618 // TODO(asrao, mwhittaker): Supply correct incarnations when creating the
606619 // clique key.
@@ -776,6 +789,14 @@ void StreamExecutorGpuClient::ScheduleSendsOnLocalDevice(
776789 prepared_sends.reserve (buffers.size ());
777790 tsl::RCReference<PjRtStreamExecutorDeviceEvent> usage_event;
778791
792+ tsl::profiler::TraceMe trace ([&] {
793+ return tsl::profiler::TraceMeEncode (
794+ absl::StrFormat (
795+ " [%v] StreamExecutorGpuClient::ScheduleSendsOnLocalDevice" ,
796+ device->local_device_id ()),
797+ {{" num_buffers" , buffers.size ()}});
798+ });
799+
779800 auto setup_sends = [&]() -> absl::Status {
780801 TF_ASSIGN_OR_RETURN (local_device_state, GetLocalDeviceState (device));
781802 stream = local_device_state->GetDeviceToDeviceStream ();
@@ -853,6 +874,10 @@ void StreamExecutorGpuClient::ScheduleSendsOnLocalDevice(
853874 group_futures.reserve (grouped_sends.size ());
854875
855876 for (auto & [clique_key, curr_sends] : grouped_sends) {
877+ tsl::profiler::TraceMe trace ([&k = clique_key] {
878+ return tsl::profiler::TraceMeEncode (" LaunchSend" , {{" clique" , k}});
879+ });
880+
856881 // Get the communicator on which we will execute this group of
857882 // transfers. We assume each clique key is associated with a unique
858883 // communicator, so we just take the communicator of the first
@@ -994,6 +1019,13 @@ StreamExecutorGpuClient::CrossHostReceiveBuffers(
9941019 prepared_receives.reserve (shapes.size ());
9951020 tsl::RCReference<PjRtStreamExecutorDeviceEvent> definition_event;
9961021
1022+ tsl::profiler::TraceMe trace ([&] {
1023+ return tsl::profiler::TraceMeEncode (
1024+ absl::StrFormat (" [%v] StreamExecutorGpuClient::CrossHostReceiveBuffers" ,
1025+ device->local_device_id ()),
1026+ {{" num_shapes" , shapes.size ()}});
1027+ });
1028+
9971029 auto setup_receives = [&]() -> absl::Status {
9981030 TF_ASSIGN_OR_RETURN (local_device_state, GetLocalDeviceState (device));
9991031 stream = local_device_state->GetDeviceToDeviceStream ();
@@ -1064,6 +1096,10 @@ StreamExecutorGpuClient::CrossHostReceiveBuffers(
10641096 group_futures.reserve (grouped_receives.size ());
10651097
10661098 for (auto & [clique_key, curr_receives] : grouped_receives) {
1099+ tsl::profiler::TraceMe trace ([&k = clique_key] {
1100+ return tsl::profiler::TraceMeEncode (" LaunchRecv" , {{" clique" , k}});
1101+ });
1102+
10671103 // Get the communicator on which we will execute this group of
10681104 // transfers. We assume each clique key is associated with a unique
10691105 // communicator, so we just take the communicator of the first
0 commit comments