Commit 05f76dc

[Runtime] Optimize CUDA multi-stream by merging the copy stream into the compute stream. (#557)
Set MERGE_COMPUTE_COPY_STREAM=true to enable the merged stream. Merging removes the synchronization cost between the compute stream and the copy stream, improving performance when multi-stream (multiple compute streams) is enabled.
1 parent 63f6b9a commit 05f76dc

16 files changed, +200 −89 lines changed
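
The commit is easiest to read with the underlying stream mechanics in mind: with a dedicated copy stream, every host-to-device copy must be ordered against the compute stream through an event, while a merged stream gets that ordering for free from stream order. A minimal sketch in plain CUDA runtime C++ (illustrative only, not the committed TensorFlow code; error handling omitted):

```cpp
// Illustrative sketch in plain CUDA runtime C++ -- not the committed
// TensorFlow code. Error handling omitted for brevity.
#include <cuda_runtime.h>

// Dedicated copy stream: the copy must be ordered against the compute
// stream explicitly, via an event record/wait pair.
void CopyWithSeparateStream(cudaStream_t compute, cudaStream_t copy,
                            void* dst, const void* src, size_t bytes) {
  cudaEvent_t ev;
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  cudaEventRecord(ev, compute);      // mark the compute stream's position
  cudaStreamWaitEvent(copy, ev, 0);  // copy stream waits for compute work
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, copy);
  cudaEventDestroy(ev);
}

// Merged stream (MERGE_COMPUTE_COPY_STREAM=true): enqueue the copy on the
// compute stream itself; stream order already serializes it, so no event
// synchronization is needed.
void CopyWithMergedStream(cudaStream_t compute,
                          void* dst, const void* src, size_t bytes) {
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, compute);
}
```

The removed record/wait pair is the synchronization cost the commit message refers to; the trade-off is that copies now queue behind kernels on the compute stream.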

Diff for: tensorflow/core/common_runtime/direct_session.cc (+17 −1)

@@ -766,6 +766,10 @@ DirectSession::DirectSession(const SessionOptions& options,
     LOG(INFO) << "Current DirectSession " << this << " will be pinned to core: " << msg;
     thread_pools_[0].first->SetThreadPoolAffinity(cpuset);
   }
+
+  tensorflow::ReadBoolFromEnvVar("MERGE_COMPUTE_COPY_STREAM",
+                                 /*default_val=*/false,
+                                 &merge_compute_and_copy_stream_);
 }

 DirectSession::~DirectSession() {
@@ -961,8 +965,16 @@ Status DirectSession::RunInternal(

   // Start parallel Executors.
   const size_t num_executors = executors_and_keys->items.size();
+  // ref_send_inputs will be filled while the graph executes.
+  std::vector<TensorReference*> ref_send_inputs;
   ExecutorBarrier* barrier = new ExecutorBarrier(
-      num_executors, run_state.rendez, [&run_state](const Status& ret) {
+      num_executors, run_state.rendez,
+      [&run_state, &ref_send_inputs](const Status& ret) {
+        VLOG(2) << "To unref buffer size: " << ref_send_inputs.size();
+        for (auto& ref : ref_send_inputs) {
+          ref->Unref();
+        }
+        ref_send_inputs.clear();
         {
           mutex_lock l(run_state.mu_);
           run_state.status.Update(ret);
@@ -994,6 +1006,10 @@ Status DirectSession::RunInternal(
     args.executor_policy = ExecutorPolicy::USE_NORMAL_EXECUTOR;
   }

+  args.ref_send_inputs_mu_ptr = std::make_unique<mutex>();
+  args.ref_send_inputs_ptr = &ref_send_inputs;
+  args.merge_compute_and_copy_stream = merge_compute_and_copy_stream_;
+
   const bool do_trace = (run_options.trace_level() > RunOptions::NO_TRACE);

   bool update_cost_model = false;
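
The ref_send_inputs plumbing above is a keep-alive: each executor takes a reference on the input buffer of a qualifying send op, and the ExecutorBarrier done callback drops all of them once every executor has finished, so the CPU tensor outlives the asynchronous copy to the GPU. A self-contained sketch of that lifetime pattern, with a stand-in RefCounted in place of tensorflow::TensorReference (names here are illustrative, not the committed API):

```cpp
// Self-contained sketch of the keep-alive pattern; RefCounted is a
// stand-in for tensorflow::TensorReference, not the real type.
#include <atomic>
#include <iostream>
#include <mutex>
#include <vector>

class RefCounted {
 public:
  void Ref() { refs_.fetch_add(1); }
  void Unref() {
    // The last Unref frees the object, just as dropping the final
    // TensorReference lets the tensor buffer be deallocated.
    if (refs_.fetch_sub(1) == 1) delete this;
  }
 private:
  ~RefCounted() { std::cout << "buffer released\n"; }
  std::atomic<int> refs_{1};
};

int main() {
  std::vector<RefCounted*> ref_send_inputs;  // filled while the graph runs
  std::mutex ref_send_inputs_mu;             // executors append concurrently

  // An executor pins a send op's input for the duration of the run.
  {
    std::lock_guard<std::mutex> l(ref_send_inputs_mu);
    ref_send_inputs.push_back(new RefCounted);
  }

  // Barrier done callback, analogous to the ExecutorBarrier lambda above:
  // once every executor has finished, drop all pinned references.
  auto done = [&] {
    std::lock_guard<std::mutex> l(ref_send_inputs_mu);
    for (auto* ref : ref_send_inputs) ref->Unref();
    ref_send_inputs.clear();
  };
  done();
}
```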

Diff for: tensorflow/core/common_runtime/direct_session.h (+4 −0)

@@ -447,6 +447,10 @@ class DirectSession : public Session {
   int multi_stream_num_ = 0;
   ResourceMgr* multi_stream_shared_rmgr_ = nullptr;

+  // The user decides whether to use the compute stream as the copy stream
+  // by setting the 'MERGE_COMPUTE_COPY_STREAM' environment variable.
+  bool merge_compute_and_copy_stream_ = false;
+
   TF_DISALLOW_COPY_AND_ASSIGN(DirectSession);

   // EXPERIMENTAL: debugger (tfdbg) related

Diff for: tensorflow/core/common_runtime/executor.cc (+33 −10)

@@ -395,6 +395,11 @@ class ExecutorState {
   bool finish_when_deferred_ops_done_ TF_GUARDED_BY(num_deferred_ops_mu_) =
       false;

+  // References to the input tensors of send ops.
+  mutex* ref_send_inputs_mu_ptr_;
+  std::vector<TensorReference*>* ref_send_inputs_ptr_;
+  bool merge_compute_and_copy_stream_;
+
   mutex mu_;
   Status status_ TF_GUARDED_BY(mu_);
 };
@@ -489,8 +494,8 @@ struct SortTaggedNode {
   SortTaggedNode(const std::vector<int64>* immutable_accumulative_cost) :
     immutable_accumulative_cost_(immutable_accumulative_cost) {}
   bool operator()(const TaggedNode& n1, const TaggedNode& n2) {
-    return (*immutable_accumulative_cost_)[n1.get_node_item().node_id] >
-           (*immutable_accumulative_cost_)[n2.get_node_item().node_id];
+    return (*immutable_accumulative_cost_)[n1.get_node_item().node->id()] >
+           (*immutable_accumulative_cost_)[n2.get_node_item().node->id()];
   }
   const std::vector<int64>* immutable_accumulative_cost_;
 };
@@ -537,7 +542,10 @@ ExecutorState<PropagatorStateType>::ExecutorState(
       sync_on_finish_(args.sync_on_finish),
       executor_policy_(args.executor_policy),
       propagator_(immutable_state, step_id_, vlog_),
-      num_outstanding_ops_(0) {
+      num_outstanding_ops_(0),
+      ref_send_inputs_mu_ptr_(args.ref_send_inputs_mu_ptr.get()),
+      ref_send_inputs_ptr_(args.ref_send_inputs_ptr),
+      merge_compute_and_copy_stream_(args.merge_compute_and_copy_stream) {
   // TODO: FIXME Consider function lib executor later
   //if (args.cost_runner == nullptr) {
   //  LOG(FATAL) << "cost_runner is nullptr, please check the args.";
@@ -668,16 +676,31 @@ Status ExecutorState<PropagatorStateType>::ProcessSync(
     NodeExecStatsInterface* stats) {
   Status s;
   OpKernelContext ctx(params, item.num_outputs);
-  nodestats::SetOpStart(stats);
-
-  ExecutorInternal::KernelStatsInfo kernel_stat_buffer;
-  kernel_stats_->StartCollectOp(&item, &kernel_stat_buffer);
-
   OpKernel* op_kernel = item.kernel;
   Device* device = immutable_state_.params().device;
   if (item.virtual_device.get() != nullptr) {
     device = item.virtual_device.get();
   }
+
+  if (merge_compute_and_copy_stream_ &&
+      (op_kernel->type_string() == "_HostSend" ||
+       (op_kernel->type_string() == "_Send" &&
+        device->parsed_name().type == "CPU")) &&
+      item.node->attrs().Find("recv_device")->s().find("GPU") != string::npos &&
+      (*params->inputs)[0].tensor->NumElements() > 0) {
+    CHECK(item.num_inputs == 1);  // a send op carries exactly one tensor
+    TensorReference* ref = new TensorReference(*((*params->inputs)[0].tensor));
+    {
+      mutex_lock l(*ref_send_inputs_mu_ptr_);
+      ref_send_inputs_ptr_->push_back(std::move(ref));
+    }
+  }
+
+  nodestats::SetOpStart(stats);
+
+  ExecutorInternal::KernelStatsInfo kernel_stat_buffer;
+  kernel_stats_->StartCollectOp(&item, &kernel_stat_buffer);
+
   const bool is_expensive = kernel_stats_->IsExpensive(item);

   if (TF_PREDICT_FALSE(MightTrace(item, event_collector_))) {
@@ -748,7 +771,7 @@ void ExecutorState<PropagatorStateType>::ProcessAsync(
   Status s = ProcessOutputs(*state->item, &state->ctx, outputs.data(), stats);
   nodestats::SetMemory(stats, &state->ctx);
   if (vlog_) {
-    VLOG(2) << "Async kernel done: " << state->item->node_id << " step "
+    VLOG(2) << "Async kernel done: " << state->item->node->id() << " step "
             << step_id_ << " " << SummarizeNodeDef(state->item->kernel->def())
             << (state->tagged_node.get_is_dead() ? " is dead" : "")
             << " device: " << device->name();
@@ -898,7 +921,7 @@ void ExecutorState<PropagatorStateType>::BatchProcess(std::vector<TaggedNode> no
   tagged_node = inline_ready.front();
   inline_ready.pop_front();
   const NodeItem& item = tagged_node.get_node_item();
-  const int id = item.node_id;
+  const int id = item.node->id();

   propagator_.MaybeMarkStarted(tagged_node);
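
The condition guarding the new block in ProcessSync only fires for host-side sends whose receiver is a GPU: a `_HostSend`, or a `_Send` whose kernel is placed on a CPU device, with a `recv_device` attribute naming a GPU and a non-empty input. A sketch of that predicate over a simplified node description (the struct is a hypothetical stand-in, not TensorFlow's NodeItem):

```cpp
#include <string>

// Illustrative stand-in for the fields consulted in ProcessSync; not
// TensorFlow's real NodeItem/OpKernel types.
struct SendNodeInfo {
  std::string op_type;      // e.g. "_Send", "_HostSend"
  std::string device_type;  // device the kernel runs on, e.g. "CPU"
  std::string recv_device;  // value of the node's "recv_device" attr
  int num_inputs;
  long long num_elements;   // elements in the (single) input tensor
};

// Mirrors the condition guarding the TensorReference in the diff: only
// host->GPU sends with a non-empty single input get their buffer pinned.
bool ShouldPinSendInput(const SendNodeInfo& n) {
  const bool host_side_send =
      n.op_type == "_HostSend" ||
      (n.op_type == "_Send" && n.device_type == "CPU");
  return host_side_send &&
         n.recv_device.find("GPU") != std::string::npos &&
         n.num_inputs == 1 &&  // a send op carries exactly one tensor
         n.num_elements > 0;
}
```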

Diff for: tensorflow/core/common_runtime/executor.h (+6 −0)

@@ -121,6 +121,12 @@ class Executor {
     CostRunner cost_runner = nullptr;

     ExecutorPolicy executor_policy = ExecutorPolicy::USE_NORMAL_EXECUTOR;
+
+    // Stores refs to CPU tensors that will be sent to the GPU; they are
+    // released when the session run finishes.
+    std::unique_ptr<mutex> ref_send_inputs_mu_ptr;
+    std::vector<TensorReference*>* ref_send_inputs_ptr = nullptr;
+    bool merge_compute_and_copy_stream = false;
   };
   typedef std::function<void(const Status&)> DoneCallback;
   virtual void RunAsync(const Args& args, DoneCallback done) = 0;

Diff for: tensorflow/core/common_runtime/gpu/gpu_device.cc (+36 −11)

@@ -698,17 +698,42 @@ Status BaseGPUDevice::MaybeCopyTensorToGPU(
     return err;
   }

-  StatusCallback wrapped_done = std::bind(
-      [to, copy](StatusCallback done_,
-                 // Begin unbound arguments.
-                 const Status& s) {
-        if (s.ok()) {
-          *to = std::move(*copy);
-        }
-        delete copy;
-        done_(s);
-      },
-      std::move(done), std::placeholders::_1);
+  StatusCallback wrapped_done;
+  if (GPUUtil::MergeComputeAndCopyStream()) {
+    TensorReference input_ref(from);
+    auto recv_host_to_device_stream = device_contexts_[0]->stream();
+    auto event_mgr = em_;
+    wrapped_done = std::bind(
+        [to, copy, recv_host_to_device_stream, event_mgr, input_ref](
+            StatusCallback done_,
+            // Begin unbound arguments.
+            const Status& s) {
+          event_mgr->ThenExecute(
+              recv_host_to_device_stream,
+              [to, copy, recv_host_to_device_stream, done_, &s, input_ref]() {
+                input_ref.Unref();
+                if (!recv_host_to_device_stream->ok()) {
+                  LOG(FATAL) << "CPU->GPU Memcpy failed";
+                }
+                *to = std::move(*copy);
+                delete copy;
+                done_(s);
+              });
+        },
+        std::move(done), std::placeholders::_1);
+  } else {
+    wrapped_done = std::bind(
+        [to, copy](StatusCallback done_,
+                   // Begin unbound arguments.
+                   const Status& s) {
+          if (s.ok()) {
+            *to = std::move(*copy);
+          }
+          delete copy;
+          done_(s);
+        },
+        std::move(done), std::placeholders::_1);
+  }

   tracing::ScopedAnnotation annotation("MakeTensorFromProto");
   device_contexts_[0]->CopyCPUTensorToDevice(
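
In the merged-stream branch above, wrapped_done cannot publish the tensor as soon as the copy is enqueued; EventMgr::ThenExecute defers the work until the compute stream has drained, and only then is `*to` assigned and the reference released. The raw CUDA analogue of that deferral is a stream host callback; a minimal sketch (plain CUDA runtime C++, not EventMgr itself):

```cpp
// Minimal sketch in plain CUDA runtime C++ of "run host code only after
// the stream has drained" -- the role EventMgr::ThenExecute plays above.
#include <cstdio>
#include <cuda_runtime.h>

// Invoked by the CUDA runtime once all work previously enqueued on the
// stream (including the async copy below) has completed.
void CopyDone(void* user_data) {
  std::printf("H2D copy finished; safe to publish %s\n",
              static_cast<const char*>(user_data));
}

int main() {
  cudaStream_t compute;
  cudaStreamCreate(&compute);

  float host_src[4] = {1, 2, 3, 4};
  float* dev_dst = nullptr;
  cudaMalloc(&dev_dst, sizeof(host_src));

  // Copy on the compute stream, then defer the "done" work behind it.
  cudaMemcpyAsync(dev_dst, host_src, sizeof(host_src),
                  cudaMemcpyHostToDevice, compute);
  static char tag[] = "the tensor";
  cudaLaunchHostFunc(compute, CopyDone, tag);

  cudaStreamSynchronize(compute);
  cudaFree(dev_dst);
  cudaStreamDestroy(compute);
  return 0;
}
```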

Diff for: tensorflow/core/common_runtime/gpu/gpu_util.cc (+66 −32)

@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/util.h"

 // IMPLEMENTATION NOTE:
@@ -111,6 +112,23 @@ void* GetBase(const Tensor* src) {

 void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }

+/*static*/
+bool GPUUtil::MergeComputeAndCopyStream() {
+  static bool merge = false;
+  static bool check_setting = true;
+  if (check_setting) {
+    static mutex mu;
+    mutex_lock l(mu);
+    if (check_setting) {
+      tensorflow::ReadBoolFromEnvVar("MERGE_COMPUTE_COPY_STREAM",
+                                     /*default_val=*/false, &merge);
+      check_setting = false;
+    }
+  }
+
+  return merge;
+}
+
 /*static*/
 void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
                               const DeviceContext* device_context,
@@ -273,16 +291,22 @@ void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
     return;
   }

-  auto send_device_to_host_stream =
-      static_cast<const GPUDeviceContext*>(device_context)
-          ->device_to_host_stream();
-  if (send_device_to_host_stream == nullptr) {
-    done(errors::Internal("No send gpu copy-out-stream is available."));
-    return;
-  }
-  // Wait for the sender's main stream to make sure the data are available.
-  if (send_device_to_host_stream != send_stream) {
-    send_device_to_host_stream->ThenWaitFor(send_stream);
+  se::Stream* send_device_to_host_stream = nullptr;
+  if (MergeComputeAndCopyStream()) {
+    send_device_to_host_stream = send_stream;
+  } else {
+    send_device_to_host_stream =
+        static_cast<const GPUDeviceContext*>(device_context)
+            ->device_to_host_stream();
+    if (send_device_to_host_stream == nullptr) {
+      done(errors::Internal("No send gpu copy-out-stream is available."));
+      return;
+    }
+
+    // Wait for the sender's main stream to make sure the data are available.
+    if (send_device_to_host_stream != send_stream) {
+      send_device_to_host_stream->ThenWaitFor(send_stream);
+    }
   }

   const int64 total_bytes = gpu_tensor->TotalBytes();
@@ -320,17 +344,22 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
     return;
   }

-  auto recv_host_to_device_stream =
-      static_cast<const GPUDeviceContext*>(device_context)
-          ->host_to_device_stream();
-  if (recv_host_to_device_stream == nullptr) {
-    done(errors::Internal("No send gpu copy-out-stream is available."));
-    return;
-  }
-  // Wait for the recv-stream to make sure the buffer is truly available.
-  if (sync_dst_compute) {
-    if (recv_host_to_device_stream != recv_stream) {
-      recv_host_to_device_stream->ThenWaitFor(recv_stream);
+  se::Stream* recv_host_to_device_stream = nullptr;
+  if (MergeComputeAndCopyStream()) {
+    recv_host_to_device_stream = recv_stream;
+  } else {
+    recv_host_to_device_stream =
+        static_cast<const GPUDeviceContext*>(device_context)
+            ->host_to_device_stream();
+    if (recv_host_to_device_stream == nullptr) {
+      done(errors::Internal("No send gpu copy-out-stream is available."));
+      return;
+    }
+    // Wait for the recv-stream to make sure the buffer is truly available.
+    if (sync_dst_compute) {
+      if (recv_host_to_device_stream != recv_stream) {
+        recv_host_to_device_stream->ThenWaitFor(recv_stream);
+      }
     }
   }

@@ -342,17 +371,22 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
     DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
     recv_host_to_device_stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes);
   }
-  // Use of cpu_tensor may outlive stack scope, so keep a ref.
-  TensorReference input_ref(*cpu_tensor);
-  dev_info->event_mgr->ThenExecute(
-      recv_host_to_device_stream,
-      [recv_host_to_device_stream, done, input_ref]() {
-        input_ref.Unref();
-        if (!recv_host_to_device_stream->ok()) {
-          LOG(FATAL) << "CPU->GPU Memcpy failed";
-        }
-        done(Status::OK());
-      });
+
+  if (MergeComputeAndCopyStream()) {
+    done(Status::OK());
+  } else {
+    // Use of cpu_tensor may outlive stack scope, so keep a ref.
+    TensorReference input_ref(*cpu_tensor);
+    dev_info->event_mgr->ThenExecute(
+        recv_host_to_device_stream,
+        [recv_host_to_device_stream, done, input_ref]() {
+          input_ref.Unref();
+          if (!recv_host_to_device_stream->ok()) {
+            LOG(FATAL) << "CPU->GPU Memcpy failed";
+          }
+          done(Status::OK());
+        });
+  }
 }

 Status GPUUtil::Sync(Device* gpu_device) {
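
GPUUtil::MergeComputeAndCopyStream() caches the environment variable behind a hand-rolled double-checked lock so the parse happens only once, off the hot copy path. The same shape in standard C++, using std::call_once in place of the TF mutex (a sketch; the accepted value spellings below are an assumption, ReadBoolFromEnvVar does its own parsing):

```cpp
#include <cstdlib>
#include <cstring>
#include <mutex>

// Reads MERGE_COMPUTE_COPY_STREAM once and caches the result, mirroring
// the double-checked pattern in GPUUtil::MergeComputeAndCopyStream().
bool MergeComputeAndCopyStream() {
  static bool merge = false;
  static std::once_flag parsed;
  std::call_once(parsed, [] {
    const char* v = std::getenv("MERGE_COMPUTE_COPY_STREAM");
    // Assumed spellings for "enabled"; TF's ReadBoolFromEnvVar parses more.
    merge = v != nullptr &&
            (std::strcmp(v, "true") == 0 || std::strcmp(v, "1") == 0);
  });
  return merge;
}
```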

Diff for: tensorflow/core/common_runtime/gpu/gpu_util.h (+4 −0)

@@ -105,6 +105,10 @@ class GPUUtil {
                                  const Tensor* src_gpu_tensor,
                                  Tensor* dst_gpu_tensor,
                                  StatusCallback done);
+
+  // The user decides whether to use the compute stream as the copy stream
+  // by setting the 'MERGE_COMPUTE_COPY_STREAM' environment variable.
+  static bool MergeComputeAndCopyStream();
 };

 }  // namespace tensorflow

Diff for: tensorflow/core/common_runtime/graph_view.cc (+1 −1)

@@ -36,7 +36,7 @@ limitations under the License.
 namespace tensorflow {

 string NodeItem::DebugString() const {
-  string ret = strings::StrCat("{name:'", kernel->name(), "' id:", node_id);
+  string ret = strings::StrCat("{name:'", kernel->name(), "' id:", node->id());
   if (is_source) {
     strings::StrAppend(&ret, " source}");
   } else {

Diff for: tensorflow/core/common_runtime/graph_view.h (+1 −2)

@@ -58,8 +58,7 @@ struct ControlEdgeInfo {
 //
 // Each NodeItem is an element of exactly one GraphView.
 struct NodeItem {
-  // The index of this node's item in its GraphView.
-  int node_id = -1;
+  const Node* node = nullptr;

   // Cached attributes of this node for fast lookup.
   bool kernel_is_async : 1;  // True iff kernel->AsAsync() != nullptr

Diff for: tensorflow/core/common_runtime/immutable_executor_state.cc (+1 −1)

@@ -126,7 +126,7 @@ Status ImmutableExecutorState::Initialize() {
     FrameInfo* frame_info = EnsureFrameInfo(frame_name);

     NodeItem* item = gview_.node(id);
-    item->node_id = id;
+    item->node = n;

     item->input_start = frame_info->total_inputs;
     frame_info->total_inputs += n->num_inputs();

Diff for: tensorflow/core/common_runtime/immutable_executor_state.h (+1 −1)

@@ -103,7 +103,7 @@ class ImmutableExecutorState {

   const FrameInfo& get_enter_frame_info(const NodeItem& node_item) const {
     DCHECK(node_item.is_enter);
-    return *enter_frame_info_[node_item.node_id];
+    return *enter_frame_info_[node_item.node->id()];
   }

   bool requires_control_flow_support() const { return requires_control_flow_; }
