Debug [skip ci]

UNIDY2002 · UNIDY2002 · commit f83add6dd6d2 · 2025-12-21T16:11:35.000+08:00
diff --git a/mooncake-ep/src/mooncake_backend.cpp b/mooncake-ep/src/mooncake_backend.cpp
@@ -1,4 +1,5 @@
 #include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <cuda_runtime.h>
 #include <torch/torch.h>
 #include <torch/csrc/distributed/c10d/Backend.hpp>
@@ -962,11 +963,10 @@ void MooncakeBackend::advanceSendOp(SendOpStateData& opData) {
             if (isCpu_) {
                 std::memcpy(opData.sendBuf, opData.op.tensor.data_ptr(), opData.numBytes);
             } else {
-                // Worker thread doesn't have CUDA context, so we need to set the device
-                // and use a stream from the pool
+                // Worker thread: make the tensor's device current, then issue the copy on that device's current stream
                 int deviceIndex = opData.op.tensor.device().index();
-                at::cuda::CUDAGuard guard(deviceIndex);
-                auto stream = at::cuda::getStreamFromPool(false, deviceIndex);
+                c10::cuda::CUDAGuard guard(deviceIndex);
+                auto stream = at::cuda::getCurrentCUDAStream(deviceIndex);
                 auto err = cudaMemcpyAsync(opData.sendBuf, opData.op.tensor.data_ptr(), 
                                            opData.numBytes, cudaMemcpyDeviceToDevice, stream);
                 TORCH_CHECK(!err, "P2P send cudaMemcpyAsync failed: ", cudaGetErrorString(err));
@@ -1144,11 +1144,10 @@ void MooncakeBackend::advanceRecvOp(RecvOpStateData& opData) {
             if (isCpu_) {
                 std::memcpy(opData.op.tensor.data_ptr(), opData.recvBuf, opData.numBytes);
             } else {
-                // Worker thread doesn't have CUDA context, so we need to set the device
-                // and use a stream from the pool
+                // Worker thread: make the tensor's device current, then issue the copy on that device's current stream
                 int deviceIndex = opData.op.tensor.device().index();
-                at::cuda::CUDAGuard guard(deviceIndex);
-                auto stream = at::cuda::getStreamFromPool(false, deviceIndex);
+                c10::cuda::CUDAGuard guard(deviceIndex);
+                auto stream = at::cuda::getCurrentCUDAStream(deviceIndex);
                 auto err = cudaMemcpyAsync(opData.op.tensor.data_ptr(), opData.recvBuf, 
                                            opData.numBytes, cudaMemcpyDeviceToDevice, stream);
                 TORCH_CHECK(!err, "P2P recv cudaMemcpyAsync failed: ", cudaGetErrorString(err));