diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b29c5b..3c479cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,8 +87,12 @@
 set(TORCH_LIBRARIES "")
 file(GLOB_RECURSE TORCH_LIBRARIES "${TORCH_DIR}/lib/*.so"
      "${TORCH_DIR}/lib/*.a")
+find_package(CUDAToolkit QUIET)
 set(TORCH_INCLUDE_DIR "${TORCH_DIR}/include"
     "${TORCH_DIR}/include/torch/csrc/api/include/")
+if(CUDAToolkit_FOUND)
+  list(APPEND TORCH_INCLUDE_DIR "${CUDAToolkit_INCLUDE_DIRS}")
+endif()
 set(TORCH_TARGET_FOLDER ${CMAKE_BINARY_DIR}/torch)
 set(BIN_PREFIX "torch_")
 
@@ -120,11 +124,18 @@
 set(PADDLE_INCLUDE_DIR
     "${PADDLE_DIR}/include/paddle/phi/api/include/compat/"
     "${PADDLE_DIR}/include/paddle/phi/api/include/compat/torch/csrc/api/include/"
 )
+if(CUDAToolkit_FOUND)
+  list(APPEND PADDLE_INCLUDE_DIR "${CUDAToolkit_INCLUDE_DIRS}")
+  list(APPEND PADDLE_INCLUDE_DIR "${CUDAToolkit_INCLUDE_DIRS}/cccl")
+endif()
 set(PADDLE_LIBRARIES "${PADDLE_DIR}/base/libpaddle.so"
     "${PADDLE_DIR}/libs/libcommon.so"
     "${PADDLE_DIR}/libs/libphi.so"
     "${PADDLE_DIR}/libs/libphi_core.so"
     "${PADDLE_DIR}/libs/libphi_gpu.so")
+if(CUDAToolkit_FOUND)
+  list(APPEND PADDLE_LIBRARIES CUDA::cudart)
+endif()
 link_directories("${PADDLE_DIR}/base")
 link_directories("${PADDLE_DIR}/libs")
diff --git a/cmake/build.cmake b/cmake/build.cmake
index 2bcaf4e..e6fe44a 100644
--- a/cmake/build.cmake
+++ b/cmake/build.cmake
@@ -24,6 +24,20 @@ function(
   target_compile_definitions(${_test_name} PRIVATE
                              USE_PADDLE_API=${USE_PADDLE_API})
   message(STATUS "USE_PADDLE_API: ${USE_PADDLE_API}")
+  if(USE_PADDLE_API AND CUDAToolkit_FOUND)
+    target_compile_definitions(${_test_name} PRIVATE PADDLE_WITH_CUDA)
+  endif()
+  if(NOT USE_PADDLE_API)
+    # libtorch_cuda.so registers CUDA hooks via static initializers. Linux's
+    # --as-needed would normally strip it from DT_NEEDED since no symbols are
+    # directly referenced; force-load it with --no-as-needed.
+    foreach(_dep_lib ${DEPS_LIBRARIES})
+      if("${_dep_lib}" MATCHES "libtorch_cuda\\.so$")
+        target_link_libraries(${_test_name}
+          "-Wl,--no-as-needed,${_dep_lib},--as-needed")
+      endif()
+    endforeach()
+  endif()
   add_test(NAME ${_test_name} COMMAND ${_test_name})
   set_tests_properties(${_test_name} PROPERTIES TIMEOUT 5)
   set_target_properties(${_test_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY
diff --git a/test/RecordStreamTest.cpp b/test/RecordStreamTest.cpp
new file mode 100644
index 0000000..c0704c9
--- /dev/null
+++ b/test/RecordStreamTest.cpp
@@ -0,0 +1,247 @@
+#include <gtest/gtest.h>
+#include <exception>
+#include <string>
+#include <ATen/ATen.h>
+#include <c10/core/Device.h>
+#include <c10/core/Stream.h>
+#include <torch/torch.h>
+
+#include <c10/cuda/CUDAStream.h>
+#include <torch/cuda.h>
+
+#include "../src/file_manager.h"
+
+extern paddle_api_test::ThreadSafeParam g_custom_param;
+
+namespace at {
+namespace test {
+
+using paddle_api_test::FileManerger;
+using paddle_api_test::ThreadSafeParam;
+
+class RecordStreamTest : public ::testing::Test {
+ protected:
+  void SetUp() override { cpu_tensor = at::zeros({2, 3}, at::kFloat); }
+  at::Tensor cpu_tensor;
+};
+
+static at::Stream get_default_cuda_stream() {
+  return c10::cuda::getCurrentCUDAStream(0);
+}
+
+// --- Basic functionality: CUDA tensor + CUDA stream ---
+
+// kFloat, shape {2,3} (small)
+TEST_F(RecordStreamTest, CudaFloat2x3) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.createFile();
+  file << "CudaFloat2x3 ";
+  if (!torch::cuda::is_available()) {
+    file << "no_cuda";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  try {
+    at::Tensor t = cpu_tensor.cuda();
+    at::Stream stream = get_default_cuda_stream();
+    t.record_stream(stream);
+    file << "1";
+  } catch (const std::exception& e) {
+    file << "exception";
+  }
+  file << "\n";
+  file.saveFile();
+}
+
+// kDouble, shape {4} (small, different dtype)
+TEST_F(RecordStreamTest, CudaDouble4) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "CudaDouble4 ";
+  if (!torch::cuda::is_available()) {
+    file << "no_cuda";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  try {
+    at::Tensor t = at::zeros({4}, at::kDouble).cuda();
+    at::Stream stream = get_default_cuda_stream();
+    t.record_stream(stream);
+    file << "1";
+  } catch (const std::exception& e) {
+    file << "exception";
+  }
+  file << "\n";
+  file.saveFile();
+}
+
+// kInt, shape {100,100} (large, >= 10000 elements)
+TEST_F(RecordStreamTest, CudaInt100x100) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "CudaInt100x100 ";
+  if (!torch::cuda::is_available()) {
+    file << "no_cuda";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  try {
+    at::Tensor t = at::zeros({100, 100}, at::kInt).cuda();
+    at::Stream stream = get_default_cuda_stream();
+    t.record_stream(stream);
+    file << "1";
+  } catch (const std::exception& e) {
+    file << "exception";
+  }
+  file << "\n";
+  file.saveFile();
+}
+
+// kLong, shape {} (0-d scalar tensor)
+TEST_F(RecordStreamTest, CudaLongScalar) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "CudaLongScalar ";
+  if (!torch::cuda::is_available()) {
+    file << "no_cuda";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  try {
+    at::Tensor t = at::zeros({}, at::kLong).cuda();
+    at::Stream stream = get_default_cuda_stream();
+    t.record_stream(stream);
+    file << "1";
+  } catch (const std::exception& e) {
+    file << "exception";
+  }
+  file << "\n";
+  file.saveFile();
+}
+
+// kFloat, shape {0} (empty tensor, boundary shape)
+TEST_F(RecordStreamTest, CudaEmptyShape) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "CudaEmptyShape ";
+  if (!torch::cuda::is_available()) {
+    file << "no_cuda";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  try {
+    at::Tensor t = at::zeros({0}, at::kFloat).cuda();
+    at::Stream stream = get_default_cuda_stream();
+    t.record_stream(stream);
+    file << "1";
+  } catch (const std::exception& e) {
+    file << "exception";
+  }
+  file << "\n";
+  file.saveFile();
+}
+
+// kFloat, shape {1,1,1} (all-ones dimensions, boundary shape)
+TEST_F(RecordStreamTest, CudaAllOnes) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "CudaAllOnes ";
+  if (!torch::cuda::is_available()) {
+    file << "no_cuda";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  try {
+    at::Tensor t = at::zeros({1, 1, 1}, at::kFloat).cuda();
+    at::Stream stream = get_default_cuda_stream();
+    t.record_stream(stream);
+    file << "1";
+  } catch (const std::exception& e) {
+    file << "exception";
+  }
+  file << "\n";
+  file.saveFile();
+}
+
+// kFloat, non-contiguous tensor (via transpose)
+TEST_F(RecordStreamTest, CudaNonContiguous) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "CudaNonContiguous ";
+  if (!torch::cuda::is_available()) {
+    file << "no_cuda";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  try {
+    at::Tensor base = at::zeros({3, 4}, at::kFloat).cuda();
+    at::Tensor t = base.transpose(0, 1);  // non-contiguous
+    at::Stream stream = get_default_cuda_stream();
+    t.record_stream(stream);
+    file << "1";
+  } catch (const std::exception& e) {
+    file << "exception";
+  }
+  file << "\n";
+  file.saveFile();
+}
+
+// --- Error path: CPU tensor + CUDA stream (when CUDA is present) ---
+// How record_stream treats a CPU tensor under both frameworks
+TEST_F(RecordStreamTest, CpuTensorCudaStream) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "CpuTensorCudaStream ";
+  if (!torch::cuda::is_available()) {
+    file << "no_cuda";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  try {
+    at::Stream stream = get_default_cuda_stream();
+    cpu_tensor.record_stream(stream);
+    file << "1";
+  } catch (const std::exception& e) {
+    file << "exception";
+  }
+  file << "\n";
+  file.saveFile();
+}
+
+// --- Error path: CPU tensor + CPU stream (no CUDA dependency) ---
+// record_stream is a CUDA-only API; a CPU stream should trigger an exception
+TEST_F(RecordStreamTest, CpuTensorCpuStream) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "CpuTensorCpuStream ";
+  c10::Stream stream(c10::Stream::DEFAULT,
+                     c10::Device(c10::DeviceType::CPU, 0));
+  try {
+    cpu_tensor.record_stream(stream);
+    file << "1";
+  } catch (const std::exception& e) {
+    file << "exception";
+  }
+  file << "\n";
+  file.saveFile();
+}
+
+}  // namespace test
+}  // namespace at