Skip to content

Commit 3fb2f1e

Browse files
Fix XPU CPU-view tensor lifetime (vllm-project#262)
Signed-off-by: chaojun-zhang <chaojun.zhang@intel.com>
Co-authored-by: Harish Subramony <harish.subramony@intel.com>
1 parent 9ff7faa commit 3fb2f1e

2 files changed

Lines changed: 44 additions & 11 deletions

File tree

csrc/xpu_view.cpp

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "ops.h"
33
#include <c10/core/Device.h>
44
#include <c10/xpu/XPUFunctions.h>
5+
#include <memory>
56

67
namespace vllm::xpu {
78

@@ -17,13 +18,17 @@ namespace vllm::xpu {
1718

1819
class XPUHostViewAllocator : public c10::Allocator {
1920
public:
21+
struct OwnerContext {
22+
torch::Tensor owner;
23+
};
24+
2025
/**
2126
* @brief Constructor
2227
* @param host_ptr Pre-allocated host memory pointer
2328
* @param size Size of the host memory (in bytes)
2429
*/
25-
XPUHostViewAllocator(void* host_ptr, size_t size)
26-
: host_ptr_(host_ptr), size_(size) {}
30+
XPUHostViewAllocator(void* host_ptr, size_t size, torch::Tensor owner)
31+
: host_ptr_(host_ptr), size_(size), owner_(std::move(owner)) {}
2732

2833
/**
2934
* @brief Allocate memory (actually just validates and wraps existing host
@@ -36,15 +41,20 @@ class XPUHostViewAllocator : public c10::Allocator {
3641
// Verify requested memory size doesn't exceed pre-allocated memory size
3742
TORCH_CHECK(
3843
n <= size_, "Requested size exceeds allocated host pointer size");
39-
// Return wrapped data pointer with no-op deleter since memory is externally
40-
// managed
44+
// Use unique_ptr for RAII: if current_device() or DataPtr construction
45+
// throws, the OwnerContext is automatically cleaned up instead of leaked.
46+
auto ctx = std::make_unique<OwnerContext>(OwnerContext{owner_});
4147
auto device_id = c10::xpu::current_device();
42-
return {
43-
host_ptr_, // Actual data pointer
44-
host_ptr_, // Context pointer (same as data pointer here)
45-
[](void*) {}, // No-op deleter, doesn't actually free memory
46-
c10::Device(c10::DeviceType::XPU, device_id) // Device type set to XPU
47-
};
48+
49+
c10::DataPtr data_ptr{
50+
host_ptr_,
51+
ctx.get(),
52+
[](void* ptr) { delete static_cast<OwnerContext*>(ptr); },
53+
c10::Device(c10::DeviceType::XPU, device_id)};
54+
55+
// DataPtr now owns the context via its deleter — release from unique_ptr.
56+
ctx.release();
57+
return data_ptr;
4858
}
4959

5060
/**
@@ -71,6 +81,7 @@ class XPUHostViewAllocator : public c10::Allocator {
7181
private:
7282
void* const host_ptr_; // Pre-allocated host memory pointer
7383
const size_t size_; // Size of pre-allocated memory
84+
torch::Tensor owner_; // Keeps pinned host storage alive
7485
};
7586
} // namespace vllm::xpu
7687

@@ -92,7 +103,8 @@ torch::Tensor get_xpu_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
92103
auto scalar_type = cpu_tensor.scalar_type();
93104

94105
size_t byte_size = cpu_tensor.numel() * cpu_tensor.element_size();
95-
vllm::xpu::XPUHostViewAllocator allocator(host_ptr, byte_size);
106+
// Keep `cpu_tensor` storage alive through the view tensor's lifetime.
107+
vllm::xpu::XPUHostViewAllocator allocator(host_ptr, byte_size, cpu_tensor);
96108
c10::DataPtr data_ptr = allocator.allocate(byte_size);
97109
c10::Storage storage(
98110
c10::Storage::use_byte_size_t(), byte_size, std::move(data_ptr));

tests/test_uva.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
import gc
34

45
import pytest
56
import torch
@@ -69,3 +70,23 @@ def test_gpu_write(device):
6970
assert cpu_tensor[0, 0] == 2
7071
assert cpu_tensor[2, 3] == 4
7172
assert cpu_tensor[4, 5] == -2
73+
74+
75+
@pytest.mark.parametrize("device", XPU_DEVICES)
76+
def test_view_lifetime_after_owner_drop(device):
77+
torch.set_default_device(device)
78+
cpu_tensor = torch.arange(100,
79+
dtype=torch.int32,
80+
device="cpu",
81+
pin_memory=True).view(10, 10)
82+
xpu_view = torch.ops._C.get_xpu_view_from_cpu_tensor(cpu_tensor)
83+
84+
# Drop the original owner reference and force Python GC.
85+
del cpu_tensor
86+
gc.collect()
87+
88+
# Exercise both read and write from the XPU view after owner drop.
89+
assert xpu_view[2, 3].item() == 23
90+
xpu_view.add_(1)
91+
assert xpu_view[0, 0].item() == 1
92+
assert xpu_view[9, 9].item() == 100

0 commit comments

Comments (0)