Commit 02dcf1d

resolve comments
1 parent a6a1a00 commit 02dcf1d

11 files changed: +93 -69 lines changed

README.md

Lines changed: 8 additions & 11 deletions
@@ -1418,13 +1418,10 @@ will be automatically deallocated. This can increase the number of BLS requests
 that you can execute in your model without running into the out of GPU or
 shared memory error.
 
-Starting from the 25.04 release, you can use the `infer_responses.cancel()` function
-on a BLS decoupled response iterator to stop the response stream, which cancels
-the request to the decoupled model. This is useful for stopping long inference
-requests, such as those from auto-generative large language models, which may
-run for an indeterminate amount of time and consume significant server resources.
-The response iterator can be generated from `infer_request.exec(decoupled=True)`
-and `infer_request.async_exec(decoupled=True)` functions:
+### Cancelling decoupled BLS requests
+A decoupled BLS inference request may be cancelled by calling the `cancel()`
+method on the response iterator returned from the method executing the BLS
+inference request. For example,
 
 ```python
 import triton_python_backend_utils as pb_utils
@@ -1433,12 +1430,12 @@ class TritonPythonModel:
     ...
     def execute(self, requests):
         ...
-        inference_request = pb_utils.InferenceRequest(
+        infer_request = pb_utils.InferenceRequest(
             model_name='model_name',
             requested_output_names=['REQUESTED_OUTPUT'],
             inputs=[<pb_utils.Tensor object>])
 
-        # Execute the inference_request and wait for the response. Here we are
+        # Execute the infer_request and wait for the response. Here we are
         # running a BLS request on a decoupled model, hence setting the parameter
         # 'decoupled' to 'True'.
         infer_responses = infer_request.exec(decoupled=True)
@@ -1449,14 +1446,14 @@ class TritonPythonModel:
             # vLLM backend uses the CANCELLED error code when a request is cancelled.
             # TensorRT-LLM backend does not use error codes; instead, it sends the
             # TRITONSERVER_RESPONSE_COMPLETE_FINAL flag to the iterator.
-            if inference_response.has_error():
+            if infer_response.has_error():
                 if infer_response.error().code() == pb_utils.TritonError.CANCELLED:
                     print("request has been cancelled.")
                 break
 
             # Collect the output tensor from the model's response
             output = pb_utils.get_output_tensor_by_name(
-                inference_response, 'REQUESTED_OUTPUT')
+                infer_response, 'REQUESTED_OUTPUT')
             response_tensors_received.append(output)
 
             # Check if we have received enough inference output tensors

src/infer_payload.cc

Lines changed: 32 additions & 0 deletions
@@ -32,6 +32,7 @@ InferPayload::InferPayload(
     const bool is_decoupled,
     std::function<void(std::unique_ptr<InferResponse>)> callback)
     : is_decoupled_(is_decoupled), is_promise_set_(false), callback_(callback),
+      is_request_deleted_(false),
       request_address_(reinterpret_cast<intptr_t>(nullptr))
 {
   promise_.reset(new std::promise<std::unique_ptr<InferResponse>>());
@@ -104,4 +105,35 @@ InferPayload::GetRequestAddress()
   return request_address_;
 }
 
+void
+InferPayload::SetRequestDeleted()
+{
+  std::unique_lock<std::mutex> lock(request_deletion_mutex_);
+  is_request_deleted_ = true;
+}
+
+void
+InferPayload::SetRequestCancellationFunc(
+    const std::function<void(intptr_t)>& request_cancel_func)
+{
+  request_cancel_func_ = request_cancel_func;
+}
+
+void
+InferPayload::SafeCancelRequest()
+{
+  std::unique_lock<std::mutex> lock(request_deletion_mutex_);
+  if (is_request_deleted_) {
+    return;
+  }
+
+  if (request_address_ == 0L) {
+    return;
+  }
+
+  if (request_cancel_func_) {
+    request_cancel_func_(request_address_);
+  }
+}
+
 }}} // namespace triton::backend::python

src/infer_payload.h

Lines changed: 7 additions & 0 deletions
@@ -62,8 +62,12 @@ class InferPayload : public std::enable_shared_from_this<InferPayload> {
   void SetResponseAllocUserp(
       const ResponseAllocatorUserp& response_alloc_userp);
   std::shared_ptr<ResponseAllocatorUserp> ResponseAllocUserp();
+  void SetRequestDeleted();
   void SetRequestAddress(intptr_t request_address);
   intptr_t GetRequestAddress();
+  void SetRequestCancellationFunc(
+      const std::function<void(intptr_t)>& request_cancel_func);
+  void SafeCancelRequest();
 
  private:
   std::unique_ptr<std::promise<std::unique_ptr<InferResponse>>> promise_;
@@ -72,7 +76,10 @@ class InferPayload : public std::enable_shared_from_this<InferPayload> {
   bool is_promise_set_;
   std::function<void(std::unique_ptr<InferResponse>)> callback_;
   std::shared_ptr<ResponseAllocatorUserp> response_alloc_userp_;
+  std::mutex request_deletion_mutex_;
+  bool is_request_deleted_;
   intptr_t request_address_;
+  std::function<void(intptr_t)> request_cancel_func_;
 };
 
 }}} // namespace triton::backend::python
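
Taken together, the new members give each `InferPayload` a self-contained cancellation path: the backend registers a cancel function when the request is created (see `request_executor.cc` below), the request's release callback calls `SetRequestDeleted()`, and `SafeCancelRequest()` only invokes the cancel function while the request is still alive. Below is a minimal standalone sketch of that coordination, with the Triton types replaced by a toy `Payload` class; it is illustrative only, not the backend's actual classes.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>

// Toy stand-in for InferPayload's new cancellation bookkeeping.
class Payload {
 public:
  void SetRequestAddress(intptr_t request_address)
  {
    request_address_ = request_address;
  }

  void SetRequestCancellationFunc(const std::function<void(intptr_t)>& func)
  {
    request_cancel_func_ = func;
  }

  // Called from the request release callback, before the request is freed.
  void SetRequestDeleted()
  {
    std::unique_lock<std::mutex> lock(request_deletion_mutex_);
    is_request_deleted_ = true;
  }

  // Called when the stub asks to cancel; a no-op once the request is released.
  void SafeCancelRequest()
  {
    std::unique_lock<std::mutex> lock(request_deletion_mutex_);
    if (is_request_deleted_ || request_address_ == 0 || !request_cancel_func_) {
      return;
    }
    request_cancel_func_(request_address_);
  }

 private:
  std::mutex request_deletion_mutex_;
  bool is_request_deleted_ = false;
  intptr_t request_address_ = 0;
  std::function<void(intptr_t)> request_cancel_func_;
};

int main()
{
  Payload payload;
  payload.SetRequestAddress(0x1234);  // pretend request handle
  payload.SetRequestCancellationFunc(
      [](intptr_t addr) { std::cout << "cancelling request " << addr << "\n"; });

  payload.SafeCancelRequest();  // cancels: the request is still alive
  payload.SetRequestDeleted();  // the release callback has run
  payload.SafeCancelRequest();  // no-op: the request has been released
  return 0;
}
```

Holding `request_deletion_mutex_` across both the deleted-flag check and the cancel call is what keeps the cancellation from racing with the request's release.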

src/ipc_message.h

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ typedef enum PYTHONSTUB_commandtype_enum {
   PYTHONSTUB_UnloadModelRequest,
   PYTHONSTUB_ModelReadinessRequest,
   PYTHONSTUB_IsRequestCancelled,
-  PYTHONSTUB_CancelBLSDecoupledInferRequest
+  PYTHONSTUB_CancelBLSInferRequest
 } PYTHONSTUB_CommandType;
 
 ///

src/pb_bls_cancel.cc

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ PbBLSCancel::Cancel()
       return;
     }
 
-    stub->EnqueueCancelBLSDecoupledRequest(this);
+    stub->EnqueueCancelBLSRequest(this);
     updating_ = true;
   }
   cv_.wait(lk, [this] { return !updating_; });

src/pb_stub.cc

Lines changed: 5 additions & 6 deletions
@@ -1137,9 +1137,8 @@ Stub::ServiceStubToParentRequests()
         utils_msg_payload->command_type == PYTHONSTUB_IsRequestCancelled) {
       SendIsCancelled(utils_msg_payload);
     } else if (
-        utils_msg_payload->command_type ==
-        PYTHONSTUB_CancelBLSDecoupledInferRequest) {
-      SendCancelBLSDecoupledRequest(utils_msg_payload);
+        utils_msg_payload->command_type == PYTHONSTUB_CancelBLSInferRequest) {
+      SendCancelBLSRequest(utils_msg_payload);
     } else {
       std::cerr << "Error when sending message via stub_to_parent message "
                    "buffer - unknown command\n";
@@ -1226,7 +1225,7 @@ Stub::EnqueueCleanupId(void* id, const PYTHONSTUB_CommandType& command_type)
 }
 
 void
-Stub::SendCancelBLSDecoupledRequest(
+Stub::SendCancelBLSRequest(
     std::unique_ptr<UtilsMessagePayload>& utils_msg_payload)
 {
   PbBLSCancel* pb_bls_cancel =
@@ -1256,11 +1255,11 @@ Stub::SendCancelBLSDecoupledRequest(
 }
 
 void
-Stub::EnqueueCancelBLSDecoupledRequest(PbBLSCancel* pb_bls_cancel)
+Stub::EnqueueCancelBLSRequest(PbBLSCancel* pb_bls_cancel)
 {
   std::unique_ptr<UtilsMessagePayload> utils_msg_payload =
       std::make_unique<UtilsMessagePayload>(
-          PYTHONSTUB_CancelBLSDecoupledInferRequest,
+          PYTHONSTUB_CancelBLSInferRequest,
           reinterpret_cast<void*>(pb_bls_cancel));
   EnqueueUtilsMessage(std::move(utils_msg_payload));
 }

src/pb_stub.h

Lines changed: 2 additions & 2 deletions
@@ -322,12 +322,12 @@ class Stub {
   void EnqueueCleanupId(void* id, const PYTHONSTUB_CommandType& command_type);
 
   /// Send the id to the python backend for object cleanup
-  void SendCancelBLSDecoupledRequest(
+  void SendCancelBLSRequest(
       std::unique_ptr<UtilsMessagePayload>& utils_msg_payload);
 
   /// Add infer payload id to queue. This is used for retrieving the request
   /// address from the infer_payload
-  void EnqueueCancelBLSDecoupledRequest(PbBLSCancel* pb_bls_cancel);
+  void EnqueueCancelBLSRequest(PbBLSCancel* pb_bls_cancel);
 
   /// Add request cancellation query to queue
   void EnqueueIsCancelled(PbCancel* pb_cancel);

src/python_be.cc

Lines changed: 4 additions & 4 deletions
@@ -765,8 +765,8 @@ ModelInstanceState::StubToParentMQMonitor()
         boost::asio::post(*thread_pool_, std::move(task));
         break;
       }
-      case PYTHONSTUB_CancelBLSDecoupledInferRequest: {
-        ProcessCancelBLSDecoupledRequest(message);
+      case PYTHONSTUB_CancelBLSInferRequest: {
+        ProcessCancelBLSRequest(message);
         break;
       }
       default: {
@@ -860,7 +860,7 @@ ModelInstanceState::ProcessCleanupRequest(
 }
 
 void
-ModelInstanceState::ProcessCancelBLSDecoupledRequest(
+ModelInstanceState::ProcessCancelBLSRequest(
     const std::unique_ptr<IPCMessage>& message)
 {
   AllocatedSharedMemory<CancelBLSRequestMessage> message_shm =
@@ -876,7 +876,7 @@ ModelInstanceState::ProcessCancelBLSDecoupledRequest(
   {
     std::lock_guard<std::mutex> lock(infer_payload_mu_);
     if (infer_payload_.find(id) != infer_payload_.end()) {
-      request_executor_->Cancel(infer_payload_[id]);
+      infer_payload_[id]->SafeCancelRequest();
    }
   }
   message_payload->is_cancelled = true;

src/python_be.h

Lines changed: 2 additions & 3 deletions
@@ -403,9 +403,8 @@ class ModelInstanceState : public BackendModelInstance {
   // Process the decoupled cleanup request for InferPayload and ResponseFactory
   void ProcessCleanupRequest(const std::unique_ptr<IPCMessage>& message);
 
-  // Process cancelling a BLS decoupled request
-  void ProcessCancelBLSDecoupledRequest(
-      const std::unique_ptr<IPCMessage>& message);
+  // Process cancelling a BLS request
+  void ProcessCancelBLSRequest(const std::unique_ptr<IPCMessage>& message);
 
   // Process request cancellation query
   void ProcessIsRequestCancelled(const std::unique_ptr<IPCMessage>& message);

src/request_executor.cc

Lines changed: 25 additions & 37 deletions
@@ -69,12 +69,15 @@ InferRequestComplete(
     TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
 {
   if (request != nullptr) {
-    auto request_executor = reinterpret_cast<RequestExecutor*>(userp);
-    request_executor->EraseRequestAddress(reinterpret_cast<intptr_t>(request));
+    RequestCompletionUserp* completion_userp =
+        reinterpret_cast<RequestCompletionUserp*>(userp);
+    completion_userp->infer_payload->SetRequestDeleted();
 
     LOG_IF_ERROR(
         TRITONSERVER_InferenceRequestDelete(request),
         "Failed to delete inference request.");
+
+    delete completion_userp;
   }
 }
 
@@ -322,6 +325,18 @@ ResponseAlloc(
   return nullptr; // Success
 }
 
+void
+InferRequestCancel(intptr_t request_address)
+{
+  if (request_address == 0L) {
+    return;
+  }
+
+  TRITONSERVER_InferenceRequest* irequest =
+      reinterpret_cast<TRITONSERVER_InferenceRequest*>(request_address);
+  THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestCancel(irequest));
+}
+
 TRITONSERVER_Error*
 OutputBufferQuery(
     TRITONSERVER_ResponseAllocator* allocator, void* userp,
@@ -364,6 +379,7 @@ RequestExecutor::Infer(
   bool is_ready = false;
   const char* model_name = infer_request->ModelName().c_str();
   TRITONSERVER_InferenceRequest* irequest = nullptr;
+  RequestCompletionUserp* completion_userp = nullptr;
 
   try {
     int64_t model_version = infer_request->ModelVersion();
@@ -415,8 +431,10 @@ RequestExecutor::Infer(
     THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetTimeoutMicroseconds(
         irequest, infer_request->Timeout()));
 
+    completion_userp = new RequestCompletionUserp(infer_payload);
     THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback(
-        irequest, InferRequestComplete, reinterpret_cast<void*>(this)));
+        irequest, InferRequestComplete,
+        reinterpret_cast<void*>(completion_userp)));
 
     TRITONSERVER_InferenceTrace* trace = nullptr;
     if (infer_request->GetTrace().TritonTrace() != nullptr) {
@@ -485,22 +503,20 @@ RequestExecutor::Infer(
           reinterpret_cast<void*>(infer_payload->ResponseAllocUserp().get()),
           InferResponseComplete, reinterpret_cast<void*>(infer_payload.get())));
 
-      {
-        std::lock_guard<std::mutex> lk(on_going_request_addresses_mu_);
-        on_going_request_addresses_.insert(
-            reinterpret_cast<intptr_t>(irequest));
-      }
      // Store the inference request address submitted to the Triton server for
      // retrieval
      infer_payload->SetRequestAddress(reinterpret_cast<intptr_t>(irequest));
+      infer_payload->SetRequestCancellationFunc(InferRequestCancel);
 
      THROW_IF_TRITON_ERROR(
          TRITONSERVER_ServerInferAsync(server_, irequest, trace));
    }
  }
  catch (const PythonBackendException& pb_exception) {
-    EraseRequestAddress(reinterpret_cast<intptr_t>(irequest));
    infer_payload->SetRequestAddress(0L);
+    if (completion_userp != nullptr) {
+      delete completion_userp;
+    }
 
    LOG_IF_ERROR(
        TRITONSERVER_InferenceRequestDelete(irequest),
@@ -514,34 +530,6 @@ RequestExecutor::Infer(
   return response_future;
 }
 
-void
-RequestExecutor::Cancel(std::shared_ptr<InferPayload>& infer_payload)
-{
-  intptr_t request_address = infer_payload->GetRequestAddress();
-  if (request_address == 0L) {
-    return;
-  }
-
-  {
-    std::lock_guard<std::mutex> lk(on_going_request_addresses_mu_);
-    if (on_going_request_addresses_.find(request_address) !=
-        on_going_request_addresses_.end()) {
-      TRITONSERVER_InferenceRequest* irequest =
-          reinterpret_cast<TRITONSERVER_InferenceRequest*>(request_address);
-      THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestCancel(irequest));
-    }
-  }
-}
-
-void
-RequestExecutor::EraseRequestAddress(intptr_t request_address)
-{
-  if (request_address != 0L) {
-    std::unique_lock<std::mutex> lk(on_going_request_addresses_mu_);
-    on_going_request_addresses_.erase(request_address);
-  }
-}
-
 RequestExecutor::~RequestExecutor()
 {
   if (response_allocator_ != nullptr) {
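
This change removes the executor-wide `on_going_request_addresses_` set: each request's release callback now receives a heap-allocated `RequestCompletionUserp` that owns a `shared_ptr` to its `InferPayload`, marks the payload deleted, and is then freed (the error path in `Infer()` deletes it as well). Below is a compact sketch of that userp lifetime pattern against a mocked C-style completion callback; `CompletionUserp`, `MockRequestComplete`, and `run_request` are illustrative names, not Triton APIs.

```cpp
#include <iostream>
#include <memory>

struct Payload {
  bool deleted = false;
  void SetRequestDeleted() { deleted = true; }
};

// Mirrors RequestCompletionUserp: a small heap object that keeps the payload
// alive for the duration of the request via a shared_ptr.
struct CompletionUserp {
  std::shared_ptr<Payload> payload;
  explicit CompletionUserp(const std::shared_ptr<Payload>& p) : payload(p) {}
};

// Stand-in for a C-style release callback that only receives a void* userp.
void MockRequestComplete(void* userp)
{
  auto* completion = static_cast<CompletionUserp*>(userp);
  completion->payload->SetRequestDeleted();  // mark before freeing the request
  delete completion;                         // the userp is freed exactly once
}

void run_request(const std::shared_ptr<Payload>& payload)
{
  auto* userp = new CompletionUserp(payload);
  // ... the request would be submitted here; on completion (or on an error
  // path, as in Infer()) whoever owns the userp last must delete it.
  MockRequestComplete(userp);
}

int main()
{
  auto payload = std::make_shared<Payload>();
  run_request(payload);
  std::cout << "deleted = " << std::boolalpha << payload->deleted << "\n";
  return 0;
}
```

Because the userp holds its own `shared_ptr`, the payload remains valid inside the callback even if every other owner has already dropped its reference.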

src/request_executor.h

Lines changed: 6 additions & 4 deletions
@@ -38,19 +38,21 @@ namespace triton { namespace backend { namespace python {
 TRITONSERVER_Error* CreateTritonErrorFromException(
     const PythonBackendException& pb_exception);
 
+struct RequestCompletionUserp {
+  std::shared_ptr<InferPayload> infer_payload;
+  RequestCompletionUserp(std::shared_ptr<InferPayload>& infer_payload)
+      : infer_payload(infer_payload){};
+};
+
 class RequestExecutor {
   TRITONSERVER_ResponseAllocator* response_allocator_ = nullptr;
   TRITONSERVER_Server* server_;
   std::unique_ptr<SharedMemoryManager>& shm_pool_;
-  std::mutex on_going_request_addresses_mu_;
-  std::unordered_set<intptr_t> on_going_request_addresses_;
 
  public:
   std::future<std::unique_ptr<InferResponse>> Infer(
       std::shared_ptr<InferRequest>& infer_request,
       std::shared_ptr<InferPayload>& infer_payload);
-  void EraseRequestAddress(intptr_t request_address);
-  void Cancel(std::shared_ptr<InferPayload>& infer_payload);
 
   RequestExecutor(
       std::unique_ptr<SharedMemoryManager>& shm_pool,
