@@ -15,6 +15,7 @@ namespace Microsoft.Azure.Cosmos
1515 using System . Threading ;
1616 using System . Threading . Tasks ;
1717 using global ::Azure . Core ;
18+ using Microsoft . Azure . Cosmos . Resource . CosmosExceptions ;
1819 using Microsoft . Azure . Documents ;
1920 using Microsoft . Azure . Documents . Collections ;
2021
@@ -32,9 +33,16 @@ internal class InferenceService : IDisposable
3233 private const string InferenceTokenPrefix = "Bearer " ;
3334 private const int inferenceServiceDefaultMaxConnectionLimit = 50 ;
3435
36+ /// <summary>
37+ /// Default per-request timeout (5 seconds) for inference requests. Assigned directly by the
38+ /// <see cref="HttpMessageHandler"/>-based constructor; referenced by <see cref="CosmosClientOptions.InferenceRequestTimeout"/>.
39+ /// </summary>
40+ internal static readonly TimeSpan DefaultInferenceRequestTimeout = TimeSpan . FromSeconds ( 5 ) ;
41+
3542 private readonly int inferenceServiceMaxConnectionLimit ;
3643 private readonly string inferenceServiceBaseUrl ;
3744 private readonly Uri inferenceEndpoint ;
45+ private readonly TimeSpan inferenceRequestTimeout ;
3846
3947 private HttpClient httpClient ;
4048 private AuthorizationTokenProvider cosmosAuthorization ;
@@ -59,6 +67,9 @@ public InferenceService(CosmosClient client)
5967 "AZURE_COSMOS_SEMANTIC_RERANKER_INFERENCE_SERVICE_MAX_CONNECTION_LIMIT" ,
6068 inferenceServiceDefaultMaxConnectionLimit ) ?? inferenceServiceDefaultMaxConnectionLimit ;
6169
70+ Debug . Assert ( client . ClientOptions != null , "ClientOptions should not be null" ) ;
71+ this . inferenceRequestTimeout = client . ClientOptions . InferenceRequestTimeout ;
72+
6273 // Create and configure HttpClient for inference requests.
6374 HttpMessageHandler httpMessageHandler = CosmosHttpClientCore . CreateHttpClientHandler (
6475 gatewayModeMaxConnectionLimit : this . inferenceServiceMaxConnectionLimit ,
@@ -95,6 +106,7 @@ public InferenceService(CosmosClient client)
95106 /// </summary>
96107 internal InferenceService ( HttpMessageHandler messageHandler , Uri inferenceEndpoint , AuthorizationTokenProvider cosmosAuthorization )
97108 {
109+ this . inferenceRequestTimeout = InferenceService . DefaultInferenceRequestTimeout ;
98110 this . httpClient = new HttpClient ( messageHandler ) ;
99111 this . CreateClientHelper ( this . httpClient ) ;
100112 this . inferenceEndpoint = inferenceEndpoint ;
@@ -115,6 +127,8 @@ public async Task<SemanticRerankResult> SemanticRerankAsync(
115127 IDictionary < string , object > options = null ,
116128 CancellationToken cancellationToken = default )
117129 {
130+ DateTime startDateTimeUtc = DateTime . UtcNow ;
131+
118132 // Prepare HTTP request for semantic reranking.
119133 HttpRequestMessage message = new HttpRequestMessage ( HttpMethod . Post , this . inferenceEndpoint ) ;
120134 INameValueCollection additionalHeaders = new RequestNameValueCollection ( ) ;
@@ -139,8 +153,29 @@ await this.cosmosAuthorization.AddAuthorizationHeaderAsync(
139153 Encoding . UTF8 ,
140154 RuntimeConstants . MediaTypes . Json ) ;
141155
142- // Send the request and check for success.
143- HttpResponseMessage responseMessage = await this . httpClient . SendAsync ( message , cancellationToken ) ;
156+ // Enforce a single-attempt, no-retry timeout for the inference request via a linked CTS.
157+ // HttpClient.Timeout is intentionally left unchanged, so the linked CTS is the effective per-request timeout
158+ // only while it is shorter than HttpClient.Timeout. NOTE(review): confirm the configured value cannot exceed it.
159+ using CancellationTokenSource linkedCts = CancellationTokenSource . CreateLinkedTokenSource ( cancellationToken ) ;
160+ linkedCts . CancelAfter ( this . inferenceRequestTimeout ) ;
161+
162+ HttpResponseMessage responseMessage ;
163+ try
164+ {
165+ responseMessage = await this . httpClient . SendAsync ( message , linkedCts . Token ) ;
166+ }
167+ catch ( OperationCanceledException operationCanceledException ) when ( ! cancellationToken . IsCancellationRequested )
168+ {
169+ // Timeout triggered by the linked CTS (not the caller's cancellationToken).
170+ string errorMessage = $ "Inference Service Request Timeout. Start Time UTC:{ startDateTimeUtc } ; Total Duration:{ ( DateTime . UtcNow - startDateTimeUtc ) . TotalMilliseconds } Ms; Inference Request Timeout:{ this . inferenceRequestTimeout . TotalMilliseconds } Ms; Activity id: { System . Diagnostics . Trace . CorrelationManager . ActivityId } ;";
171+ throw CosmosExceptionFactory . CreateRequestTimeoutException (
172+ message : errorMessage ,
173+ headers : new Headers ( )
174+ {
175+ ActivityId = System . Diagnostics . Trace . CorrelationManager . ActivityId . ToString ( )
176+ } ,
177+ innerException : operationCanceledException ) ;
178+ }
144179
145180 if ( ! responseMessage . IsSuccessStatusCode )
146181 {
0 commit comments