-
Notifications
You must be signed in to change notification settings - Fork 11
fix(csharp): add configurable timeout for CloudFetch body reads #374
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
48a88ec
84435a6
f32bfaf
741430f
6748d23
18b96b3
052d9a5
890237f
4fcead3
057aa11
9764764
4efc5d8
a964fff
b20b38d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -55,6 +55,7 @@ internal sealed class CloudFetchDownloader : ICloudFetchDownloader | |||||||||||||||
| private readonly int _retryDelayMs; | ||||||||||||||||
| private readonly int _maxUrlRefreshAttempts; | ||||||||||||||||
| private readonly int _urlExpirationBufferSeconds; | ||||||||||||||||
| private readonly int _timeoutMinutes; | ||||||||||||||||
| private readonly SemaphoreSlim _downloadSemaphore; | ||||||||||||||||
| private readonly RecyclableMemoryStreamManager? _memoryStreamManager; | ||||||||||||||||
| private readonly ArrayPool<byte>? _lz4BufferPool; | ||||||||||||||||
|
|
@@ -99,6 +100,7 @@ public CloudFetchDownloader( | |||||||||||||||
| _retryDelayMs = config.RetryDelayMs; | ||||||||||||||||
| _maxUrlRefreshAttempts = config.MaxUrlRefreshAttempts; | ||||||||||||||||
| _urlExpirationBufferSeconds = config.UrlExpirationBufferSeconds; | ||||||||||||||||
| _timeoutMinutes = config.TimeoutMinutes; | ||||||||||||||||
| _memoryStreamManager = config.MemoryStreamManager; | ||||||||||||||||
| _lz4BufferPool = config.Lz4BufferPool; | ||||||||||||||||
| _downloadSemaphore = new SemaphoreSlim(_maxParallelDownloads, _maxParallelDownloads); | ||||||||||||||||
|
|
@@ -146,6 +148,7 @@ internal CloudFetchDownloader( | |||||||||||||||
| _retryDelayMs = retryDelayMs; | ||||||||||||||||
| _maxUrlRefreshAttempts = CloudFetchConfiguration.DefaultMaxUrlRefreshAttempts; | ||||||||||||||||
| _urlExpirationBufferSeconds = CloudFetchConfiguration.DefaultUrlExpirationBufferSeconds; | ||||||||||||||||
| _timeoutMinutes = CloudFetchConfiguration.DefaultTimeoutMinutes; | ||||||||||||||||
| _memoryStreamManager = null; | ||||||||||||||||
| _lz4BufferPool = null; | ||||||||||||||||
| _downloadSemaphore = new SemaphoreSlim(_maxParallelDownloads, _maxParallelDownloads); | ||||||||||||||||
|
|
@@ -573,7 +576,7 @@ await _activityTracer.TraceActivityAsync(async activity => | |||||||||||||||
|
|
||||||||||||||||
| response.EnsureSuccessStatusCode(); | ||||||||||||||||
|
|
||||||||||||||||
| // Log the download size if available from response headers | ||||||||||||||||
| // Log the download size from response headers | ||||||||||||||||
| long? contentLength = response.Content.Headers.ContentLength; | ||||||||||||||||
| if (contentLength.HasValue && contentLength.Value > 0) | ||||||||||||||||
| { | ||||||||||||||||
|
|
@@ -584,9 +587,38 @@ await _activityTracer.TraceActivityAsync(async activity => | |||||||||||||||
| new("content_length_mb", contentLength.Value / 1024.0 / 1024.0) | ||||||||||||||||
| ]); | ||||||||||||||||
| } | ||||||||||||||||
| else | ||||||||||||||||
| { | ||||||||||||||||
| activity?.AddEvent("cloudfetch.content_length_missing", [ | ||||||||||||||||
| new("offset", downloadResult.StartRowOffset), | ||||||||||||||||
| new("sanitized_url", sanitizedUrl) | ||||||||||||||||
| ]); | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| // Read the file data | ||||||||||||||||
| fileData = await response.Content.ReadAsByteArrayAsync().ConfigureAwait(false); | ||||||||||||||||
| // Read the file data with an explicit timeout. | ||||||||||||||||
| // ReadAsByteArrayAsync() on net472 has no CancellationToken overload, | ||||||||||||||||
| // and HttpClient.Timeout does not protect body reads when | ||||||||||||||||
| // HttpCompletionOption.ResponseHeadersRead is used — SendAsync returns | ||||||||||||||||
| // after headers, and the subsequent body read is a separate call on | ||||||||||||||||
| // HttpContent with no timeout coverage. | ||||||||||||||||
| // Using CopyToAsync with an explicit token ensures dead TCP connections | ||||||||||||||||
| // are detected on every 81920-byte chunk read. | ||||||||||||||||
| using (var bodyTimeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken)) | ||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we just reuse the existing cancellationToken and extend that to 15mins?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That cancellation token is global and will cancel all the downloads. We want this to be a per file cancellation token for this use case
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The body read timeout should reuse the same However, bump the default from 5 to 15 minutes. The 5-min 15 min is a safer default for body reads since large CloudFetch files can take longer to transfer, especially on slower networks.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. Bumped The 15-min default is important because we confirmed via testing on .NET Framework 4.7.2 that |
||||||||||||||||
| { | ||||||||||||||||
| bodyTimeoutCts.CancelAfter(TimeSpan.FromMinutes(_timeoutMinutes)); | ||||||||||||||||
| using (var contentStream = await response.Content.ReadAsStreamAsync().ConfigureAwait(false)) | ||||||||||||||||
| { | ||||||||||||||||
| // Pre-allocate with Content-Length when available (CloudFetch always provides it). | ||||||||||||||||
| int capacity = contentLength.HasValue && contentLength.Value > 0 | ||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How often will we not know the Content-Length in advance (i.e. use a chunked transfer)? For Direct Query in the service, I believe there's a commit limit set and I'm not sure what it's set to. I'll try to get an answer on that.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. agree on this, we should be able to get the contentLength from the http response header always
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay, I've looked at four different potential hosts and there's only one case where this value is set low enough for it to be a possible concern, and that's setting the value to 512 MB. Direct Query datasets are capped at one million rows and are generally not huge, but if there are five concurrent downloads of 20 MB chunks then we would blow through that with the preallocations. One possible mitigation would be to use p/Invoke to check for the current commit limit and be more conservative if that limit is set conservatively.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree. There wouldn't be a case without content length. Just added this as a safety net fallback. For the scope of this PR, I can either change it to 0 if content length isn't available which is never the case actually.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that seems like a better option for now. Does this code have access to tracing? If we always expect content-length then it might be worth tracing something when it's not present. |
||||||||||||||||
| ? (int)contentLength.Value | ||||||||||||||||
| : 0; | ||||||||||||||||
| using (var memoryStream = new MemoryStream(capacity)) | ||||||||||||||||
| { | ||||||||||||||||
| await contentStream.CopyToAsync(memoryStream, 81920, bodyTimeoutCts.Token).ConfigureAwait(false); | ||||||||||||||||
| fileData = memoryStream.ToArray(); | ||||||||||||||||
| } | ||||||||||||||||
| } | ||||||||||||||||
| } | ||||||||||||||||
| break; // Success, exit retry loop | ||||||||||||||||
| } | ||||||||||||||||
|
||||||||||||||||
| } | |
| } | |
| catch (OperationCanceledException oce) when (!cancellationToken.IsCancellationRequested) | |
| { | |
| // Treat a timeout of the body read (via the linked CTS) as a fault, not a cancellation. | |
| throw new TimeoutException("Timed out while reading the response body.", oce); | |
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch. Fixed — added catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested) that rethrows as TimeoutException. This ensures the download continuation sees t.IsFaulted (not t.IsCanceled) and properly completes the DownloadCompletedTask.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
PR description mentions a new connection property
adbc.databricks.cloudfetch.body_read_timeout_minutes(default 15), but this change wires the body-read timeout to the existingCloudFetchConfiguration.TimeoutMinutes(default 5) via_timeoutMinutes = config.TimeoutMinutes. Either the implementation should use the new dedicated property (and its intended default), or the PR description/commentary should be updated to reflect that the existingtimeout_minutessetting controls body reads as well.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed — removed the separate
body_read_timeout_minutesparameter. Now usingtimeout_minutes(default bumped to 15 min) for bothHttpClient.Timeoutand body readCancelAfter.