Skip to content

Commit 9d25e90

Browse files
committed
Gate no-HEAD download path on response_checksum_validation=when_required
Address reviewer feedback that the HEAD request is needed by default to enable full-object checksum validation on downloads. Only skip the HEAD when the caller has opted out via botocore's `response_checksum_validation = when_required` config. The prior HEAD + size-based GET logic is restored as the default path. Also: - Add TestDownloadWhenRequiredChecksumValidation covering the HEAD-less path for non-empty and 0-byte objects (InvalidRange handling) - Fix ruff import ordering / formatting in time-batch-download.py flagged by the Lint CI action
1 parent 9fee33f commit 9d25e90

6 files changed

Lines changed: 568 additions & 171 deletions

File tree

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"type": "feature",
3+
"category": "download",
4+
"description": "Skip the HEAD request during downloads when the client is configured with ``response_checksum_validation='when_required'``, reducing latency for small-object transfers. The HEAD request remains in place by default to enable full-object checksum validation until S3 supports returning checksums for ranged GETs."
5+
}

s3transfer/download.py

Lines changed: 198 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -356,17 +356,207 @@ def _submit(
356356
transfer_future, osutil
357357
)(osutil, self._transfer_coordinator, io_executor)
358358

359-
self._submit_first_chunk_request(
360-
client,
361-
config,
362-
osutil,
359+
# Skip the HEAD request only when the caller has explicitly opted out
360+
# of response checksum validation. Otherwise we need the HEAD to
361+
# obtain the full-object ETag/size for checksum validation.
362+
if client.meta.config.response_checksum_validation == "when_required":
363+
self._submit_first_chunk_request(
364+
client,
365+
config,
366+
osutil,
367+
request_executor,
368+
io_executor,
369+
download_output_manager,
370+
transfer_future,
371+
bandwidth_limiter,
372+
)
373+
return
374+
375+
if (
376+
transfer_future.meta.size is None
377+
or transfer_future.meta.etag is None
378+
):
379+
response = client.head_object(
380+
Bucket=transfer_future.meta.call_args.bucket,
381+
Key=transfer_future.meta.call_args.key,
382+
**transfer_future.meta.call_args.extra_args,
383+
)
384+
# If a size was not provided figure out the size for the
385+
# user.
386+
transfer_future.meta.provide_transfer_size(
387+
response['ContentLength']
388+
)
389+
# Provide an etag to ensure a stored object is not modified
390+
# during a multipart download.
391+
transfer_future.meta.provide_object_etag(response.get('ETag'))
392+
393+
# If it is greater than threshold do a ranged download, otherwise
394+
# do a regular GetObject download.
395+
if transfer_future.meta.size < config.multipart_threshold:
396+
self._submit_download_request(
397+
client,
398+
config,
399+
osutil,
400+
request_executor,
401+
io_executor,
402+
download_output_manager,
403+
transfer_future,
404+
bandwidth_limiter,
405+
)
406+
else:
407+
self._submit_ranged_download_request(
408+
client,
409+
config,
410+
osutil,
411+
request_executor,
412+
io_executor,
413+
download_output_manager,
414+
transfer_future,
415+
bandwidth_limiter,
416+
)
417+
418+
def _submit_download_request(
419+
self,
420+
client,
421+
config,
422+
osutil,
423+
request_executor,
424+
io_executor,
425+
download_output_manager,
426+
transfer_future,
427+
bandwidth_limiter,
428+
):
429+
call_args = transfer_future.meta.call_args
430+
431+
# Get a handle to the file that will be used for writing downloaded
432+
# contents
433+
fileobj = download_output_manager.get_fileobj_for_io_writes(
434+
transfer_future
435+
)
436+
437+
# Get the needed callbacks for the task
438+
progress_callbacks = get_callbacks(transfer_future, 'progress')
439+
440+
# Get any associated tags for the get object task.
441+
get_object_tag = download_output_manager.get_download_task_tag()
442+
443+
# Get the final io task to run once the download is complete.
444+
final_task = download_output_manager.get_final_io_task()
445+
446+
# Submit the task to download the object.
447+
self._transfer_coordinator.submit(
363448
request_executor,
364-
io_executor,
365-
download_output_manager,
366-
transfer_future,
367-
bandwidth_limiter,
449+
ImmediatelyWriteIOGetObjectTask(
450+
transfer_coordinator=self._transfer_coordinator,
451+
main_kwargs={
452+
'client': client,
453+
'bucket': call_args.bucket,
454+
'key': call_args.key,
455+
'fileobj': fileobj,
456+
'extra_args': call_args.extra_args,
457+
'callbacks': progress_callbacks,
458+
'max_attempts': config.num_download_attempts,
459+
'download_output_manager': download_output_manager,
460+
'io_chunksize': config.io_chunksize,
461+
'bandwidth_limiter': bandwidth_limiter,
462+
},
463+
done_callbacks=[final_task],
464+
),
465+
tag=get_object_tag,
368466
)
369467

468+
def _submit_ranged_download_request(
469+
self,
470+
client,
471+
config,
472+
osutil,
473+
request_executor,
474+
io_executor,
475+
download_output_manager,
476+
transfer_future,
477+
bandwidth_limiter,
478+
):
479+
call_args = transfer_future.meta.call_args
480+
481+
# Get the needed progress callbacks for the task
482+
progress_callbacks = get_callbacks(transfer_future, 'progress')
483+
484+
# Get a handle to the file that will be used for writing downloaded
485+
# contents
486+
fileobj = download_output_manager.get_fileobj_for_io_writes(
487+
transfer_future
488+
)
489+
490+
# Determine the number of parts
491+
part_size = config.multipart_chunksize
492+
num_parts = calculate_num_parts(transfer_future.meta.size, part_size)
493+
494+
# Get any associated tags for the get object task.
495+
get_object_tag = download_output_manager.get_download_task_tag()
496+
497+
# Callback invoker to submit the final io task once all downloads
498+
# are complete.
499+
finalize_download_invoker = CountCallbackInvoker(
500+
self._get_final_io_task_submission_callback(
501+
download_output_manager, io_executor
502+
)
503+
)
504+
for i in range(num_parts):
505+
# Calculate the range parameter
506+
range_parameter = calculate_range_parameter(
507+
part_size, i, num_parts
508+
)
509+
510+
# Inject extra parameters to be passed in as extra args
511+
extra_args = {
512+
'Range': range_parameter,
513+
}
514+
if transfer_future.meta.etag is not None:
515+
extra_args['IfMatch'] = transfer_future.meta.etag
516+
extra_args.update(call_args.extra_args)
517+
finalize_download_invoker.increment()
518+
# Submit the ranged downloads
519+
self._transfer_coordinator.submit(
520+
request_executor,
521+
GetObjectTask(
522+
transfer_coordinator=self._transfer_coordinator,
523+
main_kwargs={
524+
'client': client,
525+
'bucket': call_args.bucket,
526+
'key': call_args.key,
527+
'fileobj': fileobj,
528+
'extra_args': extra_args,
529+
'callbacks': progress_callbacks,
530+
'max_attempts': config.num_download_attempts,
531+
'start_index': i * part_size,
532+
'download_output_manager': download_output_manager,
533+
'io_chunksize': config.io_chunksize,
534+
'bandwidth_limiter': bandwidth_limiter,
535+
},
536+
done_callbacks=[finalize_download_invoker.decrement],
537+
),
538+
tag=get_object_tag,
539+
)
540+
finalize_download_invoker.finalize()
541+
542+
def _get_final_io_task_submission_callback(
543+
self, download_manager, io_executor
544+
):
545+
final_task = download_manager.get_final_io_task()
546+
return FunctionContainer(
547+
self._transfer_coordinator.submit, io_executor, final_task
548+
)
549+
550+
def _calculate_range_param(self, part_size, part_index, num_parts):
551+
# Used to calculate the Range parameter
552+
start_range = part_index * part_size
553+
if part_index == num_parts - 1:
554+
end_range = ''
555+
else:
556+
end_range = start_range + part_size - 1
557+
range_param = f'bytes={start_range}-{end_range}'
558+
return range_param
559+
370560
def _submit_first_chunk_request(
371561
self,
372562
client,

scripts/performance/time-batch-download.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
"""Direct timing of batch downloads without shell wrapper."""
33

44
import argparse
5-
import tempfile
65
import shutil
6+
import tempfile
77
import time
8+
89
from botocore.session import get_session
10+
911
from s3transfer.manager import TransferManager
1012

1113

@@ -20,13 +22,13 @@ def main():
2022
parser.add_argument('--file-size', type=int, required=True)
2123
parser.add_argument('--s3-bucket', required=True)
2224
args = parser.parse_args()
23-
25+
2426
session = get_session()
2527
client = session.create_client('s3')
26-
28+
2729
tempdir = tempfile.mkdtemp()
2830
s3_keys = []
29-
31+
3032
try:
3133
# Upload files
3234
print(f"Uploading {args.file_count} files...")
@@ -37,7 +39,7 @@ def main():
3739
s3_key = f"perf_test_{i}"
3840
manager.upload(file_path, args.s3_bucket, s3_key)
3941
s3_keys.append(s3_key)
40-
42+
4143
# Download files
4244
print(f"Downloading {args.file_count} files...")
4345
start_time = time.time()
@@ -46,9 +48,9 @@ def main():
4648
download_path = f"{tempdir}/download_{i}"
4749
manager.download(args.s3_bucket, s3_key, download_path)
4850
duration = time.time() - start_time
49-
51+
5052
print(f"Download duration: {duration:.2f} seconds")
51-
53+
5254
# Cleanup
5355
for s3_key in s3_keys:
5456
client.delete_object(Bucket=args.s3_bucket, Key=s3_key)

0 commit comments

Comments
 (0)