Skip to content

Commit 47ca40a

Browse files
committed
Gate no-HEAD download path on response_checksum_validation=when_required
Address reviewer feedback (SDKs durability team): HEAD requests are required by default for full-object checksum validation on downloads. Only skip the HEAD when the caller has already opted out of response checksum validation via botocore's `response_checksum_validation = when_required` config. The prior HEAD + size-based GET logic is restored as the default path. This conditional is temporary; it can be removed once S3 supports returning checksums for ranged GETs (tracking: P320750170). Also: - Add TestDownloadWhenRequiredChecksumValidation covering the HEAD-less path for non-empty and 0-byte objects (InvalidRange handling) - Fix ruff import ordering / formatting in time-batch-download.py flagged by the Lint CI action
1 parent 9fee33f commit 47ca40a

5 files changed

Lines changed: 320 additions & 171 deletions

File tree

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"type": "feature",
3+
"category": "download",
4+
"description": "Skip the HEAD request during downloads when the client is configured with ``response_checksum_validation='when_required'``, reducing latency for small-object transfers. The HEAD request remains in place by default to enable full-object checksum validation until S3 supports returning checksums for ranged GETs."
5+
}

s3transfer/download.py

Lines changed: 155 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -356,15 +356,162 @@ def _submit(
356356
transfer_future, osutil
357357
)(osutil, self._transfer_coordinator, io_executor)
358358

359-
self._submit_first_chunk_request(
360-
client,
361-
config,
362-
osutil,
359+
# Skip the HEAD request only when the caller has explicitly opted out
360+
# of response checksum validation. Otherwise we need the HEAD to
361+
# obtain the full-object ETag/size for checksum validation. This
362+
# conditional can be removed once S3 supports returning checksums
363+
# for ranged GETs (tracking: P320750170).
364+
if client.meta.config.response_checksum_validation == "when_required":
365+
self._submit_first_chunk_request(
366+
client,
367+
config,
368+
osutil,
369+
request_executor,
370+
io_executor,
371+
download_output_manager,
372+
transfer_future,
373+
bandwidth_limiter,
374+
)
375+
return
376+
377+
if (
378+
transfer_future.meta.size is None
379+
or transfer_future.meta.etag is None
380+
):
381+
response = client.head_object(
382+
Bucket=transfer_future.meta.call_args.bucket,
383+
Key=transfer_future.meta.call_args.key,
384+
**transfer_future.meta.call_args.extra_args,
385+
)
386+
transfer_future.meta.provide_transfer_size(
387+
response['ContentLength']
388+
)
389+
transfer_future.meta.provide_object_etag(response.get('ETag'))
390+
391+
if transfer_future.meta.size < config.multipart_threshold:
392+
self._submit_download_request(
393+
client,
394+
config,
395+
osutil,
396+
request_executor,
397+
io_executor,
398+
download_output_manager,
399+
transfer_future,
400+
bandwidth_limiter,
401+
)
402+
else:
403+
self._submit_ranged_download_request(
404+
client,
405+
config,
406+
osutil,
407+
request_executor,
408+
io_executor,
409+
download_output_manager,
410+
transfer_future,
411+
bandwidth_limiter,
412+
)
413+
414+
def _submit_download_request(
415+
self,
416+
client,
417+
config,
418+
osutil,
419+
request_executor,
420+
io_executor,
421+
download_output_manager,
422+
transfer_future,
423+
bandwidth_limiter,
424+
):
425+
call_args = transfer_future.meta.call_args
426+
fileobj = download_output_manager.get_fileobj_for_io_writes(
427+
transfer_future
428+
)
429+
progress_callbacks = get_callbacks(transfer_future, 'progress')
430+
get_object_tag = download_output_manager.get_download_task_tag()
431+
final_task = download_output_manager.get_final_io_task()
432+
self._transfer_coordinator.submit(
363433
request_executor,
364-
io_executor,
365-
download_output_manager,
366-
transfer_future,
367-
bandwidth_limiter,
434+
ImmediatelyWriteIOGetObjectTask(
435+
transfer_coordinator=self._transfer_coordinator,
436+
main_kwargs={
437+
'client': client,
438+
'bucket': call_args.bucket,
439+
'key': call_args.key,
440+
'fileobj': fileobj,
441+
'extra_args': call_args.extra_args,
442+
'callbacks': progress_callbacks,
443+
'max_attempts': config.num_download_attempts,
444+
'download_output_manager': download_output_manager,
445+
'io_chunksize': config.io_chunksize,
446+
'bandwidth_limiter': bandwidth_limiter,
447+
},
448+
done_callbacks=[final_task],
449+
),
450+
tag=get_object_tag,
451+
)
452+
453+
def _submit_ranged_download_request(
454+
self,
455+
client,
456+
config,
457+
osutil,
458+
request_executor,
459+
io_executor,
460+
download_output_manager,
461+
transfer_future,
462+
bandwidth_limiter,
463+
):
464+
call_args = transfer_future.meta.call_args
465+
progress_callbacks = get_callbacks(transfer_future, 'progress')
466+
fileobj = download_output_manager.get_fileobj_for_io_writes(
467+
transfer_future
468+
)
469+
part_size = config.multipart_chunksize
470+
num_parts = calculate_num_parts(transfer_future.meta.size, part_size)
471+
get_object_tag = download_output_manager.get_download_task_tag()
472+
finalize_download_invoker = CountCallbackInvoker(
473+
self._get_final_io_task_submission_callback(
474+
download_output_manager, io_executor
475+
)
476+
)
477+
for i in range(num_parts):
478+
range_parameter = calculate_range_parameter(
479+
part_size, i, num_parts
480+
)
481+
extra_args = {'Range': range_parameter}
482+
if transfer_future.meta.etag is not None:
483+
extra_args['IfMatch'] = transfer_future.meta.etag
484+
extra_args.update(call_args.extra_args)
485+
finalize_download_invoker.increment()
486+
self._transfer_coordinator.submit(
487+
request_executor,
488+
GetObjectTask(
489+
transfer_coordinator=self._transfer_coordinator,
490+
main_kwargs={
491+
'client': client,
492+
'bucket': call_args.bucket,
493+
'key': call_args.key,
494+
'fileobj': fileobj,
495+
'extra_args': extra_args,
496+
'callbacks': progress_callbacks,
497+
'max_attempts': config.num_download_attempts,
498+
'start_index': i * part_size,
499+
'download_output_manager': download_output_manager,
500+
'io_chunksize': config.io_chunksize,
501+
'bandwidth_limiter': bandwidth_limiter,
502+
},
503+
done_callbacks=[finalize_download_invoker.decrement],
504+
),
505+
tag=get_object_tag,
506+
)
507+
finalize_download_invoker.finalize()
508+
509+
def _get_final_io_task_submission_callback(
510+
self, download_manager, io_executor
511+
):
512+
final_task = download_manager.get_final_io_task()
513+
return FunctionContainer(
514+
self._transfer_coordinator.submit, io_executor, final_task
368515
)
369516

370517
def _submit_first_chunk_request(

scripts/performance/time-batch-download.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
"""Direct timing of batch downloads without shell wrapper."""
33

44
import argparse
5-
import tempfile
65
import shutil
6+
import tempfile
77
import time
8+
89
from botocore.session import get_session
10+
911
from s3transfer.manager import TransferManager
1012

1113

@@ -20,13 +22,13 @@ def main():
2022
parser.add_argument('--file-size', type=int, required=True)
2123
parser.add_argument('--s3-bucket', required=True)
2224
args = parser.parse_args()
23-
25+
2426
session = get_session()
2527
client = session.create_client('s3')
26-
28+
2729
tempdir = tempfile.mkdtemp()
2830
s3_keys = []
29-
31+
3032
try:
3133
# Upload files
3234
print(f"Uploading {args.file_count} files...")
@@ -37,7 +39,7 @@ def main():
3739
s3_key = f"perf_test_{i}"
3840
manager.upload(file_path, args.s3_bucket, s3_key)
3941
s3_keys.append(s3_key)
40-
42+
4143
# Download files
4244
print(f"Downloading {args.file_count} files...")
4345
start_time = time.time()
@@ -46,9 +48,9 @@ def main():
4648
download_path = f"{tempdir}/download_{i}"
4749
manager.download(args.s3_bucket, s3_key, download_path)
4850
duration = time.time() - start_time
49-
51+
5052
print(f"Download duration: {duration:.2f} seconds")
51-
53+
5254
# Cleanup
5355
for s3_key in s3_keys:
5456
client.delete_object(Bucket=args.s3_bucket, Key=s3_key)

0 commit comments

Comments
 (0)