Skip to content

Commit 6fb04d0

Browse files
authored
Reintroduce case-sensitivity handling to multi-object download operations (#9969)
1 parent 743fd04 commit 6fb04d0

File tree

20 files changed

+877
-13
lines changed

20 files changed

+877
-13
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"type": "enhancement",
3+
"category": "``s3``",
4+
"description": "Adds new parameter ``--case-conflict`` that configures how case conflicts are handled on case-insensitive filesystems"
5+
}

awscli/customizations/s3/filegenerator.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ def __init__(self, directory, filename):
9494
class FileStat(object):
9595
def __init__(self, src, dest=None, compare_key=None, size=None,
9696
last_update=None, src_type=None, dest_type=None,
97-
operation_name=None, response_data=None, etag=None):
97+
operation_name=None, response_data=None, etag=None,
98+
case_conflict_submitted=None, case_conflict_key=None,):
9899
self.src = src
99100
self.dest = dest
100101
self.compare_key = compare_key
@@ -105,6 +106,8 @@ def __init__(self, src, dest=None, compare_key=None, size=None,
105106
self.operation_name = operation_name
106107
self.response_data = response_data
107108
self.etag = etag
109+
self.case_conflict_submitted = case_conflict_submitted
110+
self.case_conflict_key = case_conflict_key
108111

109112

110113
class FileGenerator(object):

awscli/customizations/s3/fileinfo.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@ def __init__(self, src, dest=None, compare_key=None, size=None,
4242
last_update=None, src_type=None, dest_type=None,
4343
operation_name=None, client=None, parameters=None,
4444
source_client=None, is_stream=False,
45-
associated_response_data=None, etag=None):
45+
associated_response_data=None, etag=None,
46+
case_conflict_submitted=None, case_conflict_key=None,):
4647
self.src = src
4748
self.src_type = src_type
4849
self.operation_name = operation_name
@@ -60,6 +61,8 @@ def __init__(self, src, dest=None, compare_key=None, size=None,
6061
self.is_stream = is_stream
6162
self.associated_response_data = associated_response_data
6263
self.etag = etag
64+
self.case_conflict_submitted = case_conflict_submitted
65+
self.case_conflict_key = case_conflict_key
6366

6467
def is_glacier_compatible(self):
6568
"""Determines if a file info object is glacier compatible

awscli/customizations/s3/fileinfobuilder.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ def _inject_info(self, file_base):
4646
file_info_attr['is_stream'] = self._is_stream
4747
file_info_attr['associated_response_data'] = file_base.response_data
4848
file_info_attr['etag'] = file_base.etag
49+
file_info_attr['case_conflict_submitted'] = getattr(
50+
file_base, 'case_conflict_submitted', None
51+
)
52+
file_info_attr['case_conflict_key'] = getattr(
53+
file_base, 'case_conflict_key', None
54+
)
4955

5056
# This is a bit quirky. The below conditional hinges on the --delete
5157
# flag being set, which only occurs during a sync command. The source

awscli/customizations/s3/s3handler.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from awscli.customizations.s3.utils import DeleteSourceFileSubscriber
4848
from awscli.customizations.s3.utils import DeleteSourceObjectSubscriber
4949
from awscli.customizations.s3.utils import DeleteCopySourceObjectSubscriber
50+
from awscli.customizations.s3.utils import CaseConflictCleanupSubscriber
5051
from awscli.compat import get_binary_stdin
5152

5253

@@ -403,6 +404,13 @@ def _add_additional_subscribers(self, subscribers, fileinfo):
403404
if self._cli_params.get('is_move', False):
404405
subscribers.append(DeleteSourceObjectSubscriber(
405406
fileinfo.source_client))
407+
if fileinfo.case_conflict_submitted is not None:
408+
subscribers.append(
409+
CaseConflictCleanupSubscriber(
410+
fileinfo.case_conflict_submitted,
411+
fileinfo.case_conflict_key,
412+
)
413+
)
406414

407415
def _submit_transfer_request(self, fileinfo, extra_args, subscribers):
408416
bucket, key = find_bucket_key(fileinfo.src)

awscli/customizations/s3/subcommands.py

Lines changed: 124 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@
3434
S3PathResolver
3535
from awscli.customizations.utils import uni_print
3636
from awscli.customizations.s3.syncstrategy.base import MissingFileSync, \
37-
SizeAndLastModifiedSync, NeverSync
37+
SizeAndLastModifiedSync, NeverSync, AlwaysSync
38+
from awscli.customizations.s3.syncstrategy.caseconflict import CaseConflictSync
3839
from awscli.customizations.s3 import transferconfig
3940
from awscli.utils import resolve_v2_debug_mode
4041

@@ -482,6 +483,33 @@
482483
)
483484
}
484485

486+
CASE_CONFLICT = {
487+
'name': 'case-conflict',
488+
'choices': [
489+
'ignore',
490+
'skip',
491+
'warn',
492+
'error',
493+
],
494+
'default': 'ignore',
495+
'help_text': (
496+
"Configures behavior when attempting to download multiple objects "
497+
"whose keys differ only by case, which can cause undefined behavior "
498+
"on case-insensitive filesystems. "
499+
"This parameter only applies for commands that perform multiple S3 "
500+
"to local downloads. "
501+
f"See <a href='{CaseConflictSync.DOC_URI}'>Handling case "
502+
"conflicts</a> for details. Valid values are: "
503+
"<ul>"
504+
"<li>``error`` - Raise an error and abort downloads.</li>"
505+
"<li>``warn`` - Emit a warning and download the object.</li>"
506+
"<li>``skip`` - Skip downloading the object.</li>"
507+
"<li>``ignore`` - The default value. Ignore the conflict and "
508+
"download the object.</li>"
509+
"</ul>"
510+
),
511+
}
512+
485513
TRANSFER_ARGS = [DRYRUN, QUIET, INCLUDE, EXCLUDE, ACL,
486514
FOLLOW_SYMLINKS, NO_FOLLOW_SYMLINKS, NO_GUESS_MIME_TYPE,
487515
SSE, SSE_C, SSE_C_KEY, SSE_KMS_KEY_ID, SSE_C_COPY_SOURCE,
@@ -807,7 +835,8 @@ class CpCommand(S3TransferCommand):
807835
"or <S3Uri> <S3Uri>"
808836
ARG_TABLE = [{'name': 'paths', 'nargs': 2, 'positional_arg': True,
809837
'synopsis': USAGE}] + TRANSFER_ARGS + \
810-
[METADATA, METADATA_DIRECTIVE, EXPECTED_SIZE, RECURSIVE]
838+
[METADATA, METADATA_DIRECTIVE, EXPECTED_SIZE, RECURSIVE,
839+
CASE_CONFLICT]
811840

812841

813842
class MvCommand(S3TransferCommand):
@@ -817,7 +846,8 @@ class MvCommand(S3TransferCommand):
817846
"or <S3Uri> <S3Uri>"
818847
ARG_TABLE = [{'name': 'paths', 'nargs': 2, 'positional_arg': True,
819848
'synopsis': USAGE}] + TRANSFER_ARGS +\
820-
[METADATA, METADATA_DIRECTIVE, RECURSIVE, VALIDATE_SAME_S3_PATHS]
849+
[METADATA, METADATA_DIRECTIVE, RECURSIVE, VALIDATE_SAME_S3_PATHS,
850+
CASE_CONFLICT]
821851

822852

823853
class RmCommand(S3TransferCommand):
@@ -839,7 +869,7 @@ class SyncCommand(S3TransferCommand):
839869
"<LocalPath> or <S3Uri> <S3Uri>"
840870
ARG_TABLE = [{'name': 'paths', 'nargs': 2, 'positional_arg': True,
841871
'synopsis': USAGE}] + TRANSFER_ARGS + \
842-
[METADATA, METADATA_DIRECTIVE]
872+
[METADATA, METADATA_DIRECTIVE, CASE_CONFLICT]
843873

844874

845875
class MbCommand(S3Command):
@@ -1004,7 +1034,16 @@ def choose_sync_strategies(self):
10041034
# Set the default strategies.
10051035
sync_strategies['file_at_src_and_dest_sync_strategy'] = \
10061036
SizeAndLastModifiedSync()
1007-
sync_strategies['file_not_at_dest_sync_strategy'] = MissingFileSync()
1037+
if self._should_handle_case_conflicts():
1038+
sync_strategies['file_not_at_dest_sync_strategy'] = (
1039+
CaseConflictSync(
1040+
on_case_conflict=self.parameters['case_conflict']
1041+
)
1042+
)
1043+
else:
1044+
sync_strategies['file_not_at_dest_sync_strategy'] = (
1045+
MissingFileSync()
1046+
)
10081047
sync_strategies['file_not_at_src_sync_strategy'] = NeverSync()
10091048

10101049
# Determine what strategies to override if any.
@@ -1138,6 +1177,12 @@ def run(self):
11381177
'filters': [create_filter(self.parameters)],
11391178
'file_info_builder': [file_info_builder],
11401179
's3_handler': [s3_transfer_handler]}
1180+
if self._should_handle_case_conflicts():
1181+
self._handle_case_conflicts(
1182+
command_dict,
1183+
rev_files,
1184+
rev_generator,
1185+
)
11411186
elif self.cmd == 'rm':
11421187
command_dict = {'setup': [files],
11431188
'file_generator': [file_generator],
@@ -1150,6 +1195,12 @@ def run(self):
11501195
'filters': [create_filter(self.parameters)],
11511196
'file_info_builder': [file_info_builder],
11521197
's3_handler': [s3_transfer_handler]}
1198+
if self._should_handle_case_conflicts():
1199+
self._handle_case_conflicts(
1200+
command_dict,
1201+
rev_files,
1202+
rev_generator,
1203+
)
11531204

11541205
files = command_dict['setup']
11551206
while self.instructions:
@@ -1215,6 +1266,74 @@ def _map_sse_c_params(self, request_parameters, paths_type):
12151266
}
12161267
)
12171268

1269+
def _should_handle_case_conflicts(self):
1270+
return (
1271+
self.cmd in {'sync', 'cp', 'mv'}
1272+
and self.parameters.get('paths_type') == 's3local'
1273+
and self.parameters['case_conflict'] != 'ignore'
1274+
and self.parameters.get('dir_op')
1275+
)
1276+
1277+
def _handle_case_conflicts(self, command_dict, rev_files, rev_generator):
1278+
# Objects are not returned in lexicographical order when
1279+
# operated on S3 Express directory buckets. This is required
1280+
# for sync operations to behave correctly, which is what
1281+
# recursive copies and moves fall back to so potential case
1282+
# conflicts can be detected and handled.
1283+
if not is_s3express_bucket(
1284+
split_s3_bucket_key(self.parameters['src'])[0]
1285+
):
1286+
self._modify_instructions_for_case_conflicts(
1287+
command_dict, rev_files, rev_generator
1288+
)
1289+
return
1290+
# `skip` and `error` are not valid choices in this case because
1291+
# it's not possible to detect case conflicts.
1292+
if self.parameters['case_conflict'] not in {'ignore', 'warn'}:
1293+
raise ValueError(
1294+
f"`{self.parameters['case_conflict']}` is not a valid value "
1295+
"for `--case-conflict` when operating on S3 Express "
1296+
"directory buckets. Valid values: `warn`, `ignore`."
1297+
)
1298+
msg = (
1299+
"warning: Recursive copies/moves from an S3 Express "
1300+
"directory bucket to a case-insensitive local filesystem "
1301+
"may result in undefined behavior if there are "
1302+
"S3 object key names that differ only by case. To disable "
1303+
"this warning, set the `--case-conflict` parameter to `ignore`. "
1304+
f"For more information, see {CaseConflictSync.DOC_URI}."
1305+
)
1306+
uni_print(msg, sys.stderr)
1307+
1308+
def _modify_instructions_for_case_conflicts(
1309+
self, command_dict, rev_files, rev_generator
1310+
):
1311+
# Command will perform recursive S3 to local downloads.
1312+
# Checking for potential case conflicts requires knowledge
1313+
# of local files. Instead of writing a separate validation
1314+
# mechanism for recursive downloads, we modify the instructions
1315+
# to mimic a sync command.
1316+
sync_strategies = {
1317+
# Local filename exists with exact case match. Always sync
1318+
# because it's a copy operation.
1319+
'file_at_src_and_dest_sync_strategy': AlwaysSync(),
1320+
# Local filename either doesn't exist or differs only by case.
1321+
# Let `CaseConflictSync` determine which it is and handle it
1322+
# according to configured `--case-conflict` parameter.
1323+
'file_not_at_dest_sync_strategy': CaseConflictSync(
1324+
on_case_conflict=self.parameters['case_conflict']
1325+
),
1326+
# Copy is one-way so never sync if not at source.
1327+
'file_not_at_src_sync_strategy': NeverSync(),
1328+
}
1329+
command_dict['setup'].append(rev_files)
1330+
command_dict['file_generator'].append(rev_generator)
1331+
command_dict['filters'].append(create_filter(self.parameters))
1332+
command_dict['comparator'] = [Comparator(**sync_strategies)]
1333+
self.instructions.insert(
1334+
self.instructions.index('file_info_builder'), 'comparator'
1335+
)
1336+
12181337

12191338
class CommandParameters(object):
12201339
"""

awscli/customizations/s3/syncstrategy/base.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,3 +254,12 @@ def determine_should_sync(self, src_file, dest_file):
254254
LOG.debug("syncing: %s -> %s, file does not exist at destination",
255255
src_file.src, src_file.dest)
256256
return True
257+
258+
259+
class AlwaysSync(BaseSync):
260+
def __init__(self, sync_type='file_at_src_and_dest'):
261+
super(AlwaysSync, self).__init__(sync_type)
262+
263+
def determine_should_sync(self, src_file, dest_file):
264+
LOG.debug(f"syncing: {src_file.src} -> {src_file.dest}")
265+
return True
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
import logging
4+
import os
5+
import sys
6+
7+
from awscli.customizations.s3.syncstrategy.base import BaseSync
8+
from awscli.customizations.utils import uni_print
9+
10+
LOG = logging.getLogger(__name__)
11+
12+
13+
class CaseConflictException(Exception):
14+
pass
15+
16+
17+
class CaseConflictSync(BaseSync):
18+
DOC_URI = (
19+
"https://docs.aws.amazon.com/cli/v1/topic/"
20+
"s3-case-insensitivity.html"
21+
)
22+
23+
def __init__(
24+
self,
25+
sync_type='file_not_at_dest',
26+
on_case_conflict='ignore',
27+
submitted=None,
28+
):
29+
super().__init__(sync_type)
30+
self._on_case_conflict = on_case_conflict
31+
if submitted is None:
32+
submitted = set()
33+
self._submitted = submitted
34+
35+
@property
36+
def submitted(self):
37+
return self._submitted
38+
39+
def determine_should_sync(self, src_file, dest_file):
40+
# `src_file.compare_key` and `dest_file.compare_key` are not equal.
41+
# This could mean that they're completely different or differ
42+
# only by case. eg, `/tmp/a` and `/tmp/b` versus `/tmp/a` and `/tmp/A`.
43+
# If the source file's destination already exists, that means it
44+
# differs only by case and the conflict needs to be handled.
45+
should_sync = True
46+
# Normalize compare key for case sensitivity.
47+
lower_compare_key = src_file.compare_key.lower()
48+
if lower_compare_key in self._submitted or os.path.exists(
49+
src_file.dest
50+
):
51+
handler = getattr(self, f"_handle_{self._on_case_conflict}")
52+
should_sync = handler(src_file)
53+
if should_sync:
54+
LOG.debug(f"syncing: {src_file.src} -> {src_file.dest}")
55+
self._submitted.add(lower_compare_key)
56+
# Set properties so that a subscriber can be created
57+
# that removes the key from the set after download finishes.
58+
src_file.case_conflict_submitted = self._submitted
59+
src_file.case_conflict_key = lower_compare_key
60+
return should_sync
61+
62+
@staticmethod
63+
def _handle_ignore(src_file):
64+
return True
65+
66+
@staticmethod
67+
def _handle_skip(src_file):
68+
msg = (
69+
f"warning: Skipping {src_file.src} -> {src_file.dest} "
70+
"because a file whose name differs only by case either exists "
71+
"or is being downloaded.\n"
72+
)
73+
uni_print(msg, sys.stderr)
74+
return False
75+
76+
@staticmethod
77+
def _handle_warn(src_file):
78+
msg = (
79+
f"warning: Downloading {src_file.src} -> {src_file.dest} "
80+
"despite a file whose name differs only by case either existing "
81+
"or being downloaded. This behavior is not defined on "
82+
"case-insensitive filesystems and may result in overwriting "
83+
"existing files or race conditions between concurrent downloads. "
84+
f"For more information, see {CaseConflictSync.DOC_URI}.\n"
85+
)
86+
uni_print(msg, sys.stderr)
87+
return True
88+
89+
@staticmethod
90+
def _handle_error(src_file):
91+
msg = (
92+
f"Failed to download {src_file.src} -> {src_file.dest} "
93+
"because a file whose name differs only by case either exists "
94+
"or is being downloaded."
95+
)
96+
raise CaseConflictException(msg)

0 commit comments

Comments
 (0)